```bash
# Pull a specific quantization
ollama pull hf.co/ermiaazarkhalili/Qwen2.5-7B-SFT-Capybara-GGUF:Q4_K_M

# Or create the model from a local GGUF file
cat > Modelfile << EOF
FROM ./qwen2.5-7b-sft-capybara-q4_k_m.gguf
EOF
ollama create qwen2.5-7b-sft-capybara -f Modelfile
ollama run qwen2.5-7b-sft-capybara
```
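With the model registered, you can also call it programmatically through Ollama's local REST API, which listens on port 11434 by default. A minimal sketch, assuming an Ollama server is running and the `qwen2.5-7b-sft-capybara` model created above:

```python
import json
import urllib.request

# Generate a completion via the local Ollama server
# (assumes `ollama serve` is running on the default port 11434)
payload = {
    "model": "qwen2.5-7b-sft-capybara",
    "prompt": "What is machine learning?",
    "stream": False,  # return one JSON object instead of a token stream
}
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["response"])
```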
### llama.cpp

```bash
# Run interactively with llama-cli
./llama-cli -m qwen2.5-7b-sft-capybara-q4_k_m.gguf -p "Your prompt here" -n 256

# Run as an HTTP server
./llama-server -m qwen2.5-7b-sft-capybara-q4_k_m.gguf --host 0.0.0.0 --port 8080
```
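`llama-server` exposes an OpenAI-compatible API, so a standard OpenAI client can talk to it. A minimal sketch, assuming the server command above is running and the `openai` Python package is installed (the API key is a placeholder; the local server does not require one by default):

```python
from openai import OpenAI

# Point the OpenAI client at the local llama-server instance
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="qwen2.5-7b-sft-capybara",  # informational for a single-model server
    messages=[{"role": "user", "content": "What is machine learning?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```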
### llama-cpp-python

```python
from llama_cpp import Llama

llm = Llama(
    model_path="qwen2.5-7b-sft-capybara-q4_k_m.gguf",
    n_ctx=2048,       # context window size
    n_gpu_layers=-1,  # offload all layers to the GPU
)

output = llm(
    "What is machine learning?",
    max_tokens=256,
    temperature=0.7,
)
print(output["choices"][0]["text"])
```
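For multi-turn use, llama-cpp-python also provides `create_chat_completion`, which applies the model's chat template to a list of messages. A short sketch reusing the `llm` instance from above:

```python
# Chat-style inference using the model's built-in chat template
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain quantization in one sentence."},
    ],
    max_tokens=128,
)
print(chat["choices"][0]["message"]["content"])
```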
### LM Studio

1. Download the desired GGUF file from this repository
2. Open LM Studio and navigate to the Models tab
3. Click "Add Model" and select the downloaded GGUF file