# Qwen2.5-Math-1.5B-mobile — Verified Usage Examples
# Chat format: chatml
# CPU speed: 15.7 t/s
# Verified: June 2026

# === Using dispatchai SDK ===
from dispatchai import load_model

# Load the model by name, selecting the GGUF backend.
model = load_model(
    "Qwen2.5-Math-1.5B-mobile",
    backend="gguf",
)

# Chat: a single user prompt with no system prompt.
capital_prompt = "What is the capital of France?"
response = model.chat(capital_prompt)
print(f"Capital: {response}")

# With system prompt: the `system` keyword carries the instruction text.
summary_prompt = "Summarize this: The meeting is at 3pm."
response = model.chat(
    summary_prompt,
    system="You are a concise assistant.",
)
print(f"Summary: {response}")

# === Using llama-cpp-python directly ===
from llama_cpp import Llama

# Open the local GGUF file with the chatml chat format, a 512-token context
# window, and 4 CPU threads; verbose=False keeps the loader quiet.
llm = Llama(
    model_path="model.gguf",
    chat_format="chatml",
    n_ctx=512,
    n_threads=4,
    verbose=False,
)

# One user turn; generation is capped at 30 tokens.
messages = [{"role": "user", "content": "What is 2+2?"}]
response = llm.create_chat_completion(messages=messages, max_tokens=30)

# The reply text is read from the first choice's message content.
answer = response["choices"][0]["message"]["content"]
print(f"Math: {answer}")

# === Using llama.cpp CLI ===
# llama-cli -m model.gguf -p "Hello!" -n 30 -t 4 -st --chat-template chatml