# Quick inference test for Qwen2.5-0.5B-Coder-mobile
# Run: python inference_test.py
from llama_cpp import Llama
import time

# Load the model once up front; every prompt below reuses this instance.
print("Loading Qwen2.5-0.5B-Coder-mobile...")
llm = Llama(model_path="model.gguf", chat_format="chatml", n_ctx=512, n_threads=4, verbose=False)

# Short smoke-test prompts; each is sent as a single-turn user message.
tests = [
    "What is the capital of France?",
    "What is 2+2? Just the number.",
    "Write a one-sentence greeting.",
]

for prompt in tests:
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way
    # time.time() can.
    t0 = time.perf_counter()
    resp = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=30,
        temperature=0.3,
    )
    elapsed = time.perf_counter() - t0

    # NOTE(review): the message content is presumably a string here, but fall
    # back to "" so a missing/None content does not crash the .strip() call.
    answer = (resp["choices"][0]["message"]["content"] or "").strip()

    # Tokens/second uses the completion token count reported in "usage";
    # if that field is absent the rate is reported as 0.
    tokens = resp.get("usage", {}).get("completion_tokens", 0)
    tps = tokens / elapsed if elapsed > 0 else 0

    print(f"Q: {prompt}")
    print(f"A: {answer} ({tps:.1f} t/s)\n")