# Quick inference test for Qwen2.5-0.5B-Coder-mobile
# Run: python inference_test.py
"""Smoke-test a local GGUF chat model with a few short prompts.

For each prompt the script prints the question, the model's answer, and an
end-to-end throughput figure (completion tokens divided by the wall time of
the whole ``create_chat_completion`` call, so prompt processing is included).
"""
import time

# Path of the GGUF model file, relative to the current working directory.
MODEL_PATH = "model.gguf"

# Short prompts used for the smoke test.
PROMPTS = [
    "What is the capital of France?",
    "What is 2+2? Just the number.",
    "Write a one-sentence greeting.",
]


def tokens_per_second(tokens: int, elapsed: float) -> float:
    """Return ``tokens / elapsed``, or 0.0 when *elapsed* is not positive.

    Args:
        tokens: Number of tokens generated.
        elapsed: Duration of the generation call in seconds.

    Returns:
        Throughput in tokens per second; 0.0 if no measurable time elapsed,
        which avoids a ZeroDivisionError on very coarse clocks.
    """
    return tokens / elapsed if elapsed > 0 else 0.0


def main() -> None:
    """Load the model, run every prompt in ``PROMPTS``, and print the results."""
    # Imported here rather than at module level so that importing this file
    # (e.g. to reuse tokens_per_second) does not require llama-cpp-python.
    from llama_cpp import Llama

    print("Loading Qwen2.5-0.5B-Coder-mobile...")
    llm = Llama(
        model_path=MODEL_PATH,
        chat_format="chatml",
        n_ctx=512,
        n_threads=4,
        verbose=False,
    )

    for prompt in PROMPTS:
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted mid-measurement.
        started = time.perf_counter()
        resp = llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=30,
            temperature=0.3,
        )
        elapsed = time.perf_counter() - started

        answer = resp["choices"][0]["message"]["content"].strip()
        # Fall back to 0 tokens if the response carries no usage information.
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        tps = tokens_per_second(tokens, elapsed)

        print(f"Q: {prompt}")
        print(f"A: {answer} ({tps:.1f} t/s)\n")


if __name__ == "__main__":
    main()