# Qwen2.5-Coder-7B-mobile / inference_test.py
# 3morixd's picture
# Upload inference_test.py with huggingface_hub
# 02f4e01 verified
# Raw
# History Blame Contribute Delete
# 815 Bytes
# Quick inference test for Qwen2.5-Coder-7B-mobile
# Run: python inference_test.py
from llama_cpp import Llama
import time
# Smoke test: load the GGUF model once, then run a short chat completion for
# each prompt and print the answer together with its generation throughput.
print("Loading Qwen2.5-Coder-7B-mobile...")
llm = Llama(
    model_path="model.gguf",
    chat_format="chatml",
    n_ctx=512,
    n_threads=4,
    verbose=False,
)

# Short prompts whose answers fit comfortably within the max_tokens budget.
tests = [
    "What is the capital of France?",
    "What is 2+2? Just the number.",
    "Write a one-sentence greeting.",
]

for prompt in tests:
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way
    # time.time() can.
    t0 = time.perf_counter()
    resp = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=30,
        temperature=0.3,
    )
    elapsed = time.perf_counter() - t0

    # NOTE(review): the message content is presumably optional in the
    # response schema; fall back to "" so .strip() cannot fail on None.
    answer = (resp["choices"][0]["message"]["content"] or "").strip()

    # A missing "usage" section is reported as 0 tokens (0.0 t/s) rather
    # than raising.
    tokens = resp.get("usage", {}).get("completion_tokens", 0)
    tps = tokens / elapsed if elapsed > 0 else 0

    print(f"Q: {prompt}")
    print(f"A: {answer} ({tps:.1f} t/s)\n")