File size: 815 Bytes
02f4e01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Quick inference test for Qwen2.5-Coder-7B-mobile
# Run: python inference_test.py
from llama_cpp import Llama
import time

print("Loading Qwen2.5-Coder-7B-mobile...")
# Load the model from the relative path "model.gguf", so the script must be
# run from the directory that contains that file.
llm = Llama(
    model_path="model.gguf",
    chat_format="chatml",
    n_ctx=512,
    n_threads=4,
    verbose=False,
)

# Short prompts used as a smoke test; each is sent as a single user message.
tests = [
    "What is the capital of France?",
    "What is 2+2? Just the number.",
    "Write a one-sentence greeting.",
]

for prompt in tests:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    started = time.perf_counter()
    resp = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=30,
        temperature=0.3,
    )
    elapsed = time.perf_counter() - started

    # Fall back to an empty string if the message carries no content, so the
    # report line is still printed instead of raising on None.strip().
    answer = (resp["choices"][0]["message"]["content"] or "").strip()
    # A missing or None "usage" entry is treated as zero completion tokens.
    tokens = (resp.get("usage") or {}).get("completion_tokens", 0)
    # The timed span covers the whole call, so this is completion tokens per
    # second of end-to-end latency, not pure generation speed.
    tps = tokens / elapsed if elapsed > 0 else 0
    print(f"Q: {prompt}")
    print(f"A: {answer} ({tps:.1f} t/s)\n")