"""Hit the underlying llama-server endpoint with a trivial English prompt to see if the LLM itself is broken or only the witness path is.""" import json, sys, time, urllib.request from pathlib import Path REPO = Path(__file__).resolve().parent.parent sys.path.insert(0, str(REPO)) from config import load_settings settings = load_settings() out = REPO / "runtime" / "tmp" / "llm_probe.txt" out.parent.mkdir(parents=True, exist_ok=True) prompts = [ ("system: You are a helpful assistant. Respond ONLY in English.\nuser: What is 2 + 2? Reply with the digit.", [{"role": "system", "content": "You are a helpful assistant. Respond ONLY in English."}, {"role": "user", "content": "What is 2 + 2? Reply with the digit."}]), ("system: Reply in English.\nuser: Name a single fruit.", [{"role": "system", "content": "Reply in English."}, {"role": "user", "content": "Name a single fruit."}]), ("system: -\nuser: Hello, how are you?", [{"role": "user", "content": "Hello, how are you?"}]), ] lines: list[str] = [] for label, msgs in prompts: payload = { "model": settings.minicpm_quantization or settings.llm_model, "messages": msgs, "temperature": 0.3, "max_tokens": 80, "stream": False, } req = urllib.request.Request( "http://127.0.0.1:19060/v1/chat/completions", data=json.dumps(payload).encode("utf-8"), headers={"Content-Type": "application/json"}, method="POST", ) start = time.perf_counter() with urllib.request.urlopen(req, timeout=60) as resp: data = json.loads(resp.read().decode("utf-8")) elapsed = time.perf_counter() - start text = data["choices"][0]["message"]["content"].strip() lines.append(f"=== {label}") lines.append(f"elapsed: {elapsed:.2f}s") lines.append(f"reply: {text!r}") lines.append(f"raw : {text}") lines.append("") out.write_text("\n".join(lines), encoding="utf-8") print(f"wrote: {out}")