ankira / debug_llm.py
nofater's picture
mvp
2a0825b
Raw
History Blame Contribute Delete
3.47 kB
#!/usr/bin/env python3
"""Controlled experiment to find why the Space's dictation comes back empty.
Sends several payload variants to MODAL_LLM_URL, changing ONE variable at a
time vs. the known-good pipeline_tts.py recipe, and prints the raw response so
we can see which variable empties the output and where it disappears.

Usage:
    MODAL_LLM_URL=https://<ws>--lfm25-8b-a1b-serve.modal.run uv run python debug_llm.py
"""
import json
import os
import sys
import requests
# Base URL of the LLM server under test; nothing below works without it, so
# bail out before doing anything else (including the project-local import).
URL = os.getenv("MODAL_LLM_URL")
if not URL:
    sys.exit("Set MODAL_LLM_URL first.")

# Chat-completions route on that server, tolerant of a trailing slash in URL.
ENDPOINT = URL.rstrip("/") + "/v1/chat/completions"
MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"

from prompts import DICTATION_SYSTEM_PROMPT, build_user_prompt  # noqa: E402

# Fixed inputs for the user prompt, so every variant sees the same request.
WORDS = [
    "angeblich",
    "ablehnen",
    "Apfel",
    "Birne",
    "mitkriegen",
]
LEVEL = "B1"

# Sampling options used by the pipeline-recipe variants.
PIPELINE_SAMPLING = {
    "temperature": 0.2,
    "top_k": 80,
    "repeat_penalty": 1.05,
}
# Sampling options used by the app-config variants.
APP_SAMPLING = {
    "temperature": 0.1,
    "top_p": 0.1,
    "top_k": 50,
    "repeat_penalty": 1.05,
}

# The two chat messages the variants combine.
USER_MSG = {
    "role": "user",
    "content": build_user_prompt(WORDS, LEVEL),
}
SYSTEM_MSG = {
    "role": "system",
    "content": DICTATION_SYSTEM_PROMPT,
}
# Payload variants, keyed by the label printed for each run. "A" is the
# known-good pipeline recipe; B-D swap in pieces of the app's configuration,
# and E/F try remedies on top of D.
# NOTE(review): C and D also lower max_tokens from 1024 to 512, so they differ
# from A in token budget as well as in sampling.
VARIANTS = {
    "A pipeline recipe (user-only, pipeline sampling, 1024)": {
        "messages": [USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "B +system role (pipeline sampling)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "C app sampling, user-only": {
        "messages": [USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    "D full app config (system + app sampling, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    # E is the fix: identical to D except for the larger token budget that
    # app.py now uses.
    "E FIX: app config + max_tokens 2048": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 2048,
        **APP_SAMPLING,
    },
    # F is the better optimization: ask the server to skip thinking entirely.
    # If the LFM2.5 chat template honors the flag, reasoning_content vanishes
    # and the small budget suffices again (faster, cheaper); if the flag is
    # ignored, F behaves like D.
    "F try disable thinking (chat_template_kwargs, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        "chat_template_kwargs": {"enable_thinking": False},
        **APP_SAMPLING,
    },
}
def run(name: str, extra: dict) -> None:
    """Send one payload variant to ENDPOINT and print what came back.

    Args:
        name: Label printed in the banner for this variant.
        extra: Request fields merged over ``{"model": MODEL}`` to form the
            JSON body (messages, max_tokens, sampling options, ...).

    Prints the finish reason, the length and a truncated repr of the message
    content, any message fields other than ``role``/``content``, and, when the
    content is empty, the truncated full response. Transport errors, HTTP
    error statuses, undecodable bodies and unexpectedly shaped replies are
    reported on stdout and the function returns, so the remaining variants
    still get their turn.
    """
    payload = {"model": MODEL, **extra}
    print(f"\n===== {name} =====")

    try:
        r = requests.post(ENDPOINT, json=payload, timeout=900)
    except requests.RequestException as e:
        print(" REQUEST FAILED:", e)
        return

    try:
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        print(" REQUEST FAILED:", e)
        # The body of an error or non-JSON reply usually carries the server's
        # own explanation, which the exception message alone leaves out.
        if r.text:
            print(" response body:", r.text[:1500])
        return

    if not isinstance(data, dict):
        # Valid JSON but not an object: show it instead of crashing on .get().
        print(" UNEXPECTED RESPONSE:", json.dumps(data, ensure_ascii=False)[:1500])
        return

    # Fall back to empty dicts at each level so a missing or malformed
    # choices/message entry is reported as empty content, not a traceback.
    choices = data.get("choices")
    choice = choices[0] if isinstance(choices, list) and choices else {}
    if not isinstance(choice, dict):
        choice = {}
    msg = choice.get("message")
    if not isinstance(msg, dict):
        msg = {}
    content = msg.get("content")

    print(" finish_reason:", choice.get("finish_reason"))
    print(" content len :", len(content or ""))
    print(" content repr :", repr(content)[:300])

    # Surface any non-standard fields (reasoning channel, etc.).
    extra_keys = {k: v for k, v in msg.items() if k not in ("role", "content")}
    if extra_keys:
        print(" other message keys:", json.dumps(extra_keys, ensure_ascii=False)[:300])

    # Whitespace-only text counts as empty; non-string content is judged by
    # truthiness so it cannot raise on .strip().
    is_blank = not content.strip() if isinstance(content, str) else not content
    if is_blank:
        print(" >> EMPTY. full response:", json.dumps(data, ensure_ascii=False)[:1500])
if __name__ == "__main__":
    # Show the target once, then run every variant in declaration order.
    print("endpoint:", ENDPOINT)
    for label in VARIANTS:
        run(label, VARIANTS[label])