#!/usr/bin/env python3
"""Controlled experiment to find why the Space's dictation comes back empty.

Sends several payload variants to MODAL_LLM_URL, changing ONE variable at a
time vs. the known-good pipeline_tts.py recipe, and prints the raw response so
we can see which variable empties the output and where it disappears.

    MODAL_LLM_URL=https://<workspace>--lfm25-8b-a1b-serve.modal.run uv run python debug_llm.py
"""
import json
import os
import sys

import requests

URL = os.environ.get("MODAL_LLM_URL")
if not URL:
    sys.exit("Set MODAL_LLM_URL first.")

ENDPOINT = f"{URL.rstrip('/')}/v1/chat/completions"
MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"

# Imported after the environment check, so a missing MODAL_LLM_URL is reported
# before anything from the project is loaded.
from prompts import DICTATION_SYSTEM_PROMPT, build_user_prompt  # noqa: E402

WORDS = ["angeblich", "ablehnen", "Apfel", "Birne", "mitkriegen"]
LEVEL = "B1"

# Per-request timeout in seconds for the POST to ENDPOINT.
REQUEST_TIMEOUT_S = 900
# Truncation limits for printed diagnostics, so one response cannot flood the
# terminal: short summaries vs. full-response dumps.
SUMMARY_CHARS = 300
DUMP_CHARS = 1500

# Sampling used by variants A and B (the "pipeline recipe").
PIPELINE_SAMPLING = {"temperature": 0.2, "top_k": 80, "repeat_penalty": 1.05}
# Sampling used by variants C-F (the "app config").
APP_SAMPLING = {
    "temperature": 0.1,
    "top_p": 0.1,
    "top_k": 50,
    "repeat_penalty": 1.05,
}

USER_MSG = {"role": "user", "content": build_user_prompt(WORDS, LEVEL)}
SYSTEM_MSG = {"role": "system", "content": DICTATION_SYSTEM_PROMPT}

# Each variant flips one thing relative to "A" (the known-good pipeline recipe).
VARIANTS = {
    "A pipeline recipe (user-only, pipeline sampling, 1024)": {
        "messages": [USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "B +system role (pipeline sampling)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "C app sampling, user-only": {
        "messages": [USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    "D full app config (system + app sampling, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    # E: the fix — same as D but with the larger budget now in app.py.
    "E FIX: app config + max_tokens 2048": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 2048,
        **APP_SAMPLING,
    },
    # F: better optimization — ask the server to skip thinking entirely.
    # If the LFM2.5 chat template honors this, reasoning_content vanishes and
    # a small budget suffices again (faster, cheaper). If ignored, behaves
    # like D.
    "F try disable thinking (chat_template_kwargs, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        "chat_template_kwargs": {"enable_thinking": False},
        **APP_SAMPLING,
    },
}


def run(name: str, extra: dict) -> None:
    """POST one payload variant to ENDPOINT and print a diagnostic summary.

    Args:
        name: Label printed in the section header for this variant.
        extra: Request fields merged on top of ``{"model": MODEL}`` to form
            the JSON payload (messages, max_tokens, sampling settings, ...).

    Returns:
        None. Everything is reported on stdout. A failed request or an
        unexpected response is printed and the function returns, so the
        caller can carry on with the next variant.
    """
    payload = {"model": MODEL, **extra}
    print(f"\n===== {name} =====")

    response = None
    try:
        response = requests.post(ENDPOINT, json=payload, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        # Deliberately broad: one failing variant must not stop the others.
        print(" REQUEST FAILED:", exc)
        if response is not None:
            # The server answered (HTTP error status or a non-JSON body), so
            # show what it said instead of dropping the raw body.
            print(" response body:", repr(response.text)[:DUMP_CHARS])
        return

    if not isinstance(data, dict):
        # Valid JSON but not an object; the field lookups below need a dict.
        print(
            " >> UNEXPECTED (not a JSON object). full response:",
            json.dumps(data, ensure_ascii=False)[:DUMP_CHARS],
        )
        return

    # Fall back to empty dicts so a missing "choices"/"message" is reported
    # as empty content rather than raising.
    choice = (data.get("choices") or [{}])[0]
    msg = choice.get("message") or {}
    content = msg.get("content")

    print(" finish_reason:", choice.get("finish_reason"))
    print(" content len :", len(content or ""))
    print(" content repr :", repr(content)[:SUMMARY_CHARS])

    # Surface any non-standard fields (reasoning channel, etc.).
    extra_keys = {k: v for k, v in msg.items() if k not in ("role", "content")}
    if extra_keys:
        print(
            " other message keys:",
            json.dumps(extra_keys, ensure_ascii=False)[:SUMMARY_CHARS],
        )

    if not (content or "").strip():
        print(
            " >> EMPTY. full response:",
            json.dumps(data, ensure_ascii=False)[:DUMP_CHARS],
        )


def main() -> None:
    """Print the endpoint, then run every variant in declaration order."""
    print("endpoint:", ENDPOINT)
    for name, extra in VARIANTS.items():
        run(name, extra)


if __name__ == "__main__":
    main()