Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Controlled experiment to find why the Space's dictation comes back empty. | |
| Sends several payload variants to MODAL_LLM_URL, changing ONE variable at a | |
| time vs. the known-good pipeline_tts.py recipe, and prints the raw response so | |
| we can see which variable empties the output and where it disappears. | |
| MODAL_LLM_URL=https://<ws>--lfm25-8b-a1b-serve.modal.run uv run python debug_llm.py | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import requests | |
# Base URL of the LLM server comes from the environment; without it there is
# nothing to probe, so stop before any request is built.
URL = os.getenv("MODAL_LLM_URL")
if not URL:
    sys.exit("Set MODAL_LLM_URL first.")

# Chat-completions route under the base URL (a trailing slash on the base is
# dropped first so the path never contains "//").
ENDPOINT = URL.rstrip("/") + "/v1/chat/completions"

# Model identifier sent in the "model" field of every request.
MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"
| from prompts import DICTATION_SYSTEM_PROMPT, build_user_prompt # noqa: E402 | |
# Fixed inputs for the dictation prompt, so every variant asks for the same text.
WORDS = [
    "angeblich",
    "ablehnen",
    "Apfel",
    "Birne",
    "mitkriegen",
]
LEVEL = "B1"

# Sampling parameters of the known-good pipeline recipe.
PIPELINE_SAMPLING = {
    "temperature": 0.2,
    "top_k": 80,
    "repeat_penalty": 1.05,
}

# Sampling parameters used by the app configuration under investigation.
APP_SAMPLING = {
    "temperature": 0.1,
    "top_p": 0.1,
    "top_k": 50,
    "repeat_penalty": 1.05,
}

# The two chat messages that the variants combine.
USER_MSG = {
    "role": "user",
    "content": build_user_prompt(WORDS, LEVEL),
}
SYSTEM_MSG = {
    "role": "system",
    "content": DICTATION_SYSTEM_PROMPT,
}
def _variant(messages, max_tokens, sampling, **extra):
    """Build one request fragment: messages, token budget, extras, then sampling."""
    return {
        "messages": list(messages),
        "max_tokens": max_tokens,
        **extra,
        **sampling,
    }


# The two message line-ups used below.
_USER_ONLY = (USER_MSG,)
_WITH_SYSTEM = (SYSTEM_MSG, USER_MSG)

# Experiment table, keyed by a human-readable label. "A" is the known-good
# pipeline recipe; the remaining entries are changes measured against it.
VARIANTS = {
    "A pipeline recipe (user-only, pipeline sampling, 1024)": _variant(
        _USER_ONLY, 1024, PIPELINE_SAMPLING
    ),
    "B +system role (pipeline sampling)": _variant(
        _WITH_SYSTEM, 1024, PIPELINE_SAMPLING
    ),
    "C app sampling, user-only": _variant(
        _USER_ONLY, 512, APP_SAMPLING
    ),
    "D full app config (system + app sampling, 512)": _variant(
        _WITH_SYSTEM, 512, APP_SAMPLING
    ),
    # E: the fix -- identical to D except for the larger token budget that
    # app.py now uses.
    "E FIX: app config + max_tokens 2048": _variant(
        _WITH_SYSTEM, 2048, APP_SAMPLING
    ),
    # F: a cheaper alternative -- ask the server not to think at all. If the
    # LFM2.5 chat template honors the flag, reasoning_content disappears and
    # the small budget is enough again (faster, cheaper). If the flag is
    # ignored, this behaves like D.
    "F try disable thinking (chat_template_kwargs, 512)": _variant(
        _WITH_SYSTEM,
        512,
        APP_SAMPLING,
        chat_template_kwargs={"enable_thinking": False},
    ),
}
def run(name: str, extra: dict) -> None:
    """Send one payload variant to ENDPOINT and print a summary of the reply.

    Args:
        name: Label printed in the section header for this variant.
        extra: Request fields merged on top of ``{"model": MODEL}`` to form
            the JSON body of the POST.

    Prints the finish_reason, the content length, a truncated repr of the
    content, any message fields other than ``role``/``content``, and -- when
    the content is empty or whitespace-only -- a truncated dump of the whole
    response. Nothing is returned or raised for request failures: they are
    reported on stdout so the remaining variants still run.
    """
    payload = {"model": MODEL, **extra}
    print(f"\n===== {name} =====")

    resp = None
    try:
        resp = requests.post(ENDPOINT, json=payload, timeout=900)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as err:
        # RequestException covers connection/timeout/HTTP-status failures;
        # ValueError covers a body that is not valid JSON.
        print(" REQUEST FAILED:", err)
        # The server's own error text (e.g. why a field was rejected) is the
        # most useful clue when debugging payloads, so show it when we got a
        # response at all.
        if resp is not None and resp.text:
            print(" response body:", resp.text[:1500])
        return

    if not isinstance(data, dict):
        # A JSON array/string/number has no "choices"; report it instead of
        # crashing and aborting the remaining variants.
        print(" UNEXPECTED RESPONSE:", json.dumps(data, ensure_ascii=False)[:1500])
        return

    # Missing, empty, or null entries all degrade to an empty dict so the
    # summary below still prints.
    choice = (data.get("choices") or [{}])[0] or {}
    msg = choice.get("message") or {}
    content = msg.get("content")

    print(" finish_reason:", choice.get("finish_reason"))
    print(" content len :", len(content or ""))
    print(" content repr :", repr(content)[:300])

    # Surface any non-standard fields (reasoning channel, etc.).
    extra_keys = {k: v for k, v in msg.items() if k not in ("role", "content")}
    if extra_keys:
        print(" other message keys:", json.dumps(extra_keys, ensure_ascii=False)[:300])

    if not (content or "").strip():
        print(" >> EMPTY. full response:", json.dumps(data, ensure_ascii=False)[:1500])
def main() -> None:
    """Print the target endpoint, then run every variant in declaration order."""
    print("endpoint:", ENDPOINT)
    for label, overrides in VARIANTS.items():
        run(label, overrides)


if __name__ == "__main__":
    main()