ankira

Running

App Files Files Community

ankira / debug_llm.py

nofater

mvp

2a0825b 18 days ago

Raw

History Blame Contribute Delete

3.47 kB

	#!/usr/bin/env python3
	"""Controlled experiment to find why the Space's dictation comes back empty.

	Sends several payload variants to MODAL_LLM_URL, changing ONE variable at a
	time vs. the known-good pipeline_tts.py recipe, and prints the raw response so
	we can see which variable empties the output and where it disappears.

	MODAL_LLM_URL=https://<ws>--lfm25-8b-a1b-serve.modal.run uv run python debug_llm.py
	"""

	import json
	import os
	import sys

	import requests

	# Base URL of the LLM server; without it there is nothing to probe, so bail out.
	URL = os.getenv("MODAL_LLM_URL")
	if not URL:
	    sys.exit("Set MODAL_LLM_URL first.")
	# Chat-completions route under the base URL (any trailing "/" is dropped first).
	ENDPOINT = URL.rstrip("/") + "/v1/chat/completions"
	MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"

	# Project-local prompt helpers; imported only once the environment check passed.
	from prompts import DICTATION_SYSTEM_PROMPT, build_user_prompt  # noqa: E402

	# Fixed inputs so that every variant is sent the same prompt text.
	LEVEL = "B1"
	WORDS = ["angeblich", "ablehnen", "Apfel", "Birne", "mitkriegen"]

	# Sampling presets: the known-good pipeline recipe and the one labelled "app".
	PIPELINE_SAMPLING = {
	    "temperature": 0.2,
	    "top_k": 80,
	    "repeat_penalty": 1.05,
	}
	APP_SAMPLING = {
	    "temperature": 0.1,
	    "top_p": 0.1,
	    "top_k": 50,
	    "repeat_penalty": 1.05,
	}

	# The two chat messages that the variants combine.
	USER_MSG = {
	    "role": "user",
	    "content": build_user_prompt(WORDS, LEVEL),
	}
	SYSTEM_MSG = {
	    "role": "system",
	    "content": DICTATION_SYSTEM_PROMPT,
	}

	# Request-body variants, keyed by the label printed for each run. "A" is the
	# known-good pipeline recipe; every other entry changes one thing against it.
	VARIANTS = {
	    "A pipeline recipe (user-only, pipeline sampling, 1024)": {
	        "messages": [USER_MSG],
	        "max_tokens": 1024,
	        **PIPELINE_SAMPLING,
	    },
	    "B +system role (pipeline sampling)": {
	        "messages": [SYSTEM_MSG, USER_MSG],
	        "max_tokens": 1024,
	        **PIPELINE_SAMPLING,
	    },
	    "C app sampling, user-only": {
	        "messages": [USER_MSG],
	        "max_tokens": 512,
	        **APP_SAMPLING,
	    },
	    "D full app config (system + app sampling, 512)": {
	        "messages": [SYSTEM_MSG, USER_MSG],
	        "max_tokens": 512,
	        **APP_SAMPLING,
	    },
	    # E is the fix: identical to D except for the larger token budget that
	    # app.py now uses.
	    "E FIX: app config + max_tokens 2048": {
	        "messages": [SYSTEM_MSG, USER_MSG],
	        "max_tokens": 2048,
	        **APP_SAMPLING,
	    },
	    # F is the better optimization: ask the server not to think at all. If
	    # the LFM2.5 chat template honors the flag, reasoning_content disappears
	    # and the small budget is enough again (faster, cheaper); if the flag is
	    # ignored, this behaves like D.
	    "F try disable thinking (chat_template_kwargs, 512)": {
	        "messages": [SYSTEM_MSG, USER_MSG],
	        "max_tokens": 512,
	        "chat_template_kwargs": {"enable_thinking": False},
	        **APP_SAMPLING,
	    },
	}


	def run(name: str, extra: dict) -> None:
	    """Send one payload variant to ENDPOINT and print a summary of the reply.

	    Args:
	        name: Label printed in the banner for this variant.
	        extra: Request fields merged over ``{"model": MODEL}`` to form the
	            JSON body (messages, ``max_tokens``, sampling options, ...).

	    Returns:
	        None. Everything is reported on stdout. Request failures and
	        unexpectedly shaped responses are printed instead of raised, so the
	        caller can carry on with the next variant.
	    """
	    payload = {"model": MODEL, **extra}
	    print(f"\n===== {name} =====")

	    resp = None
	    try:
	        resp = requests.post(ENDPOINT, json=payload, timeout=900)
	        resp.raise_for_status()
	        data = resp.json()
	    except Exception as e:  # deliberate best-effort boundary for a debug tool
	        print(" REQUEST FAILED:", e)
	        # When a response did arrive (error status or a non-JSON body), its
	        # raw text is shown too instead of being thrown away.
	        if resp is not None:
	            print(" response body:", repr(resp.text)[:1500])
	        return

	    # Tolerate any JSON shape: a reply that is not the expected object must
	    # not raise here and abort the variants that have not run yet.
	    if not isinstance(data, dict):
	        print(" >> UNEXPECTED response shape:", json.dumps(data, ensure_ascii=False)[:1500])
	        return
	    choices = data.get("choices")
	    choice = choices[0] if isinstance(choices, list) and choices else {}
	    if not isinstance(choice, dict):
	        choice = {}
	    msg = choice.get("message")
	    if not isinstance(msg, dict):
	        msg = {}
	    content = msg.get("content")
	    # Only a string counts as text; None or any other type is treated as
	    # empty, which triggers the full-response dump below.
	    text = content if isinstance(content, str) else ""

	    print(" finish_reason:", choice.get("finish_reason"))
	    print(" content len :", len(text))
	    print(" content repr :", repr(content)[:300])
	    # Surface any non-standard fields (reasoning channel, etc.).
	    extra_keys = {k: v for k, v in msg.items() if k not in ("role", "content")}
	    if extra_keys:
	        print(" other message keys:", json.dumps(extra_keys, ensure_ascii=False)[:300])
	    if not text.strip():
	        print(" >> EMPTY. full response:", json.dumps(data, ensure_ascii=False)[:1500])


	if __name__ == "__main__":
	    # Show the target once, then run every variant in declaration order.
	    print("endpoint:", ENDPOINT)
	    for label, overrides in VARIANTS.items():
	        run(label, overrides)