#!/usr/bin/env python3
"""Controlled experiment to find why the Space's dictation comes back empty.

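Sends several payload variants to MODAL_LLM_URL, stepping from the known-good
pipeline_tts.py recipe (variant A) towards the app configuration: B adds the
system message, C switches to the app sampling and lowers max_tokens to 512,
D combines both, and E and F each change one thing relative to D. For every
variant it prints a summary of the response (and the full response when the
content is empty) so we can see which change empties the output and where it
disappears.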

  MODAL_LLM_URL=https://<ws>--lfm25-8b-a1b-serve.modal.run uv run python debug_llm.py
"""

import json
import os
import sys

import requests

# Base URL of the server under test. It comes from the environment and the
# script refuses to start when it is missing or empty.
URL = os.getenv("MODAL_LLM_URL")
if URL is None or URL == "":
    sys.exit("Set MODAL_LLM_URL first.")

# Chat-completions route under the base URL; trailing slashes on the base are
# dropped first so the join never yields "//".
ENDPOINT = URL.rstrip("/") + "/v1/chat/completions"

# Value sent as the "model" field of every request payload.
MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"

from prompts import DICTATION_SYSTEM_PROMPT, build_user_prompt  # noqa: E402

# The dictation request (word list and level) is the same in every variant.
WORDS = ["angeblich", "ablehnen", "Apfel", "Birne", "mitkriegen"]
LEVEL = "B1"

# Two sampling presets; the variant labels below refer to them as the
# "pipeline" sampling and the "app" sampling.
PIPELINE_SAMPLING = dict(temperature=0.2, top_k=80, repeat_penalty=1.05)
APP_SAMPLING = dict(temperature=0.1, top_p=0.1, top_k=50, repeat_penalty=1.05)

# The two chat messages a variant may send.
USER_MSG = dict(role="user", content=build_user_prompt(WORDS, LEVEL))
SYSTEM_MSG = dict(role="system", content=DICTATION_SYSTEM_PROMPT)

# Payload variants, keyed by the label printed for each run. "A" is the
# baseline (the known-good pipeline recipe). What actually differs:
#   B vs A: system message added.
#   C vs A: app sampling AND max_tokens lowered from 1024 to 512.
#   D vs A: system message, app sampling and max_tokens 512 together.
#   E vs D: max_tokens raised to 2048.
#   F vs D: chat_template_kwargs added.
VARIANTS = {
    "A  pipeline recipe (user-only, pipeline sampling, 1024)": {
        "messages": [USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "B  +system role (pipeline sampling)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 1024,
        **PIPELINE_SAMPLING,
    },
    "C  app sampling, user-only": {
        "messages": [USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    "D  full app config (system + app sampling, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        **APP_SAMPLING,
    },
    # E: the fix. Identical to D except for the larger token budget that is
    #    now in app.py.
    "E  FIX: app config + max_tokens 2048": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 2048,
        **APP_SAMPLING,
    },
    # F: a better optimization. Asks the server to skip thinking entirely. If
    #    the LFM2.5 chat template honors the flag, reasoning_content vanishes
    #    and a small budget suffices again (faster, cheaper). If the flag is
    #    ignored, this behaves like D.
    "F  try disable thinking (chat_template_kwargs, 512)": {
        "messages": [SYSTEM_MSG, USER_MSG],
        "max_tokens": 512,
        "chat_template_kwargs": {"enable_thinking": False},
        **APP_SAMPLING,
    },
}


def run(name: str, extra: dict) -> None:
    """POST one payload variant to ENDPOINT and print a summary of the reply.

    Args:
        name: Label printed in the section header for this variant.
        extra: Payload fields merged on top of ``{"model": MODEL}``.

    Transport errors, HTTP error statuses, non-JSON bodies and unexpectedly
    shaped responses are reported on stdout and the function returns, so the
    caller can carry on with the remaining variants.
    """
    payload = {"model": MODEL, **extra}
    print(f"\n===== {name} =====")

    try:
        r = requests.post(ENDPOINT, json=payload, timeout=900)
        r.raise_for_status()
    except requests.HTTPError as e:
        print("  REQUEST FAILED:", e)
        # The exception text only carries status and URL; the server's own
        # explanation (e.g. a rejected payload field) lives in the body.
        body = e.response.text if e.response is not None else ""
        if body.strip():
            print("  error body    :", body[:1500])
        return
    except requests.RequestException as e:
        print("  REQUEST FAILED:", e)
        return

    try:
        data = r.json()
    except ValueError as e:
        # JSON decoding errors are ValueError subclasses; show what was
        # actually returned instead of just the decoder message.
        print("  REQUEST FAILED:", e)
        print("  raw body      :", r.text[:1500])
        return

    if not isinstance(data, dict):
        print("  UNEXPECTED RESPONSE:", json.dumps(data, ensure_ascii=False)[:1500])
        return

    # Walk choices[0].message defensively: any missing or oddly typed level
    # degrades to an empty dict rather than raising.
    choices = data.get("choices")
    first = choices[0] if isinstance(choices, list) and choices else None
    choice = first if isinstance(first, dict) else {}
    msg = choice.get("message")
    if not isinstance(msg, dict):
        msg = {}
    content = msg.get("content")
    # Only a string counts as text; anything else is treated as empty below.
    text = content if isinstance(content, str) else ""

    print("  finish_reason:", choice.get("finish_reason"))
    print("  content len   :", len(text))
    print("  content repr  :", repr(content)[:300])
    # Surface any non-standard fields (reasoning channel, etc.).
    extra_keys = {k: v for k, v in msg.items() if k not in ("role", "content")}
    if extra_keys:
        print("  other message keys:", json.dumps(extra_keys, ensure_ascii=False)[:300])
    if not text.strip():
        print("  >> EMPTY. full response:", json.dumps(data, ensure_ascii=False)[:1500])


if __name__ == "__main__":
    # Show the resolved target once, then run every variant in the order
    # VARIANTS defines them.
    print("endpoint:", ENDPOINT)
    for label in VARIANTS:
        run(label, VARIANTS[label])