"""Nemotron Nano 4B (text-only) -> raw quest JSON. Pluggable GPU backend.

FROGQUEST_BACKEND selects WHERE the GPU work runs (the public functions are identical either way):
  - "zerogpu" (default): construct the Llama via llama.cpp INSIDE a @spaces.GPU function on the
    HF Space's ZeroGPU. (First call ~60-90s, then disk-cached & fast.)
  - "modal": forward to a deployed Modal class (see modal_app.py); the Space itself runs on
    CPU-basic and imports NOTHING heavy here.

The LLM's job is ONLY to write JSON to the contract in schema.py. Output is constrained with a
JSON-schema response_format and then validated/clamped by the caller. Shared prompts / the JSON
extractor / model config live in gpu_shared.py so both backends stay in lockstep.
"""
from __future__ import annotations

import os

# Default the HF_HUB_ENABLE_HF_TRANSFER flag to "1"; setdefault leaves any value the
# environment already provides untouched.
# NOTE(review): presumably this requires the hf_transfer package to be installed - confirm.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # MUST precede huggingface_hub import

# Backend selector, lower-cased so the comparisons in this module are case-insensitive. Only the
# exact value "modal" selects the Modal path; anything else takes the local/ZeroGPU path.
BACKEND = os.environ.get("FROGQUEST_BACKEND", "zerogpu").lower()
if BACKEND != "modal":  # the local/ZeroGPU path (default + any unrecognized value) needs the decorator
    import spaces  # noqa: E402

from schema import CAMPAIGN_RESPONSE_SCHEMA, INTENT_SCHEMA, RESPONSE_SCHEMA  # noqa: E402
from gpu_shared import (  # noqa: E402
    CAMPAIGN_SYSTEM_PROMPT,
    GGUF_FILE,
    GGUF_REPO,
    INTENT_SYSTEM_PROMPT,
    LOW_VRAM_GB,
    N_CTX,
    N_CTX_SMALL,
    SYSTEM_PROMPT,
    extract_json,
    preload_cuda_libs,
)

# Best-effort: warm the HF cache at startup so the FIRST @spaces.GPU call doesn't spend its
# (metered, on ZeroGPU) duration downloading ~4GB. Local-path only — on a CPU-basic Space (modal
# backend) we must NOT download the GGUF. Any failure (offline, fresh local checkout, missing
# download dependency) is logged and otherwise ignored, so importing this module never fails here.
if BACKEND != "modal":
    import fnmatch  # noqa: E402
    import logging  # noqa: E402

    try:
        from huggingface_hub import hf_hub_download, list_repo_files

        # Select the file with the same GGUF_FILE glob that _get_llm() passes to
        # Llama.from_pretrained, so the file warmed here is the file loaded later (instead of
        # a second, hard-coded quantization filter that could drift from the shared config).
        _gguf = next(
            (f for f in list_repo_files(GGUF_REPO) if fnmatch.fnmatch(f, GGUF_FILE)),
            None,
        )
        if _gguf:
            hf_hub_download(GGUF_REPO, _gguf)
    except Exception as exc:  # deliberate best-effort: warm-up must never block startup
        logging.getLogger(__name__).warning("GGUF cache warm-up skipped: %r", exc)

# Module-level cache for the Llama instance: None until _get_llm() constructs it, then reused.
_llm = None


def _get_llm():
    """Return the shared Llama model, building it on first use (must run inside @spaces.GPU).

    The first call downloads the GGUF (which is then disk-cached) and constructs the model on
    the GPU; later calls return the instance cached in the module-level ``_llm``.
    """
    global _llm
    if _llm is not None:
        return _llm

    # Import order matters here. The prebuilt CUDA llama-cpp-python wheel links
    # libcudart.so.12 / libcublas etc.; those ship inside the nvidia-*-cu12 packages torch pulls
    # in but are NOT on the loader path, so without help the import fails with
    # "libcudart.so.12: cannot open shared object file". Two-step workaround:
    #   1) importing torch loads many of them RTLD_GLOBAL;
    #   2) belt-and-suspenders: explicitly preload the nvidia-* CUDA libs too.
    import torch  # noqa: F401
    preload_cuda_libs()
    from llama_cpp import Llama

    # Total memory of device 0 divided by 1e9; treated as 0 when CUDA is unavailable.
    if torch.cuda.is_available():
        total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    else:
        total_vram_gb = 0

    # Below the LOW_VRAM_GB threshold, fall back to the smaller context window.
    context_len = N_CTX_SMALL if total_vram_gb < LOW_VRAM_GB else N_CTX

    _llm = Llama.from_pretrained(
        repo_id=GGUF_REPO,
        filename=GGUF_FILE,   # glob -> resolves the exact Q8_0 file (warmed at import)
        n_gpu_layers=-1,      # offload all layers (Q8 4B ~4.3GB fits even on a T4)
        n_ctx=context_len,
        verbose=False,
    )
    return _llm


# ----------------------------- local (in-Space, ZeroGPU) implementations -----------------------------

def _generate_quests_local(todos: str, theme: str) -> dict:
    """Ask the local model to turn a to-do list into raw quest JSON.

    The system prompt is SYSTEM_PROMPT with its ``{theme}`` placeholder filled in; the user
    message carries the theme and the stripped to-do text. Output is constrained with
    RESPONSE_SCHEMA but is returned UNVALIDATED - the caller must validate_and_clamp it.
    """
    model = _get_llm()  # first, so the model is loaded before any prompt work
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT.replace("{theme}", theme)},
        {
            "role": "user",
            "content": f"Theme: {theme}\nMy to-do list / goals:\n{todos.strip()}",
        },
    ]
    completion = model.create_chat_completion(
        messages=messages,
        response_format={"type": "json_object", "schema": RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    reply_text = completion["choices"][0]["message"]["content"]
    return extract_json(reply_text)


def _generate_campaign_local(goal: str, theme: str, snippets: str = "") -> dict:
    """Ask the local model to turn one long-term goal into raw campaign JSON.

    ``snippets`` are optional research notes; they are appended to the user message only when
    they contain non-whitespace text (None/empty is skipped). Output is constrained with
    CAMPAIGN_RESPONSE_SCHEMA but is returned UNVALIDATED - the caller must validate_campaign it.
    """
    model = _get_llm()
    system_text = CAMPAIGN_SYSTEM_PROMPT.replace("{theme}", theme)

    # Assemble the user message from sections separated by a blank line.
    sections = [f"Theme: {theme}\nLong-term goal:\n{goal.strip()}"]
    notes = (snippets or "").strip()
    if notes:
        sections.append(f"Research notes:\n{notes}")
    user_text = "\n\n".join(sections)

    completion = model.create_chat_completion(
        messages=[
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ],
        response_format={"type": "json_object", "schema": CAMPAIGN_RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    reply_text = completion["choices"][0]["message"]["content"]
    return extract_json(reply_text)


def _route_intent_local(message: str, context: str) -> dict:
    """Classify one Frog Master chat message into {intent, target_task?, reason?}.

    `context` is a SHORT text summary of the current log (does a log exist + quest titles/ids/
    status) - never images (CLAUDE.md rule).

    Returns the parsed object when it is a dict whose "intent" is one of the known labels;
    otherwise returns {"intent": "unknown"}.
    """
    model = _get_llm()
    prompt = f"Context:\n{context.strip()}\n\nUser message:\n{message.strip()}"
    completion = model.create_chat_completion(
        messages=[
            {"role": "system", "content": INTENT_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object", "schema": INTENT_SCHEMA},
        temperature=0.0,
        max_tokens=256,
    )
    result = extract_json(completion["choices"][0]["message"]["content"])

    # Accept only well-formed results carrying a recognized intent label.
    known_intents = ("forge", "add_tasks", "mark_done", "mark_couldnt", "unknown")
    if isinstance(result, dict) and result.get("intent") in known_intents:
        return result
    return {"intent": "unknown"}


# ----------------------------- modal (off-Space) wrappers -----------------------------

def _generate_quests_modal(todos: str, theme: str) -> dict:
    """Forward quest generation to the "LLM" class looked up under the "frogquest" Modal app.

    ``modal`` is imported inside the function, so it is only needed when this wrapper runs.
    Returns whatever the remote ``generate_quests`` call returns.
    """
    import modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    return remote_cls().generate_quests.remote(todos, theme)


def _generate_campaign_modal(goal: str, theme: str, snippets: str = "") -> dict:
    """Forward campaign generation to the "LLM" class looked up under the "frogquest" Modal app.

    ``modal`` is imported inside the function, so it is only needed when this wrapper runs.
    Returns whatever the remote ``generate_campaign`` call returns.
    """
    import modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    return remote_cls().generate_campaign.remote(goal, theme, snippets)


def _route_intent_modal(message: str, context: str) -> dict:
    """Forward intent routing to the "LLM" class looked up under the "frogquest" Modal app.

    ``modal`` is imported inside the function, so it is only needed when this wrapper runs.
    Returns whatever the remote ``route_intent`` call returns.
    """
    import modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    return remote_cls().route_intent.remote(message, context)


# ----------------------------- bind public names from the backend -----------------------------
# app.py imports these by name; signatures are identical across backends.
if BACKEND != "modal":
    # Local path: wrap each implementation in the spaces.GPU decorator with its own duration.
    # NOTE(review): the module docstring quotes ~60-90s for a first call - confirm the 70/45
    # budgets below are enough for a cold start.
    generate_quests_raw = spaces.GPU(duration=70)(_generate_quests_local)
    generate_campaign_raw = spaces.GPU(duration=70)(_generate_campaign_local)
    route_intent = spaces.GPU(duration=45)(_route_intent_local)
else:
    # Modal path: plain forwarding wrappers, no decorator involved.
    generate_quests_raw = _generate_quests_modal
    generate_campaign_raw = _generate_campaign_modal
    route_intent = _route_intent_modal