from __future__ import annotations

import os
from dataclasses import dataclass
from typing import Any, Protocol

try:  # ZeroGPU: present on the Space, absent in local/CPU dev
    import spaces
except ImportError:  # pragma: no cover - local fallback
    spaces = None

from .json_utils import parse_json_object


@dataclass(frozen=True)
class RuntimeStatus:
    """Immutable status describing how a runtime result was produced."""

    # Human-readable description shown for the result (e.g. which path built it).
    label: str
    # True when the result came from a deterministic fallback path instead of
    # the model; the runtimes in this module set it on every fallback return.
    used_fallback: bool = False


def _same_look(a: dict, b: dict) -> bool:
    """Report whether two genome dicts would render the same creature.

    Only ``archetype``, ``palette`` and ``parts`` drive what is drawn, so two
    genomes that agree on all three are indistinguishable on screen. A key
    missing from a genome compares as ``None``.
    """
    for visible_field in ("archetype", "palette", "parts"):
        agrees = a.get(visible_field) == b.get(visible_field)
        if not agrees:
            return False
    return True


# The app runs a single model: Gemma 4 12B (the only family that draws hard voxel
# forms in the model-selection spike — see wiki/model-selection-spike.md). Every
# AI moment — freeform generation and hatch fusion — goes through it. Gemma 4
# needs llama-cpp-python > 0.3.19 (the Space pin); the local sibling venv has
# 0.3.28. Override via env for a local gguf. See
# docs/superpowers/specs/2026-06-15-freeform-voxel-generation-design.md.
#
# These two values are read once at import time and serve as the defaults for
# LlamaRuntimeConfig.model_repo / model_file. LlamaRuntimeConfig.from_env also
# honors BUDDY_MODEL_REPO / BUDDY_MODEL_FILE on top of them, and BUDDY_MODEL_PATH
# bypasses the download entirely in favor of a local file.
FREEFORM_MODEL_REPO = os.getenv("BUDDY_FREEFORM_MODEL_REPO", "unsloth/gemma-4-12b-it-GGUF")
FREEFORM_MODEL_FILE = os.getenv("BUDDY_FREEFORM_MODEL_FILE", "gemma-4-12b-it-Q4_K_M.gguf")

# The edit path no longer uses a model: an English keyword parser
# (edit_parser.parse_edit) maps the instruction to a catalog part + parameters
# and the assembler snaps it on deterministically — instant, no GPU. An
# instruction that matches no catalog part defaults to horns instead of falling
# back to the model. From-scratch generation is model-free as well (it picks a
# reference exemplar by keyword). The generative model above is reserved for
# fusion (fuse_genomes / fuse_creatures), genome commands and tweak suggestions.


def _blocking_completion(
    *,
    model_path: str,
    messages: list[dict[str, str]],
    schema: dict[str, Any] | None,
    max_tokens: int,
    temperature: float,
    top_p: float,
    n_ctx: int,
    n_threads: int | None,
    n_gpu_layers: int,
    flash_attn: bool,
    label: str = "zerogpu-bench",
) -> str:
    """Build a Llama, run one chat completion, and release the model again.

    On ZeroGPU the GPU is attached only inside the @spaces.GPU wrappers below
    and detaches when they return, so the model must be (re)constructed here
    every call with ``n_gpu_layers=-1`` — there is no persisting it across
    calls (a llama.cpp object built in global scope would stay on CPU, since
    ``spaces`` only hooks PyTorch's ``.to("cuda")``).

    Because a fresh model is loaded on every call, it is closed explicitly once
    the completion finishes — including when the completion raises — instead
    of waiting for garbage collection to free it.

    Args:
        model_path: Filesystem path of the GGUF file to load.
        messages: Chat messages passed to ``create_chat_completion``.
        schema: JSON schema to enforce on the output, or ``None`` to leave the
            output unconstrained.
        max_tokens, temperature, top_p: Sampling parameters forwarded as-is.
        n_ctx, n_threads, n_gpu_layers, flash_attn: Forwarded to ``Llama``.
        label: Tag used in the timing line printed to the logs.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Whatever ``llama_cpp`` raises while loading the model or decoding;
        nothing is caught here.
    """
    import time

    from llama_cpp import Llama

    t0 = time.perf_counter()
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_gpu_layers=n_gpu_layers,
        flash_attn=flash_attn,
        verbose=False,
    )
    try:
        t1 = time.perf_counter()
        kwargs: dict[str, Any] = dict(
            messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p
        )
        if schema is not None:
            # llama-cpp-python only honors {"type": "json_object", "schema": ...}; the
            # OpenAI-style {"type": "json_schema"} is silently ignored, which left the
            # model unconstrained (it returned `type` instead of `kind`, bare strings,
            # or a {"genome": {...}} wrapper) and forced every edit onto the keyword
            # fallback. json_object + schema turns the schema into an enforced GBNF
            # grammar so fuse/tweak output actually matches GENOME_SCHEMA.
            kwargs["response_format"] = {"type": "json_object", "schema": schema}
        response = llm.create_chat_completion(**kwargs)
        t2 = time.perf_counter()
    finally:
        # Free the model deterministically; on a failed completion the pending
        # traceback would otherwise keep it alive until the caller handles it.
        llm.close()
    usage = response.get("usage") or {}
    tokens = int(usage.get("completion_tokens") or 0)
    gen_s = t2 - t1
    # Guard the division: a zero-length generation interval reports 0 tok/s.
    tok_s = tokens / gen_s if gen_s > 0 else 0.0
    # Printed to the Space logs so each call's cost is visible (reload + gen).
    print(
        f"[{label}] reload={t1 - t0:.2f}s gen={gen_s:.2f}s "
        f"total={t2 - t0:.2f}s tokens={tokens} ({tok_s:.1f} tok/s) "
        f"n_gpu_layers={n_gpu_layers}",
        flush=True,
    )
    return response["choices"][0]["message"]["content"]


def _run_completion(**kwargs: Any) -> str:
    """Run a short chat completion by forwarding every keyword unchanged.

    Exists as its own module-level function so it can be rebound to a
    ``spaces.GPU``-wrapped version when ``spaces`` is importable.
    """
    content = _blocking_completion(**kwargs)
    return content


def _run_freeform_completion(**kwargs: Any) -> str:
    """Run a long freeform chat completion by forwarding every keyword unchanged.

    Separate from ``_run_completion`` so that it can be given its own, larger
    ``spaces.GPU`` duration when ``spaces`` is importable.
    """
    content = _blocking_completion(**kwargs)
    return content


def _warmup_completion(**kwargs: Any) -> str:
    """Run a completion whose timing line is tagged ``zerogpu-warmup``.

    The label is fixed here, so callers must not pass ``label`` themselves.
    """
    content = _blocking_completion(label="zerogpu-warmup", **kwargs)
    return content


# Freeform Gemma-4-12B box-geometry calls generate 300-600+ tokens at ~20 tok/s
# and routinely hit 25-31s total. On the old shared duration=30 they raced the
# ZeroGPU detach and crashed mid-decode (ggml_cuda_mul_mat_q CUDA abort -> the
# call dies and the user gets the fallback after a full wait). Give them their
# own larger budget. Override with BUDDY_FREEFORM_GPU_DURATION if 12B gen grows.
# The value is parsed with int() at import time, so a non-integer override
# raises ValueError while this module is being imported.
_FREEFORM_GPU_DURATION = int(os.getenv("BUDDY_FREEFORM_GPU_DURATION", "60"))

if spaces is not None:  # pragma: no cover - only active on the ZeroGPU Space
    # Short calls (e.g. tweak suggestions) are quick once warm, so keep their GPU
    # budget small for better queue behavior. The warmup eats the one-time ~75s
    # CUDA kernel JIT compile, so it needs a larger budget.
    #
    # Each helper is rebound to its spaces.GPU-wrapped version under the same
    # module-level name, so code that looks the name up at call time gets the
    # wrapped function. When ``spaces`` is absent the plain functions stay in
    # place and run without any GPU wrapper.
    _run_completion = spaces.GPU(duration=30)(_run_completion)
    _run_freeform_completion = spaces.GPU(duration=_FREEFORM_GPU_DURATION)(_run_freeform_completion)
    _warmup_completion = spaces.GPU(duration=90)(_warmup_completion)


class BuddyRuntime(Protocol):
    """Structural interface shared by the fake and llama.cpp-backed runtimes.

    Genomes and boxes are passed around as plain dicts / lists of dicts.
    """

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Combine two parent genomes; returns ``(child_genome, status)``."""
        raise NotImplementedError

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply the command in ``transcript`` to ``genome``; returns ``(genome, status)``."""
        raise NotImplementedError

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Create a creature for ``prompt``; returns ``(genome, boxes, status)``."""
        raise NotImplementedError

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Combine two parent creatures; returns ``(genome, boxes, status)``."""
        raise NotImplementedError

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Apply ``instruction`` to a creature.

        Returns ``(genome, new_boxes, tweak_suggestions, status)``.
        """
        raise NotImplementedError

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return tweak suggestions (as strings) for the given boxes."""
        raise NotImplementedError


class FakeBuddyRuntime:
    """Deterministic, model-free runtime built entirely on the fallback helpers.

    Every method that returns a status returns the shared ``_STATUS``, which is
    flagged as a fallback result.
    """

    _STATUS = RuntimeStatus("Generated by the local fallback", used_fallback=True)

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Splice two parent genomes with the fallback fuser (fixed ``seed=1``)."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import fuse_fallback

        first = genome_from_dict(parent_a)
        second = genome_from_dict(parent_b)
        spliced = fuse_fallback(first, second, seed=1)
        return genome_to_dict(spliced), self._STATUS

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to ``genome`` through the fallback voice editor."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import voice_edit_genome

        parsed = genome_from_dict(genome)
        changed = voice_edit_genome(parsed, transcript)
        return genome_to_dict(changed), self._STATUS

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar picked for ``prompt`` as a fresh creature.

        The genome name is the stripped prompt cut to 40 characters, or
        "Freeform Buddy" when that is empty.
        """
        from .exemplars import pick_exemplar

        exemplar = pick_exemplar(prompt)
        display_name = prompt.strip()[:40] or "Freeform Buddy"
        genome = {"name": display_name, "archetype": "chick", "parts": []}
        return genome, list(exemplar["boxes"]), self._STATUS

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse the parents' boxes with the fallback fuser (fixed ``seed=1``).

        The child name joins the first 8 characters of each parent name
        (``A`` / ``B`` when a name is missing or empty) with ``×``.
        """
        from .fallback import fuse_boxes_fallback

        fused = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        left = (parent_a.get('name') or 'A')[:8]
        right = (parent_b.get('name') or 'B')[:8]
        child = {"name": f"{left}×{right}", "archetype": "chick", "parts": []}
        return child, fused, self._STATUS

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Attach the catalog part described by ``instruction`` to ``boxes``.

        Same model-free edit path as the real runtime: the keyword parser maps
        the instruction to a catalog part + parameters and the assembler snaps
        it on. An unmatched instruction defaults to horns so local UI testing
        always produces a visible change. The genome is returned untouched.
        """
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        parsed = parse_edit(instruction)
        if parsed is None:
            updated = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        else:
            updated = assemble_part(
                boxes,
                parsed.part,
                parsed.anchor,
                parsed.scale,
                parsed.color,
                parsed.count,
                parsed.rotation,
            )
        return genome, updated, suggest_tweaks_fallback(updated), self._STATUS

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return the fallback tweak suggestions for ``boxes``."""
        from .fallback import suggest_tweaks_fallback

        suggestions = suggest_tweaks_fallback(boxes)
        return suggestions


@dataclass(frozen=True)
class LlamaRuntimeConfig:
    """Immutable settings for the llama.cpp-backed runtime."""

    # Explicit local GGUF path; when set it is used instead of downloading.
    model_path: str | None = None
    # Repo / file downloaded when no explicit path is given.
    model_repo: str = FREEFORM_MODEL_REPO
    model_file: str = FREEFORM_MODEL_FILE
    # Context size, thread count, GPU layer count and flash-attention flag
    # handed to the model constructor.
    n_ctx: int = 2048
    n_threads: int | None = None
    n_gpu_layers: int = -1
    flash_attn: bool = True
    # NOTE(review): timeout_seconds is not read anywhere in this module —
    # confirm whether another module consumes it.
    timeout_seconds: float = 35.0
    # Token budget used when a call does not supply its own.
    max_tokens: int = 420

    @classmethod
    def from_env(cls) -> "LlamaRuntimeConfig":
        """Build a config from ``BUDDY_*`` environment variables.

        Unset variables fall back to the same values as the field defaults.
        Flash attention is on unless ``BUDDY_FLASH_ATTN`` is exactly ``"0"``.
        Numeric variables are parsed with ``int`` / ``float``, so a malformed
        value raises ``ValueError``.
        """
        env = os.environ.get
        settings = {
            "model_path": env("BUDDY_MODEL_PATH"),
            "model_repo": env("BUDDY_MODEL_REPO", FREEFORM_MODEL_REPO),
            "model_file": env("BUDDY_MODEL_FILE", FREEFORM_MODEL_FILE),
            "n_ctx": int(env("BUDDY_N_CTX", "2048")),
            "n_threads": _optional_int(env("BUDDY_N_THREADS")),
            "n_gpu_layers": int(env("BUDDY_N_GPU_LAYERS", "-1")),
            "flash_attn": env("BUDDY_FLASH_ATTN", "1") != "0",
            "timeout_seconds": float(env("BUDDY_TIMEOUT_SECONDS", "35")),
            "max_tokens": int(env("BUDDY_MAX_TOKENS", "420")),
        }
        return cls(**settings)


class LlamaCppBuddyRuntime:
    """Runtime backed by a local llama.cpp model, with deterministic fallbacks.

    ``fuse_genomes``, ``interpret_genome_command``, ``fuse_creatures`` and
    ``suggest_tweaks`` call the model. Any exception raised by the model path
    is caught, logged to stdout, and replaced by a deterministic fallback so a
    runtime failure never breaks the demo. ``generate_creature`` and
    ``edit_creature`` never call the model.
    """

    def __init__(self, config: LlamaRuntimeConfig | None = None) -> None:
        """Use ``config`` when given, otherwise read it from the environment."""
        self.config = config or LlamaRuntimeConfig.from_env()

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Fuse two parent genomes with the model; returns ``(genome, status)``.

        The fallback splice is computed up front: it is handed to
        ``repair_genome`` together with the model output and is returned on its
        own when the model call fails.
        """
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import fuse_fallback
        from .prompts import build_fuse_messages, GENOME_SCHEMA
        fb = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
        try:
            raw = self._chat_json(
                build_fuse_messages(parent_a, parent_b), GENOME_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FUSE_MAX_TOKENS", "400")), temperature=0.9,
            )
            child = repair_genome(raw, fb)
            return genome_to_dict(child), RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            # Log the cause so a fallback result can be diagnosed from the logs.
            print(f"[fuse-genomes] model call failed, using fallback: {exc!r}", flush=True)
            return genome_to_dict(fb), RuntimeStatus("Used the fallback splice", used_fallback=True)

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply a command to a genome, preferring the model's edit.

        Returns ``(genome, status)``. The keyword editor's result is used when
        the model call fails, or when the model's edit changes nothing visible
        while the keyword editor's does.
        """
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import voice_edit_genome
        from .prompts import build_genome_command_messages, GENOME_EDIT_SCHEMA
        base = genome_from_dict(genome)
        base_out = genome_to_dict(base)
        fb = genome_to_dict(voice_edit_genome(base, transcript))
        try:
            edit = self._chat_json(
                build_genome_command_messages(genome, transcript), GENOME_EDIT_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_EDIT_MAX_TOKENS", "200")), temperature=0.5,
            ) or {}
            # Overlay only the fields the edit actually supplied; an absent or
            # empty field keeps the original genome's value.
            merged = {**genome,
                      "parts": (edit.get("parts") or genome.get("parts")),
                      "palette": (edit.get("palette") or genome.get("palette")),
                      "archetype": (edit.get("archetype") or genome.get("archetype"))}
            model_out = genome_to_dict(repair_genome(merged, base))
            # A small model often returns a valid-but-empty edit (no parts /
            # palette / archetype), which silently leaves the creature untouched
            # so the user sees no reaction. When the model changed nothing visible
            # but the deterministic keyword editor did, surface that instead so a
            # tweak always lands. (The success path used to keep the original and
            # never reach the keyword fallback, which only fired on an exception.)
            if _same_look(model_out, base_out) and not _same_look(fb, base_out):
                return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)
            return model_out, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            # Log the cause so a fallback result can be diagnosed from the logs.
            print(f"[genome-command] model call failed, using keyword editor: {exc!r}", flush=True)
            return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Build a creature for ``prompt`` from a reference exemplar (no model).

        Returns ``(genome, boxes, status)``; the genome name is the stripped
        prompt cut to 40 characters, or "Freeform Buddy" when that is empty.
        """
        # Model-free by design: the bench's from-scratch box must never call the
        # model — only Splice (fuse_creatures) does. Pick a reference body by
        # keyword, the same deterministic path the fake runtime uses, so real and
        # fake agree. (The model-backed draw used to live here; it was retired when
        # the bench input was made fully model-free.)
        from .exemplars import pick_exemplar
        ex = pick_exemplar(prompt)
        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
        return genome, list(ex["boxes"]), RuntimeStatus("Built from a reference template")

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse two creatures' boxes with the model; returns ``(genome, boxes, status)``.

        Falls back to ``fuse_boxes_fallback`` when the model call fails or its
        output yields no boxes after repair.
        """
        import json
        from .exemplars import pick_exemplar
        from .boxes import repair_boxes
        from .fallback import fuse_boxes_fallback
        from .prompts import build_freeform_fuse_messages, BOX_SCHEMA
        ex = pick_exemplar((parent_a.get("name") or "") + " " + (parent_b.get("name") or ""))
        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
        fb_boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        try:
            raw = self._chat_json(
                build_freeform_fuse_messages(parent_a, parent_b, json.dumps(ex)), BOX_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FREEFORM_MAX_TOKENS", "1000")),
                temperature=0.9,
                heavy=True,
            ) or {}
            boxes = repair_boxes(raw.get("boxes") if isinstance(raw, dict) else None)
            child_name = (raw.get("name") if isinstance(raw, dict) else None) or name
            if boxes:
                return {"name": child_name, "archetype": "chick", "parts": []}, boxes, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            # Log the cause so a fallback result can be diagnosed from the logs.
            print(f"[fuse-creatures] model call failed, using fallback: {exc!r}", flush=True)
        return {"name": name, "archetype": "chick", "parts": []}, fb_boxes, RuntimeStatus("Used a reference template", used_fallback=True)

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Attach the catalog part described by ``instruction`` (no model).

        Returns ``(genome, new_boxes, tweak_suggestions, status)``; the genome
        is returned untouched.
        """
        # Model-free by design: the bench tweak box must never call the model — only
        # Splice (fuse_creatures) does. An English keyword parser maps the instruction
        # to a catalog part + parameters and the assembler snaps it on deterministically.
        # An instruction that matches no catalog part defaults to horns, so a tweak
        # always produces a visible change (mirrors the fake runtime). (The model-backed
        # freeform edit used to be a fallback here; it was retired with the box.)
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback
        spec = parse_edit(instruction)
        if spec is not None:
            print(f"[edit] part={spec.part} anchor={spec.anchor} count={spec.count} "
                  f"scale={spec.scale} rot={spec.rotation}", flush=True)
            new_boxes = assemble_part(boxes, spec.part, spec.anchor, spec.scale,
                                      spec.color, spec.count, spec.rotation)
        else:
            new_boxes = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        return genome, new_boxes, suggest_tweaks_fallback(new_boxes), RuntimeStatus("Built from the part catalog")

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Ask the model for tweak suggestions, padded/trimmed to four.

        Non-empty model suggestions are topped up from the fallback list when
        fewer than four came back, and cut to four when more did. If the
        fallback list is empty there is nothing to pad with, so the model's
        suggestions are returned as they are. The plain fallback list is
        returned when the model call fails or yields no usable suggestion.
        """
        from .fallback import suggest_tweaks_fallback
        from .prompts import build_freeform_suggest_messages, TWEAK_SCHEMA
        try:
            raw = self._chat_json(
                build_freeform_suggest_messages(boxes), TWEAK_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_SUGGEST_MAX_TOKENS", "200")),
                temperature=0.9,
            ) or {}
            tweaks = raw.get("tweaks") if isinstance(raw, dict) else None
            clean = [str(t).strip() for t in tweaks if str(t).strip()] if isinstance(tweaks, list) else []
            if clean:
                # Pad/trim to exactly 4, topping up from the fallback if short.
                fill = suggest_tweaks_fallback(boxes)
                # The cycling default below is evaluated eagerly and divides by
                # len(fill); skip padding when there is nothing to pad with
                # rather than discarding the model's suggestions.
                if fill:
                    while len(clean) < 4:
                        clean.append(next((f for f in fill if f not in clean), fill[len(clean) % len(fill)]))
                return clean[:4]
        except Exception as exc:  # noqa: BLE001 - suggestions are advisory; never break the bench
            # Log the cause so a fallback result can be diagnosed from the logs.
            print(f"[suggest-tweaks] model call failed, using fallback: {exc!r}", flush=True)
        return suggest_tweaks_fallback(boxes)

    def _chat_json(
        self,
        messages: list[dict[str, str]],
        schema: dict[str, Any],
        *,
        max_tokens: int | None = None,
        temperature: float = 0.85,
        heavy: bool = False,
    ) -> dict[str, Any] | None:
        """Run one chat completion and parse its content with ``parse_json_object``.

        Args:
            messages: Chat messages for the completion.
            schema: JSON schema to enforce on the output.
            max_tokens: Per-call token budget; a falsy value uses
                ``config.max_tokens``.
            temperature: Sampling temperature.
            heavy: Route the call through the freeform (larger GPU budget)
                helper instead of the short one.

        Raises:
            Anything raised while resolving the model, running the completion,
            or parsing; callers are expected to catch and fall back.
        """
        # Resolve the model file outside the GPU call (hf_hub_download caches
        # to disk), then run inference inside the @spaces.GPU-wrapped helper so
        # ZeroGPU attaches a GPU for the duration of the completion. heavy=True
        # routes the long-running 12B freeform calls to the larger-duration GPU
        # wrapper so they do not race the ZeroGPU detach and crash mid-decode.
        model_path = self._resolve_model_path()
        run = _run_freeform_completion if heavy else _run_completion
        # On the freeform box path, BUDDY_FREEFORM_NO_GRAMMAR=1 drops the GBNF
        # grammar: it ~2x's 12B decode (the grammar is intrinsic per-token CPU
        # overhead, not schema bloat) while parse_json_object + repair_boxes catch
        # the now-unconstrained output. Short (non-heavy) calls always keep the
        # grammar. See wiki/deployment-strategy.md.
        effective_schema = schema
        if heavy and os.getenv("BUDDY_FREEFORM_NO_GRAMMAR") == "1":
            effective_schema = None
        content = run(
            model_path=model_path,
            messages=messages,
            schema=effective_schema,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=temperature,
            top_p=0.9,
            # Real calls always get at least a 4096-token context.
            n_ctx=max(self.config.n_ctx, 4096),
            n_threads=self.config.n_threads,
            n_gpu_layers=self.config.n_gpu_layers,
            flash_attn=self.config.flash_attn,
        )
        return parse_json_object(content)

    def _resolve_model_path(self) -> str:
        """Return the GGUF path: the configured local path, else a hub download."""
        # An explicit local path wins (local dev / tests); otherwise download the
        # configured GGUF (hf_hub_download caches to disk). On a download failure
        # each runtime method already falls back to a hand-authored exemplar, so
        # there is no second model to fall back to.
        if self.config.model_path:
            return self.config.model_path
        from huggingface_hub import hf_hub_download

        return hf_hub_download(repo_id=self.config.model_repo, filename=self.config.model_file)


def default_runtime() -> BuddyRuntime:
    """Return the runtime the app should use.

    ``BUDDY_FORCE_FAKE_RUNTIME=1`` selects the fake runtime; anything else
    selects the llama.cpp-backed one (configured from the environment).
    """
    force_fake = os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1"
    runtime_cls = FakeBuddyRuntime if force_fake else LlamaCppBuddyRuntime
    return runtime_cls()


_warmed = False


def warmup() -> None:
    """Pay the one-time CUDA kernel JIT compile before real traffic arrives.

    Wired to Gradio's ``demo.load`` so it fires when the page opens: the slow
    first-call compile happens while the player reads the intro, so their first
    real action is already warm. Runs once per container; repeated calls are
    no-ops, as are calls under ``BUDDY_FORCE_FAKE_RUNTIME=1``. Any failure is
    logged and swallowed, and the flag is cleared so a later call can retry.
    """
    global _warmed
    if _warmed:
        return
    if os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1":
        return
    # Claim the flag before doing any work so that concurrent page loads do
    # not double-fire the warmup.
    _warmed = True
    try:
        cfg = LlamaRuntimeConfig.from_env()
        # Resolve the same model file the real runtime resolves.
        model_path = LlamaCppBuddyRuntime(cfg)._resolve_model_path()
        warm_args = dict(
            model_path=model_path,
            messages=[{"role": "user", "content": "warmup"}],
            schema=None,
            max_tokens=8,
            temperature=0.0,
            top_p=1.0,
            n_ctx=cfg.n_ctx,
            n_threads=cfg.n_threads,
            n_gpu_layers=cfg.n_gpu_layers,
            flash_attn=cfg.flash_attn,
        )
        _warmup_completion(**warm_args)
        bench_requested = os.getenv("BUDDY_GRAMMAR_BENCH") == "1"
        if bench_requested:
            _grammar_bench(cfg, model_path)
    except Exception as exc:  # noqa: BLE001 - warmup is best-effort
        # Release the claim so that the next page load can try again.
        _warmed = False
        print(f"[zerogpu-warmup] failed: {exc}", flush=True)


def _grammar_bench(cfg: "LlamaRuntimeConfig", model_path: str) -> None:
    """One-off diagnostic: time the freeform prompt with NO GBNF grammar.

    The point is to isolate what grammar-constrained sampling costs on top of
    raw 12B decode. Only the grammar-OFF run happens here; the grammar-ON
    figure is the ordinary [zerogpu-bench] line that every real freeform call
    already prints (~17-22 tok/s), so it is not measured again. Running both
    inside the single demo.load handler added a 3rd GPU acquisition after the
    warmup generation and ZeroGPU aborted it ("GPU task aborted"); two
    acquisitions (warmup + this one) is within what the logs show succeeding.

    The call goes through the same freeform GPU wrapper and n_ctx floor of 4096
    as real calls, so the number transfers directly. Gated by
    BUDDY_GRAMMAR_BENCH=1: read [grammar-bench-off] from the log, compare it
    with the [zerogpu-bench] (grammar-on) lines, then unset the variable.
    Failures of the completion are logged and swallowed.
    """
    import json
    from .exemplars import pick_exemplar
    from .prompts import build_freeform_messages

    prompt = "a small round dragon with two curved horns and a long tail"
    exemplar = pick_exemplar(prompt)
    try:
        bench_messages = build_freeform_messages(prompt, json.dumps(exemplar))
        token_budget = int(os.getenv("BUDDY_GRAMMAR_BENCH_TOKENS", "400"))
        context_size = max(cfg.n_ctx, 4096)
        _run_freeform_completion(
            model_path=model_path,
            messages=bench_messages,
            schema=None,  # unconstrained on purpose: this is the grammar-off run
            max_tokens=token_budget,
            temperature=0.8,
            top_p=0.9,
            n_ctx=context_size,
            n_threads=cfg.n_threads,
            n_gpu_layers=cfg.n_gpu_layers,
            flash_attn=cfg.flash_attn,
            label="grammar-bench-off",
        )
    except Exception as exc:  # noqa: BLE001 - diagnostic only, never break warmup
        print(f"[grammar-bench] failed: {exc}", flush=True)


def _optional_int(value: str | None) -> int | None:
    """Parse ``value`` as an int, mapping ``None`` and ``""`` to ``None``.

    Any other string goes through ``int``, so a malformed number raises
    ``ValueError``.
    """
    if value is not None and value != "":
        return int(value)
    return None