"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.

Canonical ZeroGPU pattern: the main process stays light (no model in it) and
the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where
the GPU actually exists. ``device_map={"": 0}`` (accelerate) puts the whole
4-bit-quantized model on the allocated GPU. An ``lru_cache`` keeps it resident
for the life of each GPU worker.

Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
into the main process makes that fork heavy and tangled with gradio's asyncio
loop, and the GPU task never runs (it just times out -> "GPU task aborted").
Keeping the main process model-free is what makes the GPU call actually execute.

Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
the model download.
"""
from __future__ import annotations

import os
from functools import lru_cache

# Hub id of the checkpoint to load; override with the GDRAG_LLM env var.
MODEL_ID = os.getenv("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
# True only when GDRAG_STUB_LLM is exactly "1": generate() then returns a
# canned answer instead of loading or running the model.
STUB = os.getenv("GDRAG_STUB_LLM") == "1"

# `spaces` has to be imported before torch so ZeroGPU can patch CUDA. If the
# import (or the attribute lookup) fails for any reason we are not on a Space:
# fall back to a pass-through decorator so the module still works on CPU.
try:
    import spaces
    GPU = spaces.GPU
    ON_ZERO = True
except Exception:                                  # not on a Space
    ON_ZERO = False

    def GPU(*dargs, **dkwargs):
        """Stand-in for ``spaces.GPU`` that leaves the function untouched.

        Works both bare (``@GPU``) and parameterized (``@GPU(duration=...)``);
        any decorator arguments are accepted and ignored.
        """
        used_bare = bool(dargs) and callable(dargs[0])
        if used_bare:
            # ``@GPU`` with no parentheses: we were handed the function itself.
            return dargs[0]
        # ``@GPU(...)``: return a decorator that returns its argument as-is.
        return lambda fn: fn


@lru_cache(maxsize=1)
def _model_and_tokenizer():
    """Return the ``(model, tokenizer)`` pair for ``MODEL_ID``, loading it once.

    The heavy imports are deferred to the first call. ``generate`` calls this
    from inside the ``@GPU`` function, so on ZeroGPU the load happens in the
    GPU worker, where bitsandbytes can quantize onto the allocated GPU (a
    4-bit load requires a GPU).

    * ``ON_ZERO`` true: NF4 4-bit weights with double quantization and
      bfloat16 compute, placed entirely on device 0. 4-bit shrinks the 7B
      model from ~15 GB to ~5 GB.
    * ``ON_ZERO`` false: unquantized bfloat16 weights, no device map.

    The model is put in eval mode before it is returned; ``lru_cache`` makes
    every later call in the same process return the same pair.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    if not ON_ZERO:
        load_kwargs = {"torch_dtype": torch.bfloat16}
    else:
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            ),
            # The empty-string key maps the whole model onto device 0.
            "device_map": {"": 0},
        }

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
    model.eval()
    return model, tokenizer


def _render(messages, tok) -> str:
    """Turn chat-format ``messages`` into one prompt string.

    Delegates to ``tok.apply_chat_template`` with tokenization turned off and
    the generation prompt appended, and returns whatever it produces.
    """
    template_opts = {"tokenize": False, "add_generation_prompt": True}
    return tok.apply_chat_template(messages, **template_opts)


@GPU(duration=180)
def generate(messages: list[dict], max_new_tokens: int = 512,
             temperature: float = 0.2) -> str:
    """Generate an assistant reply for chat-format ``messages``.

    Args:
        messages: Chat messages, rendered to a prompt with the tokenizer's
            chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values ``> 0`` enable sampling
            (clamped to at least ``1e-4``, with ``top_p=0.95``); values
            ``<= 0`` select greedy decoding.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace. When ``STUB`` is set a
        fixed canned answer is returned and the model is never loaded.
    """
    if STUB:
        return (
            "Here is a Godot 4 movement script:\n\n```gdscript\n"
            "extends CharacterBody2D\n\n@export var speed: float = 200.0\n\n"
            "func _physics_process(delta: float) -> void:\n"
            "\tvar dir := Input.get_vector(\"ui_left\", \"ui_right\", "
            "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
            "\tmove_and_slide()\n```\n"
        )

    # Deferred so the main process never imports torch just to serve the stub.
    import time

    import torch

    model, tok = _model_and_tokenizer()
    dev = "cuda" if ON_ZERO else "cpu"
    text = _render(messages, tok)
    inputs = tok([text], return_tensors="pt").to(dev)
    # Number of prompt tokens; everything after this index in the output is new.
    prompt_len = inputs["input_ids"].shape[1]

    sampling = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": sampling,
        "pad_token_id": tok.eos_token_id,
    }
    if sampling:
        # temperature/top_p only affect sample-based decoding; passing them
        # alongside do_sample=False has no effect and makes transformers warn.
        gen_kwargs["temperature"] = max(temperature, 1e-4)
        gen_kwargs["top_p"] = 0.95

    print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)
    n_new = int(out.shape[-1] - prompt_len)
    print(f"[gen] done {n_new} tokens in {time.perf_counter()-t0:.1f}s on {dev}", flush=True)

    gen = out[0][prompt_len:]
    return tok.decode(gen, skip_special_tokens=True).strip()


def warmup() -> None:
    """Pre-load the model so the first ``generate`` call does not pay for it.

    Does nothing on ZeroGPU (the model can only be loaded inside
    ``@spaces.GPU``, since the GPU does not exist in the main process) and
    nothing in stub mode; otherwise it fills the ``_model_and_tokenizer``
    cache.
    """
    if ON_ZERO or STUB:
        return
    _model_and_tokenizer()