"""Inference layer for TemperCheck.

Two interchangeable backends, selected by the TEMPER_BACKEND env var (which
defaults to "transformers" on a Hugging Face Space, "ollama" elsewhere):

  - "transformers" (the Hugging Face Space / ZeroGPU) — loads google/gemma-4-E4B-it
                   with transformers and runs inference inside a @spaces.GPU
                   function. This is the deployment path; Gemma 4 vision works
                   here (it is broken in the local Ollama builds — see CLAUDE.md).
  - "ollama"       (local experimentation) — calls a local Ollama server. Fast and
                   torch-free, but the local Gemma 4 vision is unreliable, so this
                   is for plumbing/UI work, not real verdicts.

Everything else in the app talks to `score_image()` and never imports a backend
directly, so the model can be swapped without touching the UI (see CLAUDE.md).
"""

from __future__ import annotations

import base64
import io
import os

from PIL import Image

from .prompt import (
    SYSTEM_PROMPT,
    USER_INSTRUCTION,
    TemperVerdict,
    build_messages,
    parse_verdict,
)

# On a Space, default to the transformers backend; locally, default to Ollama.
# A non-empty SPACE_ID env var is what counts as "on a Space" here.
_ON_SPACE = bool(os.environ.get("SPACE_ID"))
# Trimmed and lower-cased so stray whitespace or capitals in the env var don't
# matter. Any value other than "transformers" ends up on the Ollama path.
BACKEND = (
    os.environ.get("TEMPER_BACKEND", "transformers" if _ON_SPACE else "ollama")
    .strip()
    .lower()
)

# Use 127.0.0.1, not "localhost": on Windows the latter resolves to IPv6 ::1
# first and stalls ~2s per request before falling back to IPv4 (measured — it
# was over half the total latency).
_DEFAULT_OLLAMA_HOST = "http://127.0.0.1:11434"


def _normalize_ollama_host(raw: str) -> str:
    """Return *raw* as a base URL that can be prefixed onto "/api/...".

    OLLAMA_HOST is often set as a bare "host:port" (the form used for the
    Ollama server itself), which an HTTP client rejects for lack of a scheme,
    and a trailing slash would produce "//api/chat". This accepts those forms:

      - surrounding whitespace and trailing slashes are removed,
      - an empty value falls back to the default host,
      - a value without a scheme gets "http://" prepended.

    Values that already look like "scheme://host[:port]" pass through as-is.
    """
    host = raw.strip().rstrip("/")
    if not host:
        return _DEFAULT_OLLAMA_HOST
    if "://" not in host:
        host = f"http://{host}"
    return host


OLLAMA_HOST = _normalize_ollama_host(os.environ.get("OLLAMA_HOST", ""))
OLLAMA_MODEL = os.environ.get(
    "TEMPER_OLLAMA_MODEL", "huihui_ai/gemma-4-abliterated:e4b-q8_0"
)
HF_MODEL = os.environ.get("TEMPER_HF_MODEL", "google/gemma-4-E4B-it")

# The verdict JSON is ~80 tokens; cap generation so the model can't ramble.
# Headroom over that keeps the JSON from ever truncating (which would break
# parsing). Generation is the only length-dependent cost — the image is not.
MAX_NEW_TOKENS = 192
# Our prompt is ~500 tokens; left alone the model loads a 128K context and pays
# the setup cost for it every request. A small context removes most of that.
OLLAMA_NUM_CTX = 4096
# Pin the model in VRAM between requests so there's no reload on the first hit
# after an idle gap. -1 = never unload.
OLLAMA_KEEP_ALIVE = -1

# Reuse one HTTP connection across requests (cheap; avoids per-call TCP setup).
_session = None


def _get_session():
    """Return the shared ``requests.Session``, creating it on first use.

    ``requests`` is imported here rather than at module top, so it is only
    needed once a request is actually about to be made.
    """
    global _session
    if _session is not None:
        return _session

    import requests

    _session = requests.Session()
    return _session


def get_backend_name() -> str:
    """Return a display label naming the active backend and its model.

    Anything other than the "transformers" backend is reported as Ollama,
    matching how requests are actually routed.
    """
    using_transformers = BACKEND == "transformers"
    return (
        f"transformers · {HF_MODEL}"
        if using_transformers
        else f"ollama · {OLLAMA_MODEL}"
    )


def _to_png_bytes(image: Image.Image) -> bytes:
    """Convert *image* to RGB, encode it as PNG, and return the encoded bytes."""
    rgb_image = image.convert("RGB")
    with io.BytesIO() as png_buffer:
        rgb_image.save(png_buffer, format="PNG")
        return png_buffer.getvalue()


# --- Ollama backend ---------------------------------------------------------


def _score_ollama(image: Image.Image) -> str:
    """Send *image* to the Ollama chat endpoint and return the raw reply text.

    The image is PNG-encoded, base64-wrapped and attached to the user turn;
    the system turn carries SYSTEM_PROMPT. The returned string is the
    assistant message content, unparsed.

    Raises:
        Whatever the HTTP session raises on transport failure or timeout, and
        an HTTP error (via ``raise_for_status``) on an error status code.
    """
    encoded_image = base64.b64encode(_to_png_bytes(image)).decode("ascii")

    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": USER_INSTRUCTION,
        "images": [encoded_image],
    }

    # NOTE: do NOT add num_predict to these options — with this model +
    # format:json on Ollama 0.30.7 it returns an empty completion (measured).
    # The JSON format already stops generation when the object closes, so a
    # cap is unnecessary here; only the HF path caps length (max_new_tokens).
    generation_options = {
        "temperature": 0.4,
        "num_ctx": OLLAMA_NUM_CTX,
    }

    request_body = {
        "model": OLLAMA_MODEL,
        "format": "json",  # ask Ollama to constrain output to JSON
        "stream": False,
        "keep_alive": OLLAMA_KEEP_ALIVE,
        "options": generation_options,
        "messages": [system_turn, user_turn],
    }

    chat_url = f"{OLLAMA_HOST}/api/chat"
    response = _get_session().post(chat_url, json=request_body, timeout=180)
    response.raise_for_status()
    reply = response.json()
    return reply["message"]["content"]


# --- Transformers backend (Hugging Face Space / ZeroGPU) --------------------
#
# ZeroGPU rules (https://huggingface.co/docs/hub/spaces-zerogpu):
#   * import `spaces` before torch,
#   * place the model on `cuda` at MODULE level (a CUDA emulation mode makes this
#     work at startup; lazy-loading inside the GPU fn is slower and discouraged),
#   * decorate the inference fn with @spaces.GPU (a no-op off ZeroGPU).
# All of this is set up only when the transformers backend is active, so local
# Ollama work needs neither torch nor spaces installed.


def _build_transformers_scorer():
    """Load the Hugging Face model once and return a function that scores images.

    Imports ``spaces`` before ``torch`` (a ZeroGPU requirement), loads the
    processor and model named by ``HF_MODEL``, switches the model to eval mode
    and moves it to ``cuda``. This runs at import time when the transformers
    backend is selected, so ``spaces``/``torch``/``transformers`` are only
    imported in that case.

    Returns:
        A callable that takes a PIL image and returns the model's decoded
        completion text (the raw verdict text, not yet parsed).
    """
    import spaces  # noqa: F401  (import before torch on ZeroGPU)
    import torch
    from transformers import AutoModelForImageTextToText, AutoProcessor

    processor = AutoProcessor.from_pretrained(HF_MODEL)
    model = AutoModelForImageTextToText.from_pretrained(HF_MODEL, dtype="auto")
    model.eval()
    model.to("cuda")  # placed at module level per ZeroGPU; realized in the GPU fn

    # NOTE(review): duration=90 looks like the per-call GPU time budget in
    # seconds — confirm against the ZeroGPU docs before changing it.
    @spaces.GPU(duration=90)
    def _score(image: Image.Image) -> str:
        """Generate a completion for *image* and return only the new text."""
        # The chat messages are built from an RGB copy of the image.
        messages = build_messages(image.convert("RGB"))
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to("cuda")
        # Length of the tokenized prompt, used below to slice it off the output.
        input_len = inputs["input_ids"].shape[-1]
        # No sampling (do_sample=False), capped at MAX_NEW_TOKENS new tokens.
        with torch.inference_mode():
            out = model.generate(
                **inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False
            )
        # Decode only what follows the first input_len positions, i.e. skip
        # the prompt portion of the returned sequence.
        return processor.decode(out[0][input_len:], skip_special_tokens=True)

    return _score


if BACKEND != "transformers":

    def _score_transformers(image: Image.Image) -> str:
        """Stand-in used when the transformers backend is not selected."""
        raise RuntimeError("transformers backend is not active")

else:
    # Built eagerly at import: the model is meant to be placed at module level
    # rather than loaded lazily inside the GPU function.
    _score_transformers = _build_transformers_scorer()


# --- Public API -------------------------------------------------------------


def score_image(image: Image.Image) -> TemperVerdict:
    """Score a PIL image with the configured backend and parse the result.

    The transformers scorer is used when BACKEND is "transformers"; every
    other value goes to Ollama. The backend's raw text is handed to
    ``parse_verdict`` and its result is returned.

    Raises:
        ValueError: if *image* is None.
    """
    if image is None:
        raise ValueError("No image provided.")

    if BACKEND == "transformers":
        raw_reply = _score_transformers(image)
    else:
        raw_reply = _score_ollama(image)
    return parse_verdict(raw_reply)