"""Local Ollama client (real calls, not mocks).

Fully local Gemma inference → earns Off the Grid + Llama Champion by
construction (Ollama runs on llama.cpp). The UI surfaces which model ran and
whether it was a real call or the deterministic fallback (never crash the demo).

`gemma4:e4b` is the default (= gemma4:latest, 9.6GB). Use `gemma4:e2b` if CPU
latency demands it. NOTE: `gemma4:4b` does NOT exist — never use that tag.
"""

from __future__ import annotations

import json
import os

# Model tag and Modal endpoint; each can be overridden through the environment.
MODEL = os.getenv("CHIEF_ENGINEER_MODEL", "gemma4:e4b")
MODAL_API_URL = os.getenv(
    "CHIEF_ENGINEER_MODAL_URL",
    "https://kylebrodeur--microfactory-node-inference-serve.modal.run/v1/chat/completions",
)

# Backend selection. The default "ollama" keeps local/recording behavior IDENTICAL.
# CHIEF_ENGINEER_BACKEND only provides the *initial default* (e.g. a Space var set to
# zerogpu); it is NOT a hard lock. The in-app model switcher changes the backend at
# runtime by setting the env var, which is why routing goes through _backend() and
# re-reads it on every call — a value frozen at import time would otherwise block the
# ZeroGPU<->Modal switch. BACKEND itself only records the startup value, kept for
# back-compat/logging. Unknown/import-fail → ollama.
BACKEND = os.getenv("CHIEF_ENGINEER_BACKEND", "ollama").lower()


def _backend() -> str:
    """Return the lower-cased backend name currently selected via the environment.

    CHIEF_ENGINEER_BACKEND is looked up on every call rather than cached, so a
    selection made by the model switcher at runtime takes effect immediately (the
    dropdown sets the variable via app._apply_model_choice). When the variable is
    unset, the startup value BACKEND is used instead.
    """
    selected = os.environ.get("CHIEF_ENGINEER_BACKEND", BACKEND)
    return selected.lower()


# Optional dependency: if the import fails for any reason, `ollama` is bound to None
# instead of raising at import time. is_available() checks for that None sentinel.
try:
    import ollama  # type: ignore
except Exception:  # pragma: no cover
    ollama = None  # type: ignore


def _zerogpu():
    """Return the ZeroGPU backend module, or None when it is not in use.

    The module is imported lazily and only while "zerogpu" is the active backend.
    Any exception raised by the import is swallowed and reported as None.
    """
    if _backend() == "zerogpu":
        try:
            # Heavy dependencies are import-guarded inside the module itself.
            from . import llm_zerogpu as zerogpu_module
        except Exception:
            return None
        return zerogpu_module
    return None


def _modal_api():
    """Report whether the Modal API backend is the active selection.

    Returns True when the active backend is "modal" and None otherwise. No
    reachability probe is made: the Modal HTTP endpoint is treated as always
    available.
    """
    return True if _backend() == "modal" else None


def _forced_offline() -> bool:
    """Tell whether the deterministic fallback is forced, whatever the backend.

    True when CHIEF_ENGINEER_OFFLINE is "1", "true" or "yes" (case-insensitive).
    The variable is re-read on every call (never cached) so tests can toggle it;
    the offline core suite relies on this so `make test` never touches Ollama,
    even while `ollama serve` is running.
    """
    flag = os.environ.get("CHIEF_ENGINEER_OFFLINE", "")
    return flag.lower() in {"1", "true", "yes"}


def is_available() -> bool:
    """Say whether the active backend can serve a real (non-fallback) call.

    Checked in order: forced-offline mode always yields False; a selected and
    importable ZeroGPU backend answers for itself; Modal is reported as available
    without a probe; otherwise the Ollama client must be importable and
    `ollama.list()` must succeed.
    """
    if _forced_offline():
        return False

    zerogpu_backend = _zerogpu()
    if zerogpu_backend is not None:
        return zerogpu_backend.is_available()

    if _modal_api():
        return True
    if ollama is None:
        return False

    # Any failure while listing models counts as "not reachable".
    try:
        ollama.list()
    except Exception:
        return False
    return True


def backend_status() -> str:
    """Return the HTML status line describing the active backend.

    Forced-offline mode is checked first so the status agrees with is_available()
    and chat_json(): while CHIEF_ENGINEER_OFFLINE is set, every call takes the
    deterministic fallback, so no backend is reported as "live". Otherwise a
    selected and importable ZeroGPU backend reports its own status, Modal is
    reported as live without a probe, and Ollama is reported as live only when
    is_available() confirms it.
    """
    offline = (f"<span style='color:var(--ao-yellow);'>●</span> offline fallback · "
               f"{MODEL} unreachable (deterministic)")
    if _forced_offline():
        return offline
    zg = _zerogpu()
    if zg is not None:
        return zg.backend_status()
    if _modal_api():
        return "<span style='color:var(--ao-green);'>●</span> live · Modal API (remote GPU)"
    if is_available():
        return f"<span style='color:var(--ao-green);'>●</span> live · {MODEL} (local Ollama)"
    return offline


def warm_up() -> str:
    """Pay the model's cold start up front (off-camera) so the first real BUILD is fast.

    With the ZeroGPU backend active this delegates to its `warm()` (which enters the
    GPU window and loads the model) and returns what it returns. For Ollama, Modal
    and the fallback there is nothing to warm (Modal handles its own warm-up), so
    the current backend status is returned directly. An exception raised by the
    ZeroGPU warm-up is swallowed and the backend status is returned instead.
    """
    zerogpu_backend = _zerogpu()
    if zerogpu_backend is None:
        return backend_status()
    try:
        return zerogpu_backend.warm()
    except Exception:
        return backend_status()


def _parse_json_object(raw: str) -> dict | None:
    """Parse a model reply into a JSON object, or return None if that is not possible.

    Fence-strip safety net (GEMMA-STEERING Technique 2): small Gemmas can wrap JSON
    in ```json fences even in JSON mode, so surrounding backticks and an optional
    leading "json" tag are removed before parsing. Valid JSON that is not an object
    (a list, string, number, ...) yields None, so callers only ever receive a dict.
    """
    text = raw.strip()
    if text.startswith("```"):
        text = text.strip("`").lstrip()
        if text[:4].lower() == "json":
            text = text[4:]
    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None


def chat_json(system: str, user: str, temperature: float = 0.4) -> dict | None:
    """Run one JSON-mode chat turn against the active backend.

    `system` and `user` are the two prompt parts and `temperature` is passed through
    to the backend. Returns the parsed JSON object, or None to signal that the caller
    should use the deterministic fallback. None is returned when offline mode is
    forced, when the backend is unavailable, when the call raises for any reason, or
    when the reply is not a JSON object. This function never raises.
    """
    if _forced_offline():
        return None
    zg = _zerogpu()
    if zg is not None:
        try:
            return zg.chat_json(system, user, temperature)
        except Exception:
            return None
    if _modal_api():
        try:
            import urllib.request
            # The system prompt is folded into a single user message for this endpoint.
            body = json.dumps({
                "messages": [{"role": "user", "content": f"{system}\n\n{user}"}],
                "max_tokens": 512,
                "temperature": temperature,
            }).encode()
            req = urllib.request.Request(MODAL_API_URL, data=body,
                headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return _parse_json_object(data["choices"][0]["message"]["content"])
        except Exception:
            # Best effort by design: any network/shape error means "use the fallback".
            return None
    if not is_available():
        return None
    try:
        resp = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            format="json",
            options={"temperature": temperature},
        )
        return _parse_json_object(resp["message"]["content"])
    except Exception:
        # Best effort by design: any client/shape error means "use the fallback".
        return None