"""Local inference: manage a llama-server subprocess running Nemotron.

We bundle nothing. On first run the player is walked through fetching the
llama.cpp release binary and the GGUF (the carving-the-totem ritual lives
in the UI layer; this module only knows paths, processes, and health).
"""

from __future__ import annotations

import os
import shutil
import subprocess
import time
import urllib.request
from pathlib import Path

from .api import OpenAIChatBackend

# Root directory for everything this module reads or writes: bin/ (curated
# llama-server), models/ (GGUF files) and llama-server.log. Defaults to
# ~/.scrypt; override with the SCRYPT_HOME environment variable.
SCRYPT_HOME = Path(os.environ.get("SCRYPT_HOME", "~/.scrypt")).expanduser()
# Base URL the GGUF files are fetched from; model_url() appends the file name.
HF_REPO = "https://huggingface.co/IMJONEZZ/warden-nemotron-3-nano-30b/resolve/main"
# RAM floor quoted in user-facing messages. Keep it equal to the min-RAM value
# of the last (lightest) ladder entry below.
MIN_RAM_GB = 32
# Default TCP port the local server is launched on.
PORT = 8731

# Quant ladder: pick the heaviest GGUF that leaves the OS and the game
# breathing room on this machine's TOTAL memory.
# Each entry is (quant name, file size GB, min RAM GB), ordered heaviest
# first: choose_quant() returns the first entry whose min RAM fits.
# Below 32GB total we refuse local mode and point at the API.
# NOTE(review): Q8_0 and Q6_K list the same file size (34) — confirm.
QUANT_LADDER = [
    ("Q8_0", 34, 96),
    ("Q6_K", 34, 80),
    ("Q5_K_M", 27, 64),
    ("Q4_K_S", 22, 40),
    ("Q3_K_S", 19, 32),
]


def choose_quant(ram_gb: float | None = None) -> str | None:
    """Best quant for this machine, or None if below the floor.

    Honors SCRYPT_QUANT. Unknown RAM (non-Linux) defaults to Q4_K_S —
    the player can always override.
    """
    forced = os.environ.get("SCRYPT_QUANT")
    if forced:
        return forced
    ram = system_ram_gb() if ram_gb is None else ram_gb
    if ram == 0:
        return "Q4_K_S"
    for quant, _size, min_ram in QUANT_LADDER:
        if ram >= min_ram:
            return quant
    return None


def model_file(quant: str) -> str:
    """Return the GGUF file name (no directory part) for *quant*."""
    return "warden-nemotron-3-nano-30b-{}.gguf".format(quant)


def model_url(quant: str) -> str:
    """Return the download URL of the GGUF for *quant* under HF_REPO."""
    filename = model_file(quant)
    return HF_REPO + "/" + filename


class LocalSetupError(Exception):
    """Local inference cannot be set up or started on this machine."""


def system_ram_gb() -> float:
    """Total system memory taken from the MemTotal line of /proc/meminfo.

    The MemTotal figure is divided by 1024 twice before being returned.

    Returns:
        The scaled MemTotal value, or 0.0 when /proc/meminfo cannot be read
        or has no MemTotal line (callers treat 0.0 as "unknown").
    """
    try:
        with open("/proc/meminfo") as meminfo:
            total_line = next(
                (row for row in meminfo if row.startswith("MemTotal")), None
            )
    except OSError:
        return 0.0
    if total_line is None:
        return 0.0
    # Second whitespace-separated field is the numeric value.
    raw_total = int(total_line.split()[1])
    return raw_total / 1024 / 1024


def find_binary() -> Path | None:
    """Locate a native llama-server executable.

    Returns:
        SCRYPT_HOME/bin/llama-server if it exists, else the ``llama-server``
        found on PATH, else None.
    """
    # Our curated binary first: it's the one we built/verified for this
    # machine (e.g. the CUDA build), so it outranks whatever is on PATH.
    curated = SCRYPT_HOME / "bin" / "llama-server"
    if not curated.exists():
        from_path = shutil.which("llama-server")
        if from_path:
            return Path(from_path)
        return None
    return curated


def llama_cpp_available() -> bool:
    """Report whether the ``llama_cpp`` package can be imported.

    This is the bundled runtime: pip install 'scrypt[local]' brings
    llama-cpp-python. Only the import spec is looked up; the package itself
    is not imported.
    """
    from importlib.util import find_spec

    spec = find_spec("llama_cpp")
    return spec is not None


def server_command(model: Path, port: int, ctx_size: int) -> list[str] | None:
    """Build the argv that launches an OpenAI-compatible server here.

    A native llama-server binary is preferred (faster, full --jinja template
    support). Without one, the bundled llama-cpp-python server is used so
    players never need anything on PATH.

    Args:
        model: path of the GGUF file to serve.
        port: TCP port the server should listen on.
        ctx_size: context window size passed to the server.

    Returns:
        The command as a list of strings, or None when neither runtime is
        available.
    """
    native = find_binary()
    if native is None:
        if not llama_cpp_available():
            return None

        import sys

        # llama_cpp.server ignores chat_template_kwargs; enable_thinking
        # stays off by default in the GGUF template, which is what we want.
        argv = [sys.executable, "-m", "llama_cpp.server"]
        argv += ["--model", str(model)]
        argv += ["--port", str(port)]
        argv += ["--n_ctx", str(ctx_size)]
        return argv

    argv = [str(native)]
    argv += ["--model", str(model)]
    argv += ["--port", str(port)]
    argv += ["--ctx-size", str(ctx_size)]
    argv.append("--jinja")  # required for Nemotron chat template kwargs
    argv.append("--no-webui")
    return argv


def model_path() -> Path:
    """Return where this machine's tier of model file is expected to live.

    Falls back to the Q3_K_S file name when choose_quant() yields no tier.
    The path is computed only; the file may not exist.
    """
    tier = choose_quant() or "Q3_K_S"
    return SCRYPT_HOME / "models" / model_file(tier)


def installed_model() -> Path | None:
    """Return any already-downloaded model, or None if there is none.

    A player who manually fetched a different tier — or named the file
    whatever they liked — still gets local mode. Ladder names win, checked
    heaviest first; otherwise the largest ``*.gguf`` regular file in
    SCRYPT_HOME/models is used.

    Entries that are not readable regular files (directories, dangling
    symlinks, files that vanish during the scan) are skipped instead of
    raising, so callers get None rather than a crash.
    """
    models_dir = SCRYPT_HOME / "models"
    if not models_dir.is_dir():
        return None

    for quant, _size, _ram in QUANT_LADDER:
        candidate = models_dir / model_file(quant)
        if candidate.is_file():
            return candidate

    # Fallback: largest .gguf present. glob() also yields dangling symlinks
    # and directories, and stat() on those (or on a file removed mid-scan)
    # raises, so every entry is probed defensively.
    sized: list[tuple[int, Path]] = []
    for path in models_dir.glob("*.gguf"):
        try:
            if path.is_file():
                sized.append((path.stat().st_size, path))
        except OSError:
            continue
    if not sized:
        return None
    # max() keeps the first of equally sized files, matching a stable
    # descending sort.
    return max(sized, key=lambda pair: pair[0])[1]


def preflight() -> list[str]:
    """Human-readable problems blocking local inference. Empty = ready.

    Checks, in order: the machine has a usable quant tier, a llama runtime
    is available (native binary or llama-cpp-python), and a model file is
    installed.

    Returns:
        One message per problem found; an empty list means local mode can
        be started.
    """
    problems: list[str] = []
    quant = choose_quant()
    if quant is None:
        problems.append(
            f"this machine has {system_ram_gb():.0f}GB RAM; local play needs "
            f"{MIN_RAM_GB}GB. use API mode instead (set SCRYPT_API_KEY)."
        )
    if find_binary() is None and not llama_cpp_available():
        problems.append(
            "no llama runtime. easiest: pip install 'scrypt[local]' (bundles "
            "llama-cpp-python). or install llama.cpp's llama-server "
            f"(https://github.com/ggml-org/llama.cpp/releases) on PATH or at "
            f"{SCRYPT_HOME / 'bin' / 'llama-server'}."
        )
    if installed_model() is None and quant is not None:
        # choose_quant() returns SCRYPT_QUANT verbatim, so the quant may not
        # be on the ladder; in that case there is no size to quote and the
        # lookup must not raise StopIteration.
        size = next((s for q, s, _ in QUANT_LADDER if q == quant), None)
        size_note = f" (~{size}GB)" if size is not None else ""
        problems.append(
            f"model not found. this machine's tier is {quant}{size_note}:\n"
            f"  {model_url(quant)}\n  -> {SCRYPT_HOME / 'models' / model_file(quant)}"
        )
    return problems


def download_model(progress=None) -> Path:
    """Fetch this machine's tier of GGUF with a progress callback(fraction).

    The download is written to a sibling ``.part`` file and only moved onto
    the final name once it has completed, so a model file under its final
    name is never a partial download.

    Args:
        progress: optional callable invoked with a float between 0.0 and
            1.0 as data arrives. It is only called when the server reports
            a total size.

    Returns:
        Path of the downloaded model inside SCRYPT_HOME/models.

    Raises:
        LocalSetupError: the machine is below the RAM floor (no tier).
        Exception: whatever ``urlretrieve`` raises; the partial file is
            removed before the exception propagates.
    """
    quant = choose_quant()
    if quant is None:
        raise LocalSetupError(f"below the {MIN_RAM_GB}GB floor; use API mode")
    dest = SCRYPT_HOME / "models" / model_file(quant)
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(".part")

    def hook(blocks, block_size, total):
        # The last block may overshoot the total, hence the clamp.
        if progress and total > 0:
            progress(min(1.0, blocks * block_size / total))

    try:
        urllib.request.urlretrieve(model_url(quant), tmp, reporthook=hook)
    except BaseException:
        # BaseException on purpose: Ctrl-C must clean up too.
        tmp.unlink(missing_ok=True)  # a stale 20GB .part helps nobody
        raise
    # replace(), not rename(): rename() raises FileExistsError on Windows
    # when the destination already exists (e.g. re-downloading a model).
    tmp.replace(dest)
    return dest


class LlamaServer:
    """Owns the llama-server subprocess for one game session.

    Call ``start()`` to launch the server and wait until it is healthy,
    ``backend()`` to get a chat backend pointed at it, and ``stop()`` to
    shut it down.
    """

    def __init__(self, port: int = PORT):
        self.port = port
        # The running child process, or None when no server is owned.
        self.proc: subprocess.Popen | None = None

    @property
    def base_url(self) -> str:
        """OpenAI-compatible API root of the local server."""
        return f"http://127.0.0.1:{self.port}/v1"

    def start(self, *, ctx_size: int = 8192, wait_s: float = 300.0) -> None:
        """Launch the server and block until it answers a health probe.

        Server output (stdout and stderr) goes to SCRYPT_HOME/llama-server.log,
        which is truncated on every start. A server already owned by this
        object is stopped first.

        Args:
            ctx_size: context window size passed to the server.
            wait_s: maximum number of seconds to wait for a healthy server.

        Raises:
            LocalSetupError: preflight found problems, no model or runtime
                could be resolved, the process exited during startup, or it
                did not become healthy within ``wait_s``.
        """
        problems = preflight()
        if problems:
            raise LocalSetupError("\n".join(problems))
        # preflight() normally guarantees both of these, but the filesystem
        # can change in between, and `assert` would vanish under `python -O`.
        model = installed_model()
        if model is None:
            raise LocalSetupError("model not found")
        cmd = server_command(model, self.port, ctx_size)
        if cmd is None:
            raise LocalSetupError("no llama runtime")

        # Never orphan a previous child: it would keep the port (and the
        # model's memory) while we lose the handle to it.
        self.stop()

        log_path = SCRYPT_HOME / "llama-server.log"
        log_path.parent.mkdir(parents=True, exist_ok=True)
        with log_path.open("wb") as log:
            # The child keeps its own copy of the handle; ours can close.
            self.proc = subprocess.Popen(cmd, stdout=log, stderr=log)

        deadline = time.monotonic() + wait_s
        try:
            while time.monotonic() < deadline:
                if self.proc.poll() is not None:
                    raise LocalSetupError(
                        "llama-server exited during startup: "
                        + self._log_tail(log_path)
                        + f"\n(full log: {log_path})"
                    )
                if self._healthy():
                    return
                time.sleep(0.5)
            raise LocalSetupError("llama-server did not become healthy in time")
        except BaseException:
            # Any failed startup — including Ctrl-C while waiting — must not
            # leave the child process running behind us.
            self.stop()
            raise

    @staticmethod
    def _log_tail(log_path: Path, lines: int = 3) -> str:
        """Return the last few log lines, joined with " | ".

        Lines containing "error" (case-insensitive) are preferred; if none
        match, the plain tail is used. Returns "(no log)" when the file
        cannot be read.
        """
        try:
            tail = log_path.read_text(errors="replace").strip().splitlines()
            interesting = [row for row in tail if "error" in row.lower()] or tail
            return " | ".join(interesting[-lines:])
        except OSError:
            return "(no log)"

    def _healthy(self) -> bool:
        """Return True once the server answers a probe with HTTP 200."""
        # Probe the loopback address directly: urlopen() would honour
        # http_proxy-style environment settings and could route the request
        # to a proxy that cannot reach this machine's localhost.
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
        # llama-server answers /health; llama_cpp.server answers /v1/models.
        for probe in ("health", "v1/models"):
            try:
                with opener.open(
                    f"http://127.0.0.1:{self.port}/{probe}", timeout=1
                ) as r:
                    if r.status == 200:
                        return True
            except OSError:
                # Connection refused, timeout, or a non-2xx HTTPError:
                # not ready (yet) on this probe.
                continue
        return False

    def backend(self, **kw) -> OpenAIChatBackend:
        """Return a chat backend pointed at this server's ``base_url``.

        Extra keyword arguments are passed through to OpenAIChatBackend.
        """
        return OpenAIChatBackend(self.base_url, model="local", **kw)

    def stop(self) -> None:
        """Terminate the server if it is running; safe to call repeatedly.

        Sends terminate, waits up to five seconds, then kills and reaps the
        process so no zombie is left behind.
        """
        if self.proc is not None and self.proc.poll() is None:
            self.proc.terminate()
            try:
                self.proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.proc.kill()
                self.proc.wait()  # reap the killed child
        self.proc = None