"""
Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.
All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).

Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
  #1  TinyLlama/TinyLlama-1.1B-Chat-v1.0           ~1 GB    40-60 tok/s  Apache 2.0   demos, prototypes
  #2  Qwen/Qwen3-0.6B                               ~0.5 GB  45-55 tok/s  Apache 2.0   speed-critical, Think mode
  #3  meta-llama/Llama-3.2-1B-Instruct              ~1.5 GB  35-50 tok/s  Community    128K ctx, long-context (needs HF_TOKEN)
  #4  HuggingFaceTB/SmolLM2-1.7B-Instruct          ~2 GB    25-35 tok/s  Apache 2.0   good quality/size ratio
  #5  HuggingFaceTB/SmolLM2-360M-Instruct  [DEF]   ~0.4 GB  60-80 tok/s  Apache 2.0   fastest, no token needed
  #6  Qwen/Qwen2.5-1.5B-Instruct                    ~2 GB    25-40 tok/s  Apache 2.0   multilingual, 32K ctx
  #7  stabilityai/stablelm-2-zephyr-1_6b            ~2 GB    25-40 tok/s  MIT          DPO-tuned chat feel
  #8  Qwen/Qwen2.5-Coder-1.5B-Instruct              ~2 GB    25-40 tok/s  Apache 2.0   code completion/review
  #9  microsoft/phi-2                                ~3 GB    18-28 tok/s  MIT          reasoning & logic
  #10 google/gemma-3-1b-it                           ~1.5 GB  35-48 tok/s  Gemma ToU    multilingual, 140+ langs

Note:
  - Llama 3.2 and Gemma 3 may require an HF_TOKEN env var (gated models).
  - Qwen3 supports /think and /no_think prefixes for reasoning depth control.
"""

import os
import time
import threading

import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.log import section, step, ok, warn, error

# Suppress torch_dtype deprecation warning from transformers dev build.
# The filter is installed at import time and ignores any FutureWarning whose
# message matches the regex below; _load() still passes `torch_dtype=`.
warnings.filterwarnings(
    "ignore",
    message=r".*torch_dtype.*is deprecated.*Use.*dtype.*",
    category=FutureWarning,
)

# Hugging Face repo id of the model to load. Read once at import time from the
# LLM_MODEL environment variable (falling back to the default below) and
# reassigned at runtime by switch_model().
MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")

# Repo-id prefixes of models that ship a custom architecture and therefore
# have to be loaded with trust_remote_code=True.
_TRUST_REMOTE_CODE_MODELS = (
    "LiquidAI/",
    "DavidAU/LFM",
)


def _needs_trust_remote_code(model_id: str) -> bool:
    """Return True when *model_id* begins with one of the listed prefixes."""
    # str.startswith accepts a tuple and checks every prefix in one call.
    return model_id.startswith(_TRUST_REMOTE_CODE_MODELS)

# Module-level state shared by _load(), switch_model() and the accessors below.
# Tokenizer of the loaded model; assigned in _load(), reset to None by switch_model().
_tokenizer: AutoTokenizer | None = None
# The loaded model; _load() returns immediately while this is not None.
_llm: AutoModelForCausalLM | None = None
# Set to True at the end of a successful _load(); set to False when a switch starts.
_llm_ready: bool = False
# True while switch_model() is running; only switch_model() writes this flag.
_loading: bool = False
# Progress text reported by get_loading_status().
_loading_msg: str = ""
# str(exc) of the most recent failed load inside switch_model(); None otherwise.
_loading_error: str | None = None
# Held for the whole body of switch_model() so switches run one at a time.
_switch_lock = threading.Lock()


def _load() -> None:
    """Load the tokenizer and weights for ``MODEL_ID`` into the module globals.

    Returns immediately when a model is already loaded (``_llm`` is not
    ``None``).  The tokenizer and the model are built in local variables and
    published to ``_tokenizer`` / ``_llm`` only after both have loaded, so a
    failed load never leaves a tokenizer paired with no model.

    ``_loading_msg`` is updated while the load progresses and is always reset
    to ``""`` on exit, whether the load succeeded or failed.

    Raises:
        Exception: anything raised by either ``from_pretrained`` call is
            logged and re-raised unchanged.
    """
    global _tokenizer, _llm, _llm_ready, _loading_msg

    if _llm is not None:
        return

    # Snapshot the id so the tokenizer and the weights always come from the
    # same model, even if MODEL_ID is reassigned while this call is running.
    model_id = MODEL_ID
    # Weights are always placed on the CPU; the log line below is derived from
    # this value so it cannot disagree with the device_map actually used.
    device = "cpu"

    started = time.perf_counter()
    trust_remote = _needs_trust_remote_code(model_id)

    section("MODEL", f"Loading  {model_id}")

    if trust_remote:
        step("MODEL", "trust_remote_code=True  (custom architecture)")

    try:
        # ── Tokenizer ─────────────────────────────────────────────────────────
        _loading_msg = "Loading tokenizer…"
        step("MODEL", "Fetching tokenizer…")
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=trust_remote
            )
        except Exception as exc:
            error("MODEL", f"Tokenizer load failed → {exc}")
            raise
        ok("MODEL", "Tokenizer loaded")

        # ── Weights ───────────────────────────────────────────────────────────
        _loading_msg = "Loading model weights…  (downloads on first run, then cached)"
        step("MODEL", "Loading weights  (first run will download — subsequent boots use cache)")
        step("MODEL", f"Device: {device.upper()}  ·  dtype: bfloat16")

        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                trust_remote_code=trust_remote,
                device_map=device,
                low_cpu_mem_usage=True,
            )
        except Exception as exc:
            error("MODEL", str(exc))
            raise

        model.eval()

        # Publish only once everything succeeded.
        _tokenizer = tokenizer
        _llm = model
        _llm_ready = True
    finally:
        # Never leave a stale progress message behind, including on failure.
        _loading_msg = ""

    elapsed = time.perf_counter() - started
    params = sum(p.numel() for p in model.parameters()) / 1e6
    ok("MODEL", f"Ready  ·  {params:.0f}M params  ·  {elapsed:.1f}s")
    section("MODEL", "Model online")


def get_tokenizer() -> AutoTokenizer:
    """Return the module-level tokenizer, loading the model first if needed.

    Calls ``_load()`` synchronously, so the call blocks while a load is
    performed and any exception raised by the load propagates to the caller.
    """
    _load()
    # Declared as Optional at module level; _load() is expected to have set it.
    return _tokenizer  # type: ignore


def get_llm() -> AutoModelForCausalLM:
    """Return the module-level model, loading it first if needed.

    Calls ``_load()`` synchronously, so the call blocks while a load is
    performed and any exception raised by the load propagates to the caller.
    """
    _load()
    # Declared as Optional at module level; _load() is expected to have set it.
    return _llm  # type: ignore


def get_model_name() -> str:
    """Return the current value of ``MODEL_ID`` (does not trigger a load)."""
    return MODEL_ID


def is_loading() -> bool:
    """Return the ``_loading`` flag, which switch_model() sets while it runs."""
    return _loading


def get_loading_status() -> dict:
    """Return the current model loading state for admin polling.

    The result is a snapshot of the module-level state with these keys:
    ``model`` (current ``MODEL_ID``), ``ready`` (``_llm_ready``), ``loading``
    (``_loading``), ``msg`` (``_loading_msg``) and ``error``
    (``_loading_error``, ``None`` when no error is recorded).  Reading the
    status never triggers a model load.
    """
    return {
        "model":   MODEL_ID,
        "ready":   _llm_ready,
        "loading": _loading,
        "msg":     _loading_msg,
        "error":   _loading_error,
    }


def switch_model(new_model_id: str) -> None:
    """Unload the current model, update ``MODEL_ID``, and load the new model.

    Designed to be called from a background thread.  The whole operation runs
    under ``_switch_lock`` so concurrent switches are serialised.

    A failed load is not re-raised: the error text is stored in
    ``_loading_error`` and logged, and ``MODEL_ID`` keeps the new id.  The
    ``_loading`` flag is cleared in a ``finally`` so it cannot stay ``True``
    regardless of how the switch ends.

    Args:
        new_model_id: Hugging Face repo id of the model to load.
    """
    global MODEL_ID, _tokenizer, _llm, _llm_ready
    global _loading, _loading_msg, _loading_error

    import gc  # only needed for the explicit collection below

    with _switch_lock:
        prev = MODEL_ID
        section("SWITCH", f"{prev}  →  {new_model_id}")

        _loading = True
        try:
            _loading_error = None
            _loading_msg = f"Unloading {prev}…"
            _llm_ready = False

            # ── Release current model from memory ─────────────────────────────
            step("SWITCH", f"Unloading  {prev}")
            try:
                _llm = None
                _tokenizer = None
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    step("SWITCH", "CUDA cache cleared")
                ok("SWITCH", "Memory freed")
            except Exception as exc:
                # Cleanup is best-effort; a failure here must not block the load.
                warn("SWITCH", f"Cleanup warning: {exc}")

            MODEL_ID = new_model_id
            step("SWITCH", f"Starting load of  {new_model_id}")

            try:
                _load()          # uses updated MODEL_ID; sets _llm_ready = True
            except Exception as exc:
                _loading_error = str(exc)
                _loading_msg = ""
                error("SWITCH", f"Failed to load  {new_model_id}\n  {exc}")
            else:
                ok("SWITCH", f"Switch complete  →  {new_model_id}")
        finally:
            # Single reset point: also runs for KeyboardInterrupt/SystemExit.
            _loading = False


def is_llm_ready() -> bool:
    """Return the ``_llm_ready`` flag without triggering a load.

    The flag is set at the end of a successful ``_load()`` and cleared when
    ``switch_model()`` starts.
    """
    return _llm_ready