"""Loads the fine-tuned GGUF via llama-cpp-python.

Pattern follows the HF ZeroGPU + small-talk reference:
  - hf_hub_download files at module import (warm cache)
  - instantiate Llama inside @spaces.GPU function on each request

Robustness:
  - Uses HF_TOKEN if available (avoids 401 on rate-limited / gated lookups)
  - Treats MMPROJ_FILE as optional. If unset OR download fails, vision is
    disabled gracefully (text-only chat still works).
  - Fails loudly with a helpful message if the main GGUF cannot be fetched.
"""
import os
import traceback
from huggingface_hub import hf_hub_download
from .config import MODEL_REPO, GGUF_FILE, MMPROJ_FILE, HF_TOKEN


def _download(repo_id: str, filename: str):
    """Download ``filename`` from ``repo_id`` and return the resulting path.

    ``HF_TOKEN`` is forwarded as ``token`` only when it is truthy; otherwise
    the argument is left out entirely so ``hf_hub_download`` applies its own
    default. Exceptions from the download are not caught here.
    """
    # Empty mapping when no token is configured, so nothing extra is passed.
    auth = {"token": HF_TOKEN} if HF_TOKEN else {}
    return hf_hub_download(repo_id=repo_id, filename=filename, **auth)


# The main GGUF is mandatory, so it is fetched eagerly at import time.
print(f"[model_loader] downloading {MODEL_REPO}/{GGUF_FILE} …")
try:
    MODEL_PATH = _download(MODEL_REPO, GGUF_FILE)
    print(f"[model_loader] main model ready: {MODEL_PATH}")
except Exception as download_error:
    # Emit a hint about the likely misconfiguration, then let the original
    # exception propagate so the import fails instead of continuing with no
    # model loaded.
    print(
        f"[model_loader] FAILED to download {MODEL_REPO}/{GGUF_FILE}: {download_error}",
        "[model_loader] Check that ELYSIUM_MODEL_REPO and ELYSIUM_GGUF_FILE "
        "point at a real public file, and (for private repos) that HF_TOKEN is set.",
        sep="\n",
    )
    raise

# ─── optional vision projector (mmproj) ─────────────────────────────────────
# MMPROJ_PATH stays None unless a projector file is configured AND downloads
# successfully; make_llm() checks it to decide whether to build a vision handler.
MMPROJ_PATH = None
if not MMPROJ_FILE:
    print("[model_loader] ELYSIUM_MMPROJ_FILE not set — vision disabled (text-only mode)")
else:
    print(f"[model_loader] attempting mmproj {MODEL_REPO}/{MMPROJ_FILE} …")
    try:
        MMPROJ_PATH = _download(MODEL_REPO, MMPROJ_FILE)
        print(f"[model_loader] mmproj ready: {MMPROJ_PATH}")
    except Exception as mmproj_error:
        # Best-effort: a missing projector only turns vision off, it never
        # aborts the import. The path is reset so no partial state leaks out.
        print(f"[model_loader] mmproj unavailable ({mmproj_error}) — vision disabled")
        MMPROJ_PATH = None


def make_llm():
    """Build and return a new ``Llama`` instance for the downloaded GGUF.

    Intended to be called from inside the GPU context (see the module
    docstring); ``llama_cpp`` is therefore imported here rather than at module
    import. The model file was already downloaded when this module was
    imported, so no network fetch happens in this function.

    When ``MMPROJ_PATH`` is set, a ``MiniCPMv26ChatHandler`` is attached for
    vision input. If importing or constructing that handler fails, the error
    and traceback are printed and the model is created without a chat handler.
    """
    from llama_cpp import Llama

    vision_handler = None
    if MMPROJ_PATH:
        try:
            from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler

            vision_handler = MiniCPMv26ChatHandler(
                clip_model_path=MMPROJ_PATH,
                verbose=False,
            )
        except Exception as handler_error:
            # Best-effort: vision_handler is still None at this point, so the
            # model below is simply created without vision support.
            print(f"[model_loader] vision chat handler failed: {handler_error}")
            traceback.print_exc()

    llama_options = {
        "model_path": MODEL_PATH,
        "chat_handler": vision_handler,
        "n_gpu_layers": -1,  # -1 asks llama-cpp-python to offload every layer
        "n_ctx": 8192,
        "flash_attn": True,
        "verbose": False,
    }
    return Llama(**llama_options)