"""ZeroGPU extraction via the **llama.cpp** runtime (earns the 🦙 Llama Champion badge).

Text-only by default. Set LLAMACPP_VISION=1 to run the same PDF/image vision pipeline as
Transformers (GGUF + mmproj through llama-cpp-python).

Config (env):
  EXTRACTOR_BACKEND=llamacpp-gpu
  LLAMACPP_VISION=1                 enable vision (PDF/image uploads)
  LLAMACPP_GGUF_REPO                HF repo with GGUF weights
  LLAMACPP_MODEL_FILE               GGUF filename
  LLAMACPP_MMPROJ_FILE              mmproj filename (required when vision is on)
  LLAMACPP_CHAT_HANDLER             default MiniCPMv26ChatHandler
  LLAMACPP_MAX_TOKENS               default 3072
  LLAMACPP_N_CTX                    default 8192
  LLAMACPP_N_GPU_LAYERS             default 0 (CPU wheels); set -1 on CUDA builds to offload all layers
"""

from __future__ import annotations

import os
import time
from functools import lru_cache
from typing import Any

from src.document_processing import document_intake_metadata, document_to_payload_parts
from src.extraction.llamacpp_vision import (
    DEFAULT_CHAT_HANDLER,
    DEFAULT_MMPROJ_FILE,
    download_hf_file,
    llamacpp_vision_enabled,
    load_vision_llama,
)
from src.openbmb_client import (
    EXTRACTION_PROMPT,
    ExtractionResult,
    _normalize_notes,
    _normalize_patient,
    _normalize_tests,
    _parse_json_response,
    summarize_document_parts,
)
from src.space_runtime import is_cpu_basic_space

# Fallback Hugging Face repo and GGUF filename, used by LlamaCppGPUExtractor when
# LLAMACPP_GGUF_REPO / LLAMACPP_MODEL_FILE are not set in the environment.
DEFAULT_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
DEFAULT_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"

try:
    import spaces
except ImportError:  # Local dev without the HF Spaces package.

    class _SpacesFallback:
        """Stand-in for the ``spaces`` module that exposes a no-op ``GPU`` decorator factory."""

        @staticmethod
        def GPU(*_args: Any, **_kwargs: Any):
            """Ignore every argument and return a decorator that hands back ``func`` unchanged."""
            return lambda func: func

    spaces = _SpacesFallback()  # type: ignore[assignment]


class LlamaCppGPUExtractor:
    """Extractor that runs the GGUF through llama.cpp on ZeroGPU.

    All settings are read from ``LLAMACPP_*`` environment variables when the
    instance is created. ``extract`` calls the ``spaces.GPU``-decorated runners
    unless ``is_cpu_basic_space()`` is true, in which case the undecorated CPU
    runners are called directly.
    """

    def __init__(self) -> None:
        """Read the backend configuration from the environment.

        Raises:
            ValueError: If an integer setting holds a non-integer value, or if
                vision is enabled while the mmproj filename is empty.
        """
        self.repo = os.getenv("LLAMACPP_GGUF_REPO", DEFAULT_GGUF_REPO).strip()
        self.model_file = os.getenv("LLAMACPP_MODEL_FILE", DEFAULT_MODEL_FILE).strip()
        self.mmproj_file = os.getenv("LLAMACPP_MMPROJ_FILE", DEFAULT_MMPROJ_FILE).strip()
        self.chat_handler = os.getenv("LLAMACPP_CHAT_HANDLER", DEFAULT_CHAT_HANDLER).strip()
        self.max_tokens = self._int_from_env("LLAMACPP_MAX_TOKENS", 3072)
        self.n_ctx = self._int_from_env("LLAMACPP_N_CTX", 8192)
        self.n_gpu_layers = self._int_from_env("LLAMACPP_N_GPU_LAYERS", 0)
        self.vision_enabled = llamacpp_vision_enabled()
        self.use_spaces_gpu = not is_cpu_basic_space()
        if self.vision_enabled and not self.mmproj_file:
            raise ValueError("LLAMACPP_VISION=1 requires LLAMACPP_MMPROJ_FILE.")

    @staticmethod
    def _int_from_env(name: str, default: int) -> int:
        """Return the integer stored in environment variable ``name``.

        An unset or blank variable yields ``default`` (a variable declared but
        left empty would otherwise crash ``int("")``).

        Raises:
            ValueError: If the variable is set to something that is not an
                integer; the message names the offending variable.
        """
        raw = os.getenv(name, "").strip()
        if not raw:
            return default
        try:
            return int(raw)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {raw!r}.") from exc

    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
        """Run extraction on ``file_path`` and return the normalized result.

        Args:
            file_path: Document to process; handed to ``document_to_payload_parts``.
            max_pages: Page limit forwarded to ``document_to_payload_parts``.

        Returns:
            An ``ExtractionResult`` holding the normalized ``patient``/``tests``/
            ``notes`` sections, the raw model output, and a request summary.

        Raises:
            RuntimeError: Propagated from the runners or from prompt composition.
        """
        parts = document_to_payload_parts(file_path, max_pages=max_pages)
        # The timer starts after document intake, so duration_ms excludes it.
        started = time.perf_counter()
        if self.vision_enabled:
            runner = (
                _run_llamacpp_vision_generation
                if self.use_spaces_gpu
                else _run_llamacpp_vision_generation_cpu
            )
            raw = runner(
                parts=parts,
                repo=self.repo,
                model_file=self.model_file,
                mmproj_file=self.mmproj_file,
                chat_handler=self.chat_handler,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu-vision" if self.use_spaces_gpu else "llamacpp-cpu-vision"
            mode_details: dict[str, Any] = {
                "mmproj": self.mmproj_file,
                "chat_handler": self.chat_handler,
            }
        else:
            prompt_text = _compose_prompt(parts)
            runner = (
                _run_llamacpp_generation
                if self.use_spaces_gpu
                else _run_llamacpp_generation_cpu
            )
            raw = runner(
                prompt_text=prompt_text,
                repo=self.repo,
                model_file=self.model_file,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu" if self.use_spaces_gpu else "llamacpp-cpu"
            mode_details = {"composed_prompt": prompt_text}

        duration_ms = int((time.perf_counter() - started) * 1000)
        parsed = _parse_json_response(raw)
        summary = {
            "backend": backend,
            "repo": self.repo,
            "model": self.model_file,
            "vision_enabled": self.vision_enabled,
            "spaces_gpu": self.use_spaces_gpu,
            "document_parts": len(parts),
            "max_pages": max_pages,
            "extraction_prompt": EXTRACTION_PROMPT,
            "user_message_preview": summarize_document_parts(parts),
            # Intake metadata may override the keys above; the keys below win over it.
            **document_intake_metadata(file_path, parts),
            "return_code": 0,
            "duration_ms": duration_ms,
        }
        # Mode-specific entries are written last, as before.
        summary.update(mode_details)

        return ExtractionResult(
            patient=_normalize_patient(parsed.get("patient", {})),
            tests=_normalize_tests(parsed.get("tests", [])),
            notes=_normalize_notes(parsed.get("notes", [])),
            raw_response=raw,
            request_summary=summary,
        )


@lru_cache(maxsize=1)
def _load_text(model_path: str, n_ctx: int, n_gpu_layers: int):
    """Build a text-only ``llama_cpp.Llama`` for the given GGUF file and settings.

    ``lru_cache(maxsize=1)`` memoizes only the most recent
    ``(model_path, n_ctx, n_gpu_layers)`` combination; calling with different
    arguments constructs a new instance that replaces the cached one.
    """
    # Imported inside the function, so llama_cpp is only needed once this loader runs.
    from llama_cpp import Llama

    settings = {
        "model_path": model_path,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
        "verbose": False,
    }
    return Llama(**settings)


def _vision_messages(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Wrap the extraction prompt and the document parts in a single user message.

    The prompt is the first content entry; ``parts`` follow in their original order.
    """
    content: list[dict[str, Any]] = [{"type": "text", "text": EXTRACTION_PROMPT}]
    content.extend(parts)
    return [{"role": "user", "content": content}]


def _raise_generation_error(exc: Exception, *, vision: bool) -> RuntimeError:
    message = f"{type(exc).__name__}: {exc}"
    if "llama_decode returned -1" in message:
        detail = (
            "llama.cpp ran out of room while decoding the vision prompt. "
            if vision
            else "llama.cpp ran out of room while decoding the PDF text prompt. "
        )
        raise RuntimeError(
            detail
            + "Try increasing LLAMACPP_N_CTX, lowering the number of PDF pages, or trimming the "
            "input before sending it to the model."
        ) from exc
    model_label = "vision GGUF + mmproj" if vision else "text-only GGUF"
    raise RuntimeError(
        f"The llama.cpp backend could not complete extraction with the {model_label} model. "
        f"Inner error: {message}"
    ) from exc


@spaces.GPU(duration=600)
def _run_llamacpp_vision_generation(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Run vision extraction under the ``spaces.GPU(duration=600)`` decorator.

    All work is delegated to ``_run_llamacpp_vision_generation_cpu`` with the
    same arguments; this wrapper exists only to carry the decorator.
    """
    forwarded = {
        "parts": parts,
        "repo": repo,
        "model_file": model_file,
        "mmproj_file": mmproj_file,
        "chat_handler": chat_handler,
        "max_tokens": max_tokens,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
    }
    return _run_llamacpp_vision_generation_cpu(**forwarded)


def _run_llamacpp_vision_generation_cpu(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download the vision assets, load the model, and return the raw JSON reply.

    Each of the three stages (download, load, generate) wraps its failures in a
    ``RuntimeError`` that names the stage and quotes the inner error.

    Returns:
        The message content of the first choice, or ``"{}"`` when that content
        is missing or empty.

    Raises:
        RuntimeError: If downloading, loading, or generation fails.
    """
    # Stage 1: fetch the GGUF weights and the mmproj file from the same repo.
    try:
        gguf_path = download_hf_file(repo, model_file)
        projector_path = download_hf_file(repo, mmproj_file)
    except Exception as download_exc:
        reason = f"{type(download_exc).__name__}: {download_exc}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the vision GGUF assets: " + reason
        ) from download_exc

    # Stage 2: build the vision-capable model.
    try:
        llm = load_vision_llama(gguf_path, projector_path, n_ctx, n_gpu_layers, chat_handler)
    except Exception as load_exc:
        reason = f"{type(load_exc).__name__}: {load_exc}"
        raise RuntimeError(
            "The llama.cpp backend could not load the vision GGUF + mmproj model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model files are incomplete. "
            "Inner error: " + reason
        ) from load_exc

    # Stage 3: generate. Reading the response stays inside the try so that a
    # malformed response is reported the same way as a generation failure.
    try:
        completion = llm.create_chat_completion(
            messages=_vision_messages(parts),
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        reply = completion["choices"][0]["message"]
        return reply.get("content") or "{}"
    except Exception as exc:
        raise _raise_generation_error(exc, vision=True) from exc


@spaces.GPU(duration=600)
def _run_llamacpp_generation(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Run text-only extraction under the ``spaces.GPU(duration=600)`` decorator.

    All work is delegated to ``_run_llamacpp_generation_cpu`` with the same
    arguments; this wrapper exists only to carry the decorator.
    """
    forwarded = {
        "prompt_text": prompt_text,
        "repo": repo,
        "model_file": model_file,
        "max_tokens": max_tokens,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
    }
    return _run_llamacpp_generation_cpu(**forwarded)


def _run_llamacpp_generation_cpu(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download the GGUF, load the text-only model, and return the raw JSON reply.

    Each of the three stages (download, load, generate) wraps its failures in a
    ``RuntimeError`` that names the stage and quotes the inner error.

    Returns:
        The message content of the first choice, or ``"{}"`` when that content
        is missing or empty.

    Raises:
        RuntimeError: If downloading, loading, or generation fails.
    """
    # Stage 1: fetch the GGUF weights.
    try:
        gguf_path = download_hf_file(repo, model_file)
    except Exception as download_exc:
        reason = f"{type(download_exc).__name__}: {download_exc}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the GGUF model: " + reason
        ) from download_exc

    # Stage 2: obtain the (memoized) text-only model.
    try:
        llm = _load_text(gguf_path, n_ctx, n_gpu_layers)
    except Exception as load_exc:
        reason = f"{type(load_exc).__name__}: {load_exc}"
        raise RuntimeError(
            "The llama.cpp backend could not load the text-only GGUF model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model file is incomplete. "
            "Inner error: " + reason
        ) from load_exc

    # Stage 3: generate. Reading the response stays inside the try so that a
    # malformed response is reported the same way as a generation failure.
    try:
        user_turn = {"role": "user", "content": prompt_text}
        completion = llm.create_chat_completion(
            messages=[user_turn],
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        reply = completion["choices"][0]["message"]
        return reply.get("content") or "{}"
    except Exception as exc:
        raise _raise_generation_error(exc, vision=False) from exc


def _compose_prompt(parts: list[dict[str, Any]]) -> str:
    """Join the extraction prompt and every non-empty text part into one prompt string.

    Text parts are stripped and blank ones dropped. ``image_url`` parts cannot be
    sent to the text backend; they are ignored when text is also present.

    Returns:
        The extraction prompt followed by the text parts, separated by blank lines.

    Raises:
        RuntimeError: If the document has image parts but no usable text.
    """
    snippets: list[str] = []
    saw_image = False
    for part in parts:
        kind = part.get("type")
        if kind == "image_url":
            saw_image = True
            continue
        if kind != "text":
            continue
        snippet = str(part.get("text", "")).strip()
        if snippet:
            snippets.append(snippet)

    if saw_image and not snippets:
        raise RuntimeError(
            "The llama.cpp text backend cannot analyze image-based documents. "
            "Set LLAMACPP_VISION=1 with EXTRACTOR_BACKEND=llamacpp-gpu, or use "
            "EXTRACTOR_BACKEND=transformers for local vision extraction."
        )

    return "\n\n".join([EXTRACTION_PROMPT, *snippets])