Spaces:

pbhappliedsystems
/

quant-eval-agent-arena

Running on Zero

File size: 9,399 Bytes

813f493

# model_loader.py
# PBH Applied Systems — GGUF model loading via llama-cpp-python.
# ZeroGPU-safe: models are loaded inside @spaces.GPU decorated scope.
#
# The cu121 llama-cpp-python wheel links libllama.so against CUDA 12 SONAMEs
# (libcudart.so.12, libcublasLt.so.12). ZeroGPU runs CUDA 13 system-wide,
# so these files are not present in the system library paths.
#
# Fix: requirements.txt installs nvidia-cuda-runtime-cu12 and nvidia-cublas-cu12
# as Python packages that ship libcudart.so.12 and libcublas*.so.12 inside
# site-packages. model_loader.py finds them there and preloads them with
# ctypes.RTLD_GLOBAL before `from llama_cpp import Llama` fires.

import os
import glob
import ctypes
import logging
import site
import sysconfig
from pathlib import Path
from eval_data import MODELS, pair_is_feasible

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Jinja2 patch — register {% generation %} as a known extension.
# ---------------------------------------------------------------------------

def _patch_jinja2_generation_tag() -> None:
    try:
        import jinja2
        from jinja2 import nodes
        from jinja2.ext import Extension

        class _GenerationExtension(Extension):
            tags = frozenset(["generation"])

            def parse(self, parser):
                lineno = next(parser.stream).lineno
                body = parser.parse_statements(
                    ["name:endgeneration"], drop_needle=True
                )
                return nodes.Scope(body, lineno=lineno)

        _orig_init = jinja2.Environment.__init__

        def _patched_init(env_self, *args, **kwargs):
            exts = list(kwargs.get("extensions", []))
            ext_tags = {
                tag
                for ext in exts
                if isinstance(ext, type) and hasattr(ext, "tags")
                for tag in ext.tags
            }
            if "generation" not in ext_tags:
                exts.append(_GenerationExtension)
            kwargs["extensions"] = exts
            _orig_init(env_self, *args, **kwargs)

        jinja2.Environment.__init__ = _patched_init
        logger.info("Jinja2 {% generation %} extension registered.")

    except Exception as exc:
        logger.warning(f"Jinja2 generation tag patch failed (non-fatal): {exc}")


_patch_jinja2_generation_tag()


# ---------------------------------------------------------------------------
# spaces import
# ---------------------------------------------------------------------------

try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False
    class spaces:
        @staticmethod
        def GPU(fn=None, duration=None):
            if fn is not None:
                return fn
            def decorator(f):
                return f
            return decorator

from huggingface_hub import hf_hub_download

CACHE_DIR = Path(os.environ.get("HF_HOME", "/tmp/hf_cache")) / "pbh_gguf"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

DEFAULT_N_CTX = 8192
DEFAULT_N_GPU_LAYERS = -1
DEFAULT_N_THREADS = 4

CHAT_FORMAT_OVERRIDES = {
    "ministral-14b-instruct":  "mistral-instruct",
    "ministral-14b-reasoning": "mistral-instruct",
    "phi4-reasoning-plus":     "chatml",
    "mistral-nemo":            "mistral-instruct",
}

_model_cache: dict = {}


# ---------------------------------------------------------------------------
# CUDA runtime preload for llama-cpp-python cu121 wheel
# ---------------------------------------------------------------------------

def _iter_cuda_library_dirs() -> list:
    """Return candidate CUDA library directories visible to this process."""
    dirs = []

    def add(path):
        if path and os.path.isdir(path) and path not in dirs:
            dirs.append(path)

    # 1. Python-packaged NVIDIA CUDA 12 libraries from requirements.txt.
    site_roots = set(site.getsitepackages())
    user_site = site.getusersitepackages()
    if user_site:
        site_roots.add(user_site)
    purelib = sysconfig.get_paths().get("purelib")
    if purelib:
        site_roots.add(purelib)

    for root in site_roots:
        add(os.path.join(root, "nvidia", "cuda_runtime", "lib"))
        add(os.path.join(root, "nvidia", "cublas", "lib"))

    # 2. ZeroGPU/runtime CUDA locations.
    cuda_home = (
        os.environ.get("CUDA_HOME")
        or os.environ.get("CUDA_PATH")
        or os.environ.get("CUDADIR")
    )
    if cuda_home:
        add(os.path.join(cuda_home, "lib64"))

    # 3. Common system paths, including ZeroGPU's mounted CUDA image.
    for pattern in [
        "/cuda-image/usr/local/cuda*/lib64",
        "/usr/local/cuda*/lib64",
        "/usr/lib/x86_64-linux-gnu",
        "/usr/lib64",
    ]:
        for match in glob.glob(pattern):
            add(match)

    return dirs


def _prepend_ld_library_path(dirs: list) -> None:
    existing = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(":") if p]
    merged = []
    for path in dirs + existing:
        if path and path not in merged:
            merged.append(path)
    os.environ["LD_LIBRARY_PATH"] = ":".join(merged)


def _preload_shared_library(filename: str, dirs: list, required: bool = True) -> bool:
    """Load a shared library by absolute path with RTLD_GLOBAL."""
    for lib_dir in dirs:
        candidate = os.path.join(lib_dir, filename)
        if os.path.exists(candidate):
            try:
                ctypes.CDLL(candidate, mode=ctypes.RTLD_GLOBAL)
                logger.info("Preloaded %s from %s", filename, candidate)
                return True
            except Exception as e:
                logger.warning("Failed to preload %s from %s: %s", filename, candidate, e)
    if required:
        logger.warning("Required CUDA library not found: %s (searched: %s)", filename, dirs)
    return False


def _ensure_cuda_compat() -> None:
    """
    Ensure llama-cpp-python's cu121 wheel can import libllama.so.

    The cu121 wheel is linked against CUDA 12 SONAMEs such as libcudart.so.12.
    In a Gradio/ZeroGPU Space, the CUDA 12 runtime must be provided by Python
    packages in requirements.txt and preloaded before `from llama_cpp import Llama`.
    """
    lib_dirs = _iter_cuda_library_dirs()
    logger.info("CUDA library search dirs: %s", lib_dirs)
    _prepend_ld_library_path(lib_dirs)

    # Load dependency libraries explicitly with RTLD_GLOBAL so the dynamic
    # linker finds their symbols when loading libllama.so's DT_NEEDED entries.
    loaded_cudart = _preload_shared_library("libcudart.so.12", lib_dirs, required=True)
    _preload_shared_library("libcublasLt.so.12", lib_dirs, required=False)
    _preload_shared_library("libcublas.so.12", lib_dirs, required=False)

    if not loaded_cudart:
        logger.warning(
            "libcudart.so.12 was not preloaded. llama_cpp import may fail. "
            "Check that nvidia-cuda-runtime-cu12 is installed."
        )


# ---------------------------------------------------------------------------
# GGUF download
# ---------------------------------------------------------------------------

def _download_gguf(model_key: str) -> str:
    m = MODELS[model_key]
    logger.info(f"Downloading {m['hf_filename']} from {m['hf_repo']}...")
    local_path = hf_hub_download(
        repo_id=m["hf_repo"],
        filename=m["hf_filename"],
        cache_dir=str(CACHE_DIR),
    )
    logger.info(f"GGUF at: {local_path}")
    return local_path


# ---------------------------------------------------------------------------
# Model loading — lazy llama_cpp import inside @spaces.GPU scope
# ---------------------------------------------------------------------------

_cuda_compat_done = False


def load_model(model_key: str, n_ctx: int = DEFAULT_N_CTX):
    """
    Load a GGUF model by key. Returns cached instance if already loaded.
    Must be called within a @spaces.GPU decorated function on ZeroGPU.
    """
    global _cuda_compat_done

    if model_key in _model_cache:
        logger.info(f"Cache hit: {model_key}")
        return _model_cache[model_key]

    if model_key not in MODELS:
        raise ValueError(f"Unknown model key: {model_key}")

    # Preload CUDA 12 runtime libs before first llama_cpp import
    if not _cuda_compat_done:
        _ensure_cuda_compat()
        _cuda_compat_done = True

    # Lazy import — runs inside @spaces.GPU where GPU is allocated
    from llama_cpp import Llama

    m = MODELS[model_key]
    logger.info(f"Loading {m['display_name']} (n_ctx={n_ctx})...")

    gguf_path = _download_gguf(model_key)
    chat_format = CHAT_FORMAT_OVERRIDES.get(model_key, None)

    llm = Llama(
        model_path=gguf_path,
        n_ctx=n_ctx,
        n_gpu_layers=DEFAULT_N_GPU_LAYERS,
        n_threads=DEFAULT_N_THREADS,
        verbose=False,
        flash_attn=True,
        chat_format=chat_format,
    )

    _model_cache[model_key] = llm
    logger.info(
        f"Loaded and cached: {model_key} "
        f"(chat_format={chat_format or 'auto'})"
    )
    return llm


def validate_pair(model_key_a: str, model_key_b: str) -> tuple:
    return pair_is_feasible(model_key_a, model_key_b)


def get_model_n_ctx(model_key: str) -> int:
    if model_key == "qwen2.5-14b-1m":
        return 8192
    m = MODELS.get(model_key, {})
    return min(DEFAULT_N_CTX, m.get("context_window", DEFAULT_N_CTX))