Spaces:
Running on Zero
Running on Zero
# model_loader.py
# PBH Applied Systems — GGUF model loading via llama-cpp-python.
# ZeroGPU-safe: models are loaded inside a @spaces.GPU-decorated scope.
#
# The cu121 llama-cpp-python wheel links libllama.so against CUDA 12 SONAMEs
# (libcudart.so.12, libcublasLt.so.12). ZeroGPU runs CUDA 13 system-wide,
# so these files are not present in the system library paths.
#
# Fix: requirements.txt installs nvidia-cuda-runtime-cu12 and nvidia-cublas-cu12
# as Python packages that ship libcudart.so.12 and libcublas*.so.12 inside
# site-packages. This module finds them there and preloads them with
# ctypes.RTLD_GLOBAL before `from llama_cpp import Llama` runs.
| import os | |
| import glob | |
| import ctypes | |
| import logging | |
| import site | |
| import sysconfig | |
| from pathlib import Path | |
| from eval_data import MODELS, pair_is_feasible | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Jinja2 patch — register {% generation %} as a known extension. | |
| # --------------------------------------------------------------------------- | |
def _patch_jinja2_generation_tag() -> None:
    """Make every new ``jinja2.Environment`` understand ``{% generation %}``.

    Wraps ``jinja2.Environment.__init__`` so that, once the original
    constructor has run, an extension handling the ``generation`` block tag
    is added unless one of the environment's loaded extensions already
    declares that tag. The tag's body (up to ``{% endgeneration %}``) is
    parsed as ordinary statements and wrapped in a ``Scope`` node.

    The patch is best-effort: any failure (including jinja2 not being
    installed) is logged as a warning and otherwise ignored. Calling this
    function more than once installs the wrapper only once.
    """
    try:
        import functools

        import jinja2
        from jinja2 import nodes
        from jinja2.ext import Extension

        # Already wrapped by an earlier call: do not stack another wrapper.
        if getattr(jinja2.Environment.__init__, "_generation_tag_patched", False):
            return

        class _GenerationExtension(Extension):
            tags = frozenset(["generation"])

            def parse(self, parser):
                # Consume the `generation` token, then parse up to the
                # matching `endgeneration` and drop that closing token.
                lineno = next(parser.stream).lineno
                body = parser.parse_statements(
                    ["name:endgeneration"], drop_needle=True
                )
                return nodes.Scope(body, lineno=lineno)

        _orig_init = jinja2.Environment.__init__

        @functools.wraps(_orig_init)
        def _patched_init(env_self, *args, **kwargs):
            # Forward the caller's arguments untouched. Injecting an
            # `extensions` keyword here would collide with callers that pass
            # extensions positionally (jinja2.Template does so when it builds
            # its own environment) and raise "got multiple values".
            _orig_init(env_self, *args, **kwargs)
            # After init, `extensions` holds loaded instances regardless of
            # whether they were given as classes or as import strings.
            already_handled = any(
                "generation" in getattr(ext, "tags", ())
                for ext in env_self.extensions.values()
            )
            if not already_handled:
                env_self.add_extension(_GenerationExtension)

        _patched_init._generation_tag_patched = True
        jinja2.Environment.__init__ = _patched_init
        logger.info("Jinja2 {% generation %} extension registered.")
    except Exception as exc:
        # Deliberately broad: this runs at import time and must never
        # prevent the module from loading.
        logger.warning(f"Jinja2 generation tag patch failed (non-fatal): {exc}")


_patch_jinja2_generation_tag()
| # --------------------------------------------------------------------------- | |
| # spaces import | |
| # --------------------------------------------------------------------------- | |
try:
    import spaces

    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class spaces:
        """Stand-in for the ``spaces`` package when it is not installed.

        Only the ``GPU`` decorator is provided, and it does nothing.
        """

        @staticmethod
        def GPU(fn=None, duration=None, **_options):
            """Return the decorated function unchanged.

            Supports both ``@spaces.GPU`` and ``@spaces.GPU(duration=...)``.
            ``duration`` and any further keyword options are accepted and
            ignored, so a decorator call that passes extra options does not
            raise ``TypeError`` when running without the real package.
            """
            if fn is not None:
                # Bare usage: @spaces.GPU
                return fn

            # Parameterised usage: @spaces.GPU(...)
            def decorator(f):
                return f

            return decorator
| from huggingface_hub import hf_hub_download | |
# GGUF download cache: $HF_HOME/pbh_gguf, or /tmp/hf_cache/pbh_gguf when
# HF_HOME is unset. The directory is created at import time.
CACHE_DIR = Path(os.environ.get("HF_HOME", "/tmp/hf_cache"), "pbh_gguf")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Defaults handed to the Llama constructor by load_model().
DEFAULT_N_CTX = 8192
DEFAULT_N_GPU_LAYERS = -1  # llama-cpp-python: -1 offloads all layers to the GPU
DEFAULT_N_THREADS = 4

# Model keys whose chat format is forced instead of left to auto-detection.
CHAT_FORMAT_OVERRIDES = {
    "ministral-14b-instruct": "mistral-instruct",
    "ministral-14b-reasoning": "mistral-instruct",
    "phi4-reasoning-plus": "chatml",
    "mistral-nemo": "mistral-instruct",
}

# Loaded models, keyed by model key (filled by load_model()).
_model_cache: dict = dict()
| # --------------------------------------------------------------------------- | |
| # CUDA runtime preload for llama-cpp-python cu121 wheel | |
| # --------------------------------------------------------------------------- | |
def _iter_cuda_library_dirs() -> list:
    """Return existing candidate CUDA library directories, in priority order.

    The order is deterministic:

    1. ``nvidia/cuda_runtime/lib`` and ``nvidia/cublas/lib`` under each
       site-packages root (global, user, then sysconfig purelib/platlib).
    2. ``lib64`` under the first of ``CUDA_HOME`` / ``CUDA_PATH`` /
       ``CUDADIR`` that is set.
    3. Common system locations, with glob matches sorted by name.

    Only directories that exist are returned, and each appears once.
    """
    dirs = []

    def add(path):
        if path and os.path.isdir(path) and path not in dirs:
            dirs.append(path)

    # 1. Python-packaged NVIDIA CUDA 12 libraries from requirements.txt.
    # A list (not a set) keeps the search order stable across runs; set
    # iteration order for strings changes with hash randomization.
    site_roots = []
    # getattr guards: not every environment's `site` module provides these.
    get_site_packages = getattr(site, "getsitepackages", None)
    if get_site_packages is not None:
        site_roots.extend(get_site_packages())
    get_user_site = getattr(site, "getusersitepackages", None)
    user_site = get_user_site() if get_user_site is not None else None
    if user_site:
        site_roots.append(user_site)
    install_paths = sysconfig.get_paths()
    # platlib is where binary wheels are installed; it can differ from purelib.
    for key in ("purelib", "platlib"):
        root = install_paths.get(key)
        if root:
            site_roots.append(root)
    for root in site_roots:
        add(os.path.join(root, "nvidia", "cuda_runtime", "lib"))
        add(os.path.join(root, "nvidia", "cublas", "lib"))

    # 2. ZeroGPU/runtime CUDA locations.
    cuda_home = (
        os.environ.get("CUDA_HOME")
        or os.environ.get("CUDA_PATH")
        or os.environ.get("CUDADIR")
    )
    if cuda_home:
        add(os.path.join(cuda_home, "lib64"))

    # 3. Common system paths, including ZeroGPU's mounted CUDA image.
    for pattern in [
        "/cuda-image/usr/local/cuda*/lib64",
        "/usr/local/cuda*/lib64",
        "/usr/lib/x86_64-linux-gnu",
        "/usr/lib64",
    ]:
        # glob order depends on the filesystem; sort for a stable result.
        for match in sorted(glob.glob(pattern)):
            add(match)

    return dirs
def _prepend_ld_library_path(dirs: list) -> None:
    """Put *dirs* at the front of ``LD_LIBRARY_PATH`` in ``os.environ``.

    Entries already present in the variable are kept after *dirs*. Empty
    entries are dropped and duplicates keep only their first occurrence.
    The variable is always (re)written, even when the result is empty.
    """
    current = os.environ.get("LD_LIBRARY_PATH", "")
    already_set = [entry for entry in current.split(":") if entry]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered_unique = dict.fromkeys(
        entry for entry in [*dirs, *already_set] if entry
    )
    os.environ["LD_LIBRARY_PATH"] = ":".join(ordered_unique)
def _preload_shared_library(filename: str, dirs: list, required: bool = True) -> bool:
    """Load *filename* from the first directory in *dirs* where it loads.

    Each directory is tried in order. A candidate that exists but fails to
    load is logged and the search continues with the next directory.

    Args:
        filename: Library file name, e.g. ``"libcudart.so.12"``.
        dirs: Directories to search, highest priority first.
        required: When true, a warning is logged if nothing was loaded.

    Returns:
        True if a copy was loaded with ``RTLD_GLOBAL``, otherwise False.
        This function never raises for a missing or unloadable library.
    """
    load_failed = False
    for lib_dir in dirs:
        candidate = os.path.join(lib_dir, filename)
        # isfile (not exists) so a directory with the same name is skipped.
        if not os.path.isfile(candidate):
            continue
        try:
            ctypes.CDLL(candidate, mode=ctypes.RTLD_GLOBAL)
        except OSError as exc:
            # ctypes reports loader failures as OSError.
            load_failed = True
            logger.warning("Failed to preload %s from %s: %s", filename, candidate, exc)
            continue
        logger.info("Preloaded %s from %s", filename, candidate)
        return True

    if required:
        if load_failed:
            # Distinguish "present but broken" from "absent" in the log.
            logger.warning(
                "Required CUDA library found but could not be loaded: %s (searched: %s)",
                filename,
                dirs,
            )
        else:
            logger.warning(
                "Required CUDA library not found: %s (searched: %s)", filename, dirs
            )
    return False
def _ensure_cuda_compat() -> None:
    """Preload the CUDA 12 runtime libraries needed by the cu121 wheel.

    The cu121 llama-cpp-python wheel links libllama.so against CUDA 12
    SONAMEs such as libcudart.so.12. This function collects the candidate
    library directories, prepends them to ``LD_LIBRARY_PATH``, and loads
    libcudart, libcublasLt and libcublas (in that order) with RTLD_GLOBAL.
    It is meant to run before ``from llama_cpp import Llama``.

    Only libcudart is treated as required; if it cannot be preloaded a
    warning is logged. Nothing is raised.
    """
    search_dirs = _iter_cuda_library_dirs()
    logger.info("CUDA library search dirs: %s", search_dirs)
    _prepend_ld_library_path(search_dirs)

    # Load the dependencies explicitly with RTLD_GLOBAL so their symbols are
    # available when libllama.so's DT_NEEDED entries are resolved.
    cudart_ok = _preload_shared_library("libcudart.so.12", search_dirs, required=True)
    for optional_lib in ("libcublasLt.so.12", "libcublas.so.12"):
        _preload_shared_library(optional_lib, search_dirs, required=False)

    if cudart_ok:
        return
    logger.warning(
        "libcudart.so.12 was not preloaded. llama_cpp import may fail. "
        "Check that nvidia-cuda-runtime-cu12 is installed."
    )
| # --------------------------------------------------------------------------- | |
| # GGUF download | |
| # --------------------------------------------------------------------------- | |
def _download_gguf(model_key: str) -> str:
    """Fetch the GGUF file for *model_key* and return its local path.

    The repository and file name come from ``MODELS[model_key]`` (keys
    ``hf_repo`` and ``hf_filename``). The download goes through
    ``hf_hub_download`` with ``CACHE_DIR`` as its cache directory.

    Raises:
        KeyError: if *model_key* is not in ``MODELS``.
    """
    spec = MODELS[model_key]
    filename = spec["hf_filename"]
    repo_id = spec["hf_repo"]
    logger.info(f"Downloading {filename} from {repo_id}...")
    gguf_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=str(CACHE_DIR),
    )
    logger.info(f"GGUF at: {gguf_path}")
    return gguf_path
| # --------------------------------------------------------------------------- | |
| # Model loading — lazy llama_cpp import inside @spaces.GPU scope | |
| # --------------------------------------------------------------------------- | |
# Set once the CUDA 12 preload has been attempted, so it runs at most once.
_cuda_compat_done = False


def load_model(model_key: str, n_ctx: int = DEFAULT_N_CTX):
    """Return the ``Llama`` instance for *model_key*, loading it on first use.

    Must be called within a @spaces.GPU decorated function on ZeroGPU.

    Loaded models are cached by *model_key* only, so on a cache hit the
    existing instance is returned and *n_ctx* is not consulted.

    Args:
        model_key: Key into ``MODELS``.
        n_ctx: Context size passed to ``Llama`` when the model is created.

    Raises:
        ValueError: if *model_key* is not cached and not a key of ``MODELS``.
    """
    global _cuda_compat_done

    try:
        cached = _model_cache[model_key]
    except KeyError:
        pass
    else:
        logger.info(f"Cache hit: {model_key}")
        return cached

    if model_key not in MODELS:
        raise ValueError(f"Unknown model key: {model_key}")

    # The CUDA 12 runtime libraries must be preloaded before llama_cpp is
    # imported for the first time.
    if not _cuda_compat_done:
        _ensure_cuda_compat()
        _cuda_compat_done = True

    # Imported lazily so it happens inside the @spaces.GPU scope, where the
    # GPU is allocated.
    from llama_cpp import Llama

    spec = MODELS[model_key]
    logger.info(f"Loading {spec['display_name']} (n_ctx={n_ctx})...")
    gguf_path = _download_gguf(model_key)

    # None leaves the chat format to llama-cpp-python.
    chat_format = CHAT_FORMAT_OVERRIDES.get(model_key)
    llama_options = dict(
        model_path=gguf_path,
        n_ctx=n_ctx,
        n_gpu_layers=DEFAULT_N_GPU_LAYERS,
        n_threads=DEFAULT_N_THREADS,
        verbose=False,
        flash_attn=True,
        chat_format=chat_format,
    )
    llm = Llama(**llama_options)

    _model_cache[model_key] = llm
    logger.info(
        f"Loaded and cached: {model_key} "
        f"(chat_format={chat_format or 'auto'})"
    )
    return llm
def validate_pair(model_key_a: str, model_key_b: str) -> tuple:
    """Return ``pair_is_feasible(model_key_a, model_key_b)`` unchanged.

    Thin pass-through to ``eval_data.pair_is_feasible``; the contents of the
    returned tuple are defined there.
    """
    feasibility = pair_is_feasible(model_key_a, model_key_b)
    return feasibility
def get_model_n_ctx(model_key: str) -> int:
    """Return the context size to use for *model_key*.

    ``"qwen2.5-14b-1m"`` always gets 8192. Any other key gets the smaller of
    ``DEFAULT_N_CTX`` and the model's ``context_window`` entry in ``MODELS``;
    unknown keys and entries without ``context_window`` get ``DEFAULT_N_CTX``.
    """
    # Hard-coded value for this one model, independent of MODELS.
    if model_key == "qwen2.5-14b-1m":
        return 8192

    spec = MODELS.get(model_key, {})
    window = spec.get("context_window", DEFAULT_N_CTX)
    return window if window < DEFAULT_N_CTX else DEFAULT_N_CTX