blood-test-explainer / src /extraction /llamacpp_gpu.py
Codex
Select llama.cpp GGUF on CPU Basic Spaces
f9ab250
Raw
History Blame Contribute Delete
11.1 kB
"""ZeroGPU extraction via the **llama.cpp** runtime (earns the 🦙 Llama Champion badge).
Text-only by default. Set LLAMACPP_VISION=1 to run the same PDF/image vision pipeline as
Transformers (GGUF + mmproj through llama-cpp-python).
Config (env):
EXTRACTOR_BACKEND=llamacpp-gpu
LLAMACPP_VISION=1 enable vision (PDF/image uploads)
LLAMACPP_GGUF_REPO HF repo with GGUF weights
LLAMACPP_MODEL_FILE GGUF filename
LLAMACPP_MMPROJ_FILE mmproj filename (required when vision is on)
LLAMACPP_CHAT_HANDLER default MiniCPMv26ChatHandler
LLAMACPP_MAX_TOKENS default 3072
LLAMACPP_N_CTX default 8192
LLAMACPP_N_GPU_LAYERS default 0 (CPU wheels); set to -1 on CUDA builds to offload all layers
"""
from __future__ import annotations
import os
import time
from functools import lru_cache
from typing import Any
from src.document_processing import document_intake_metadata, document_to_payload_parts
from src.extraction.llamacpp_vision import (
DEFAULT_CHAT_HANDLER,
DEFAULT_MMPROJ_FILE,
download_hf_file,
llamacpp_vision_enabled,
load_vision_llama,
)
from src.openbmb_client import (
EXTRACTION_PROMPT,
ExtractionResult,
_normalize_notes,
_normalize_patient,
_normalize_tests,
_parse_json_response,
summarize_document_parts,
)
from src.space_runtime import is_cpu_basic_space
# Fallback Hugging Face repo and GGUF filename, used when LLAMACPP_GGUF_REPO /
# LLAMACPP_MODEL_FILE are not set in the environment.
DEFAULT_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
DEFAULT_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"
try:
    import spaces
except ImportError:  # Local dev without the HF Spaces package.

    class _SpacesFallback:
        """No-op stand-in for the ``spaces`` module when it is not installed."""

        @staticmethod
        def GPU(*_args: Any, **_kwargs: Any):
            """Mimic ``spaces.GPU`` without doing anything.

            Supports both decorator spellings:

            * ``@spaces.GPU(duration=...)`` -> returns an identity decorator.
            * ``@spaces.GPU`` (bare)        -> returns the function unchanged.
            """
            # Bare usage: the decorated function arrives as the only argument.
            # Without this branch the function would be replaced by the inner
            # ``decorator`` closure instead of being left intact.
            if len(_args) == 1 and not _kwargs and callable(_args[0]):
                return _args[0]

            def decorator(func):
                return func

            return decorator

    spaces = _SpacesFallback()  # type: ignore[assignment]
class LlamaCppGPUExtractor:
    """Extractor that runs the GGUF through llama.cpp on ZeroGPU.

    Settings come from ``LLAMACPP_*`` environment variables read once at
    construction. When ``is_cpu_basic_space()`` reports a CPU Basic Space the
    undecorated ``*_cpu`` runners are used instead of the ``spaces.GPU`` ones.
    """

    def __init__(self) -> None:
        """Read the backend configuration from the environment.

        Raises:
            ValueError: If an integer setting is set to a non-integer value,
                or if vision is enabled but the mmproj filename is empty.
        """
        self.repo = os.getenv("LLAMACPP_GGUF_REPO", DEFAULT_GGUF_REPO).strip()
        self.model_file = os.getenv("LLAMACPP_MODEL_FILE", DEFAULT_MODEL_FILE).strip()
        self.mmproj_file = os.getenv("LLAMACPP_MMPROJ_FILE", DEFAULT_MMPROJ_FILE).strip()
        self.chat_handler = os.getenv("LLAMACPP_CHAT_HANDLER", DEFAULT_CHAT_HANDLER).strip()
        self.max_tokens = self._env_int("LLAMACPP_MAX_TOKENS", 3072)
        self.n_ctx = self._env_int("LLAMACPP_N_CTX", 8192)
        self.n_gpu_layers = self._env_int("LLAMACPP_N_GPU_LAYERS", 0)
        self.vision_enabled = llamacpp_vision_enabled()
        self.use_spaces_gpu = not is_cpu_basic_space()
        if self.vision_enabled and not self.mmproj_file:
            raise ValueError("LLAMACPP_VISION=1 requires LLAMACPP_MMPROJ_FILE.")

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Return the integer value of env var *name*, or *default* if unset/blank.

        Raises:
            ValueError: If the variable is set to something ``int()`` rejects;
                the message names the variable so misconfiguration is easy to spot.
        """
        raw = os.getenv(name, "").strip()
        if not raw:
            # A variable that is defined but left empty is treated as "not set".
            return default
        try:
            return int(raw)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {raw!r}.") from exc

    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
        """Run extraction on *file_path* and return the normalized result.

        Args:
            file_path: Document to process; passed to ``document_to_payload_parts``.
            max_pages: Page limit forwarded to ``document_to_payload_parts``.

        Returns:
            An ``ExtractionResult`` holding the normalized patient/tests/notes,
            the raw model response, and a request summary (backend label,
            model info, timing, and mode-specific details).
        """
        parts = document_to_payload_parts(file_path, max_pages=max_pages)
        started = time.perf_counter()
        if self.vision_enabled:
            runner = (
                _run_llamacpp_vision_generation
                if self.use_spaces_gpu
                else _run_llamacpp_vision_generation_cpu
            )
            raw = runner(
                parts=parts,
                repo=self.repo,
                model_file=self.model_file,
                mmproj_file=self.mmproj_file,
                chat_handler=self.chat_handler,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu-vision" if self.use_spaces_gpu else "llamacpp-cpu-vision"
            mode_details: dict[str, Any] = {
                "mmproj": self.mmproj_file,
                "chat_handler": self.chat_handler,
            }
        else:
            # Prompt composition happens after the timer starts, so it is
            # included in duration_ms.
            prompt_text = _compose_prompt(parts)
            runner = (
                _run_llamacpp_generation
                if self.use_spaces_gpu
                else _run_llamacpp_generation_cpu
            )
            raw = runner(
                prompt_text=prompt_text,
                repo=self.repo,
                model_file=self.model_file,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu" if self.use_spaces_gpu else "llamacpp-cpu"
            mode_details = {"composed_prompt": prompt_text}
        duration_ms = int((time.perf_counter() - started) * 1000)
        parsed = _parse_json_response(raw)
        summary = {
            "backend": backend,
            "repo": self.repo,
            "model": self.model_file,
            "vision_enabled": self.vision_enabled,
            "spaces_gpu": self.use_spaces_gpu,
            "document_parts": len(parts),
            "max_pages": max_pages,
            "extraction_prompt": EXTRACTION_PROMPT,
            "user_message_preview": summarize_document_parts(parts),
            **document_intake_metadata(file_path, parts),
            # Listed after the intake metadata so these values win on key clashes.
            "return_code": 0,
            "duration_ms": duration_ms,
            **mode_details,
        }
        return ExtractionResult(
            patient=_normalize_patient(parsed.get("patient", {})),
            tests=_normalize_tests(parsed.get("tests", [])),
            notes=_normalize_notes(parsed.get("notes", [])),
            raw_response=raw,
            request_summary=summary,
        )
@lru_cache(maxsize=1)
def _load_text(model_path: str, n_ctx: int, n_gpu_layers: int):
    """Construct the text-only ``Llama`` model for the given settings.

    Wrapped in a single-slot ``lru_cache``: calling again with the same
    arguments returns the cached instance, while a different argument
    combination replaces it.
    """
    # Imported here so llama_cpp is only required once a model is loaded.
    from llama_cpp import Llama

    llama_options = {
        "model_path": model_path,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
        "verbose": False,
    }
    return Llama(**llama_options)
def _vision_messages(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Wrap the document parts in a single user chat message.

    The message content starts with the extraction prompt as a text part,
    followed by every entry of *parts* in order.
    """
    user_content = [{"type": "text", "text": EXTRACTION_PROMPT}]
    user_content.extend(parts)
    return [{"role": "user", "content": user_content}]
def _raise_generation_error(exc: Exception, *, vision: bool) -> RuntimeError:
message = f"{type(exc).__name__}: {exc}"
if "llama_decode returned -1" in message:
detail = (
"llama.cpp ran out of room while decoding the vision prompt. "
if vision
else "llama.cpp ran out of room while decoding the PDF text prompt. "
)
raise RuntimeError(
detail
+ "Try increasing LLAMACPP_N_CTX, lowering the number of PDF pages, or trimming the "
"input before sending it to the model."
) from exc
model_label = "vision GGUF + mmproj" if vision else "text-only GGUF"
raise RuntimeError(
f"The llama.cpp backend could not complete extraction with the {model_label} model. "
f"Inner error: {message}"
) from exc
@spaces.GPU(duration=600)
def _run_llamacpp_vision_generation(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Vision extraction entry point decorated with ``spaces.GPU(duration=600)``.

    Forwards every argument unchanged to ``_run_llamacpp_vision_generation_cpu``
    and returns its result.
    """
    forwarded = {
        "parts": parts,
        "repo": repo,
        "model_file": model_file,
        "mmproj_file": mmproj_file,
        "chat_handler": chat_handler,
        "max_tokens": max_tokens,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
    }
    return _run_llamacpp_vision_generation_cpu(**forwarded)
def _run_llamacpp_vision_generation_cpu(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download, load and query the vision GGUF + mmproj model.

    Runs in three stages (download, load, chat completion), each wrapping its
    failures in a ``RuntimeError`` that says which stage broke.

    Returns:
        The content of the first choice's message, or ``"{}"`` when that
        content is missing or empty.

    Raises:
        RuntimeError: If downloading, loading, or generation fails.
    """
    # Stage 1: fetch both model assets from the repo.
    try:
        model_path = download_hf_file(repo, model_file)
        mmproj_path = download_hf_file(repo, mmproj_file)
    except Exception as download_error:
        inner = f"{type(download_error).__name__}: {download_error}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the vision GGUF assets: " + inner
        ) from download_error

    # Stage 2: build the llama.cpp model with its chat handler.
    try:
        llm = load_vision_llama(model_path, mmproj_path, n_ctx, n_gpu_layers, chat_handler)
    except Exception as load_error:
        inner = f"{type(load_error).__name__}: {load_error}"
        raise RuntimeError(
            "The llama.cpp backend could not load the vision GGUF + mmproj model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model files are incomplete. "
            "Inner error: " + inner
        ) from load_error

    # Stage 3: one JSON-mode chat completion at temperature 0.0. Response
    # unpacking stays inside the try so malformed responses are wrapped too.
    try:
        completion = llm.create_chat_completion(
            messages=_vision_messages(parts),
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        first_message = completion["choices"][0]["message"]
        content = first_message.get("content")
        return content if content else "{}"
    except Exception as generation_error:
        raise _raise_generation_error(generation_error, vision=True) from generation_error
@spaces.GPU(duration=600)
def _run_llamacpp_generation(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Text-only extraction entry point decorated with ``spaces.GPU(duration=600)``.

    Forwards every argument unchanged to ``_run_llamacpp_generation_cpu`` and
    returns its result.
    """
    forwarded = {
        "prompt_text": prompt_text,
        "repo": repo,
        "model_file": model_file,
        "max_tokens": max_tokens,
        "n_ctx": n_ctx,
        "n_gpu_layers": n_gpu_layers,
    }
    return _run_llamacpp_generation_cpu(**forwarded)
def _run_llamacpp_generation_cpu(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download, load and query the text-only GGUF model.

    Runs in three stages (download, load, chat completion), each wrapping its
    failures in a ``RuntimeError`` that says which stage broke.

    Returns:
        The content of the first choice's message, or ``"{}"`` when that
        content is missing or empty.

    Raises:
        RuntimeError: If downloading, loading, or generation fails.
    """
    # Stage 1: fetch the GGUF weights.
    try:
        model_path = download_hf_file(repo, model_file)
    except Exception as download_error:
        inner = f"{type(download_error).__name__}: {download_error}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the GGUF model: " + inner
        ) from download_error

    # Stage 2: obtain the model through the cached loader.
    try:
        llm = _load_text(model_path, n_ctx, n_gpu_layers)
    except Exception as load_error:
        inner = f"{type(load_error).__name__}: {load_error}"
        raise RuntimeError(
            "The llama.cpp backend could not load the text-only GGUF model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model file is incomplete. "
            "Inner error: " + inner
        ) from load_error

    # Stage 3: one JSON-mode chat completion at temperature 0.0. Response
    # unpacking stays inside the try so malformed responses are wrapped too.
    try:
        user_turn = {"role": "user", "content": prompt_text}
        completion = llm.create_chat_completion(
            messages=[user_turn],
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        first_message = completion["choices"][0]["message"]
        content = first_message.get("content")
        return content if content else "{}"
    except Exception as generation_error:
        raise _raise_generation_error(generation_error, vision=False) from generation_error
def _compose_prompt(parts: list[dict[str, Any]]) -> str:
    """Flatten document parts into a single text prompt.

    The extraction prompt comes first, followed by the stripped, non-empty
    text of every ``"text"`` part, all joined by blank lines. ``"image_url"``
    parts contribute nothing to the prompt.

    Raises:
        RuntimeError: If the parts include at least one image and no usable
            text, since the text-only backend has nothing to read.
    """
    sections: list[str] = [EXTRACTION_PROMPT]
    saw_image = False
    for item in parts:
        kind = item.get("type")
        if kind == "text":
            cleaned = str(item.get("text", "")).strip()
            if cleaned:
                sections.append(cleaned)
            continue
        if kind == "image_url":
            saw_image = True

    # Only the extraction prompt is present -> no document text was found.
    nothing_but_prompt = len(sections) == 1
    if saw_image and nothing_but_prompt:
        raise RuntimeError(
            "The llama.cpp text backend cannot analyze image-based documents. "
            "Set LLAMACPP_VISION=1 with EXTRACTOR_BACKEND=llamacpp-gpu, or use "
            "EXTRACTOR_BACKEND=transformers for local vision extraction."
        )
    return "\n\n".join(sections)