Spaces:

build-small-hackathon
/

blood-test-explainer

Running on Zero

blood-test-explainer / src /extraction /llamacpp_gpu.py

Codex

Select llama.cpp GGUF on CPU Basic Spaces

f9ab250 21 days ago

11.1 kB

	"""ZeroGPU extraction via the llama.cpp runtime (earns the 🦙 Llama Champion badge).

	Text-only by default. Set LLAMACPP_VISION=1 to run the same PDF/image vision pipeline as
	Transformers (GGUF + mmproj through llama-cpp-python).

	Config (env):
	    EXTRACTOR_BACKEND=llamacpp-gpu
	    LLAMACPP_VISION=1        enable vision (PDF/image uploads)
	    LLAMACPP_GGUF_REPO       HF repo with GGUF weights
	    LLAMACPP_MODEL_FILE      GGUF filename
	    LLAMACPP_MMPROJ_FILE     mmproj filename (required when vision is on)
	    LLAMACPP_CHAT_HANDLER    default MiniCPMv26ChatHandler
	    LLAMACPP_MAX_TOKENS      default 3072
	    LLAMACPP_N_CTX           default 8192
	    LLAMACPP_N_GPU_LAYERS    default 0 for CPU wheels; use -1 on CUDA builds
	"""

	from __future__ import annotations

	import os
	import time
	from functools import lru_cache
	from typing import Any

	from src.document_processing import document_intake_metadata, document_to_payload_parts
	from src.extraction.llamacpp_vision import (
	DEFAULT_CHAT_HANDLER,
	DEFAULT_MMPROJ_FILE,
	download_hf_file,
	llamacpp_vision_enabled,
	load_vision_llama,
	)
	from src.openbmb_client import (
	EXTRACTION_PROMPT,
	ExtractionResult,
	_normalize_notes,
	_normalize_patient,
	_normalize_tests,
	_parse_json_response,
	summarize_document_parts,
	)
	from src.space_runtime import is_cpu_basic_space

	# Fallback HF repo holding the GGUF weights; used when LLAMACPP_GGUF_REPO is unset.
	DEFAULT_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
	# Fallback GGUF filename inside that repo; used when LLAMACPP_MODEL_FILE is unset.
	DEFAULT_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"

	try:
	    import spaces
	except ImportError:  # Local dev without the HF Spaces package.

	    class _SpacesFallback:
	        """Minimal stand-in for the ``spaces`` module when it is not installed."""

	        @staticmethod
	        def GPU(*args: Any, **kwargs: Any):
	            """Mimic ``spaces.GPU`` with a decorator that leaves functions untouched.

	            Works both as ``@spaces.GPU`` and as ``@spaces.GPU(duration=...)``.
	            All positional and keyword options are accepted and ignored, so the
	            ``@spaces.GPU(duration=600)`` decorations in this module import
	            cleanly without the real package.
	            """
	            # Bare usage: the decorated function arrives as the sole argument.
	            if len(args) == 1 and callable(args[0]) and not kwargs:
	                return args[0]

	            def decorator(func):
	                return func

	            return decorator

	    spaces = _SpacesFallback()  # type: ignore[assignment]


	class LlamaCppGPUExtractor:
	    """Document extractor that drives a GGUF model through llama.cpp.

	    Settings come from ``LLAMACPP_*`` environment variables read at
	    construction time. The ``spaces.GPU``-decorated runners are used unless
	    ``is_cpu_basic_space()`` is truthy, in which case the plain ``*_cpu``
	    runners are called directly.
	    """

	    def __init__(self) -> None:
	        """Capture the runtime configuration from the environment.

	        Raises:
	            ValueError: If vision is enabled while the mmproj filename is
	                empty, or if a numeric variable is not a valid ``int``.
	        """
	        env = os.getenv

	        # Model assets and chat handler (whitespace-trimmed strings).
	        self.repo = env("LLAMACPP_GGUF_REPO", DEFAULT_GGUF_REPO).strip()
	        self.model_file = env("LLAMACPP_MODEL_FILE", DEFAULT_MODEL_FILE).strip()
	        self.mmproj_file = env("LLAMACPP_MMPROJ_FILE", DEFAULT_MMPROJ_FILE).strip()
	        self.chat_handler = env("LLAMACPP_CHAT_HANDLER", DEFAULT_CHAT_HANDLER).strip()

	        # Numeric generation / runtime knobs.
	        self.max_tokens = int(env("LLAMACPP_MAX_TOKENS", "3072"))
	        self.n_ctx = int(env("LLAMACPP_N_CTX", "8192"))
	        self.n_gpu_layers = int(env("LLAMACPP_N_GPU_LAYERS", "0"))

	        # Feature switches derived from helper modules.
	        self.vision_enabled = llamacpp_vision_enabled()
	        self.use_spaces_gpu = not is_cpu_basic_space()

	        if not self.mmproj_file and self.vision_enabled:
	            raise ValueError("LLAMACPP_VISION=1 requires LLAMACPP_MMPROJ_FILE.")

	    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
	        """Run the model on one document and return the normalized result.

	        Args:
	            file_path: Path of the document handed to ``document_to_payload_parts``.
	            max_pages: Page limit forwarded to ``document_to_payload_parts``.

	        Returns:
	            An ``ExtractionResult`` holding the normalized patient/tests/notes,
	            the raw model output, and a request summary dict.
	        """
	        parts = document_to_payload_parts(file_path, max_pages=max_pages)
	        on_gpu = self.use_spaces_gpu
	        # Arguments shared by the text and vision runners.
	        shared_kwargs = {
	            "repo": self.repo,
	            "model_file": self.model_file,
	            "max_tokens": self.max_tokens,
	            "n_ctx": self.n_ctx,
	            "n_gpu_layers": self.n_gpu_layers,
	        }

	        # The timer covers prompt composition (text mode) and generation only.
	        t0 = time.perf_counter()
	        if self.vision_enabled:
	            if on_gpu:
	                generate = _run_llamacpp_vision_generation
	            else:
	                generate = _run_llamacpp_vision_generation_cpu
	            raw = generate(
	                parts=parts,
	                mmproj_file=self.mmproj_file,
	                chat_handler=self.chat_handler,
	                **shared_kwargs,
	            )
	            backend = "llamacpp-gpu-vision" if on_gpu else "llamacpp-cpu-vision"
	            composed_prompt = None
	        else:
	            composed_prompt = _compose_prompt(parts)
	            if on_gpu:
	                generate = _run_llamacpp_generation
	            else:
	                generate = _run_llamacpp_generation_cpu
	            raw = generate(prompt_text=composed_prompt, **shared_kwargs)
	            backend = "llamacpp-gpu" if on_gpu else "llamacpp-cpu"
	        elapsed_ms = int((time.perf_counter() - t0) * 1000)

	        parsed = _parse_json_response(raw)

	        # Key order matters: intake metadata may override the keys above it,
	        # while "return_code" and "duration_ms" always win over it.
	        summary: dict[str, Any] = {
	            "backend": backend,
	            "repo": self.repo,
	            "model": self.model_file,
	            "vision_enabled": self.vision_enabled,
	            "spaces_gpu": on_gpu,
	            "document_parts": len(parts),
	            "max_pages": max_pages,
	            "extraction_prompt": EXTRACTION_PROMPT,
	            "user_message_preview": summarize_document_parts(parts),
	            **document_intake_metadata(file_path, parts),
	            "return_code": 0,
	            "duration_ms": elapsed_ms,
	        }
	        mode_details = (
	            {"mmproj": self.mmproj_file, "chat_handler": self.chat_handler}
	            if self.vision_enabled
	            else {"composed_prompt": composed_prompt}
	        )
	        summary.update(mode_details)

	        patient = _normalize_patient(parsed.get("patient", {}))
	        tests = _normalize_tests(parsed.get("tests", []))
	        notes = _normalize_notes(parsed.get("notes", []))
	        return ExtractionResult(
	            patient=patient,
	            tests=tests,
	            notes=notes,
	            raw_response=raw,
	            request_summary=summary,
	        )


	@lru_cache(maxsize=1)
	def _load_text(model_path: str, n_ctx: int, n_gpu_layers: int):
	    """Build a text-only ``llama_cpp.Llama`` for the given settings.

	    ``lru_cache(maxsize=1)`` keeps the most recent instance, so repeated calls
	    with the same ``(model_path, n_ctx, n_gpu_layers)`` reuse it.
	    """
	    # Imported lazily so this module can be imported without llama-cpp-python.
	    import llama_cpp

	    options = {
	        "model_path": model_path,
	        "n_ctx": n_ctx,
	        "n_gpu_layers": n_gpu_layers,
	        "verbose": False,
	    }
	    return llama_cpp.Llama(**options)


	def _vision_messages(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
	    """Return a one-message chat: the extraction prompt followed by ``parts``."""
	    content: list[dict[str, Any]] = [{"type": "text", "text": EXTRACTION_PROMPT}]
	    content.extend(parts)
	    user_message = {"role": "user", "content": content}
	    return [user_message]


	def _raise_generation_error(exc: Exception, *, vision: bool) -> RuntimeError:
	    """Re-raise a generation failure as a ``RuntimeError`` with actionable text.

	    This function never returns; it always raises, chaining ``exc`` as the cause.

	    Args:
	        exc: The exception caught while generating.
	        vision: Whether the failure came from the vision (GGUF + mmproj) path.

	    Raises:
	        RuntimeError: Always. A context-overflow hint when the inner message
	            contains ``llama_decode returned -1``, otherwise a generic message
	            embedding the inner error.
	    """
	    inner = f"{type(exc).__name__}: {exc}"

	    # Generic failure: report which model flavour was in use.
	    if "llama_decode returned -1" not in inner:
	        if vision:
	            model_label = "vision GGUF + mmproj"
	        else:
	            model_label = "text-only GGUF"
	        raise RuntimeError(
	            f"The llama.cpp backend could not complete extraction with the {model_label} model. "
	            f"Inner error: {inner}"
	        ) from exc

	    # Decode returned -1: the prompt did not fit, so suggest remedies.
	    if vision:
	        lead = "llama.cpp ran out of room while decoding the vision prompt. "
	    else:
	        lead = "llama.cpp ran out of room while decoding the PDF text prompt. "
	    advice = (
	        "Try increasing LLAMACPP_N_CTX, lowering the number of PDF pages, or trimming the "
	        "input before sending it to the model."
	    )
	    raise RuntimeError(lead + advice) from exc


	@spaces.GPU(duration=600)
	def _run_llamacpp_vision_generation(
	    parts: list[dict[str, Any]],
	    repo: str,
	    model_file: str,
	    mmproj_file: str,
	    chat_handler: str,
	    max_tokens: int,
	    n_ctx: int,
	    n_gpu_layers: int,
	) -> str:
	    """Vision generation wrapped in ``spaces.GPU(duration=600)``.

	    Forwards every argument unchanged to ``_run_llamacpp_vision_generation_cpu``
	    and returns its result.
	    """
	    forwarded = {
	        "parts": parts,
	        "repo": repo,
	        "model_file": model_file,
	        "mmproj_file": mmproj_file,
	        "chat_handler": chat_handler,
	        "max_tokens": max_tokens,
	        "n_ctx": n_ctx,
	        "n_gpu_layers": n_gpu_layers,
	    }
	    return _run_llamacpp_vision_generation_cpu(**forwarded)


	def _run_llamacpp_vision_generation_cpu(
	    parts: list[dict[str, Any]],
	    repo: str,
	    model_file: str,
	    mmproj_file: str,
	    chat_handler: str,
	    max_tokens: int,
	    n_ctx: int,
	    n_gpu_layers: int,
	) -> str:
	    """Download, load and query the vision GGUF + mmproj model.

	    Runs in three stages (download, load, generate); each stage converts any
	    exception into a ``RuntimeError`` that names the failing stage.

	    Returns:
	        The message content of the first choice, or ``"{}"`` when it is empty.

	    Raises:
	        RuntimeError: If downloading, loading, or generation fails.
	    """
	    # Stage 1: fetch the weights first, then the projector, from the same repo.
	    try:
	        model_path, mmproj_path = [
	            download_hf_file(repo, filename) for filename in (model_file, mmproj_file)
	        ]
	    except Exception as download_error:
	        reason = f"{type(download_error).__name__}: {download_error}"
	        raise RuntimeError(
	            "llama.cpp download failed while preparing the vision GGUF assets: " + reason
	        ) from download_error

	    # Stage 2: build the llama.cpp model with its chat handler.
	    try:
	        llm = load_vision_llama(model_path, mmproj_path, n_ctx, n_gpu_layers, chat_handler)
	    except Exception as load_error:
	        reason = f"{type(load_error).__name__}: {load_error}"
	        raise RuntimeError(
	            "The llama.cpp backend could not load the vision GGUF + mmproj model. "
	            "This usually means the downloaded model build is incompatible with the installed "
	            "llama-cpp-python wheel or the model files are incomplete. "
	            "Inner error: " + reason
	        ) from load_error

	    # Stage 3: JSON-object chat completion at temperature 0.0. Reading the
	    # response stays inside the try so a malformed response is wrapped as well.
	    try:
	        completion = llm.create_chat_completion(
	            messages=_vision_messages(parts),
	            response_format={"type": "json_object"},
	            temperature=0.0,
	            max_tokens=max_tokens,
	        )
	        content = completion["choices"][0]["message"].get("content")
	        return content if content else "{}"
	    except Exception as generation_error:
	        raise _raise_generation_error(generation_error, vision=True) from generation_error


	@spaces.GPU(duration=600)
	def _run_llamacpp_generation(
	    prompt_text: str,
	    repo: str,
	    model_file: str,
	    max_tokens: int,
	    n_ctx: int,
	    n_gpu_layers: int,
	) -> str:
	    """Text-only generation wrapped in ``spaces.GPU(duration=600)``.

	    Forwards every argument unchanged to ``_run_llamacpp_generation_cpu`` and
	    returns its result.
	    """
	    forwarded = {
	        "prompt_text": prompt_text,
	        "repo": repo,
	        "model_file": model_file,
	        "max_tokens": max_tokens,
	        "n_ctx": n_ctx,
	        "n_gpu_layers": n_gpu_layers,
	    }
	    return _run_llamacpp_generation_cpu(**forwarded)


	def _run_llamacpp_generation_cpu(
	    prompt_text: str,
	    repo: str,
	    model_file: str,
	    max_tokens: int,
	    n_ctx: int,
	    n_gpu_layers: int,
	) -> str:
	    """Download, load and query the text-only GGUF model.

	    Runs in three stages (download, load, generate); each stage converts any
	    exception into a ``RuntimeError`` that names the failing stage.

	    Returns:
	        The message content of the first choice, or ``"{}"`` when it is empty.

	    Raises:
	        RuntimeError: If downloading, loading, or generation fails.
	    """
	    # Stage 1: fetch the GGUF weights.
	    try:
	        model_path = download_hf_file(repo, model_file)
	    except Exception as download_error:
	        reason = f"{type(download_error).__name__}: {download_error}"
	        raise RuntimeError(
	            "llama.cpp download failed while preparing the GGUF model: " + reason
	        ) from download_error

	    # Stage 2: obtain the (cached) llama.cpp model.
	    try:
	        llm = _load_text(model_path, n_ctx, n_gpu_layers)
	    except Exception as load_error:
	        reason = f"{type(load_error).__name__}: {load_error}"
	        raise RuntimeError(
	            "The llama.cpp backend could not load the text-only GGUF model. "
	            "This usually means the downloaded model build is incompatible with the installed "
	            "llama-cpp-python wheel or the model file is incomplete. "
	            "Inner error: " + reason
	        ) from load_error

	    # Stage 3: JSON-object chat completion at temperature 0.0. Reading the
	    # response stays inside the try so a malformed response is wrapped as well.
	    try:
	        user_turn = {"role": "user", "content": prompt_text}
	        completion = llm.create_chat_completion(
	            messages=[user_turn],
	            response_format={"type": "json_object"},
	            temperature=0.0,
	            max_tokens=max_tokens,
	        )
	        content = completion["choices"][0]["message"].get("content")
	        return content if content else "{}"
	    except Exception as generation_error:
	        raise _raise_generation_error(generation_error, vision=False) from generation_error


	def _compose_prompt(parts: list[dict[str, Any]]) -> str:
	    """Join the extraction prompt and every non-empty text part into one string.

	    Image parts (``type == "image_url"``) are not included in the prompt. They
	    only cause an error when the document contributes no text at all.

	    Raises:
	        RuntimeError: If ``parts`` has at least one image part and no non-empty
	            text part.
	    """
	    # NOTE(review): a text part whose "text" value is None is stringified to
	    # "None" and kept; confirm the upstream producer never emits that.
	    stripped_texts = (
	        str(part.get("text", "")).strip()
	        for part in parts
	        if part.get("type") == "text"
	    )
	    document_texts = [text for text in stripped_texts if text]
	    has_images = any(part.get("type") == "image_url" for part in parts)

	    if has_images and not document_texts:
	        raise RuntimeError(
	            "The llama.cpp text backend cannot analyze image-based documents. "
	            "Set LLAMACPP_VISION=1 with EXTRACTOR_BACKEND=llamacpp-gpu, or use "
	            "EXTRACTOR_BACKEND=transformers for local vision extraction."
	        )

	    return "\n\n".join([EXTRACTION_PROMPT, *document_texts])