Spaces:
Running on Zero
Running on Zero
| """ZeroGPU extraction via the **llama.cpp** runtime (earns the 🦙 Llama Champion badge). | |
| Text-only by default. Set LLAMACPP_VISION=1 to run the same PDF/image vision pipeline as | |
| Transformers (GGUF + mmproj through llama-cpp-python). | |
| Config (env): | |
| EXTRACTOR_BACKEND=llamacpp-gpu | |
| LLAMACPP_VISION=1 enable vision (PDF/image uploads) | |
| LLAMACPP_GGUF_REPO HF repo with GGUF weights | |
| LLAMACPP_MODEL_FILE GGUF filename | |
| LLAMACPP_MMPROJ_FILE mmproj filename (required when vision is on) | |
| LLAMACPP_CHAT_HANDLER default MiniCPMv26ChatHandler | |
| LLAMACPP_MAX_TOKENS default 3072 | |
| LLAMACPP_N_CTX default 8192 | |
| LLAMACPP_N_GPU_LAYERS default 0 (CPU wheels); set to -1 on CUDA builds | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import time | |
| from functools import lru_cache | |
| from typing import Any | |
| from src.document_processing import document_intake_metadata, document_to_payload_parts | |
| from src.extraction.llamacpp_vision import ( | |
| DEFAULT_CHAT_HANDLER, | |
| DEFAULT_MMPROJ_FILE, | |
| download_hf_file, | |
| llamacpp_vision_enabled, | |
| load_vision_llama, | |
| ) | |
| from src.openbmb_client import ( | |
| EXTRACTION_PROMPT, | |
| ExtractionResult, | |
| _normalize_notes, | |
| _normalize_patient, | |
| _normalize_tests, | |
| _parse_json_response, | |
| summarize_document_parts, | |
| ) | |
| from src.space_runtime import is_cpu_basic_space | |
# Defaults used when LLAMACPP_GGUF_REPO / LLAMACPP_MODEL_FILE are not set.
DEFAULT_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
DEFAULT_MODEL_FILE = "MiniCPM-V-4_6-Q4_K_M.gguf"

try:
    import spaces
except ImportError:  # Local dev without the HF Spaces package.

    class _SpacesFallback:
        """No-op stand-in for the ``spaces`` package used outside HF Spaces."""

        @staticmethod
        def GPU(*args: Any, **kwargs: Any):
            """Return the decorated function unchanged.

            Works for both decorator spellings:

            * ``@spaces.GPU`` - the function arrives as the only positional
              argument and is returned as-is.
            * ``@spaces.GPU(...)`` - a pass-through decorator is returned.
            """
            # Bare usage: hand the function straight back instead of
            # replacing it with the inner decorator closure.
            if len(args) == 1 and not kwargs and callable(args[0]):
                return args[0]

            def decorator(func):
                return func

            return decorator

    spaces = _SpacesFallback()  # type: ignore[assignment]
class LlamaCppGPUExtractor:
    """Extractor that runs the GGUF through llama.cpp on ZeroGPU.

    All settings are read from ``LLAMACPP_*`` environment variables when the
    instance is created. ``extract`` then picks a vision or text-only runner
    (and its GPU or CPU variant) based on those settings.
    """

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Read an integer setting from the environment.

        An unset or blank variable yields ``default``.

        Raises:
            ValueError: if the variable is set to a non-integer value; the
                message names the offending variable.
        """
        raw = os.getenv(name, "").strip()
        if not raw:
            return default
        try:
            return int(raw)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {raw!r}.") from exc

    def __init__(self) -> None:
        """Load configuration from the environment.

        Raises:
            ValueError: if a numeric setting is not an integer, or if vision
                is enabled but the mmproj filename is empty.
        """
        self.repo = os.getenv("LLAMACPP_GGUF_REPO", DEFAULT_GGUF_REPO).strip()
        self.model_file = os.getenv("LLAMACPP_MODEL_FILE", DEFAULT_MODEL_FILE).strip()
        self.mmproj_file = os.getenv("LLAMACPP_MMPROJ_FILE", DEFAULT_MMPROJ_FILE).strip()
        self.chat_handler = os.getenv("LLAMACPP_CHAT_HANDLER", DEFAULT_CHAT_HANDLER).strip()
        self.max_tokens = self._env_int("LLAMACPP_MAX_TOKENS", 3072)
        self.n_ctx = self._env_int("LLAMACPP_N_CTX", 8192)
        self.n_gpu_layers = self._env_int("LLAMACPP_N_GPU_LAYERS", 0)
        self.vision_enabled = llamacpp_vision_enabled()
        # Use the CPU runners when the Space reports itself as CPU-basic.
        self.use_spaces_gpu = not is_cpu_basic_space()
        if self.vision_enabled and not self.mmproj_file:
            raise ValueError("LLAMACPP_VISION=1 requires LLAMACPP_MMPROJ_FILE.")

    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
        """Run extraction on one document and return the normalized result.

        Args:
            file_path: Path of the document to process.
            max_pages: Page limit forwarded to ``document_to_payload_parts``.

        Returns:
            An ``ExtractionResult`` holding the normalized patient, tests and
            notes, the raw model response, and a request summary.
        """
        parts = document_to_payload_parts(file_path, max_pages=max_pages)
        # Timing starts after the document has been converted to parts.
        started = time.perf_counter()
        if self.vision_enabled:
            runner = (
                _run_llamacpp_vision_generation
                if self.use_spaces_gpu
                else _run_llamacpp_vision_generation_cpu
            )
            raw = runner(
                parts=parts,
                repo=self.repo,
                model_file=self.model_file,
                mmproj_file=self.mmproj_file,
                chat_handler=self.chat_handler,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu-vision" if self.use_spaces_gpu else "llamacpp-cpu-vision"
            composed_prompt = None
        else:
            prompt_text = _compose_prompt(parts)
            runner = (
                _run_llamacpp_generation
                if self.use_spaces_gpu
                else _run_llamacpp_generation_cpu
            )
            raw = runner(
                prompt_text=prompt_text,
                repo=self.repo,
                model_file=self.model_file,
                max_tokens=self.max_tokens,
                n_ctx=self.n_ctx,
                n_gpu_layers=self.n_gpu_layers,
            )
            backend = "llamacpp-gpu" if self.use_spaces_gpu else "llamacpp-cpu"
            composed_prompt = prompt_text
        duration_ms = int((time.perf_counter() - started) * 1000)
        parsed = _parse_json_response(raw)
        # Key order matters: intake metadata may override the keys above it,
        # while "return_code" and "duration_ms" always win.
        summary = {
            "backend": backend,
            "repo": self.repo,
            "model": self.model_file,
            "vision_enabled": self.vision_enabled,
            "spaces_gpu": self.use_spaces_gpu,
            "document_parts": len(parts),
            "max_pages": max_pages,
            "extraction_prompt": EXTRACTION_PROMPT,
            "user_message_preview": summarize_document_parts(parts),
            **document_intake_metadata(file_path, parts),
            "return_code": 0,
            "duration_ms": duration_ms,
        }
        if self.vision_enabled:
            summary["mmproj"] = self.mmproj_file
            summary["chat_handler"] = self.chat_handler
        else:
            summary["composed_prompt"] = composed_prompt
        return ExtractionResult(
            patient=_normalize_patient(parsed.get("patient", {})),
            tests=_normalize_tests(parsed.get("tests", [])),
            notes=_normalize_notes(parsed.get("notes", [])),
            raw_response=raw,
            request_summary=summary,
        )
@lru_cache(maxsize=1)
def _load_text(model_path: str, n_ctx: int, n_gpu_layers: int):
    """Load the text-only GGUF model, reusing the most recent instance.

    The result is cached on ``(model_path, n_ctx, n_gpu_layers)`` with a
    single slot, so repeated calls with the same settings do not reload the
    weights, and a change in settings replaces the cached model.

    NOTE(review): the cached ``Llama`` instance is shared by every caller in
    this process - confirm that requests are serialized.
    """
    # Imported lazily so the module can be imported without llama-cpp-python.
    from llama_cpp import Llama

    return Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_gpu_layers=n_gpu_layers,
        verbose=False,
    )
def _vision_messages(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build the single user message for a vision request.

    The message content is the extraction prompt as a text part, followed by
    the given document parts in their original order.
    """
    content: list[dict[str, Any]] = [{"type": "text", "text": EXTRACTION_PROMPT}]
    content.extend(parts)
    return [{"role": "user", "content": content}]
def _raise_generation_error(exc: Exception, *, vision: bool) -> RuntimeError:
    """Re-raise a generation failure as a ``RuntimeError`` with guidance.

    This function always raises and never returns. The ``RuntimeError``
    return annotation only allows call sites to be written as
    ``raise _raise_generation_error(...)``.

    Args:
        exc: The exception raised during generation; it becomes the cause.
        vision: Whether the failing request used the vision pipeline.

    Raises:
        RuntimeError: a context-overflow hint when the error text contains
            ``llama_decode returned -1``, otherwise a generic failure message
            that includes the inner error.
    """
    inner = f"{type(exc).__name__}: {exc}"

    if "llama_decode returned -1" not in inner:
        label = "vision GGUF + mmproj" if vision else "text-only GGUF"
        raise RuntimeError(
            f"The llama.cpp backend could not complete extraction with the {label} model. "
            f"Inner error: {inner}"
        ) from exc

    # Context overflow: point the user at the settings that control prompt size.
    if vision:
        lead = "llama.cpp ran out of room while decoding the vision prompt. "
    else:
        lead = "llama.cpp ran out of room while decoding the PDF text prompt. "
    raise RuntimeError(
        lead
        + "Try increasing LLAMACPP_N_CTX, lowering the number of PDF pages, or trimming the "
        "input before sending it to the model."
    ) from exc
# NOTE(review): duration=120 is an assumed budget in seconds - tune it to the
# real download + load + generation time.
@spaces.GPU(duration=120)
def _run_llamacpp_vision_generation(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Run vision extraction inside a ZeroGPU allocation.

    The ``spaces.GPU`` decorator is what requests a GPU for the duration of
    the call; without it this function would behave exactly like the CPU
    runner. With the local ``spaces`` fallback the decorator is a no-op.

    All arguments are forwarded unchanged to
    ``_run_llamacpp_vision_generation_cpu``, whose return value and
    exceptions pass straight through.
    """
    return _run_llamacpp_vision_generation_cpu(
        parts=parts,
        repo=repo,
        model_file=model_file,
        mmproj_file=mmproj_file,
        chat_handler=chat_handler,
        max_tokens=max_tokens,
        n_ctx=n_ctx,
        n_gpu_layers=n_gpu_layers,
    )
def _run_llamacpp_vision_generation_cpu(
    parts: list[dict[str, Any]],
    repo: str,
    model_file: str,
    mmproj_file: str,
    chat_handler: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download, load and query the vision GGUF + mmproj model.

    Works in three stages (download, load, generate). A failure in any stage
    is re-raised as a ``RuntimeError`` with a stage-specific message and the
    original exception as its cause.

    Returns:
        The content of the first choice's message, or ``"{}"`` when that
        content is missing or empty.
    """
    # Stage 1: fetch both model files from the Hugging Face repo.
    try:
        gguf_path = download_hf_file(repo, model_file)
        projector_path = download_hf_file(repo, mmproj_file)
    except Exception as exc:
        reason = f"{type(exc).__name__}: {exc}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the vision GGUF assets: " + reason
        ) from exc

    # Stage 2: build the llama.cpp model with its chat handler.
    try:
        model = load_vision_llama(gguf_path, projector_path, n_ctx, n_gpu_layers, chat_handler)
    except Exception as exc:
        reason = f"{type(exc).__name__}: {exc}"
        raise RuntimeError(
            "The llama.cpp backend could not load the vision GGUF + mmproj model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model files are incomplete. "
            "Inner error: " + reason
        ) from exc

    # Stage 3: generate. Response indexing stays inside the try so that a
    # malformed response is reported like any other generation failure.
    try:
        completion = model.create_chat_completion(
            messages=_vision_messages(parts),
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        content = completion["choices"][0]["message"].get("content")
    except Exception as exc:
        raise _raise_generation_error(exc, vision=True) from exc
    return content or "{}"
# NOTE(review): duration=120 is an assumed budget in seconds - tune it to the
# real download + load + generation time.
@spaces.GPU(duration=120)
def _run_llamacpp_generation(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Run text-only extraction inside a ZeroGPU allocation.

    The ``spaces.GPU`` decorator is what requests a GPU for the duration of
    the call; without it this function would behave exactly like the CPU
    runner. With the local ``spaces`` fallback the decorator is a no-op.

    All arguments are forwarded unchanged to ``_run_llamacpp_generation_cpu``,
    whose return value and exceptions pass straight through.
    """
    return _run_llamacpp_generation_cpu(
        prompt_text=prompt_text,
        repo=repo,
        model_file=model_file,
        max_tokens=max_tokens,
        n_ctx=n_ctx,
        n_gpu_layers=n_gpu_layers,
    )
def _run_llamacpp_generation_cpu(
    prompt_text: str,
    repo: str,
    model_file: str,
    max_tokens: int,
    n_ctx: int,
    n_gpu_layers: int,
) -> str:
    """Download, load and query the text-only GGUF model.

    Works in three stages (download, load, generate). A failure in any stage
    is re-raised as a ``RuntimeError`` with a stage-specific message and the
    original exception as its cause.

    Returns:
        The content of the first choice's message, or ``"{}"`` when that
        content is missing or empty.
    """
    # Stage 1: fetch the GGUF weights from the Hugging Face repo.
    try:
        gguf_path = download_hf_file(repo, model_file)
    except Exception as exc:
        reason = f"{type(exc).__name__}: {exc}"
        raise RuntimeError(
            "llama.cpp download failed while preparing the GGUF model: " + reason
        ) from exc

    # Stage 2: build the llama.cpp model.
    try:
        model = _load_text(gguf_path, n_ctx, n_gpu_layers)
    except Exception as exc:
        reason = f"{type(exc).__name__}: {exc}"
        raise RuntimeError(
            "The llama.cpp backend could not load the text-only GGUF model. "
            "This usually means the downloaded model build is incompatible with the installed "
            "llama-cpp-python wheel or the model file is incomplete. "
            "Inner error: " + reason
        ) from exc

    # Stage 3: generate. Response indexing stays inside the try so that a
    # malformed response is reported like any other generation failure.
    try:
        completion = model.create_chat_completion(
            messages=[{"role": "user", "content": prompt_text}],
            response_format={"type": "json_object"},
            temperature=0.0,
            max_tokens=max_tokens,
        )
        content = completion["choices"][0]["message"].get("content")
    except Exception as exc:
        raise _raise_generation_error(exc, vision=False) from exc
    return content or "{}"
def _compose_prompt(parts: list[dict[str, Any]]) -> str:
    """Flatten document parts into one text prompt for the text-only backend.

    The extraction prompt comes first, followed by every non-blank ``text``
    part (stripped), joined with blank lines. ``image_url`` parts are not
    included in the prompt.

    Raises:
        RuntimeError: if the document has image parts but no usable text,
            since the text backend cannot analyze images.
    """
    stripped = (
        str(part.get("text", "")).strip()
        for part in parts
        if part.get("type") == "text"
    )
    document_text = [chunk for chunk in stripped if chunk]
    has_images = any(part.get("type") == "image_url" for part in parts)

    if has_images and not document_text:
        raise RuntimeError(
            "The llama.cpp text backend cannot analyze image-based documents. "
            "Set LLAMACPP_VISION=1 with EXTRACTOR_BACKEND=llamacpp-gpu, or use "
            "EXTRACTOR_BACKEND=transformers for local vision extraction."
        )
    return "\n\n".join([EXTRACTION_PROMPT, *document_text])