Spaces:
Running on Zero
Running on Zero
| """Offline extraction via a local llama.cpp server (llama-server). | |
| This is the off-grid backend that actually works for MiniCPM-V 4.6. The pip `llama-cpp-python` | |
| bundles a llama.cpp too old to load 4.6, but the current `llama-server` (brew / release build) | |
| runs it fine. We POST to a llama-server on localhost with the document image plus: | |
| - optionally (only when `LLAMA_SERVER_GRAMMAR=1`) our **GBNF grammar**, so the output is always the `{tests, notes}` schema, and | |
| - `enable_thinking: false`, so the model doesn't spend its whole token budget on a `<think>` | |
| ramble (the cause of the "could not be converted into a report" failure). | |
| localhost = the model running on this machine, so it is still fully off-grid (no external call). | |
| Run the server next to the app: | |
| llama-server -m model.gguf --mmproj mmproj.gguf --port 8080 | |
| Config (env): | |
| LLAMA_SERVER_URL default http://127.0.0.1:8080/v1/chat/completions | |
| LLAMA_SERVER_MODEL default "minicpm-v" | |
| LLAMA_SERVER_GRAMMAR set to "1" to send the GBNF grammar (OFF by default: the current | |
| llama-server build rejects our grammar, and `enable_thinking:false` | |
| plus the tolerant parser already yield clean {tests,notes} output) | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import time | |
| import requests | |
| from src.document_processing import document_intake_metadata, document_to_payload_parts | |
| from src.grammar import extraction_grammar | |
| from src.openbmb_client import ( | |
| EXTRACTION_PROMPT, | |
| ExtractionResult, | |
| _normalize_notes, | |
| _normalize_patient, | |
| _normalize_tests, | |
| _parse_json_response, | |
| summarize_document_parts, | |
| ) | |
# Fallback chat-completions endpoint, used when neither the constructor argument nor the
# LLAMA_SERVER_URL environment variable supplies one. It points at the loopback interface,
# i.e. a llama-server running on this same machine.
DEFAULT_SERVER_URL: str = "http://127.0.0.1:8080/v1/chat/completions"
class LocalServerExtractor:
    """Implements the `Extractor` protocol against a local llama-server.

    Settings are resolved once, at construction time. For the URL and the model name the
    explicit argument wins, then the environment variable, then the built-in default. A blank
    (empty or whitespace-only) value counts as "not set" at every level, so a stray
    `LLAMA_SERVER_URL=" "` falls through to the next candidate instead of producing an empty URL.
    """

    def __init__(
        self,
        url: str | None = None,
        model: str | None = None,
        timeout_seconds: int = 180,
    ) -> None:
        """Resolve the endpoint, model name, timeout and grammar flag.

        Args:
            url: Chat-completions endpoint. Falls back to `LLAMA_SERVER_URL`, then to
                `DEFAULT_SERVER_URL`.
            model: Model name sent in the request. Falls back to `LLAMA_SERVER_MODEL`, then to
                "minicpm-v".
            timeout_seconds: Timeout handed to `requests.post` for the extraction call.
        """
        # Strip each candidate *before* the `or` chain: stripping only the final result would
        # let a whitespace-only value win the chain and then collapse to "".
        self.url = (
            self._clean(url) or self._clean(os.getenv("LLAMA_SERVER_URL")) or DEFAULT_SERVER_URL
        )
        self.model = (
            self._clean(model) or self._clean(os.getenv("LLAMA_SERVER_MODEL")) or "minicpm-v"
        )
        self.timeout_seconds = timeout_seconds
        # Grammar-constrained decoding is opt-in: only the value "1" switches it on.
        self.use_grammar = self._clean(os.getenv("LLAMA_SERVER_GRAMMAR")) == "1"

    @staticmethod
    def _clean(value: str | None) -> str:
        """Return `value` stripped of surrounding whitespace, or "" when it is None."""
        return (value or "").strip()

    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
        """Send the document to the llama-server and return the normalized extraction.

        Args:
            file_path: Path of the document to extract from.
            max_pages: Page limit forwarded to `document_to_payload_parts`.

        Returns:
            An `ExtractionResult` holding the normalized patient / tests / notes, the raw model
            text, and a `request_summary` describing the call (backend, URL, model, timing, ...).

        Raises:
            requests.RequestException: If the POST fails or times out, or (as `HTTPError`, via
                `raise_for_status`) the server answers with an error status.
            ValueError: If the response body lacks `choices[0].message`.
            Anything `_parse_json_response` or the `_normalize_*` helpers raise propagates
            unchanged (their failure modes are defined outside this class).
        """
        parts = document_to_payload_parts(file_path, max_pages=max_pages)
        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": [{"type": "text", "text": EXTRACTION_PROMPT}, *parts]}
            ],
            "temperature": 0,
            "max_tokens": 2048,
            # Stop the model from emitting a <think> reasoning block (it otherwise burns the
            # whole token budget before producing JSON). Unknown fields are ignored by the server.
            "chat_template_kwargs": {"enable_thinking": False},
        }
        if self.use_grammar:
            # Grammar-constrained decoding: output can only be our {tests, notes} schema.
            payload["grammar"] = extraction_grammar()

        started = time.perf_counter()
        response = requests.post(
            self.url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout_seconds,
        )
        # Take the timing before raise_for_status() so it covers only the HTTP round trip.
        duration_ms = int((time.perf_counter() - started) * 1000)
        response.raise_for_status()

        raw = _message_content(response.json())
        parsed = _parse_json_response(raw)
        return ExtractionResult(
            patient=_normalize_patient(parsed.get("patient", {})),
            tests=_normalize_tests(parsed.get("tests", [])),
            notes=_normalize_notes(parsed.get("notes", [])),
            raw_response=raw,
            request_summary={
                "backend": "local-server",
                "url": self.url,
                "model": self.model,
                "document_parts": len(parts),
                "max_pages": max_pages,
                "grammar": self.use_grammar,
                "user_message_preview": summarize_document_parts(parts),
                # Dict-literal order matters: intake metadata may override the keys above,
                # while the three keys below always override same-named metadata keys.
                **document_intake_metadata(file_path, parts),
                "http_status": response.status_code,
                "return_code": 0,
                "duration_ms": duration_ms,
            },
        )
| def _message_content(payload: dict) -> str: | |
| try: | |
| message = payload["choices"][0]["message"] | |
| except (KeyError, IndexError, TypeError) as error: | |
| raise ValueError("llama-server response did not include choices[0].message.") from error | |
| content = message.get("content") or "" | |
| if isinstance(content, list): | |
| content = "\n".join(p.get("text", "") for p in content if isinstance(p, dict)) | |
| return content.strip() | |