blood-test-explainer / src /extraction /local_server.py
r0mant1c's picture
Add agent trace panel, vision extraction, and local Transformers backend.
354b37e
Raw
History Blame Contribute Delete
4.77 kB
"""Offline extraction via a local llama.cpp server (llama-server).
This is the off-grid backend that actually works for MiniCPM-V 4.6. The pip package
`llama-cpp-python` bundles a llama.cpp too old to load 4.6, but the current `llama-server` (brew / release build)
runs it fine. We POST to a llama-server on localhost with the document image plus:
- optionally our **GBNF grammar** (opt-in via LLAMA_SERVER_GRAMMAR, see below), which constrains the output to the `{tests, notes}` schema, and
- `enable_thinking: false`, so the model doesn't spend its whole token budget on a `<think>`
ramble (the cause of the "could not be converted into a report" failure).
localhost = the model running on this machine, so it is still fully off-grid (no external call).
Run the server next to the app:
llama-server -m model.gguf --mmproj mmproj.gguf --port 8080
Config (env):
LLAMA_SERVER_URL default http://127.0.0.1:8080/v1/chat/completions
LLAMA_SERVER_MODEL default "minicpm-v"
LLAMA_SERVER_GRAMMAR set to "1" to send the GBNF grammar (OFF by default: the current
llama-server build rejects our grammar, and `enable_thinking:false`
plus the tolerant parser already yield clean {tests,notes} output)
"""
from __future__ import annotations
import os
import time
import requests
from src.document_processing import document_intake_metadata, document_to_payload_parts
from src.grammar import extraction_grammar
from src.openbmb_client import (
EXTRACTION_PROMPT,
ExtractionResult,
_normalize_notes,
_normalize_patient,
_normalize_tests,
_parse_json_response,
summarize_document_parts,
)
# Fallback chat-completions endpoint, used when neither the `url` constructor argument
# nor the LLAMA_SERVER_URL environment variable provides a value.
DEFAULT_SERVER_URL = "http://127.0.0.1:8080/v1/chat/completions"
class LocalServerExtractor:
    """Implements the `Extractor` protocol against a local llama-server.

    Settings are resolved once, at construction time; each `extract` call then
    performs a single blocking HTTP POST to the configured endpoint.
    """

    def __init__(
        self,
        url: str | None = None,
        model: str | None = None,
        timeout_seconds: int = 180,
    ) -> None:
        """Resolve endpoint, model name, timeout and grammar flag.

        Args:
            url: Endpoint to POST to. Falls back to the LLAMA_SERVER_URL
                environment variable, then to DEFAULT_SERVER_URL.
            model: Model name sent in the request. Falls back to the
                LLAMA_SERVER_MODEL environment variable, then to "minicpm-v".
            timeout_seconds: Timeout handed to `requests.post`.
        """
        # An empty argument or empty env value is falsy, so it falls through
        # to the next candidate; surrounding whitespace is trimmed afterwards.
        endpoint = url or os.getenv("LLAMA_SERVER_URL") or DEFAULT_SERVER_URL
        self.url = endpoint.strip()

        model_name = model or os.getenv("LLAMA_SERVER_MODEL") or "minicpm-v"
        self.model = model_name.strip()

        self.timeout_seconds = timeout_seconds

        # The grammar is opt-in: only the exact value "1" switches it on.
        grammar_flag = os.getenv("LLAMA_SERVER_GRAMMAR", "0")
        self.use_grammar = grammar_flag == "1"

    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
        """Send the document to the llama-server and return the parsed result.

        Args:
            file_path: Path of the document to extract from.
            max_pages: Page limit forwarded to `document_to_payload_parts`.

        Returns:
            An `ExtractionResult` carrying the normalized patient, tests and
            notes, the raw model text, and a summary of the request.

        Raises:
            requests.HTTPError: If the server answers with an error status
                (via `raise_for_status`).
            ValueError: If the response lacks `choices[0].message`.
        """
        parts = document_to_payload_parts(file_path, max_pages=max_pages)

        # A single user message: the extraction prompt followed by the document parts.
        user_content = [{"type": "text", "text": EXTRACTION_PROMPT}]
        user_content.extend(parts)

        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": user_content}],
            "temperature": 0,
            "max_tokens": 2048,
            # Stop the model from emitting a <think> reasoning block (it otherwise burns the
            # whole token budget before producing JSON). Unknown fields are ignored by the server.
            "chat_template_kwargs": {"enable_thinking": False},
        }
        if self.use_grammar:
            # Grammar-constrained decoding: output can only be our {tests, notes} schema.
            payload["grammar"] = extraction_grammar()

        # Time only the HTTP round trip, not document preparation or parsing.
        started = time.perf_counter()
        response = requests.post(
            self.url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout_seconds,
        )
        elapsed_seconds = time.perf_counter() - started
        duration_ms = int(elapsed_seconds * 1000)

        response.raise_for_status()
        raw = _message_content(response.json())
        parsed = _parse_json_response(raw)

        patient = _normalize_patient(parsed.get("patient", {}))
        tests = _normalize_tests(parsed.get("tests", []))
        notes = _normalize_notes(parsed.get("notes", []))

        # Keys listed after the intake metadata win if the metadata repeats them.
        request_summary = {
            "backend": "local-server",
            "url": self.url,
            "model": self.model,
            "document_parts": len(parts),
            "max_pages": max_pages,
            "grammar": self.use_grammar,
            "user_message_preview": summarize_document_parts(parts),
            **document_intake_metadata(file_path, parts),
            "http_status": response.status_code,
            "return_code": 0,
            "duration_ms": duration_ms,
        }

        return ExtractionResult(
            patient=patient,
            tests=tests,
            notes=notes,
            raw_response=raw,
            request_summary=request_summary,
        )
def _message_content(payload: dict) -> str:
try:
message = payload["choices"][0]["message"]
except (KeyError, IndexError, TypeError) as error:
raise ValueError("llama-server response did not include choices[0].message.") from error
content = message.get("content") or ""
if isinstance(content, list):
content = "\n".join(p.get("text", "") for p in content if isinstance(p, dict))
return content.strip()