Spaces:

build-small-hackathon
/

blood-test-explainer

Running on Zero

App Files Files Community

blood-test-explainer / src /extraction /local_server.py

r0mant1c

Add agent trace panel, vision extraction, and local Transformers backend.

354b37e 20 days ago

Raw

History Blame Contribute Delete

4.77 kB

	"""Offline extraction via a local llama.cpp server (llama-server).

	This is the off-grid backend that actually works for MiniCPM-V 4.6. The pip `llama-cpp-python`
	bundles a llama.cpp too old to load 4.6, but the current `llama-server` (brew / release build)
	runs it fine. We POST to a llama-server on localhost with the document image plus:
	- our GBNF grammar, so the output is always the `{tests, notes}` schema, and
	- `enable_thinking: false`, so the model doesn't spend its whole token budget on a `<think>`
	ramble (the cause of the "could not be converted into a report" failure).

	localhost = the model running on this machine, so it is still fully off-grid (no external call).

	Run the server next to the app:
	llama-server -m model.gguf --mmproj mmproj.gguf --port 8080

	Config (env):
	LLAMA_SERVER_URL default http://127.0.0.1:8080/v1/chat/completions
	LLAMA_SERVER_MODEL default "minicpm-v"
	LLAMA_SERVER_GRAMMAR set to "1" to send the GBNF grammar (OFF by default: the current
	llama-server build rejects our grammar, and `enable_thinking:false`
	plus the tolerant parser already yield clean {tests,notes} output)
	"""

	from __future__ import annotations

	import os
	import time

	import requests

	from src.document_processing import document_intake_metadata, document_to_payload_parts
	from src.grammar import extraction_grammar
	from src.openbmb_client import (
	EXTRACTION_PROMPT,
	ExtractionResult,
	_normalize_notes,
	_normalize_patient,
	_normalize_tests,
	_parse_json_response,
	summarize_document_parts,
	)

	# Fallback chat-completions endpoint, used when neither the `url` argument nor the
	# LLAMA_SERVER_URL environment variable supplies one.
	DEFAULT_SERVER_URL = "http://127.0.0.1:8080/v1/chat/completions"


	class LocalServerExtractor:
	    """Implements the `Extractor` protocol against a local llama-server.

	    Each `extract` call sends a single chat-completions request holding the extraction
	    prompt plus the document parts, then normalizes the JSON reply into an
	    `ExtractionResult`.
	    """

	    def __init__(
	        self,
	        url: str | None = None,
	        model: str | None = None,
	        timeout_seconds: int = 180,
	    ) -> None:
	        """Resolve the endpoint, model name and request options.

	        Args:
	            url: Endpoint to POST to. Falls back to the LLAMA_SERVER_URL environment
	                variable, then to DEFAULT_SERVER_URL.
	            model: Model name sent in the request body. Falls back to the
	                LLAMA_SERVER_MODEL environment variable, then to "minicpm-v".
	            timeout_seconds: Timeout handed to `requests.post`.
	        """
	        # Blank or whitespace-only values count as "not set", so they fall through to the
	        # next source instead of collapsing into an empty URL / model name.
	        self.url = (
	            self._clean(url) or self._clean(os.getenv("LLAMA_SERVER_URL")) or DEFAULT_SERVER_URL
	        ).strip()
	        self.model = (
	            self._clean(model) or self._clean(os.getenv("LLAMA_SERVER_MODEL")) or "minicpm-v"
	        ).strip()
	        self.timeout_seconds = timeout_seconds
	        # The grammar is opt-in: only the exact value "1" (ignoring surrounding whitespace)
	        # enables it.
	        self.use_grammar = os.getenv("LLAMA_SERVER_GRAMMAR", "0").strip() == "1"

	    @staticmethod
	    def _clean(value: str | None) -> str:
	        """Return `value` without surrounding whitespace, or "" when it is None."""
	        return (value or "").strip()

	    def extract(self, file_path: str, max_pages: int = 3) -> ExtractionResult:
	        """Send the document to the server and return the normalized extraction.

	        Args:
	            file_path: Path of the document to extract from.
	            max_pages: Page limit forwarded to `document_to_payload_parts`.

	        Returns:
	            An `ExtractionResult` with the normalized patient, tests and notes, the raw
	            model text, and a `request_summary` describing the request.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	            ValueError: If the reply has no `choices[0].message`.
	        """
	        parts = document_to_payload_parts(file_path, max_pages=max_pages)
	        payload = {
	            "model": self.model,
	            "messages": [
	                {"role": "user", "content": [{"type": "text", "text": EXTRACTION_PROMPT}, *parts]}
	            ],
	            "temperature": 0,
	            "max_tokens": 2048,
	            # Stop the model from emitting a <think> reasoning block (it otherwise burns the
	            # whole token budget before producing JSON). Unknown fields are ignored by the server.
	            "chat_template_kwargs": {"enable_thinking": False},
	        }
	        if self.use_grammar:
	            # Grammar-constrained decoding: output can only be our {tests, notes} schema.
	            payload["grammar"] = extraction_grammar()

	        # Time only the HTTP round trip; the status check happens after the clock stops.
	        started = time.perf_counter()
	        response = requests.post(
	            self.url,
	            json=payload,
	            headers={"Content-Type": "application/json"},
	            timeout=self.timeout_seconds,
	        )
	        duration_ms = int((time.perf_counter() - started) * 1000)
	        response.raise_for_status()

	        raw = _message_content(response.json())
	        parsed = _parse_json_response(raw)
	        return ExtractionResult(
	            patient=_normalize_patient(parsed.get("patient", {})),
	            tests=_normalize_tests(parsed.get("tests", [])),
	            notes=_normalize_notes(parsed.get("notes", [])),
	            raw_response=raw,
	            request_summary={
	                "backend": "local-server",
	                "url": self.url,
	                "model": self.model,
	                "document_parts": len(parts),
	                "max_pages": max_pages,
	                "grammar": self.use_grammar,
	                "user_message_preview": summarize_document_parts(parts),
	                # Key order matters: intake metadata may override the keys above, and the
	                # keys below override the intake metadata.
	                **document_intake_metadata(file_path, parts),
	                "http_status": response.status_code,
	                "return_code": 0,
	                "duration_ms": duration_ms,
	            },
	        )


	def _message_content(payload: dict) -> str:
	    """Return the stripped text of the first choice's message in a chat-completions reply.

	    `content` may be a plain string or a list of parts; for a list, the `text` of every
	    dict part is joined with newlines. Missing or null content yields "".

	    Raises:
	        ValueError: If `payload` has no usable `choices[0].message` object.
	    """
	    try:
	        message = payload["choices"][0]["message"]
	    except (KeyError, IndexError, TypeError) as error:
	        raise ValueError("llama-server response did not include choices[0].message.") from error
	    if not isinstance(message, dict):
	        # A null / non-object message would otherwise surface as an AttributeError on
	        # `.get`; report it with the same error as a missing message.
	        raise ValueError("llama-server response did not include choices[0].message.")
	    content = message.get("content") or ""
	    if isinstance(content, list):
	        # `or ""` also covers parts whose "text" is explicitly null, which would otherwise
	        # make the join fail.
	        content = "\n".join(
	            part.get("text") or "" for part in content if isinstance(part, dict)
	        )
	    return content.strip()