blood-test-explainer / src /document_processing.py
r0mant1c's picture
Add agent trace panel, vision extraction, and local Transformers backend.
354b37e
Raw
History Blame Contribute Delete
3.71 kB
from __future__ import annotations
import base64
import io
from pathlib import Path
import fitz
# File suffixes (lower-case, including the leading dot) handled as plain text.
SUPPORTED_TEXT_EXTENSIONS = {".txt", ".csv"}
# File suffixes handled as single images.
SUPPORTED_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff"}
# Every suffix validate_upload() accepts: text, image, and PDF.
SUPPORTED_EXTENSIONS = SUPPORTED_TEXT_EXTENSIONS | SUPPORTED_IMAGE_EXTENSIONS | {".pdf"}
# Page limit used by document_to_payload_parts() when max_pages is None.
_DEFAULT_MAX_PAGES = 3
# Matrix passed to page.get_pixmap() when rendering PDF pages (2.5 on both axes).
_PDF_RENDER_MATRIX = fitz.Matrix(2.5, 2.5)
# Bounding-box edge, in pixels, passed to Image.thumbnail() for uploaded images.
_MAX_IMAGE_EDGE = 2048
def validate_upload(path: str) -> Path:
    """Check that *path* is an existing regular file of a supported type.

    Args:
        path: Filesystem location of the uploaded document.

    Returns:
        The validated location as a ``Path``.

    Raises:
        ValueError: If nothing file-like exists at *path*, or if its suffix
            (compared case-insensitively) is not in ``SUPPORTED_EXTENSIONS``.
    """
    file_path = Path(path)
    # is_file() rather than exists(): a directory whose name happens to end
    # in a supported suffix (e.g. "scan.pdf/") must not pass validation and
    # blow up later inside a reader with a non-ValueError exception.
    if not file_path.is_file():
        raise ValueError("Uploaded file could not be found.")

    extension = file_path.suffix.lower()
    if extension not in SUPPORTED_EXTENSIONS:
        # Sorted so the error message is stable regardless of set ordering.
        allowed = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise ValueError(f"Unsupported file type `{extension}`. Supported types: {allowed}.")
    return file_path
def document_intake_metadata(path: str, parts: list[dict]) -> dict[str, object]:
    """Summarise an intake for tracing without including any payload data.

    Counts the ``image_url`` parts and totals the characters of all ``text``
    parts; parts of any other type are ignored. ``pages_rendered`` mirrors the
    image count for ``.pdf`` sources and is ``None`` otherwise.
    """
    suffix = Path(path).suffix.lower()

    images = 0
    characters = 0
    for entry in parts:
        kind = entry.get("type")
        if kind == "image_url":
            images += 1
        elif kind == "text":
            # Missing/None/falsy text counts as an empty string.
            characters += len(str(entry.get("text") or ""))

    if images:
        modality = "vision"
    else:
        modality = "text"

    summary: dict[str, object] = {"source_extension": suffix}
    summary["input_modality"] = modality
    summary["pages_rendered"] = images if suffix == ".pdf" else None
    summary["image_count"] = images
    summary["text_characters"] = characters
    return summary
def document_to_payload_parts(path: str, max_pages: int | None = None) -> list[dict]:
    """Turn an uploaded document into OpenAI-compatible message parts.

    The upload is validated first, then dispatched on its lower-cased suffix:
    ``.pdf`` goes to ``_pdf_to_image_parts``, image suffixes produce a single
    ``_image_part``, and text suffixes produce a single ``text`` part.

    Args:
        path: Location of the uploaded document.
        max_pages: PDF page limit; ``None`` selects ``_DEFAULT_MAX_PAGES``.

    Raises:
        ValueError: If validation fails or no branch matches the suffix.
    """
    source = validate_upload(path)
    suffix = source.suffix.lower()
    limit = max_pages if max_pages is not None else _DEFAULT_MAX_PAGES

    if suffix == ".pdf":
        return _pdf_to_image_parts(source, max_pages=limit)

    if suffix in SUPPORTED_IMAGE_EXTENSIONS:
        image_entry = _image_part(source)
        return [image_entry]

    if suffix in SUPPORTED_TEXT_EXTENSIONS:
        body = _read_text_file(source)
        return [{"type": "text", "text": body}]

    # Defensive fallback in case the suffix sets and the branches above drift.
    raise ValueError(f"Unsupported file type `{suffix}`.")
def _pdf_to_image_parts(file_path: Path, max_pages: int) -> list[dict]:
    """Render the leading pages of a PDF as base64 PNG ``image_url`` parts.

    At least one page is always rendered (``max_pages`` below 1 is treated
    as 1) and never more than the document contains.

    Raises:
        ValueError: If the PDF reports zero pages.
    """
    with fitz.open(file_path) as pdf:
        total_pages = pdf.page_count
        if total_pages == 0:
            raise ValueError("The uploaded PDF does not contain any pages.")

        limit = min(total_pages, max(1, max_pages))
        rendered: list[dict] = []
        for number in range(limit):
            # alpha=False: render without an alpha channel.
            bitmap = pdf.load_page(number).get_pixmap(matrix=_PDF_RENDER_MATRIX, alpha=False)
            png_b64 = base64.b64encode(bitmap.tobytes("png")).decode("ascii")
            data_url = f"data:image/png;base64,{png_b64}"
            rendered.append({"type": "image_url", "image_url": {"url": data_url}})
    return rendered
def _image_part(file_path: Path) -> dict:
    """Encode an image file as a base64 JPEG ``image_url`` message part.

    The image is rotated according to its EXIF orientation, converted to
    RGB, shrunk in place to fit a ``_MAX_IMAGE_EDGE`` square bounding box,
    and re-encoded as JPEG at quality 90.
    """
    from PIL import Image, ImageOps

    jpeg_bytes = io.BytesIO()
    with Image.open(file_path) as opened:
        upright = ImageOps.exif_transpose(opened)
        rgb = upright.convert("RGB")
        # thumbnail() only ever shrinks and keeps the aspect ratio.
        rgb.thumbnail((_MAX_IMAGE_EDGE, _MAX_IMAGE_EDGE))
        rgb.save(jpeg_bytes, format="JPEG", quality=90, optimize=True)

    jpeg_b64 = base64.b64encode(jpeg_bytes.getvalue()).decode("ascii")
    data_url = f"data:image/jpeg;base64,{jpeg_b64}"
    return {"type": "image_url", "image_url": {"url": data_url}}
def _read_text_file(file_path: Path) -> str:
    """Read an uploaded text/CSV file and return its trimmed contents.

    The file is decoded as UTF-8 with undecodable bytes replaced by U+FFFD,
    surrounding whitespace is stripped, and the result is capped at 20000
    characters.

    Args:
        file_path: Location of the text file.

    Returns:
        The stripped text, truncated to at most 20000 characters.

    Raises:
        ValueError: If the file has no content besides whitespace (or a bare
            byte-order mark).
    """
    # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading BOM.
    # Spreadsheet-exported CSVs commonly carry one; str.strip() does not
    # treat U+FEFF as whitespace, so with plain "utf-8" it would leak into
    # the extracted text and defeat the emptiness check below.
    text = file_path.read_text(encoding="utf-8-sig", errors="replace").strip()
    if not text:
        raise ValueError("The uploaded text file is empty.")
    return text[:20000]