Spaces:

build-small-hackathon
/

blood-test-explainer

Running on Zero

App Files Files Community

blood-test-explainer / src /document_processing.py

r0mant1c

Add agent trace panel, vision extraction, and local Transformers backend.

354b37e 14 days ago

Raw

History Blame Contribute Delete

3.71 kB

	from __future__ import annotations

	import base64
	import io
	from pathlib import Path

	import fitz

	# Lower-case file suffixes (including the leading dot) accepted for upload,
	# grouped by how the file is turned into message parts.
	SUPPORTED_TEXT_EXTENSIONS = {".txt", ".csv"}
	SUPPORTED_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff"}
	# Union of every accepted suffix; PDFs are handled separately from images.
	SUPPORTED_EXTENSIONS = SUPPORTED_TEXT_EXTENSIONS | SUPPORTED_IMAGE_EXTENSIONS | {".pdf"}

	# Page limit applied to PDFs when the caller does not pass ``max_pages``.
	_DEFAULT_MAX_PAGES = 3

	# Same scale factor on both axes when rasterising PDF pages.
	_PDF_RENDER_ZOOM = 2.5
	_PDF_RENDER_MATRIX = fitz.Matrix(_PDF_RENDER_ZOOM, _PDF_RENDER_ZOOM)

	# Bounding-box edge (pixels) used when thumbnailing uploaded images.
	_MAX_IMAGE_EDGE = 2048


	def validate_upload(path: str) -> Path:
	    """Return *path* as a ``Path`` after checking it is a supported upload.

	    Args:
	        path: Filesystem location of the uploaded document.

	    Returns:
	        The same location wrapped in a ``Path``.

	    Raises:
	        ValueError: If *path* does not point at a regular file (missing, or
	            a directory), or if its lower-cased suffix is not listed in
	            ``SUPPORTED_EXTENSIONS``.
	    """
	    file_path = Path(path)
	    # is_file() rather than exists(): a directory (for example one named
	    # "scan.pdf", or "" which resolves to the current directory) must be
	    # rejected here instead of failing later inside the format readers.
	    if not file_path.is_file():
	        raise ValueError("Uploaded file could not be found.")

	    extension = file_path.suffix.lower()
	    if extension not in SUPPORTED_EXTENSIONS:
	        allowed = ", ".join(sorted(SUPPORTED_EXTENSIONS))
	        raise ValueError(f"Unsupported file type `{extension}`. Supported types: {allowed}.")

	    return file_path


	def document_intake_metadata(path: str, parts: list[dict]) -> dict[str, object]:
	    """Summarise an upload and its message parts for trace output.

	    Only counts and lengths are reported; the contents of the parts (such as
	    base64 image data) are never copied into the result.

	    Args:
	        path: Location of the uploaded file; only its suffix is inspected.
	        parts: Message parts, each a dict with a ``"type"`` key.

	    Returns:
	        A dict with the lower-cased source suffix, the input modality
	        (``"vision"`` when any ``image_url`` part is present, else
	        ``"text"``), the number of rendered pages (PDF sources only,
	        otherwise ``None``), the image part count, and the total length of
	        all ``text`` parts.
	    """
	    suffix = Path(path).suffix.lower()

	    image_parts = [part for part in parts if part.get("type") == "image_url"]
	    n_images = len(image_parts)

	    n_characters = 0
	    for part in parts:
	        if part.get("type") != "text":
	            continue
	        # A missing or falsy "text" value counts as an empty string.
	        n_characters += len(str(part.get("text") or ""))

	    if n_images:
	        modality = "vision"
	    else:
	        modality = "text"

	    # Page counts are only meaningful when the source was a PDF.
	    rendered = n_images if suffix == ".pdf" else None

	    return {
	        "source_extension": suffix,
	        "input_modality": modality,
	        "pages_rendered": rendered,
	        "image_count": n_images,
	        "text_characters": n_characters,
	    }


	def document_to_payload_parts(path: str, max_pages: int | None = None) -> list[dict]:
	    """Build OpenAI-compatible message parts from an uploaded document.

	    PDFs yield one ``image_url`` part per rendered page, image files yield a
	    single ``image_url`` part, and text/CSV files yield a single ``text``
	    part.

	    Args:
	        path: Filesystem location of the uploaded document.
	        max_pages: Maximum number of PDF pages to render. ``None`` selects
	            ``_DEFAULT_MAX_PAGES``. Only used for PDF uploads.

	    Returns:
	        The list of message parts for the document.

	    Raises:
	        ValueError: If the upload fails validation, if a format helper
	            rejects the file, or if the suffix matches no known handler.
	    """
	    file_path = validate_upload(path)
	    extension = file_path.suffix.lower()
	    page_limit = _DEFAULT_MAX_PAGES if max_pages is None else max_pages

	    if extension == ".pdf":
	        return _pdf_to_image_parts(file_path, max_pages=page_limit)

	    if extension in SUPPORTED_IMAGE_EXTENSIONS:
	        return [_image_part(file_path)]

	    if extension in SUPPORTED_TEXT_EXTENSIONS:
	        return [{"type": "text", "text": _read_text_file(file_path)}]

	    # Defensive: validate_upload already rejects unknown suffixes, so this
	    # only triggers if the extension sets and the branches above diverge.
	    raise ValueError(f"Unsupported file type `{extension}`.")


	def _pdf_to_image_parts(file_path: Path, max_pages: int) -> list[dict]:
	    """Render the leading pages of a PDF as PNG ``image_url`` message parts.

	    Args:
	        file_path: Location of the PDF to open.
	        max_pages: Requested page limit; values below 1 are treated as 1.

	    Returns:
	        One part per rendered page, in page order, each carrying a
	        ``data:image/png;base64,...`` URL.

	    Raises:
	        ValueError: If the PDF reports zero pages.
	    """
	    data_urls: list[str] = []
	    with fitz.open(file_path) as pdf:
	        total_pages = pdf.page_count
	        if total_pages == 0:
	            raise ValueError("The uploaded PDF does not contain any pages.")

	        # Always render at least one page, never more than the PDF holds.
	        render_count = min(total_pages, max(1, max_pages))
	        for index in range(render_count):
	            png_bytes = (
	                pdf.load_page(index)
	                .get_pixmap(matrix=_PDF_RENDER_MATRIX, alpha=False)
	                .tobytes("png")
	            )
	            payload = base64.b64encode(png_bytes).decode("ascii")
	            data_urls.append(f"data:image/png;base64,{payload}")

	    return [{"type": "image_url", "image_url": {"url": url}} for url in data_urls]


	def _image_part(file_path: Path) -> dict:
	    """Encode an image file as a JPEG ``image_url`` message part.

	    The image is rotated according to its EXIF orientation, flattened onto a
	    white background if it carries transparency, shrunk to fit within
	    ``_MAX_IMAGE_EDGE`` pixels on each side, and re-encoded as JPEG.

	    Args:
	        file_path: Location of the image to read.

	    Returns:
	        A message part whose ``image_url.url`` is a
	        ``data:image/jpeg;base64,...`` URL.
	    """
	    # Imported lazily so Pillow is only required when an image is processed.
	    from PIL import Image, ImageOps

	    with Image.open(file_path) as source:
	        image = ImageOps.exif_transpose(source)
	        if image.mode in ("RGBA", "LA", "PA") or "transparency" in image.info:
	            # JPEG has no alpha channel, and a bare convert("RGB") merely
	            # drops alpha, exposing whatever colour is stored under the
	            # transparent pixels (commonly black). Composite onto white so
	            # documents with transparent backgrounds stay legible.
	            rgba = image.convert("RGBA")
	            backdrop = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
	            image = Image.alpha_composite(backdrop, rgba)
	        image = image.convert("RGB")
	        # thumbnail() resizes in place, keeps aspect ratio, never enlarges.
	        image.thumbnail((_MAX_IMAGE_EDGE, _MAX_IMAGE_EDGE))
	        buffer = io.BytesIO()
	        image.save(buffer, format="JPEG", quality=90, optimize=True)

	    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
	    return {
	        "type": "image_url",
	        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
	    }


	def _read_text_file(file_path: Path, max_characters: int = 20000) -> str:
	    """Read an uploaded text file and return its trimmed, truncated contents.

	    Args:
	        file_path: Location of the text or CSV file.
	        max_characters: Upper bound on the number of characters returned;
	            longer files are cut off at this length.

	    Returns:
	        The file contents with surrounding whitespace removed, limited to
	        *max_characters* characters.

	    Raises:
	        ValueError: If the file contains nothing but whitespace (and,
	            optionally, a byte-order mark).
	    """
	    # "utf-8-sig" decodes plain UTF-8 identically but also consumes a leading
	    # byte-order mark, which str.strip() would not remove. Undecodable bytes
	    # are replaced rather than raising.
	    text = file_path.read_text(encoding="utf-8-sig", errors="replace").strip()
	    if not text:
	        raise ValueError("The uploaded text file is empty.")
	    return text[:max_characters]