Spaces:

build-small-hackathon
/

bureaucat

Running on Zero

App Files Files Community

bureaucat / tests /test_pdf_input.py

ravinsingh15

Bureaucat — Build Small Hackathon submission (Qwen3-VL-8B, ZeroGPU, gr.Server)

6b5e47d 21 days ago

Raw

History Blame Contribute Delete

11.5 kB

	"""
	No-model tests for PDF input handling (INPUT-02, Phase 03-02).

	These tests NEVER load model weights — BUREAUCAT_NO_MODEL=1 is set before importing app.

	Test coverage:
	(a) pdf_to_images() renders a valid in-memory PDF to a non-empty list[PIL.Image],
	capped at MAX_PAGES_HARD=5 pages.
	(b) A corrupt PDF passed through the decode() dispatch returns
	StructuredResult(doctype="unreadable", severity=None) — no exception raised.
	(c) decode([]) returns the existing "Please upload..." StructuredResult
	(model-free, identical to the existing empty-input guard in test_multi_image.py).
	(d) SC2 END-TO-END (model-guarded): a real single-page letter PDF through the full
	decode([<pdf_path>], "English", False) path asserts doctype=="letter" and
	severity is not None — proves a PDF yields the same four-section analysis as
	a JPEG via the existing run_inference_multi path.
	SKIPPED when BUREAUCAT_NO_MODEL=1 (mirrors the MPS-integration tier in 03-01).
	SC2 is CONFIRMED at the Phase 3 human checkpoint (03-03 Task 4) on real weights.

	PII safety: all fixtures are programmatically generated from synthetic content only.
	No real Swedish personnummer patterns are introduced.
	"""

	import io
	import os
	import sys
	import tempfile

	import pytest
	from PIL import Image as PILImage

	# Decide the no-model escape hatch BEFORE `import app`, so importing it never
	# downloads model weights.
	# app.py skips the model for ANY non-empty BUREAUCAT_NO_MODEL value (os.getenv
	# truthiness). An explicit "0" is therefore the opt-in for the SC2 model tier and
	# has to be removed from the environment rather than left in place:
	#   BUREAUCAT_NO_MODEL=0 python -m pytest tests/test_pdf_input.py -k SC2
	if os.environ.get("BUREAUCAT_NO_MODEL") != "0":
	    # No explicit opt-in: default to the no-model tier, keeping any value the
	    # caller already exported.
	    os.environ.setdefault("BUREAUCAT_NO_MODEL", "1")
	else:
	    # Explicit opt-in: drop the variable so app.py sees it as unset.
	    del os.environ["BUREAUCAT_NO_MODEL"]

	# Ensure project root is on the path when running from tests/
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	import app
	from app import StructuredResult, MAX_PAGES_HARD


	# ---------------------------------------------------------------------------
	# Fixture helpers
	# ---------------------------------------------------------------------------

	def _make_in_memory_pdf(num_pages: int = 1) -> bytes:
	"""
	Build a minimal valid PDF in memory using Pillow.

	Pillow can save images as PDF pages; using save_all + append_images creates
	a multi-page PDF. This is the lightest approach — no extra dependencies beyond
	Pillow (already in requirements.txt).

	Returns raw PDF bytes (no personnummer; synthetic colored squares only).
	"""
	imgs = [PILImage.new("RGB", (100, 100), color=(i * 30 % 255, 100, 200))
	for i in range(num_pages)]
	buf = io.BytesIO()
	# Pillow ≥10: save first image, append remaining as pages
	imgs[0].save(
	buf,
	format="PDF",
	save_all=True,
	append_images=imgs[1:],
	)
	return buf.getvalue()


	def _write_tmp_pdf(pdf_bytes: bytes) -> str:
	"""Write PDF bytes to a NamedTemporaryFile and return the path (delete=False)."""
	with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
	fh.write(pdf_bytes)
	return fh.name


	# ---------------------------------------------------------------------------
	# (a) pdf_to_images renders valid PDF bytes to PIL Images, capped at MAX_PAGES_HARD
	# ---------------------------------------------------------------------------

	class TestPdfToImages:
	    """Model-free unit tests covering app.pdf_to_images()."""

	    def test_single_page_pdf_returns_one_image(self):
	        """One page in, exactly one PIL Image out (inside a list)."""
	        rendered = app.pdf_to_images(_make_in_memory_pdf(num_pages=1))
	        assert isinstance(rendered, list), "pdf_to_images must return a list"
	        assert len(rendered) == 1, f"Expected 1 image, got {len(rendered)}"
	        assert isinstance(rendered[0], PILImage.Image), (
	            f"Expected PIL.Image.Image, got {type(rendered[0])}"
	        )

	    def test_multi_page_pdf_renders_all_pages(self):
	        """Three pages stay three images while the page count is under the cap."""
	        rendered = app.pdf_to_images(_make_in_memory_pdf(num_pages=3))
	        assert len(rendered) == 3, f"Expected 3 images, got {len(rendered)}"

	    def test_pdf_capped_at_max_pages_hard(self):
	        """Pages beyond MAX_PAGES_HARD are dropped from the rendered output."""
	        # Two pages over the limit, so the truncation has to take effect.
	        oversized_pdf = _make_in_memory_pdf(num_pages=MAX_PAGES_HARD + 2)
	        rendered = app.pdf_to_images(oversized_pdf)
	        assert len(rendered) == MAX_PAGES_HARD, (
	            f"Expected {MAX_PAGES_HARD} images (cap), got {len(rendered)}"
	        )

	    def test_pdf_custom_max_pages_respected(self):
	        """An explicit max_pages argument limits the number of rendered pages."""
	        five_page_pdf = _make_in_memory_pdf(num_pages=5)
	        rendered = app.pdf_to_images(five_page_pdf, max_pages=2)
	        assert len(rendered) == 2, f"Expected 2 images with max_pages=2, got {len(rendered)}"

	    def test_rendered_images_are_not_empty(self):
	        """Every rendered page must have a strictly positive width and height."""
	        rendered = app.pdf_to_images(_make_in_memory_pdf(num_pages=2))
	        for idx, page in enumerate(rendered):
	            width, height = page.size
	            assert not (width <= 0 or height <= 0), (
	                f"Image {idx} has zero dimension: {page.size}"
	            )


	# ---------------------------------------------------------------------------
	# (b) Corrupt PDF through decode() → doctype="unreadable", no exception
	# ---------------------------------------------------------------------------

	class TestCorruptPdfRefusal:
	    """Malformed PDF bytes must land on the slice-1 refusal path (T-03-04)."""

	    @staticmethod
	    def _decode_pdf_payload(payload):
	        """Write payload to a temp .pdf, decode it, and always remove the file."""
	        pdf_path = _write_tmp_pdf(payload)
	        try:
	            # Any exception raised here propagates and fails the calling test.
	            return app.decode([pdf_path], "English", False)
	        finally:
	            os.unlink(pdf_path)

	    def test_corrupt_pdf_bytes_returns_unreadable_doctype(self):
	        """
	        Garbage bytes stored under a .pdf name: decode() has to answer with
	        doctype="unreadable" and severity=None instead of raising.
	        """
	        outcome = self._decode_pdf_payload(
	            b"This is not a valid PDF file\x00\xff\xfegarbage"
	        )
	        assert isinstance(outcome, StructuredResult), (
	            f"Expected StructuredResult, got {type(outcome)}"
	        )
	        assert outcome.doctype == "unreadable", (
	            f"Expected doctype='unreadable', got {outcome.doctype!r}"
	        )
	        assert outcome.severity is None, (
	            f"Expected severity=None for corrupt PDF, got {outcome.severity}"
	        )

	    def test_corrupt_pdf_does_not_raise(self):
	        """decode() must swallow the failure for corrupt PDFs, not propagate it."""
	        # PNG magic bytes inside a .pdf file — corrupt from a PDF point of view.
	        outcome = self._decode_pdf_payload(b"\x89PNG\r\n\x1a\n")
	        assert isinstance(outcome, StructuredResult)

	    def test_truncated_pdf_header_returns_unreadable(self):
	        """A file with a valid %PDF- start but no ending is reported unreadable."""
	        outcome = self._decode_pdf_payload(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog")
	        assert outcome.doctype == "unreadable"
	        assert outcome.severity is None


	# ---------------------------------------------------------------------------
	# (c) Empty-input guard (unchanged from pre-03-02 behavior)
	# ---------------------------------------------------------------------------

	class TestDecodeEmptyInputGuard:
	    """Empty inputs (``[]`` and ``None``) must yield the existing error sentinel."""

	    def test_decode_empty_list_returns_upload_prompt(self):
	        """decode([]) answers with an upload prompt, no severity, doctype 'letter'."""
	        outcome = app.decode([], "English", False)
	        assert isinstance(outcome, StructuredResult)
	        # Either keyword is accepted as evidence of the upload prompt.
	        assert any(
	            keyword in outcome.raw.lower() for keyword in ("upload", "please")
	        ), f"Expected upload prompt in .raw, got: {outcome.raw!r}"
	        assert outcome.severity is None
	        assert outcome.doctype == "letter"

	    def test_decode_none_returns_error_sentinel(self):
	        """decode(None) answers with a StructuredResult whose .raw is non-empty."""
	        outcome = app.decode(None, "English", False)
	        assert isinstance(outcome, StructuredResult)
	        assert outcome.raw, "Expected non-empty .raw for None input"


	# ---------------------------------------------------------------------------
	# (d) SC2 END-TO-END (model-guarded) — SKIPPED when BUREAUCAT_NO_MODEL=1
	# ---------------------------------------------------------------------------

	@pytest.mark.skipif(
	    # Truthiness, not == "1": this module's setup comment states that app.py treats
	    # ANY non-empty BUREAUCAT_NO_MODEL value as "skip model", so values such as
	    # "true" must skip this model-dependent test as well.
	    bool(os.environ.get("BUREAUCAT_NO_MODEL")),
	    reason="SC2 model-guarded test: skipped in no-model CI tier; runs on MPS/CUDA with weights loaded",
	)
	class TestSC2PdfEndToEnd:
	    """
	    SC2 end-to-end: a real single-page letter PDF through the full decode() path.

	    Asserts doctype=="letter" and severity is not None — proving a PDF yields the
	    same four-section analysis as a JPEG, not merely that plumbing is wired.

	    Fixture: build from an existing synthetic gold letter PNG (no real PII).
	    """

	    def test_single_page_pdf_decodes_to_letter_with_severity(self):
	        """
	        SC2: a synthetic gold letter rendered as a one-page PDF through decode()
	        returns a StructuredResult with doctype='letter' and a non-None severity.
	        """
	        # Pick any existing synthetic gold letter PNG as the source image.
	        gold_png = os.path.join(
	            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
	            "data", "letters", "public", "skatteverket-slutskattebesked.png",
	        )
	        assert os.path.exists(gold_png), (
	            f"SC2 fixture image not found: {gold_png}\n"
	            "Ensure the synthetic gold set is present (data/letters/public/)."
	        )

	        # Convert PNG → in-memory PDF → temp .pdf file (Pillow, no external deps).
	        # The context manager closes the PNG file handle once the pixel data has
	        # been copied by convert().
	        with PILImage.open(gold_png) as opened_png:
	            src_img = opened_png.convert("RGB")
	        buf = io.BytesIO()
	        src_img.save(buf, format="PDF")
	        pdf_bytes = buf.getvalue()

	        tmp_path = _write_tmp_pdf(pdf_bytes)
	        try:
	            result = app.decode([tmp_path], "English", False)
	            assert isinstance(result, StructuredResult), (
	                f"Expected StructuredResult, got {type(result)}"
	            )
	            assert result.doctype == "letter", (
	                f"SC2 FAIL: expected doctype='letter', got {result.doctype!r}. "
	                "The PDF dispatch may have routed to the refusal path."
	            )
	            assert result.severity is not None, (
	                "SC2 FAIL: severity is None — model output may have been truncated "
	                "or the PDF rendering path did not reach run_inference_multi."
	            )
	        finally:
	            # Always remove the temp PDF, even when an assertion above fails.
	            os.unlink(tmp_path)


	if __name__ == "__main__":
	    # Propagate pytest's exit status so that running this file directly as a
	    # script exits non-zero when any test fails.
	    sys.exit(pytest.main([__file__, "-v"]))