Spaces:
Running on Zero
Running on Zero
| """ | |
| No-model tests for PDF input handling (INPUT-02, Phase 03-02). | |
| These tests NEVER load model weights — BUREAUCAT_NO_MODEL=1 is set before importing app. | |
| Test coverage: | |
| (a) pdf_to_images() renders a valid in-memory PDF to a non-empty list[PIL.Image], | |
| capped at MAX_PAGES_HARD=5 pages. | |
| (b) A corrupt PDF passed through the decode() dispatch returns | |
| StructuredResult(doctype="unreadable", severity=None) β no exception raised. | |
| (c) decode([]) returns the existing "Please upload..." StructuredResult | |
| (model-free, identical to the existing empty-input guard in test_multi_image.py). | |
| (d) SC2 END-TO-END (model-guarded): a real single-page letter PDF through the full | |
| decode([<pdf_path>], "English", False) path asserts doctype=="letter" and | |
| severity is not None β proves a PDF yields the same four-section analysis as | |
| a JPEG via the existing run_inference_multi path. | |
| SKIPPED when BUREAUCAT_NO_MODEL=1 (mirrors the MPS-integration tier in 03-01). | |
| SC2 is CONFIRMED at the Phase 3 human checkpoint (03-03 Task 4) on real weights. | |
| PII safety: all fixtures are programmatically generated from synthetic content only. | |
| No real Swedish personnummer patterns are introduced. | |
| """ | |
| import io | |
| import os | |
| import sys | |
| import tempfile | |
| import pytest | |
| from PIL import Image as PILImage | |
# Set the escape hatch BEFORE importing app so model weights are never downloaded.
# app.py treats ANY non-empty value as "skip model" (os.getenv truthiness), so an
# explicit "0" would still count as "skip". To opt into the SC2 model tier, run:
#   BUREAUCAT_NO_MODEL=0 python -m pytest tests/test_pdf_input.py -k SC2
# In that case the variable is removed entirely so that it reads as unset.
if os.environ.get("BUREAUCAT_NO_MODEL") == "0":
    del os.environ["BUREAUCAT_NO_MODEL"]
else:
    # Default to no-model mode; setdefault keeps any value already exported.
    os.environ.setdefault("BUREAUCAT_NO_MODEL", "1")
# Put the directory one level above this file's directory (the project root when
# this file lives in tests/) at the front of sys.path so `import app` resolves.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
| import app | |
| from app import StructuredResult, MAX_PAGES_HARD | |
# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------
def _make_in_memory_pdf(num_pages: int = 1) -> bytes:
    """
    Build a minimal valid PDF in memory using Pillow.

    Pillow can save images as PDF pages; using save_all + append_images creates
    a multi-page PDF. This is the lightest approach — no extra dependencies beyond
    Pillow (already in requirements.txt).

    Args:
        num_pages: Number of pages to generate; must be at least 1.

    Returns:
        Raw PDF bytes (no personnummer; synthetic colored squares only).

    Raises:
        ValueError: If num_pages is less than 1 (a PDF needs a first page to
            save; without this guard the failure is an opaque IndexError).
    """
    if num_pages < 1:
        raise ValueError(f"num_pages must be >= 1, got {num_pages}")
    pages = [
        PILImage.new("RGB", (100, 100), color=(i * 30 % 255, 100, 200))
        for i in range(num_pages)
    ]
    buf = io.BytesIO()
    # Pillow ≥10: save first image, append remaining as pages
    pages[0].save(
        buf,
        format="PDF",
        save_all=True,
        append_images=pages[1:],
    )
    return buf.getvalue()
def _write_tmp_pdf(pdf_bytes: bytes) -> str:
    """Persist *pdf_bytes* to a new ``.pdf`` temp file and return its path.

    The file is created with ``delete=False``, so it is still on disk after
    this function returns; removing it is left to the caller.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        handle.write(pdf_bytes)
    finally:
        # Always close so the bytes are flushed and the descriptor is released.
        handle.close()
    return handle.name
# ---------------------------------------------------------------------------
# (a) pdf_to_images renders valid PDF bytes to PIL Images, capped at MAX_PAGES_HARD
# ---------------------------------------------------------------------------
class TestPdfToImages:
    """Model-free unit tests covering app.pdf_to_images()."""

    def test_single_page_pdf_returns_one_image(self):
        """One page in: the result is a list holding exactly one PIL Image."""
        rendered = app.pdf_to_images(_make_in_memory_pdf(num_pages=1))
        assert isinstance(rendered, list), "pdf_to_images must return a list"
        assert len(rendered) == 1, f"Expected 1 image, got {len(rendered)}"
        first_page = rendered[0]
        assert isinstance(first_page, PILImage.Image), (
            f"Expected PIL.Image.Image, got {type(first_page)}"
        )

    def test_multi_page_pdf_renders_all_pages(self):
        """Three pages in, three images out when the page count is under the cap."""
        page_count = len(app.pdf_to_images(_make_in_memory_pdf(num_pages=3)))
        assert page_count == 3, f"Expected 3 images, got {page_count}"

    def test_pdf_capped_at_max_pages_hard(self):
        """Output is truncated to MAX_PAGES_HARD when the PDF has more pages."""
        # Two pages over the limit, so the cap has to fire for this to pass.
        oversized_pdf = _make_in_memory_pdf(num_pages=MAX_PAGES_HARD + 2)
        page_count = len(app.pdf_to_images(oversized_pdf))
        assert page_count == MAX_PAGES_HARD, (
            f"Expected {MAX_PAGES_HARD} images (cap), got {page_count}"
        )

    def test_pdf_custom_max_pages_respected(self):
        """An explicit max_pages argument limits the number of rendered pages."""
        five_page_pdf = _make_in_memory_pdf(num_pages=5)
        page_count = len(app.pdf_to_images(five_page_pdf, max_pages=2))
        assert page_count == 2, f"Expected 2 images with max_pages=2, got {page_count}"

    def test_rendered_images_are_not_empty(self):
        """Every rendered page reports a width and height greater than zero."""
        rendered = app.pdf_to_images(_make_in_memory_pdf(num_pages=2))
        for index, page in enumerate(rendered):
            width, height = page.size
            assert 0 < width and 0 < height, f"Image {index} has zero dimension: {page.size}"
# ---------------------------------------------------------------------------
# (b) Corrupt PDF through decode() → doctype="unreadable", no exception
# ---------------------------------------------------------------------------
class TestCorruptPdfRefusal:
    """Malformed PDF input must land on the slice-1 refusal path (T-03-04)."""

    def test_corrupt_pdf_bytes_returns_unreadable_doctype(self):
        """
        Garbage bytes saved under a .pdf suffix: decode() has to answer with
        doctype="unreadable" and severity=None instead of raising.
        """
        pdf_path = _write_tmp_pdf(b"This is not a valid PDF file\x00\xff\xfegarbage")
        try:
            outcome = app.decode([pdf_path], "English", False)
            assert isinstance(outcome, StructuredResult), (
                f"Expected StructuredResult, got {type(outcome)}"
            )
            assert outcome.doctype == "unreadable", (
                f"Expected doctype='unreadable', got {outcome.doctype!r}"
            )
            assert outcome.severity is None, (
                f"Expected severity=None for corrupt PDF, got {outcome.severity}"
            )
        finally:
            # The helper leaves the temp file on disk; always clean it up.
            os.unlink(pdf_path)

    def test_corrupt_pdf_does_not_raise(self):
        """No exception may escape decode() when the PDF is corrupt."""
        # PNG magic bytes inside a .pdf file — a different flavour of corrupt.
        pdf_path = _write_tmp_pdf(b"\x89PNG\r\n\x1a\n")
        try:
            # Any exception raised here fails the test, which is the point.
            outcome = app.decode([pdf_path], "English", False)
            assert isinstance(outcome, StructuredResult)
        finally:
            os.unlink(pdf_path)

    def test_truncated_pdf_header_returns_unreadable(self):
        """A file that opens with %PDF- but is cut short is reported unreadable."""
        # Valid header and object start, but nothing after it.
        pdf_path = _write_tmp_pdf(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog")
        try:
            outcome = app.decode([pdf_path], "English", False)
            assert outcome.doctype == "unreadable"
            assert outcome.severity is None
        finally:
            os.unlink(pdf_path)
# ---------------------------------------------------------------------------
# (c) Empty-input guard (unchanged from pre-03-02 behavior)
# ---------------------------------------------------------------------------
class TestDecodeEmptyInputGuard:
    """Both decode([]) and decode(None) must yield the existing error sentinel."""

    def test_decode_empty_list_returns_upload_prompt(self):
        """An empty list produces a StructuredResult whose .raw is the upload prompt."""
        outcome = app.decode([], "English", False)
        assert isinstance(outcome, StructuredResult)
        # Either keyword is accepted as evidence of the upload prompt.
        assert any(word in outcome.raw.lower() for word in ("upload", "please")), (
            f"Expected upload prompt in .raw, got: {outcome.raw!r}"
        )
        assert outcome.severity is None
        assert outcome.doctype == "letter"

    def test_decode_none_returns_error_sentinel(self):
        """None as input produces a StructuredResult with a non-empty .raw."""
        outcome = app.decode(None, "English", False)
        assert isinstance(outcome, StructuredResult)
        assert outcome.raw, "Expected non-empty .raw for None input"
# ---------------------------------------------------------------------------
# (d) SC2 END-TO-END (model-guarded) — SKIPPED when BUREAUCAT_NO_MODEL=1
# ---------------------------------------------------------------------------
# The condition is evaluated once, when this module is imported, i.e. after the
# environment handling at the top of this file has run. Any non-empty value counts
# as "no model", matching the truthiness check the file's header comment describes.
@pytest.mark.skipif(
    bool(os.environ.get("BUREAUCAT_NO_MODEL")),
    reason="SC2 end-to-end needs real model weights; opt in with BUREAUCAT_NO_MODEL=0",
)
class TestSC2PdfEndToEnd:
    """
    SC2 end-to-end: a real single-page letter PDF through the full decode() path.

    Asserts doctype=="letter" and severity is not None — proving a PDF yields the
    same four-section analysis as a JPEG, not merely that plumbing is wired.

    Fixture: build from an existing synthetic gold letter PNG (no real PII).
    Skipped whenever BUREAUCAT_NO_MODEL is set to a non-empty value.
    """

    def test_single_page_pdf_decodes_to_letter_with_severity(self):
        """
        SC2: a synthetic gold letter rendered as a one-page PDF through decode()
        returns a StructuredResult with doctype='letter' and a non-None severity.
        """
        # Pick any existing synthetic gold letter PNG as the source image.
        gold_png = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "data", "letters", "public", "skatteverket-slutskattebesked.png",
        )
        assert os.path.exists(gold_png), (
            f"SC2 fixture image not found: {gold_png}\n"
            "Ensure the synthetic gold set is present (data/letters/public/)."
        )
        # Convert PNG → in-memory PDF → temp .pdf file (Pillow, no external deps).
        # The context manager closes the PNG file handle; convert() returns a
        # separate in-memory image that stays usable afterwards.
        with PILImage.open(gold_png) as opened:
            src_img = opened.convert("RGB")
        buf = io.BytesIO()
        src_img.save(buf, format="PDF")
        tmp_path = _write_tmp_pdf(buf.getvalue())
        try:
            result = app.decode([tmp_path], "English", False)
            assert isinstance(result, StructuredResult), (
                f"Expected StructuredResult, got {type(result)}"
            )
            assert result.doctype == "letter", (
                f"SC2 FAIL: expected doctype='letter', got {result.doctype!r}. "
                "The PDF dispatch may have routed to the refusal path."
            )
            assert result.severity is not None, (
                "SC2 FAIL: severity is None — model output may have been truncated "
                "or the PDF rendering path did not reach run_inference_multi."
            )
        finally:
            # _write_tmp_pdf leaves the file on disk; remove it even on failure.
            os.unlink(tmp_path)
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; pass it to the interpreter so
    # running this file directly reports test failures to the shell/CI instead
    # of always exiting with status 0.
    sys.exit(pytest.main([__file__, "-v"]))