""" Image-only / scanned-PDF path: detect no-text PDFs, render pages to images, and build a deck from them. The render/detect plumbing is testable without a GPU; the model call is exercised only in stub mode here (real vision inference is covered by the deploy, not locally). Skips when the real-model image deps (PyMuPDF + Pillow) aren't installed — they live in requirements-model.txt. RECALL_STUB=1 python3 -m pytest test_image_pdf.py """ import io import os import tempfile import pytest os.environ.setdefault("RECALL_STUB", "1") fitz = pytest.importorskip("fitz", reason="needs PyMuPDF (real-model dep)") PIL_Image = pytest.importorskip("PIL.Image", reason="needs Pillow (real-model dep)") from PIL import Image, ImageDraw # noqa: E402 import content_pipeline as cp # noqa: E402 def _make_scanned_pdf(pages: int = 2) -> str: """A PDF with no text layer — each page is a rasterized image.""" fd, path = tempfile.mkstemp(suffix=".pdf") os.close(fd) doc = fitz.open() for i in range(pages): img = Image.new("RGB", (600, 800), "white") ImageDraw.Draw(img).text((40, 40), f"Scanned page {i + 1}", fill="black") buf = io.BytesIO() img.save(buf, format="PNG") page = doc.new_page(width=600, height=800) page.insert_image(fitz.Rect(0, 0, 600, 800), stream=buf.getvalue()) doc.save(path) doc.close() return path def _make_text_pdf() -> str: fd, path = tempfile.mkstemp(suffix=".pdf") os.close(fd) doc = fitz.open() doc.new_page().insert_text((72, 72), "Selectable text about photosynthesis.") doc.save(path) doc.close() return path def test_is_image_only_pdf_detects_scanned(): path = _make_scanned_pdf() try: assert cp.is_image_only_pdf(path) is True finally: os.unlink(path) def test_text_pdf_is_not_image_only(): path = _make_text_pdf() try: assert cp.is_image_only_pdf(path) is False finally: os.unlink(path) def test_non_pdf_is_not_image_only_and_renders_nothing(): assert cp.is_image_only_pdf("notes.txt") is False assert cp.render_pdf_images("notes.txt") == [] def test_render_pdf_images_returns_pages(): path = _make_scanned_pdf(pages=3) try: images = cp.render_pdf_images(path) assert len(images) == 3 assert all(isinstance(im, Image.Image) for im in images) finally: os.unlink(path) def test_render_pdf_images_respects_max_pages(): path = _make_scanned_pdf(pages=5) try: assert len(cp.render_pdf_images(path, max_pages=2)) == 2 finally: os.unlink(path) def test_generate_deck_from_images_stub(): # STUB returns the canned demo deck regardless of the images passed. Pin # llm.STUB explicitly (and restore) so this is robust to other test files' # import-order / STUB mutations rather than relying on ambient state. import llm prev = llm.STUB llm.STUB = True try: deck = cp.generate_deck_from_images([object(), object()]) finally: llm.STUB = prev assert isinstance(deck, list) and len(deck) >= 1 assert all(c.get("question") and c.get("answer") for c in deck) if __name__ == "__main__": test_is_image_only_pdf_detects_scanned() test_text_pdf_is_not_image_only() test_non_pdf_is_not_image_only_and_renders_nothing() test_render_pdf_images_returns_pages() test_render_pdf_images_respects_max_pages() test_generate_deck_from_images_stub() print("All image-PDF tests passed.")