Spaces:
Running on Zero
Running on Zero
| """ | |
| Image-only / scanned-PDF path: detect no-text PDFs, render pages to images, and | |
| build a deck from them. The render/detect plumbing is testable without a GPU; | |
| the model call is exercised only in stub mode here (real vision inference is | |
| covered by the deploy, not locally). Skips when the real-model image deps | |
| (PyMuPDF + Pillow) aren't installed — they live in requirements-model.txt. | |
| RECALL_STUB=1 python3 -m pytest test_image_pdf.py | |
| """ | |
| import io | |
| import os | |
| import tempfile | |
| import pytest | |
| os.environ.setdefault("RECALL_STUB", "1") | |
| fitz = pytest.importorskip("fitz", reason="needs PyMuPDF (real-model dep)") | |
| PIL_Image = pytest.importorskip("PIL.Image", reason="needs Pillow (real-model dep)") | |
| from PIL import Image, ImageDraw # noqa: E402 | |
| import content_pipeline as cp # noqa: E402 | |
def _make_scanned_pdf(pages: int = 2) -> str:
    """Create a temporary PDF with no text layer and return its path.

    Each page is a 600x800 white raster image with a "Scanned page N" label
    drawn onto the bitmap, so the label exists only as pixels.

    Args:
        pages: Number of pages to generate.

    Returns:
        Path of the temporary ``.pdf`` file. The caller is responsible for
        deleting it.

    Raises:
        Any exception raised while building or saving the document is
        propagated; the temporary file is removed first so nothing is left
        behind on failure.
    """
    fd, path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # only the path is needed; the document is saved by path
    try:
        doc = fitz.open()
        try:
            for i in range(pages):
                img = Image.new("RGB", (600, 800), "white")
                ImageDraw.Draw(img).text(
                    (40, 40), f"Scanned page {i + 1}", fill="black"
                )
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                page = doc.new_page(width=600, height=800)
                # Cover the whole page with the rendered bitmap.
                page.insert_image(
                    fitz.Rect(0, 0, 600, 800), stream=buf.getvalue()
                )
            doc.save(path)
        finally:
            # Release the document even when building or saving fails.
            doc.close()
    except BaseException:
        # Do not leak the mkstemp file when the PDF could not be produced.
        os.unlink(path)
        raise
    return path
def _make_text_pdf() -> str:
    """Create a temporary one-page PDF containing inserted text.

    Returns:
        Path of the temporary ``.pdf`` file. The caller is responsible for
        deleting it.

    Raises:
        Any exception raised while building or saving the document is
        propagated; the temporary file is removed first so nothing is left
        behind on failure.
    """
    fd, path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # only the path is needed; the document is saved by path
    try:
        doc = fitz.open()
        try:
            doc.new_page().insert_text(
                (72, 72), "Selectable text about photosynthesis."
            )
            doc.save(path)
        finally:
            # Release the document even when building or saving fails.
            doc.close()
    except BaseException:
        # Do not leak the mkstemp file when the PDF could not be produced.
        os.unlink(path)
        raise
    return path
def test_is_image_only_pdf_detects_scanned():
    """A PDF built purely from page images must be reported as image-only."""
    scanned_pdf = _make_scanned_pdf()
    try:
        detected = cp.is_image_only_pdf(scanned_pdf)
        assert detected is True
    finally:
        # Always remove the temp file, whether or not the assertion held.
        os.unlink(scanned_pdf)
def test_text_pdf_is_not_image_only():
    """A PDF with inserted text must not be reported as image-only."""
    text_pdf = _make_text_pdf()
    try:
        detected = cp.is_image_only_pdf(text_pdf)
        assert detected is False
    finally:
        # Always remove the temp file, whether or not the assertion held.
        os.unlink(text_pdf)
def test_non_pdf_is_not_image_only_and_renders_nothing():
    """A non-PDF filename is not image-only and renders to an empty list."""
    non_pdf_name = "notes.txt"

    image_only = cp.is_image_only_pdf(non_pdf_name)
    assert image_only is False

    rendered = cp.render_pdf_images(non_pdf_name)
    assert rendered == []
def test_render_pdf_images_returns_pages():
    """Rendering a three-page scanned PDF yields three PIL images."""
    pdf_path = _make_scanned_pdf(pages=3)
    try:
        rendered = cp.render_pdf_images(pdf_path)
        assert len(rendered) == 3
        # Every rendered page must be a PIL image object.
        for page_image in rendered:
            assert isinstance(page_image, Image.Image)
    finally:
        os.unlink(pdf_path)
def test_render_pdf_images_respects_max_pages():
    """``max_pages`` caps the number of rendered pages (5-page PDF, cap 2)."""
    pdf_path = _make_scanned_pdf(pages=5)
    try:
        rendered = cp.render_pdf_images(pdf_path, max_pages=2)
        assert len(rendered) == 2
    finally:
        os.unlink(pdf_path)
def test_generate_deck_from_images_stub():
    """In stub mode, deck generation returns a non-empty list of Q/A cards."""
    # In STUB mode the canned demo deck comes back no matter which images are
    # passed. llm.STUB is forced on here and put back afterwards, so the test
    # does not depend on import order or on other test files having changed
    # the flag.
    import llm

    saved_stub = llm.STUB
    llm.STUB = True
    try:
        placeholder_images = [object(), object()]
        deck = cp.generate_deck_from_images(placeholder_images)
    finally:
        llm.STUB = saved_stub

    assert isinstance(deck, list)
    assert len(deck) >= 1
    # Each card needs a truthy question and a truthy answer.
    for card in deck:
        assert card.get("question") and card.get("answer")
if __name__ == "__main__":
    # Direct-execution entry point: run every test in definition order. A
    # failing assertion raises before the summary line is printed.
    for _test in (
        test_is_image_only_pdf_detects_scanned,
        test_text_pdf_is_not_image_only,
        test_non_pdf_is_not_image_only_and_renders_nothing,
        test_render_pdf_images_returns_pages,
        test_render_pdf_images_respects_max_pages,
        test_generate_deck_from_images_stub,
    ):
        _test()
    print("All image-PDF tests passed.")