study-partner / test_image_pdf.py
nz-nz's picture
Sync from GitHub via hub-sync
058157a verified
Raw
History Blame Contribute Delete
3.54 kB
"""
Image-only / scanned-PDF path: detect no-text PDFs, render pages to images, and
build a deck from them. The render/detect plumbing is testable without a GPU;
the model call is exercised only in stub mode here (real vision inference is
covered by the deploy, not locally). Skips when the real-model image deps
(PyMuPDF + Pillow) aren't installed — they live in requirements-model.txt.
Run with:
    RECALL_STUB=1 python3 -m pytest test_image_pdf.py
"""
import io
import os
import tempfile
import pytest
os.environ.setdefault("RECALL_STUB", "1")
fitz = pytest.importorskip("fitz", reason="needs PyMuPDF (real-model dep)")
PIL_Image = pytest.importorskip("PIL.Image", reason="needs Pillow (real-model dep)")
from PIL import Image, ImageDraw # noqa: E402
import content_pipeline as cp # noqa: E402
def _make_scanned_pdf(pages: int = 2) -> str:
    """Create a temporary PDF with no text layer and return its path.

    Each page is a 600x800 white PNG with a short caption drawn onto the
    bitmap, inserted as a full-page image, so the text exists only as pixels.

    Args:
        pages: Number of image-only pages to generate.

    Returns:
        Path of the generated ``.pdf`` file. The caller is responsible for
        deleting it.

    Raises:
        Exception: Whatever PyMuPDF/Pillow raise while building or saving the
            document (PyMuPDF is expected to reject saving a document with
            zero pages). The temp file is removed before re-raising.
    """
    fd, path = tempfile.mkstemp(suffix=".pdf")
    # mkstemp hands back an open descriptor; only the path is needed because
    # PyMuPDF writes the file itself.
    os.close(fd)
    try:
        doc = fitz.open()
        try:
            for i in range(pages):
                img = Image.new("RGB", (600, 800), "white")
                ImageDraw.Draw(img).text(
                    (40, 40), f"Scanned page {i + 1}", fill="black"
                )
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                page = doc.new_page(width=600, height=800)
                page.insert_image(
                    fitz.Rect(0, 0, 600, 800), stream=buf.getvalue()
                )
            doc.save(path)
        finally:
            # Always release the document handle, even when building fails.
            doc.close()
    except Exception:
        # Don't leave an orphaned temp file behind when the PDF was not built.
        os.unlink(path)
        raise
    return path
def _make_text_pdf() -> str:
    """Create a temporary one-page PDF with a real text layer and return its path.

    Returns:
        Path of the generated ``.pdf`` file. The caller is responsible for
        deleting it.

    Raises:
        Exception: Whatever PyMuPDF raises while building or saving the
            document. The temp file is removed before re-raising.
    """
    fd, path = tempfile.mkstemp(suffix=".pdf")
    # mkstemp hands back an open descriptor; only the path is needed because
    # PyMuPDF writes the file itself.
    os.close(fd)
    try:
        doc = fitz.open()
        try:
            doc.new_page().insert_text(
                (72, 72), "Selectable text about photosynthesis."
            )
            doc.save(path)
        finally:
            # Always release the document handle, even when building fails.
            doc.close()
    except Exception:
        # Don't leave an orphaned temp file behind when the PDF was not built.
        os.unlink(path)
        raise
    return path
def test_is_image_only_pdf_detects_scanned():
    """A PDF built purely from page images is reported as image-only."""
    scanned_pdf = _make_scanned_pdf()
    try:
        detected = cp.is_image_only_pdf(scanned_pdf)
    finally:
        # Remove the fixture whether or not detection raised.
        os.unlink(scanned_pdf)
    assert detected is True
def test_text_pdf_is_not_image_only():
    """A PDF carrying inserted text is not reported as image-only."""
    text_pdf = _make_text_pdf()
    try:
        detected = cp.is_image_only_pdf(text_pdf)
    finally:
        # Remove the fixture whether or not detection raised.
        os.unlink(text_pdf)
    assert detected is False
def test_non_pdf_is_not_image_only_and_renders_nothing():
    """A non-PDF path is neither image-only nor rendered into any images."""
    non_pdf_path = "notes.txt"
    image_only = cp.is_image_only_pdf(non_pdf_path)
    assert image_only is False
    rendered = cp.render_pdf_images(non_pdf_path)
    assert rendered == []
def test_render_pdf_images_returns_pages():
    """Rendering a three-page scanned PDF yields three PIL images."""
    expected_pages = 3
    scanned_pdf = _make_scanned_pdf(pages=expected_pages)
    try:
        rendered = cp.render_pdf_images(scanned_pdf)
        assert len(rendered) == expected_pages
        for page_image in rendered:
            assert isinstance(page_image, Image.Image)
    finally:
        # Remove the fixture whether or not the checks passed.
        os.unlink(scanned_pdf)
def test_render_pdf_images_respects_max_pages():
    """``max_pages`` caps the number of rendered pages of a longer PDF."""
    scanned_pdf = _make_scanned_pdf(pages=5)
    try:
        rendered = cp.render_pdf_images(scanned_pdf, max_pages=2)
        assert len(rendered) == 2
    finally:
        # Remove the fixture whether or not the check passed.
        os.unlink(scanned_pdf)
def test_generate_deck_from_images_stub():
    """In stub mode, deck generation returns a non-empty list of Q/A cards."""
    # The stub ignores its inputs, so placeholder objects stand in for images.
    # llm.STUB is forced on here and restored afterwards so the test does not
    # depend on whatever state other test modules left behind.
    import llm

    placeholder_images = [object(), object()]
    saved_stub = llm.STUB
    llm.STUB = True
    try:
        deck = cp.generate_deck_from_images(placeholder_images)
    finally:
        llm.STUB = saved_stub

    assert isinstance(deck, list)
    assert len(deck) >= 1
    for card in deck:
        assert card.get("question") and card.get("answer")
if __name__ == "__main__":
    # Direct execution without pytest: run every test in declaration order and
    # report success only when none of them raised.
    for _test in (
        test_is_image_only_pdf_detects_scanned,
        test_text_pdf_is_not_image_only,
        test_non_pdf_is_not_image_only_and_renders_nothing,
        test_render_pdf_images_returns_pages,
        test_render_pdf_images_respects_max_pages,
        test_generate_deck_from_images_stub,
    ):
        _test()
    print("All image-PDF tests passed.")