Spaces:

build-small-hackathon
/

study-partner

Running on Zero

App Files Files Community

study-partner / test_image_pdf.py

nz-nz

Sync from GitHub via hub-sync

058157a verified 15 days ago

Raw

History Blame Contribute Delete

3.54 kB

	"""
	Image-only / scanned-PDF path: detect no-text PDFs, render pages to images, and
	build a deck from them. The render/detect plumbing is testable without a GPU;
	the model call is exercised only in stub mode here (real vision inference is
	covered by the deploy, not locally). Skips when the real-model image deps
	(PyMuPDF + Pillow) aren't installed — they live in requirements-model.txt.

	RECALL_STUB=1 python3 -m pytest test_image_pdf.py
	"""
	import io
	import os
	import tempfile

	import pytest

	# Default to stub mode unless the caller already set RECALL_STUB. This runs
	# before content_pipeline is imported below, which is why the later imports
	# carry E402 suppressions.
	os.environ.setdefault("RECALL_STUB", "1")

	# importorskip skips this whole module when PyMuPDF or Pillow is missing
	# (per the module docstring they live in requirements-model.txt). PIL_Image
	# is not referenced again in the code visible here; the call is kept for its
	# skip side effect.
	fitz = pytest.importorskip("fitz", reason="needs PyMuPDF (real-model dep)")
	PIL_Image = pytest.importorskip("PIL.Image", reason="needs Pillow (real-model dep)")
	from PIL import Image, ImageDraw # noqa: E402

	import content_pipeline as cp # noqa: E402


	def _make_scanned_pdf(pages: int = 2) -> str:
	    """Create a temporary PDF with no text layer and return its path.

	    Each page is a 600x800 white raster with a "Scanned page N" caption drawn
	    into the pixels and inserted as a full-page image, so the caption exists
	    only as image data. The caller owns the returned file and must delete it.

	    Args:
	        pages: Number of image pages to generate.

	    Returns:
	        Filesystem path of the saved PDF.

	    Raises:
	        Whatever PyMuPDF or Pillow raise while building or saving the
	        document; the temporary file is removed before the error propagates.
	    """
	    width, height = 600, 800
	    fd, path = tempfile.mkstemp(suffix=".pdf")
	    # Only the reserved path is needed; PyMuPDF writes the file itself.
	    os.close(fd)
	    try:
	        doc = fitz.open()
	        try:
	            for i in range(pages):
	                img = Image.new("RGB", (width, height), "white")
	                ImageDraw.Draw(img).text(
	                    (40, 40), f"Scanned page {i + 1}", fill="black"
	                )
	                buf = io.BytesIO()
	                img.save(buf, format="PNG")
	                page = doc.new_page(width=width, height=height)
	                page.insert_image(
	                    fitz.Rect(0, 0, width, height), stream=buf.getvalue()
	                )
	            doc.save(path)
	        finally:
	            # Release the document handle even when building or saving fails.
	            doc.close()
	    except BaseException:
	        # No path is returned on failure, so nobody else could clean this up.
	        os.unlink(path)
	        raise
	    return path


	def _make_text_pdf() -> str:
	    """Create a temporary one-page PDF containing inserted text; return its path.

	    The page gets the string "Selectable text about photosynthesis." via
	    PyMuPDF's ``insert_text``. The caller owns the returned file and must
	    delete it.

	    Returns:
	        Filesystem path of the saved PDF.

	    Raises:
	        Whatever PyMuPDF raises while building or saving the document; the
	        temporary file is removed before the error propagates.
	    """
	    fd, path = tempfile.mkstemp(suffix=".pdf")
	    # Only the reserved path is needed; PyMuPDF writes the file itself.
	    os.close(fd)
	    try:
	        doc = fitz.open()
	        try:
	            doc.new_page().insert_text(
	                (72, 72), "Selectable text about photosynthesis."
	            )
	            doc.save(path)
	        finally:
	            # Release the document handle even when building or saving fails.
	            doc.close()
	    except BaseException:
	        # No path is returned on failure, so nobody else could clean this up.
	        os.unlink(path)
	        raise
	    return path


	def test_is_image_only_pdf_detects_scanned():
	    """A PDF built purely from page images is reported as image-only."""
	    scanned_pdf = _make_scanned_pdf()
	    try:
	        verdict = cp.is_image_only_pdf(scanned_pdf)
	        assert verdict is True
	    finally:
	        # The helper leaves the temp file behind; remove it pass or fail.
	        os.unlink(scanned_pdf)


	def test_text_pdf_is_not_image_only():
	    """A PDF carrying inserted text must not be classified as image-only."""
	    text_pdf = _make_text_pdf()
	    try:
	        verdict = cp.is_image_only_pdf(text_pdf)
	        assert verdict is False
	    finally:
	        # The helper leaves the temp file behind; remove it pass or fail.
	        os.unlink(text_pdf)


	def test_non_pdf_is_not_image_only_and_renders_nothing():
	    """A non-PDF path is neither image-only nor renderable to page images."""
	    non_pdf = "notes.txt"

	    detected = cp.is_image_only_pdf(non_pdf)
	    assert detected is False

	    rendered = cp.render_pdf_images(non_pdf)
	    assert rendered == []


	def test_render_pdf_images_returns_pages():
	    """Rendering a three-page scanned PDF yields three PIL images."""
	    pdf_path = _make_scanned_pdf(pages=3)
	    try:
	        rendered = cp.render_pdf_images(pdf_path)
	        assert len(rendered) == 3
	        for page_image in rendered:
	            assert isinstance(page_image, Image.Image)
	    finally:
	        # The helper leaves the temp file behind; remove it pass or fail.
	        os.unlink(pdf_path)


	def test_render_pdf_images_respects_max_pages():
	    """With max_pages=2, a five-page PDF renders to exactly two images."""
	    five_page_pdf = _make_scanned_pdf(pages=5)
	    try:
	        limited = cp.render_pdf_images(five_page_pdf, max_pages=2)
	        assert len(limited) == 2
	    finally:
	        # The helper leaves the temp file behind; remove it pass or fail.
	        os.unlink(five_page_pdf)


	def test_generate_deck_from_images_stub():
	    """In stub mode, deck generation returns a non-empty list of Q/A cards."""
	    import llm

	    # STUB returns the canned demo deck regardless of the images passed, so
	    # placeholder objects stand in for real page images. llm.STUB is pinned
	    # explicitly and restored afterwards so the test does not depend on
	    # ambient state left by other test files' import order or STUB mutations.
	    saved_stub = llm.STUB
	    llm.STUB = True
	    try:
	        placeholders = [object(), object()]
	        deck = cp.generate_deck_from_images(placeholders)
	    finally:
	        llm.STUB = saved_stub

	    assert isinstance(deck, list)
	    assert len(deck) >= 1
	    for card in deck:
	        assert card.get("question") and card.get("answer")


	if __name__ == "__main__":
	    # Plain-script entry point: run every test in definition order; the first
	    # failing assertion aborts the run before the success line is printed.
	    for run_test in (
	        test_is_image_only_pdf_detects_scanned,
	        test_text_pdf_is_not_image_only,
	        test_non_pdf_is_not_image_only_and_renders_nothing,
	        test_render_pdf_images_returns_pages,
	        test_render_pdf_images_respects_max_pages,
	        test_generate_deck_from_images_stub,
	    ):
	        run_test()
	    print("All image-PDF tests passed.")