"""Integration test for parsing a PDF through the ``pymupdf`` parser."""

import importlib.util
import tempfile
import unittest
from pathlib import Path

from zsgdp.pipeline import parse_document


@unittest.skipIf(importlib.util.find_spec("fitz") is None, "PyMuPDF is not installed")
class PDFIntegrationTests(unittest.TestCase):
    """End-to-end checks that run only when PyMuPDF (``fitz``) is importable."""

    def test_pymupdf_parse_exports_page_table_and_figure_assets(self):
        """Parse a generated one-page PDF and verify exported assets and metrics.

        The test writes a synthetic PDF into a temporary directory, runs
        ``parse_document`` on it with only the ``"pymupdf"`` parser selected,
        and asserts that:

        * the parsed file type is ``"pdf"``;
        * exactly one table and at least one figure are reported;
        * page, table and figure PNG assets exist under ``<out>/assets``;
        * the table and figure chunk-coverage metrics both equal ``1.0``.
        """
        # Imported lazily so that merely importing this module does not
        # require PyMuPDF; the class-level skipIf guards execution.
        import fitz  # type: ignore

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            pdf_path = tmp_path / "sample.pdf"
            output_dir = tmp_path / "out"

            doc = fitz.open()
            try:
                page = doc.new_page(width=612, height=792)
                page.insert_text((72, 72), "Annual Report", fontsize=20)
                page.insert_text((72, 120), "Revenue Summary", fontsize=14)
                # Three rows of text in the "cour" font; presumably this is
                # what the parser is expected to recognise as the table.
                page.insert_text(
                    (72, 155),
                    "Region Q1 Q2\nNorth America 10 12\nEurope 8 7",
                    fontsize=11,
                    fontname="cour",
                )
                # A bare rectangle; presumably the source of the figure the
                # assertions below expect -- confirm against the parser.
                page.draw_rect(fitz.Rect(72, 265, 260, 360))
                doc.save(pdf_path)
            finally:
                # Release the document even if building or saving it fails,
                # so no handle outlives the temporary directory.
                doc.close()

            parsed = parse_document(pdf_path, output_dir, selected_parsers=["pymupdf"])

            self.assertEqual(parsed.file_type, "pdf")
            self.assertEqual(len(parsed.tables), 1)
            self.assertGreaterEqual(len(parsed.figures), 1)

            assets_dir = output_dir / "assets"
            self.assertTrue((assets_dir / "pages" / "page_0001.png").exists())
            self.assertTrue((assets_dir / "tables" / "p0001_t001.png").exists())
            self.assertTrue(any((assets_dir / "figures").glob("p0001_f*.png")))

            metrics = parsed.quality_report.metrics
            self.assertEqual(metrics["table_chunk_coverage"], 1.0)
            self.assertEqual(metrics["figure_chunk_coverage"], 1.0)


if __name__ == "__main__":
    unittest.main()