Spaces:
Running on Zero
Running on Zero
| import importlib.util | |
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.pipeline import parse_document | |
@unittest.skipUnless(
    importlib.util.find_spec("fitz") is not None,
    "PyMuPDF (fitz) is not installed",
)
class PDFIntegrationTests(unittest.TestCase):
    """Integration test running ``parse_document`` on a PDF built with PyMuPDF.

    The class is skipped when the ``fitz`` module cannot be located, so a
    missing optional dependency shows up as a skip instead of an
    ``ImportError`` raised from inside the test body.
    """

    @staticmethod
    def _write_sample_pdf(pdf_path):
        """Create a one-page PDF fixture at ``pdf_path``.

        The page holds two heading lines, a three-row block of Courier text
        and one drawn rectangle.
        NOTE(review): the text block and the rectangle are presumably what the
        pipeline reports as the table and the figure -- confirm against the
        ``pymupdf`` parser.
        """
        # Imported lazily so merely importing this module never needs PyMuPDF.
        import fitz  # type: ignore

        doc = fitz.open()
        try:
            page = doc.new_page(width=612, height=792)
            page.insert_text((72, 72), "Annual Report", fontsize=20)
            page.insert_text((72, 120), "Revenue Summary", fontsize=14)
            page.insert_text(
                (72, 155),
                "Region Q1 Q2\nNorth America 10 12\nEurope 8 7",
                fontsize=11,
                fontname="cour",
            )
            page.draw_rect(fitz.Rect(72, 265, 260, 360))
            doc.save(pdf_path)
        finally:
            # Release the document even if building or saving the page fails.
            doc.close()

    def test_pymupdf_parse_exports_page_table_and_figure_assets(self):
        """Parse the fixture and check parsed counts, exported PNGs and coverage."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            pdf_path = tmp_path / "sample.pdf"
            output_dir = tmp_path / "out"
            self._write_sample_pdf(pdf_path)

            parsed = parse_document(pdf_path, output_dir, selected_parsers=["pymupdf"])

            self.assertEqual(parsed.file_type, "pdf")
            self.assertEqual(len(parsed.tables), 1)
            self.assertGreaterEqual(len(parsed.figures), 1)

            # Exported image assets are checked inside the temporary directory,
            # i.e. before it is cleaned up.
            assets_dir = output_dir / "assets"
            self.assertTrue((assets_dir / "pages" / "page_0001.png").exists())
            self.assertTrue((assets_dir / "tables" / "p0001_t001.png").exists())
            self.assertTrue(any((assets_dir / "figures").glob("p0001_f*.png")))

            metrics = parsed.quality_report.metrics
            self.assertEqual(metrics["table_chunk_coverage"], 1.0)
            self.assertEqual(metrics["figure_chunk_coverage"], 1.0)
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()