Spaces:
Running on Zero
Running on Zero
| import unittest | |
| from pathlib import Path | |
| from zsgdp.parsers.pymupdf_parser import ( | |
| TextBlock, | |
| _asset_root, | |
| _guess_element_type, | |
| _is_table_text, | |
| _sort_blocks_reading_order, | |
| _table_text_to_markdown, | |
| ) | |
class PyMuPDFParserHelperTests(unittest.TestCase):
    """Unit tests for helper functions imported from ``zsgdp.parsers.pymupdf_parser``."""

    def test_table_text_detection_and_markdown(self):
        """A three-line sample is flagged as table text and rendered as a Markdown table."""
        sample = "Region Q1 Q2\nNorth America 10 12\nEurope 8 7"
        # Header row, separator row, then one Markdown row per data line.
        expected_markdown = (
            "| Region | Q1 | Q2 |\n"
            "| --- | --- | --- |\n"
            "| North America | 10 | 12 |\n"
            "| Europe | 8 | 7 |"
        )

        self.assertTrue(_is_table_text(sample))
        self.assertEqual(_table_text_to_markdown(sample), expected_markdown)

    def test_reading_order_detects_two_columns(self):
        """Six blocks given out of order are returned with all "left" blocks before the "right" ones."""
        # (text, rect) pairs listed out of reading order; the rect is presumably
        # an (x0, y0, x1, y1) bounding box -- confirm against TextBlock.
        fixtures = [
            ("right top", (320, 50, 500, 70)),
            ("left bottom", (50, 200, 230, 220)),
            ("left top", (50, 50, 230, 70)),
            ("right bottom", (320, 200, 500, 220)),
            ("left mid", (50, 120, 230, 140)),
            ("right mid", (320, 120, 500, 140)),
        ]
        # First argument is 1 for every block; the fourth counts up from 1 in list order.
        shuffled = [
            TextBlock(1, label, rect, position)
            for position, (label, rect) in enumerate(fixtures, start=1)
        ]
        expected_texts = [
            "left top",
            "left mid",
            "left bottom",
            "right top",
            "right mid",
            "right bottom",
        ]

        result = _sort_blocks_reading_order(shuffled, 600)

        self.assertEqual([item.text for item in result], expected_texts)

    def test_guess_element_type_for_table_and_title(self):
        """A two-line grid of short tokens is typed "table"; a large-font block is typed "title"."""
        grid_block = TextBlock(1, "A B\n1 2", (50, 100, 300, 160), 1, avg_font_size=10)
        heading_block = TextBlock(
            1,
            "Annual Report",
            (50, 40, 400, 70),
            2,
            max_font_size=18,
            avg_font_size=18,
        )
        # (block, second positional argument, expected element type)
        cases = [
            (grid_block, 2, "table"),
            (heading_block, 1, "title"),
        ]

        for candidate, second_arg, expected_type in cases:
            self.assertEqual(_guess_element_type(candidate, second_arg, 10, 800), expected_type)

    def test_asset_root_uses_runtime_output_dir(self):
        """The asset root is the configured asset directory under the runtime output directory."""
        config = {
            "runtime": {"output_dir": "/tmp/out"},
            "pdf": {"asset_dir": "pdf_assets"},
        }
        expected_root = Path("/tmp/out/pdf_assets")

        self.assertEqual(_asset_root(config), expected_root)
# Run the test suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()