import unittest
from pathlib import Path

from zsgdp.parsers.pymupdf_parser import (
    TextBlock,
    _asset_root,
    _guess_element_type,
    _is_table_text,
    _sort_blocks_reading_order,
    _table_text_to_markdown,
)


class PyMuPDFParserHelperTests(unittest.TestCase):
    """Unit tests for the private helper functions of the PyMuPDF parser."""

    def test_table_text_detection_and_markdown(self):
        # A three-line, whitespace-delimited snippet must be recognised as a
        # table and rendered as a Markdown table with a header separator row.
        raw_table = "Region Q1 Q2\nNorth America 10 12\nEurope 8 7"
        expected_markdown = (
            "| Region | Q1 | Q2 |\n"
            "| --- | --- | --- |\n"
            "| North America | 10 | 12 |\n"
            "| Europe | 8 | 7 |"
        )

        self.assertTrue(_is_table_text(raw_table))
        self.assertEqual(_table_text_to_markdown(raw_table), expected_markdown)

    def test_reading_order_detects_two_columns(self):
        # Blocks are supplied deliberately shuffled; the expected result is the
        # whole left column top-to-bottom followed by the whole right column.
        # NOTE(review): the tuple looks like an (x0, y0, x1, y1) bounding box
        # and 600 looks like the page width -- confirm against the parser.
        shuffled = [
            TextBlock(1, "right top", (320, 50, 500, 70), 1),
            TextBlock(1, "left bottom", (50, 200, 230, 220), 2),
            TextBlock(1, "left top", (50, 50, 230, 70), 3),
            TextBlock(1, "right bottom", (320, 200, 500, 220), 4),
            TextBlock(1, "left mid", (50, 120, 230, 140), 5),
            TextBlock(1, "right mid", (320, 120, 500, 140), 6),
        ]
        expected_texts = [
            "left top",
            "left mid",
            "left bottom",
            "right top",
            "right mid",
            "right bottom",
        ]

        result = _sort_blocks_reading_order(shuffled, 600)
        actual_texts = [item.text for item in result]

        self.assertEqual(actual_texts, expected_texts)

    def test_guess_element_type_for_table_and_title(self):
        # One block carrying tabular text at body font size, and one short
        # block with a font size well above it.
        # NOTE(review): the meaning of the trailing positional arguments
        # (2/1, 10, 800) is defined by _guess_element_type -- confirm there.
        table_block = TextBlock(
            1,
            "A B\n1 2",
            (50, 100, 300, 160),
            1,
            avg_font_size=10,
        )
        title_block = TextBlock(
            1,
            "Annual Report",
            (50, 40, 400, 70),
            2,
            max_font_size=18,
            avg_font_size=18,
        )

        table_kind = _guess_element_type(table_block, 2, 10, 800)
        self.assertEqual(table_kind, "table")

        title_kind = _guess_element_type(title_block, 1, 10, 800)
        self.assertEqual(title_kind, "title")

    def test_asset_root_uses_runtime_output_dir(self):
        # The asset root is expected to be the PDF asset directory nested
        # under the runtime output directory.
        config = {
            "runtime": {"output_dir": "/tmp/out"},
            "pdf": {"asset_dir": "pdf_assets"},
        }

        resolved = _asset_root(config)

        self.assertEqual(resolved, Path("/tmp/out/pdf_assets"))


if __name__ == "__main__":
    unittest.main()