"""Tests for the element and table de-duplication helpers in ``zsgdp.merge.dedupe``."""

import unittest

from zsgdp.merge.dedupe import dedupe_elements, dedupe_tables
from zsgdp.schema import Element, TableObject

# Well-formed grid: one header row, a separator row, and two data rows.
_CLEAN_GRID_MD = (
    "| Region | Q1 | Q2 |\n"
    "| --- | --- | --- |\n"
    "| North America | 10 | 12 |\n"
    "| Europe | 8 | 7 |"
)

# The same cells flattened into a single seven-column header row.
_COLLAPSED_GRID_MD = (
    "| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n"
    "| --- | --- | --- | --- | --- | --- | --- |"
)

_TABLE_BBOX = (72.0, 144.0, 237.0, 186.0)


class MergeDedupeTests(unittest.TestCase):
    """Checks how duplicate docling / pymupdf outputs are merged."""

    def test_merges_docling_heading_with_pymupdf_bbox(self):
        """The docling heading survives and picks up the pymupdf bbox."""
        heading_bbox = (72.0, 100.0, 200.0, 124.0)
        docling_heading = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="heading",
            text="## Revenue Summary",
            markdown="## Revenue Summary",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        pymupdf_paragraph = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Revenue Summary",
            bbox=(72.0, 100.0, 200.0, 124.0),
            reading_order=1,
            confidence=0.86,
            source_parser="pymupdf",
        )

        result = dedupe_elements([docling_heading, pymupdf_paragraph])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "docling")
        self.assertEqual(survivor.bbox, heading_bbox)
        self.assertEqual(survivor.provenance["bbox_source_parser"], "pymupdf")

    def test_drops_paragraph_duplicate_of_structured_table(self):
        """A paragraph repeating a table's cell text is dropped; the table stays."""
        flat_paragraph = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Region Q1 Q2 North America 10 12 Europe 8 7",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        structured_table = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=_CLEAN_GRID_MD,
            reading_order=1,
            confidence=0.72,
            source_parser="pymupdf",
        )

        result = dedupe_elements([flat_paragraph, structured_table])

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].type, "table")

    def test_merges_duplicate_table_elements_and_keeps_better_grid(self):
        """The pymupdf grid and bbox are kept; confidence is the docling 0.88."""
        collapsed_table = Element(
            element_id="docling_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=_COLLAPSED_GRID_MD,
            reading_order=3,
            confidence=0.88,
            source_parser="docling",
        )
        gridded_table = Element(
            element_id="pymupdf_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            bbox=_TABLE_BBOX,
            markdown=_CLEAN_GRID_MD,
            reading_order=3,
            confidence=0.72,
            source_parser="pymupdf",
        )

        result = dedupe_elements([collapsed_table, gridded_table])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.88)
        # markdown may be None on an Element, so fall back to "" for assertIn.
        self.assertIn("| North America | 10 | 12 |", survivor.markdown or "")
        self.assertEqual(survivor.bbox, _TABLE_BBOX)

    def test_merges_duplicate_tables_and_keeps_better_grid_assets(self):
        """The pymupdf table keeps its bbox and crop path; both parsers are recorded."""
        collapsed_table = TableObject(
            table_id="docling_t1",
            page_nums=[1],
            markdown=_COLLAPSED_GRID_MD,
            confidence=0.84,
            source_parser="docling",
        )
        gridded_table = TableObject(
            table_id="pymupdf_t1",
            page_nums=[1],
            bbox=[_TABLE_BBOX],
            markdown=_CLEAN_GRID_MD,
            confidence=0.72,
            source_parser="pymupdf",
            provenance={"crop_path": "/tmp/table.png"},
        )

        result = dedupe_tables([collapsed_table, gridded_table])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.84)
        self.assertEqual(survivor.bbox, [_TABLE_BBOX])
        self.assertEqual(survivor.provenance["crop_path"], "/tmp/table.png")
        self.assertEqual(
            survivor.provenance["source_parsers"],
            ["pymupdf", "docling"],
        )


if __name__ == "__main__":
    unittest.main()