Spaces:
Running on Zero
Running on Zero
| import unittest | |
| from zsgdp.merge.dedupe import dedupe_elements, dedupe_tables | |
| from zsgdp.schema import Element, TableObject | |
class MergeDedupeTests(unittest.TestCase):
    """Check ``dedupe_elements`` / ``dedupe_tables`` on docling + pymupdf duplicates.

    Each scenario feeds two records describing the same page-1 content, one
    per parser, and asserts on the single record left after deduplication.
    """

    # Well-formed three-column grid used by the pymupdf-sourced fixtures.
    _CLEAN_GRID = (
        "| Region | Q1 | Q2 |\n"
        "| --- | --- | --- |\n"
        "| North America | 10 | 12 |\n"
        "| Europe | 8 | 7 |"
    )
    # Same cells squashed into a single seven-column row (docling fixtures).
    _COLLAPSED_GRID = (
        "| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n"
        "| --- | --- | --- | --- | --- | --- | --- |"
    )

    def test_merges_docling_heading_with_pymupdf_bbox(self):
        """The docling heading survives and picks up the pymupdf bounding box."""
        heading_box = (72.0, 100.0, 200.0, 124.0)
        docling_heading = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="heading",
            text="## Revenue Summary",
            markdown="## Revenue Summary",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        # Same text without the markdown prefix, but carrying geometry.
        pymupdf_paragraph = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Revenue Summary",
            bbox=heading_box,
            reading_order=1,
            confidence=0.86,
            source_parser="pymupdf",
        )

        result = dedupe_elements([docling_heading, pymupdf_paragraph])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "docling")
        self.assertEqual(survivor.bbox, heading_box)
        self.assertEqual(survivor.provenance["bbox_source_parser"], "pymupdf")

    def test_drops_paragraph_duplicate_of_structured_table(self):
        """A paragraph that flattens the table's cells is dropped; the table stays."""
        flattened_paragraph = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Region Q1 Q2 North America 10 12 Europe 8 7",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        # Note the table has the *lower* confidence of the two inputs.
        structured_table = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=self._CLEAN_GRID,
            reading_order=1,
            confidence=0.72,
            source_parser="pymupdf",
        )

        result = dedupe_elements([flattened_paragraph, structured_table])

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].type, "table")

    def test_merges_duplicate_table_elements_and_keeps_better_grid(self):
        """Two table elements merge: pymupdf grid and bbox, docling's 0.88 confidence."""
        table_box = (72.0, 144.0, 237.0, 186.0)
        collapsed_table = Element(
            element_id="docling_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=self._COLLAPSED_GRID,
            reading_order=3,
            confidence=0.88,
            source_parser="docling",
        )
        clean_table = Element(
            element_id="pymupdf_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            bbox=table_box,
            markdown=self._CLEAN_GRID,
            reading_order=3,
            confidence=0.72,
            source_parser="pymupdf",
        )

        result = dedupe_elements([collapsed_table, clean_table])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.88)
        self.assertIn("| North America | 10 | 12 |", survivor.markdown or "")
        self.assertEqual(survivor.bbox, table_box)

    def test_merges_duplicate_tables_and_keeps_better_grid_assets(self):
        """Two ``TableObject``s merge, keeping pymupdf's bbox and crop provenance."""
        table_box = (72.0, 144.0, 237.0, 186.0)
        collapsed_table = TableObject(
            table_id="docling_t1",
            page_nums=[1],
            markdown=self._COLLAPSED_GRID,
            confidence=0.84,
            source_parser="docling",
        )
        clean_table = TableObject(
            table_id="pymupdf_t1",
            page_nums=[1],
            bbox=[table_box],
            markdown=self._CLEAN_GRID,
            confidence=0.72,
            source_parser="pymupdf",
            provenance={"crop_path": "/tmp/table.png"},
        )

        result = dedupe_tables([collapsed_table, clean_table])

        self.assertEqual(len(result), 1)
        survivor = result[0]
        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.84)
        # Compare against a fresh list so the check does not share an object
        # with the fixture passed in above.
        self.assertEqual(survivor.bbox, [table_box])
        self.assertEqual(survivor.provenance["crop_path"], "/tmp/table.png")
        self.assertEqual(survivor.provenance["source_parsers"], ["pymupdf", "docling"])
# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()