# zeroshotGPU/tests/test_merge.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
import unittest
from zsgdp.merge.dedupe import dedupe_elements, dedupe_tables
from zsgdp.schema import Element, TableObject
class MergeDedupeTests(unittest.TestCase):
    """Exercise ``dedupe_elements`` and ``dedupe_tables`` on duplicate parser output.

    Each case passes two records describing the same page-1 content, one
    tagged ``source_parser="docling"`` and one tagged
    ``source_parser="pymupdf"``, and asserts that exactly one merged record
    comes back with the expected fields.
    """

    # Well-formed three-column grid; used as the pymupdf table fixture.
    _CLEAN_GRID = (
        "| Region | Q1 | Q2 |\n"
        "| --- | --- | --- |\n"
        "| North America | 10 | 12 |\n"
        "| Europe | 8 | 7 |"
    )

    # The same cells flattened into a single seven-cell header row; used as
    # the docling table fixture in the two "keeps better grid" cases.
    _COLLAPSED_GRID = (
        "| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n"
        "| --- | --- | --- | --- | --- | --- | --- |"
    )

    # Bounding box carried by the pymupdf table fixtures in those two cases.
    _TABLE_BBOX = (72.0, 144.0, 237.0, 186.0)

    def _sole_survivor(self, merged):
        """Assert that ``merged`` holds exactly one record and return it."""
        self.assertEqual(len(merged), 1)
        return merged[0]

    def test_merges_docling_heading_with_pymupdf_bbox(self):
        """A docling heading and a pymupdf paragraph with the same words merge.

        Only the pymupdf record is given a bbox. The survivor must report
        docling as its parser, carry that bbox, and name pymupdf under the
        ``bbox_source_parser`` provenance key.
        """
        docling_heading = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="heading",
            text="## Revenue Summary",
            markdown="## Revenue Summary",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        pymupdf_paragraph = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Revenue Summary",
            bbox=(72.0, 100.0, 200.0, 124.0),
            reading_order=1,
            confidence=0.86,
            source_parser="pymupdf",
        )

        survivor = self._sole_survivor(
            dedupe_elements([docling_heading, pymupdf_paragraph])
        )

        self.assertEqual(survivor.source_parser, "docling")
        self.assertEqual(survivor.bbox, (72.0, 100.0, 200.0, 124.0))
        self.assertEqual(survivor.provenance["bbox_source_parser"], "pymupdf")

    def test_drops_paragraph_duplicate_of_structured_table(self):
        """A paragraph holding a table's flattened cell text loses to the table.

        The paragraph fixture has the higher confidence (0.88 vs 0.72), yet
        the single surviving element must be the one typed ``"table"``.
        """
        flattened_paragraph = Element(
            element_id="docling_p1_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Region Q1 Q2 North America 10 12 Europe 8 7",
            reading_order=1,
            confidence=0.88,
            source_parser="docling",
        )
        structured_table = Element(
            element_id="pymupdf_p1_e1",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=self._CLEAN_GRID,
            reading_order=1,
            confidence=0.72,
            source_parser="pymupdf",
        )

        survivor = self._sole_survivor(
            dedupe_elements([flattened_paragraph, structured_table])
        )

        self.assertEqual(survivor.type, "table")

    def test_merges_duplicate_table_elements_and_keeps_better_grid(self):
        """Two table elements for the same table collapse into one.

        The survivor must report pymupdf as its parser, contain the
        well-formed grid row, and carry the pymupdf bbox, while its
        confidence equals the 0.88 supplied on the docling record.
        """
        collapsed_table = Element(
            element_id="docling_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=self._COLLAPSED_GRID,
            reading_order=3,
            confidence=0.88,
            source_parser="docling",
        )
        clean_table = Element(
            element_id="pymupdf_p1_e3",
            doc_id="d1",
            page_num=1,
            type="table",
            bbox=self._TABLE_BBOX,
            markdown=self._CLEAN_GRID,
            reading_order=3,
            confidence=0.72,
            source_parser="pymupdf",
        )

        survivor = self._sole_survivor(
            dedupe_elements([collapsed_table, clean_table])
        )

        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.88)
        # ``markdown`` may be None on an element, so fall back to "" to get a
        # clean assertion failure instead of a TypeError from assertIn.
        self.assertIn("| North America | 10 | 12 |", survivor.markdown or "")
        self.assertEqual(survivor.bbox, self._TABLE_BBOX)

    def test_merges_duplicate_tables_and_keeps_better_grid_assets(self):
        """Two ``TableObject`` records for the same table collapse into one.

        The survivor must report pymupdf as its parser and keep the pymupdf
        bbox list and ``crop_path`` provenance, while its confidence equals
        the 0.84 supplied on the docling record. Its ``source_parsers``
        provenance must list pymupdf first, then docling.
        """
        collapsed_table = TableObject(
            table_id="docling_t1",
            page_nums=[1],
            markdown=self._COLLAPSED_GRID,
            confidence=0.84,
            source_parser="docling",
        )
        clean_table = TableObject(
            table_id="pymupdf_t1",
            page_nums=[1],
            bbox=[self._TABLE_BBOX],
            markdown=self._CLEAN_GRID,
            confidence=0.72,
            source_parser="pymupdf",
            provenance={"crop_path": "/tmp/table.png"},
        )

        survivor = self._sole_survivor(
            dedupe_tables([collapsed_table, clean_table])
        )

        self.assertEqual(survivor.source_parser, "pymupdf")
        self.assertEqual(survivor.confidence, 0.84)
        self.assertEqual(survivor.bbox, [self._TABLE_BBOX])
        self.assertEqual(survivor.provenance["crop_path"], "/tmp/table.png")
        self.assertEqual(
            survivor.provenance["source_parsers"], ["pymupdf", "docling"]
        )
if __name__ == "__main__":
    # Run the test cases in this module when the file is executed directly;
    # importing the module does not start the runner.
    unittest.main()