"""Tests for table structure similarity, formula CER, and OmniDocBench adapters.""" from __future__ import annotations import json import tempfile import unittest from pathlib import Path from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS from zsgdp.benchmarks.ground_truth import ( omnidocbench_formula_truths, omnidocbench_table_truths, parsed_formula_records, parsed_table_records, ) from zsgdp.benchmarks.parser_quality import run_parser_benchmark from zsgdp.schema import Element, FigureObject, ParsedDocument, QualityReport, TableObject from zsgdp.verify.formula_extraction import compute_formula_extraction from zsgdp.verify.table_structure import compute_table_structure_score, html_to_rows, markdown_to_rows class TestMarkdownAndHTMLRows(unittest.TestCase): def test_markdown_strips_separator_row(self): rows = markdown_to_rows("| A | B |\n| --- | --- |\n| 1 | 2 |\n") self.assertEqual(rows, [["a", "b"], ["1", "2"]]) def test_html_handles_th_and_td(self): html = "
Col
val
" self.assertEqual(html_to_rows(html), [["col"], ["val"]]) class TestComputeTableStructure(unittest.TestCase): def test_perfect_match(self): truth = {"markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1} prediction = {"markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1} result = compute_table_structure_score([prediction], [truth]) self.assertEqual(result["matched_pair_count"], 1) self.assertEqual(result["mean_table_score"], 1.0) self.assertEqual(result["mean_cell_content_f1"], 1.0) self.assertEqual(result["table_match_rate"], 1.0) def test_partial_overlap_scores_between_zero_and_one(self): truth = {"markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1} prediction = {"markdown": "| A | B |\n| --- | --- |\n| 1 | 3 |", "page_num": 1} result = compute_table_structure_score([prediction], [truth]) self.assertEqual(result["matched_pair_count"], 1) self.assertGreater(result["mean_table_score"], 0.0) self.assertLess(result["mean_table_score"], 1.0) def test_extra_prediction_lowers_match_rate(self): truth = {"markdown": "| A |\n| --- |\n| 1 |", "page_num": 1} predictions = [ {"markdown": "| A |\n| --- |\n| 1 |", "page_num": 1}, {"markdown": "| Z |\n| --- |\n| 9 |", "page_num": 2}, ] result = compute_table_structure_score(predictions, [truth]) self.assertEqual(result["matched_pair_count"], 1) self.assertEqual(result["table_match_rate"], 0.5) self.assertEqual(result["table_count_delta"], 1) def test_no_matching_page_yields_no_pair(self): truth = {"markdown": "| A |\n| --- |\n| 1 |", "page_num": 1} prediction = {"markdown": "| A |\n| --- |\n| 1 |", "page_num": 2} result = compute_table_structure_score([prediction], [truth]) self.assertEqual(result["matched_pair_count"], 0) def test_empty_inputs_are_vacuous(self): result = compute_table_structure_score([], []) self.assertEqual(result["mean_table_score"], 1.0) self.assertEqual(result["table_match_rate"], 1.0) class TestComputeFormulaExtraction(unittest.TestCase): def test_exact_match_yields_zero_cer(self): result = compute_formula_extraction( [{"latex": "E = mc^2", "page_num": 1}], [{"latex": "E = mc^2", "page_num": 1}], ) self.assertEqual(result["mean_cer"], 0.0) self.assertEqual(result["mean_accuracy"], 1.0) self.assertEqual(result["exact_match_rate"], 1.0) def test_one_char_off_yields_proportional_cer(self): result = compute_formula_extraction( [{"latex": "E = mc^3", "page_num": 1}], [{"latex": "E = mc^2", "page_num": 1}], ) # Levenshtein distance 1 over reference length 8 self.assertAlmostEqual(result["mean_cer"], 1 / 8, places=6) self.assertEqual(result["exact_match_rate"], 0.0) def test_empty_inputs_are_vacuous(self): result = compute_formula_extraction([], []) self.assertEqual(result["mean_cer"], 0.0) self.assertEqual(result["mean_accuracy"], 1.0) def test_one_side_empty_yields_full_error(self): result = compute_formula_extraction([], [{"latex": "x", "page_num": 1}]) self.assertEqual(result["mean_cer"], 1.0) self.assertEqual(result["mean_accuracy"], 0.0) def test_dollar_delimiters_stripped(self): result = compute_formula_extraction( [{"latex": "$$E = mc^2$$", "page_num": 1}], [{"latex": "E = mc^2", "page_num": 1}], ) self.assertEqual(result["exact_match_rate"], 1.0) def test_greedy_matching_picks_lowest_cer_pair(self): predictions = [ {"latex": "E = mc^2", "page_num": 1}, {"latex": "F = ma", "page_num": 1}, ] truths = [ {"latex": "F = ma", "page_num": 1}, {"latex": "E = mc^2", "page_num": 1}, ] result = compute_formula_extraction(predictions, truths) self.assertEqual(result["matched_pair_count"], 2) self.assertEqual(result["exact_match_rate"], 1.0) class TestOmniDocBenchAdapters(unittest.TestCase): def test_table_truths_extract_markdown_and_page(self): gt = { "layout_dets": [ {"category": "table", "markdown": "| A |\n| --- |\n| 1 |", "page_num": 1}, {"category": "Title", "text": "ignore", "page_num": 1}, {"category": "Table", "html": "
x
", "page_num": 2}, ] } truths = omnidocbench_table_truths(gt) self.assertEqual(len(truths), 2) self.assertEqual(truths[0]["page_num"], 1) self.assertEqual(truths[1]["page_num"], 2) def test_formula_truths_extract_latex(self): gt = { "layout_dets": [ {"category": "formula", "latex": "E = mc^2", "page_num": 1}, {"category": "Equation", "text": "F = ma", "page_num": 2}, {"category": "Title", "text": "ignore", "page_num": 1}, ] } truths = omnidocbench_formula_truths(gt) self.assertEqual(len(truths), 2) self.assertEqual(truths[0]["latex"], "E = mc^2") self.assertEqual(truths[1]["latex"], "F = ma") def test_unknown_shape_returns_empty(self): self.assertEqual(omnidocbench_table_truths({"weird": True}), []) self.assertEqual(omnidocbench_formula_truths({}), []) class TestParsedRecords(unittest.TestCase): def test_parsed_table_records_dedupes_object_and_element(self): parsed = ParsedDocument( doc_id="d1", source_path="/tmp/d1.pdf", file_type="pdf", elements=[ Element( element_id="t1", doc_id="d1", page_num=1, type="table", markdown="| A |\n| --- |\n| 1 |", ), ], tables=[ TableObject( table_id="t1", page_nums=[1], markdown="| A |\n| --- |\n| 1 |", ), ], quality_report=QualityReport(), ) records = parsed_table_records(parsed) # Both table objects keyed differently, so we get 2 records (table object + element). # The dedupe key is per-source so they stay distinct, which is fine for matching. self.assertGreaterEqual(len(records), 1) self.assertTrue(any(record["table_id"] == "t1" for record in records)) def test_parsed_formula_records_extract_latex(self): parsed = ParsedDocument( doc_id="d1", source_path="/tmp/d1.pdf", file_type="pdf", elements=[ Element(element_id="f1", doc_id="d1", page_num=1, type="formula", text="E = mc^2"), Element(element_id="p1", doc_id="d1", page_num=1, type="paragraph", text="not a formula"), Element(element_id="f2", doc_id="d1", page_num=2, type="formula", text=""), ], quality_report=QualityReport(), ) records = parsed_formula_records(parsed) self.assertEqual(len(records), 1) self.assertEqual(records[0]["formula_id"], "f1") self.assertEqual(records[0]["latex"], "E = mc^2") class TestBenchmarkIntegration(unittest.TestCase): def test_omnidocbench_smoke_run_emits_metrics(self): # Use a markdown source with a one-shot loader that tags the document # as `omnidocbench`. Lets us exercise the full benchmark wiring (table + # formula adapters, CSVs) without needing PyMuPDF to parse bytes. ground_truth = { "layout_dets": [ { "category": "table", "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1, }, {"category": "formula", "latex": "E = mc^2", "page_num": 1}, ] } with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) src = tmp / "in" src.mkdir() md_path = src / "doc.md" md_path.write_text("# Doc\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n", encoding="utf-8") def fake_loader(root: Path): yield DatasetDocument( dataset_id="omnidocbench", doc_id="doc", path=md_path, ground_truth=ground_truth, metadata={}, ) register_dataset_loader("omnidocbench", fake_loader) try: summary = run_parser_benchmark(src, tmp / "out", dataset_name="omnidocbench") finally: from zsgdp.benchmarks.ground_truth import omnidocbench_layout_truths # restore the real loader from zsgdp.benchmarks.datasets import _load_omnidocbench _DATASET_LOADERS["omnidocbench"] = _load_omnidocbench self.assertEqual(summary["dataset_name"], "omnidocbench") doc = summary["documents"][0] self.assertTrue(doc["table_structure_evaluated"]) self.assertTrue(doc["formula_evaluated"]) self.assertTrue((tmp / "out" / "table_structure_runs.csv").exists()) self.assertTrue((tmp / "out" / "formula_runs.csv").exists()) if __name__ == "__main__": unittest.main()