Spaces:
Running on Zero
Running on Zero
| """Tests for table structure similarity, formula CER, and OmniDocBench adapters.""" | |
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader | |
| from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS | |
| from zsgdp.benchmarks.ground_truth import ( | |
| omnidocbench_formula_truths, | |
| omnidocbench_table_truths, | |
| parsed_formula_records, | |
| parsed_table_records, | |
| ) | |
| from zsgdp.benchmarks.parser_quality import run_parser_benchmark | |
| from zsgdp.schema import Element, FigureObject, ParsedDocument, QualityReport, TableObject | |
| from zsgdp.verify.formula_extraction import compute_formula_extraction | |
| from zsgdp.verify.table_structure import compute_table_structure_score, html_to_rows, markdown_to_rows | |
class TestMarkdownAndHTMLRows(unittest.TestCase):
    """Row extraction from Markdown and HTML table sources."""

    def test_markdown_strips_separator_row(self):
        # The `| --- | --- |` line must not appear in the result, and the
        # expected cells are the lower-cased header and data values.
        source = "| A | B |\n| --- | --- |\n| 1 | 2 |\n"
        extracted = markdown_to_rows(source)
        self.assertEqual(extracted, [["a", "b"], ["1", "2"]])

    def test_html_handles_th_and_td(self):
        # One header cell (<th>) and one data cell (<td>), each in its own row.
        markup = "<table><tr><th>Col</th></tr><tr><td>val</td></tr></table>"
        extracted = html_to_rows(markup)
        self.assertEqual(extracted, [["col"], ["val"]])
class TestComputeTableStructure(unittest.TestCase):
    """Expectations for `compute_table_structure_score` on prediction/truth table lists."""

    # Markdown fixtures reused by several cases below.
    _TWO_COLUMN = "| A | B |\n| --- | --- |\n| 1 | 2 |"
    _ONE_COLUMN = "| A |\n| --- |\n| 1 |"

    @staticmethod
    def _table(markdown, page_num):
        # Build one table record in the shape these tests pass to the scorer.
        return {"markdown": markdown, "page_num": page_num}

    def test_perfect_match(self):
        reference = self._table(self._TWO_COLUMN, 1)
        predicted = self._table(self._TWO_COLUMN, 1)
        scores = compute_table_structure_score([predicted], [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        # Identical tables: every aggregate is expected to be exactly 1.0.
        for metric in ("mean_table_score", "mean_cell_content_f1", "table_match_rate"):
            self.assertEqual(scores[metric], 1.0)

    def test_partial_overlap_scores_between_zero_and_one(self):
        reference = self._table(self._TWO_COLUMN, 1)
        # Same layout as the reference, but the last cell holds 3 instead of 2.
        predicted = self._table("| A | B |\n| --- | --- |\n| 1 | 3 |", 1)
        scores = compute_table_structure_score([predicted], [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        mean_score = scores["mean_table_score"]
        self.assertGreater(mean_score, 0.0)
        self.assertLess(mean_score, 1.0)

    def test_extra_prediction_lowers_match_rate(self):
        reference = self._table(self._ONE_COLUMN, 1)
        # Second prediction sits on page 2, where there is no truth table.
        candidates = [
            self._table(self._ONE_COLUMN, 1),
            self._table("| Z |\n| --- |\n| 9 |", 2),
        ]
        scores = compute_table_structure_score(candidates, [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        self.assertEqual(scores["table_match_rate"], 0.5)
        self.assertEqual(scores["table_count_delta"], 1)

    def test_no_matching_page_yields_no_pair(self):
        # Same table content, but truth is on page 1 and the prediction on page 2.
        reference = self._table(self._ONE_COLUMN, 1)
        predicted = self._table(self._ONE_COLUMN, 2)
        scores = compute_table_structure_score([predicted], [reference])
        self.assertEqual(scores["matched_pair_count"], 0)

    def test_empty_inputs_are_vacuous(self):
        # No predictions and no truths is expected to score as a perfect result.
        scores = compute_table_structure_score([], [])
        for metric in ("mean_table_score", "table_match_rate"):
            self.assertEqual(scores[metric], 1.0)
class TestComputeFormulaExtraction(unittest.TestCase):
    """Expectations for `compute_formula_extraction` (CER, accuracy, exact-match rate)."""

    @staticmethod
    def _formula(latex, page_num=1):
        # Build one formula record in the shape these tests pass to the scorer.
        return {"latex": latex, "page_num": page_num}

    def test_exact_match_yields_zero_cer(self):
        predicted = [self._formula("E = mc^2")]
        reference = [self._formula("E = mc^2")]
        outcome = compute_formula_extraction(predicted, reference)
        self.assertEqual(outcome["mean_cer"], 0.0)
        self.assertEqual(outcome["mean_accuracy"], 1.0)
        self.assertEqual(outcome["exact_match_rate"], 1.0)

    def test_one_char_off_yields_proportional_cer(self):
        predicted = [self._formula("E = mc^3")]
        reference = [self._formula("E = mc^2")]
        outcome = compute_formula_extraction(predicted, reference)
        # One substituted character against an 8-character reference string.
        expected_cer = 1 / 8
        self.assertAlmostEqual(outcome["mean_cer"], expected_cer, places=6)
        self.assertEqual(outcome["exact_match_rate"], 0.0)

    def test_empty_inputs_are_vacuous(self):
        # Nothing to compare on either side: expected to count as error-free.
        outcome = compute_formula_extraction([], [])
        self.assertEqual(outcome["mean_cer"], 0.0)
        self.assertEqual(outcome["mean_accuracy"], 1.0)

    def test_one_side_empty_yields_full_error(self):
        # A truth formula with no prediction at all is expected to be a total miss.
        reference = [self._formula("x")]
        outcome = compute_formula_extraction([], reference)
        self.assertEqual(outcome["mean_cer"], 1.0)
        self.assertEqual(outcome["mean_accuracy"], 0.0)

    def test_dollar_delimiters_stripped(self):
        # The prediction is wrapped in `$$ … $$`; it should still match exactly.
        predicted = [self._formula("$$E = mc^2$$")]
        reference = [self._formula("E = mc^2")]
        outcome = compute_formula_extraction(predicted, reference)
        self.assertEqual(outcome["exact_match_rate"], 1.0)

    def test_greedy_matching_picks_lowest_cer_pair(self):
        # Predictions and truths list the same two formulas in opposite order,
        # so positional pairing would not give two exact matches.
        energy, force = "E = mc^2", "F = ma"
        predicted = [self._formula(energy), self._formula(force)]
        reference = [self._formula(force), self._formula(energy)]
        outcome = compute_formula_extraction(predicted, reference)
        self.assertEqual(outcome["matched_pair_count"], 2)
        self.assertEqual(outcome["exact_match_rate"], 1.0)
class TestOmniDocBenchAdapters(unittest.TestCase):
    """Adapters that pull table and formula truths out of a `layout_dets` ground truth."""

    def test_table_truths_extract_markdown_and_page(self):
        # Two table entries (one lower-case `table` with markdown, one
        # capitalised `Table` with html) surround a `Title` entry; exactly the
        # two tables are expected back, in input order.
        markdown_entry = {"category": "table", "markdown": "| A |\n| --- |\n| 1 |", "page_num": 1}
        title_entry = {"category": "Title", "text": "ignore", "page_num": 1}
        html_entry = {"category": "Table", "html": "<table><tr><td>x</td></tr></table>", "page_num": 2}
        payload = {"layout_dets": [markdown_entry, title_entry, html_entry]}

        extracted = omnidocbench_table_truths(payload)

        self.assertEqual(len(extracted), 2)
        for position, expected_page in enumerate((1, 2)):
            self.assertEqual(extracted[position]["page_num"], expected_page)

    def test_formula_truths_extract_latex(self):
        # The `Equation` entry carries its content under `text` rather than
        # `latex`; it is still expected to surface under the `latex` key.
        latex_entry = {"category": "formula", "latex": "E = mc^2", "page_num": 1}
        text_entry = {"category": "Equation", "text": "F = ma", "page_num": 2}
        title_entry = {"category": "Title", "text": "ignore", "page_num": 1}
        payload = {"layout_dets": [latex_entry, text_entry, title_entry]}

        extracted = omnidocbench_formula_truths(payload)

        self.assertEqual(len(extracted), 2)
        for position, expected_latex in enumerate(("E = mc^2", "F = ma")):
            self.assertEqual(extracted[position]["latex"], expected_latex)

    def test_unknown_shape_returns_empty(self):
        # Ground truth without `layout_dets` (or entirely empty) yields no truths.
        unrecognised = {"weird": True}
        self.assertEqual(omnidocbench_table_truths(unrecognised), [])
        self.assertEqual(omnidocbench_formula_truths({}), [])
class TestParsedRecords(unittest.TestCase):
    """Conversion of a `ParsedDocument` into table / formula records for scoring."""

    def test_parsed_table_records_dedupes_object_and_element(self):
        table_markdown = "| A |\n| --- |\n| 1 |"
        # The same table appears twice under id "t1": once as a page element
        # and once as a TableObject.
        table_element = Element(
            element_id="t1",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=table_markdown,
        )
        table_object = TableObject(
            table_id="t1",
            page_nums=[1],
            markdown=table_markdown,
        )
        document = ParsedDocument(
            doc_id="d1",
            source_path="/tmp/d1.pdf",
            file_type="pdf",
            elements=[table_element],
            tables=[table_object],
            quality_report=QualityReport(),
        )

        extracted = parsed_table_records(document)

        # Only "at least one record, one of them with table_id t1" is required,
        # so the check passes whether or not the two sources are collapsed.
        # NOTE(review): the test name says "dedupes" while an earlier comment
        # claimed both sources stay as distinct records -- confirm which one
        # parsed_table_records actually does and tighten the assertion.
        self.assertGreaterEqual(len(extracted), 1)
        matching_ids = [entry["table_id"] == "t1" for entry in extracted]
        self.assertTrue(any(matching_ids))

    def test_parsed_formula_records_extract_latex(self):
        # (element_id, page_num, type, text): one real formula, one paragraph,
        # and one formula whose text is empty. Only the first is expected back.
        element_specs = (
            ("f1", 1, "formula", "E = mc^2"),
            ("p1", 1, "paragraph", "not a formula"),
            ("f2", 2, "formula", ""),
        )
        document = ParsedDocument(
            doc_id="d1",
            source_path="/tmp/d1.pdf",
            file_type="pdf",
            elements=[
                Element(element_id=element_id, doc_id="d1", page_num=page, type=kind, text=text)
                for element_id, page, kind, text in element_specs
            ],
            quality_report=QualityReport(),
        )

        extracted = parsed_formula_records(document)

        self.assertEqual(len(extracted), 1)
        only_record = extracted[0]
        self.assertEqual(only_record["formula_id"], "f1")
        self.assertEqual(only_record["latex"], "E = mc^2")
class TestBenchmarkIntegration(unittest.TestCase):
    """End-to-end check that an `omnidocbench` benchmark run emits table and formula metrics."""

    def test_omnidocbench_smoke_run_emits_metrics(self):
        # Use a markdown source with a one-shot loader that tags the document
        # as `omnidocbench`. Lets us exercise the full benchmark wiring (table +
        # formula adapters, CSVs) without needing PyMuPDF to parse bytes.
        ground_truth = {
            "layout_dets": [
                {
                    "category": "table",
                    "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |",
                    "page_num": 1,
                },
                {"category": "formula", "latex": "E = mc^2", "page_num": 1},
            ]
        }
        with tempfile.TemporaryDirectory() as tmp:
            tmp_dir = Path(tmp)
            src = tmp_dir / "in"
            src.mkdir()
            out = tmp_dir / "out"
            md_path = src / "doc.md"
            md_path.write_text("# Doc\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n", encoding="utf-8")

            def fake_loader(root: Path):
                # `root` is ignored: the single document always points at md_path.
                yield DatasetDocument(
                    dataset_id="omnidocbench",
                    doc_id="doc",
                    path=md_path,
                    ground_truth=ground_truth,
                    metadata={},
                )

            # Capture whatever is registered right now so cleanup restores
            # exactly that, instead of importing a private loader by name.
            previous_loader = _DATASET_LOADERS.get("omnidocbench")
            register_dataset_loader("omnidocbench", fake_loader)
            try:
                summary = run_parser_benchmark(src, out, dataset_name="omnidocbench")
            finally:
                # Keep this block free of imports or anything else that can
                # raise: an exception here would replace the real failure and
                # leave the fake loader installed for later tests.
                if previous_loader is None:
                    _DATASET_LOADERS.pop("omnidocbench", None)
                else:
                    _DATASET_LOADERS["omnidocbench"] = previous_loader

            self.assertEqual(summary["dataset_name"], "omnidocbench")
            doc = summary["documents"][0]
            self.assertTrue(doc["table_structure_evaluated"])
            self.assertTrue(doc["formula_evaluated"])
            # Checked inside the `with` block: the directory is removed on exit.
            self.assertTrue((out / "table_structure_runs.csv").exists())
            self.assertTrue((out / "formula_runs.csv").exists())
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()