Spaces:
Running on Zero
Running on Zero
| """Tests for the per-parser GT-comparison leaderboard rollup.""" | |
| from __future__ import annotations | |
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader | |
| from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS | |
| from zsgdp.benchmarks.parser_quality import _per_parser_gt_leaderboard, run_parser_benchmark | |
class TestPerParserGTLeaderboard(unittest.TestCase):
    """Unit tests for ``_per_parser_gt_leaderboard`` on hand-built benchmark rows."""

    # Template for one per-document row: layout evaluated, tables and formulas
    # not evaluated, every metric and count zeroed. Overrides keep the key
    # position they have in this template.
    _BLANK_ROW = {
        "parser": "",
        "layout_evaluated": True,
        "table_evaluated": False,
        "formula_evaluated": False,
        "layout_class_aware_f1": 0.0,
        "layout_class_agnostic_f1": 0.0,
        "layout_class_aware_precision": 0.0,
        "layout_class_aware_recall": 0.0,
        "layout_prediction_count": 0,
        "table_structure_score": 0.0,
        "table_match_rate": 0.0,
        "table_cell_content_f1": 0.0,
        "formula_cer": 0.0,
        "formula_accuracy": 0.0,
        "formula_exact_match_rate": 0.0,
        "element_count": 0,
        "table_count": 0,
        "figure_count": 0,
    }

    @classmethod
    def _make_row(cls, **overrides):
        """Return a copy of the blank row template with ``overrides`` applied."""
        return {**cls._BLANK_ROW, **overrides}

    def test_aggregates_by_parser(self):
        """Rows sharing a parser name collapse into one leaderboard entry."""
        docling_with_tables = self._make_row(
            parser="docling",
            table_evaluated=True,
            layout_class_aware_f1=0.9,
            layout_class_agnostic_f1=0.95,
            layout_class_aware_precision=0.9,
            layout_class_aware_recall=0.9,
            layout_prediction_count=10,
            table_structure_score=0.8,
            table_match_rate=0.9,
            table_cell_content_f1=0.7,
            element_count=10,
            table_count=2,
            figure_count=1,
        )
        docling_layout_only = self._make_row(
            parser="docling",
            layout_class_aware_f1=0.7,
            layout_class_agnostic_f1=0.75,
            layout_class_aware_precision=0.7,
            layout_class_aware_recall=0.7,
            layout_prediction_count=8,
            element_count=8,
        )
        pymupdf_layout_only = self._make_row(
            parser="pymupdf",
            layout_class_aware_f1=0.5,
            layout_class_agnostic_f1=0.55,
            layout_class_aware_precision=0.5,
            layout_class_aware_recall=0.5,
            layout_prediction_count=6,
            element_count=6,
        )

        leaderboard = _per_parser_gt_leaderboard(
            [docling_with_tables, docling_layout_only, pymupdf_layout_only]
        )
        by_parser = {entry["parser"]: entry for entry in leaderboard}

        # Two docling documents: both layout-evaluated, one table-evaluated,
        # none formula-evaluated.
        docling = by_parser["docling"]
        self.assertEqual(docling["document_count"], 2)
        self.assertEqual(docling["layout_evaluated_count"], 2)
        self.assertEqual(docling["table_evaluated_count"], 1)
        self.assertEqual(docling["formula_evaluated_count"], 0)
        self.assertAlmostEqual(docling["mean_layout_class_aware_f1"], 0.8, places=6)
        # The table mean must ignore the row whose tables were not evaluated.
        self.assertAlmostEqual(docling["mean_table_structure_score"], 0.8, places=6)

        # A single pymupdf document passes its own score straight through.
        pymupdf = by_parser["pymupdf"]
        self.assertEqual(pymupdf["document_count"], 1)
        self.assertAlmostEqual(pymupdf["mean_layout_class_aware_f1"], 0.5, places=6)

    def test_sorted_by_layout_then_table_then_formula_inverse(self):
        """The parser with the higher layout F1 is listed first.

        Only the layout ordering is exercised here; the rows carry no table
        or formula fields.
        """
        rows = [
            {
                "parser": name,
                "layout_evaluated": True,
                "layout_class_aware_f1": score,
                "layout_class_agnostic_f1": score,
            }
            for name, score in (("low", 0.3), ("high", 0.9))
        ]

        leaderboard = _per_parser_gt_leaderboard(rows)

        ranked_parsers = [entry["parser"] for entry in leaderboard]
        self.assertEqual(ranked_parsers, ["high", "low"])

    def test_empty_rows_returns_empty_list(self):
        """No input rows yields an empty leaderboard."""
        leaderboard = _per_parser_gt_leaderboard([])
        self.assertEqual(leaderboard, [])
class TestBenchmarkEmitsLeaderboard(unittest.TestCase):
    """End-to-end check that a benchmark run emits the per-parser leaderboard."""

    def test_per_parser_gt_leaderboard_csv_written(self):
        """A dataset run reports the leaderboard and writes its CSV file.

        A fake loader yielding one Markdown document with a single ground-truth
        layout box is registered under the ``omnidocbench`` name, the benchmark
        is run against it, and both the returned summary and the CSV written
        to the output directory are inspected.
        """
        dataset_name = "omnidocbench"
        ground_truth = {
            "layout_dets": [
                {"category": "title", "bbox": [0, 0, 100, 30], "page_num": 1},
            ]
        }
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp = Path(tmp_dir)
            src = tmp / "in"
            src.mkdir()
            md_path = src / "doc.md"
            md_path.write_text("# Doc\n\nSome text.\n", encoding="utf-8")

            def fake_loader(root: Path):
                # ``root`` is ignored: the single document always points at
                # the Markdown file created above.
                yield DatasetDocument(
                    dataset_id="omnidocbench",
                    doc_id="doc",
                    path=md_path,
                    ground_truth=ground_truth,
                    metadata={},
                )

            # Snapshot whatever is registered right now so the registry can be
            # put back exactly as found, without reaching for a private loader
            # by name. Assumes the registry is a plain dict keyed by dataset
            # name, as the direct item assignment it previously received implies.
            missing = object()
            previous_loader = _DATASET_LOADERS.get(dataset_name, missing)
            register_dataset_loader(dataset_name, fake_loader)
            try:
                summary = run_parser_benchmark(src, tmp / "out", dataset_name=dataset_name)
            finally:
                if previous_loader is missing:
                    _DATASET_LOADERS.pop(dataset_name, None)
                else:
                    _DATASET_LOADERS[dataset_name] = previous_loader

            self.assertIn("per_parser_gt_leaderboard", summary)
            # The CSV lives inside the temporary directory, so it has to be
            # checked before the ``with`` block removes it.
            csv_path = tmp / "out" / "per_parser_gt_leaderboard.csv"
            self.assertTrue(csv_path.exists())
            content = csv_path.read_text(encoding="utf-8")
            header = content.splitlines()[0]
            self.assertIn("parser", header)
            self.assertIn("mean_layout_class_aware_f1", header)
            self.assertIn("layout_evaluated_count", header)
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()