"""Tests for the per-parser GT-comparison leaderboard rollup.""" from __future__ import annotations import tempfile import unittest from pathlib import Path from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS from zsgdp.benchmarks.parser_quality import _per_parser_gt_leaderboard, run_parser_benchmark class TestPerParserGTLeaderboard(unittest.TestCase): def test_aggregates_by_parser(self): rows = [ { "parser": "docling", "layout_evaluated": True, "table_evaluated": True, "formula_evaluated": False, "layout_class_aware_f1": 0.9, "layout_class_agnostic_f1": 0.95, "layout_class_aware_precision": 0.9, "layout_class_aware_recall": 0.9, "layout_prediction_count": 10, "table_structure_score": 0.8, "table_match_rate": 0.9, "table_cell_content_f1": 0.7, "formula_cer": 0.0, "formula_accuracy": 0.0, "formula_exact_match_rate": 0.0, "element_count": 10, "table_count": 2, "figure_count": 1, }, { "parser": "docling", "layout_evaluated": True, "table_evaluated": False, "formula_evaluated": False, "layout_class_aware_f1": 0.7, "layout_class_agnostic_f1": 0.75, "layout_class_aware_precision": 0.7, "layout_class_aware_recall": 0.7, "layout_prediction_count": 8, "table_structure_score": 0.0, "table_match_rate": 0.0, "table_cell_content_f1": 0.0, "formula_cer": 0.0, "formula_accuracy": 0.0, "formula_exact_match_rate": 0.0, "element_count": 8, "table_count": 0, "figure_count": 0, }, { "parser": "pymupdf", "layout_evaluated": True, "table_evaluated": False, "formula_evaluated": False, "layout_class_aware_f1": 0.5, "layout_class_agnostic_f1": 0.55, "layout_class_aware_precision": 0.5, "layout_class_aware_recall": 0.5, "layout_prediction_count": 6, "table_structure_score": 0.0, "table_match_rate": 0.0, "table_cell_content_f1": 0.0, "formula_cer": 0.0, "formula_accuracy": 0.0, "formula_exact_match_rate": 0.0, "element_count": 6, "table_count": 0, "figure_count": 0, }, ] leaderboard = _per_parser_gt_leaderboard(rows) by_parser = {row["parser"]: row for row in leaderboard} # Docling appears once with 2 documents, 2 layout-evaluated, 1 table-evaluated. self.assertEqual(by_parser["docling"]["document_count"], 2) self.assertEqual(by_parser["docling"]["layout_evaluated_count"], 2) self.assertEqual(by_parser["docling"]["table_evaluated_count"], 1) self.assertEqual(by_parser["docling"]["formula_evaluated_count"], 0) self.assertAlmostEqual(by_parser["docling"]["mean_layout_class_aware_f1"], 0.8, places=6) # Table mean uses only the row that was evaluated. self.assertAlmostEqual(by_parser["docling"]["mean_table_structure_score"], 0.8, places=6) # PyMuPDF appears once. self.assertEqual(by_parser["pymupdf"]["document_count"], 1) self.assertAlmostEqual(by_parser["pymupdf"]["mean_layout_class_aware_f1"], 0.5, places=6) def test_sorted_by_layout_then_table_then_formula_inverse(self): rows = [ {"parser": "low", "layout_evaluated": True, "layout_class_aware_f1": 0.3, "layout_class_agnostic_f1": 0.3}, {"parser": "high", "layout_evaluated": True, "layout_class_aware_f1": 0.9, "layout_class_agnostic_f1": 0.9}, ] leaderboard = _per_parser_gt_leaderboard(rows) self.assertEqual([row["parser"] for row in leaderboard], ["high", "low"]) def test_empty_rows_returns_empty_list(self): self.assertEqual(_per_parser_gt_leaderboard([]), []) class TestBenchmarkEmitsLeaderboard(unittest.TestCase): def test_per_parser_gt_leaderboard_csv_written(self): ground_truth = { "layout_dets": [ {"category": "title", "bbox": [0, 0, 100, 30], "page_num": 1}, ] } with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) src = tmp / "in" src.mkdir() md_path = src / "doc.md" md_path.write_text("# Doc\n\nSome text.\n", encoding="utf-8") def fake_loader(root: Path): yield DatasetDocument( dataset_id="omnidocbench", doc_id="doc", path=md_path, ground_truth=ground_truth, metadata={}, ) register_dataset_loader("omnidocbench", fake_loader) try: summary = run_parser_benchmark(src, tmp / "out", dataset_name="omnidocbench") finally: from zsgdp.benchmarks.datasets import _load_omnidocbench _DATASET_LOADERS["omnidocbench"] = _load_omnidocbench self.assertIn("per_parser_gt_leaderboard", summary) csv_path = tmp / "out" / "per_parser_gt_leaderboard.csv" self.assertTrue(csv_path.exists()) content = csv_path.read_text() header = content.splitlines()[0] self.assertIn("parser", header) self.assertIn("mean_layout_class_aware_f1", header) self.assertIn("layout_evaluated_count", header) if __name__ == "__main__": unittest.main()