zeroshotGPU / tests /test_per_parser_leaderboard.py
Arjunvir Singh
Initial commit: zeroshotGPU MVP with full eval surface
db06ffa
"""Tests for the per-parser GT-comparison leaderboard rollup."""
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader
from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS
from zsgdp.benchmarks.parser_quality import _per_parser_gt_leaderboard, run_parser_benchmark
class TestPerParserGTLeaderboard(unittest.TestCase):
    """Unit tests for ``_per_parser_gt_leaderboard`` on hand-built result rows."""

    @staticmethod
    def _full_row(parser, layout_f1, agnostic_f1, element_count, **overrides):
        """Build one fully-populated per-document result row.

        Precision and recall mirror ``layout_f1`` and the layout prediction
        count mirrors ``element_count``. Table and formula fields default to
        "not evaluated" with zero scores unless replaced through ``overrides``.
        """
        row = {
            "parser": parser,
            "layout_evaluated": True,
            "table_evaluated": False,
            "formula_evaluated": False,
            "layout_class_aware_f1": layout_f1,
            "layout_class_agnostic_f1": agnostic_f1,
            "layout_class_aware_precision": layout_f1,
            "layout_class_aware_recall": layout_f1,
            "layout_prediction_count": element_count,
            "table_structure_score": 0.0,
            "table_match_rate": 0.0,
            "table_cell_content_f1": 0.0,
            "formula_cer": 0.0,
            "formula_accuracy": 0.0,
            "formula_exact_match_rate": 0.0,
            "element_count": element_count,
            "table_count": 0,
            "figure_count": 0,
        }
        # update() only swaps values of existing keys, so key order is stable.
        row.update(overrides)
        return row

    def test_aggregates_by_parser(self):
        """Rows sharing a parser name collapse into a single leaderboard entry."""
        docling_with_tables = self._full_row(
            "docling",
            0.9,
            0.95,
            10,
            table_evaluated=True,
            table_structure_score=0.8,
            table_match_rate=0.9,
            table_cell_content_f1=0.7,
            table_count=2,
            figure_count=1,
        )
        docling_layout_only = self._full_row("docling", 0.7, 0.75, 8)
        pymupdf_layout_only = self._full_row("pymupdf", 0.5, 0.55, 6)

        result = _per_parser_gt_leaderboard(
            [docling_with_tables, docling_layout_only, pymupdf_layout_only]
        )
        by_parser = {entry["parser"]: entry for entry in result}

        # Docling appears once with 2 documents, 2 layout-evaluated, 1 table-evaluated.
        docling = by_parser["docling"]
        expected_counts = {
            "document_count": 2,
            "layout_evaluated_count": 2,
            "table_evaluated_count": 1,
            "formula_evaluated_count": 0,
        }
        for column, expected in expected_counts.items():
            self.assertEqual(docling[column], expected)
        self.assertAlmostEqual(docling["mean_layout_class_aware_f1"], 0.8, places=6)
        # Table mean uses only the row that was evaluated.
        self.assertAlmostEqual(docling["mean_table_structure_score"], 0.8, places=6)

        # PyMuPDF appears once.
        pymupdf = by_parser["pymupdf"]
        self.assertEqual(pymupdf["document_count"], 1)
        self.assertAlmostEqual(pymupdf["mean_layout_class_aware_f1"], 0.5, places=6)

    def test_sorted_by_layout_then_table_then_formula_inverse(self):
        """The parser with the higher layout F1 is listed first."""
        rows = [
            {
                "parser": name,
                "layout_evaluated": True,
                "layout_class_aware_f1": score,
                "layout_class_agnostic_f1": score,
            }
            for name, score in (("low", 0.3), ("high", 0.9))
        ]
        ordering = [entry["parser"] for entry in _per_parser_gt_leaderboard(rows)]
        self.assertEqual(ordering, ["high", "low"])

    def test_empty_rows_returns_empty_list(self):
        """No input rows yields an empty leaderboard."""
        no_rows = []
        self.assertEqual(_per_parser_gt_leaderboard(no_rows), [])
class TestBenchmarkEmitsLeaderboard(unittest.TestCase):
    """Checks that ``run_parser_benchmark`` reports and writes the leaderboard."""

    def test_per_parser_gt_leaderboard_csv_written(self):
        """Run the benchmark on one markdown document served by a fake loader.

        Verifies that the returned summary has a ``per_parser_gt_leaderboard``
        entry and that ``per_parser_gt_leaderboard.csv`` is written to the
        output directory with the expected column names in its header.
        """
        ground_truth = {
            "layout_dets": [
                {"category": "title", "bbox": [0, 0, 100, 30], "page_num": 1},
            ]
        }
        # Sentinel so "no loader registered" is distinguishable from any value.
        not_registered = object()

        with tempfile.TemporaryDirectory() as tmp_name:
            tmp = Path(tmp_name)
            src = tmp / "in"
            src.mkdir()
            md_path = src / "doc.md"
            md_path.write_text("# Doc\n\nSome text.\n", encoding="utf-8")

            def fake_loader(root: Path):
                # Ignores ``root`` and always yields the single fixture document.
                yield DatasetDocument(
                    dataset_id="omnidocbench",
                    doc_id="doc",
                    path=md_path,
                    ground_truth=ground_truth,
                    metadata={},
                )

            # Snapshot the registry entry so the test restores exactly what was
            # there before, instead of assuming which loader was registered.
            previous_loader = _DATASET_LOADERS.get("omnidocbench", not_registered)
            register_dataset_loader("omnidocbench", fake_loader)
            try:
                summary = run_parser_benchmark(src, tmp / "out", dataset_name="omnidocbench")
            finally:
                if previous_loader is not_registered:
                    _DATASET_LOADERS.pop("omnidocbench", None)
                else:
                    _DATASET_LOADERS["omnidocbench"] = previous_loader

            self.assertIn("per_parser_gt_leaderboard", summary)
            csv_path = tmp / "out" / "per_parser_gt_leaderboard.csv"
            self.assertTrue(csv_path.exists())

            # Explicit encoding keeps the read independent of the platform locale.
            lines = csv_path.read_text(encoding="utf-8").splitlines()
            # Fail with a clear message rather than an IndexError on an empty file.
            self.assertTrue(lines, "per_parser_gt_leaderboard.csv is empty")
            header = lines[0]
            self.assertIn("parser", header)
            self.assertIn("mean_layout_class_aware_f1", header)
            self.assertIn("layout_evaluated_count", header)
# Allow the module to be executed directly as a script, not only via a runner.
if __name__ == "__main__":
    unittest.main()