# zeroshotGPU/tests/test_cross_dataset.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
"""Tests for cross-dataset benchmark comparison."""
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from zsgdp.benchmarks.cross_dataset import (
combine_benchmark_summaries,
write_cross_dataset_outputs,
)
def _summary(dataset_name: str, *, layout_f1: float, leaderboard: list[dict] | None = None) -> dict:
return {
"dataset_name": dataset_name,
"dataset_root": f"/tmp/{dataset_name}",
"document_count": 5,
"mean_quality_score": 0.9,
"mean_layout_f1": layout_f1,
"mean_retrieval_recall_at_5": 0.7,
"mean_table_structure_score": 0.6,
"mean_formula_cer": 0.2,
"per_parser_gt_leaderboard": leaderboard or [],
}
class TestCombineBenchmarkSummaries(unittest.TestCase):
    """Checks on the dict returned by ``combine_benchmark_summaries``."""

    def test_two_runs_produce_two_rows(self):
        # One summary row per labelled run, in the order the runs were given.
        layout_by_label = {"docs_a": 0.5, "docs_b": 0.8}
        labelled_runs = [
            (label, _summary(label, layout_f1=score))
            for label, score in layout_by_label.items()
        ]
        result = combine_benchmark_summaries(labelled_runs)
        self.assertEqual(result["run_count"], 2)
        self.assertEqual(result["labels"], ["docs_a", "docs_b"])
        summary_rows = result["dataset_summary"]
        self.assertEqual([r["label"] for r in summary_rows], ["docs_a", "docs_b"])
        observed = {r["label"]: r["mean_layout_f1"] for r in summary_rows}
        self.assertEqual(observed, {"docs_a": 0.5, "docs_b": 0.8})

    def test_parser_matrix_aligns_parsers_across_runs(self):
        run_a = _summary(
            "a",
            layout_f1=0.5,
            leaderboard=[
                {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3},
                {"parser": "pymupdf", "mean_layout_class_aware_f1": 0.4, "document_count": 3},
            ],
        )
        run_b = _summary(
            "b",
            layout_f1=0.7,
            leaderboard=[
                {"parser": "docling", "mean_layout_class_aware_f1": 0.7, "document_count": 5},
                # "marker" is present only in run B.
                {"parser": "marker", "mean_layout_class_aware_f1": 0.6, "document_count": 5},
            ],
        )
        result = combine_benchmark_summaries([("a", run_a), ("b", run_b)])
        matrix_rows = result["parser_matrix"]
        self.assertEqual(
            sorted(r["parser"] for r in matrix_rows),
            ["docling", "marker", "pymupdf"],
        )
        rows_by_parser = {r["parser"]: r for r in matrix_rows}

        # docling was scored in both runs.
        docling = rows_by_parser["docling"]
        self.assertEqual(docling["a__mean_layout_class_aware_f1"], 0.9)
        self.assertEqual(docling["b__mean_layout_class_aware_f1"], 0.7)

        # marker is absent from run A (None) but scored in run B.
        marker = rows_by_parser["marker"]
        self.assertIsNone(marker["a__mean_layout_class_aware_f1"])
        self.assertEqual(marker["b__mean_layout_class_aware_f1"], 0.6)

        # pymupdf is absent from run B (None).
        pymupdf = rows_by_parser["pymupdf"]
        self.assertIsNone(pymupdf["b__mean_layout_class_aware_f1"])

    def test_duplicate_labels_raise(self):
        # Two runs sharing the label "same" must be rejected with ValueError.
        clashing_runs = [
            ("same", _summary("a", layout_f1=0.5)),
            ("same", _summary("b", layout_f1=0.7)),
        ]
        with self.assertRaises(ValueError):
            combine_benchmark_summaries(clashing_runs)

    def test_summary_loaded_from_path(self):
        # A run may be given as a directory containing results.json.
        with tempfile.TemporaryDirectory() as scratch:
            run_dir = Path(scratch)
            payload = json.dumps(_summary("from_path", layout_f1=0.42))
            (run_dir / "results.json").write_text(payload)
            result = combine_benchmark_summaries([("a", run_dir)])
            first_row = result["dataset_summary"][0]
            self.assertEqual(first_row["mean_layout_f1"], 0.42)

    def test_missing_metric_yields_none_not_zero(self):
        # A summary lacking metric fields (e.g. written by older code) must
        # surface them as None rather than 0.
        legacy_summary = {"dataset_name": "old_run", "document_count": 1}
        result = combine_benchmark_summaries([("old", legacy_summary)])
        only_row = result["dataset_summary"][0]
        self.assertEqual(only_row["document_count"], 1)
        for absent_metric in ("mean_layout_f1", "mean_formula_cer"):
            self.assertIsNone(only_row[absent_metric])
class TestWriteCrossDatasetOutputs(unittest.TestCase):
    """Checks on the files produced by ``write_cross_dataset_outputs``."""

    def test_writes_json_and_csvs(self):
        docling_entry = {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}
        single_run = ("a", _summary("a", layout_f1=0.5, leaderboard=[docling_entry]))
        result = combine_benchmark_summaries([single_run])

        # All file checks stay inside the context manager: the directory is
        # removed as soon as the block exits.
        with tempfile.TemporaryDirectory() as scratch:
            out_dir = Path(scratch)
            write_cross_dataset_outputs(result, out_dir)

            expected_files = (
                "cross_dataset_comparison.json",
                "dataset_summary.csv",
                "parser_matrix.csv",
            )
            for filename in expected_files:
                self.assertTrue((out_dir / filename).exists())

            # First line is checked for the metric column, second for the run label.
            summary_lines = (out_dir / "dataset_summary.csv").read_text().splitlines()
            self.assertIn("mean_layout_f1", summary_lines[0])
            self.assertIn("a", summary_lines[1])

            # The matrix's first line must carry the label-prefixed metric column.
            matrix_lines = (out_dir / "parser_matrix.csv").read_text().splitlines()
            self.assertIn("a__mean_layout_class_aware_f1", matrix_lines[0])
# Run this module's tests via unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()