"""Tests for cross-dataset benchmark comparison.""" from __future__ import annotations import json import tempfile import unittest from pathlib import Path from zsgdp.benchmarks.cross_dataset import ( combine_benchmark_summaries, write_cross_dataset_outputs, ) def _summary(dataset_name: str, *, layout_f1: float, leaderboard: list[dict] | None = None) -> dict: return { "dataset_name": dataset_name, "dataset_root": f"/tmp/{dataset_name}", "document_count": 5, "mean_quality_score": 0.9, "mean_layout_f1": layout_f1, "mean_retrieval_recall_at_5": 0.7, "mean_table_structure_score": 0.6, "mean_formula_cer": 0.2, "per_parser_gt_leaderboard": leaderboard or [], } class TestCombineBenchmarkSummaries(unittest.TestCase): def test_two_runs_produce_two_rows(self): runs = [ ("docs_a", _summary("docs_a", layout_f1=0.5)), ("docs_b", _summary("docs_b", layout_f1=0.8)), ] comparison = combine_benchmark_summaries(runs) self.assertEqual(comparison["run_count"], 2) self.assertEqual(comparison["labels"], ["docs_a", "docs_b"]) self.assertEqual([row["label"] for row in comparison["dataset_summary"]], ["docs_a", "docs_b"]) layouts = {row["label"]: row["mean_layout_f1"] for row in comparison["dataset_summary"]} self.assertEqual(layouts, {"docs_a": 0.5, "docs_b": 0.8}) def test_parser_matrix_aligns_parsers_across_runs(self): leaderboard_a = [ {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}, {"parser": "pymupdf", "mean_layout_class_aware_f1": 0.4, "document_count": 3}, ] leaderboard_b = [ {"parser": "docling", "mean_layout_class_aware_f1": 0.7, "document_count": 5}, # marker only appears in run B. {"parser": "marker", "mean_layout_class_aware_f1": 0.6, "document_count": 5}, ] runs = [ ("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard_a)), ("b", _summary("b", layout_f1=0.7, leaderboard=leaderboard_b)), ] comparison = combine_benchmark_summaries(runs) matrix = comparison["parser_matrix"] parsers = sorted(row["parser"] for row in matrix) self.assertEqual(parsers, ["docling", "marker", "pymupdf"]) by_parser = {row["parser"]: row for row in matrix} # Docling appears in both runs. self.assertEqual(by_parser["docling"]["a__mean_layout_class_aware_f1"], 0.9) self.assertEqual(by_parser["docling"]["b__mean_layout_class_aware_f1"], 0.7) # Marker missing in run A -> None, present in B. self.assertIsNone(by_parser["marker"]["a__mean_layout_class_aware_f1"]) self.assertEqual(by_parser["marker"]["b__mean_layout_class_aware_f1"], 0.6) # PyMuPDF missing in run B -> None. self.assertIsNone(by_parser["pymupdf"]["b__mean_layout_class_aware_f1"]) def test_duplicate_labels_raise(self): with self.assertRaises(ValueError): combine_benchmark_summaries( [ ("same", _summary("a", layout_f1=0.5)), ("same", _summary("b", layout_f1=0.7)), ] ) def test_summary_loaded_from_path(self): with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) (tmp / "results.json").write_text(json.dumps(_summary("from_path", layout_f1=0.42))) comparison = combine_benchmark_summaries([("a", tmp)]) self.assertEqual(comparison["dataset_summary"][0]["mean_layout_f1"], 0.42) def test_missing_metric_yields_none_not_zero(self): # A summary missing mean_formula_cer (older code, e.g.) preserves None. sparse_summary = {"dataset_name": "old_run", "document_count": 1} comparison = combine_benchmark_summaries([("old", sparse_summary)]) row = comparison["dataset_summary"][0] self.assertEqual(row["document_count"], 1) self.assertIsNone(row["mean_layout_f1"]) self.assertIsNone(row["mean_formula_cer"]) class TestWriteCrossDatasetOutputs(unittest.TestCase): def test_writes_json_and_csvs(self): leaderboard = [{"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}] comparison = combine_benchmark_summaries( [("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard))] ) with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) write_cross_dataset_outputs(comparison, tmp) self.assertTrue((tmp / "cross_dataset_comparison.json").exists()) self.assertTrue((tmp / "dataset_summary.csv").exists()) self.assertTrue((tmp / "parser_matrix.csv").exists()) ds_csv = (tmp / "dataset_summary.csv").read_text() self.assertIn("mean_layout_f1", ds_csv.splitlines()[0]) self.assertIn("a", ds_csv.splitlines()[1]) matrix_csv = (tmp / "parser_matrix.csv").read_text() self.assertIn("a__mean_layout_class_aware_f1", matrix_csv.splitlines()[0]) if __name__ == "__main__": unittest.main()