# Spaces: Running on Zero  (hosting-page header captured with this file; not part of the module)
| """Tests for cross-dataset benchmark comparison.""" | |
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.benchmarks.cross_dataset import ( | |
| combine_benchmark_summaries, | |
| write_cross_dataset_outputs, | |
| ) | |
| def _summary(dataset_name: str, *, layout_f1: float, leaderboard: list[dict] | None = None) -> dict: | |
| return { | |
| "dataset_name": dataset_name, | |
| "dataset_root": f"/tmp/{dataset_name}", | |
| "document_count": 5, | |
| "mean_quality_score": 0.9, | |
| "mean_layout_f1": layout_f1, | |
| "mean_retrieval_recall_at_5": 0.7, | |
| "mean_table_structure_score": 0.6, | |
| "mean_formula_cer": 0.2, | |
| "per_parser_gt_leaderboard": leaderboard or [], | |
| } | |
class TestCombineBenchmarkSummaries(unittest.TestCase):
    """Tests for ``combine_benchmark_summaries`` over labelled benchmark runs."""

    def test_two_runs_produce_two_rows(self):
        """Two labelled runs yield two dataset-summary rows in input order."""
        runs = [
            ("docs_a", _summary("docs_a", layout_f1=0.5)),
            ("docs_b", _summary("docs_b", layout_f1=0.8)),
        ]
        comparison = combine_benchmark_summaries(runs)

        self.assertEqual(comparison["run_count"], 2)
        self.assertEqual(comparison["labels"], ["docs_a", "docs_b"])
        self.assertEqual(
            [row["label"] for row in comparison["dataset_summary"]],
            ["docs_a", "docs_b"],
        )
        layouts = {row["label"]: row["mean_layout_f1"] for row in comparison["dataset_summary"]}
        self.assertEqual(layouts, {"docs_a": 0.5, "docs_b": 0.8})

    def test_parser_matrix_aligns_parsers_across_runs(self):
        """The parser matrix has one row per parser seen in any run, with None gaps."""
        leaderboard_a = [
            {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3},
            {"parser": "pymupdf", "mean_layout_class_aware_f1": 0.4, "document_count": 3},
        ]
        leaderboard_b = [
            {"parser": "docling", "mean_layout_class_aware_f1": 0.7, "document_count": 5},
            # marker only appears in run B.
            {"parser": "marker", "mean_layout_class_aware_f1": 0.6, "document_count": 5},
        ]
        runs = [
            ("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard_a)),
            ("b", _summary("b", layout_f1=0.7, leaderboard=leaderboard_b)),
        ]
        comparison = combine_benchmark_summaries(runs)

        matrix = comparison["parser_matrix"]
        parsers = sorted(row["parser"] for row in matrix)
        self.assertEqual(parsers, ["docling", "marker", "pymupdf"])

        by_parser = {row["parser"]: row for row in matrix}
        # Docling appears in both runs.
        self.assertEqual(by_parser["docling"]["a__mean_layout_class_aware_f1"], 0.9)
        self.assertEqual(by_parser["docling"]["b__mean_layout_class_aware_f1"], 0.7)
        # Marker missing in run A -> None, present in B.
        self.assertIsNone(by_parser["marker"]["a__mean_layout_class_aware_f1"])
        self.assertEqual(by_parser["marker"]["b__mean_layout_class_aware_f1"], 0.6)
        # PyMuPDF missing in run B -> None.
        self.assertIsNone(by_parser["pymupdf"]["b__mean_layout_class_aware_f1"])

    def test_duplicate_labels_raise(self):
        """Reusing a label for two runs is rejected with ValueError."""
        with self.assertRaises(ValueError):
            combine_benchmark_summaries(
                [
                    ("same", _summary("a", layout_f1=0.5)),
                    ("same", _summary("b", layout_f1=0.7)),
                ]
            )

    def test_summary_loaded_from_path(self):
        """A directory containing ``results.json`` may be passed instead of a dict."""
        with tempfile.TemporaryDirectory() as tmp:
            # Use a separate name rather than rebinding the context variable.
            run_dir = Path(tmp)
            (run_dir / "results.json").write_text(
                json.dumps(_summary("from_path", layout_f1=0.42)),
                encoding="utf-8",
            )
            # Must run while the temporary directory still exists.
            comparison = combine_benchmark_summaries([("a", run_dir)])
            self.assertEqual(comparison["dataset_summary"][0]["mean_layout_f1"], 0.42)

    def test_missing_metric_yields_none_not_zero(self):
        """Metrics absent from a summary come back as None, not 0."""
        # A summary missing mean_formula_cer (older code, e.g.) preserves None.
        sparse_summary = {"dataset_name": "old_run", "document_count": 1}
        comparison = combine_benchmark_summaries([("old", sparse_summary)])

        row = comparison["dataset_summary"][0]
        self.assertEqual(row["document_count"], 1)
        self.assertIsNone(row["mean_layout_f1"])
        self.assertIsNone(row["mean_formula_cer"])
class TestWriteCrossDatasetOutputs(unittest.TestCase):
    """Tests for ``write_cross_dataset_outputs`` file emission."""

    def test_writes_json_and_csvs(self):
        """Writing a comparison creates the JSON file and both CSVs with expected headers."""
        leaderboard = [{"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}]
        comparison = combine_benchmark_summaries(
            [("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard))]
        )
        with tempfile.TemporaryDirectory() as tmp:
            # Use a separate name rather than rebinding the context variable.
            out_dir = Path(tmp)
            write_cross_dataset_outputs(comparison, out_dir)

            # Every check below reads files, so it must stay inside the
            # TemporaryDirectory scope.
            self.assertTrue((out_dir / "cross_dataset_comparison.json").exists())
            self.assertTrue((out_dir / "dataset_summary.csv").exists())
            self.assertTrue((out_dir / "parser_matrix.csv").exists())

            ds_lines = (out_dir / "dataset_summary.csv").read_text(encoding="utf-8").splitlines()
            self.assertIn("mean_layout_f1", ds_lines[0])
            # NOTE(review): substring check only; "a" would also match other
            # cells in the row. Tighten once the CSV column layout is confirmed.
            self.assertIn("a", ds_lines[1])

            matrix_lines = (out_dir / "parser_matrix.csv").read_text(encoding="utf-8").splitlines()
            self.assertIn("a__mean_layout_class_aware_f1", matrix_lines[0])
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()