"""Tests for per-parser GT-comparison metrics within a single merged run.""" from __future__ import annotations import json import tempfile import unittest from pathlib import Path from types import SimpleNamespace from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS from zsgdp.benchmarks.parser_quality import run_parser_benchmark from zsgdp.benchmarks.per_parser_metrics import compute_per_parser_metrics def _parsed_with_candidates(candidates: dict) -> SimpleNamespace: return SimpleNamespace(provenance={"candidates": candidates}) class TestComputePerParserMetrics(unittest.TestCase): def test_returns_one_block_per_parser_with_layout_truths(self): candidates = { "docling": { "elements": [ {"element_id": "e1", "type": "title", "page_num": 1, "bbox": [0, 0, 100, 30]}, ], "tables": [], "figures": [], }, "pymupdf": { "elements": [ {"element_id": "e2", "type": "paragraph", "page_num": 1, "bbox": [200, 200, 300, 300]}, ], "tables": [], "figures": [], }, } layout_truths = [{"bbox": (0, 0, 100, 30), "category": "title", "page_num": 1}] result = compute_per_parser_metrics( _parsed_with_candidates(candidates), layout_truths=layout_truths, ) self.assertEqual(set(result), {"docling", "pymupdf"}) self.assertEqual(result["docling"]["layout"]["class_aware_f1"], 1.0) # PyMuPDF predicted a paragraph far from any truth -> 0 F1. self.assertEqual(result["pymupdf"]["layout"]["class_aware_f1"], 0.0) # Element counts surfaced even when the parser scored zero. self.assertEqual(result["pymupdf"]["element_count"], 1) def test_omits_metric_block_when_truths_empty(self): candidates = { "docling": { "elements": [{"element_id": "e1", "type": "title", "page_num": 1, "bbox": [0, 0, 10, 10]}], "tables": [], "figures": [], }, } result = compute_per_parser_metrics(_parsed_with_candidates(candidates)) self.assertEqual(set(result["docling"]), {"parser", "element_count", "table_count", "figure_count"}) def test_table_and_formula_metrics_per_parser(self): candidates = { "docling": { "elements": [ {"element_id": "f1", "type": "formula", "page_num": 1, "text": "E = mc^2"}, ], "tables": [ {"table_id": "t1", "page_nums": [1], "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |"}, ], "figures": [], }, "pymupdf": { "elements": [ {"element_id": "f2", "type": "formula", "page_num": 1, "text": "E = mc^9"}, ], "tables": [], "figures": [], }, } table_truths = [{"markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1}] formula_truths = [{"latex": "E = mc^2", "page_num": 1}] result = compute_per_parser_metrics( _parsed_with_candidates(candidates), table_truths=table_truths, formula_truths=formula_truths, ) # Docling matches table and formula exactly. self.assertEqual(result["docling"]["table_structure"]["mean_table_score"], 1.0) self.assertEqual(result["docling"]["formula"]["mean_cer"], 0.0) # PyMuPDF's formula is one char off; table predictions empty. self.assertGreater(result["pymupdf"]["formula"]["mean_cer"], 0.0) self.assertEqual(result["pymupdf"]["table_structure"]["matched_pair_count"], 0) def test_no_candidates_returns_empty_dict(self): parsed = SimpleNamespace(provenance={"candidates": {}}) self.assertEqual(compute_per_parser_metrics(parsed, layout_truths=[]), {}) class TestPipelinePopulatesCandidates(unittest.TestCase): def test_candidates_serialized_to_provenance(self): with tempfile.TemporaryDirectory() as tmp: input_path = Path(tmp) / "doc.md" input_path.write_text("# Doc\n\nSome content.\n", encoding="utf-8") from zsgdp.pipeline import parse_document parsed = parse_document(input_path, Path(tmp) / "out") candidates = parsed.provenance.get("candidates") self.assertIsInstance(candidates, dict) self.assertGreater(len(candidates), 0) # text parser should be one of the candidates for markdown. self.assertIn("text", candidates) self.assertIn("elements", candidates["text"]) class TestBenchmarkIntegration(unittest.TestCase): def test_per_parser_csv_emitted_with_omnidocbench_truths(self): ground_truth = { "layout_dets": [ {"category": "title", "bbox": [0, 0, 100, 30], "page_num": 1}, {"category": "table", "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |", "page_num": 1}, ] } with tempfile.TemporaryDirectory() as tmp: tmp = Path(tmp) src = tmp / "in" src.mkdir() md_path = src / "doc.md" md_path.write_text("# Doc\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n", encoding="utf-8") def fake_loader(root: Path): yield DatasetDocument( dataset_id="omnidocbench", doc_id="doc", path=md_path, ground_truth=ground_truth, metadata={}, ) register_dataset_loader("omnidocbench", fake_loader) try: summary = run_parser_benchmark(src, tmp / "out", dataset_name="omnidocbench") finally: from zsgdp.benchmarks.datasets import _load_omnidocbench _DATASET_LOADERS["omnidocbench"] = _load_omnidocbench doc = summary["documents"][0] self.assertIn("per_parser_metrics", doc) self.assertGreater(len(doc["per_parser_metrics"]), 0) csv_path = tmp / "out" / "per_parser_metrics.csv" self.assertTrue(csv_path.exists()) content = csv_path.read_text() self.assertIn("parser", content.splitlines()[0]) self.assertGreater(len(content.splitlines()), 1) if __name__ == "__main__": unittest.main()