# zeroshotGPU / tests / test_per_parser_metrics.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
"""Tests for per-parser GT-comparison metrics within a single merged run."""
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader
from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS
from zsgdp.benchmarks.parser_quality import run_parser_benchmark
from zsgdp.benchmarks.per_parser_metrics import compute_per_parser_metrics
def _parsed_with_candidates(candidates: dict) -> SimpleNamespace:
return SimpleNamespace(provenance={"candidates": candidates})
class TestComputePerParserMetrics(unittest.TestCase):
    """Checks on ``compute_per_parser_metrics`` using hand-built candidates."""

    @staticmethod
    def _candidate(elements, tables=()):
        # One parser's candidate payload; none of these tests use figures.
        return {
            "elements": list(elements),
            "tables": list(tables),
            "figures": [],
        }

    def test_returns_one_block_per_parser_with_layout_truths(self):
        title_element = {
            "element_id": "e1",
            "type": "title",
            "page_num": 1,
            "bbox": [0, 0, 100, 30],
        }
        distant_paragraph = {
            "element_id": "e2",
            "type": "paragraph",
            "page_num": 1,
            "bbox": [200, 200, 300, 300],
        }
        parsed = _parsed_with_candidates(
            {
                "docling": self._candidate([title_element]),
                "pymupdf": self._candidate([distant_paragraph]),
            }
        )
        truths = [{"bbox": (0, 0, 100, 30), "category": "title", "page_num": 1}]

        metrics = compute_per_parser_metrics(parsed, layout_truths=truths)

        self.assertEqual(set(metrics), {"docling", "pymupdf"})
        docling_layout = metrics["docling"]["layout"]
        self.assertEqual(docling_layout["class_aware_f1"], 1.0)
        # PyMuPDF predicted a paragraph far from any truth -> 0 F1.
        pymupdf_block = metrics["pymupdf"]
        self.assertEqual(pymupdf_block["layout"]["class_aware_f1"], 0.0)
        # Element counts surfaced even when the parser scored zero.
        self.assertEqual(pymupdf_block["element_count"], 1)

    def test_omits_metric_block_when_truths_empty(self):
        only_element = {
            "element_id": "e1",
            "type": "title",
            "page_num": 1,
            "bbox": [0, 0, 10, 10],
        }
        parsed = _parsed_with_candidates(
            {"docling": self._candidate([only_element])}
        )

        metrics = compute_per_parser_metrics(parsed)

        # With no truths passed, only the identity/count keys are expected.
        expected_keys = {"parser", "element_count", "table_count", "figure_count"}
        self.assertEqual(set(metrics["docling"]), expected_keys)

    def test_table_and_formula_metrics_per_parser(self):
        table_markdown = "| A | B |\n| --- | --- |\n| 1 | 2 |"
        exact_formula = {
            "element_id": "f1",
            "type": "formula",
            "page_num": 1,
            "text": "E = mc^2",
        }
        off_by_one_formula = {
            "element_id": "f2",
            "type": "formula",
            "page_num": 1,
            "text": "E = mc^9",
        }
        docling_table = {
            "table_id": "t1",
            "page_nums": [1],
            "markdown": table_markdown,
        }
        parsed = _parsed_with_candidates(
            {
                "docling": self._candidate([exact_formula], [docling_table]),
                "pymupdf": self._candidate([off_by_one_formula]),
            }
        )

        metrics = compute_per_parser_metrics(
            parsed,
            table_truths=[{"markdown": table_markdown, "page_num": 1}],
            formula_truths=[{"latex": "E = mc^2", "page_num": 1}],
        )

        # Docling matches table and formula exactly.
        docling = metrics["docling"]
        self.assertEqual(docling["table_structure"]["mean_table_score"], 1.0)
        self.assertEqual(docling["formula"]["mean_cer"], 0.0)
        # PyMuPDF's formula is one char off; table predictions empty.
        pymupdf = metrics["pymupdf"]
        self.assertGreater(pymupdf["formula"]["mean_cer"], 0.0)
        self.assertEqual(pymupdf["table_structure"]["matched_pair_count"], 0)

    def test_no_candidates_returns_empty_dict(self):
        empty_parsed = _parsed_with_candidates({})
        metrics = compute_per_parser_metrics(empty_parsed, layout_truths=[])
        self.assertEqual(metrics, {})
class TestPipelinePopulatesCandidates(unittest.TestCase):
    """Checks that ``parse_document`` records candidates in provenance."""

    def test_candidates_serialized_to_provenance(self):
        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            markdown_file = root / "doc.md"
            markdown_file.write_text("# Doc\n\nSome content.\n", encoding="utf-8")

            from zsgdp.pipeline import parse_document

            parsed = parse_document(markdown_file, root / "out")
            per_parser = parsed.provenance.get("candidates")

            self.assertIsInstance(per_parser, dict)
            self.assertGreater(len(per_parser), 0)
            # text parser should be one of the candidates for markdown.
            self.assertIn("text", per_parser)
            self.assertIn("elements", per_parser["text"])
class TestBenchmarkIntegration(unittest.TestCase):
    """Checks the per-parser metrics output of ``run_parser_benchmark``.

    The test temporarily replaces the ``"omnidocbench"`` dataset loader with
    a fake that yields one markdown document plus ground truth, then checks
    the returned summary and the ``per_parser_metrics.csv`` file.
    """

    def test_per_parser_csv_emitted_with_omnidocbench_truths(self):
        ground_truth = {
            "layout_dets": [
                {"category": "title", "bbox": [0, 0, 100, 30], "page_num": 1},
                {
                    "category": "table",
                    "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |",
                    "page_num": 1,
                },
            ]
        }
        with tempfile.TemporaryDirectory() as tmp_name:
            tmp = Path(tmp_name)
            src = tmp / "in"
            src.mkdir()
            md_path = src / "doc.md"
            md_path.write_text(
                "# Doc\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n",
                encoding="utf-8",
            )

            def fake_loader(root: Path):
                # Ignores ``root``: always yields the single fixture document.
                yield DatasetDocument(
                    dataset_id="omnidocbench",
                    doc_id="doc",
                    path=md_path,
                    ground_truth=ground_truth,
                    metadata={},
                )

            # Snapshot whatever loader is registered right now so the global
            # registry is put back exactly as found, rather than assuming the
            # built-in loader was the one in place.
            not_registered = object()
            previous_loader = _DATASET_LOADERS.get("omnidocbench", not_registered)
            try:
                # Registration happens inside the try so that cleanup also
                # runs if registering (or the benchmark) raises.
                register_dataset_loader("omnidocbench", fake_loader)
                summary = run_parser_benchmark(
                    src, tmp / "out", dataset_name="omnidocbench"
                )
            finally:
                if previous_loader is not_registered:
                    _DATASET_LOADERS.pop("omnidocbench", None)
                else:
                    _DATASET_LOADERS["omnidocbench"] = previous_loader

            doc = summary["documents"][0]
            self.assertIn("per_parser_metrics", doc)
            self.assertGreater(len(doc["per_parser_metrics"]), 0)

            csv_path = tmp / "out" / "per_parser_metrics.csv"
            self.assertTrue(csv_path.exists())
            # Explicit encoding: the default depends on the platform locale.
            lines = csv_path.read_text(encoding="utf-8").splitlines()
            # Check the row count before indexing so an empty file fails as
            # an assertion instead of an IndexError.
            self.assertGreater(len(lines), 1)
            self.assertIn("parser", lines[0])
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()