# zeroshotGPU/tests/test_parser_disagreement.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
"""Tests for parser-disagreement and repair-success metrics."""
from __future__ import annotations
import tempfile
import unittest
from pathlib import Path
from zsgdp.merge.conflict_detection import build_candidate_conflict_report
from zsgdp.pipeline import parse_document
from zsgdp.schema import DocumentProfile, Element, ParseCandidate, PageProfile, TableObject
from zsgdp.verify.parser_disagreement import compute_parser_disagreement
from zsgdp.verify.repair_success import compute_repair_success
def _profile() -> DocumentProfile:
    """Build a minimal one-page markdown ``DocumentProfile`` fixture for doc ``d1``."""
    only_page = PageProfile(
        page_num=1,
        digital_text_chars=400,
        digital_text_quality=0.9,
    )
    profile = DocumentProfile(
        doc_id="d1",
        source_path="/tmp/d1.md",
        file_type="markdown",
        page_count=1,
        extension=".md",
        pages=[only_page],
    )
    return profile
def _candidate(name: str, *, text: str, table_count: int = 0) -> ParseCandidate:
    """Create a synthetic single-page ``ParseCandidate`` attributed to parser *name*.

    The candidate carries exactly one paragraph element whose body is *text*
    and *table_count* identical two-column markdown tables, all placed on
    page 1 of document ``d1``.

    Args:
        name: Parser name; also used as the prefix of element/table ids.
        text: Text of the single paragraph element.
        table_count: How many tables to attach (ids are numbered from 1).

    Returns:
        The assembled candidate with a fixed confidence of 0.8 and no figures.
    """
    paragraph = Element(
        element_id=f"{name}_e1",
        doc_id="d1",
        page_num=1,
        type="paragraph",
        text=text,
        reading_order=1,
        source_parser=name,
    )
    # Table ids are 1-based: "<name>_t1", "<name>_t2", ...
    tables: list[TableObject] = [
        TableObject(
            table_id=f"{name}_t{ordinal}",
            page_nums=[1],
            markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
            source_parser=name,
        )
        for ordinal in range(1, table_count + 1)
    ]
    page_record = {"page_num": 1, "source_parser": name}
    return ParseCandidate(
        parser_name=name,
        doc_id="d1",
        source_path="/tmp/d1.md",
        file_type="markdown",
        elements=[paragraph],
        tables=tables,
        figures=[],
        pages=[page_record],
        confidence=0.8,
    )
class TestParserDisagreement(unittest.TestCase):
    """Checks on the summary dict returned by ``compute_parser_disagreement``."""

    def test_disagreement_rate_uses_pair_count_denominator(self):
        """Two diverging candidates give one parser pair and a positive rate."""
        # One candidate has much more text and four tables; the other has none.
        verbose = _candidate("docling", text="A" * 800, table_count=4)
        sparse = _candidate("pymupdf", text="A" * 100, table_count=0)
        conflict_report = build_candidate_conflict_report([verbose, sparse])
        healthy_parsers = {
            parser: {"parser": parser, "failed": False}
            for parser in ("docling", "pymupdf")
        }

        summary = compute_parser_disagreement(conflict_report, healthy_parsers)

        self.assertEqual(summary["candidate_count"], 2)
        self.assertEqual(summary["parser_pair_count"], 1)
        self.assertGreater(summary["conflict_count"], 0)
        self.assertGreater(summary["disagreement_rate"], 0.0)
        self.assertIn("text_coverage_gap", summary["disagreement_by_type"])
        self.assertIn("docling|pymupdf", summary["disagreement_by_parser_pair"])

    def test_disagreement_rate_zero_when_single_parser(self):
        """A lone parser has no pairs, so the rate is reported as zero."""
        no_conflicts = {"conflicts": []}
        sole_parser = {"docling": {"parser": "docling", "failed": False}}

        summary = compute_parser_disagreement(no_conflicts, sole_parser)

        self.assertEqual(summary["candidate_count"], 1)
        self.assertEqual(summary["parser_pair_count"], 0)
        self.assertEqual(summary["disagreement_rate"], 0.0)

    def test_failed_parsers_excluded_from_pair_count(self):
        """A parser flagged as failed is not counted as a candidate or in pairs."""
        no_conflicts = {"conflicts": []}
        mixed_outcomes = {
            "docling": {"parser": "docling", "failed": False},
            "marker": {"parser": "marker", "failed": True, "error": "boom"},
            "pymupdf": {"parser": "pymupdf", "failed": False},
        }

        summary = compute_parser_disagreement(no_conflicts, mixed_outcomes)

        self.assertEqual(summary["candidate_count"], 2)
        self.assertEqual(summary["parser_pair_count"], 1)
class TestRepairSuccess(unittest.TestCase):
    """Checks on the summary dict returned by ``compute_repair_success``."""

    def test_resolution_rate_when_blocking_issue_resolved(self):
        """One blocking issue before and none after counts as fully resolved."""
        table_issue = {
            "issue_type": "invalid_table",
            "blocking": True,
            "page_num": 1,
            "region_id": "t1",
        }
        before = {"score": 0.5, "issues": [table_issue]}
        after = {"score": 0.9, "issues": []}
        iterations = [
            {
                "iteration": 1,
                "before_score": 0.5,
                "after_score": 0.9,
                "actions": [{"action": "repair_table"}],
            }
        ]

        outcome = compute_repair_success(before, after, iterations)

        self.assertEqual(outcome["pre_repair_blocking_count"], 1)
        self.assertEqual(outcome["post_repair_blocking_count"], 0)
        self.assertEqual(outcome["resolved_blocking_count"], 1)
        self.assertEqual(outcome["repair_resolution_rate"], 1.0)
        self.assertEqual(outcome["repair_regression_rate"], 0.0)
        self.assertEqual(outcome["iteration_count"], 1)
        self.assertAlmostEqual(outcome["score_delta"], 0.4, places=6)

    def test_regression_rate_counts_new_blocking_issues(self):
        """A blocking issue that appears only after repair counts as a regression."""
        lingering_issue = {
            "issue_type": "invalid_table",
            "blocking": True,
            "region_id": "t1",
        }
        new_issue = {
            "issue_type": "missing_text_coverage",
            "blocking": True,
            "page_num": 2,
        }
        before = {"score": 0.7, "issues": [lingering_issue]}
        # Copy so the pre and post reports hold distinct dict objects.
        after = {"score": 0.6, "issues": [dict(lingering_issue), new_issue]}
        iterations = [
            {"iteration": 1, "before_score": 0.7, "after_score": 0.6, "actions": []}
        ]

        outcome = compute_repair_success(before, after, iterations)

        self.assertEqual(outcome["resolved_blocking_count"], 0)
        self.assertEqual(outcome["regressed_blocking_count"], 1)
        self.assertEqual(outcome["repair_regression_rate"], 1.0)
        self.assertEqual(outcome["repair_resolution_rate"], 0.0)

    def test_vacuous_success_when_no_pre_repair_blocking_issues(self):
        """With no issues at all, resolution is 1.0 and regression is 0.0."""
        clean_before = {"score": 1.0, "issues": []}
        clean_after = {"score": 1.0, "issues": []}

        outcome = compute_repair_success(clean_before, clean_after, [])

        self.assertEqual(outcome["repair_resolution_rate"], 1.0)
        self.assertEqual(outcome["repair_regression_rate"], 0.0)
        self.assertEqual(outcome["iteration_count"], 0)
class TestRepairSuccessIntegration(unittest.TestCase):
    """End-to-end check that ``parse_document`` reports repair and disagreement metrics."""

    def test_pipeline_records_resolution_for_iterative_table_repair(self):
        """Parse markdown whose table has a ragged data row and inspect the metrics."""
        # The data row has three cells under a two-column header.
        malformed_markdown = "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n"

        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            source_file = root / "report.md"
            source_file.write_text(malformed_markdown, encoding="utf-8")

            document = parse_document(source_file, root / "out")

            reported = document.quality_report.metrics
            for metric_name in (
                "repair_resolution_rate",
                "repair_regression_rate",
                "parser_disagreement_rate",
            ):
                self.assertIn(metric_name, reported)

            repair = document.provenance["repair_success"]
            self.assertGreaterEqual(repair["pre_repair_issue_count"], 1)
            self.assertGreaterEqual(repair["resolved_any_count"], 1)
            self.assertGreaterEqual(repair["repair_resolution_rate_any"], 0.0)
            self.assertGreater(repair["iteration_count"], 0)
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()