# zeroshotGPU/tests/test_conflict_detection.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
import tempfile
import unittest
from pathlib import Path
from zsgdp.export import export_parsed_document
from zsgdp.merge.conflict_detection import build_candidate_conflict_report, detect_candidate_conflicts
from zsgdp.merge.merge_candidates import merge_candidates
from zsgdp.schema import DocumentProfile, Element, PageProfile, ParseCandidate, TableObject
class ConflictDetectionTests(unittest.TestCase):
    """Checks conflict detection between parse candidates and its export."""

    @staticmethod
    def _conflicting_pair():
        """Return two candidates with reversed text order and different table widths."""
        return [
            _candidate("docling", ["Alpha", "Beta", "Gamma"], 3),
            _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2),
        ]

    def test_conflict_report_flags_reading_order_and_table_structure(self):
        """Both disagreement types are reported and every issue is a parser disagreement."""
        pair = self._conflicting_pair()

        conflict_report = build_candidate_conflict_report(pair)
        detected = detect_candidate_conflicts(pair)

        seen_types = {entry["type"] for entry in conflict_report["conflicts"]}
        self.assertIn("reading_order_disagreement", seen_types)
        self.assertIn("table_structure_disagreement", seen_types)

        # The issue list must be non-empty and uniformly typed.
        self.assertTrue(detected)
        only_disagreements = all(
            item.issue_type == "parser_disagreement" for item in detected
        )
        self.assertTrue(only_disagreements)

    def test_merge_stores_and_exports_conflict_report(self):
        """Merging records a conflict report in provenance and export writes it to disk."""
        single_page = PageProfile(page_num=1, digital_text_chars=30)
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[single_page],
        )
        merged = merge_candidates(self._conflicting_pair(), doc_profile)

        with tempfile.TemporaryDirectory() as scratch:
            export_root = Path(scratch) / "out"
            export_parsed_document(merged, export_root)
            # Must be checked before the temporary directory is removed.
            report_file = export_root / "conflict_report.json"
            self.assertTrue(report_file.exists())

        self.assertIn("conflict_report", merged.provenance)
        self.assertGreater(
            merged.provenance["conflict_report"]["conflict_count"], 0
        )
def _candidate(parser_name: str, ordered_text: list[str], table_columns: int) -> ParseCandidate:
    """Build a single-page ParseCandidate fixture for document "d1".

    One paragraph element is created per entry of ``ordered_text``, with
    reading order starting at 1, plus one table whose markdown comes from
    ``_table_markdown(table_columns)``. Every object is attributed to
    ``parser_name`` with a confidence of 0.8.
    """
    paragraphs = []
    position = 0
    for body in ordered_text:
        position += 1  # reading order is 1-based
        paragraphs.append(
            Element(
                element_id=f"{parser_name}_e{position}",
                doc_id="d1",
                page_num=1,
                type="paragraph",
                text=body,
                reading_order=position,
                confidence=0.8,
                source_parser=parser_name,
            )
        )

    table = TableObject(
        table_id=f"{parser_name}_t1",
        page_nums=[1],
        markdown=_table_markdown(table_columns),
        confidence=0.8,
        source_parser=parser_name,
    )

    return ParseCandidate(
        parser_name=parser_name,
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        pages=[{"page_num": 1, "source_parser": parser_name}],
        elements=paragraphs,
        tables=[table],
        confidence=0.8,
    )
def _table_markdown(columns: int) -> str:
if columns == 3:
return "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |"
return "| Region | Q1 |\n| --- | --- |\n| NA | 10 |"
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()