"""Tests for parser-candidate conflict detection and conflict-report export."""

import tempfile
import unittest
from pathlib import Path

from zsgdp.export import export_parsed_document
from zsgdp.merge.conflict_detection import (
    build_candidate_conflict_report,
    detect_candidate_conflicts,
)
from zsgdp.merge.merge_candidates import merge_candidates
from zsgdp.schema import DocumentProfile, Element, PageProfile, ParseCandidate, TableObject


class ConflictDetectionTests(unittest.TestCase):
    """Exercise conflict detection on two candidates built to disagree."""

    def test_conflict_report_flags_reading_order_and_table_structure(self):
        """Both disagreement types are reported and surfaced as issues."""
        candidates = _conflicting_candidates()

        report = build_candidate_conflict_report(candidates)
        issues = detect_candidate_conflicts(candidates)

        conflict_types = {conflict["type"] for conflict in report["conflicts"]}
        self.assertIn("reading_order_disagreement", conflict_types)
        self.assertIn("table_structure_disagreement", conflict_types)
        self.assertTrue(issues)
        self.assertTrue(all(issue.issue_type == "parser_disagreement" for issue in issues))

    def test_merge_stores_and_exports_conflict_report(self):
        """Merging records the report in provenance and export writes it to disk."""
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=30)],
        )

        parsed = merge_candidates(_conflicting_candidates(), profile)

        with tempfile.TemporaryDirectory() as tmp:
            output_dir = Path(tmp) / "out"
            export_parsed_document(parsed, output_dir)

            # The file check must happen before the temporary directory is removed.
            self.assertTrue((output_dir / "conflict_report.json").exists())
            self.assertIn("conflict_report", parsed.provenance)
            self.assertGreater(parsed.provenance["conflict_report"]["conflict_count"], 0)


def _conflicting_candidates() -> list[ParseCandidate]:
    """Return two candidates that disagree on reading order and table width.

    The "docling" candidate lists its paragraphs as Alpha, Beta, Gamma with a
    three-column table; the "pymupdf" candidate lists them reversed with a
    two-column table.
    """
    return [
        _candidate("docling", ["Alpha", "Beta", "Gamma"], 3),
        _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2),
    ]


def _candidate(parser_name: str, ordered_text: list[str], table_columns: int) -> ParseCandidate:
    """Build a single-page candidate with paragraph elements and one table.

    Args:
        parser_name: Recorded as the source parser and used as the prefix of
            every element and table id.
        ordered_text: Paragraph texts; each gets a 1-based ``reading_order``
            matching its position in this list.
        table_columns: Column count passed to ``_table_markdown``.

    Returns:
        A ``ParseCandidate`` for document "d1" with confidence 0.8.
    """
    elements = [
        Element(
            element_id=f"{parser_name}_e{index}",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text=text,
            reading_order=index,
            confidence=0.8,
            source_parser=parser_name,
        )
        for index, text in enumerate(ordered_text, start=1)
    ]
    table = TableObject(
        table_id=f"{parser_name}_t1",
        page_nums=[1],
        markdown=_table_markdown(table_columns),
        confidence=0.8,
        source_parser=parser_name,
    )
    return ParseCandidate(
        parser_name=parser_name,
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        pages=[{"page_num": 1, "source_parser": parser_name}],
        elements=elements,
        tables=[table],
        confidence=0.8,
    )


def _table_markdown(columns: int) -> str:
    """Return a one-row markdown table with ``columns`` columns.

    The first column is "Region" / "NA"; each further column is a quarter
    ("Q1", "Q2", ...) whose value starts at 10 and grows by 2.

    Args:
        columns: Requested column count. Values below 2 produce the
            two-column table, which is what this helper has always returned
            for them.

    Returns:
        Header, separator and data rows joined by newlines.
    """
    # Clamp so inputs below two keep the historical two-column fallback.
    width = max(columns, 2)
    quarters = range(1, width)
    header = ["Region", *(f"Q{quarter}" for quarter in quarters)]
    values = ["NA", *(str(8 + 2 * quarter) for quarter in quarters)]
    rows = (header, ["---"] * width, values)
    return "\n".join("| " + " | ".join(row) + " |" for row in rows)


if __name__ == "__main__":
    unittest.main()