Spaces: Running on Zero
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.export import export_parsed_document | |
| from zsgdp.merge.conflict_detection import build_candidate_conflict_report, detect_candidate_conflicts | |
| from zsgdp.merge.merge_candidates import merge_candidates | |
| from zsgdp.schema import DocumentProfile, Element, PageProfile, ParseCandidate, TableObject | |
class ConflictDetectionTests(unittest.TestCase):
    """Tests for parser-candidate conflict detection and for the conflict report produced by merging."""

    @staticmethod
    def _disagreeing_candidates():
        """Return two candidates with reversed paragraph order and different table_columns (3 vs 2)."""
        forward = _candidate("docling", ["Alpha", "Beta", "Gamma"], 3)
        backward = _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2)
        return [forward, backward]

    def test_conflict_report_flags_reading_order_and_table_structure(self):
        """Both disagreement types appear in the report and every issue is a parser disagreement."""
        pair = self._disagreeing_candidates()

        report = build_candidate_conflict_report(pair)
        issues = detect_candidate_conflicts(pair)

        seen_types = set()
        for entry in report["conflicts"]:
            seen_types.add(entry["type"])

        self.assertIn("reading_order_disagreement", seen_types)
        self.assertIn("table_structure_disagreement", seen_types)
        # A non-empty issue list is required; otherwise the all() check below would pass vacuously.
        self.assertTrue(issues)
        every_issue_is_disagreement = all(
            found.issue_type == "parser_disagreement" for found in issues
        )
        self.assertTrue(every_issue_is_disagreement)

    def test_merge_stores_and_exports_conflict_report(self):
        """Merging keeps the conflict report in provenance and exporting writes conflict_report.json."""
        single_page = PageProfile(page_num=1, digital_text_chars=30)
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[single_page],
        )

        parsed = merge_candidates(self._disagreeing_candidates(), profile)

        with tempfile.TemporaryDirectory() as scratch:
            output_dir = Path(scratch) / "out"
            export_parsed_document(parsed, output_dir)
            report_file = output_dir / "conflict_report.json"
            # Must be checked while the temporary directory still exists.
            self.assertTrue(report_file.exists())

        self.assertIn("conflict_report", parsed.provenance)
        stored_report = parsed.provenance["conflict_report"]
        self.assertGreater(stored_report["conflict_count"], 0)
def _candidate(parser_name: str, ordered_text: list[str], table_columns: int) -> ParseCandidate:
    """Build a one-page ParseCandidate fixture attributed to ``parser_name``.

    Args:
        parser_name: Used as the candidate's parser name, as ``source_parser`` on
            every element/table/page entry, and as the prefix of generated ids.
        ordered_text: Paragraph texts; each becomes an Element whose
            ``reading_order`` is its 1-based position in this list.
        table_columns: Forwarded to ``_table_markdown`` to pick the table fixture.

    Returns:
        A ParseCandidate for document "d1" with the paragraphs and a single table.
    """
    paragraphs = []
    for position, body in enumerate(ordered_text, 1):
        paragraph = Element(
            element_id=f"{parser_name}_e{position}",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text=body,
            reading_order=position,
            confidence=0.8,
            source_parser=parser_name,
        )
        paragraphs.append(paragraph)

    page_entry = {"page_num": 1, "source_parser": parser_name}

    only_table = TableObject(
        table_id=f"{parser_name}_t1",
        page_nums=[1],
        markdown=_table_markdown(table_columns),
        confidence=0.8,
        source_parser=parser_name,
    )

    return ParseCandidate(
        parser_name=parser_name,
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        pages=[page_entry],
        elements=paragraphs,
        tables=[only_table],
        confidence=0.8,
    )
| def _table_markdown(columns: int) -> str: | |
| if columns == 3: | |
| return "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |" | |
| return "| Region | Q1 |\n| --- | --- |\n| NA | 10 |" | |
# Run this module's tests via unittest when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()