"""Unit tests for parse verification and markdown table validation."""

import unittest

from zsgdp.schema import (
    DocumentProfile,
    Element,
    FigureObject,
    PageProfile,
    ParsedDocument,
)
from zsgdp.verify import verify_parse
from zsgdp.verify.table_quality import markdown_table_is_valid


class VerifyTests(unittest.TestCase):
    """Exercise ``verify_parse`` and ``markdown_table_is_valid`` on tiny fixtures."""

    def test_verify_simple_document(self):
        """A one-paragraph text document scores 1.0 and reports one element."""
        page = PageProfile(page_num=1, digital_text_chars=11, digital_text_quality=1.0)
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.txt",
            file_type="text",
            page_count=1,
            extension=".txt",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="sample.txt", file_type="text")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="hello world",
            reading_order=1,
            confidence=0.9,
        )
        document.elements.append(paragraph)

        # The report is stored on the document and read back from there.
        document.quality_report = verify_parse(doc_profile, document)

        quality = document.quality_report
        self.assertEqual(quality.score, 1.0)
        self.assertEqual(quality.metrics["element_count"], 1)

    def test_markdown_table_requires_data_row(self):
        """Header plus separator alone is invalid; adding a data row makes it valid."""
        header_only = "| A | B |\n| --- | --- |"
        with_data_row = "| A | B |\n| --- | --- |\n| 1 | 2 |"

        self.assertFalse(markdown_table_is_valid(header_only))
        self.assertTrue(markdown_table_is_valid(with_data_row))

    def test_verify_flags_missing_figure_context(self):
        """A figure given only an id and page number is flagged for region and caption."""
        page = PageProfile(page_num=1, digital_text_chars=12, digital_text_quality=1.0)
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="sample.pdf", file_type="pdf")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="hello world!",
            reading_order=1,
        )
        document.elements.append(paragraph)
        bare_figure = FigureObject(figure_id="f1", page_num=1)
        document.figures.append(bare_figure)

        result = verify_parse(doc_profile, document)

        reported = [entry.issue_type for entry in result.issues]
        self.assertIn("missing_figure_region", reported)
        self.assertIn("missing_figure_caption", reported)
        self.assertEqual(result.metrics["figure_description_coverage"], 0.0)

    def test_verify_flags_formula_heavy_page_without_formula_elements(self):
        """A page profiled with formula_density=0.30 but only a paragraph is flagged."""
        page = PageProfile(
            page_num=1,
            digital_text_chars=24,
            digital_text_quality=1.0,
            formula_density=0.30,
        )
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="math.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="math.pdf", file_type="pdf")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Equation heavy page text",
            reading_order=1,
        )
        document.elements.append(paragraph)

        result = verify_parse(doc_profile, document)

        reported = [entry.issue_type for entry in result.issues]
        self.assertIn("missing_formula_regions", reported)
        self.assertEqual(result.metrics["formula_page_coverage"], 0.0)


if __name__ == "__main__":
    unittest.main()