Spaces:
Running on Zero
Running on Zero
| import unittest | |
| from zsgdp.schema import DocumentProfile, Element, FigureObject, PageProfile, ParsedDocument | |
| from zsgdp.verify import verify_parse | |
| from zsgdp.verify.table_quality import markdown_table_is_valid | |
class VerifyTests(unittest.TestCase):
    """Checks for ``verify_parse`` and ``markdown_table_is_valid`` on small hand-built inputs."""

    def test_verify_simple_document(self):
        """A single-paragraph text document yields score 1.0 and one counted element."""
        page = PageProfile(page_num=1, digital_text_chars=11, digital_text_quality=1.0)
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.txt",
            file_type="text",
            page_count=1,
            extension=".txt",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="sample.txt", file_type="text")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="hello world",
            reading_order=1,
            confidence=0.9,
        )
        document.elements.append(paragraph)

        # Store the report on the document first, then read it back from there.
        document.quality_report = verify_parse(doc_profile, document)
        stored_report = document.quality_report

        self.assertEqual(stored_report.score, 1.0)
        self.assertEqual(stored_report.metrics["element_count"], 1)

    def test_markdown_table_requires_data_row(self):
        """A header plus separator alone is rejected; adding one data row is accepted."""
        header_only = "| A | B |\n| --- | --- |"
        with_data_row = "| A | B |\n| --- | --- |\n| 1 | 2 |"

        self.assertFalse(markdown_table_is_valid(header_only))
        self.assertTrue(markdown_table_is_valid(with_data_row))

    def test_verify_flags_missing_figure_context(self):
        """A figure given only an id and page number triggers region and caption issues."""
        page = PageProfile(page_num=1, digital_text_chars=12, digital_text_quality=1.0)
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="sample.pdf", file_type="pdf")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="hello world!",
            reading_order=1,
        )
        document.elements.append(paragraph)
        bare_figure = FigureObject(figure_id="f1", page_num=1)
        document.figures.append(bare_figure)

        outcome = verify_parse(doc_profile, document)
        reported_types = [found.issue_type for found in outcome.issues]

        self.assertIn("missing_figure_region", reported_types)
        self.assertIn("missing_figure_caption", reported_types)
        self.assertEqual(outcome.metrics["figure_description_coverage"], 0.0)

    def test_verify_flags_formula_heavy_page_without_formula_elements(self):
        """A page profiled with formula_density=0.30 but parsed as plain text is flagged."""
        page = PageProfile(
            page_num=1,
            digital_text_chars=24,
            digital_text_quality=1.0,
            formula_density=0.30,
        )
        doc_profile = DocumentProfile(
            doc_id="d1",
            source_path="math.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[page],
        )
        document = ParsedDocument(doc_id="d1", source_path="math.pdf", file_type="pdf")
        paragraph = Element(
            element_id="e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text="Equation heavy page text",
            reading_order=1,
        )
        document.elements.append(paragraph)

        outcome = verify_parse(doc_profile, document)
        reported_types = [found.issue_type for found in outcome.issues]

        self.assertIn("missing_formula_regions", reported_types)
        self.assertEqual(outcome.metrics["formula_page_coverage"], 0.0)
# Run the suite only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    unittest.main()