Spaces:
Running on Zero
Running on Zero
File size: 2,464 Bytes
import unittest
from unittest.mock import patch
from zsgdp.config import load_config
from zsgdp.normalize.normalize_unstructured import normalize_unstructured_parts
from zsgdp.parsers.external import MinerUParser, OlmOCRParser, PaddleOCRParser
from zsgdp.schema import DocumentProfile, PageProfile
class ExternalParserAdapterTests(unittest.TestCase):
    """Tests for the external parser adapters and the unstructured normalizer."""

    def test_command_backed_parsers_normalize_markdown(self):
        """Check each command-backed parser against stubbed Markdown output.

        ``available`` is patched to return ``True`` and
        ``run_external_parser_to_markdown`` is patched to return a fixed
        Markdown document (one heading, one table), so no external tool is
        invoked by the adapters under test.
        """
        cases = [
            (MinerUParser, "mineru"),
            (OlmOCRParser, "olmocr"),
            (PaddleOCRParser, "paddleocr"),
        ]
        profile = _profile()

        for parser_class, parser_name in cases:
            # Both the parse call and its assertions live inside the sub-test:
            # a failure is then reported against the parser that caused it,
            # the remaining parsers still run, and a parse() error can never
            # leave the assertions checking a previous iteration's candidate.
            with self.subTest(parser=parser_name):
                with patch.object(parser_class, "available", return_value=True), patch(
                    "zsgdp.parsers.external.run_external_parser_to_markdown",
                    return_value="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
                ):
                    candidate = parser_class().parse("sample.pdf", profile, load_config())

                # The patches are only needed while parsing; the checks below
                # inspect the returned candidate.
                self.assertEqual(candidate.parser_name, parser_name)
                self.assertEqual(candidate.elements[0].source_parser, parser_name)
                self.assertEqual(len(candidate.tables), 1)
                # NOTE(review): presumably [1] because _profile() describes a
                # single page numbered 1 -- confirm against the adapter.
                self.assertEqual(candidate.provenance["requested_pages"], [1])

    def test_unstructured_normalizer_preserves_page_and_title_metadata(self):
        """Check that a title part keeps its page number and becomes a heading.

        The parts are local stand-in objects exposing ``category``,
        ``metadata.page_number`` and ``__str__``; only the first resulting
        element (the title) is inspected.
        """

        class Metadata:
            page_number = 2

        class Title:
            category = "Title"
            metadata = Metadata()

            def __str__(self):
                return "Executive Summary"

        class Narrative:
            category = "NarrativeText"
            metadata = Metadata()

            def __str__(self):
                return "The document parser keeps provenance."

        candidate = normalize_unstructured_parts(
            parts=[Title(), Narrative()],
            profile=_profile(),
            source_path="sample.pdf",
        )

        self.assertEqual(candidate.parser_name, "unstructured")
        self.assertEqual(candidate.elements[0].page_num, 2)
        self.assertEqual(candidate.elements[0].type, "title")
        self.assertEqual(candidate.elements[0].markdown, "# Executive Summary")
def _profile():
    """Build the minimal one-page PDF ``DocumentProfile`` used as a test fixture."""
    sample_path = "sample.pdf"
    # A single page, numbered 1, reporting 20 characters of digital text.
    page_profiles = [PageProfile(page_num=1, digital_text_chars=20)]
    return DocumentProfile(
        doc_id="d1",
        source_path=sample_path,
        file_type="pdf",
        page_count=len(page_profiles),
        extension=".pdf",
        pages=page_profiles,
    )
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|