File size: 1,311 Bytes
db06ffa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import unittest

from zsgdp.parsers.docling_parser import _export_markdown, normalize_docling_markdown
from zsgdp.schema import DocumentProfile, PageProfile


class FakeDoclingDocument:
    """Test double that exposes only an ``export_to_markdown`` method.

    The method always returns the same markdown: a level-one heading
    followed by a pipe table with one header row and one data row.
    """

    def export_to_markdown(self):
        # Fixed payload; the tests in this module compare against this exact text.
        canned_markdown = "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"
        return canned_markdown


class DoclingParserTests(unittest.TestCase):
    """Unit tests for ``_export_markdown`` and ``normalize_docling_markdown``."""

    def test_export_markdown_uses_docling_method(self):
        # The helper is expected to hand back exactly what the fake document's
        # ``export_to_markdown`` returns.
        exported = _export_markdown(FakeDoclingDocument())
        expected = "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"
        self.assertEqual(exported, expected)

    def test_normalize_docling_markdown_emits_schema(self):
        # Single-page PDF profile used as context for normalization.
        only_page = PageProfile(page_num=1, digital_text_chars=20)
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[only_page],
        )

        # Input is a heading followed by a small pipe table.
        result = normalize_docling_markdown(
            markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
            profile=profile,
            source_path="sample.pdf",
        )

        # The result must be attributed to the "docling" parser.
        self.assertEqual(result.parser_name, "docling")
        # Two elements and one table are expected for this input
        # (presumably heading + table -- confirm against the normalizer).
        self.assertEqual(len(result.elements), 2)
        self.assertEqual(len(result.tables), 1)
        # The first page record must carry the same parser attribution.
        first_page = result.pages[0]
        self.assertEqual(first_page["source_parser"], "docling")


# Run this module's tests via unittest's command-line entry point when the
# file is executed directly as a script (not when it is imported).
if __name__ == "__main__":
    unittest.main()