File size: 2,017 Bytes
db06ffa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import unittest

from zsgdp.normalize.markdown import markdown_to_blocks, normalize_markdown_candidate, normalize_markdown_table


class MarkdownNormalizerTests(unittest.TestCase):
    """Checks for the markdown normalization helpers.

    Exercises ``normalize_markdown_candidate``, ``normalize_markdown_table``
    and ``markdown_to_blocks`` with small inline markdown samples.
    """

    def test_markdown_to_blocks_preserves_pages_tables_and_images(self):
        # Two-page document: a pipe table on page 1 and, after the explicit
        # page marker comment, an image on page 2.
        markdown = """# Report

Intro paragraph.

| Region | Q1 |
| --- | --- |
| NA | 10 |

<!-- page:2 -->

## Figure Section

![Chart caption](chart.png)
"""

        candidate = normalize_markdown_candidate(
            markdown=markdown,
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            parser_name="test",
        )

        # Both pages must be reported, in order.
        page_numbers = [page["page_num"] for page in candidate.pages]
        self.assertEqual(page_numbers, [1, 2])

        # Exactly one table, attributed to the first page.
        table_count = len(candidate.tables)
        self.assertEqual(table_count, 1)
        first_table = candidate.tables[0]
        self.assertEqual(first_table.page_nums, [1])

        # Exactly one figure, attributed to the second page, keeping its path.
        figure_count = len(candidate.figures)
        self.assertEqual(figure_count, 1)
        first_figure = candidate.figures[0]
        self.assertEqual(first_figure.page_num, 2)
        self.assertEqual(first_figure.image_path, "chart.png")

    def test_normalize_markdown_table_repairs_separator(self):
        # NOTE(review): the input below already has a well-formed separator
        # row, so this only pins that such a table comes back unchanged.
        # Consider adding a malformed-separator case once the intended
        # repair behaviour of normalize_markdown_table is confirmed.
        source_table = "| A | B |\n| --- | --- |\n| 1 | 2 |"

        normalized = normalize_markdown_table(source_table)

        self.assertEqual(normalized, "| A | B |\n| --- | --- |\n| 1 | 2 |")

    def test_normalize_plain_aligned_table(self):
        # Whitespace-aligned columns (no pipes) should come back as a pipe
        # table with a generated separator row.
        aligned_text = "Region    Q1    Q2\nNorth America    10    12\nEurope    8    7"

        normalized = normalize_markdown_table(aligned_text)

        self.assertEqual(
            normalized,
            "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
        )

    def test_markdown_to_blocks_detects_plain_aligned_table(self):
        # The heading is expected at index 0, so the aligned table is the
        # second block.
        parsed_blocks = markdown_to_blocks(
            "# Report\n\nRegion    Q1    Q2\nNorth America    10    12\nEurope    8    7"
        )

        table_block = parsed_blocks[1]
        self.assertEqual(table_block.block_type, "table")

    def test_markdown_to_blocks_classifies_caption(self):
        # A lone "Figure N ..." line should be typed as a caption.
        parsed_blocks = markdown_to_blocks("Figure 1 Revenue trend")

        only_block = parsed_blocks[0]
        self.assertEqual(only_block.block_type, "caption")


# Run the tests via unittest's command-line runner when this module is
# executed directly as a script (rather than imported).
if __name__ == "__main__":
    unittest.main()