File size: 5,590 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os

import fitz
import toml
from mamba import description, it, before, after

from pdfxmeta import extract_meta, dump_meta, dump_toml

dirpath = os.path.dirname(os.path.abspath(__file__))

# Specs for extract_meta, run against the fixture PDF files/level2.pdf.
# Call shape used below: extract_meta(doc, needle[, page[, ignore_case]]).
with description("extract_meta:") as self:
    with before.all:
        # Open the fixture once; every example in this group reads self.doc.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))

    with after.all:
        # Close the document opened in before.all so the file handle is not
        # left open once this group has finished.
        self.doc.close()

    with it("extracts metadata from pdf"):
        # Exact-case needle on page 1 yields exactly one match.
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        # Substring check: the reported font name may carry extra characters
        # around 'CMBX12'.
        assert 'CMBX12' in m['font']

    with it("matches lowercase when ignore case is set"):
        # Fourth positional argument True enables case-insensitive matching.
        meta = extract_meta(self.doc, "section one", 1, True)
        assert len(meta) == 1

        m = meta[0]
        # The returned text keeps the document's casing, not the needle's.
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches mixed case when ignore case is set"):
        meta = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches nothing if ignore case is not set"):
        # Same lowercase needle, but with case sensitivity left on.
        meta = extract_meta(self.doc, "section one", 1, False)
        assert len(meta) == 0

    with it("can match multiple instances of needle"):
        # "Section" is a prefix of two headings on page 1; results are
        # expected in the order "Section One", "Section Two".
        meta = extract_meta(self.doc, "Section", 1)
        assert len(meta) == 2

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

        m = meta[1]
        assert m['text'] == "Section Two"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("returns [] when nothing is matched"):
        # Deliberately misspelled needle.
        meta = extract_meta(self.doc, "Sectoin", 1, False)
        assert len(meta) == 0

    with it("returns [] when page number is out of range"):
        # Page 0 is treated as out of range (page numbers start at 1 here).
        meta = extract_meta(self.doc, "Section One", 0)
        assert len(meta) == 0

        # Page 7 is expected to be past the end of the fixture
        # (presumably it has fewer than 7 pages -- confirm against the PDF).
        meta = extract_meta(self.doc, "Section One", 7)
        assert len(meta) == 0

    with it("can match text on any page when page number is not specified"):
        # No page argument: the needle is searched for without a page filter.
        meta = extract_meta(self.doc, "The End")
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "The End"
        assert 'font' in m
        assert 'CMBX12' in m['font']

# Specs for dump_meta: its output for a single match must parse as TOML and
# equal the expected font/bbox tables below.
with description("dump_meta:") as self:
    with before.all:
        # Open the fixture once for this group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected parse result for the "Section One" heading on page 1.
        # Floats are compared exactly, so these values are tied to what the
        # PDF library reports for this fixture.
        self.expected_meta = {
            'font': {
                'name': 'CMBX12',
                'size': 14.346199989318848,
                'color': 0x000000,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 237.6484375,
                'right': 243.12905883789062,
                'bottom': 252.00897216796875
            }
        }

    with after.all:
        # Close the document opened in before.all so the file handle is not
        # left open once this group has finished.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        # toml.loads raises on malformed input, so a successful parse plus
        # the equality check covers both validity and content.
        meta_dict = toml.loads(dump_meta(meta[0]))
        assert meta_dict == self.expected_meta


# Specs for dump_toml: its output must parse as TOML into a one-entry
# 'heading' list carrying level, greedy, and the font name/size.
with description("dump_toml:") as self:
    with before.all:
        # Open the fixture once for this group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected parse result for the "Section One" heading when
        # dump_toml is called with 1 as its second argument.
        self.expected_recipe = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'CMBX12',
                        'size': 14.346199989318848,
                    }
                }
            ]
        }

    with after.all:
        # Close the document opened in before.all so the file handle is not
        # left open once this group has finished.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        meta_dict = toml.loads(dump_toml(meta[0], 1))
        assert meta_dict == self.expected_recipe

    with it("strips font subset correctly"):
        # Hand-built metadata dicts (no PDF involved) that differ only in
        # the 'font' string.
        with_subset = {
            'font': "subset+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        without_subset = {
            'font': "font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        # Both inputs above must produce the same font name, 'font'.
        expected = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font',
                        'size': 1
                    }
                }
            ]
        }

        # Two '+' characters: only the part up to the first '+' is expected
        # to be removed, leaving 'font+font'.
        double_plus = {
            'font': "subset+font+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        expected2 = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font+font',
                        'size': 1
                    }
                }
            ]
        }

        assert toml.loads(dump_toml(with_subset, 1)) == expected
        assert toml.loads(dump_toml(without_subset, 1)) == expected
        assert toml.loads(dump_toml(double_plus, 1)) == expected2