File size: 5,560 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""Filter on span dictionaries



This module contains the internal representation of heading filters, which are

used to test if a span should be included in the ToC.

"""

import re

from typing import Optional
from re import Pattern

DEF_TOLERANCE: float = 1e-5


def admits_float(expect: Optional[float],

                 actual: Optional[float],

                 tolerance: float) -> bool:
    """Check if a float should be admitted by a filter"""
    return (expect is None) or \
           (actual is not None and abs(expect - actual) <= tolerance)


class FontFilter:
    """Filter on font attributes"""
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    flags: int
    # besides the usual true (1) and false (0), we have another state,
    # unset (x), where the truth table would be
    # a b diff?
    # 0 0 0
    # 0 1 1
    # 1 0 1
    # 1 1 0
    # x 0 0
    # x 1 0
    # it's very inefficient to compare bit by bit, which would take 5 bitwise
    # operations to compare, and then 4 to combine the results, we will use a
    # trick to reduce it to 2 ops.
    # step 1: use XOR to find different bits. if unset, set bit to 0, we will
    #         take care of false positives in the next step
    # a b a^b
    # 0 0 0
    # 0 1 1
    # 1 0 1
    # 1 1 0
    # step 2: use AND with a ignore mask, (0 for ignored) to eliminate false
    #         positives
    # a b a&b
    # 0 1 0           <- no diff
    # 0 0 0           <- no diff
    # 1 1 1           <- found difference
    # 1 0 0           <- ignored
    ign_mask: int

    def __init__(self, font_dict: dict):
        self.name = re.compile(font_dict.get('name', ""))
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')
        # some branchless trick, mainly to save space
        # x * True = x
        # x * False = 0
        self.flags = (0b00001 * font_dict.get('superscript', False) |
                      0b00010 * font_dict.get('italic', False) |
                      0b00100 * font_dict.get('serif', False) |
                      0b01000 * font_dict.get('monospace', False) |
                      0b10000 * font_dict.get('bold', False))

        self.ign_mask = (0b00001 * ('superscript' in font_dict) |
                         0b00010 * ('italic' in font_dict) |
                         0b00100 * ('serif' in font_dict) |
                         0b01000 * ('monospace' in font_dict) |
                         0b10000 * ('bold' in font_dict))

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span



        Argument

          spn: the span dict to be checked

        Returns

          False if the span doesn't match current font attribute

        """
        if not self.name.search(spn.get('font', "")):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        flags = spn.get('flags', ~self.flags)
        # see above for explanation
        return not (flags ^ self.flags) & self.ign_mask


class BoundingBoxFilter:
    """Filter on bounding boxes"""
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    tolernace: float

    def __init__(self, bbox_dict: dict):
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span



        Argument

          spn: the span dict to be checked

        Returns

          False if the span doesn't match current bounding box setting

        """
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))


class ToCFilter:
    """Filter on span dictionary to pick out headings in the ToC"""
    # The level of the title, strictly > 0
    level: int
    # When set, the filter will be more *greedy* and extract all the text in a
    # block even when at least one match occurs
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        lvl = fltr_dict.get('level')

        if lvl is None:
            raise ValueError("filter's 'level' is not set")
        if lvl < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = lvl
        self.greedy = fltr_dict.get('greedy', False)
        self.font = FontFilter(fltr_dict.get('font', {}))
        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span



        Arguments

          spn: the span dict to be checked

        Returns

          False if the span doesn't match the filter

        """
        return self.font.admits(spn) and self.bbox.admits(spn)