Spaces:
Sleeping
Sleeping
File size: 5,560 Bytes
"""Filter on span dictionaries

This module contains the internal representation of heading filters, which are
used to test if a span should be included in the ToC.
"""
import re
from typing import Optional
from re import Pattern
# Default tolerance for float comparisons; used as the fallback when a filter
# dict does not specify 'size_tolerance' (font) or 'tolerance' (bbox).
DEF_TOLERANCE: float = 1e-5
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Decide whether a measured float satisfies a filter's expectation.

    Arguments
      expect: the value required by the filter, or None for "no constraint"
      actual: the value found on the span, or None if the span has none
      tolerance: largest absolute difference still counted as a match
    Returns
      True if `expect` is unset, or if `actual` is present and within
      `tolerance` of `expect`; False otherwise
    """
    # an unset expectation admits everything, even a missing actual value
    if expect is None:
        return True
    # a constraint exists but there is nothing to compare it against
    if actual is None:
        return False
    return abs(expect - actual) <= tolerance
class FontFilter:
    """Filter on the font attributes of a span (name, size, color, flags)

    Every criterion is read from the configuration dict with a default, so a
    key that is absent from the dict does not constrain the span.
    """
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    # required value of each style bit (0 for bits that are unset)
    flags: int
    # Each style bit has three states: required on (1), required off (0) and
    # unset (x). An unset bit must never cause a mismatch:
    #   filter span  mismatch?
    #     0     0      no
    #     0     1      yes
    #     1     0      yes
    #     1     1      no
    #     x     0      no
    #     x     1      no
    # Instead of testing bit by bit, the comparison is done on whole integers
    # in two operations:
    #   1. `span ^ filter` marks every bit that differs; unset bits are stored
    #      as 0 in `flags`, so they may produce spurious 1s here.
    #   2. `& ign_mask` keeps only the bits that were configured (mask bit 1)
    #      and clears the spurious ones coming from unset bits (mask bit 0).
    # The span matches when nothing is left after step 2.
    ign_mask: int

    def __init__(self, font_dict: dict):
        """Build the filter from a font configuration dict

        Argument
          font_dict: may contain 'name' (regex), 'size', 'size_tolerance',
                     'color' and the boolean style keys 'superscript',
                     'italic', 'serif', 'monospace' and 'bold'
        """
        lookup = font_dict.get
        self.name = re.compile(lookup('name', ""))
        self.size = lookup('size')
        self.size_tolerance = lookup('size_tolerance', DEF_TOLERANCE)
        self.color = lookup('color')

        # bit position of every style key
        style_bits = (
            (0b00001, 'superscript'),
            (0b00010, 'italic'),
            (0b00100, 'serif'),
            (0b01000, 'monospace'),
            (0b10000, 'bold'),
        )
        wanted = 0
        configured = 0
        for bit, key in style_bits:
            # bit * True == bit, bit * False == 0
            wanted |= bit * lookup(key, False)
            # a bit takes part in the comparison only if its key is present
            configured |= bit * (key in font_dict)
        self.flags = wanted
        self.ign_mask = configured

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current font attribute
        """
        if self.name.search(spn.get('font', "")) is None:
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # a span without flags gets the complement of the required flags, so
        # every configured bit counts as different
        span_flags = spn.get('flags', ~self.flags)
        # differing bits, restricted to the ones that were configured
        mismatch = (span_flags ^ self.flags) & self.ign_mask
        return not mismatch
class BoundingBoxFilter:
    """Filter on bounding boxes

    Each edge is optional: an edge that is missing from the configuration is
    stored as None and does not constrain the span.
    """
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # allowed absolute deviation, shared by all four edges
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """Build the filter from a bounding box configuration dict

        Argument
          bbox_dict: may contain 'left', 'top', 'right', 'bottom' and
                     'tolerance'; 'tolerance' falls back to DEF_TOLERANCE
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current bounding box setting
        """
        # the span's bbox is indexed as (left, top, right, bottom); a span
        # without a bbox can only pass edges that are not configured
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
class ToCFilter:
    """Filter on span dictionary to pick out headings in the ToC

    Combines a font filter and a bounding box filter; a span is admitted only
    if both of them admit it.
    """
    # heading level assigned by this filter, always >= 1
    level: int
    # When set, the filter is *greedy*: all the text in a block is meant to be
    # extracted once at least one span matches. The flag is only stored here;
    # it is not used by `admits`.
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """Build the filter from a filter configuration dict

        Argument
          fltr_dict: must contain 'level' (>= 1); may contain 'greedy',
                     'font' and 'bbox'
        Raises
          ValueError if 'level' is missing or smaller than 1
        """
        level = fltr_dict.get('level')
        # validate the level before anything else is constructed
        if level is None:
            raise ValueError("filter's 'level' is not set")
        if level < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = level
        self.greedy = fltr_dict.get('greedy', False)
        # missing sub-dicts become empty filters
        font_cfg = fltr_dict.get('font', {})
        bbox_cfg = fltr_dict.get('bbox', {})
        self.font = FontFilter(font_cfg)
        self.bbox = BoundingBoxFilter(bbox_cfg)

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span

        Arguments
          spn: the span dict to be checked
        Returns
          False if the span doesn't match the filter
        """
        # the bounding box is only consulted when the font already matches
        if not self.font.admits(spn):
            return False
        return self.bbox.admits(spn)
|