# adelevett's picture
# Upload 76 files
# 046e3b8 verified
"""Filter on span dictionaries
This module contains the internal representation of heading filters, which are
used to test if a span should be included in the ToC.
"""
import re
from typing import Optional
from re import Pattern
# Default absolute tolerance for float comparisons; the filters in this
# module fall back to it when their config dict gives no tolerance.
DEF_TOLERANCE: float = 1e-5
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Decide whether a measured float satisfies a filter's expectation

    Arguments
      expect: the value the filter asks for, None meaning "no constraint"
      actual: the value found on the span, None meaning "not available"
      tolerance: largest absolute difference still counted as a match
    Returns
      True if there is no expectation, or if the actual value is present
      and lies within the tolerance of the expected one
    """
    # nothing is expected, so anything goes
    if expect is None:
        return True
    # something is expected but there is nothing to compare it with
    if actual is None:
        return False
    return abs(expect - actual) <= tolerance
class FontFilter:
    """Match spans by their font properties

    The filter is built from a plain dict that may carry the keys 'name',
    'size', 'size_tolerance', 'color', 'superscript', 'italic', 'serif',
    'monospace' and 'bold'. A key that is absent places no constraint on
    the span.
    """
    # compiled regex that is searched in the span's 'font' value
    name: Pattern
    # expected font size, None when the size is unconstrained
    size: Optional[float]
    # largest absolute size difference still counted as a match
    size_tolerance: float
    # expected color, None when the color is unconstrained
    color: Optional[int]
    # expected state of every style bit; unspecified styles are stored as 0
    flags: int
    # Every style is tri-state in the filter: required on (1), required off
    # (0), or unspecified (x). A filter bit a conflicts with a span bit b
    # as follows
    #   a b conflict?
    #   0 0 0
    #   0 1 1
    #   1 0 1
    #   1 1 0
    #   x 0 0
    #   x 1 0
    # Instead of testing the styles one by one, the whole comparison is
    # done with two bitwise operations.
    # 1. XOR the span's flags with `flags` to mark the bits that differ.
    #    An unspecified style is stored as 0 in `flags`, so it may get
    #    marked by mistake here.
    # 2. AND the result with `ign_mask`, which holds 1 for every specified
    #    style and 0 for every unspecified one, to wipe those mistakes.
    #   marked mask marked&mask
    #   0      1    0 <- specified, same
    #   0      0    0 <- unspecified
    #   1      1    1 <- specified, different
    #   1      0    0 <- unspecified, mark wiped
    # The span passes when no bit survives both steps.
    ign_mask: int

    def __init__(self, font_dict: dict):
        # bit assigned to each style key, lowest bit first
        style_bits = (
            (0b00001, 'superscript'),
            (0b00010, 'italic'),
            (0b00100, 'serif'),
            (0b01000, 'monospace'),
            (0b10000, 'bold'),
        )

        pattern = font_dict.get('name', "")
        self.name = re.compile(pattern)
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')

        # multiplying by a bool keeps the bit (True) or drops it (False),
        # which avoids a conditional per style
        wanted = 0
        specified = 0
        for bit, key in style_bits:
            wanted |= bit * font_dict.get(key, False)
            specified |= bit * (key in font_dict)
        self.flags = wanted
        self.ign_mask = specified

    def admits(self, spn: dict) -> bool:
        """Test a span against the font constraints

        Argument
          spn: the span dict to be tested
        Returns
          True only if the font name, color, size and style flags of the
          span all satisfy the filter
        """
        font_name = spn.get('font', "")
        if self.name.search(font_name) is None:
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        size_ok = admits_float(self.size, spn.get('size'),
                               self.size_tolerance)
        if not size_ok:
            return False

        # a span without flags gets the complement of the expected flags,
        # so every specified style counts as a mismatch
        span_flags = spn.get('flags', ~self.flags)
        # differing bits, restricted to the styles the filter specifies
        mismatch = (span_flags ^ self.flags) & self.ign_mask
        return not mismatch
class BoundingBoxFilter:
    """Filter on bounding boxes

    The filter is built from a dict with the optional keys 'left', 'top',
    'right', 'bottom' and 'tolerance'. An edge whose key is absent is not
    constrained.
    """
    # expected coordinate of each edge, None when the edge is unconstrained
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # largest absolute difference allowed between an expected and an actual
    # edge coordinate
    tolerance: float

    def __init__(self, bbox_dict: dict):
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admits the span

        Argument
          spn: the span dict to be checked; its 'bbox' value is read by
               index as (left, top, right, bottom)
        Returns
          False if the span doesn't match current bounding box setting
        """
        # a span without a bbox can only pass when no edge is constrained,
        # since a missing coordinate never satisfies an expected one
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
class ToCFilter:
    """Pick out ToC headings by combining a font and a bounding box filter"""
    # heading level assigned by this filter, always >= 1
    level: int
    # when set, the filter is *greedy*: all the text in a block is
    # extracted as soon as at least one match occurs in it
    greedy: bool
    # constraints on the font attributes of a span
    font: FontFilter
    # constraints on the bounding box of a span
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        # the level is the only mandatory key, validate it before anything
        # else is stored
        level = fltr_dict.get('level')
        if level is None:
            raise ValueError("filter's 'level' is not set")
        if level < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = level
        self.greedy = fltr_dict.get('greedy', False)
        # missing sections become empty dicts, i.e. unconstrained filters
        font_spec = fltr_dict.get('font', {})
        self.font = FontFilter(font_spec)
        bbox_spec = fltr_dict.get('bbox', {})
        self.bbox = BoundingBoxFilter(bbox_spec)

    def admits(self, spn: dict) -> bool:
        """Test a span against both the font and the bounding box filter

        Arguments
          spn: the span dict to be tested
        Returns
          True only if both filters admit the span
        """
        # the bounding box is only consulted once the font has passed
        if not self.font.admits(spn):
            return False
        return self.bbox.admits(spn)