# Spaces: adelevett / pdf.tocgen.split (Sleeping)
# pdf.tocgen.split -- File size: 5,560 Bytes -- 046e3b8

"""Filter on span dictionaries



This module contains the internal representation of heading filters, which are
used to test if a span should be included in the ToC.
"""

import re

from typing import Optional
from re import Pattern

# Default absolute tolerance for float comparisons. It is used when a filter
# dict does not provide its own 'size_tolerance' (font filter) or 'tolerance'
# (bounding box filter).
DEF_TOLERANCE: float = 1e-5


def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Decide whether a filter's expected float admits an actual value

    Arguments
      expect: the value required by the filter, or None if unconstrained
      actual: the value found on the span, or None if it is missing
      tolerance: largest absolute difference still treated as a match
    Returns
      True if `expect` is None, or if `actual` is present and lies within
      `tolerance` of `expect`; False otherwise
    """
    if expect is None:
        # nothing is required, so anything (even a missing value) passes
        return True
    if actual is None:
        # a value is required but the span has none
        return False
    return abs(expect - actual) <= tolerance


class FontFilter:
    """Filter on font attributes

    Built from a dict that may hold the keys 'name' (a regular expression),
    'size', 'size_tolerance', 'color' and the boolean style keys
    'superscript', 'italic', 'serif', 'monospace' and 'bold'. Keys left out
    of the dict do not constrain the span.
    """
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    flags: int
    # Besides the usual true (1) and false (0), a style attribute has another
    # state, unset (x). With a = the filter's setting and b = the span's bit,
    # the truth table would be
    # a b diff?
    # 0 0 0
    # 0 1 1
    # 1 0 1
    # 1 1 0
    # x 0 0
    # x 1 0
    # It's very inefficient to compare bit by bit, which would take 5 bitwise
    # operations to compare, and then 4 to combine the results, so we use a
    # trick to reduce it to 2 ops on the whole bit field.
    # step 1: use XOR to find different bits. If unset, the filter bit is
    #         stored as 0; false positives are taken care of in the next step
    # a b a^b
    # 0 0 0
    # 0 1 1
    # 1 0 1
    # 1 1 0
    # step 2: use AND between the XOR result (d) and an ignore mask (m, 0 for
    #         ignored) to eliminate false positives
    # d m d&m
    # 0 1 0           <- no diff
    # 0 0 0           <- no diff
    # 1 1 1           <- found difference
    # 1 0 0           <- ignored
    ign_mask: int

    def __init__(self, font_dict: dict):
        """Build the filter from the font section of a filter dict

        Argument
          font_dict: the font settings; every key is optional
        Raises
          re.error if 'name' is not a valid regular expression
        """
        self.name = re.compile(font_dict.get('name', ""))
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')
        # some branchless trick, mainly to save space
        # x * True = x
        # x * False = 0
        # bool() forces every setting to exactly True or False first, so a
        # truthy non-bool value (e.g. 2) cannot shift a bit into the wrong
        # position, and a non-numeric value cannot break the `|`.
        # NOTE(review): the bit layout presumably mirrors the 'flags' field of
        # the span dicts produced by the PDF library -- confirm.
        self.flags = (0b00001 * bool(font_dict.get('superscript', False)) |
                      0b00010 * bool(font_dict.get('italic', False)) |
                      0b00100 * bool(font_dict.get('serif', False)) |
                      0b01000 * bool(font_dict.get('monospace', False)) |
                      0b10000 * bool(font_dict.get('bold', False)))

        # a bit is 1 only when the key is present, i.e. the attribute must be
        # compared; absent keys leave the bit at 0 (ignored)
        self.ign_mask = (0b00001 * ('superscript' in font_dict) |
                         0b00010 * ('italic' in font_dict) |
                         0b00100 * ('serif' in font_dict) |
                         0b01000 * ('monospace' in font_dict) |
                         0b10000 * ('bold' in font_dict))

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span

        Argument
          spn: the span dict to be checked; the keys 'font', 'color', 'size'
               and 'flags' are read
        Returns
          False if the span doesn't match current font attribute
        """
        # a missing font name is treated as the empty string
        if not self.name.search(spn.get('font', "")):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # when the span has no flags, ~self.flags makes every bit differ, so
        # the span is admitted only if no style attribute is being compared
        flags = spn.get('flags', ~self.flags)
        # see the class-level comment for the explanation
        return not ((flags ^ self.flags) & self.ign_mask)


class BoundingBoxFilter:
    """Filter on bounding boxes

    Each of left, top, right and bottom is an optional expected coordinate;
    edges left as None are not checked.
    """
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # largest absolute difference allowed between an expected and actual edge
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """Build the filter from the bbox section of a filter dict

        Argument
          bbox_dict: may hold 'left', 'top', 'right', 'bottom' and
                     'tolerance'; every key is optional
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span

        Argument
          spn: the span dict to be checked; its 'bbox' entry is read as
               (left, top, right, bottom)
        Returns
          False if the span doesn't match current bounding box setting
        """
        # a span without a bbox only passes when no edge is constrained
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))


class ToCFilter:
    """Heading filter applied to span dictionaries when building the ToC"""
    # Heading level given by this filter; always >= 1
    level: int
    # When set, the filter is more *greedy*: all the text in a block is
    # extracted as long as at least one match occurs. The flag is only stored
    # here, not acted on by this class.
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """Build the filter from one filter dict

        Argument
          fltr_dict: must hold 'level' (>= 1); may hold 'greedy', 'font' and
                     'bbox'
        Raises
          ValueError if 'level' is missing or smaller than 1
        """
        level = fltr_dict.get('level')

        # a missing level is reported separately from an out-of-range one
        if level is None:
            raise ValueError("filter's 'level' is not set")
        if level < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = level
        self.greedy = fltr_dict.get('greedy', False)

        # absent sections fall back to empty dicts, i.e. unconstrained filters
        font_spec = fltr_dict.get('font', {})
        self.font = FontFilter(font_spec)
        bbox_spec = fltr_dict.get('bbox', {})
        self.bbox = BoundingBoxFilter(bbox_spec)

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span

        Arguments
          spn: the span dict to be checked
        Returns
          False if the span doesn't match the filter
        """
        # the bounding box is only consulted once the font check has passed
        font_ok = self.font.admits(spn)
        return font_ok and self.bbox.admits(spn)