Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

@@ -1,158 +0,0 @@
-"""The executable of pdftocgen"""
-import toml
-import sys
-import getopt
-import pdftocgen
-import io
-from getopt import GetoptError
-from typing import TextIO
-from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
-from .tocgen import gen_toc
-# one-line synopsis; main() prints it to stderr when the options are invalid
-# or no input pdf is given
-usage_s = """
-usage: pdftocgen [options] doc.pdf < recipe.toml
-""".strip()
-# full help text; main() prints it to stderr for -h/--help
-help_s = """
-usage: pdftocgen [options] doc.pdf < recipe.toml
-Generate PDF table of contents from a recipe file.
-This command automatically generates a table of contents for
-doc.pdf based on the font attributes and position of
-headings specified in a TOML recipe file. See [1] for an
-introduction to recipe files.
-To generate the table of contents for a pdf, use input
-redirection or pipes to supply a recipe file
-    $ pdftocgen in.pdf < recipe.toml
-or alternatively use the -r flag
-    $ pdftocgen -r recipe.toml in.pdf
-The output of this command can be directly piped into
-pdftocio to generate a new pdf file using the generated
-table of contents
-    $ pdftocgen -r recipe.toml in.pdf | pdftocio -o out.pdf in.pdf
-or you could save the output of this command to a file for
-further tweaking using output redirection
-    $ pdftocgen -r recipe.toml in.pdf > toc
-or the -o flag:
-    $ pdftocgen -r recipe.toml -o toc in.pdf
-If you only need a readable format of the table of contents,
-use the -H flag
-    $ pdftocgen -r recipe.toml -H in.pdf
-This format cannot be parsed by pdftocio, but it is slightly
-more readable.
-arguments
-  doc.pdf                   path to the input PDF document
-options
-  -h, --help                show help
-  -r, --recipe=recipe.toml  path to the recipe file. if this flag is
-                            not specified, the default is stdin
-  -H, --human-readable      print the toc in a readable format
-  -v, --vpos                if this flag is set, the vertical position
-                            of each heading will be generated in the
-                            output
-  -o, --out=file            path to the output file. if this flag is
-                            not specified, the default is stdout
-  -g, --debug               enable debug mode
-  -V, --version             show version number
-[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe
-""".strip()
-def main():
-    """Run the pdftocgen command line interface
-    Parses the options in sys.argv, reads a TOML recipe (from the file given
-    with -r/--recipe, otherwise from stdin), generates the table of contents
-    of the input pdf and prints it to the file given with -o/--out, otherwise
-    to stdout.
-    Exit status
-      0 after printing the help or version text (and on normal completion)
-      1 on missing input, unreadable/unwritable files, invalid recipe or
-        interruption
-      2 on invalid command line options
-    In debug mode (-g) errors raised while generating the toc are re-raised
-    instead of being reduced to a one-line message.
-    """
-    # function-scope import: ExitStack is only needed by this entry point
-    from contextlib import ExitStack
-    # parse arguments
-    try:
-        opts, args = getopt.gnu_getopt(
-            sys.argv[1:],
-            "hr:Hvo:gV",
-            ["help", "recipe=", "human-readable", "vpos", "out=", "debug", "version"]
-        )
-    except GetoptError as e:
-        print(e, file=sys.stderr)
-        print(usage_s, file=sys.stderr)
-        sys.exit(2)
-    readable: bool = False
-    vpos: bool = False
-    debug: bool = False
-    # None means "not given on the command line". The stdin/stdout wrappers
-    # are created only when they are really needed: a wrapper that is created
-    # eagerly and then replaced gets finalized, which closes the underlying
-    # sys.stdin/sys.stdout buffer.
-    recipe_file = None
-    out = None
-    # every file opened for -r/-o is registered here so that it is closed on
-    # all exit paths, including the sys.exit() calls below
-    with ExitStack() as opened:
-        for o, a in opts:
-            if o in ("-H", "--human-readable"):
-                readable = True
-            elif o in ("-v", "--vpos"):
-                vpos = True
-            elif o in ("-r", "--recipe"):
-                try:
-                    recipe_file = opened.enter_context(
-                        open(a, "r", encoding=get_file_encoding(a))
-                    )
-                except IOError as e:
-                    print("error: can't open file for reading", file=sys.stderr)
-                    print(e, file=sys.stderr)
-                    sys.exit(1)
-            elif o in ("-o", "--out"):
-                try:
-                    out = opened.enter_context(
-                        open(a, "w", encoding='utf-8', errors='ignore')
-                    )
-                except IOError as e:
-                    print("error: can't open file for writing", file=sys.stderr)
-                    print(e, file=sys.stderr)
-                    sys.exit(1)
-            elif o in ("-g", "--debug"):
-                debug = True
-            elif o in ("-V", "--version"):
-                print("pdftocgen", pdftocgen.__version__, file=sys.stderr)
-                sys.exit()
-            elif o in ("-h", "--help"):
-                print(help_s, file=sys.stderr)
-                sys.exit()
-        if len(args) < 1:
-            print("error: no input pdf is given", file=sys.stderr)
-            print(usage_s, file=sys.stderr)
-            sys.exit(1)
-        path_in: str = args[0]
-        # done parsing arguments; fall back to the standard streams
-        if recipe_file is None:
-            recipe_file = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore')
-        if out is None:
-            out = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
-        try:
-            with open_pdf(path_in) as doc:
-                recipe = toml.load(recipe_file)
-                toc = gen_toc(doc, recipe)
-                if readable:
-                    print(pprint_toc(toc), file=out)
-                else:
-                    print(dump_toc(toc, vpos), end="", file=out)
-        except ValueError as e:
-            if debug:
-                raise
-            print("error:", e, file=sys.stderr)
-            sys.exit(1)
-        except IOError as e:
-            if debug:
-                raise
-            print("error: unable to open file", file=sys.stderr)
-            print(e, file=sys.stderr)
-            sys.exit(1)
-        except KeyboardInterrupt:
-            if debug:
-                raise
-            print("error: interrupted", file=sys.stderr)
-            sys.exit(1)

pdftocgen/pdftocgen/filter.py DELETED Viewed

@@ -1,161 +0,0 @@
-"""Filter on span dictionaries
-This module contains the internal representation of heading filters, which are
-used to test if a span should be included in the ToC.
-"""
-import re
-from typing import Optional
-from re import Pattern
-# default maximum absolute difference allowed when a filter compares floats
-# (used when a recipe sets no 'size_tolerance' / 'tolerance')
-DEF_TOLERANCE: float = 1e-5
-def admits_float(expect: Optional[float],
-                 actual: Optional[float],
-                 tolerance: float) -> bool:
-    """Decide whether a float value passes a filter
-    Arguments
-      expect: the value required by the filter, None if nothing is required
-      actual: the value found in the span, None if the span has none
-      tolerance: maximum absolute difference that still counts as a match
-    Returns
-      True if nothing is required, or if the actual value exists and lies
-      within the tolerance of the expected one
-    """
-    if expect is None:
-        # the filter doesn't care about this value
-        return True
-    if actual is None:
-        # a value is required but the span has none
-        return False
-    return abs(expect - actual) <= tolerance
-class FontFilter:
-    """Heading filter that matches a span by its font attributes
-    Every attribute that is left out of the recipe is not checked.
-    """
-    # regular expression searched for in the span's font name
-    name: Pattern
-    # required font size, None if the size is not checked
-    size: Optional[float]
-    # maximum absolute difference allowed between required and actual size
-    size_tolerance: float
-    # required color, None if the color is not checked
-    color: Optional[int]
-    # required value of every style bit; attributes unset in the recipe are 0
-    flags: int
-    # Each style attribute has three states: true (1), false (0) and unset
-    # (x). An unset attribute must never count as a difference:
-    #   expected actual differs?
-    #   0        0      no
-    #   0        1      yes
-    #   1        0      yes
-    #   1        1      no
-    #   x        0      no
-    #   x        1      no
-    # Instead of comparing bit by bit, the check is done in two operations:
-    #   1. XOR `flags` with the span's flags; a 1 marks a differing bit.
-    #      Unset attributes are stored as 0, so they may show up here as
-    #      false positives.
-    #   2. AND the result with `ign_mask`, which holds a 1 only for the
-    #      attributes that are set in the recipe. This clears the false
-    #      positives; the span is admitted if no bit is left.
-    ign_mask: int
-    # recipe key and bit of every style attribute
-    # NOTE(review): presumably these are PyMuPDF's span flag bits -- confirm
-    _STYLE_BITS = (
-        ('superscript', 0b00001),
-        ('italic', 0b00010),
-        ('serif', 0b00100),
-        ('monospace', 0b01000),
-        ('bold', 0b10000),
-    )
-    def __init__(self, font_dict: dict):
-        """
-        Argument
-          font_dict: the font table of a heading filter in the recipe
-        """
-        self.name = re.compile(font_dict.get('name', ""))
-        self.size = font_dict.get('size')
-        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
-        self.color = font_dict.get('color')
-        wanted = 0
-        checked = 0
-        for key, bit in self._STYLE_BITS:
-            # bit * True == bit and bit * False == 0, so no branch is needed
-            wanted |= bit * font_dict.get(key, False)
-            checked |= bit * (key in font_dict)
-        self.flags = wanted
-        self.ign_mask = checked
-    def admits(self, spn: dict) -> bool:
-        """Test a span against the font attributes
-        Argument
-          spn: the span dict to be tested
-        Returns
-          True if name, color, size and style flags of the span all match
-        """
-        if self.name.search(spn.get('font', "")) is None:
-            return False
-        if self.color is not None:
-            if spn.get('color') != self.color:
-                return False
-        size_ok = admits_float(self.size, spn.get('size'), self.size_tolerance)
-        if not size_ok:
-            return False
-        # a span without flags differs in every bit, see the notes on ign_mask
-        differing = spn.get('flags', ~self.flags) ^ self.flags
-        return (differing & self.ign_mask) == 0
-class BoundingBoxFilter:
-    """Filter on bounding boxes
-    Every edge that is set in the recipe has to match the corresponding edge
-    of the span's bounding box within `tolerance`; edges that are not set are
-    not checked.
-    """
-    left: Optional[float]
-    top: Optional[float]
-    right: Optional[float]
-    bottom: Optional[float]
-    # maximum absolute difference allowed on every edge
-    tolerance: float
-    def __init__(self, bbox_dict: dict):
-        """
-        Argument
-          bbox_dict: the bbox table of a heading filter in the recipe
-        """
-        self.left = bbox_dict.get('left')
-        self.top = bbox_dict.get('top')
-        self.right = bbox_dict.get('right')
-        self.bottom = bbox_dict.get('bottom')
-        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)
-    def admits(self, spn: dict) -> bool:
-        """Check if the bounding box admits the span
-        Argument
-          spn: the span dict to be checked
-        Returns
-          False if the span doesn't match current bounding box setting
-        """
-        # a span without 'bbox' only passes if no edge is required
-        # the tuple is indexed as (left, top, right, bottom)
-        bbox = spn.get('bbox', (None, None, None, None))
-        return (admits_float(self.left, bbox[0], self.tolerance) and
-                admits_float(self.top, bbox[1], self.tolerance) and
-                admits_float(self.right, bbox[2], self.tolerance) and
-                admits_float(self.bottom, bbox[3], self.tolerance))
-class ToCFilter:
-    """A heading filter: picks the spans that belong to the ToC
-    Combines a font filter and a bounding box filter; a span has to pass both.
-    """
-    # heading level produced by this filter, always >= 1
-    level: int
-    # greedy flag from the recipe (False if not set); this class only stores
-    # it, the extraction code decides what a greedy match means
-    greedy: bool
-    font: FontFilter
-    bbox: BoundingBoxFilter
-    def __init__(self, fltr_dict: dict):
-        """
-        Argument
-          fltr_dict: one heading filter table of the recipe
-        Raises
-          ValueError if 'level' is missing or smaller than 1
-        """
-        level = fltr_dict.get('level')
-        if level is None:
-            raise ValueError("filter's 'level' is not set")
-        if level < 1:
-            raise ValueError("filter's 'level' must be >= 1")
-        font_dict = fltr_dict.get('font', {})
-        bbox_dict = fltr_dict.get('bbox', {})
-        self.level = level
-        self.greedy = fltr_dict.get('greedy', False)
-        self.font = FontFilter(font_dict)
-        self.bbox = BoundingBoxFilter(bbox_dict)
-    def admits(self, spn: dict) -> bool:
-        """Test a span against both sub-filters
-        Arguments
-          spn: the span dict to be tested
-        Returns
-          True only if the font filter and the bounding box filter admit it
-        """
-        if not self.font.admits(spn):
-            return False
-        return self.bbox.admits(spn)

pdftocgen/pdftocgen/recipe.py DELETED Viewed

@@ -1,188 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional, List, Dict, Iterator
-from .filter import ToCFilter
-from fitzutils import ToCEntry
-from itertools import chain
-from collections import defaultdict
-from fitz import Document
-class FoundGreedy(Exception):
-    """Signal that a greedy filter has matched a span.
-    The exception is used for short-circuiting: raising it lets the greedy
-    case leave the normal span/line extraction at once, which keeps the greedy
-    logic separate from the normal path (cleaner typing and code) and skips
-    comparisons that are no longer necessary.
-    Probably similar to call/cc in scheme or longjmp in C
-    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
-    """
-    # level of the greedy filter that matched
-    level: int
-    def __init__(self, level):
-        """
-        Argument
-          level: level of the greedy filter
-        """
-        # no message is passed on, so the exception's args stay empty
-        super().__init__()
-        self.level = level
-def blk_to_str(blk: dict) -> str:
-    """Join the stripped text of every span of a block with single spaces
-    Missing 'lines', 'spans' or 'text' keys count as empty.
-    """
-    texts = []
-    for line in blk.get('lines', []):
-        for spn in line.get('spans', []):
-            texts.append(spn.get('text', "").strip())
-    return " ".join(texts)
-@dataclass
-class Fragment:
-    """One piece of heading text together with its heading level"""
-    # text of the piece
-    text: str
-    # heading level the piece belongs to
-    level: int
-def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
-    """Merge fragments of the same level into one title per level
-    Arguments
-      frags: the fragments in reading order; None entries are skipped
-      sep: the separator put between the texts of one level
-    Returns
-      a dictionary (level -> title), with levels in order of first appearance
-    """
-    # collect the texts of every level in the order they appear
-    grouped = {}
-    for frag in frags:
-        if frag is None:
-            continue
-        grouped.setdefault(frag.level, []).append(frag.text)
-    return {level: sep.join(texts) for level, texts in grouped.items()}
-class Recipe:
-    """The internal representation of a recipe
-    A recipe is an ordered list of heading filters; a span is assigned to the
-    first filter in the list that admits it.
-    """
-    filters: List[ToCFilter]
-    def __init__(self, recipe_dict: dict):
-        """
-        Argument
-          recipe_dict: the parsed recipe; the heading filters are read from
-                       its 'heading' key
-        Raises
-          ValueError if the recipe has no heading filter, or if a filter is
-          invalid
-        """
-        fltr_dicts = recipe_dict.get('heading', [])
-        if not fltr_dicts:
-            raise ValueError("no filters found in recipe")
-        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]
-    def _extract_span(self, spn: dict) -> Optional[Fragment]:
-        """Extract text from span along with level
-        Argument
-          spn: a span dictionary
-          {
-            'bbox': (float, float, float, float),
-            'color': int,
-            'flags': int,
-            'font': str,
-            'size': float,
-            'text': str
-          }
-        Returns
-          a fragment of the heading or None if no match
-        Raises
-          FoundGreedy if the first filter admitting the span is greedy
-        """
-        for fltr in self.filters:
-            if fltr.admits(spn):
-                text = spn.get('text', "").strip()
-                if not text:
-                    # don't match empty spaces
-                    return None
-                if fltr.greedy:
-                    # propagate all the way back to extract_block
-                    raise FoundGreedy(fltr.level)
-                return Fragment(text, fltr.level)
-        return None
-    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
-        """Extract matching heading fragments in a line.
-        Argument
-          line: a line dictionary
-          {
-            'bbox': (float, float, float, float),
-            'wmode': int,
-            'dir': (float, float),
-            'spans': [dict]
-          }
-        Returns
-          one entry per span: its fragment, or None if the span didn't match
-        """
-        return [self._extract_span(spn) for spn in line.get('spans', [])]
-    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
-        """Extract matching headings in a block.
-        Argument
-          block: a block dictionary
-          {
-            'bbox': (float, float, float, float),
-            'lines': [dict],
-            'type': int
-          }
-          page: the page number stored in the resulting entries
-        Returns
-          a list of toc entries, concatenated from the result of lines; empty
-          if the block is not a text block or has no lines
-        """
-        if block.get('type') != 0:
-            # not a text block
-            return []
-        # tolerate a missing 'lines' key instead of iterating over None
-        lines = block.get('lines') or []
-        if not lines:
-            return []
-        # index 1 of the bbox is used as the vertical position of the heading
-        vpos = block.get('bbox', (0, 0))[1]
-        try:
-            frags = chain.from_iterable([
-                self._extract_line(ln) for ln in lines
-            ])
-            titles = concatFrag(frags)
-            return [
-                ToCEntry(level, title, page, vpos)
-                for level, title in titles.items()
-            ]
-        except FoundGreedy as e:
-            # Smart greedy: merge only the text of spans admitted by a greedy
-            # filter of the level that was hit. Selecting by level alone could
-            # pick a non-greedy filter that merely shares the level, which
-            # would merge the wrong spans or lose the heading altogether.
-            greedy_filters = [
-                f for f in self.filters if f.greedy and f.level == e.level
-            ]
-            parts = []
-            for ln in lines:
-                for spn in ln.get('spans', []):
-                    if any(f.admits(spn) for f in greedy_filters):
-                        text = spn.get('text', "").strip()
-                        if text:
-                            # skip whitespace-only spans, they would leave
-                            # doubled separators in the merged title
-                            parts.append(text)
-            if not parts:
-                return []
-            return [ToCEntry(e.level, " ".join(parts), page, vpos)]
-def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
-    """Collect the toc entries of every page of a document
-    Arguments
-      doc: a pdf document
-      recipe: recipe from user
-    Returns
-      the toc entries of all text blocks, in page and block order
-    """
-    entries = []
-    for pg in doc.pages():
-        # entries carry page.number + 1
-        # NOTE(review): presumably page.number is 0-based -- confirm
-        page_no = pg.number + 1
-        blocks = pg.get_textpage().extractDICT().get('blocks', [])
-        for blk in blocks:
-            entries += recipe.extract_block(blk, page_no)
-    return entries

pdftocgen/pdftocgen/tocgen.py DELETED Viewed

@@ -1,15 +0,0 @@
-from fitz import Document
-from typing import List
-from fitzutils import ToCEntry
-from .recipe import Recipe, extract_toc
-def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
-    """Build the table of contents of a document from a recipe dictionary
-    Argument
-      doc: a pdf document
-      recipe_dict: the recipe dictionary used to generate the toc
-    Returns
-      a list of ToC entries
-    """
-    # validate and convert the dictionary first, then run the extraction
-    recipe = Recipe(recipe_dict)
-    return extract_toc(doc, recipe)