Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

adelevett committed on Feb 4

Commit

9cdd820

verified ·

1 Parent(s): 9386150

Upload 6 files

Browse files

Files changed (6) hide show

pdftocgen/pdftocgen/__init__.py +3 -0
pdftocgen/pdftocgen/__main__.py +4 -0
pdftocgen/pdftocgen/app.py +158 -0
pdftocgen/pdftocgen/filter.py +161 -0
pdftocgen/pdftocgen/recipe.py +188 -0
pdftocgen/pdftocgen/tocgen.py +15 -0

pdftocgen/pdftocgen/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """Generate table of contents for pdf based on a recipe file"""
2	+
3	+ __version__ = '1.3.4'

pdftocgen/pdftocgen/__main__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .app import main
+if __name__ == '__main__':
+    main()

pdftocgen/pdftocgen/app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+"""The executable of pdftocgen"""
+import toml
+import sys
+import getopt
+import pdftocgen
+import io
+from getopt import GetoptError
+from typing import TextIO
+from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
+from .tocgen import gen_toc
+usage_s = """
+usage: pdftocgen [options] doc.pdf < recipe.toml
+""".strip()
+help_s = """
+usage: pdftocgen [options] doc.pdf < recipe.toml
+Generate PDF table of contents from a recipe file.
+This command automatically generates a table of contents for
+doc.pdf based on the font attributes and position of
+headings specified in a TOML recipe file. See [1] for an
+introduction to recipe files.
+To generate the table of contents for a pdf, use input
+redirection or pipes to supply a recipe file
+    $ pdftocgen in.pdf < recipe.toml
+or alternatively use the -r flag
+    $ pdftocgen -r recipe.toml in.pdf
+The output of this command can be directly piped into
+pdftocio to generate a new pdf file using the generated
+table of contents
+    $ pdftocgen -r recipe.toml in.pdf | pdftocio -o out.pdf in.pdf
+or you could save the output of this command to a file for
+further tweaking using output redirection
+    $ pdftocgen -r recipe.toml in.pdf > toc
+or the -o flag:
+    $ pdftocgen -r recipe.toml -o toc in.pdf
+If you only need a readable format of the table of contents,
+use the -H flag
+    $ pdftocgen -r recipe.toml -H in.pdf
+This format cannot be parsed by pdftocio, but it is slightly
+more readable.
+arguments
+  doc.pdf                   path to the input PDF document
+options
+  -h, --help                show help
+  -r, --recipe=recipe.toml  path to the recipe file. if this flag is
+                            not specified, the default is stdin
+  -H, --human-readable      print the toc in a readable format
+  -v, --vpos                if this flag is set, the vertical position
+                            of each heading will be generated in the
+                            output
+  -o, --out=file            path to the output file. if this flag is
+                            not specified, the default is stdout
+  -g, --debug               enable debug mode
+  -V, --version             show version number
+[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe
+""".strip()
def main():
    """Run the pdftocgen command line interface.

    Parses ``sys.argv``, reads a TOML recipe (from the file given with
    ``-r``/``--recipe``, otherwise from stdin), generates the table of
    contents of the input pdf and writes it to the file given with
    ``-o``/``--out``, otherwise to stdout.

    Exit status
      2 if the options can't be parsed
      1 if a file can't be opened, no input pdf is given, or generating the
        table of contents fails
      0 after printing the help or version message (both go to stderr)

    With ``-g``/``--debug``, errors raised while generating the table of
    contents are re-raised instead of being summarised on stderr.
    """
    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hr:Hvo:gV",
            ["help", "recipe=", "human-readable", "vpos", "out=", "debug", "version"]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    readable: bool = False
    vpos: bool = False
    debug: bool = False
    # None means "fall back to the standard stream"; the wrappers around
    # stdin/stdout are only created once we know they are needed, so that an
    # unused wrapper is never garbage collected (which would close the
    # underlying standard stream)
    recipe_file = None
    out = None
    # files opened by this function, closed on the way out
    opened = []
    # wrappers around the standard streams, detached on the way out
    wrapped = []

    try:
        for o, a in opts:
            if o in ("-H", "--human-readable"):
                readable = True
            elif o in ("-v", "--vpos"):
                vpos = True
            elif o in ("-r", "--recipe"):
                try:
                    recipe_file = open(a, "r", encoding=get_file_encoding(a))
                except IOError as e:
                    print("error: can't open file for reading", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
                opened.append(recipe_file)
            elif o in ("-o", "--out"):
                try:
                    out = open(a, "w", encoding='utf-8', errors='ignore')
                except IOError as e:
                    print("error: can't open file for writing", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
                opened.append(out)
            elif o in ("-g", "--debug"):
                debug = True
            elif o in ("-V", "--version"):
                print("pdftocgen", pdftocgen.__version__, file=sys.stderr)
                sys.exit()
            elif o in ("-h", "--help"):
                print(help_s, file=sys.stderr)
                sys.exit()

        if len(args) < 1:
            print("error: no input pdf is given", file=sys.stderr)
            print(usage_s, file=sys.stderr)
            sys.exit(1)

        path_in: str = args[0]
        # done parsing arguments

        if recipe_file is None:
            recipe_file = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore')
            wrapped.append(recipe_file)
        if out is None:
            out = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
            wrapped.append(out)

        try:
            with open_pdf(path_in) as doc:
                recipe = toml.load(recipe_file)
                toc = gen_toc(doc, recipe)
                if readable:
                    print(pprint_toc(toc), file=out)
                else:
                    print(dump_toc(toc, vpos), end="", file=out)
            # flush here so that write errors are reported by the handlers
            # below rather than surfacing during interpreter shutdown
            out.flush()
        except ValueError as e:
            if debug:
                raise
            print("error:", e, file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            if debug:
                raise
            print("error: unable to open file", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        except KeyboardInterrupt:
            if debug:
                raise
            print("error: interrupted", file=sys.stderr)
            sys.exit(1)
    finally:
        # detach (rather than close) the wrappers so sys.stdin / sys.stdout
        # stay usable for the rest of the process
        for stream in wrapped:
            stream.detach()
        for f in opened:
            f.close()

pdftocgen/pdftocgen/filter.py ADDED Viewed

	@@ -0,0 +1,161 @@

+"""Filter on span dictionaries
+This module contains the internal representation of heading filters, which are
+used to test if a span should be included in the ToC.
+"""
+import re
+from typing import Optional
+from re import Pattern
+DEF_TOLERANCE: float = 1e-5
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Decide whether a float value passes a filter

    Arguments
      expect: the value required by the filter, None if the filter doesn't
              constrain this value
      actual: the value found in the span, None if the span doesn't have one
      tolerance: largest absolute difference that still counts as a match
    Returns
      True if the filter is unset, or if the actual value is present and
      within the tolerance of the expected one
    """
    if expect is None:
        # nothing is required, so anything is admitted
        return True
    if actual is None:
        # something is required, but there is nothing to compare against
        return False
    return abs(expect - actual) <= tolerance
class FontFilter:
    """Heading filter over the font attributes of a span"""
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    # expected value of every font flag, one bit per flag
    #   bit 0: superscript, bit 1: italic, bit 2: serif,
    #   bit 3: monospace,   bit 4: bold
    flags: int
    # A flag in the recipe has three states: true (1), false (0) and
    # unset (x). A span differs from the filter according to
    #   filter span differs?
    #     0     0     no
    #     0     1     yes
    #     1     0     yes
    #     1     1     no
    #     x     0     no
    #     x     1     no
    # Instead of testing flag by flag, the comparison is done on whole
    # integers with two bitwise operations:
    #   1. `span ^ filter` has a 1 wherever the bits differ. Unset flags are
    #      stored as 0 in `flags`, so they may produce false positives here.
    #   2. `& ign_mask` keeps only the bits of flags that were actually set
    #      in the recipe (mask bit 1), dropping the false positives of the
    #      unset ones (mask bit 0).
    # The span is admitted when nothing is left after both steps.
    ign_mask: int

    def __init__(self, font_dict: dict):
        self.name = re.compile(font_dict.get('name', ""))
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')

        # build both bit fields in one pass; multiplying by a boolean keeps
        # the bit (x * True = x) or clears it (x * False = 0)
        wanted = 0
        checked = 0
        flag_keys = ('superscript', 'italic', 'serif', 'monospace', 'bold')
        for bit, key in enumerate(flag_keys):
            weight = 1 << bit
            wanted |= weight * font_dict.get(key, False)
            checked |= weight * (key in font_dict)
        self.flags = wanted
        self.ign_mask = checked

    def admits(self, spn: dict) -> bool:
        """Test a span against the font attributes

        Argument
          spn: the span dict to be checked
        Returns
          True only if font name, color, size and flags all match
        """
        if not self.name.search(spn.get('font', "")):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # a span without flags gets the complement of the expected flags, so
        # it differs on every flag the recipe has set
        span_flags = spn.get('flags', ~self.flags)
        mismatch = (span_flags ^ self.flags) & self.ign_mask
        return not mismatch
class BoundingBoxFilter:
    """Filter on bounding boxes

    Every edge that is set in the recipe must match the corresponding edge of
    the span's bounding box within `tolerance`; edges left unset are ignored.
    """
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # largest absolute difference allowed on each edge
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """
        Argument
          bbox_dict: the 'bbox' table of a heading filter, with the optional
                     keys 'left', 'top', 'right', 'bottom' and 'tolerance'
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current bounding box setting
        """
        # a span without a bounding box only passes if no edge is set
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
class ToCFilter:
    """A heading filter: selects the spans that belong in the ToC"""
    # heading level assigned to the matching spans, always >= 1
    level: int
    # a greedy filter takes the text of the surrounding block as soon as one
    # of its spans matches
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """
        Argument
          fltr_dict: one heading table of the recipe
        Raises
          ValueError if 'level' is missing or smaller than 1
        """
        level = fltr_dict.get('level')
        if level is None:
            raise ValueError("filter's 'level' is not set")
        if level < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = level
        self.greedy = fltr_dict.get('greedy', False)
        # missing sub-tables behave like empty ones
        self.font = FontFilter(fltr_dict.get('font', {}))
        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))

    def admits(self, spn: dict) -> bool:
        """Test a span against both the font and the bounding box filter

        Arguments
          spn: the span dict to be checked
        Returns
          True only if the span passes both filters
        """
        if not self.font.admits(spn):
            return False
        return self.bbox.admits(spn)

pdftocgen/pdftocgen/recipe.py ADDED Viewed

	@@ -0,0 +1,188 @@

+from dataclasses import dataclass
+from typing import Optional, List, Dict, Iterator
+from .filter import ToCFilter
+from fitzutils import ToCEntry
+from itertools import chain
+from collections import defaultdict
+from fitz import Document
class FoundGreedy(Exception):
    """Signal that a greedy filter matched a span.

    Raising this exception is a (hacky) way to short-circuit the extraction
    of a block: it keeps the handling of greedy filters separate from the
    normal execution path, which simplifies the typing and the code, and it
    also skips some unnecessary comparisons.

    Probably similar to call/cc in scheme or longjump in C
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # level of the greedy filter that matched
    level: int

    def __init__(self, level):
        """
        Argument
          level: level of the greedy filter
        """
        self.level = level
        super().__init__()
def blk_to_str(blk: dict) -> str:
    """Join the stripped text of every span in a block with single spaces"""
    pieces = []
    for line in blk.get('lines', []):
        for spn in line.get('spans', []):
            pieces.append(spn.get('text', "").strip())
    return " ".join(pieces)
@dataclass
class Fragment:
    """One piece of a heading, as extracted from a single span"""
    # text of the piece
    text: str
    # heading level the piece belongs to
    level: int
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Merge heading fragments into one title per level

    Arguments
      frags: the fragments to merge; None entries are skipped
      sep: the string placed between fragments of the same level
    Returns
      a dictionary (level -> title), with levels in order of first appearance
    """
    # group the texts by level, keeping the order in which they were found
    texts_by_level = defaultdict(list)
    for frag in frags:
        if frag is None:
            continue
        texts_by_level[frag.level].append(frag.text)

    return {level: sep.join(texts) for level, texts in texts_by_level.items()}
class Recipe:
    """The internal representation of a recipe"""
    # heading filters, in the order they appear in the recipe; the first
    # filter that admits a span decides its level
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """
        Argument
          recipe_dict: the parsed recipe, with a list of filters in 'heading'
        Raises
          ValueError if the recipe has no heading filter
        """
        fltr_dicts = recipe_dict.get('heading', [])

        if len(fltr_dicts) == 0:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from span along with level

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading or None if no match
        Raises
          FoundGreedy if the first filter admitting the span is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()

                if not text:
                    # don't match empty spaces
                    return None

                if fltr.greedy:
                    # propagate all the way back to extract_block
                    raise FoundGreedy(fltr.level)

                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          a list with one entry per span: the fragment, or None if no match
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Arguments
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: the page number stored in the resulting entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []

        vpos = block.get('bbox', (0, 0))[1]
        # a block without lines yields no entries instead of a TypeError
        lines = block.get('lines', [])

        try:
            frags = chain.from_iterable([
                self._extract_line(ln) for ln in lines
            ])
            titles = concatFrag(frags)

            return [
                ToCEntry(level, title, page, vpos)
                for level, title in titles.items()
            ]
        except FoundGreedy as e:
            # Smart greedy: merge only the text of the spans admitted by a
            # greedy filter of the level that fired. Restricting the lookup
            # to greedy filters matters when a non-greedy filter of the same
            # level comes first in the recipe: picking that one would merge
            # the wrong spans, or none at all.
            greedy_filters = [
                f for f in self.filters
                if f.greedy and f.level == e.level
            ]
            parts = []
            for ln in lines:
                for spn in ln.get('spans', []):
                    text = spn.get('text', "").strip()
                    # skip whitespace-only spans, they would leave doubled
                    # separators in the merged title
                    if text and any(f.admits(spn) for f in greedy_filters):
                        parts.append(text)

            merged_text = " ".join(parts)
            if merged_text:
                return [ToCEntry(e.level, merged_text, page, vpos)]
            return []
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Collect the toc entries of every page of a document

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      the toc entries of all blocks, in page order; pages are numbered
      starting from 1
    """
    entries = []
    for page in doc.pages():
        blocks = page.get_textpage().extractDICT().get('blocks', [])
        for blk in blocks:
            entries += recipe.extract_block(blk, page.number + 1)
    return entries

pdftocgen/pdftocgen/tocgen.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from fitz import Document
+from typing import List
+from fitzutils import ToCEntry
+from .recipe import Recipe, extract_toc
def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
    """Build the table of contents of a document from a recipe dictionary

    Argument
      doc: a pdf document
      recipe_dict: the recipe dictionary used to generate the toc
    Returns
      a list of ToC entries
    Raises
      whatever building the Recipe raises for an invalid recipe dictionary
    """
    recipe = Recipe(recipe_dict)
    return extract_toc(doc, recipe)