adelevett committed on
Commit
9cdd820
·
verified ·
1 Parent(s): 9386150

Upload 6 files

Browse files
pdftocgen/pdftocgen/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """Generate table of contents for pdf based on a recipe file"""
2
+
3
+ __version__ = '1.3.4'
pdftocgen/pdftocgen/__main__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .app import main
2
+
3
+ if __name__ == '__main__':
4
+ main()
pdftocgen/pdftocgen/app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The executable of pdftocgen"""
2
+
3
+ import toml
4
+ import sys
5
+ import getopt
6
+ import pdftocgen
7
+ import io
8
+
9
+ from getopt import GetoptError
10
+ from typing import TextIO
11
+ from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
12
+ from .tocgen import gen_toc
13
+
14
+ usage_s = """
15
+ usage: pdftocgen [options] doc.pdf < recipe.toml
16
+ """.strip()
17
+
18
+ help_s = """
19
+ usage: pdftocgen [options] doc.pdf < recipe.toml
20
+
21
+ Generate PDF table of contents from a recipe file.
22
+
23
+ This command automatically generates a table of contents for
24
+ doc.pdf based on the font attributes and position of
25
+ headings specified in a TOML recipe file. See [1] for an
26
+ introduction to recipe files.
27
+
28
+ To generate the table of contents for a pdf, use input
29
+ redirection or pipes to supply a recipe file
30
+
31
+ $ pdftocgen in.pdf < recipe.toml
32
+
33
+ or alternatively use the -r flag
34
+
35
+ $ pdftocgen -r recipe.toml in.pdf
36
+
37
+ The output of this command can be directly piped into
38
+ pdftocio to generate a new pdf file using the generated
39
+ table of contents
40
+
41
+ $ pdftocgen -r recipe.toml in.pdf | pdftocio -o out.pdf in.pdf
42
+
43
+ or you could save the output of this command to a file for
44
+ further tweaking using output redirection
45
+
46
+ $ pdftocgen -r recipe.toml in.pdf > toc
47
+
48
+ or the -o flag:
49
+
50
+ $ pdftocgen -r recipe.toml -o toc in.pdf
51
+
52
+ If you only need a readable format of the table of contents,
53
+ use the -H flag
54
+
55
+ $ pdftocgen -r recipe.toml -H in.pdf
56
+
57
+ This format cannot be parsed by pdftocio, but it is slightly
58
+ more readable.
59
+
60
+ arguments
61
+ doc.pdf path to the input PDF document
62
+
63
+ options
64
+ -h, --help show help
65
+ -r, --recipe=recipe.toml path to the recipe file. if this flag is
66
+ not specified, the default is stdin
67
+ -H, --human-readable print the toc in a readable format
68
+ -v, --vpos if this flag is set, the vertical position
69
+ of each heading will be generated in the
70
+ output
71
+ -o, --out=file path to the output file. if this flag is
72
+ not specified, the default is stdout
73
+ -g, --debug enable debug mode
74
+ -V, --version show version number
75
+
76
+ [1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe
77
+ """.strip()
78
+
79
+
80
def main():
    """Run the pdftocgen command line interface.

    Reads the options from ``sys.argv``, loads the TOML recipe from the
    file given with -r/--recipe (stdin by default), generates the table of
    contents for the pdf named by the first positional argument, and prints
    it to the file given with -o/--out (stdout by default).

    Exits with status 2 when the options cannot be parsed and with status 1
    on any other reported error. With -g/--debug the original exception is
    re-raised instead of being summarised on stderr.
    """
    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hr:Hvo:gV",
            ["help", "recipe=", "human-readable", "vpos", "out=", "debug", "version"]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    recipe_file: TextIO = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore')
    readable: bool = False
    vpos: bool = False
    out: TextIO = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
    debug: bool = False
    # Files opened here for -r/-o. They are closed in the finally clause
    # below so that the output is flushed and the handles are released on
    # every exit path; the stdin/stdout wrappers are deliberately left alone.
    opened: list = []

    try:
        for o, a in opts:
            if o in ("-H", "--human-readable"):
                readable = True
            elif o in ("-v", "--vpos"):
                vpos = True
            elif o in ("-r", "--recipe"):
                try:
                    recipe_file = open(a, "r", encoding=get_file_encoding(a))
                except IOError as e:
                    print("error: can't open file for reading", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
                opened.append(recipe_file)
            elif o in ("-o", "--out"):
                try:
                    out = open(a, "w", encoding='utf-8', errors='ignore')
                except IOError as e:
                    print("error: can't open file for writing", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
                opened.append(out)
            elif o in ("-g", "--debug"):
                debug = True
            elif o in ("-V", "--version"):
                print("pdftocgen", pdftocgen.__version__, file=sys.stderr)
                sys.exit()
            elif o in ("-h", "--help"):
                print(help_s, file=sys.stderr)
                sys.exit()

        if len(args) < 1:
            print("error: no input pdf is given", file=sys.stderr)
            print(usage_s, file=sys.stderr)
            sys.exit(1)

        path_in: str = args[0]
        # done parsing arguments

        try:
            with open_pdf(path_in) as doc:
                recipe = toml.load(recipe_file)
                toc = gen_toc(doc, recipe)
                if readable:
                    print(pprint_toc(toc), file=out)
                else:
                    print(dump_toc(toc, vpos), end="", file=out)
        except ValueError as e:
            if debug:
                # bare raise keeps the original traceback intact
                raise
            print("error:", e, file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            if debug:
                raise
            print("error: unable to open file", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        except KeyboardInterrupt:
            if debug:
                raise
            print("error: interrupted", file=sys.stderr)
            sys.exit(1)
    finally:
        # also runs when sys.exit() is called above
        for f in opened:
            f.close()
pdftocgen/pdftocgen/filter.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Filter on span dictionaries
2
+
3
+ This module contains the internal representation of heading filters, which are
4
+ used to test if a span should be included in the ToC.
5
+ """
6
+
7
+ import re
8
+
9
+ from typing import Optional
10
+ from re import Pattern
11
+
12
+ DEF_TOLERANCE: float = 1e-5
13
+
14
+
15
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Decide whether a filter expecting `expect` accepts the value `actual`.

    Arguments
      expect: the value required by the filter, or None for "don't care"
      actual: the value found, or None if it is missing
      tolerance: largest absolute difference that still counts as equal
    Returns
      True if nothing is expected, or if `actual` is present and within
      `tolerance` of `expect`
    """
    if expect is None:
        # nothing required, anything goes
        return True
    if actual is None:
        # something is required but there is nothing to compare against
        return False
    return abs(expect - actual) <= tolerance
21
+
22
+
23
class FontFilter:
    """Filter on font attributes"""
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    flags: int
    # Each of the five boolean font properties is either required to be
    # set (1), required to be clear (0), or left unspecified (x) by the
    # recipe. An unspecified bit must never count as a mismatch:
    #   filter  span  mismatch?
    #     0      0       0
    #     0      1       1
    #     1      0       1
    #     1      1       0
    #     x      0       0
    #     x      1       0
    # Instead of testing bit by bit, two bitwise operations are enough:
    #   1. XOR the span's flags with `flags` (unspecified bits are stored
    #      as 0) to mark every bit that differs;
    #   2. AND the result with `ign_mask`, whose bits are 1 only for the
    #      properties that appear in the recipe, to discard differences on
    #      unspecified bits.
    # The span is admitted when no bit is left set.
    ign_mask: int

    def __init__(self, font_dict: dict):
        """
        Argument
          font_dict: the font section of a heading filter; every key is
                     optional
        """
        pattern = font_dict.get('name', "")
        self.name = re.compile(pattern)
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')

        # (bit, recipe key) for every boolean property
        bits = (
            (0b00001, 'superscript'),
            (0b00010, 'italic'),
            (0b00100, 'serif'),
            (0b01000, 'monospace'),
            (0b10000, 'bold'),
        )
        wanted = 0
        cared = 0
        for bit, key in bits:
            # bit * True = bit, bit * False = 0
            wanted |= bit * font_dict.get(key, False)
            cared |= bit * (key in font_dict)
        self.flags = wanted
        self.ign_mask = cared

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current font attribute
        """
        font_name = spn.get('font', "")
        if not self.name.search(font_name):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # a span without flags differs from the filter on every bit
        span_flags = spn.get('flags', ~self.flags)
        # see the comment above `ign_mask` for the explanation
        mismatch = (span_flags ^ self.flags) & self.ign_mask
        return not mismatch
98
+
99
+
100
class BoundingBoxFilter:
    """Filter on bounding boxes"""
    # expected coordinates of each edge; None means the edge is not checked
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # largest absolute difference allowed on any checked edge
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """
        Argument
          bbox_dict: the bbox section of a heading filter; every key is
                     optional
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current bounding box setting
        """
        # a span without a bbox only passes when no edge is checked
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
128
+
129
+
130
class ToCFilter:
    """Filter on span dictionary to pick out headings in the ToC"""
    # The level of the title, strictly > 0
    level: int
    # When set, the filter will be more *greedy* and extract all the text in a
    # block even when at least one match occurs
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """
        Argument
          fltr_dict: one heading filter from the recipe; 'level' is
                     required, 'greedy', 'font' and 'bbox' are optional
        Raises
          ValueError if 'level' is missing, not an integer, or < 1
        """
        lvl = fltr_dict.get('level')

        if lvl is None:
            raise ValueError("filter's 'level' is not set")
        if not isinstance(lvl, int):
            # e.g. a quoted level in the recipe: without this check the
            # comparison below would fail with a TypeError instead of the
            # ValueError raised for every other problem with the level
            raise ValueError("filter's 'level' must be an integer")
        if lvl < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = lvl
        self.greedy = fltr_dict.get('greedy', False)
        self.font = FontFilter(fltr_dict.get('font', {}))
        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span

        Arguments
          spn: the span dict to be checked
        Returns
          False if the span doesn't match the filter
        """
        return self.font.admits(spn) and self.bbox.admits(spn)
pdftocgen/pdftocgen/recipe.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, List, Dict, Iterator
3
+ from .filter import ToCFilter
4
+ from fitzutils import ToCEntry
5
+ from itertools import chain
6
+ from collections import defaultdict
7
+ from fitz import Document
8
+
9
+
10
class FoundGreedy(Exception):
    """Signal that a greedy filter has matched a span.

    The exception is used purely for short-circuiting: it is raised while
    a single span is being examined and caught where the whole block is
    processed. This keeps the greedy logic separate from the normal
    extraction path, which makes the typing and the code cleaner, and it
    also saves some unnecessary comparisons.

    Probably similar to call/cc in scheme or longjmp in C
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # level of the filter that triggered the exception
    level: int

    def __init__(self, level):
        """
        Argument
          level: level of the greedy filter
        """
        # no message is attached to the exception itself
        Exception.__init__(self)
        self.level = level
29
+
30
+
31
def blk_to_str(blk: dict) -> str:
    """Join the stripped text of every span in a block with single spaces

    Missing 'lines', 'spans' or 'text' keys are treated as empty.
    """
    pieces = []
    for line in blk.get('lines', []):
        for spn in line.get('spans', []):
            pieces.append(spn.get('text', "").strip())
    return " ".join(pieces)
38
+
39
+
40
@dataclass
class Fragment:
    """A fragment of the extracted heading"""
    # stripped text of the span the fragment was taken from
    text: str
    # level of the filter that admitted the span
    level: int
45
+
46
+
47
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Merge fragments of the same level into one title per level

    Arguments
      frags: the fragments to merge; None entries are skipped
      sep: the string placed between two fragments of the same level
    Returns
      a dictionary (level -> title) that contains the title for each level.
    """
    # group the texts by level, keeping the order in which they arrive
    by_level = {}
    for frag in frags:
        if frag is None:
            continue
        by_level.setdefault(frag.level, []).append(frag.text)

    return {level: sep.join(texts) for level, texts in by_level.items()}
63
+
64
+
65
class Recipe:
    """The internal representation of a recipe"""
    # heading filters, in the order they appear in the recipe
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """
        Argument
          recipe_dict: the parsed recipe; its 'heading' entry holds the
                       filter dictionaries
        Raises
          ValueError if the recipe contains no heading filter
        """
        fltr_dicts = recipe_dict.get('heading', [])

        if len(fltr_dicts) == 0:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from span along with level

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading or None if no match
        Raises
          FoundGreedy if the first filter admitting the span is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()

                if not text:
                    # don't match empty spaces
                    return None

                if fltr.greedy:
                    # propagate all the way back to extract_block
                    raise FoundGreedy(fltr.level)

                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          one entry per span of the line: a fragment, or None for a span
          that matched no filter
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Argument
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: the page number stored in the resulting entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []

        vpos = block.get('bbox', (0, 0))[1]
        # a text block without 'lines' is treated as empty instead of
        # failing on None
        lines = block.get('lines', [])

        try:
            frags = chain.from_iterable([
                self._extract_line(ln) for ln in lines
            ])
            titles = concatFrag(frags)

            return [
                ToCEntry(level, title, page, vpos)
                for level, title in titles.items()
            ]
        except FoundGreedy as e:
            # Smart greedy: merge only the text that matches a greedy filter
            # of the level that fired. Every greedy filter of that level is
            # considered, because the exception carries the level only; the
            # first filter of the level need not be the one that matched.
            greedy_filters = [
                f for f in self.filters if f.greedy and f.level == e.level
            ]

            parts = []
            for ln in lines:
                for spn in ln.get('spans', []):
                    if not any(f.admits(spn) for f in greedy_filters):
                        continue
                    text = spn.get('text', "").strip()
                    if text:
                        # whitespace-only spans would otherwise leave
                        # doubled spaces in the merged title
                        parts.append(text)

            merged_text = " ".join(parts)
            if not merged_text:
                return []
            return [ToCEntry(e.level, merged_text, page, vpos)]
169
+
170
+
171
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Collect the toc entries of every block of every page

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      the entries found by the recipe, in page and block order
    """
    toc = []

    for page in doc.pages():
        blocks = page.get_textpage().extractDICT().get('blocks', [])
        for blk in blocks:
            # page.number + 1 is the page recorded in the entry
            toc += recipe.extract_block(blk, page.number + 1)

    return toc
pdftocgen/pdftocgen/tocgen.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fitz import Document
2
+ from typing import List
3
+ from fitzutils import ToCEntry
4
+ from .recipe import Recipe, extract_toc
5
+
6
def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
    """Build a recipe from its dictionary form and apply it to a document

    Argument
      doc: a pdf document
      recipe_dict: the recipe dictionary used to generate the toc
    Returns
      a list of ToC entries
    """
    recipe = Recipe(recipe_dict)
    return extract_toc(doc, recipe)