adelevett committed on
Commit
92cc6a2
·
verified ·
1 Parent(s): 9cdd820

Delete pdftocgen/pdftocgen

Browse files
pdftocgen/pdftocgen/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- """Generate table of contents for pdf based on a recipe file"""
2
-
3
- __version__ = '1.3.4'
 
 
 
 
pdftocgen/pdftocgen/__main__.py DELETED
@@ -1,4 +0,0 @@
1
- from .app import main
2
-
3
- if __name__ == '__main__':
4
- main()
 
 
 
 
 
pdftocgen/pdftocgen/app.py DELETED
@@ -1,158 +0,0 @@
1
- """The executable of pdftocgen"""
2
-
3
- import toml
4
- import sys
5
- import getopt
6
- import pdftocgen
7
- import io
8
-
9
- from getopt import GetoptError
10
- from typing import TextIO
11
- from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
12
- from .tocgen import gen_toc
13
-
14
- usage_s = """
15
- usage: pdftocgen [options] doc.pdf < recipe.toml
16
- """.strip()
17
-
18
- help_s = """
19
- usage: pdftocgen [options] doc.pdf < recipe.toml
20
-
21
- Generate PDF table of contents from a recipe file.
22
-
23
- This command automatically generates a table of contents for
24
- doc.pdf based on the font attributes and position of
25
- headings specified in a TOML recipe file. See [1] for an
26
- introduction to recipe files.
27
-
28
- To generate the table of contents for a pdf, use input
29
- redirection or pipes to supply a recipe file
30
-
31
- $ pdftocgen in.pdf < recipe.toml
32
-
33
- or alternatively use the -r flag
34
-
35
- $ pdftocgen -r recipe.toml in.pdf
36
-
37
- The output of this command can be directly piped into
38
- pdftocio to generate a new pdf file using the generated
39
- table of contents
40
-
41
- $ pdftocgen -r recipe.toml in.pdf | pdftocio -o out.pdf in.pdf
42
-
43
- or you could save the output of this command to a file for
44
- further tweaking using output redirection
45
-
46
- $ pdftocgen -r recipe.toml in.pdf > toc
47
-
48
- or the -o flag:
49
-
50
- $ pdftocgen -r recipe.toml -o toc in.pdf
51
-
52
- If you only need a readable format of the table of contents,
53
- use the -H flag
54
-
55
- $ pdftocgen -r recipe.toml -H in.pdf
56
-
57
- This format cannot be parsed by pdftocio, but it is slightly
58
- more readable.
59
-
60
- arguments
61
- doc.pdf path to the input PDF document
62
-
63
- options
64
- -h, --help show help
65
- -r, --recipe=recipe.toml path to the recipe file. if this flag is
66
- not specified, the default is stdin
67
- -H, --human-readable print the toc in a readable format
68
- -v, --vpos if this flag is set, the vertical position
69
- of each heading will be generated in the
70
- output
71
- -o, --out=file path to the output file. if this flag is
72
- not specified, the default is stdout
73
- -g, --debug enable debug mode
74
- -V, --version show version number
75
-
76
- [1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe
77
- """.strip()
78
-
79
-
def main():
    """Entry point of the pdftocgen command line tool

    Parses sys.argv, reads a TOML recipe (from the -r file, or stdin when -r
    is not given), generates the table of contents for the input pdf and
    prints it to the -o file, or stdout when -o is not given.

    Exit status
      2 if the command line options cannot be parsed
      1 on any other error (missing pdf argument, unreadable/unwritable
        file, invalid recipe, interruption)
      0 otherwise, including after -h and -V (both print to stderr)
    """
    # local import so the file's import block does not need to change
    from contextlib import ExitStack

    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hr:Hvo:gV",
            ["help", "recipe=", "human-readable", "vpos", "out=", "debug", "version"]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    def release(stream):
        # Give the underlying std buffer back instead of letting the wrapper
        # close it when it is garbage collected. Best-effort cleanup: a
        # failure here (e.g. broken pipe while flushing) has either been
        # reported already or cannot be acted upon.
        try:
            stream.detach()
        except (ValueError, OSError):
            pass

    # every file opened below is closed when this block is left, including
    # when it is left through sys.exit()
    with ExitStack() as stack:
        recipe_file = None  # None: fall back to stdin
        out = None          # None: fall back to stdout
        readable = False
        vpos = False
        debug = False

        for o, a in opts:
            if o in ("-H", "--human-readable"):
                readable = True
            elif o in ("-v", "--vpos"):
                vpos = True
            elif o in ("-r", "--recipe"):
                try:
                    recipe_file = stack.enter_context(
                        open(a, "r", encoding=get_file_encoding(a))
                    )
                except IOError as e:
                    print("error: can't open file for reading", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
            elif o in ("-o", "--out"):
                try:
                    out = stack.enter_context(
                        open(a, "w", encoding='utf-8', errors='ignore')
                    )
                except IOError as e:
                    print("error: can't open file for writing", file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
            elif o in ("-g", "--debug"):
                debug = True
            elif o in ("-V", "--version"):
                print("pdftocgen", pdftocgen.__version__, file=sys.stderr)
                sys.exit()
            elif o in ("-h", "--help"):
                print(help_s, file=sys.stderr)
                sys.exit()

        if len(args) < 1:
            print("error: no input pdf is given", file=sys.stderr)
            print(usage_s, file=sys.stderr)
            sys.exit(1)

        path_in = args[0]

        # Wrap the std streams only when they are really used: a wrapper that
        # is created and then dropped closes the underlying std stream when
        # it is garbage collected.
        if recipe_file is None:
            recipe_file = io.TextIOWrapper(
                sys.stdin.buffer, encoding='utf-8', errors='ignore'
            )
            stack.callback(release, recipe_file)
        if out is None:
            out = io.TextIOWrapper(
                sys.stdout.buffer, encoding='utf-8', errors='ignore'
            )
            stack.callback(release, out)
        # done parsing arguments

        try:
            with open_pdf(path_in) as doc:
                recipe = toml.load(recipe_file)
                toc = gen_toc(doc, recipe)
                if readable:
                    print(pprint_toc(toc), file=out)
                else:
                    print(dump_toc(toc, vpos), end="", file=out)
            # flush inside the try so that write errors are reported by the
            # handlers below rather than surfacing during cleanup
            out.flush()
        except ValueError as e:
            if debug:
                raise
            print("error:", e, file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            if debug:
                raise
            print("error: unable to open file", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        except KeyboardInterrupt:
            if debug:
                raise
            print("error: interrupted", file=sys.stderr)
            sys.exit(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdftocgen/pdftocgen/filter.py DELETED
@@ -1,161 +0,0 @@
"""Filter on span dictionaries

This module contains the internal representation of heading filters, which are
used to test if a span should be included in the ToC.
"""

import re

from typing import Optional
from re import Pattern

# tolerance used for float comparisons when a recipe does not specify one
DEF_TOLERANCE: float = 1e-5


def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Check if a float should be admitted by a filter

    Arguments
      expect: the value required by the filter, None if the filter does not
              constrain this value
      actual: the value found in the span, None if it is missing
      tolerance: the largest absolute difference that still counts as a match
    Returns
      True if expect is unset, or if actual is present and within tolerance
      of expect
    """
    return (expect is None) or \
           (actual is not None and abs(expect - actual) <= tolerance)


class FontFilter:
    """Filter on font attributes"""
    name: Pattern
    size: Optional[float]
    size_tolerance: float
    color: Optional[int]
    flags: int
    # besides the usual true (1) and false (0), we have another state,
    # unset (x), where the truth table would be
    #   a b diff?
    #   0 0 0
    #   0 1 1
    #   1 0 1
    #   1 1 0
    #   x 0 0
    #   x 1 0
    # it's very inefficient to compare bit by bit, which would take 5 bitwise
    # operations to compare, and then 4 to combine the results, we will use a
    # trick to reduce it to 2 ops.
    # step 1: use XOR to find different bits. if unset, set bit to 0, we will
    #         take care of false positives in the next step
    #   a b a^b
    #   0 0 0
    #   0 1 1
    #   1 0 1
    #   1 1 0
    # step 2: use AND with an ignore mask, (0 for ignored) to eliminate false
    #         positives
    #   a b a&b
    #   0 1 0 <- no diff
    #   0 0 0 <- no diff
    #   1 1 1 <- found difference
    #   1 0 0 <- ignored
    ign_mask: int

    # the bit used for each boolean font attribute in `flags` and `ign_mask`
    _FLAG_BITS = (
        ('superscript', 0b00001),
        ('italic', 0b00010),
        ('serif', 0b00100),
        ('monospace', 0b01000),
        ('bold', 0b10000),
    )

    def __init__(self, font_dict: dict):
        """Build the filter from the font section of a heading filter

        Argument
          font_dict: dictionary that may contain 'name' (a regular
                     expression), 'size', 'size_tolerance', 'color' and the
                     boolean attributes 'superscript', 'italic', 'serif',
                     'monospace' and 'bold'. Missing keys are not checked.
        """
        self.name = re.compile(font_dict.get('name', ""))
        self.size = font_dict.get('size')
        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
        self.color = font_dict.get('color')

        self.flags = 0
        self.ign_mask = 0
        for attr, bit in self._FLAG_BITS:
            if attr in font_dict:
                # the attribute is set in the recipe, so it must be compared
                self.ign_mask |= bit
                # use the truthiness of the value rather than multiplying by
                # it, so that a value such as 2 cannot leak into another bit
                if font_dict[attr]:
                    self.flags |= bit

    def admits(self, spn: dict) -> bool:
        """Check if the font attributes admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current font attribute
        """
        if not self.name.search(spn.get('font', "")):
            return False

        if self.color is not None and self.color != spn.get('color'):
            return False

        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
            return False

        # a span without flags gets the complement of ours, so that every
        # attribute the filter checks is reported as different
        flags = spn.get('flags', ~self.flags)
        # see above for explanation
        return not ((flags ^ self.flags) & self.ign_mask)


class BoundingBoxFilter:
    """Filter on bounding boxes"""
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    tolerance: float

    def __init__(self, bbox_dict: dict):
        """Build the filter from the bbox section of a heading filter

        Argument
          bbox_dict: dictionary that may contain 'left', 'top', 'right',
                     'bottom' and 'tolerance'. Missing sides are not checked.
        """
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admit the span

        Argument
          spn: the span dict to be checked
        Returns
          False if the span doesn't match current bounding box setting
        """
        # a span without a bounding box only passes sides that are unset
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))


class ToCFilter:
    """Filter on span dictionary to pick out headings in the ToC"""
    # The level of the title, strictly > 0
    level: int
    # When set, the filter will be more *greedy* and extract all the text in a
    # block even when at least one match occurs
    greedy: bool
    font: FontFilter
    bbox: BoundingBoxFilter

    def __init__(self, fltr_dict: dict):
        """Build the filter from one heading filter of a recipe

        Argument
          fltr_dict: dictionary with a mandatory 'level' and optional
                     'greedy', 'font' and 'bbox' entries
        Raises
          ValueError if 'level' is missing, is not an integer, or is < 1
        """
        lvl = fltr_dict.get('level')

        if lvl is None:
            raise ValueError("filter's 'level' is not set")
        # bool is a subclass of int, but `level = true` is not a level
        if isinstance(lvl, bool) or not isinstance(lvl, int):
            raise ValueError("filter's 'level' must be an integer")
        if lvl < 1:
            raise ValueError("filter's 'level' must be >= 1")

        self.level = lvl
        self.greedy = fltr_dict.get('greedy', False)
        self.font = FontFilter(fltr_dict.get('font', {}))
        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))

    def admits(self, spn: dict) -> bool:
        """Check if the filter admits the span

        Arguments
          spn: the span dict to be checked
        Returns
          False if the span doesn't match the filter
        """
        return self.font.admits(spn) and self.bbox.admits(spn)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdftocgen/pdftocgen/recipe.py DELETED
@@ -1,188 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional, List, Dict, Iterator
3
- from .filter import ToCFilter
4
- from fitzutils import ToCEntry
5
- from itertools import chain
6
- from collections import defaultdict
7
- from fitz import Document
8
-
9
-
class FoundGreedy(Exception):
    """Signal that a greedy filter has matched; used for short-circuiting.

    Raising this exception separates the handling of greedy filters from the
    normal extraction path, which keeps the typing and the code cleaner, and
    it can also save some unnecessary comparisons.

    Probably similar to call/cc in scheme or longjump in C
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # level of the greedy filter that raised the exception
    level: int

    def __init__(self, level):
        """
        Argument
          level: level of the greedy filter
        """
        # no message is passed on: the level is the only payload
        Exception.__init__(self)
        self.level = level
29
-
30
-
def blk_to_str(blk: dict) -> str:
    """Join the stripped text of every span in a block with single spaces

    Argument
      blk: a block dictionary; missing 'lines', 'spans' or 'text' entries are
           treated as empty
    Returns
      the text of all spans, in order, separated by a space
    """
    pieces = []
    for ln in blk.get('lines', []):
        for span in ln.get('spans', []):
            pieces.append(span.get('text', "").strip())
    return " ".join(pieces)
38
-
39
-
@dataclass
class Fragment:
    """One piece of an extracted heading, tagged with its heading level"""
    # the text of this piece of the heading
    text: str
    # the level of the heading this piece belongs to
    level: int
45
-
46
-
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Concatenate fragments to strings

    Arguments
      frags: the fragments to be joined; None entries are skipped
      sep: the separator placed between fragments of the same level
    Returns
      a dictionary (level -> title) that contains the title for each level.
    """
    # group the text of the fragments by heading level, keeping their order
    by_level = defaultdict(list)
    for piece in frags:
        if piece is None:
            continue
        by_level[piece.level].append(piece.text)

    return {lvl: sep.join(texts) for lvl, texts in by_level.items()}
63
-
64
-
class Recipe:
    """The internal representation of a recipe"""
    # heading filters, tried in this order on every span
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """Build the filters from a recipe dictionary

        Argument
          recipe_dict: the recipe dictionary; its 'heading' entry holds one
                       dictionary per heading filter
        Raises
          ValueError if the recipe has no heading filters
        """
        fltr_dicts = recipe_dict.get('heading', [])

        if not fltr_dicts:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from span along with level

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading or None if no match
        Raises
          FoundGreedy if the first filter admitting the span is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()

                if not text:
                    # don't match empty spaces
                    return None

                if fltr.greedy:
                    # propagate all the way back to extract_block
                    raise FoundGreedy(fltr.level)

                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          a list of fragments concatenated from result in a line
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def _merge_greedy(self, lines: list, level: int) -> str:
        """Merge the text of the spans admitted by the greedy filters of a level

        Arguments
          lines: the line dictionaries of a block
          level: the level reported by FoundGreedy
        Returns
          the stripped, non-empty text of the admitted spans joined by spaces
        """
        # Only the greedy filters of this level are considered: a non-greedy
        # filter that happens to share the level did not trigger the merge.
        greedy = [f for f in self.filters if f.greedy and f.level == level]

        parts = []
        for ln in lines:
            for spn in ln.get('spans', []):
                text = spn.get('text', "").strip()
                # skip empty text so that it cannot add a stray separator
                if text and any(f.admits(spn) for f in greedy):
                    parts.append(text)
        return " ".join(parts)

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Arguments
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: the page number stored in the generated entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []

        vpos = block.get('bbox', (0, 0))[1]
        # a block without lines yields no entries
        lines = block.get('lines') or []

        try:
            frags = chain.from_iterable([
                self._extract_line(ln) for ln in lines
            ])
            titles = concatFrag(frags)
        except FoundGreedy as e:
            # Smart Greedy: only merge text that MATCHES the greedy filter
            merged_text = self._merge_greedy(lines, e.level)
            if merged_text:
                return [ToCEntry(e.level, merged_text, page, vpos)]
            return []

        return [
            ToCEntry(level, title, page, vpos)
            for level, title in titles.items()
        ]
169
-
170
-
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Collect the toc entries of every text block in a document

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      a list of toc entries in the document, in the order the pages and
      their blocks are visited
    """
    entries = []

    for pg in doc.pages():
        blocks = pg.get_textpage().extractDICT().get('blocks', [])
        for block in blocks:
            # entries carry page.number + 1 as their page
            entries += recipe.extract_block(block, pg.number + 1)

    return entries
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdftocgen/pdftocgen/tocgen.py DELETED
@@ -1,15 +0,0 @@
1
- from fitz import Document
2
- from typing import List
3
- from fitzutils import ToCEntry
4
- from .recipe import Recipe, extract_toc
5
-
def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
    """Build a recipe from its dictionary form and run it on a document

    Arguments
      doc: a pdf document
      recipe_dict: the recipe dictionary used to generate the toc
    Returns
      a list of ToC entries
    """
    recipe = Recipe(recipe_dict)
    return extract_toc(doc, recipe)