File size: 5,038 Bytes
d600971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""The executable of pdftocgen"""

import toml
import sys
import getopt
import pdftocgen
import io

from getopt import GetoptError
from typing import TextIO
from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
from .tocgen import gen_toc

# One-line usage summary; main() prints it to stderr after an option
# parsing error or when no input PDF is given. The surrounding blank
# lines of the literal are removed by .strip().
usage_s = """

usage: pdftocgen [options] doc.pdf < recipe.toml

""".strip()

# Full help text; main() prints it to stderr for -h/--help. The leading
# and trailing blank lines of the literal are removed by .strip().
# NOTE(review): the literal has a blank line after every line of text, so
# the help is printed double-spaced -- confirm whether this is intended or
# a leftover of a line-ending conversion.
help_s = """

usage: pdftocgen [options] doc.pdf < recipe.toml



Generate PDF table of contents from a recipe file.



This command automatically generates a table of contents for

doc.pdf based on the font attributes and position of

headings specified in a TOML recipe file. See [1] for an

introduction to recipe files.



To generate the table of contents for a pdf, use input

redirection or pipes to supply a recipe file



    $ pdftocgen in.pdf < recipe.toml



or alternatively use the -r flag



    $ pdftocgen -r recipe.toml in.pdf



The output of this command can be directly piped into

pdftocio to generate a new pdf file using the generated

table of contents



    $ pdftocgen -r recipe.toml in.pdf | pdftocio -o out.pdf in.pdf



or you could save the output of this command to a file for

further tweaking using output redirection



    $ pdftocgen -r recipe.toml in.pdf > toc



or the -o flag:



    $ pdftocgen -r recipe.toml -o toc in.pdf



If you only need a readable format of the table of contents,

use the -H flag



    $ pdftocgen -r recipe.toml -H in.pdf



This format cannot be parsed by pdftocio, but it is slightly

more readable.



arguments

  doc.pdf                   path to the input PDF document



options

  -h, --help                show help

  -r, --recipe=recipe.toml  path to the recipe file. if this flag is

                            not specified, the default is stdin

  -H, --human-readable      print the toc in a readable format

  -v, --vpos                if this flag is set, the vertical position

                            of each heading will be generated in the

                            output

  -o, --out=file            path to the output file. if this flag is

                            not specified, the default is stdout

  -g, --debug               enable debug mode

  -V, --version             show version number



[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe

""".strip()


def main():
    """Run the pdftocgen command-line interface.

    Options are read from ``sys.argv``. The recipe is read from the file
    given with -r/--recipe, or from stdin when the flag is absent; the
    generated table of contents is written to the file given with
    -o/--out, or to stdout when the flag is absent.

    Exit status: 2 when the options cannot be parsed, 1 for every other
    reported error, 0 after printing the help text or the version number.
    With -g/--debug, a ValueError, IOError or KeyboardInterrupt raised
    while generating the table of contents is re-raised (with its
    traceback) instead of being reduced to a short message.
    """
    # Local import: keeps the file's import block untouched.
    from contextlib import ExitStack

    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hr:Hvo:gV",
            ["help", "recipe=", "human-readable", "vpos", "out=", "debug", "version"]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    # Only the paths are recorded while scanning the options; the streams
    # are opened afterwards so that a repeated -r/-o does not leak (or
    # needlessly truncate) the file named by the earlier occurrence.
    recipe_path = None  # None: read the recipe from stdin
    out_path = None     # None: write the result to stdout
    readable: bool = False
    vpos: bool = False
    debug: bool = False

    for o, a in opts:
        if o in ("-H", "--human-readable"):
            readable = True
        elif o in ("-v", "--vpos"):
            vpos = True
        elif o in ("-r", "--recipe"):
            recipe_path = a
        elif o in ("-o", "--out"):
            out_path = a
        elif o in ("-g", "--debug"):
            debug = True
        elif o in ("-V", "--version"):
            print("pdftocgen", pdftocgen.__version__, file=sys.stderr)
            sys.exit()
        elif o in ("-h", "--help"):
            print(help_s, file=sys.stderr)
            sys.exit()

    if len(args) < 1:
        print("error: no input pdf is given", file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(1)

    path_in: str = args[0]
    # done parsing arguments

    # The ExitStack releases every stream on all exit paths, including
    # the sys.exit() calls below.
    with ExitStack() as stack:
        recipe_file: TextIO
        if recipe_path is None:
            recipe_file = io.TextIOWrapper(
                sys.stdin.buffer, encoding='utf-8', errors='ignore'
            )
            # Detach instead of close: closing (or garbage-collecting) the
            # wrapper would also close the process-wide sys.stdin.buffer.
            stack.callback(recipe_file.detach)
        else:
            try:
                recipe_file = stack.enter_context(
                    open(recipe_path, "r", encoding=get_file_encoding(recipe_path))
                )
            except IOError as e:
                print("error: can't open file for reading", file=sys.stderr)
                print(e, file=sys.stderr)
                sys.exit(1)

        # The output is opened before any work is done so that an
        # unwritable path is reported right away.
        out: TextIO
        if out_path is None:
            out = io.TextIOWrapper(
                sys.stdout.buffer, encoding='utf-8', errors='ignore'
            )
            # detach() flushes pending text and leaves sys.stdout.buffer open.
            stack.callback(out.detach)
        else:
            try:
                out = stack.enter_context(
                    open(out_path, "w", encoding='utf-8', errors='ignore')
                )
            except IOError as e:
                print("error: can't open file for writing", file=sys.stderr)
                print(e, file=sys.stderr)
                sys.exit(1)

        try:
            with open_pdf(path_in) as doc:
                recipe = toml.load(recipe_file)
                toc = gen_toc(doc, recipe)
                if readable:
                    print(pprint_toc(toc), file=out)
                else:
                    print(dump_toc(toc, vpos), end="", file=out)
            # Flush here so that a write failure is reported by the
            # handlers below rather than surfacing during cleanup.
            out.flush()
        except ValueError as e:
            if debug:
                raise
            print("error:", e, file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            if debug:
                raise
            print("error: unable to open file", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        except KeyboardInterrupt:
            if debug:
                raise
            print("error: interrupted", file=sys.stderr)
            sys.exit(1)