File size: 4,661 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""The executable of pdfxmeta"""

import getopt
import sys
import pdfxmeta
import io

from getopt import GetoptError
from typing import Optional, TextIO
from fitzutils import open_pdf
from textwrap import indent
from pdfxmeta import dump_meta, dump_toml, extract_meta


# one-line synopsis, printed to stderr when the command line is malformed
usage_s = "usage: pdfxmeta [options] doc.pdf [pattern]"

# full help text shown for -h/--help; paragraphs are separated by exactly
# one blank line and wrapped lines within a paragraph stay adjacent
help_s = """
usage: pdfxmeta [options] doc.pdf [pattern]

Extract the metadata for pattern in doc.pdf.

To use this command, first open up the pdf file with your
favorite pdf reader and find the text you want to search
for. Then use

    $ pdfxmeta -p 1 in.pdf "Subsection One"

to find the metadata, mainly the font attributes and
bounding box, of lines containing the pattern "Subsection
One" on page 1. Specifying a page number is optional but
highly recommended, since it greatly reduces the ambiguity
of matches and execution time.

The output of this command can be directly copy-pasted to
build a recipe file for pdftocgen. Alternatively, you could
also use the --auto or -a flag to output a valid heading
filter directly

    $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml

where the argument of -a is the level of the heading filter,
which in this case is 2.

arguments
  doc.pdf            path to the input PDF document
  [pattern]          the pattern to search for (python regex). if not
                     given, dump the entire document

options
  -h, --help         show help
  -p, --page=PAGE    specify the page to search for (1-based index)
  -i, --ignore-case  when flag is set, search will be case-insensitive
  -a, --auto=LEVEL   when flag is set, the output would be a valid
                     heading filter of the specified heading level in
                     default settings. it is directly usable by
                     pdftocgen.
  -o, --out=FILE     path to the output file. if this flag is not
                     specified, the default is stdout
  -V, --version      show version number
""".strip()


def print_result(meta: dict) -> str:
    """Render one metadata record as a heading line plus an indented dump.

    The heading is the record's 'text' entry (empty when the key is
    missing) followed by a colon. Below it comes the output of
    dump_meta(meta), passed through textwrap.indent with four spaces.
    """
    heading = meta.get('text', '')
    details = indent(dump_meta(meta), '    ')
    return f"{heading}:\n{details}"


def main():
    """Command-line entry point of pdfxmeta.

    Parses ``sys.argv``, hands the input PDF together with the optional
    pattern, page and case flag to ``extract_meta``, and writes the
    formatted result to stdout or to the file given with ``-o``.

    Exit status: 2 for an unrecognised option; 1 for an invalid page or
    level, a missing input path, an output file that cannot be opened, or
    when nothing was found; 0 after printing the help or version text.
    """
    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hiVp:a:o:",
            ["help", "ignore-case", "version", "page=", "auto=", "out="]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    ignore_case: bool = False
    page: Optional[int] = None
    auto_level: Optional[int] = None
    # only the path is recorded while parsing; the file is opened after all
    # arguments are validated so an early exit can neither leak the handle
    # nor truncate the file
    out_path: Optional[str] = None

    for o, a in opts:
        if o in ("-i", "--ignore-case"):
            ignore_case = True
        elif o in ("-p", "--page"):
            try:
                page = int(a)
            except ValueError:
                print("error: invalid page number", file=sys.stderr)
                sys.exit(1)
        elif o in ("-a", "--auto"):
            try:
                auto_level = int(a)
            except ValueError:
                print("error: invalid level", file=sys.stderr)
                sys.exit(1)
        elif o in ("-o", "--out"):
            out_path = a
        elif o in ("-V", "--version"):
            print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
            sys.exit()
        elif o in ("-h", "--help"):
            print(help_s, file=sys.stderr)
            sys.exit()

    argc = len(args)

    if argc < 1:
        print("error: no input pdf is given", file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(1)

    path_in: str = args[0]
    pattern: str = ""

    if argc >= 2:
        pattern = args[1]

    # done parsing arguments

    out: TextIO
    if out_path is None:
        # re-wrap stdout so the output is always utf-8, whatever the locale
        out = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', errors='ignore'
        )
    else:
        try:
            out = open(out_path, "w", encoding='utf-8', errors='ignore')
        except IOError as e:
            print("error: can't open file for writing", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)

    try:
        with open_pdf(path_in) as doc:
            meta = extract_meta(doc, pattern, page, ignore_case)

            # nothing found
            if len(meta) == 0:
                sys.exit(1)

            # should we add \n between each output?
            addnl = not out.isatty()

            # a level of 0 is falsy and therefore treated like "not given"
            if auto_level:
                print('\n'.join(
                    [dump_toml(m, auto_level, addnl) for m in meta]
                ), file=out)
            else:
                print('\n'.join(map(print_result, meta)), file=out)
    finally:
        # runs on sys.exit() as well, so the output is always released
        if out_path is None:
            # push buffered text out, then detach so that dropping the
            # wrapper does not close the process-wide stdout buffer
            out.flush()
            out.detach()
        else:
            out.close()