Spaces:
Sleeping
Sleeping
File size: 4,661 Bytes
046e3b8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | """The executable of pdfxmeta"""
import getopt
import sys
import pdfxmeta
import io
from getopt import GetoptError
from typing import Optional, TextIO
from fitzutils import open_pdf
from textwrap import indent
from pdfxmeta import dump_meta, dump_toml, extract_meta
# One-line synopsis of the command line. main() prints it to stderr after
# an argument error. .strip() drops the newlines that surround the text
# inside the triple-quoted literal.
usage_s = """
usage: pdfxmeta [options] doc.pdf [pattern]
""".strip()
# Full help text. main() prints it to stderr when -h/--help is given.
# .strip() drops the newlines that surround the text inside the
# triple-quoted literal.
help_s = """
usage: pdfxmeta [options] doc.pdf [pattern]
Extract the metadata for pattern in doc.pdf.
To use this command, first open up the pdf file with your
favorite pdf reader and find the text you want to search
for. Then use
$ pdfxmeta -p 1 in.pdf "Subsection One"
to find the metadata, mainly the font attributes and
bounding box, of lines containing the pattern "Subsection
One" on page 1. Specifying a page number is optional but
highly recommended, since it greatly reduces the ambiguity
of matches and execution time.
The output of this command can be directly copy-pasted to
build a recipe file for pdftocgen. Alternatively, you could
also use the --auto or -a flag to output a valid heading
filter directly
$ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml
where the argument of -a is the level of the heading filter,
which in this case is 2.
arguments
doc.pdf path to the input PDF document
[pattern] the pattern to search for (python regex). if not
given, dump the entire document
options
-h, --help show help
-p, --page=PAGE specify the page to search for (1-based index)
-i, --ignore-case when flag is set, search will be case-insensitive
-a, --auto=LEVEL when flag is set, the output would be a valid
heading filter of the specified heading level in
default settings. it is directly usable by
pdftocgen.
-o, --out=FILE path to the output file. if this flag is not
specified, the default is stdout
-V, --version show version number
""".strip()
def print_result(meta: dict) -> str:
    """Format one match for display.

    The result is the value stored under ``'text'`` (empty string when the
    key is absent), a colon and a newline, followed by the output of
    ``dump_meta(meta)`` with every line indented.
    """
    heading = meta.get('text', '')
    details = indent(dump_meta(meta), ' ')
    return f"{heading}:\n{details}"
def main():
    """Command-line entry point of pdfxmeta.

    Parses ``sys.argv``, extracts the metadata of every match of the
    optional pattern in the input PDF and prints it either to stdout or to
    the file named by ``-o``/``--out``.

    The process is terminated through ``sys.exit`` with status

    * 2 when getopt rejects the command line,
    * 1 on an invalid ``--page``/``--auto`` value, an output file that
      cannot be opened, a missing input path, or when nothing matched,
    * 0 after ``--help`` or ``--version``.

    On success the function simply returns.
    """
    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hiVp:a:o:",
            ["help", "ignore-case", "version", "page=", "auto=", "out="]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    ignore_case: bool = False
    page: Optional[int] = None
    auto_level: Optional[int] = None
    # Only remember the requested path here. The file is opened after every
    # argument has been validated, so a usage error never truncates it.
    out_path: Optional[str] = None

    for o, a in opts:
        if o in ("-i", "--ignore-case"):
            ignore_case = True
        elif o in ("-p", "--page"):
            try:
                page = int(a)
            except ValueError:
                print("error: invalid page number", file=sys.stderr)
                sys.exit(1)
        elif o in ("-a", "--auto"):
            try:
                auto_level = int(a)
            except ValueError:
                print("error: invalid level", file=sys.stderr)
                sys.exit(1)
        elif o in ("-o", "--out"):
            out_path = a
        elif o in ("-V", "--version"):
            print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
            sys.exit()
        elif o in ("-h", "--help"):
            print(help_s, file=sys.stderr)
            sys.exit()

    argc = len(args)

    if argc < 1:
        print("error: no input pdf is given", file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(1)

    path_in: str = args[0]
    pattern: str = ""
    if argc >= 2:
        pattern = args[1]
    # done parsing arguments

    # Pick the output sink. The stdout wrapper is created only when it is
    # actually used: a discarded TextIOWrapper closes the underlying
    # sys.stdout.buffer when it is finalized.
    if out_path is None:
        out: TextIO = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', errors='ignore'
        )
    else:
        try:
            out = open(out_path, "w", encoding='utf-8', errors='ignore')
        except IOError as e:
            print("error: can't open file for writing", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)

    try:
        with open_pdf(path_in) as doc:
            meta = extract_meta(doc, pattern, page, ignore_case)

            # nothing found
            if len(meta) == 0:
                sys.exit(1)

            # should we add \n between each output?
            addnl = not out.isatty()

            # A level of 0 is falsy, so it takes the plain dump branch
            # just like an absent --auto option.
            if auto_level:
                print('\n'.join(
                    [dump_toml(m, auto_level, addnl) for m in meta]
                ), file=out)
            else:
                print('\n'.join(map(print_result, meta)), file=out)
    finally:
        # Close (and thereby flush) the file we opened ourselves, also when
        # leaving through sys.exit(); the stdout wrapper is left untouched.
        if out_path is not None:
            out.close()
|