"""The executable of pdfxmeta"""

import getopt
import sys
import pdfxmeta
import io

from getopt import GetoptError
from typing import Optional, TextIO
from fitzutils import open_pdf
from textwrap import indent
from pdfxmeta import dump_meta, dump_toml, extract_meta

usage_s = """
usage: pdfxmeta [options] doc.pdf [pattern]
""".strip()

help_s = """
usage: pdfxmeta [options] doc.pdf [pattern]

Extract the metadata for pattern in doc.pdf.

To use this command, first open up the pdf file with your
favorite pdf reader and find the text you want to search
for. Then use

    $ pdfxmeta -p 1 in.pdf "Subsection One"

to find the metadata, mainly the font attributes and
bounding box, of lines containing the pattern "Subsection
One" on page 1. Specifying a page number is optional but
highly recommended, since it greatly reduces the ambiguity
of matches and execution time.

The output of this command can be directly copy-pasted to
build a recipe file for pdftocgen. Alternatively, you could
also use the --auto or -a flag to output a valid heading
filter directly

    $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml

where the argument of -a is the level of the heading filter,
which in this case is 2.

arguments
  doc.pdf             path to the input PDF document
  [pattern]           the pattern to search for (python regex). if not
                      given, dump the entire document

options
  -h, --help          show help
  -p, --page=PAGE     specify the page to search for (1-based index)
  -i, --ignore-case   when flag is set, search will be case-insensitive
  -a, --auto=LEVEL    when flag is set, the output would be a valid
                      heading filter of the specified heading level in
                      default settings. it is directly usable by
                      pdftocgen.
  -o, --out=FILE      path to the output file. if this flag is not
                      specified, the default is stdout
  -V, --version       show version number
""".strip()


def print_result(meta: dict) -> str:
    """Format one metadata record for human-readable output.

    Despite the name, nothing is printed here: the formatted string is
    returned and the caller decides where to write it.

    Arguments
      meta: a metadata dict; its 'text' entry (empty string if absent)
            is used as the heading line
    Returns
      the heading line followed by the indented result of dump_meta(meta)
    """
    return f"{meta.get('text', '')}:\n{indent(dump_meta(meta), ' ')}"


def main():
    """Entry point of the pdfxmeta command line tool.

    Parses sys.argv, extracts the metadata matching the optional pattern
    from the input pdf, and writes the result to stdout or to the file
    given by -o/--out.

    Exit status
      0: success, or after printing the help text / version number
      1: invalid option value, output file can't be opened, no input
         pdf given, or nothing matched
      2: malformed command line (unknown option, missing argument)
    """
    # parse arguments
    try:
        opts, args = getopt.gnu_getopt(
            sys.argv[1:],
            "hiVp:a:o:",
            ["help", "ignore-case", "version", "page=", "auto=", "out="]
        )
    except GetoptError as e:
        print(e, file=sys.stderr)
        print(usage_s, file=sys.stderr)
        sys.exit(2)

    ignore_case: bool = False
    page: Optional[int] = None
    auto_level: Optional[int] = None

    # The stdout wrapper is created lazily (only when -o is absent): an
    # unused wrapper would close sys.stdout.buffer when garbage collected.
    out: Optional[TextIO] = None
    # True when `out` is a file opened here, which must be closed by us
    owns_out: bool = False

    try:
        for o, a in opts:
            if o in ("-i", "--ignore-case"):
                ignore_case = True
            elif o in ("-p", "--page"):
                try:
                    page = int(a)
                except ValueError:
                    print("error: invalid page number", file=sys.stderr)
                    sys.exit(1)
            elif o in ("-a", "--auto"):
                try:
                    auto_level = int(a)
                except ValueError:
                    print("error: invalid level", file=sys.stderr)
                    sys.exit(1)
            elif o in ("-o", "--out"):
                # the last -o wins; don't leak a file opened by an earlier one
                if out is not None:
                    out.close()
                    out = None
                    owns_out = False
                try:
                    out = open(a, "w", encoding='utf-8', errors='ignore')
                    owns_out = True
                except IOError as e:
                    print("error: can't open file for writing",
                          file=sys.stderr)
                    print(e, file=sys.stderr)
                    sys.exit(1)
            elif o in ("-V", "--version"):
                print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
                sys.exit()
            elif o in ("-h", "--help"):
                print(help_s, file=sys.stderr)
                sys.exit()

        argc = len(args)

        if argc < 1:
            print("error: no input pdf is given", file=sys.stderr)
            print(usage_s, file=sys.stderr)
            sys.exit(1)

        path_in: str = args[0]
        # an empty pattern means "dump the entire document" (see help_s)
        pattern: str = ""
        if argc >= 2:
            pattern = args[1]

        # done parsing arguments

        if out is None:
            # force utf-8 on stdout regardless of the locale's encoding
            out = io.TextIOWrapper(
                sys.stdout.buffer, encoding='utf-8', errors='ignore'
            )

        with open_pdf(path_in) as doc:
            meta = extract_meta(doc, pattern, page, ignore_case)

            # nothing found
            if not meta:
                sys.exit(1)

            # should we add \n between each output?
            # (only when the output is not an interactive terminal)
            addnl = not out.isatty()

            if auto_level:
                print('\n'.join(
                    [dump_toml(m, auto_level, addnl) for m in meta]
                ), file=out)
            else:
                print('\n'.join(map(print_result, meta)), file=out)
    finally:
        # runs on normal return and on every sys.exit() above
        if out is not None:
            if owns_out:
                out.close()
            else:
                # flush our wrapper but leave the process' stdout open
                out.flush()
                out.detach()