Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / pdfxmeta /app.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

4.66 kB

	"""The executable of pdfxmeta"""

	import getopt
	import sys
	import pdfxmeta
	import io

	from getopt import GetoptError
	from typing import Optional, TextIO
	from fitzutils import open_pdf
	from textwrap import indent
	from pdfxmeta import dump_meta, dump_toml, extract_meta


	# One-line synopsis; main() prints it to stderr when the command line is
	# rejected by getopt or when no input PDF is given.
	usage_s = """
	usage: pdfxmeta [options] doc.pdf [pattern]
	""".strip()

	# Full help text; main() prints it to stderr for -h/--help and then exits.
	help_s = """
	usage: pdfxmeta [options] doc.pdf [pattern]

	Extract the metadata for pattern in doc.pdf.

	To use this command, first open up the pdf file with your
	favorite pdf reader and find the text you want to search
	for. Then use

	$ pdfxmeta -p 1 in.pdf "Subsection One"

	to find the metadata, mainly the font attributes and
	bounding box, of lines containing the pattern "Subsection
	One" on page 1. Specifying a page number is optional but
	highly recommended, since it greatly reduces the ambiguity
	of matches and execution time.

	The output of this command can be directly copy-pasted to
	build a recipe file for pdftocgen. Alternatively, you could
	also use the --auto or -a flag to output a valid heading
	filter directly

	$ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml

	where the argument of -a is the level of the heading filter,
	which in this case is 2.

	arguments
	doc.pdf path to the input PDF document
	[pattern] the pattern to search for (python regex). if not
	given, dump the entire document

	options
	-h, --help show help
	-p, --page=PAGE specify the page to search for (1-based index)
	-i, --ignore-case when flag is set, search will be case-insensitive
	-a, --auto=LEVEL when flag is set, the output would be a valid
	heading filter of the specified heading level in
	default settings. it is directly usable by
	pdftocgen.
	-o, --out=FILE path to the output file. if this flag is not
	specified, the default is stdout
	-V, --version show version number
	""".strip()


	def print_result(meta: dict) -> str:
	    """Format one metadata record as a heading line followed by its dump.

	    The heading is the record's ``'text'`` entry (empty string when the key
	    is absent) followed by a colon. The body is the output of
	    ``dump_meta(meta)`` with every line prefixed through ``textwrap.indent``.

	    Despite the name, nothing is printed: the formatted string is returned
	    and the caller decides where to write it.
	    """
	    heading = meta.get('text', '')
	    body = indent(dump_meta(meta), ' ')
	    return f"{heading}:\n{body}"


	def main():
	    """Run the pdfxmeta command line: parse options, search the PDF, print matches.

	    Options and positional arguments are read from ``sys.argv``. The first
	    positional argument is the path of the input PDF; the optional second
	    one is the search pattern (an empty string when omitted). Results are
	    written to the file given by ``-o/--out`` or, when that option is
	    absent, to stdout through a UTF-8 text wrapper.

	    The output stream is always released before returning or exiting: a
	    file opened for ``-o`` is closed, and the stdout wrapper is flushed and
	    detached so that ``sys.stdout.buffer`` is left open for the rest of the
	    process.

	    Exit status (raised through ``sys.exit``):
	        2 -- the option parser rejected the command line
	        1 -- invalid page or level value, the output file could not be
	             opened, no input PDF was given, or nothing matched
	        0 -- ``-h/--help`` or ``-V/--version`` was handled
	    """
	    # parse arguments
	    try:
	        opts, args = getopt.gnu_getopt(
	            sys.argv[1:],
	            "hiVp:a:o:",
	            ["help", "ignore-case", "version", "page=", "auto=", "out="]
	        )
	    except GetoptError as e:
	        print(e, file=sys.stderr)
	        print(usage_s, file=sys.stderr)
	        sys.exit(2)

	    ignore_case: bool = False
	    page: Optional[int] = None
	    auto_level: Optional[int] = None
	    # file opened for -o/--out; stays None when the output goes to stdout
	    out_file: Optional[TextIO] = None
	    # UTF-8 wrapper around stdout; only created when no -o was given, so
	    # that a discarded wrapper can never close sys.stdout.buffer
	    stdout_wrapper: Optional[io.TextIOWrapper] = None

	    try:
	        for o, a in opts:
	            if o in ("-i", "--ignore-case"):
	                ignore_case = True
	            elif o in ("-p", "--page"):
	                try:
	                    page = int(a)
	                except ValueError:
	                    print("error: invalid page number", file=sys.stderr)
	                    sys.exit(1)
	            elif o in ("-a", "--auto"):
	                try:
	                    auto_level = int(a)
	                except ValueError:
	                    print("error: invalid level", file=sys.stderr)
	                    sys.exit(1)
	            elif o in ("-o", "--out"):
	                if out_file is not None:
	                    # -o was repeated: the last one wins, so release the
	                    # handle opened for the earlier occurrence
	                    out_file.close()
	                    out_file = None
	                try:
	                    out_file = open(a, "w", encoding='utf-8', errors='ignore')
	                except IOError as e:
	                    print("error: can't open file for writing", file=sys.stderr)
	                    print(e, file=sys.stderr)
	                    sys.exit(1)
	            elif o in ("-V", "--version"):
	                print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
	                sys.exit()
	            elif o in ("-h", "--help"):
	                print(help_s, file=sys.stderr)
	                sys.exit()

	        argc = len(args)

	        if argc < 1:
	            print("error: no input pdf is given", file=sys.stderr)
	            print(usage_s, file=sys.stderr)
	            sys.exit(1)

	        path_in: str = args[0]
	        pattern: str = ""

	        if argc >= 2:
	            pattern = args[1]

	        # done parsing arguments

	        out: TextIO
	        if out_file is not None:
	            out = out_file
	        else:
	            stdout_wrapper = io.TextIOWrapper(
	                sys.stdout.buffer, encoding='utf-8', errors='ignore'
	            )
	            out = stdout_wrapper

	        with open_pdf(path_in) as doc:
	            meta = extract_meta(doc, pattern, page, ignore_case)

	            # nothing found
	            if len(meta) == 0:
	                sys.exit(1)

	            # should we add \n between each output?
	            addnl = not out.isatty()

	            # a level of 0 is falsy and therefore selects the plain format
	            if auto_level:
	                print('\n'.join(
	                    [dump_toml(m, auto_level, addnl) for m in meta]
	                ), file=out)
	            else:
	                print('\n'.join(map(print_result, meta)), file=out)
	    finally:
	        # runs on normal return and on every sys.exit() above
	        if out_file is not None:
	            out_file.close()
	        if stdout_wrapper is not None:
	            # push the buffered text out, then detach so that finalizing
	            # the wrapper does not close the process-wide stdout buffer
	            stdout_wrapper.flush()
	            stdout_wrapper.detach()