Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / pdfxmeta /pdfxmeta.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

7.02 kB

	"""Extract metadata for a string in a pdf file"""

	from toml.encoder import _dump_str, _dump_float

	import re

	from fitz import Document, Page
	from typing import Optional, List


	def extract_meta(doc: Document,
	                 pattern: str,
	                 page: Optional[int] = None,
	                 ign_case: bool = False
	                 ) -> List[dict]:
	    """Extract meta for a `pattern` on `page` in a pdf document

	    Arguments
	      doc: document from pymupdf
	      pattern: a regular expression pattern
	      page: page number (1-based index), if None is given, search for the
	            entire document, but this is highly discouraged.
	      ign_case: ignore case?
	    Returns
	      a list of the span dicts found by `search_in_page`, each extended
	      with 'page_index' (1-based) and 'page_label' ("" when the label
	      cannot be read). An out-of-range `page` yields an empty list.
	    """
	    if page is None:
	        pages = doc.pages()
	    elif 1 <= page <= doc.page_count:
	        pages = [doc[page - 1]]
	    else:  # page out of range
	        return []

	    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

	    result = []

	    # we could parallelize this, but I don't see a reason
	    # to not specify a page number
	    for p in pages:
	        found = search_in_page(regex, p)
	        if not found:
	            continue

	        # Index and label are per-page values: look them up once per page
	        # instead of once per matching span.
	        page_index = p.number + 1
	        try:
	            page_label = p.get_label()
	        except Exception:
	            # Deliberate best-effort fallback if get_label fails due to
	            # PyMuPDF version issues.
	            page_label = ""

	        for s in found:
	            s['page_index'] = page_index
	            s['page_label'] = page_label
	        result.extend(found)

	    return result


	def _prefilter_is_sound(regex: re.Pattern) -> bool:
	    """Tell whether a whole-page text search may stand in for a span search.

	    Anchors, word boundaries and lookarounds depend on what surrounds the
	    match, so a pattern using them can match a span on its own while
	    failing against the full page text. The check is purely textual and
	    deliberately conservative: an escaped literal such as `\\$` or a
	    negated class such as `[^a]` also disables the prefilter.
	    """
	    context_tokens = ('^', '$', '\\A', '\\Z', '\\b', '\\B',
	                      '(?=', '(?!', '(?<=', '(?<!')
	    return not any(tok in regex.pattern for tok in context_tokens)


	def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
	    """Search for `regex` in `page` and return the matching span dicts

	    Arguments
	      regex: compiled pattern, tested against each span's 'text'
	      page: page from pymupdf
	    Returns
	      the spans (as found in the page's extractDICT() output) whose text
	      matches `regex`, in document order

	    Building the detailed dict is the slow part, so a page whose plain
	    text does not match is skipped up front. That shortcut is only taken
	    for patterns without position-dependent constructs; otherwise a
	    pattern such as `^Chapter 1$` would be rejected by the multi-line
	    page text even though a span matches it exactly.

	    Known limitation: a match whose text is split across several spans
	    is not returned, because every span is tested on its own.
	    """
	    # NOTE(review): the shortcut assumes each span's text appears verbatim
	    # in page.get_text() -- confirm against the PyMuPDF extraction flags.
	    if _prefilter_is_sound(regex) and not regex.search(page.get_text()):
	        return []

	    page_meta = page.get_textpage().extractDICT()

	    result = []
	    for blk in page_meta.get('blocks', []):
	        # blocks without a 'lines' entry contribute nothing
	        for ln in blk.get('lines', []):
	            result.extend(
	                spn for spn in ln.get('spans', [])
	                if regex.search(spn.get('text', ""))
	            )
	    return result


	def to_bools(var: int) -> str:
	    """Render an int as a lowercase bool string: "false" for 0, else "true" """
	    return "true" if var != 0 else "false"


	def dump_meta(spn: dict) -> str:
	    """Dump the span dict from PyMuPDF to TOML compatible string

	    Arguments
	      spn: span dict; 'font', 'size', 'color', 'flags' and 'bbox' are
	           required, 'page_index' and 'page_label' are emitted only when
	           present
	    Returns
	      the `key = value` lines joined by newlines (no trailing newline)
	    """
	    result = []

	    if 'page_index' in spn:
	        result.append(f"page.index = {spn['page_index']}")
	    if 'page_label' in spn:
	        # Quote through _dump_str, like font.name below, so a label that
	        # contains quotes or backslashes still yields a valid TOML string.
	        result.append(f"page.label = {_dump_str(spn['page_label'])}")

	    result.append(f"font.name = {_dump_str(spn['font'])}")
	    result.append(f"font.size = {_dump_float(spn['size'])}")
	    # '#08x' pads to 8 characters including the '0x' prefix
	    result.append(f"font.color = {spn['color']:#08x}")

	    # each font attribute is one bit of the span's 'flags' value
	    flags = spn['flags']

	    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
	    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
	    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
	    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
	    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

	    # bbox is indexed as (left, top, right, bottom)
	    bbox = spn['bbox']

	    result.append(f"bbox.left = {_dump_float(bbox[0])}")
	    result.append(f"bbox.top = {_dump_float(bbox[1])}")
	    result.append(f"bbox.right = {_dump_float(bbox[2])}")
	    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")

	    return '\n'.join(result)


	def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
	    """Build a `[[heading]]` TOML entry directly usable by pdftocgen

	    Only the level, greedy flag, font name and font size are written as
	    active keys; colour, style flags and bounding box are written as
	    commented-out lines.

	    Argument
	      spn: span dict of the heading
	      level: heading level
	      trail_nl: add trailing new line
	    Returns
	      a valid toml string
	    """
	    lines = [
	        "[[heading]]",
	        f"# {spn.get('text', '')}",
	        f"level = {level}",
	        "greedy = true",
	    ]

	    # Strip the font subset prefix: keep what follows the first '+',
	    # or the whole name when there is no '+'.
	    subset_tag, plus, base_name = spn['font'].partition('+')
	    font_name = base_name if plus else subset_tag

	    lines += [
	        f"font.name = {_dump_str(font_name)}",
	        f"font.size = {_dump_float(spn['size'])}",
	        "# font.size_tolerance = 1e-5",
	        f"# font.color = {spn['color']:#08x}",
	    ]

	    # one commented-out line per style bit of the span flags
	    flags = spn['flags']
	    style_bits = (
	        ('superscript', 0b00001),
	        ('italic', 0b00010),
	        ('serif', 0b00100),
	        ('monospace', 0b01000),
	        ('bold', 0b10000),
	    )
	    lines.extend(
	        f"# font.{attr} = {to_bools(flags & mask)}"
	        for attr, mask in style_bits
	    )

	    # bounding box edges, in the order they are indexed in spn['bbox']
	    bbox = spn['bbox']
	    edges = ('left', 'top', 'right', 'bottom')
	    lines.extend(
	        f"# bbox.{edge} = {_dump_float(bbox[idx])}"
	        for idx, edge in enumerate(edges)
	    )
	    lines.append("# bbox.tolerance = 1e-5")

	    if trail_nl:
	        lines.append("")

	    return '\n'.join(lines)