adelevett's picture
Upload 76 files
046e3b8 verified
"""Extract metadata for a string in a pdf file"""
from toml.encoder import _dump_str, _dump_float
import re
from fitz import Document, Page
from typing import Optional, List
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Extract meta for a `pattern` on `page` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index), if None is given, search for the
            entire document, but this is highly discouraged.
      ign_case: ignore case?
    Returns
      the span dicts returned by `search_in_page`, each extended with
      'page_index' (`number + 1` of the page it was found on) and
      'page_label' ("" when the label lookup fails); an empty list when
      `page` is out of range
    """
    if page is None:
        pages = doc.pages()
    elif 1 <= page <= doc.page_count:
        pages = [doc[page - 1]]
    else:  # page out of range
        return []

    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

    result = []
    # we could parallelize this, but I don't see a reason
    # to *not* specify a page number
    for p in pages:
        found = search_in_page(regex, p)
        if not found:
            continue

        # The index and label belong to the page, not to the span, so they
        # are looked up once per page instead of once per matched span.
        index = p.number + 1
        try:
            label = p.get_label()
        except Exception:
            # Deliberate best effort: fall back to an empty label if
            # get_label fails due to PyMuPDF version issues
            label = ""

        for s in found:
            s['page_index'] = index
            s['page_label'] = label
        result.extend(found)
    return result
def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Collect the spans on `page` whose text matches `regex`

    The plain text of the page is checked first; if `regex` does not match
    it, the page is skipped without extracting the span dictionary.

    Arguments
      regex: compiled regular expression, applied with `search`
      page: page from pymupdf
    Returns
      the span dicts whose own 'text' matches `regex`, in block, line, span
      order; an empty list when nothing matches

    Note: each span is tested on its own, so a match that only exists across
    several spans passes the page-level check but yields no span here.
    """
    # Cheap pre-filter: pages without any match never reach the dict
    # extraction below.
    if regex.search(page.get_text()) is None:
        return []

    layout = page.get_textpage().extractDICT()
    return [
        span
        for block in layout.get('blocks', [])
        for line in block.get('lines', [])
        for span in line.get('spans', [])
        if regex.search(span.get('text', ""))
    ]
def to_bools(var: int) -> str:
    """Render an int as a lowercase bool string: "false" for 0, else "true\""""
    return "true" if var != 0 else "false"
def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Arguments
      spn: span dict; 'font', 'size', 'color', 'flags' and 'bbox' are
           required, 'page_index' and 'page_label' are written only when
           present
    Returns
      the `key = value` lines joined by newlines, without a trailing newline
    """
    result = []
    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # Quote and escape the label the same way as font.name: a label
        # containing a double quote or a backslash would otherwise produce
        # an invalid TOML string.
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    # "0x" followed by the colour in hex, zero-padded to six digits
    result.append(f"font.color = {spn['color']:#08x}")

    # each font property is one bit of the span's flags
    flags = spn['flags']
    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

    bbox = spn['bbox']
    result.append(f"bbox.left = {_dump_float(bbox[0])}")
    result.append(f"bbox.top = {_dump_float(bbox[1])}")
    result.append(f"bbox.right = {_dump_float(bbox[2])}")
    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")
    return '\n'.join(result)
def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Dump a valid TOML directly usable by pdftocgen

    Only the font name and size are written as active keys; the remaining
    font and bbox properties are written as commented-out lines.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    lines = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
    ]

    # strip font subset prefix: keep what follows the first '+', or the
    # whole name when there is no '+'
    font_name = spn['font'].split('+', 1)[-1]
    lines += [
        f"font.name = {_dump_str(font_name)}",
        f"font.size = {_dump_float(spn['size'])}",
        "# font.size_tolerance = 1e-5",
        f"# font.color = {spn['color']:#08x}",
    ]

    # one flag bit per font property
    flags = spn['flags']
    flag_bits = (
        ("superscript", 0b00001),
        ("italic", 0b00010),
        ("serif", 0b00100),
        ("monospace", 0b01000),
        ("bold", 0b10000),
    )
    lines.extend(
        f"# font.{prop} = {to_bools(flags & bit)}" for prop, bit in flag_bits
    )

    bbox = spn['bbox']
    sides = ("left", "top", "right", "bottom")
    lines.extend(
        f"# bbox.{side} = {_dump_float(bbox[i])}"
        for i, side in enumerate(sides)
    )
    lines.append("# bbox.tolerance = 1e-5")

    if trail_nl:
        lines.append("")
    return '\n'.join(lines)