"""Extract metadata for a string in a pdf file"""

import re
from typing import List, Optional

from fitz import Document, Page
from toml.encoder import _dump_float, _dump_str

# (name, bit mask) pairs for the span `flags` field, in output order.
_FONT_FLAGS = (
    ("superscript", 0b00001),
    ("italic", 0b00010),
    ("serif", 0b00100),
    ("monospace", 0b01000),
    ("bold", 0b10000),
)

# Names of the four bbox coordinates, in the order they are stored.
_BBOX_SIDES = ("left", "top", "right", "bottom")


def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Extract meta for a `pattern` on `page` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index), if None is given, search for the
            entire document, but this is highly discouraged.
      ign_case: ignore case?
    Returns
      a list of span dicts (see `search_in_page`), each extended with
      'page_index' (1-based) and 'page_label' keys. The list is empty if
      `page` is out of range or nothing matches.
    """
    if page is None:
        pages = doc.pages()
    elif 1 <= page <= doc.page_count:
        pages = [doc[page - 1]]
    else:
        # page out of range
        return []

    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

    result = []
    # we could parallelize this, but I don't see a reason
    # to *not* specify a page number
    for p in pages:
        found = search_in_page(regex, p)
        if not found:
            continue

        # index and label are per-page values; compute them once instead of
        # once per matching span
        page_index = p.number + 1
        try:
            page_label = p.get_label()
        except Exception:
            # Fallback if get_label fails due to PyMuPDF version issues
            page_label = ""

        for s in found:
            s['page_index'] = page_index
            s['page_label'] = page_label
        result.extend(found)

    return result


def _page_may_match(regex: re.Pattern, page_text: str) -> bool:
    """Cheap check whether any span of a page could match `regex`

    The pattern is tried on the whole text first and then on every line,
    so that anchored patterns such as `^Chapter 1$` (compiled without
    re.MULTILINE) are not rejected just because the heading is not the very
    first or last thing on the page.
    """
    if regex.search(page_text):
        return True
    return any(regex.search(line) for line in page_text.splitlines())


def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and return the spans that match

    The plain text of the page is checked first, and the span dictionary is
    only built for pages that pass this check, so pages without a match are
    skipped early. Both steps read from the same text page object, so the
    page is extracted only once and the two steps see the same text.

    Limitations
      - a span is only reported if the pattern matches within that single
        span; a match that is split across several spans is not reported.
      - the plain text check works on whole lines, so an anchored pattern
        that only matches a span making up *part* of a line can still be
        missed.

    Arguments
      regex: a compiled regular expression
      page: page from pymupdf
    Returns
      a list of span dicts whose 'text' matches `regex`
    """
    textpage = page.get_textpage()
    if not _page_may_match(regex, textpage.extractText()):
        return []

    result = []
    page_meta = textpage.extractDICT()
    for blk in page_meta.get('blocks', []):
        for ln in blk.get('lines', []):
            for spn in ln.get('spans', []):
                if regex.search(spn.get('text', "")):
                    result.append(spn)
    return result


def to_bools(var: int) -> str:
    """Convert int to lowercase bool string ("true" if nonzero)"""
    return str(var != 0).lower()


def _flag_lines(flags: int, prefix: str = "") -> List[str]:
    """Format every font flag as a `font.<name> = <bool>` line"""
    return [
        f"{prefix}font.{name} = {to_bools(flags & mask)}"
        for name, mask in _FONT_FLAGS
    ]


def _bbox_lines(bbox, prefix: str = "") -> List[str]:
    """Format the bounding box as `bbox.<side> = <float>` lines"""
    return [
        f"{prefix}bbox.{side} = {_dump_float(value)}"
        for side, value in zip(_BBOX_SIDES, bbox)
    ]


def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Arguments
      spn: span dict with 'font', 'size', 'color', 'flags' and 'bbox' keys,
           and optionally 'page_index' and 'page_label'
    Returns
      the key/value lines joined by newlines
    """
    result = []

    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # labels are arbitrary text, so they have to be escaped like any
        # other string or a quote/backslash would produce invalid TOML
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    result.append(f"font.color = {spn['color']:#08x}")
    result.extend(_flag_lines(spn['flags']))
    result.extend(_bbox_lines(spn['bbox']))

    return '\n'.join(result)


def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Dump a valid TOML directly usable by pdftocgen

    Only the font name and size are emitted as active filters; the color,
    flags and bounding box are emitted as comments.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    # strip font subset prefix
    # == takeWhile (\c -> c /= '+') str
    before, sep, after = spn['font'].partition('+')
    font = after if sep else before

    result = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
        f"font.name = {_dump_str(font)}",
        f"font.size = {_dump_float(spn['size'])}",
        "# font.size_tolerance = 1e-5",
        f"# font.color = {spn['color']:#08x}",
    ]
    result.extend(_flag_lines(spn['flags'], prefix="# "))
    result.extend(_bbox_lines(spn['bbox'], prefix="# "))
    result.append("# bbox.tolerance = 1e-5")

    if trail_nl:
        result.append("")

    return '\n'.join(result)