Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

pdf.tocgen.split

File size: 7,015 Bytes

046e3b8

"""Extract metadata for a string in a pdf file"""

from toml.encoder import _dump_str, _dump_float

import re

from fitz import Document, Page
from typing import Optional, List


def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Extract meta for a `pattern` on `page` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index), if None is given, search for the
            entire document, but this is highly discouraged.
      ign_case: ignore case?
    Returns
      a list of the span dicts found by `search_in_page`, each extended with
      'page_index' (1-based) and 'page_label'; the list is empty when `page`
      is out of range
    """
    if page is None:
        pages = doc.pages()
    elif 1 <= page <= doc.page_count:
        pages = [doc[page - 1]]
    else:  # page out of range
        return []

    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

    result = []

    # we could parallelize this, but I don't see a reason
    # to *not* specify a page number
    for p in pages:
        found = search_in_page(regex, p)
        if not found:
            continue

        # index and label belong to the page, not to the span, so they are
        # looked up once per page instead of once per match
        page_index = p.number + 1
        try:
            page_label = p.get_label()
        except Exception:
            # Fallback if get_label fails due to PyMuPDF version issues
            page_label = ""

        for s in found:
            s['page_index'] = page_index
            s['page_label'] = page_label
        result.extend(found)

    return result


def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and collect the spans whose text matches

    Every span is tested on its own, so a piece of text that is split across
    several spans is not found as a whole.

    Arguments
      regex: compiled pattern to look for
      page: page from pymupdf
    Returns
      a list of the span dicts (taken from the page's extractDICT output)
      whose 'text' matches `regex`
    """
    # Fast path: skip the costly per-span extraction when a literal pattern
    # does not occur anywhere in the plain text of the page.
    # This shortcut is only taken for literals. A pattern with anchors or
    # other context assertions (`^Chapter`, `e$`, `\bcat\b`, ...) can match a
    # single span and still fail on the text of the whole page, so using the
    # shortcut for it would drop real matches.
    # NOTE(review): assumes the plain text holds every span's text verbatim
    # -- confirm against the PyMuPDF extraction flags in use.
    if _is_literal_pattern(regex) and not regex.search(page.get_text()):
        return []

    result = []

    page_meta = page.get_textpage().extractDICT()

    # blocks without 'lines' (and lines without 'spans') are simply skipped
    for blk in page_meta.get('blocks', []):
        for ln in blk.get('lines', []):
            for spn in ln.get('spans', []):
                if regex.search(spn.get('text', "")):
                    result.append(spn)
    return result


# Characters that carry special meaning in a regular expression; a pattern
# without any of them can only match its own text.
_REGEX_SPECIAL_CHARS = frozenset(".^$*+?{}[]()|\\")


def _is_literal_pattern(regex: re.Pattern) -> bool:
    """Tell whether the pattern of `regex` contains no regex syntax at all"""
    pattern = regex.pattern
    # bytes patterns are not inspected, and in verbose mode whitespace and
    # '#' are syntax as well, so both are treated as non-literal
    if not isinstance(pattern, str) or regex.flags & re.VERBOSE:
        return False
    return _REGEX_SPECIAL_CHARS.isdisjoint(pattern)


def to_bools(var: int) -> str:
    """Spell `var` as a lowercase boolean: 'false' for 0, 'true' otherwise"""
    return "true" if var != 0 else "false"


def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Argument
      spn: span dict; 'font', 'size', 'color', 'flags' and 'bbox' are
           required, 'page_index' and 'page_label' are dumped when present
    Returns
      one `key = value` line per property, joined by newlines
    """
    result = []

    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # go through the same string encoder as font.name, so that a label
        # holding a quote or a backslash does not break the TOML output
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    # '#08x' pads to 8 characters including the '0x' prefix
    result.append(f"font.color = {spn['color']:#08x}")

    # each font property is one bit of the flags integer
    flags = spn['flags']

    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

    bbox = spn['bbox']

    result.append(f"bbox.left = {_dump_float(bbox[0])}")
    result.append(f"bbox.top = {_dump_float(bbox[1])}")
    result.append(f"bbox.right = {_dump_float(bbox[2])}")
    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")

    return '\n'.join(result)


def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Dump a valid TOML directly usable by pdftocgen

    Only the heading level, the greedy switch, the font name and the font
    size are written as active keys; every other property of the span is
    written as a comment line.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    # strip font subset prefix: keep what follows the first '+',
    # or the whole name when it has no '+'
    font = spn['font'].split('+', 1)[-1]

    lines = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
        f"font.name = {_dump_str(font)}",
        f"font.size = {_dump_float(spn['size'])}",
        "# font.size_tolerance = 1e-5",
        f"# font.color = {spn['color']:#08x}",
    ]

    # font properties are single bits of the flags integer, lowest bit first
    flags = spn['flags']
    flag_names = ("superscript", "italic", "serif", "monospace", "bold")
    for bit, name in enumerate(flag_names):
        lines.append(f"# font.{name} = {to_bools(flags & (1 << bit))}")

    # bbox is indexed in the order left, top, right, bottom
    bbox = spn['bbox']
    for idx, side in enumerate(("left", "top", "right", "bottom")):
        lines.append(f"# bbox.{side} = {_dump_float(bbox[idx])}")
    lines.append("# bbox.tolerance = 1e-5")

    if trail_nl:
        lines.append("")

    return '\n'.join(lines)