Spaces:
Sleeping
Sleeping
| """Extract metadata for a string in a pdf file""" | |
| from toml.encoder import _dump_str, _dump_float | |
| import re | |
| from fitz import Document, Page | |
| from typing import Optional, List | |
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Extract meta for a `pattern` on `page` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index), if None is given, search for the
            entire document, but this is highly discouraged.
      ign_case: ignore case?
    Returns
      a list of the span dicts returned by `search_in_page`, each extended
      with 'page_index' (1-based) and 'page_label'; the list is empty when
      `page` is out of range
    """
    if page is None:
        pages = doc.pages()
    elif 1 <= page <= doc.page_count:
        pages = [doc[page - 1]]
    else:  # page out of range
        return []

    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

    result = []
    # we could parallelize this, but I don't see a reason
    # to *not* specify a page number
    for p in pages:
        found = search_in_page(regex, p)
        if not found:
            continue

        # The index and label belong to the page, not to the span, so they
        # are looked up once per page instead of once per matched span.
        page_index = p.number + 1
        try:
            page_label = p.get_label()
        except Exception:
            # Best-effort fallback if get_label fails (e.g. due to PyMuPDF
            # version issues): a missing label must not abort the search.
            page_label = ""

        for s in found:
            s['page_index'] = page_index
            s['page_label'] = page_label
        result.extend(found)
    return result
# Characters that give a pattern regex meaning beyond a plain substring.
_REGEX_SPECIAL_CHARS = frozenset(r"\.^$*+?{}[]|()")


def _is_literal_pattern(regex: re.Pattern) -> bool:
    """Tell whether `regex` is a plain substring search with no regex syntax"""
    if regex.flags & re.VERBOSE:
        # in verbose mode whitespace and '#' are syntax as well
        return False
    return _REGEX_SPECIAL_CHARS.isdisjoint(regex.pattern)


def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and return the matching text spans

    The pattern is tested against each span's text on its own, so a match
    that is split across several spans is not reported.

    Arguments
      regex: compiled regular expression, applied with `regex.search`
      page: page from pymupdf
    Returns
      a list of the span dicts (from the page's text dict) whose 'text'
      matches, in block / line / span order
    """
    # Cheap rejection of pages that cannot contain a match, so the detailed
    # dict is not extracted for them. This is only done for plain literal
    # patterns: anchors, word boundaries and look-arounds behave differently
    # on the whole page text than on a single span (e.g. `^Chapter 1$`
    # matches a span but not the page), which would silently drop results.
    # NOTE(review): assumes the plain page text contains every span's text
    # verbatim, as the original pre-filter did -- confirm against PyMuPDF.
    if _is_literal_pattern(regex) and not regex.search(page.get_text()):
        return []

    page_meta = page.get_textpage().extractDICT()
    return [
        spn
        for blk in page_meta.get('blocks', [])
        for ln in blk.get('lines', [])
        for spn in ln.get('spans', [])
        if regex.search(spn.get('text', ""))
    ]
def to_bools(var: int) -> str:
    """Render an int as a TOML boolean literal

    Returns "false" for 0 and "true" for any other value.
    """
    return "true" if var != 0 else "false"
def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Argument
      spn: span dict; must contain 'font', 'size', 'color', 'flags' and
           'bbox', and may contain 'page_index' and 'page_label'
    Returns
      newline-separated `key = value` lines, without a trailing new line
    """
    result = []
    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # Labels come from the document, so they go through the TOML string
        # encoder: a quote or backslash in a label must be escaped to keep
        # the output valid.
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    result.append(f"font.color = {spn['color']:#08x}")

    # each font property is one bit of the span's flags
    flags = spn['flags']
    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

    bbox = spn['bbox']
    result.append(f"bbox.left = {_dump_float(bbox[0])}")
    result.append(f"bbox.top = {_dump_float(bbox[1])}")
    result.append(f"bbox.right = {_dump_float(bbox[2])}")
    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")

    return '\n'.join(result)
def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Render a span as a `[[heading]]` TOML table usable by pdftocgen

    Only the level, greediness, font name and font size are emitted as
    active keys; every other property is written as a commented-out line.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    lines = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
    ]

    # drop the font subset prefix: keep what follows the first '+', or the
    # whole name when there is no '+'
    font_name = spn['font'].split('+', 1)[-1]
    lines += [
        f"font.name = {_dump_str(font_name)}",
        f"font.size = {_dump_float(spn['size'])}",
        "# font.size_tolerance = 1e-5",
        f"# font.color = {spn['color']:#08x}",
    ]

    # one flag bit per font property
    style_bits = spn['flags']
    for prop, mask in (("superscript", 0b00001),
                       ("italic", 0b00010),
                       ("serif", 0b00100),
                       ("monospace", 0b01000),
                       ("bold", 0b10000)):
        lines.append(f"# font.{prop} = {to_bools(style_bits & mask)}")

    box = spn['bbox']
    for pos, side in enumerate(("left", "top", "right", "bottom")):
        lines.append(f"# bbox.{side} = {_dump_float(box[pos])}")
    lines.append("# bbox.tolerance = 1e-5")

    text = '\n'.join(lines)
    return text + '\n' if trail_nl else text