Spaces:
Sleeping
Sleeping
File size: 7,015 Bytes
"""Extract metadata for a string in a pdf file"""
from toml.encoder import _dump_str, _dump_float
import re
from fitz import Document, Page
from typing import Optional, List
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Extract meta for a `pattern` on `page` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index), if None is given, search for the
            entire document, but this is highly discouraged.
      ign_case: ignore case?
    Returns
      a list of span dicts as produced by `search_in_page`, each extended
      with 'page_index' (1-based) and 'page_label' ("" when the label
      lookup fails); an empty list when `page` is out of range
    Raises
      re.error: if `pattern` is not a valid regular expression (only
                reached when `page` is None or in range)
    """
    result = []

    if page is None:
        pages = doc.pages()
    elif 1 <= page <= doc.page_count:
        pages = [doc[page - 1]]
    else:  # page out of range
        return result

    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)

    # we could parallelize this, but I don't see a reason
    # to *not* specify a page number
    for p in pages:
        found = search_in_page(regex, p)
        if not found:
            continue

        # index and label are properties of the page, not of the span,
        # so look them up once per page instead of once per match
        page_index = p.number + 1
        try:
            page_label = p.get_label()
        except Exception:
            # Fallback if get_label fails due to PyMuPDF version issues
            page_label = ""

        for s in found:
            s['page_index'] = page_index
            s['page_label'] = page_label
        result.extend(found)
    return result
# Pattern fragments whose outcome depends on the text *around* a match
# (anchors, word boundaries, look-arounds, inline flags).  The check is a
# plain substring test, so it is deliberately conservative: e.g. `[^a]`
# or `(?:...)` also count as context sensitive.
_CONTEXT_SENSITIVE_TOKENS = ("^", "$", "(?", "\\A", "\\Z", "\\z", "\\b", "\\B")


def _page_prefilter_is_sound(regex: re.Pattern) -> bool:
    """Tell whether "no match in the page text" implies "no match in any span"

    That implication only holds for patterns that look at nothing but the
    characters they consume; `^Chapter` for instance matches a span that
    starts with "Chapter" without matching the text of the whole page.
    """
    pattern = regex.pattern
    if not isinstance(pattern, str):
        return False
    return not any(token in pattern for token in _CONTEXT_SENSITIVE_TOKENS)


def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and collect the matching spans

    Every span of the page is tested on its own, so a match that is split
    across several spans is NOT reported.

    As a shortcut, pages whose plain text does not match are skipped
    without building the (expensive) span dict.  The shortcut is only
    taken when the pattern contains no context sensitive construct,
    otherwise it could hide spans that do match.

    Arguments
      regex: compiled regular expression
      page: page from pymupdf
    Returns
      the span dicts (as found in `extractDICT()`) whose 'text' matches
    """
    # One TextPage feeds both the shortcut and the span search, so the page
    # is parsed only once.
    # NOTE(review): assumes extractText() and extractDICT() of the same
    # TextPage yield the same characters, i.e. every span text is a
    # substring of the plain text -- confirm against the PyMuPDF version
    # in use.
    textpage = page.get_textpage()

    if _page_prefilter_is_sound(regex) \
            and not regex.search(textpage.extractText()):
        return []

    result = []
    page_meta = textpage.extractDICT()
    for blk in page_meta.get('blocks', []):
        for ln in blk.get('lines', []):
            for spn in ln.get('spans', []):
                if regex.search(spn.get('text', "")):
                    result.append(spn)
    return result
def to_bools(var: int) -> str:
    """Render an int as a TOML boolean: "false" for zero, "true" otherwise"""
    if var != 0:
        return "true"
    return "false"
def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Arguments
      spn: span dict; 'font', 'size', 'color', 'flags' and 'bbox' are
           required, 'page_index' and 'page_label' are emitted only when
           present
    Returns
      the `key = value` lines joined by new lines (no trailing new line)
    Raises
      KeyError: if one of the required keys is missing
    """
    result = []

    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # escape the label like any other string value: a label holding a
        # quote or a backslash would otherwise produce invalid TOML
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    # "0x" followed by at least six hex digits
    result.append(f"font.color = {spn['color']:#08x}")

    # each font property is one bit of the flags bitmask
    flags = spn['flags']
    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

    # bbox is indexed as (left, top, right, bottom)
    bbox = spn['bbox']
    result.append(f"bbox.left = {_dump_float(bbox[0])}")
    result.append(f"bbox.top = {_dump_float(bbox[1])}")
    result.append(f"bbox.right = {_dump_float(bbox[2])}")
    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")

    return '\n'.join(result)
def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Render a span as a `[[heading]]` TOML entry for pdftocgen

    Only the level, the greedy switch, the font name and the font size are
    written as active keys; every other property of the span is written as
    a comment.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    lines = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
    ]

    # strip font subset prefix: keep what follows the first '+',
    # or the whole name when there is no '+' at all
    full_name = spn['font']
    _, plus, remainder = full_name.partition('+')
    font_name = remainder if plus else full_name

    lines += [
        f"font.name = {_dump_str(font_name)}",
        f"font.size = {_dump_float(spn['size'])}",
        "# font.size_tolerance = 1e-5",
        f"# font.color = {spn['color']:#08x}",
    ]

    # one bit of the flags bitmask per font property
    flags = spn['flags']
    flag_bits = (
        ("superscript", 0b00001),
        ("italic", 0b00010),
        ("serif", 0b00100),
        ("monospace", 0b01000),
        ("bold", 0b10000),
    )
    for prop, mask in flag_bits:
        lines.append(f"# font.{prop} = {to_bools(flags & mask)}")

    # bbox is indexed as (left, top, right, bottom)
    bbox = spn['bbox']
    for idx, edge in enumerate(("left", "top", "right", "bottom")):
        lines.append(f"# bbox.{edge} = {_dump_float(bbox[idx])}")
    lines.append("# bbox.tolerance = 1e-5")

    if trail_nl:
        lines.append("")

    return '\n'.join(lines)
|