File size: 7,015 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""Extract metadata for a string in a pdf file"""

from toml.encoder import _dump_str, _dump_float

import re

from fitz import Document, Page
from typing import Optional, List


def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False
                 ) -> List[dict]:
    """Collect the meta of every span matching `pattern` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index); when None the entire document is
            searched, but this is highly discouraged.
      ign_case: compile the pattern with re.IGNORECASE?
    Returns
      a list of span dicts, each extended with the keys 'page_index'
      (1-based) and 'page_label'; empty when `page` is out of range
    """
    matches: List[dict] = []

    # choose the pages first: an out-of-range page number returns early,
    # before the pattern is even compiled
    if page is None:
        targets = doc.pages()
    elif not (1 <= page <= doc.page_count):
        return matches
    else:
        targets = [doc[page - 1]]

    flags = re.IGNORECASE if ign_case else 0
    regex = re.compile(pattern, flags)

    # pages are scanned one after another; callers are expected to pass a
    # page number, so there is little to gain from parallelizing
    for pg in targets:
        for span in search_in_page(regex, pg):
            span['page_index'] = pg.number + 1
            try:
                span['page_label'] = pg.get_label()
            except Exception:
                # best effort: fall back to an empty label if get_label
                # fails (e.g. due to PyMuPDF version issues)
                span['page_label'] = ""
            matches.append(span)

    return matches


def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Search for `regex` in `page` and collect the meta of matching spans

    Every span of the page is tested on its own, so anchors, word
    boundaries and look-arounds in `regex` refer to the text of a single
    span. A match that is split across several spans is not found.

    Arguments
      regex: compiled pattern to look for
      page: page from pymupdf
    Returns
      the span dicts (as produced by `extractDICT`) whose 'text' matches,
      in document order
    """
    result = []

    # There is deliberately no whole-page pre-check: running `regex` on the
    # plain text of the page evaluates `^`, `$`, `\b` and look-arounds
    # against different surroundings than the span they are meant for, so
    # pages whose spans do match would be skipped.
    page_meta = page.get_textpage().extractDICT()

    # blocks without 'lines', lines without 'spans' and spans without
    # 'text' are tolerated and treated as empty
    for blk in page_meta.get('blocks', []):
        for ln in blk.get('lines', []):
            for spn in ln.get('spans', []):
                if regex.search(spn.get('text', "")):
                    result.append(spn)
    return result


def to_bools(var: int) -> str:
    """Render an int as a lowercase bool string: "false" for 0, else "true\""""
    return "true" if var != 0 else "false"


def dump_meta(spn: dict) -> str:
    """Dump the span dict from PyMuPDF to TOML compatible string

    Argument
      spn: span dict; the keys 'font', 'size', 'color', 'flags' and 'bbox'
           are required, 'page_index' and 'page_label' are written only
           when present
    Returns
      one `key = value` line per attribute, joined by new lines
    """
    result = []

    if 'page_index' in spn:
        result.append(f"page.index = {spn['page_index']}")
    if 'page_label' in spn:
        # escape the label the same way as the font name below; a quote or
        # backslash in it would otherwise produce an invalid TOML string
        result.append(f"page.label = {_dump_str(spn['page_label'])}")

    result.append(f"font.name = {_dump_str(spn['font'])}")
    result.append(f"font.size = {_dump_float(spn['size'])}")
    # hex integer, zero-padded to 8 characters including the 0x prefix
    result.append(f"font.color = {spn['color']:#08x}")

    # each font attribute is one bit of the span flags
    flags = spn['flags']

    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
    result.append(f"font.bold = {to_bools(flags & 0b10000)}")

    # bbox is indexed as (left, top, right, bottom)
    bbox = spn['bbox']

    result.append(f"bbox.left = {_dump_float(bbox[0])}")
    result.append(f"bbox.top = {_dump_float(bbox[1])}")
    result.append(f"bbox.right = {_dump_float(bbox[2])}")
    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")

    return '\n'.join(result)


def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
    """Render a span as a `[[heading]]` TOML table directly usable by pdftocgen

    Only the level, the greedy switch, the font name and the font size are
    written as active keys; every other attribute is emitted as a comment.

    Argument
      spn: span dict of the heading
      level: heading level
      trail_nl: add trailing new line
    Returns
      a valid toml string
    """
    lines = [
        "[[heading]]",
        f"# {spn.get('text', '')}",
        f"level = {level}",
        "greedy = true",
    ]

    # strip font subset prefix: keep what follows the first '+',
    # or the whole name when it contains no '+'
    font = spn['font'].split('+', 1)[-1]

    lines.append(f"font.name = {_dump_str(font)}")
    lines.append(f"font.size = {_dump_float(spn['size'])}")
    lines.append("# font.size_tolerance = 1e-5")
    lines.append(f"# font.color = {spn['color']:#08x}")

    # one bit of the span flags per font attribute
    flags = spn['flags']
    for attr, mask in (
        ("superscript", 0b00001),
        ("italic", 0b00010),
        ("serif", 0b00100),
        ("monospace", 0b01000),
        ("bold", 0b10000),
    ):
        lines.append(f"# font.{attr} = {to_bools(flags & mask)}")

    # bbox is indexed as (left, top, right, bottom)
    bbox = spn['bbox']
    for idx, side in enumerate(("left", "top", "right", "bottom")):
        lines.append(f"# bbox.{side} = {_dump_float(bbox[idx])}")
    lines.append("# bbox.tolerance = 1e-5")

    if trail_nl:
        lines.append("")

    return '\n'.join(lines)