Spaces:
Sleeping
Sleeping
File size: 5,745 Bytes
046e3b8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | from dataclasses import dataclass
from typing import Optional, List, Dict, Iterator
from .filter import ToCFilter
from fitzutils import ToCEntry
from itertools import chain
from collections import defaultdict
from fitz import Document
class FoundGreedy(Exception):
    """Signal that a greedy filter matched, short-circuiting extraction.

    Raising this exception lets the span-level matching code jump straight
    back to the block-level caller. That keeps the greedy-filter logic
    separate from the normal return path (simpler typing and code) and also
    avoids some unnecessary comparisons.

    Conceptually similar to call/cc in Scheme or longjmp in C,
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # heading level of the greedy filter that matched
    level: int

    def __init__(self, level):
        """Create the signal.

        Argument
          level: level of the greedy filter
        """
        self.level = level
        # no message is passed on purpose: the exception only carries `level`
        super().__init__()
def blk_to_str(blk: dict) -> str:
    """Return the text of every span in a block, joined by single spaces.

    Each span's text is stripped before joining. Missing 'lines', 'spans'
    or 'text' keys are treated as empty.
    """
    texts = []
    for ln in blk.get('lines', []):
        for span in ln.get('spans', []):
            texts.append(span.get('text', "").strip())
    return " ".join(texts)
@dataclass
class Fragment:
    """One piece of a heading, as extracted from a single span."""
    # stripped text of the span
    text: str
    # heading level of the filter that admitted the span
    level: int
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Group fragments by level and join each group into one title.

    Arguments
      frags: fragments in reading order; None entries are ignored
      sep: string placed between the texts of one level
    Returns
      a dictionary (level -> title), with levels in first-seen order
    """
    pieces_by_level = defaultdict(list)
    for piece in frags:
        if piece is None:
            # span that matched no filter
            continue
        pieces_by_level[piece.level].append(piece.text)
    return {lvl: sep.join(texts) for lvl, texts in pieces_by_level.items()}
class Recipe:
    """The internal representation of a recipe.

    A recipe holds the list of heading filters, which are tried in order
    against every span of a text block.
    """
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """Build the filters from the 'heading' entries of a recipe.

        Argument
          recipe_dict: the recipe as a dictionary; its 'heading' key holds
                       a list of filter dictionaries
        Raises
          ValueError: if 'heading' is missing, None or empty
        """
        fltr_dicts = recipe_dict.get('heading')
        if not fltr_dicts:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from span along with level

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading or None if no match
        Raises
          FoundGreedy: if the first filter admitting the span is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()
                if not text:
                    # don't match empty spaces
                    return None
                if fltr.greedy:
                    # propagate all the way back to extract_block
                    raise FoundGreedy(fltr.level)
                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          one entry per span: a fragment, or None where the span did not match
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def _merge_greedy(self, lines: list, level: int, page: int,
                      vpos: float) -> List[ToCEntry]:
        """Merge the spans of a block that match a greedy filter of `level`.

        Only text admitted by a greedy filter of the triggering level is
        merged. Every greedy filter of that level is considered, because the
        FoundGreedy signal only carries the level: picking just the first
        filter with that level could select a filter that did not trigger.

        Returns
          a single-entry list, or an empty list if nothing non-empty matched
        """
        greedy_filters = [
            fltr for fltr in self.filters
            if fltr.greedy and fltr.level == level
        ]
        parts = []
        for ln in lines:
            for spn in ln.get('spans', []):
                text = spn.get('text', "").strip()
                # skip whitespace-only spans so the title has no stray spaces
                if text and any(fltr.admits(spn) for fltr in greedy_filters):
                    parts.append(text)
        if not parts:
            return []
        return [ToCEntry(level, " ".join(parts), page, vpos)]

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Argument
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: page number stored in the resulting entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []
        # a text block without 'lines' (or with None) has nothing to extract
        lines = block.get('lines') or []
        if not lines:
            return []
        # second bbox component is used as the entry's vertical position
        vpos = block.get('bbox', (0, 0))[1]
        try:
            frags = chain.from_iterable(
                self._extract_line(ln) for ln in lines
            )
            titles = concatFrag(frags)
        except FoundGreedy as e:
            # a greedy filter matched somewhere in the block
            return self._merge_greedy(lines, e.level, page, vpos)
        return [
            ToCEntry(level, title, page, vpos)
            for level, title in titles.items()
        ]
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Collect the toc entries of every block on every page.

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      a list of toc entries in the document, in page and block order;
      page numbers passed to the recipe are `page.number + 1`
    """
    per_block = (
        recipe.extract_block(blk, pg.number + 1)
        for pg in doc.pages()
        for blk in pg.get_textpage().extractDICT().get('blocks', [])
    )
    # flatten the per-block entry lists into one list
    return list(chain.from_iterable(per_block))
|