File size: 5,745 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from dataclasses import dataclass
from typing import Optional, List, Dict, Iterator
from .filter import ToCFilter
from fitzutils import ToCEntry
from itertools import chain
from collections import defaultdict
from fitz import Document


class FoundGreedy(Exception):
    """A hacky way to short-circuit deeply nested extraction code.

    Raising this exception lets a greedy filter escape straight back to
    ``extract_block`` without threading a flag through every level of the
    normal execution path, which keeps the typing and the non-greedy logic
    much cleaner and also skips some unnecessary comparisons.

    Probably similar to call/cc in scheme or longjump in C
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # heading level of the greedy filter that fired
    level: int

    def __init__(self, level):
        """Record the level of the greedy filter that triggered the escape.

        Argument
          level: level of the greedy filter
        """
        super().__init__()
        self.level = level


def blk_to_str(blk: dict) -> str:
    """Return all the text contained in a block, joined by single spaces.

    Walks every line of the block and every span of each line, stripping
    each span's text before joining.
    """
    collected = []
    for ln in blk.get('lines', []):
        for span in ln.get('spans', []):
            collected.append(span.get('text', "").strip())
    return " ".join(collected)


@dataclass
class Fragment:
    """A fragment of the extracted heading"""
    # text content of the matched span (stripped of surrounding whitespace
    # by the caller that constructs it)
    text: str
    # heading level of the filter that admitted this text
    level: int


def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Join heading fragments into one title string per level.

    Argument
      frags: an iterator of fragments; None entries (non-matching spans)
             are skipped
      sep: separator inserted between consecutive fragment texts
    Returns
      a dictionary (level -> title) that contains the title for each level.
    """
    # gather the text pieces belonging to each heading level, in order
    pieces: Dict[int, List[str]] = {}
    for frag in frags:
        if frag is None:
            continue
        pieces.setdefault(frag.level, []).append(frag.text)

    return {level: sep.join(texts) for level, texts in pieces.items()}


class Recipe:
    """The internal representation of a recipe.

    Holds the list of ToC filters parsed from the user's recipe and matches
    them against the span/line/block dictionaries produced by the PDF text
    extraction.
    """
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """Build the filter list from a parsed recipe dictionary.

        Argument
          recipe_dict: dictionary whose 'heading' key holds a list of
                       filter dictionaries
        Raises
          ValueError: if the recipe contains no filters
        """
        fltr_dicts = recipe_dict.get('heading', [])

        if len(fltr_dicts) == 0:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from a span along with its heading level.

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading, or None if no filter matches
        Raises
          FoundGreedy: when the first admitting filter is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()

                if not text:
                    # don't match empty spaces
                    return None

                if fltr.greedy:
                    # propagate all the way back to extract_block
                    raise FoundGreedy(fltr.level)

                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          one entry per span: a Fragment on match, None otherwise
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Argument
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: page number recorded in the resulting entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []

        # vertical position of the block on the page
        vpos = block.get('bbox', (0, 0))[1]

        try:
            # BUGFIX: default to [] so a text block without a 'lines' key
            # no longer raises TypeError (matches the greedy branch below);
            # also feed chain.from_iterable lazily instead of via a
            # throwaway list.
            frags = chain.from_iterable(
                self._extract_line(ln) for ln in block.get('lines', [])
            )
            titles = concatFrag(frags)

            return [
                ToCEntry(level, title, page, vpos)
                for level, title in titles.items()
            ]
        except FoundGreedy as e:
            # Smart greedy: merge only the text of spans that the greedy
            # filter itself admits, rather than swallowing the whole block.
            relevant_filter = next((f for f in self.filters if f.level == e.level), None)

            parts = []
            if relevant_filter:
                for ln in block.get('lines', []):
                    for spn in ln.get('spans', []):
                        if relevant_filter.admits(spn):
                            parts.append(spn.get('text', "").strip())

            merged_text = " ".join(parts)
            if merged_text:
                return [ToCEntry(e.level, merged_text, page, vpos)]
            return []


def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Extract toc entries from a document.

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      a list of toc entries in the document
    """
    entries: List[ToCEntry] = []

    for page in doc.pages():
        # page.number is 0-based; toc entries use 1-based page numbers
        pagenum = page.number + 1
        blocks = page.get_textpage().extractDICT().get('blocks', [])
        for blk in blocks:
            entries += recipe.extract_block(blk, pagenum)

    return entries