from dataclasses import dataclass
from typing import Optional, List, Dict, Iterator
from .filter import ToCFilter
from fitzutils import ToCEntry
from itertools import chain
from collections import defaultdict
from fitz import Document


class FoundGreedy(Exception):
    """A hacky solution to do short-circuiting in Python.

    The main reason to do this short-circuiting is to untangle the logic of
    greedy filter with normal execution, which makes the typing and code much
    cleaner, but it can also save some unnecessary comparisons.

    Probably similar to call/cc in scheme or longjmp in C
    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
    """
    # heading level of the greedy filter that matched
    level: int
    # the greedy filter itself; None when the raiser did not supply it
    fltr: Optional[ToCFilter]

    def __init__(self, level, fltr=None):
        """
        Arguments
          level: level of the greedy filter
          fltr: the greedy filter that matched (optional); lets the handler
                re-apply exactly that filter instead of guessing it from
                the level
        """
        super().__init__()
        self.level = level
        self.fltr = fltr


def blk_to_str(blk: dict) -> str:
    """Extract all the text inside a block

    Every span's text is stripped and the pieces are joined with a single
    space. Missing 'lines', 'spans' or 'text' keys are treated as empty.
    """
    return " ".join([
        spn.get('text', "").strip()
        for line in blk.get('lines', [])
        for spn in line.get('spans', [])
    ])


@dataclass
class Fragment:
    """A fragment of the extracted heading"""
    text: str
    level: int


def concatFrag(frags: Iterator[Optional[Fragment]],
               sep: str = " ") -> Dict[int, str]:
    """Concatenate fragments to strings

    Arguments
      frags: fragments to merge; None entries are skipped
      sep: separator placed between fragments of the same level
    Returns
      a dictionary (level -> title) that contains the title for each level.
      Levels appear in the order in which they are first encountered.
    """
    # accumulate a list of strings for each level of heading
    acc = defaultdict(list)
    for frag in frags:
        if frag is not None:
            acc[frag.level].append(frag.text)

    return {level: sep.join(strs) for level, strs in acc.items()}


class Recipe:
    """The internal representation of a recipe"""
    filters: List[ToCFilter]

    def __init__(self, recipe_dict: dict):
        """Build the filters from the 'heading' entry of a recipe

        Argument
          recipe_dict: the recipe; its 'heading' entry holds one dictionary
                       per filter
        Raises
          ValueError: if the recipe contains no heading filters
        """
        fltr_dicts = recipe_dict.get('heading', [])

        if not fltr_dicts:
            raise ValueError("no filters found in recipe")
        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]

    def _extract_span(self, spn: dict) -> Optional[Fragment]:
        """Extract text from span along with level

        Argument
          spn: a span dictionary
          {
            'bbox': (float, float, float, float),
            'color': int,
            'flags': int,
            'font': str,
            'size': float,
            'text': str
          }
        Returns
          a fragment of the heading or None if no match
        Raises
          FoundGreedy: if the first filter admitting the span is greedy
        """
        for fltr in self.filters:
            if fltr.admits(spn):
                text = spn.get('text', "").strip()

                if not text:
                    # don't match empty spaces
                    return None

                if fltr.greedy:
                    # propagate all the way back to extract_block, carrying
                    # the filter so the handler knows exactly which one fired
                    raise FoundGreedy(fltr.level, fltr)

                return Fragment(text, fltr.level)
        return None

    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
        """Extract matching heading fragments in a line.

        Argument
          line: a line dictionary
          {
            'bbox': (float, float, float, float),
            'wmode': int,
            'dir': (float, float),
            'spans': [dict]
          }
        Returns
          a list with one entry per span: a fragment, or None if the span
          did not match
        """
        return [self._extract_span(spn) for spn in line.get('spans', [])]

    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Argument
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: page number recorded in the resulting entries
        Returns
          a list of toc entries, concatenated from the result of lines
        """
        if block.get('type') != 0:
            # not a text block
            return []

        vpos = block.get('bbox', (0, 0))[1]
        # a text block without 'lines' simply yields no headings
        lines = block.get('lines', [])

        try:
            # the list is built eagerly so that FoundGreedy surfaces here
            frags = chain.from_iterable([
                self._extract_line(ln) for ln in lines
            ])
            titles = concatFrag(frags)
        except FoundGreedy as e:
            # Smart Greedy: only merge text that MATCHES the greedy filter.
            # Prefer the filter carried by the exception; looking it up by
            # level alone is ambiguous when several filters share a level.
            greedy = e.fltr
            if greedy is None:
                greedy = next(
                    (f for f in self.filters if f.level == e.level), None
                )
            if greedy is None:
                return []

            parts = []
            for ln in lines:
                for spn in ln.get('spans', []):
                    if greedy.admits(spn):
                        text = spn.get('text', "").strip()
                        if text:
                            # skip blank spans to avoid doubled separators
                            parts.append(text)

            if not parts:
                return []
            return [ToCEntry(e.level, " ".join(parts), page, vpos)]

        return [
            ToCEntry(level, title, page, vpos)
            for level, title in titles.items()
        ]


def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Extract toc entries from a document

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      a list of toc entries in the document
    """
    result = []

    for page in doc.pages():
        for blk in page.get_textpage().extractDICT().get('blocks', []):
            result.extend(
                # page.number is offset by one for the recorded page number
                recipe.extract_block(blk, page.number + 1)
            )

    return result