+ Based on
pdf.tocgen by krasjet.
+ Enhanced with UI, Chapter Splitting, and Metadata Search. Licensed under AGPL-3.0.
+
+ """, unsafe_allow_html=True)
diff --git a/fitzutils/__init__.py b/fitzutils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5af8abf9c7e0f10603381e66862c2618a1aa0ffd
--- /dev/null
+++ b/fitzutils/__init__.py
@@ -0,0 +1,17 @@
+"""A collection of utility functions to work with PyMuPDF"""
+
+from .fitzutils import (
+ open_pdf,
+ ToCEntry,
+ dump_toc,
+ pprint_toc,
+ get_file_encoding
+)
+
+__all__ = [
+ 'open_pdf',
+ 'ToCEntry',
+ 'dump_toc',
+ 'pprint_toc',
+ 'get_file_encoding'
+]
diff --git a/fitzutils/fitzutils.py b/fitzutils/fitzutils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f1167b3c2e9308e32fc2ad43d66bca3cbfc8ec7
--- /dev/null
+++ b/fitzutils/fitzutils.py
@@ -0,0 +1,112 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Optional, ContextManager, List, Tuple
+from fitz import Document
+
+import sys
+import fitz
+import io
+import csv
+import chardet
+
+
+@contextmanager
+def open_pdf(path: str,
+             exit_on_error: bool = True
+             ) -> ContextManager[Optional[Document]]:
+    """A context manager for fitz Document
+
+    This context manager will take care of the error handling when creating a
+    fitz Document, and closes the document when the `with` body finishes.
+
+    Arguments
+      path: the path of the pdf file
+      exit_on_error: if true, exit with error code 1 when error occurs;
+                     otherwise None is yielded in place of the document
+    """
+    try:
+        doc = fitz.open(path)
+    except Exception as e:
+        if exit_on_error:
+            print(f"error: fail to open {path}", file=sys.stderr)
+            print(e, file=sys.stderr)
+            sys.exit(1)
+        doc = None
+    try:
+        yield doc
+    finally:
+        if doc is not None:
+            doc.close()
+
+
+@dataclass
+class ToCEntry:
+    """One heading of a table of contents"""
+    level: int
+    title: str
+    pagenum: int
+    # vertical position of the heading (== bbox.top), used for sorting
+    vpos: Optional[float] = None
+
+    @staticmethod
+    def key(e) -> Tuple[int, float]:
+        """Sort key: order by page number first, then by vertical position"""
+        return (e.pagenum, e.vpos if e.vpos is not None else 0)
+
+    def to_fitz_entry(self) -> list:
+        row = [self.level, self.title, self.pagenum]
+        return row if self.vpos is None else row + [self.vpos]
+
+
+def dump_toc(entries: List[ToCEntry], dump_vpos: bool = False) -> str:
+    """Dump table of contents as a CSV dialect
+
+    Rows are space-delimited; an entry's level is encoded as 4 spaces of
+    indentation per level below the first, as the toc parser expects.
+
+    Argument
+      entries: a list of ToC entries
+      dump_vpos: if true, the vertical position of a page is also dumped
+    Returns
+      a multiline string
+    """
+    with io.StringIO(newline='\n') as out:
+        writer = csv.writer(out, lineterminator='\n',
+                            delimiter=' ', quoting=csv.QUOTE_NONNUMERIC)
+        for entry in entries:
+            out.write((entry.level - 1) * '    ')
+            row = [entry.title, entry.pagenum]
+            if dump_vpos and entry.vpos is not None:
+                row.append(entry.vpos)
+            writer.writerow(row)
+        return out.getvalue()
+
+
+def pprint_toc(entries: List[ToCEntry]) -> str:
+    """Render the table of contents in a human-readable form
+
+    Argument
+      entries: a list of ToC entries
+    Returns
+      a multiline string, one indented "title ··· page" line per entry
+    """
+    lines = []
+    for e in entries:
+        lines.append(f"{' ' * (e.level - 1)}{e.title} ··· {e.pagenum}")
+    return '\n'.join(lines)
+
+
+def get_file_encoding(path: str) -> str:
+    """Guess the text encoding of a file with chardet
+
+    Argument
+      path: file path
+    Returns
+      the detected encoding, or 'utf-8' if the file is unreadable or no guess
+    """
+    try:
+        with open(path, "rb") as f:
+            # detect() returns a dict whose 'encoding' value may be None
+            return chardet.detect(f.read()).get('encoding') or 'utf-8'
+    except OSError:
+        return 'utf-8'
diff --git a/pdftocgen/__init__.py b/pdftocgen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc6cbb261c96a596ae893408f22ae843761788fb
--- /dev/null
+++ b/pdftocgen/__init__.py
@@ -0,0 +1,3 @@
+"""Generate table of contents for pdf based on a recipe file"""
+
+__version__ = '1.3.4'
diff --git a/pdftocgen/__main__.py b/pdftocgen/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..528380d62037100b8f2f0746d849a6ff67e2e3be
--- /dev/null
+++ b/pdftocgen/__main__.py
@@ -0,0 +1,4 @@
+from .app import main
+
+if __name__ == '__main__':
+ main()
diff --git a/pdftocgen/filter.py b/pdftocgen/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b04b8571df94129fb75aa9f345a219b956583d
--- /dev/null
+++ b/pdftocgen/filter.py
@@ -0,0 +1,161 @@
+"""Filter on span dictionaries
+
+This module contains the internal representation of heading filters, which are
+used to test if a span should be included in the ToC.
+"""
+
+import re
+
+from typing import Optional
+from re import Pattern
+
+DEF_TOLERANCE: float = 1e-5
+
+
+def admits_float(expect: Optional[float], actual: Optional[float],
+                 tolerance: float) -> bool:
+    """Tell whether `actual` is within `tolerance` of `expect` (None = any)"""
+    if expect is None or actual is None:
+        return expect is None
+    return abs(expect - actual) <= tolerance
+
+
+class FontFilter:
+    """Filter on font attributes"""
+    name: Pattern
+    size: Optional[float]
+    size_tolerance: float
+    color: Optional[int]
+    flags: int
+    # Every style flag has three states: true (1), false (0) and unset (x).
+    # An unset flag must never count as a mismatch, so the comparison is
+    #   filter span mismatch?
+    #     0     0      0
+    #     0     1      1
+    #     1     0      1
+    #     1     1      0
+    #     x     0      0
+    #     x     1      0
+    # Comparing flag by flag takes several bitwise operations per flag and
+    # more to combine the results; two operations on the whole bit field
+    # are enough instead.
+    #
+    # step 1: XOR the filter flags with the span flags to find the bits that
+    #         differ. unset flags are stored as 0, so this may report false
+    #         positives, which step 2 removes
+    #
+    # step 2: AND the result with the ignore mask, whose bit is 1 only for
+    #         flags that were given explicitly (0 means ignored)
+    #   diff mask diff&mask
+    #     0    1      0      <- no diff
+    #     0    0      0      <- no diff
+    #     1    1      1      <- found difference
+    #     1    0      0      <- ignored
+    ign_mask: int
+
+    def __init__(self, font_dict: dict):
+        """
+        Argument
+          font_dict: the 'font' table of a heading filter, all keys optional
+        """
+        self.name = re.compile(font_dict.get('name', ""))
+        self.size = font_dict.get('size')
+        self.size_tolerance = font_dict.get('size_tolerance', DEF_TOLERANCE)
+        self.color = font_dict.get('color')
+
+        self.flags = 0
+        self.ign_mask = 0
+        for bit, key in ((0b00001, 'superscript'),
+                         (0b00010, 'italic'),
+                         (0b00100, 'serif'),
+                         (0b01000, 'monospace'),
+                         (0b10000, 'bold')):
+            # branchless: bit * True == bit and bit * False == 0
+            self.flags |= bit * font_dict.get(key, False)
+            self.ign_mask |= bit * (key in font_dict)
+
+    def admits(self, spn: dict) -> bool:
+        """Check if the font attributes admit the span
+
+        Argument
+          spn: the span dict to be checked
+        Returns
+          False if the span doesn't match current font attribute
+        """
+        if not self.name.search(spn.get('font', "")):
+            return False
+
+        if self.color is not None and self.color != spn.get('color'):
+            return False
+
+        if not admits_float(self.size, spn.get('size'), self.size_tolerance):
+            return False
+
+        # a span without 'flags' is treated as differing in every bit
+        mismatch = spn.get('flags', ~self.flags) ^ self.flags
+        # see above for explanation
+        return not mismatch & self.ign_mask
+
+
+class BoundingBoxFilter:
+    """Filter on bounding boxes; an edge that is not set admits any value"""
+    left: Optional[float]
+    top: Optional[float]
+    right: Optional[float]
+    bottom: Optional[float]
+    tolerance: float
+
+    def __init__(self, bbox_dict: dict):
+        self.left = bbox_dict.get('left')
+        self.top = bbox_dict.get('top')
+        self.right = bbox_dict.get('right')
+        self.bottom = bbox_dict.get('bottom')
+        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)
+
+    def admits(self, spn: dict) -> bool:
+        """Check if the bounding box admit the span
+
+        Argument
+          spn: the span dict to be checked, 'bbox' is (left, top, right, bottom)
+        Returns
+          False if the span doesn't match current bounding box setting
+        """
+        bbox = spn.get('bbox', (None, None, None, None))
+        return (admits_float(self.left, bbox[0], self.tolerance) and
+                admits_float(self.top, bbox[1], self.tolerance) and
+                admits_float(self.right, bbox[2], self.tolerance) and
+                admits_float(self.bottom, bbox[3], self.tolerance))
+
+
+class ToCFilter:
+    """Filter on span dictionary to pick out headings in the ToC"""
+    # Heading level given to the spans this filter admits, strictly > 0
+    level: int
+    # When set, a match short-circuits the extraction of the enclosing
+    # block, which then yields one merged entry of this level
+    greedy: bool
+    font: FontFilter
+    bbox: BoundingBoxFilter
+
+    def __init__(self, fltr_dict: dict):
+        """Build the filter from one heading table of a recipe
+
+        Raises
+          ValueError: if 'level' is missing or smaller than 1
+        """
+        level = fltr_dict.get('level')
+        if level is None:
+            raise ValueError("filter's 'level' is not set")
+        if level < 1:
+            raise ValueError("filter's 'level' must be >= 1")
+
+        self.level = level
+        self.greedy = fltr_dict.get('greedy', False)
+        self.font = FontFilter(fltr_dict.get('font', {}))
+        self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {}))
+
+    def admits(self, spn: dict) -> bool:
+        """Check if both the font and the bounding box admit the span"""
+        if not self.font.admits(spn):
+            return False
+        return self.bbox.admits(spn)
diff --git a/pdftocgen/recipe.py b/pdftocgen/recipe.py
new file mode 100644
index 0000000000000000000000000000000000000000..010bb31fa61e5edb29792a33fd4c6931bf3fdbee
--- /dev/null
+++ b/pdftocgen/recipe.py
@@ -0,0 +1,188 @@
+from dataclasses import dataclass
+from typing import Optional, List, Dict, Iterator
+from .filter import ToCFilter
+from fitzutils import ToCEntry
+from itertools import chain
+from collections import defaultdict
+from fitz import Document
+
+
+class FoundGreedy(Exception):
+    """Signal that a greedy filter matched somewhere inside a block.
+
+    Raising this from the span level unwinds straight back to the block
+    level, which keeps the greedy logic out of the normal extraction path
+    (simpler types and code) and skips some unnecessary comparisons.
+
+    Probably similar to call/cc in scheme or longjump in C
+    c.f. https://ds26gte.github.io/tyscheme/index-Z-H-15.html#node_sec_13.2
+    """
+    level: int
+
+    def __init__(self, level):
+        """
+        Argument
+          level: level of the greedy filter
+        """
+        Exception.__init__(self)
+        self.level = level
+
+
+def blk_to_str(blk: dict) -> str:
+    """Join the stripped text of every span in a block with single spaces"""
+    texts = []
+    for line in blk.get('lines', []):
+        for spn in line.get('spans', []):
+            texts.append(spn.get('text', "").strip())
+    return " ".join(texts)
+
+
+@dataclass
+class Fragment:
+ """A piece of heading text together with the level of the filter it matched"""
+ text: str
+ level: int
+
+
+def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
+    """Group fragment texts by level and join each group with `sep`
+
+    None fragments are skipped.
+
+    Returns
+      a dictionary (level -> title) that contains the title for each level.
+    """
+    by_level: Dict[int, List[str]] = {}
+    for frag in frags:
+        if frag is None:
+            continue
+        # setdefault keeps the levels in the order they are first seen
+        by_level.setdefault(frag.level, []).append(frag.text)
+
+    return {level: sep.join(texts) for level, texts in by_level.items()}
+
+
+class Recipe:
+    """The internal representation of a recipe"""
+    filters: List[ToCFilter]
+
+    def __init__(self, recipe_dict: dict):
+        """Build the filters from the recipe's 'heading' list"""
+        fltr_dicts = recipe_dict.get('heading', [])
+
+        if len(fltr_dicts) == 0:
+            raise ValueError("no filters found in recipe")
+        self.filters = [ToCFilter(fltr) for fltr in fltr_dicts]
+
+    def _extract_span(self, spn: dict) -> Optional[Fragment]:
+        """Extract text from span along with level
+
+        Argument
+          spn: a span dictionary
+          {
+            'bbox': (float, float, float, float),
+            'color': int,
+            'flags': int,
+            'font': str,
+            'size': float,
+            'text': str
+          }
+        Returns
+          a fragment of the heading or None if no match
+        Raises
+          FoundGreedy: if the first filter admitting the span is greedy
+        """
+        for fltr in self.filters:
+            if fltr.admits(spn):
+                text = spn.get('text', "").strip()
+
+                if not text:
+                    # don't match empty spaces
+                    return None
+
+                if fltr.greedy:
+                    # propagate all the way back to extract_block
+                    raise FoundGreedy(fltr.level)
+
+                return Fragment(text, fltr.level)
+        return None
+
+    def _extract_line(self, line: dict) -> List[Optional[Fragment]]:
+        """Extract matching heading fragments in a line.
+
+        Argument
+          line: a line dictionary
+          {
+            'bbox': (float, float, float, float),
+            'wmode': int,
+            'dir': (float, float),
+            'spans': [dict]
+          }
+        Returns
+          a list of fragments concatenated from result in a line
+        """
+        return [self._extract_span(spn) for spn in line.get('spans', [])]
+
+    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
+        """Extract matching headings in a block.
+
+        Argument
+          block: a block dictionary
+          {
+            'bbox': (float, float, float, float),
+            'lines': [dict],
+            'type': int
+          }
+          page: the page number to record in the produced entries
+        Returns
+          a list of toc entries, concatenated from the result of lines
+        """
+        if block.get('type') != 0:
+            # not a text block
+            return []
+
+        vpos = block.get('bbox', (0, 0))[1]
+        # a text block without 'lines' is treated as empty
+        lines = block.get('lines', [])
+        try:
+            frags = chain.from_iterable([self._extract_line(ln) for ln in lines])
+            titles = concatFrag(frags)
+        except FoundGreedy as e:
+            # merge only the spans admitted by a greedy filter of the level that
+            # fired, and skip blank spans so they don't add extra separators
+            greedy = [f for f in self.filters
+                      if f.greedy and f.level == e.level]
+            parts = [
+                spn.get('text', "").strip()
+                for ln in lines
+                for spn in ln.get('spans', [])
+                if any(f.admits(spn) for f in greedy)
+            ]
+            merged_text = " ".join(part for part in parts if part)
+            if not merged_text:
+                return []
+            return [ToCEntry(e.level, merged_text, page, vpos)]
+        return [
+            ToCEntry(level, title, page, vpos)
+            for level, title in titles.items()
+        ]
+
+
+def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
+    """Run the recipe over every block of every page of a document
+
+    Arguments
+      doc: a pdf document
+      recipe: recipe from user
+    Returns
+      a list of toc entries in the document
+    """
+    entries = []
+
+    for page in doc.pages():
+        pagenum = page.number + 1
+        blocks = page.get_textpage().extractDICT().get('blocks', [])
+        for blk in blocks:
+            entries += recipe.extract_block(blk, pagenum)
+
+    return entries
diff --git a/pdftocgen/tocgen.py b/pdftocgen/tocgen.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fcb9d21c15c57cec9898c17677d614b2f4351ad
--- /dev/null
+++ b/pdftocgen/tocgen.py
@@ -0,0 +1,15 @@
+from fitz import Document
+from typing import List
+from fitzutils import ToCEntry
+from .recipe import Recipe, extract_toc
+
+def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
+    """Build a Recipe from `recipe_dict` and extract the ToC of `doc` with it
+
+    Argument
+      doc: a pdf document
+      recipe_dict: the recipe dictionary used to generate the toc
+    Returns a list of ToC entries
+    """
+    recipe = Recipe(recipe_dict)
+    return extract_toc(doc, recipe)
diff --git a/pdftocio/__init__.py b/pdftocio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe25c642e35a38b9dce6750edd85e315434b27d8
--- /dev/null
+++ b/pdftocio/__init__.py
@@ -0,0 +1,3 @@
+"""Manipulating the table of contents of a pdf"""
+
+__version__ = '1.3.4'
diff --git a/pdftocio/__main__.py b/pdftocio/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..528380d62037100b8f2f0746d849a6ff67e2e3be
--- /dev/null
+++ b/pdftocio/__main__.py
@@ -0,0 +1,4 @@
+from .app import main
+
+if __name__ == '__main__':
+ main()
diff --git a/pdftocio/app.py b/pdftocio/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eb14f579b912c2b00e382ba78ce695bb16d97d0
--- /dev/null
+++ b/pdftocio/app.py
@@ -0,0 +1,184 @@
+"""The executable of pdftocio"""
+
+import sys
+import os.path
+import pdftocio
+import getopt
+import io
+
+from typing import Optional, TextIO
+from getopt import GetoptError
+from fitzutils import open_pdf, dump_toc, pprint_toc, get_file_encoding
+from .tocparser import parse_toc
+from .tocio import write_toc, read_toc
+
+usage_s = """
+usage: pdftocio [options] in.pdf < toc
+ pdftocio [options] in.pdf
+""".strip()
+
+help_s = r"""
+usage: pdftocio [options] in.pdf < toc
+ pdftocio [options] in.pdf
+
+Import/output the table of contents of a PDF file.
+
+This command can operate in two ways: it can either be used
+to extract the table of contents of a PDF, or import table
+of contents to a PDF using the output of pdftocgen.
+
+1. To extract the table of contents of a PDF for
+ modification, only supply a input file:
+
+ $ pdftocio in.pdf
+
+ or if you want to print it in a readable format, use the
+ -H flag:
+
+ $ pdftocio -H in.pdf
+
+2. To import a table of contents to a PDF using the toc file
+ generated by pdftocgen, use input redirection,
+
+ $ pdftocio in.pdf < toc
+
+ pipes,
+
+ $ pdftocgen -r recipe.toml in.pdf | pdftocio in.pdf
+
+ or the -t flag
+
+ $ pdftocio -t toc in.pdf
+
+ to supply the toc file. If you want to specify an output
+ file name, use the -o option
+
+ $ pdftocio -t toc -o out.pdf in.pdf
+
+arguments
+ in.pdf path to the input PDF document
+
+options
+ -h, --help show help
+ -t, --toc=toc path to the table of contents generated by
+ pdftocgen. if this option is not given, the
+ default is stdin, but if no input is piped or
+ redirected to stdin, this program will instead
+ print the existing ToC of the PDF file
+ -v, --vpos if this flag is set, the vertical position of
+ each heading will be dumped to the output
+ -p, --print when flag is set, print the existing ToC in
+ the input PDF file. this flag is usually not
+ necessary, since it is the default behavior
+ when no input is given
+ -H, --human-readable print the toc in a readable format
+ -o, --out=file.pdf path to the output file. if this flag is not
+ specified, the default is {input}_out.pdf
+ -g, --debug enable debug mode
+ -V, --version show version number
+
+[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe
+""".strip()
+
+
+def main():
+ """Parse the command line, then print the PDF's ToC or import one into it"""
+ try:
+ opts, args = getopt.gnu_getopt(
+ sys.argv[1:],
+ "hvt:pHo:gV",
+ ["help", "vpos", "toc=", "print", "human-readable", "out=", "debug", "version"]
+ )
+ except GetoptError as e:
+ print(e, file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(2)
+
+ toc_file: TextIO = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore')
+ print_toc: bool = False
+ readable: bool = False
+ out: Optional[str] = None
+ vpos: bool = False
+ debug: bool = False
+
+ for o, a in opts:
+ if o in ("-H", "--human-readable"):
+ readable = True
+ elif o in ("-p", "--print"):
+ print_toc = True
+ elif o in ("-v", "--vpos"):
+ vpos = True
+ elif o in ("-t", "--toc"):
+ try:
+ toc_file = open(a, "r", encoding=get_file_encoding(a))
+ except IOError as e:
+ print("error: can't open file for reading", file=sys.stderr)
+ print(e, file=sys.stderr)
+ sys.exit(1)
+ elif o in ("-o", "--out"):
+ out = a
+ elif o in ("-g", "--debug"):
+ debug = True
+ elif o in ("-V", "--version"):
+ print("pdftocio", pdftocio.__version__, file=sys.stderr)
+ sys.exit()
+ elif o in ("-h", "--help"):
+ print(help_s, file=sys.stderr)
+ sys.exit()
+
+ if len(args) < 1:
+ print("error: no input pdf is given", file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(1)
+
+ path_in: str = args[0]
+ # done parsing arguments
+
+ try:
+ with open_pdf(path_in) as doc:
+ if toc_file.isatty() or print_toc:
+ # nothing is piped or redirected to the toc input (or -p was given),
+ # so switch to output mode and print the existing toc of the pdf
+ toc = read_toc(doc)
+ if len(toc) == 0:
+ print("error: no table of contents found", file=sys.stderr)
+ sys.exit(1)
+
+ stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
+
+ if readable:
+ print(pprint_toc(toc), file=stdout)
+ else:
+ print(dump_toc(toc, vpos), end="", file=stdout)
+ sys.exit(0)
+
+ # an input is given, so switch to input mode
+ toc = parse_toc(toc_file)
+ write_toc(doc, toc)
+
+ if out is None:
+ # no -o given: write next to the input as {input}_out{ext}
+ pfx, ext = os.path.splitext(path_in)
+ out = f"{pfx}_out{ext}"
+ doc.save(out)
+ except ValueError as e:
+ if debug:
+ raise e
+ print("error:", e, file=sys.stderr)
+ sys.exit(1)
+ except IOError as e:
+ if debug:
+ raise e
+ print("error: unable to open file", file=sys.stderr)
+ print(e, file=sys.stderr)
+ sys.exit(1)
+ except IndexError as e:
+ if debug:
+ raise e
+ print("index error:", e, file=sys.stderr)
+ sys.exit(1)
+ except KeyboardInterrupt as e:
+ if debug:
+ raise e
+ print("error: interrupted", file=sys.stderr)
+ sys.exit(1)
diff --git a/pdftocio/tocio.py b/pdftocio/tocio.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f4fdbc24aefddfd72d3f0bcf72eabe8dfaa3e8
--- /dev/null
+++ b/pdftocio/tocio.py
@@ -0,0 +1,20 @@
+"""Reading and writing table of contents from/to a pdf"""
+
+from typing import List
+from fitz import Document
+from fitzutils import ToCEntry
+
+
+def write_toc(doc: Document, toc: List[ToCEntry]):
+    """Convert the entries to fitz rows and set them as the document's ToC"""
+    fitz_toc = [entry.to_fitz_entry() for entry in toc]
+    doc.set_toc(fitz_toc)
+
+
+def read_toc(doc: Document) -> List[ToCEntry]:
+    """Read table of contents from a document"""
+    def to_entry(e) -> ToCEntry:
+        # the 4th item, when present, may carry the target point of the link
+        has_dest = len(e) == 4 and 'to' in e[3]
+        return ToCEntry(e[0], e[1], e[2], e[3]['to'].y if has_dest else None)
+    return [to_entry(e) for e in doc.get_toc(False)]
diff --git a/pdftocio/tocparser.py b/pdftocio/tocparser.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d4e6c3031da4d4f052dc617c03ec964d33f81ef
--- /dev/null
+++ b/pdftocio/tocparser.py
@@ -0,0 +1,38 @@
+"""Parser for table of content csv file"""
+
+import csv
+import sys
+
+from typing import IO, List
+from fitzutils import ToCEntry
+from itertools import takewhile
+
+
+def parse_entry(entry: List) -> ToCEntry:
+    """parse a row in csv to a toc entry"""
+
+    # the csv reader turns every leading space into an empty '' field, so the
+    # number of '' fields in front of the heading is the indentation, which
+    # in turn determines the heading level
+    indent = 0
+    while indent < len(entry) and entry[indent] == '':
+        indent += 1
+    try:
+        heading = entry[indent]
+        pagenum = int(entry[indent + 1])
+        vpos = entry[indent + 2:]
+        # 4 spaces = 1 level
+        return ToCEntry(indent // 4 + 1, heading, pagenum, *vpos)
+    except IndexError as e:
+        print(f"Unable to parse toc entry {entry};",
+              f"Need at least {indent + 2} parts but only have {len(entry)}.",
+              "Make sure the page number is present.",
+              file=sys.stderr)
+        raise e
+
+
+def parse_toc(file: IO) -> List[ToCEntry]:
+    """Read space-delimited toc rows from `file` and parse each into an entry"""
+    rows = csv.reader(file, lineterminator='\n',
+                      delimiter=' ', quoting=csv.QUOTE_NONNUMERIC)
+    return [parse_entry(row) for row in rows]
diff --git a/pdfxmeta/__init__.py b/pdfxmeta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..207212c694c3bb9c517747e1e6953ce3666ad8b0
--- /dev/null
+++ b/pdfxmeta/__init__.py
@@ -0,0 +1,5 @@
+"""Extract metadata (fonts, bounding box) for a string in a pdf"""
+
+__version__ = '1.3.4'
+
+from .pdfxmeta import extract_meta, dump_meta, dump_toml
diff --git a/pdfxmeta/__main__.py b/pdfxmeta/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..528380d62037100b8f2f0746d849a6ff67e2e3be
--- /dev/null
+++ b/pdfxmeta/__main__.py
@@ -0,0 +1,4 @@
+from .app import main
+
+if __name__ == '__main__':
+ main()
diff --git a/pdfxmeta/app.py b/pdfxmeta/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d27c9f31430574a39e4c0001fd9b93f558c8174
--- /dev/null
+++ b/pdfxmeta/app.py
@@ -0,0 +1,147 @@
+"""The executable of pdfxmeta"""
+
+import getopt
+import sys
+import pdfxmeta
+import io
+
+from getopt import GetoptError
+from typing import Optional, TextIO
+from fitzutils import open_pdf
+from textwrap import indent
+from pdfxmeta import dump_meta, dump_toml, extract_meta
+
+
+usage_s = """
+usage: pdfxmeta [options] doc.pdf [pattern]
+""".strip()
+
+help_s = """
+usage: pdfxmeta [options] doc.pdf [pattern]
+
+Extract the metadata for pattern in doc.pdf.
+
+To use this command, first open up the pdf file with your
+favorite pdf reader and find the text you want to search
+for. Then use
+
+ $ pdfxmeta -p 1 in.pdf "Subsection One"
+
+to find the metadata, mainly the font attributes and
+bounding box, of lines containing the pattern "Subsection
+One" on page 1. Specifying a page number is optional but
+highly recommended, since it greatly reduces the ambiguity
+of matches and execution time.
+
+The output of this command can be directly copy-pasted to
+build a recipe file for pdftocgen. Alternatively, you could
+also use the --auto or -a flag to output a valid heading
+filter directly
+
+ $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml
+
+where the argument of -a is the level of the heading filter,
+which in this case is 2.
+
+arguments
+ doc.pdf path to the input PDF document
+ [pattern] the pattern to search for (python regex). if not
+ given, dump the entire document
+
+options
+ -h, --help show help
+ -p, --page=PAGE specify the page to search for (1-based index)
+ -i, --ignore-case when flag is set, search will be case-insensitive
+ -a, --auto=LEVEL when flag is set, the output would be a valid
+ heading filter of the specified heading level in
+ default settings. it is directly usable by
+ pdftocgen.
+ -o, --out=FILE path to the output file. if this flag is not
+ specified, the default is stdout
+ -V, --version show version number
+""".strip()
+
+
+def print_result(meta: dict) -> str:
+    """Format one match as its text followed by the indented metadata"""
+    return "{}:\n{}".format(meta.get('text', ''), indent(dump_meta(meta), ' '))
+
+
+def main():
+ """Parse the command line, search the PDF and print the metadata of matches"""
+ try:
+ opts, args = getopt.gnu_getopt(
+ sys.argv[1:],
+ "hiVp:a:o:",
+ ["help", "ignore-case", "version", "page=", "auto=", "out="]
+ )
+ except GetoptError as e:
+ print(e, file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(2)
+
+ ignore_case: bool = False
+ page: Optional[int] = None
+ auto_level: Optional[int] = None
+ out: TextIO = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
+
+ for o, a in opts:
+ if o in ("-i", "--ignore-case"):
+ ignore_case = True
+ elif o in ("-p", "--page"):
+ try:
+ page = int(a)
+ except ValueError as e:
+ print("error: invalid page number", file=sys.stderr)
+ sys.exit(1)
+ elif o in ("-a", "--auto"):
+ try:
+ auto_level = int(a)
+ except ValueError as e:
+ print("error: invalid level", file=sys.stderr)
+ sys.exit(1)
+ elif o in ("-o", "--out"):
+ try:
+ out = open(a, "w", encoding='utf-8', errors='ignore')
+ except IOError as e:
+ print("error: can't open file for writing", file=sys.stderr)
+ print(e, file=sys.stderr)
+ sys.exit(1)
+ elif o in ("-V", "--version"):
+ print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
+ sys.exit()
+ elif o in ("-h", "--help"):
+ print(help_s, file=sys.stderr)
+ sys.exit()
+
+ argc = len(args)
+
+ if argc < 1:
+ print("error: no input pdf is given", file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(1)
+
+ path_in: str = args[0]
+ pattern: str = ""
+
+ if argc >= 2:
+ pattern = args[1]
+
+ # done parsing arguments
+
+ with open_pdf(path_in) as doc:
+ meta = extract_meta(doc, pattern, page, ignore_case)
+
+ # nothing found
+ if len(meta) == 0:
+ sys.exit(1)
+
+ # add a trailing \n to each filter only when the output is not a terminal
+ addnl = not out.isatty()
+
+ if auto_level:
+ print('\n'.join(
+ [dump_toml(m, auto_level, addnl) for m in meta]
+ ), file=out)
+ else:
+ print('\n'.join(map(print_result, meta)), file=out)
diff --git a/pdfxmeta/pdfxmeta.py b/pdfxmeta/pdfxmeta.py
new file mode 100644
index 0000000000000000000000000000000000000000..acdb949feefd1a3057074362bc46e3c418eba8c1
--- /dev/null
+++ b/pdfxmeta/pdfxmeta.py
@@ -0,0 +1,194 @@
+"""Extract metadata for a string in a pdf file"""
+
+from toml.encoder import _dump_str, _dump_float
+
+import re
+
+from fitz import Document, Page
+from typing import Optional, List
+
+
+def extract_meta(doc: Document,
+                 pattern: str,
+                 page: Optional[int] = None,
+                 ign_case: bool = False
+                 ) -> List[dict]:
+    """Collect the metadata of every span matching `pattern`
+
+    Each returned span dict is extended with 'page_index' (page.number + 1)
+    and 'page_label' (empty if the label cannot be read).
+
+    Arguments
+      doc: document from pymupdf
+      pattern: a regular expression pattern
+      page: page number (1-based index), if None is given, search for the
+            entire document, but this is highly discouraged.
+      ign_case: ignore case?
+    Returns
+      a list of span dicts, empty when `page` is out of range
+    """
+    if page is None:
+        pages = doc.pages()
+    elif 1 <= page <= doc.page_count:
+        pages = [doc[page - 1]]
+    else:
+        # page out of range
+        return []
+
+    regex = re.compile(pattern, re.IGNORECASE if ign_case else 0)
+    matches = []
+    # we could parallelize this, but I don't see a reason
+    # to *not* specify a page number
+    for p in pages:
+        for spn in search_in_page(regex, p):
+            spn['page_index'] = p.number + 1
+            try:
+                spn['page_label'] = p.get_label()
+            except Exception:
+                # Fallback if get_label fails due to PyMuPDF version issues
+                spn['page_label'] = ""
+            matches.append(spn)
+
+    return matches
+
+
+def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
+    """Search for `regex` in `page` and return the matching spans
+
+    The pattern is tested against the text of each span on its own, so the
+    metadata returned always belongs to a span that matches by itself.
+
+    Arguments
+      regex: the compiled pattern to search for
+      page: the page to search in
+    Returns
+      a list of span dictionaries, in the order the page lists them
+      {
+        'bbox': (float, float, float, float),
+        'color': int,
+        'flags': int,
+        'font': str,
+        'size': float,
+        'text': str
+      }
+    """
+    # There is deliberately no "does the plain text of the page match?"
+    # shortcut in front of the span loop. The plain text joins the spans of
+    # a line and the lines of a page into one string, so a pattern that uses
+    # anchors or word boundaries can match a single span and still miss on
+    # the page text; skipping the page in that case loses valid matches.
+    #
+    # NOTE: text that the PDF splits across several spans is still not found
+    # as a whole. A more robust approach would be
+    #   1. get all text of the page (with positions)
+    #   2. run the regex on the full text
+    #   3. map each match back to a bbox
+    #   4. collect the spans inside that bbox
+    page_meta = page.get_textpage().extractDICT()
+
+    return [
+        spn for spn in _iter_spans(page_meta)
+        if regex.search(spn.get('text', ""))
+    ]
+
+
+def _iter_spans(page_dict: dict):
+    """Yield every span of a page dictionary, block by block, line by line
+
+    Argument
+      page_dict: the dictionary returned by extractDICT() for a page
+      {
+        'blocks': [{'lines': [{'spans': [dict]}]}]
+      }
+    Returns
+      a generator of span dictionaries; missing keys count as empty lists
+    """
+    for blk in page_dict.get('blocks', []):
+        for ln in blk.get('lines', []):
+            for spn in ln.get('spans', []):
+                yield spn
+
+
+def to_bools(var: int) -> str:
+    """Spell the truth value of `var != 0` as a lowercase bool string"""
+    return "true" if var != 0 else "false"
+
+
+def dump_meta(spn: dict) -> str:
+    """Dump the span dict from PyMuPDF to TOML compatible string"""
+    result = []
+
+    if 'page_index' in spn:
+        result.append(f"page.index = {spn['page_index']}")
+    if 'page_label' in spn:
+        # escape the label like any other string so the line stays valid TOML
+        result.append(f"page.label = {_dump_str(spn['page_label'])}")
+
+    result.append(f"font.name = {_dump_str(spn['font'])}")
+    result.append(f"font.size = {_dump_float(spn['size'])}")
+    result.append(f"font.color = {spn['color']:#08x}")
+
+    flags = spn['flags']
+    result.append(f"font.superscript = {to_bools(flags & 0b00001)}")
+    result.append(f"font.italic = {to_bools(flags & 0b00010)}")
+    result.append(f"font.serif = {to_bools(flags & 0b00100)}")
+    result.append(f"font.monospace = {to_bools(flags & 0b01000)}")
+    result.append(f"font.bold = {to_bools(flags & 0b10000)}")
+
+    bbox = spn['bbox']
+
+    result.append(f"bbox.left = {_dump_float(bbox[0])}")
+    result.append(f"bbox.top = {_dump_float(bbox[1])}")
+    result.append(f"bbox.right = {_dump_float(bbox[2])}")
+    result.append(f"bbox.bottom = {_dump_float(bbox[3])}")
+
+    return '\n'.join(result)
+
+
+def dump_toml(spn: dict, level: int, trail_nl: bool = False) -> str:
+    """Render a span as a `[[heading]]` filter table for a pdftocgen recipe
+
+    Only the font name and size are active; every other attribute is written
+    as a commented-out line.
+
+    Argument
+      spn: span dict of the heading
+      level: heading level
+      trail_nl: add trailing new line
+    Returns
+      a valid toml string
+    """
+    lines = [
+        "[[heading]]",
+        f"# {spn.get('text', '')}",
+        f"level = {level}",
+        "greedy = true",
+    ]
+
+    # drop the font subset prefix, i.e. everything up to the first '+'
+    font = spn['font'].split('+', 1)[-1]
+
+    lines += [
+        f"font.name = {_dump_str(font)}",
+        f"font.size = {_dump_float(spn['size'])}",
+        "# font.size_tolerance = 1e-5",
+        f"# font.color = {spn['color']:#08x}",
+    ]
+
+    # the remaining attributes are emitted as commented-out lines
+    flags = spn['flags']
+
+    for bit, attr in ((0b00001, 'superscript'), (0b00010, 'italic'),
+                      (0b00100, 'serif'), (0b01000, 'monospace'),
+                      (0b10000, 'bold')):
+        lines.append(f"# font.{attr} = {to_bools(flags & bit)}")
+
+    bbox = spn['bbox']
+    for idx, side in enumerate(('left', 'top', 'right', 'bottom')):
+        lines.append(f"# bbox.{side} = {_dump_float(bbox[idx])}")
+    lines.append("# bbox.tolerance = 1e-5")
+
+    if trail_nl:
+        lines.append("")
+
+    return '\n'.join(lines)
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000000000000000000000000000000000000..a86919f10476ffc87b01327ec1399cdd1bf39d9d
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,534 @@
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+
+[[package]]
+name = "args"
+version = "0.1.0"
+description = "Command Arguments for Humans."
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+ {file = "args-0.1.0.tar.gz", hash = "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"},
+]
+
+[[package]]
+name = "astroid"
+version = "2.11.7"
+description = "An abstract syntax tree for Python with inference support."
+category = "dev"
+optional = false
+python-versions = ">=3.6.2"
+files = [
+ {file = "astroid-2.11.7-py3-none-any.whl", hash = "sha256:86b0a340a512c65abf4368b80252754cda17c02cdbbd3f587dddf98112233e7b"},
+ {file = "astroid-2.11.7.tar.gz", hash = "sha256:bb24615c77f4837c707669d16907331374ae8a964650a66999da3f5ca68dc946"},
+]
+
+[package.dependencies]
+lazy-object-proxy = ">=1.4.0"
+setuptools = ">=20.0"
+typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""}
+typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""}
+wrapt = ">=1.11,<2"
+
+[[package]]
+name = "chardet"
+version = "5.1.0"
+description = "Universal encoding detector for Python 3"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"},
+ {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"},
+]
+
+[[package]]
+name = "clint"
+version = "0.5.1"
+description = "Python Command Line Interface Tools"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+ {file = "clint-0.5.1.tar.gz", hash = "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa"},
+]
+
+[package.dependencies]
+args = "*"
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+files = [
+ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+
+[[package]]
+name = "coverage"
+version = "7.2.3"
+description = "Code coverage measurement for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "coverage-7.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e58c0d41d336569d63d1b113bd573db8363bc4146f39444125b7f8060e4e04f5"},
+ {file = "coverage-7.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:344e714bd0fe921fc72d97404ebbdbf9127bac0ca1ff66d7b79efc143cf7c0c4"},
+ {file = "coverage-7.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974bc90d6f6c1e59ceb1516ab00cf1cdfbb2e555795d49fa9571d611f449bcb2"},
+ {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0743b0035d4b0e32bc1df5de70fba3059662ace5b9a2a86a9f894cfe66569013"},
+ {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d0391fb4cfc171ce40437f67eb050a340fdbd0f9f49d6353a387f1b7f9dd4fa"},
+ {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a42e1eff0ca9a7cb7dc9ecda41dfc7cbc17cb1d02117214be0561bd1134772b"},
+ {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:be19931a8dcbe6ab464f3339966856996b12a00f9fe53f346ab3be872d03e257"},
+ {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72fcae5bcac3333a4cf3b8f34eec99cea1187acd55af723bcbd559adfdcb5535"},
+ {file = "coverage-7.2.3-cp310-cp310-win32.whl", hash = "sha256:aeae2aa38395b18106e552833f2a50c27ea0000122bde421c31d11ed7e6f9c91"},
+ {file = "coverage-7.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:83957d349838a636e768251c7e9979e899a569794b44c3728eaebd11d848e58e"},
+ {file = "coverage-7.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfd393094cd82ceb9b40df4c77976015a314b267d498268a076e940fe7be6b79"},
+ {file = "coverage-7.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182eb9ac3f2b4874a1f41b78b87db20b66da6b9cdc32737fbbf4fea0c35b23fc"},
+ {file = "coverage-7.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bb1e77a9a311346294621be905ea8a2c30d3ad371fc15bb72e98bfcfae532df"},
+ {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0f34363e2634deffd390a0fef1aa99168ae9ed2af01af4a1f5865e362f8623"},
+ {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55416d7385774285b6e2a5feca0af9652f7f444a4fa3d29d8ab052fafef9d00d"},
+ {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:06ddd9c0249a0546997fdda5a30fbcb40f23926df0a874a60a8a185bc3a87d93"},
+ {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fff5aaa6becf2c6a1699ae6a39e2e6fb0672c2d42eca8eb0cafa91cf2e9bd312"},
+ {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ea53151d87c52e98133eb8ac78f1206498c015849662ca8dc246255265d9c3c4"},
+ {file = "coverage-7.2.3-cp311-cp311-win32.whl", hash = "sha256:8f6c930fd70d91ddee53194e93029e3ef2aabe26725aa3c2753df057e296b925"},
+ {file = "coverage-7.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa546d66639d69aa967bf08156eb8c9d0cd6f6de84be9e8c9819f52ad499c910"},
+ {file = "coverage-7.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b2317d5ed777bf5a033e83d4f1389fd4ef045763141d8f10eb09a7035cee774c"},
+ {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be9824c1c874b73b96288c6d3de793bf7f3a597770205068c6163ea1f326e8b9"},
+ {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3b2803e730dc2797a017335827e9da6da0e84c745ce0f552e66400abdfb9a1"},
+ {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f69770f5ca1994cb32c38965e95f57504d3aea96b6c024624fdd5bb1aa494a1"},
+ {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1127b16220f7bfb3f1049ed4a62d26d81970a723544e8252db0efde853268e21"},
+ {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:aa784405f0c640940595fa0f14064d8e84aff0b0f762fa18393e2760a2cf5841"},
+ {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3146b8e16fa60427e03884301bf8209221f5761ac754ee6b267642a2fd354c48"},
+ {file = "coverage-7.2.3-cp37-cp37m-win32.whl", hash = "sha256:1fd78b911aea9cec3b7e1e2622c8018d51c0d2bbcf8faaf53c2497eb114911c1"},
+ {file = "coverage-7.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f3736a5d34e091b0a611964c6262fd68ca4363df56185902528f0b75dbb9c1f"},
+ {file = "coverage-7.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:981b4df72c93e3bc04478153df516d385317628bd9c10be699c93c26ddcca8ab"},
+ {file = "coverage-7.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0045f8f23a5fb30b2eb3b8a83664d8dc4fb58faddf8155d7109166adb9f2040"},
+ {file = "coverage-7.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f760073fcf8f3d6933178d67754f4f2d4e924e321f4bb0dcef0424ca0215eba1"},
+ {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c86bd45d1659b1ae3d0ba1909326b03598affbc9ed71520e0ff8c31a993ad911"},
+ {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:172db976ae6327ed4728e2507daf8a4de73c7cc89796483e0a9198fd2e47b462"},
+ {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d2a3a6146fe9319926e1d477842ca2a63fe99af5ae690b1f5c11e6af074a6b5c"},
+ {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f649dd53833b495c3ebd04d6eec58479454a1784987af8afb77540d6c1767abd"},
+ {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7c4ed4e9f3b123aa403ab424430b426a1992e6f4c8fd3cb56ea520446e04d152"},
+ {file = "coverage-7.2.3-cp38-cp38-win32.whl", hash = "sha256:eb0edc3ce9760d2f21637766c3aa04822030e7451981ce569a1b3456b7053f22"},
+ {file = "coverage-7.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:63cdeaac4ae85a179a8d6bc09b77b564c096250d759eed343a89d91bce8b6367"},
+ {file = "coverage-7.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20d1a2a76bb4eb00e4d36b9699f9b7aba93271c9c29220ad4c6a9581a0320235"},
+ {file = "coverage-7.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ea748802cc0de4de92ef8244dd84ffd793bd2e7be784cd8394d557a3c751e21"},
+ {file = "coverage-7.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b154aba06df42e4b96fc915512ab39595105f6c483991287021ed95776d934"},
+ {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd214917cabdd6f673a29d708574e9fbdb892cb77eb426d0eae3490d95ca7859"},
+ {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2e58e45fe53fab81f85474e5d4d226eeab0f27b45aa062856c89389da2f0d9"},
+ {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87ecc7c9a1a9f912e306997ffee020297ccb5ea388421fe62a2a02747e4d5539"},
+ {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:387065e420aed3c71b61af7e82c7b6bc1c592f7e3c7a66e9f78dd178699da4fe"},
+ {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ea3f5bc91d7d457da7d48c7a732beaf79d0c8131df3ab278e6bba6297e23c6c4"},
+ {file = "coverage-7.2.3-cp39-cp39-win32.whl", hash = "sha256:ae7863a1d8db6a014b6f2ff9c1582ab1aad55a6d25bac19710a8df68921b6e30"},
+ {file = "coverage-7.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f04becd4fcda03c0160d0da9c8f0c246bc78f2f7af0feea1ec0930e7c93fa4a"},
+ {file = "coverage-7.2.3-pp37.pp38.pp39-none-any.whl", hash = "sha256:965ee3e782c7892befc25575fa171b521d33798132692df428a09efacaffe8d0"},
+ {file = "coverage-7.2.3.tar.gz", hash = "sha256:d298c2815fa4891edd9abe5ad6e6cb4207104c7dd9fd13aea3fdebf6f9b91259"},
+]
+
+[package.extras]
+toml = ["tomli"]
+
+[[package]]
+name = "dill"
+version = "0.3.6"
+description = "serialize all of python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "dill-0.3.6-py3-none-any.whl", hash = "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0"},
+ {file = "dill-0.3.6.tar.gz", hash = "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"},
+]
+
+[package.extras]
+graph = ["objgraph (>=1.7.2)"]
+
+[[package]]
+name = "isort"
+version = "5.11.5"
+description = "A Python utility / library to sort Python imports."
+category = "dev"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+ {file = "isort-5.11.5-py3-none-any.whl", hash = "sha256:ba1d72fb2595a01c7895a5128f9585a5cc4b6d395f1c8d514989b9a7eb2a8746"},
+ {file = "isort-5.11.5.tar.gz", hash = "sha256:6be1f76a507cb2ecf16c7cf14a37e41609ca082330be4e3436a18ef74add55db"},
+]
+
+[package.extras]
+colors = ["colorama (>=0.4.3,<0.5.0)"]
+pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"]
+plugins = ["setuptools"]
+requirements-deprecated-finder = ["pip-api", "pipreqs"]
+
+[[package]]
+name = "jedi"
+version = "0.17.2"
+description = "An autocompletion tool for Python that can be used for text editors."
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+ {file = "jedi-0.17.2-py2.py3-none-any.whl", hash = "sha256:98cc583fa0f2f8304968199b01b6b4b94f469a1f4a74c1560506ca2a211378b5"},
+ {file = "jedi-0.17.2.tar.gz", hash = "sha256:86ed7d9b750603e4ba582ea8edc678657fb4007894a12bcf6f4bb97892f31d20"},
+]
+
+[package.dependencies]
+parso = ">=0.7.0,<0.8.0"
+
+[package.extras]
+qa = ["flake8 (==3.7.9)"]
+testing = ["Django (<3.1)", "colorama", "docopt", "pytest (>=3.9.0,<5.0.0)"]
+
+[[package]]
+name = "lazy-object-proxy"
+version = "1.9.0"
+description = "A fast and thorough lazy object proxy."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "lazy-object-proxy-1.9.0.tar.gz", hash = "sha256:659fb5809fa4629b8a1ac5106f669cfc7bef26fbb389dda53b3e010d1ac4ebae"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b40387277b0ed2d0602b8293b94d7257e17d1479e257b4de114ea11a8cb7f2d7"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8c6cfb338b133fbdbc5cfaa10fe3c6aeea827db80c978dbd13bc9dd8526b7d4"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:721532711daa7db0d8b779b0bb0318fa87af1c10d7fe5e52ef30f8eff254d0cd"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66a3de4a3ec06cd8af3f61b8e1ec67614fbb7c995d02fa224813cb7afefee701"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1aa3de4088c89a1b69f8ec0dcc169aa725b0ff017899ac568fe44ddc1396df46"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-win32.whl", hash = "sha256:f0705c376533ed2a9e5e97aacdbfe04cecd71e0aa84c7c0595d02ef93b6e4455"},
+ {file = "lazy_object_proxy-1.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea806fd4c37bf7e7ad82537b0757999264d5f70c45468447bb2b91afdbe73a6e"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:946d27deaff6cf8452ed0dba83ba38839a87f4f7a9732e8f9fd4107b21e6ff07"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a31b086e7e68b24b99b23d57723ef7e2c6d81ed21007b6281ebcd1688acb0a"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f699ac1c768270c9e384e4cbd268d6e67aebcfae6cd623b4d7c3bfde5a35db59"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfb38f9ffb53b942f2b5954e0f610f1e721ccebe9cce9025a38c8ccf4a5183a4"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:189bbd5d41ae7a498397287c408617fe5c48633e7755287b21d741f7db2706a9"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-win32.whl", hash = "sha256:81fc4d08b062b535d95c9ea70dbe8a335c45c04029878e62d744bdced5141586"},
+ {file = "lazy_object_proxy-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:f2457189d8257dd41ae9b434ba33298aec198e30adf2dcdaaa3a28b9994f6adb"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d9e25ef10a39e8afe59a5c348a4dbf29b4868ab76269f81ce1674494e2565a6e"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cbf9b082426036e19c6924a9ce90c740a9861e2bdc27a4834fd0a910742ac1e8"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5fa4a61ce2438267163891961cfd5e32ec97a2c444e5b842d574251ade27d2"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8fa02eaab317b1e9e03f69aab1f91e120e7899b392c4fc19807a8278a07a97e8"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e7c21c95cae3c05c14aafffe2865bbd5e377cfc1348c4f7751d9dc9a48ca4bda"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win32.whl", hash = "sha256:f12ad7126ae0c98d601a7ee504c1122bcef553d1d5e0c3bfa77b16b3968d2734"},
+ {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win_amd64.whl", hash = "sha256:edd20c5a55acb67c7ed471fa2b5fb66cb17f61430b7a6b9c3b4a1e40293b1671"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0daa332786cf3bb49e10dc6a17a52f6a8f9601b4cf5c295a4f85854d61de63"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cd077f3d04a58e83d04b20e334f678c2b0ff9879b9375ed107d5d07ff160171"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c94ea760b3ce47d1855a30984c78327500493d396eac4dfd8bd82041b22be"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:212774e4dfa851e74d393a2370871e174d7ff0ebc980907723bb67d25c8a7c30"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0117049dd1d5635bbff65444496c90e0baa48ea405125c088e93d9cf4525b11"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-win32.whl", hash = "sha256:0a891e4e41b54fd5b8313b96399f8b0e173bbbfc03c7631f01efbe29bb0bcf82"},
+ {file = "lazy_object_proxy-1.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:9990d8e71b9f6488e91ad25f322898c136b008d87bf852ff65391b004da5e17b"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e7551208b2aded9c1447453ee366f1c4070602b3d932ace044715d89666899b"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f83ac4d83ef0ab017683d715ed356e30dd48a93746309c8f3517e1287523ef4"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7322c3d6f1766d4ef1e51a465f47955f1e8123caee67dd641e67d539a534d006"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:18b78ec83edbbeb69efdc0e9c1cb41a3b1b1ed11ddd8ded602464c3fc6020494"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:09763491ce220c0299688940f8dc2c5d05fd1f45af1e42e636b2e8b2303e4382"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-win32.whl", hash = "sha256:9090d8e53235aa280fc9239a86ae3ea8ac58eff66a705fa6aa2ec4968b95c821"},
+ {file = "lazy_object_proxy-1.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:db1c1722726f47e10e0b5fdbf15ac3b8adb58c091d12b3ab713965795036985f"},
+]
+
+[[package]]
+name = "mamba"
+version = "0.11.2"
+description = "The definitive testing tool for Python. Born under the banner of Behavior Driven Development."
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+ {file = "mamba-0.11.2.tar.gz", hash = "sha256:75cfc6dfd287dcccaf86dd753cf48e0a7337487c7c3fafda05a6a67ded6da496"},
+]
+
+[package.dependencies]
+clint = "*"
+coverage = "*"
+
+[[package]]
+name = "mccabe"
+version = "0.7.0"
+description = "McCabe checker, plugin for flake8"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
+ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
+]
+
+[[package]]
+name = "parso"
+version = "0.7.1"
+description = "A Python Parser"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "parso-0.7.1-py2.py3-none-any.whl", hash = "sha256:97218d9159b2520ff45eb78028ba8b50d2bc61dcc062a9682666f2dc4bd331ea"},
+ {file = "parso-0.7.1.tar.gz", hash = "sha256:caba44724b994a8a5e086460bb212abc5a8bc46951bf4a9a1210745953622eb9"},
+]
+
+[package.extras]
+testing = ["docopt", "pytest (>=3.0.7)"]
+
+[[package]]
+name = "platformdirs"
+version = "3.2.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "platformdirs-3.2.0-py3-none-any.whl", hash = "sha256:ebe11c0d7a805086e99506aa331612429a72ca7cd52a1f0d277dc4adc20cb10e"},
+ {file = "platformdirs-3.2.0.tar.gz", hash = "sha256:d5b638ca397f25f979350ff789db335903d7ea010ab28903f57b27e1b16c2b08"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.5", markers = "python_version < \"3.8\""}
+
+[package.extras]
+docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
+
+[[package]]
+name = "pylint"
+version = "2.13.9"
+description = "python code static checker"
+category = "dev"
+optional = false
+python-versions = ">=3.6.2"
+files = [
+ {file = "pylint-2.13.9-py3-none-any.whl", hash = "sha256:705c620d388035bdd9ff8b44c5bcdd235bfb49d276d488dd2c8ff1736aa42526"},
+ {file = "pylint-2.13.9.tar.gz", hash = "sha256:095567c96e19e6f57b5b907e67d265ff535e588fe26b12b5ebe1fc5645b2c731"},
+]
+
+[package.dependencies]
+astroid = ">=2.11.5,<=2.12.0-dev0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+dill = ">=0.2"
+isort = ">=4.2.5,<6"
+mccabe = ">=0.6,<0.8"
+platformdirs = ">=2.2.0"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""}
+
+[package.extras]
+testutil = ["gitpython (>3)"]
+
+[[package]]
+name = "pymupdf"
+version = "1.22.1"
+description = "Python bindings for the PDF toolkit and renderer MuPDF"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6bda7a64a1263f1c2b6421ae8803db50d4c8a67de95e05d7a38c313de913b0de"},
+ {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b5f62ad244b04b7aa5e7d50b06b8bbc582b2f1d0f2c66013051463d63dfe6c5e"},
+ {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce633b9d522528959988647dfbd2c9144ad5422dd75e89e60039da36a412fd3c"},
+ {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:733e7b87765ea55202b042b7c84c6b94185ee29fe3a2bd2ee02681c0fd584033"},
+ {file = "PyMuPDF-1.22.1-cp310-cp310-win32.whl", hash = "sha256:701499f0a17ccc8dd80707dbeb3a2e60657a6bdc05be7c8c69fa60eb134e1805"},
+ {file = "PyMuPDF-1.22.1-cp310-cp310-win_amd64.whl", hash = "sha256:81fa90d157ef7b2ecd72eedafe9db56d3b0f8c3b392d7a2057f659bfcc1f7cad"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4edac1dd8e5c35b55420925b5486bec4427b07a073cd03f6081b7234ed37217e"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7744b9853fc55df75f6d37a376432eddd450c1d2072f6ef66b392b7229bccdc6"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:711adc70d664cdd5d361154bb3485546eaa5e8a90827db6abf9c42ca292aa9e1"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d77a3057ad7fc3e2e02e5fedd53199206a49c4b4c5e3ee75458c17d6b739cb"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-win32.whl", hash = "sha256:b5eca48ea55eafcea68b14669a9f5030c15056431b10710d863de9f9a6b1a0ce"},
+ {file = "PyMuPDF-1.22.1-cp311-cp311-win_amd64.whl", hash = "sha256:8e0bfbd6195f45326f9182fff04ac2af9568d78fc1f32dcfa15f84a302d8aafe"},
+ {file = "PyMuPDF-1.22.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:440efca115e70c8cdfc492e98b182e24c565d8e68f26754e28e61cf108a915d9"},
+ {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a70ab2d38b366c7237adce7d54f3028a7825f165a73c137a1746a6b592d26bb2"},
+ {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e4a924ffecb8046fbfe7dff9b69f9938389f094dccab07a378850bf9f889c62"},
+ {file = "PyMuPDF-1.22.1-cp37-cp37m-win32.whl", hash = "sha256:24e66c2ff4d6cfee5b082c3e2c92b40214799888bf2efcca1f70108c3dfedddb"},
+ {file = "PyMuPDF-1.22.1-cp37-cp37m-win_amd64.whl", hash = "sha256:51504bfa2ee207c5c1a38d47b4b91af1bacbd8937b959d947d81fc8f7e023bd8"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:219337a3be00df2bf65071d5e4e1e6759afd06310d4ec7b1c9694a5b03b5d8d6"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:050719cb42a8847d564af1d8509d7290176e7c4fde6da7be5751303fa8237aed"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5871b9e38e68b92533fb7c6fbe3eb7b059f5071d4c2e3ff51cedcc73c994afbc"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5a0332d6dac4ebf32cb7f0c8639b22b56c9475cb87bc0a0361f9cdc9c2d08a1"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-win32.whl", hash = "sha256:127985812c4a2f0106375c4f4916ca68c1559d6b224a050ce75393e454333995"},
+ {file = "PyMuPDF-1.22.1-cp38-cp38-win_amd64.whl", hash = "sha256:99764c46fb8df253a3ea9fbb13b132f205561d6227b0d00e673998b18d7280eb"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fdb21332d28567e278008dd6130564ac0f5de8aff364a1e7809a70a0f969df26"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:88202e42d957a41deff212dcb1d8e16e469d21d09a72ab372ee2f173a22112c8"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36b7fd85f5813045f10b65caf4cbdad03b51b07076f07b205853a1e44c898e34"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45e601f7b1ee2a0c1a261bb0179eba4a9899117404eccf0a573e6497ed507ea8"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-win32.whl", hash = "sha256:c610acdbd2f2d994130341559f26c098df546a1fc187adee3b63a0f489310808"},
+ {file = "PyMuPDF-1.22.1-cp39-cp39-win_amd64.whl", hash = "sha256:af1e6d5dd122c097f23a7e89f8c2197310e85a4c8e8f63ff94444188d9bc0a4e"},
+ {file = "PyMuPDF-1.22.1.tar.gz", hash = "sha256:ad34bba78ce147cee50e1dc30fa16f29135a4c3d6a2b1c1b0403ebbcc9fbe4be"},
+]
+
+[[package]]
+name = "setuptools"
+version = "67.7.0"
+description = "Easily download, build, install, upgrade, and uninstall Python packages"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "setuptools-67.7.0-py3-none-any.whl", hash = "sha256:888be97fde8cc3afd60f7784e678fa29ee13c4e5362daa7104a93bba33646c50"},
+ {file = "setuptools-67.7.0.tar.gz", hash = "sha256:b7e53a01c6c654d26d2999ee033d8c6125e5fa55f03b7b193f937ae7ac999f22"},
+]
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+
+[[package]]
+name = "toml"
+version = "0.10.2"
+description = "Python Library for Tom's Obvious, Minimal Language"
+category = "main"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+files = [
+ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
+ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
+]
+
+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
+
+[[package]]
+name = "typed-ast"
+version = "1.5.4"
+description = "a fork of Python 2 and 3 ast modules with type comment support"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "typed_ast-1.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:669dd0c4167f6f2cd9f57041e03c3c2ebf9063d0757dc89f79ba1daa2bfca9d4"},
+ {file = "typed_ast-1.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:211260621ab1cd7324e0798d6be953d00b74e0428382991adfddb352252f1d62"},
+ {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:267e3f78697a6c00c689c03db4876dd1efdfea2f251a5ad6555e82a26847b4ac"},
+ {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c542eeda69212fa10a7ada75e668876fdec5f856cd3d06829e6aa64ad17c8dfe"},
+ {file = "typed_ast-1.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:a9916d2bb8865f973824fb47436fa45e1ebf2efd920f2b9f99342cb7fab93f72"},
+ {file = "typed_ast-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79b1e0869db7c830ba6a981d58711c88b6677506e648496b1f64ac7d15633aec"},
+ {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a94d55d142c9265f4ea46fab70977a1944ecae359ae867397757d836ea5a3f47"},
+ {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:183afdf0ec5b1b211724dfef3d2cad2d767cbefac291f24d69b00546c1837fb6"},
+ {file = "typed_ast-1.5.4-cp36-cp36m-win_amd64.whl", hash = "sha256:639c5f0b21776605dd6c9dbe592d5228f021404dafd377e2b7ac046b0349b1a1"},
+ {file = "typed_ast-1.5.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cf4afcfac006ece570e32d6fa90ab74a17245b83dfd6655a6f68568098345ff6"},
+ {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed855bbe3eb3715fca349c80174cfcfd699c2f9de574d40527b8429acae23a66"},
+ {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6778e1b2f81dfc7bc58e4b259363b83d2e509a65198e85d5700dfae4c6c8ff1c"},
+ {file = "typed_ast-1.5.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0261195c2062caf107831e92a76764c81227dae162c4f75192c0d489faf751a2"},
+ {file = "typed_ast-1.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2efae9db7a8c05ad5547d522e7dbe62c83d838d3906a3716d1478b6c1d61388d"},
+ {file = "typed_ast-1.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7d5d014b7daa8b0bf2eaef684295acae12b036d79f54178b92a2b6a56f92278f"},
+ {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:370788a63915e82fd6f212865a596a0fefcbb7d408bbbb13dea723d971ed8bdc"},
+ {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e964b4ff86550a7a7d56345c7864b18f403f5bd7380edf44a3c1fb4ee7ac6c6"},
+ {file = "typed_ast-1.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:683407d92dc953c8a7347119596f0b0e6c55eb98ebebd9b23437501b28dcbb8e"},
+ {file = "typed_ast-1.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4879da6c9b73443f97e731b617184a596ac1235fe91f98d279a7af36c796da35"},
+ {file = "typed_ast-1.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e123d878ba170397916557d31c8f589951e353cc95fb7f24f6bb69adc1a8a97"},
+ {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebd9d7f80ccf7a82ac5f88c521115cc55d84e35bf8b446fcd7836eb6b98929a3"},
+ {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98f80dee3c03455e92796b58b98ff6ca0b2a6f652120c263efdba4d6c5e58f72"},
+ {file = "typed_ast-1.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:0fdbcf2fef0ca421a3f5912555804296f0b0960f0418c440f5d6d3abb549f3e1"},
+ {file = "typed_ast-1.5.4.tar.gz", hash = "sha256:39e21ceb7388e4bb37f4c679d72707ed46c2fbf2a5609b8b8ebc4b067d977df2"},
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.5.0"
+description = "Backported and Experimental Type Hints for Python 3.7+"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"},
+ {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"},
+]
+
+[[package]]
+name = "wrapt"
+version = "1.15.0"
+description = "Module for decorators, wrappers and monkey patching."
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
+files = [
+ {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"},
+ {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"},
+ {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"},
+ {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"},
+ {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"},
+ {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"},
+ {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"},
+ {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"},
+ {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"},
+ {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"},
+ {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"},
+ {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"},
+ {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"},
+ {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"},
+ {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"},
+ {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"},
+ {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"},
+ {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"},
+ {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"},
+ {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"},
+ {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"},
+ {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"},
+ {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"},
+ {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"},
+ {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"},
+ {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"},
+ {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"},
+ {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"},
+ {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"},
+ {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"},
+ {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"},
+ {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"},
+ {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"},
+ {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"},
+ {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"},
+ {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"},
+ {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"},
+ {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"},
+ {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"},
+ {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"},
+ {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"},
+ {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"},
+ {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"},
+ {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"},
+ {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"},
+ {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"},
+ {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"},
+ {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"},
+ {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"},
+ {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"},
+ {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"},
+ {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"},
+ {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"},
+ {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"},
+ {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"},
+ {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"},
+ {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"},
+ {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"},
+ {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"},
+ {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"},
+ {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"},
+ {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"},
+ {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"},
+ {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"},
+ {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"},
+ {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"},
+ {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"},
+ {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"},
+ {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"},
+ {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"},
+ {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"},
+ {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"},
+ {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"},
+ {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"},
+ {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"},
+]
+
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.7"
+content-hash = "6dd48af9ea10e0d441e2b6ee3dcdea67bd5b4cc0b6c13b672761212decbaa5f6"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..ac1e2619053761b11ab2f5e246587fe4e96e1a60
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,43 @@
+[tool.poetry]
+name = "pdf.tocgen"
+version = "1.3.4"
+description = "Automatically generate table of contents for pdf files"
+authors = ["krasjet"]
+license = "GPL-3.0-or-later"
+readme = "README.md"
+homepage = "https://krasjet.com/voice/pdf.tocgen/"
+repository = "https://github.com/Krasjet/pdf.tocgen"
+keywords = ["pdf", "cli"]
+
+classifiers = [
+ "Development Status :: 3 - Alpha",
+ "Environment :: Console",
+ "Intended Audience :: End Users/Desktop"
+]
+
+packages = [
+ { include = "pdfxmeta" },
+ { include = "pdftocgen" },
+ { include = "pdftocio" },
+ { include = "fitzutils" }
+]
+
+[tool.poetry.dependencies]
+python = "^3.7"
+PyMuPDF = "^1.18.14"
+toml = "^0.10.2"
+chardet = "^5.1.0"
+
+[tool.poetry.dev-dependencies]
+pylint = "^2.5.3"
+jedi = "^0.17.2"
+mamba = "^0.11.1"
+
+[tool.poetry.scripts]
+pdfxmeta = "pdfxmeta.app:main"
+pdftocgen = "pdftocgen.app:main"
+pdftocio = "pdftocio.app:main"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/recipes/README.md b/recipes/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c394485bc24eba8adf10a98f4492aac0a3acb11
--- /dev/null
+++ b/recipes/README.md
@@ -0,0 +1,11 @@
+recipes
+=======
+
+This directory contains some pre-made recipes for `pdftocgen`. It could be a
+good reference if you want to craft your own recipes. Feel free to contribute
+more.
+
+The recipes in this directory are separately licensed under the [CC BY-NC-SA 4.0
+License][cc] to prevent any commercial usage.
+
+[cc]: https://creativecommons.org/licenses/by-nc-sa/4.0/
diff --git a/recipes/default_groff_man.toml b/recipes/default_groff_man.toml
new file mode 100644
index 0000000000000000000000000000000000000000..ab3a4690848d38c09d643a7ae01f5f6622768ade
--- /dev/null
+++ b/recipes/default_groff_man.toml
@@ -0,0 +1,12 @@
+# The recipe for
+# $ man -Tpdf man > out.pdf
+# only tested under groff
+[[heading]]
+level = 1
+font.name = "Times-Bold"
+font.size = 10.949999809265137
+font.superscript = false
+font.italic = false
+font.serif = true
+font.monospace = false
+font.bold = true
diff --git a/recipes/default_groff_ms.toml b/recipes/default_groff_ms.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9e59a1daf1f8c42b3aceeee59700419b905a1017
--- /dev/null
+++ b/recipes/default_groff_ms.toml
@@ -0,0 +1,12 @@
+# The recipe for the default groff_ms, produced by
+# $ groff -ms -Tpdf in.ms > out.pdf
+
+[[heading]]
+level = 1
+font.name = "Times-Bold"
+font.size = 10
+bbox.left = 72
+
+# All the headings (.NH) have the same font attributes, so you need to manually
+# format the heading levels of the toc (for vim users, >> in normal mode will
+# add indentation to a line)
diff --git a/recipes/default_latex.toml b/recipes/default_latex.toml
new file mode 100644
index 0000000000000000000000000000000000000000..f79bacf0d0c7e2b431710240f2e3ff0eb6de1f8a
--- /dev/null
+++ b/recipes/default_latex.toml
@@ -0,0 +1,24 @@
+# The recipe for
+# $ pdflatex in.tex
+# under default styles (Computer Modern, article class)
+
+[[heading]]
+level = 1
+greedy = true
+font.name = "CMBX12"
+font.size = 14.346199989318848
+font.size_tolerance = 0.01
+
+[[heading]]
+level = 2
+greedy = true
+font.name = "CMBX12"
+font.size = 11.9552001953125
+font.size_tolerance = 0.01
+
+[[heading]]
+level = 3
+greedy = true
+font.name = "CMBX10"
+font.size = 9.962599754333496
+font.size_tolerance = 0.01
diff --git a/recipes/ft.toml b/recipes/ft.toml
new file mode 100644
index 0000000000000000000000000000000000000000..eb59c5638bde1ea0a360fa0065c37cc55adeb431
--- /dev/null
+++ b/recipes/ft.toml
@@ -0,0 +1,23 @@
+# The recipe for "Lecture Notes for EE 261" [1] by Brad Osgood
+#
+# [1]: https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf
+# archive: https://web.archive.org/https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf
+
+[[heading]]
+level = 1
+greedy = true
+font.name = "CMBX12"
+font.size = 24.78696060180664
+
+[[heading]]
+level = 2
+greedy = true
+font.name = "CMBX12"
+font.size = 14.346190452575684
+
+[[heading]]
+level = 3
+greedy = true
+font.name = "CMBX12"
+font.size = 11.955169677734375
+
diff --git a/recipes/htdc.toml b/recipes/htdc.toml
new file mode 100644
index 0000000000000000000000000000000000000000..3783190ace58e29965265b9611ad95887b612fd4
--- /dev/null
+++ b/recipes/htdc.toml
@@ -0,0 +1,26 @@
+# The recipe for HtDC [1] by Matthias Felleisen, et al.
+#
+# The output needs some manual cleanup. For example, the table of contents in
+# the original document is incorrectly included in the outline, but those
+# entries should be easy to remove using a text editor.
+#
+# [1]: https://felleisen.org/matthias/HtDC/htdc.pdf
+
+[[heading]]
+level = 1
+font.name = "Palatino-Bold"
+font.size = 17.21540069580078
+font.color = 0x221f1f
+
+[[heading]]
+level = 2
+font.name = "Palatino-Bold"
+font.size = 14.346199989318848
+font.color = 0x221f1f
+
+[[heading]]
+level = 3
+greedy = true
+font.name = "Palatino-Bold"
+font.size = 11.9552001953125
+font.color = 0x221f1f
diff --git a/recipes/onlisp.toml b/recipes/onlisp.toml
new file mode 100644
index 0000000000000000000000000000000000000000..00873f96b8f6bd82e964419ba0e75bbcdc6b9e48
--- /dev/null
+++ b/recipes/onlisp.toml
@@ -0,0 +1,15 @@
+# The recipe for "On Lisp" [1] by Paul Graham
+#
+# Note that you need to download the PDF version. The PDF is well structured
+# and no extra processing is needed.
+# [1]: http://www.paulgraham.com/onlisptext.html
+
+[[heading]]
+level = 1
+font.name = "Times-Bold"
+font.size = 19.92530059814453
+
+[[heading]]
+level = 2
+font.name = "Times-Bold"
+font.size = 11.9552001953125
diff --git a/recipes/recipe.toml b/recipes/recipe.toml
new file mode 100644
index 0000000000000000000000000000000000000000..85a5af2253d5f4db045c3d79c79877277e533e81
--- /dev/null
+++ b/recipes/recipe.toml
@@ -0,0 +1,5 @@
+[[heading]]
+level = 1
+greedy = true
+font.name = "CaslonFiveForty-Roman"
+font.size = 54.10
diff --git a/requirements.txt b/requirements.txt
index 28d994e22f8dd432b51df193562052e315ad95f7..1813feaee654ded47884902fd56ffa99962c83d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
-altair
-pandas
-streamlit
\ No newline at end of file
+streamlit
+pandas
+PyMuPDF==1.25.2
+toml
+chardet
+.
diff --git a/spec/__init__.py b/spec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/spec/cli_spec.sh b/spec/cli_spec.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d65e40dc3e40ddfa5bad92970f2e2f413e8a8dc2
--- /dev/null
+++ b/spec/cli_spec.sh
@@ -0,0 +1,63 @@
+#!/bin/bash -e
+
+SPEC="spec/files"
+
+checkeq() {  # checkeq ACTUAL EXPECTED: diff the two files and print a pass/fail mark
+ if res=$(diff "$1" "$2"); then  # diff exits 0 only when the files are identical
+ echo "[✓]"
+ else
+ echo "[✗]"
+ printf "%s\n" "$res"  # show the captured diff output for the failing check
+ return 1  # non-zero status; with -e from the shebang a failing check stops the script
+ fi
+}
+
+it() {  # print the label for the next check; all arguments are joined into one description
+ printf " it %s " "$*"  # no trailing newline, so the mark echoed by checkeq lands on the same line
+}
+
+printf "pdfxmeta\n"  # section header: pdfxmeta checks
+
+it "extracts metadata correctly"
+checkeq <(pdfxmeta -p 1 "$SPEC/level2.pdf" "Section") \
+ "$SPEC/level2_meta"
+
+it "extracts metadata in auto mode correctly"
+checkeq <(pdfxmeta -a 1 -p 1 "$SPEC/level2.pdf" "Section") \
+ "$SPEC/level2_meta.toml"
+
+printf "\npdftocgen\n"  # section header: pdftocgen checks (recipe is read from stdin)
+
+it "generates toc for 2 level heading correctly"
+checkeq <(pdftocgen "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \
+ "$SPEC/level2.toc"
+
+it "generates toc for one page headings correctly"
+checkeq <(pdftocgen "$SPEC/onepage.pdf" < "$SPEC/onepage_greedy.toml") \
+ "$SPEC/onepage.toc"
+
+it "generates toc for hard mode correctly"
+checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml") \
+ "$SPEC/hardmode.toc"
+
+it "generates readable toc"
+checkeq <(pdftocgen -H "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \
+ "$SPEC/level2_h.toc"
+
+printf "\npdftocio\n"  # section header: pdftocio checks
+
+tmpdir=$(mktemp -d); trap 'rm -rf "$tmpdir"' EXIT  # scratch dir for out.pdf, removed when the script exits
+
+it "adds toc to pdf and prints toc correctly"
+checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml" | \
+ pdftocio -o "$tmpdir/out.pdf" "$SPEC/hardmode.pdf" && \
+ pdftocio -p "$tmpdir/out.pdf") \
+ "$SPEC/hardmode.toc"
+
+it "prints toc when -p is set"  # a toc is supplied on stdin, but the expected output is the PDF's own toc
+checkeq <(pdftocio -p "$SPEC/hastoc.pdf" < "$SPEC/level2.toc") \
+ "$SPEC/hastoc.toc"
+
+it "prints toc vpos when -v is set"
+checkeq <(pdftocio -p -v "$SPEC/hastoc.pdf") \
+ "$SPEC/hastoc_v.toc"
diff --git a/spec/files/Makefile b/spec/files/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bc48f892720067d4b6d07e849ce79366c9da0fc5
--- /dev/null
+++ b/spec/files/Makefile
@@ -0,0 +1,12 @@
+.PHONY: all clean nuke
+
+all: level2.pdf hastoc.pdf onepage.pdf hardmode.pdf
+
+%.pdf: %.tex
+ latexmk -pdf $<
+
+clean:
+ rm -f *.aux *.dvi *.fdb_latexmk *.fls *.log *.out
+
+nuke: clean
+ rm -f *.pdf
diff --git a/spec/files/hardmode.pdf b/spec/files/hardmode.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..aed1b467ef6b4926771892d1c2cc7ade6dd1813b
--- /dev/null
+++ b/spec/files/hardmode.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9be6a1628292675b467b36a503c37ffa4d3073d2ff87d147dced3b3bff394875
+size 110985
diff --git a/spec/files/hardmode.tex b/spec/files/hardmode.tex
new file mode 100644
index 0000000000000000000000000000000000000000..48242a7cd593df79ce1f1142fc8fb617bddd8e08
--- /dev/null
+++ b/spec/files/hardmode.tex
@@ -0,0 +1,68 @@
+\documentclass{article}[12pt]
+
+\usepackage{lipsum}
+\usepackage{multicol}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage[USenglish]{babel}
+\usepackage[stretch=10,shrink=10]{microtype}
+\usepackage[left=1.3in,
+ right=1.3in,
+ top=1in,
+ bottom=1in,
+ footskip=.5in]{geometry}
+\setlength{\columnsep}{0.4in}
+
+\renewcommand{\rmdefault}{zpltlf}
+\usepackage{newpxtext}
+% will mess up embedded symbols
+% \usepackage{newpxmath}
+
+\title{The hard mode}
+\author{krasjet}
+\date{}
+
+\begin{document}
+\begin{multicols}{2}
+[
+ \maketitle
+]
+
+\section{Section One}
+
+\lipsum[2-3]
+
+\section{Section $1 + 1 = 2$}
+
+\lipsum[2-1]
+\begin{align*}
+ x^2 + 2 = 4
+\end{align*}
+\lipsum[2-1]
+
+\subsection{Subsection Two.One}
+\lipsum[2-5]
+
+\section*{$\mathrm{e}^{\ln(3)}$}
+
+\setcounter{section}{3}
+\setcounter{subsection}{0}
+
+\lipsum[1-2]
+
+\subsection{Subsection $\mathrm{e}^{\ln(3)}$.1, with looo\-ooooooooong title}
+\lipsum[2-5]
+
+\subsection{$\mathbb{S}$ubsection Three.Two, another long title}
+\lipsum[1-1]
+
+\subsection{Subsection Three.Three}
+\lipsum[2-3]
+
+\section{The $x \to \infty$ End}
+
+\lipsum[2-2]
+
+\end{multicols}
+
+\end{document}
diff --git a/spec/files/hardmode.toc b/spec/files/hardmode.toc
new file mode 100644
index 0000000000000000000000000000000000000000..aa7398bc42cffb1f98290f1f1d73c5d243f8a342
--- /dev/null
+++ b/spec/files/hardmode.toc
@@ -0,0 +1,8 @@
+"1 Section One" 1
+"2 Section 1 + 1 = 2" 1
+ "2.1 Subsection Two.One" 1
+"e ln(3)" 2
+ "3.1 Subsection e ln(3) .1, with looo- ooooooooong title" 2
+ "3.2 S ubsection Three.Two, another long title" 3
+ "3.3 Subsection Three.Three" 3
+"4 The x → ∞ End" 3
diff --git a/spec/files/hardmode_recipe.toml b/spec/files/hardmode_recipe.toml
new file mode 100644
index 0000000000000000000000000000000000000000..4f67312a7be3dd4dcb5de533a0f413b8b2536e7d
--- /dev/null
+++ b/spec/files/hardmode_recipe.toml
@@ -0,0 +1,18 @@
+[[heading]]
+level = 1
+greedy = true
+font.name = "TeXGyrePagellaX-Bold"
+font.size = 14.346199989318848
+
+[[heading]]
+level = 1
+greedy = true
+font.name = "CMR10"
+font.size = 9.962599754333496
+font.superscript = true
+
+[[heading]]
+level = 2
+greedy = true
+font.name = "TeXGyrePagellaX-Bold"
+font.size = 11.9552001953125
diff --git a/spec/files/hastoc.pdf b/spec/files/hastoc.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2e518d118710edac73ccf3d2c56faadc0aae8e3f
Binary files /dev/null and b/spec/files/hastoc.pdf differ
diff --git a/spec/files/hastoc.tex b/spec/files/hastoc.tex
new file mode 100644
index 0000000000000000000000000000000000000000..1b1f62a47cd02343eeee77ca0549aed8fd6ff789
--- /dev/null
+++ b/spec/files/hastoc.tex
@@ -0,0 +1,42 @@
+\documentclass{article}
+
+\usepackage{lipsum}
+\usepackage{hyperref}
+
+\title{2 Level Heading Test}
+\author{krasjet}
+\date{}
+
+\begin{document}
+\maketitle
+
+\section{Section One}
+
+\lipsum[2-4]
+
+\section{Section Two}
+
+\lipsum[2-5]
+
+\subsection{Subsection Two.One}
+\lipsum[2-5]
+
+\section{Section Three, with looong loooong looong title}
+
+\lipsum[1-2]
+
+\subsection{Subsection Three.One, with even loooooooooooonger title, and
+probably even more}
+\lipsum[2-5]
+
+\subsection{Subsection Three.Two}
+\lipsum[1-1]
+
+\subsection{Subsection Three.Three}
+\lipsum[2-3]
+
+\section{The End}
+
+\lipsum[2-5]
+
+\end{document}
diff --git a/spec/files/hastoc.toc b/spec/files/hastoc.toc
new file mode 100644
index 0000000000000000000000000000000000000000..b1916bbcb3700c1205acbe5e999b4b59b8b4a300
--- /dev/null
+++ b/spec/files/hastoc.toc
@@ -0,0 +1,8 @@
+"Section One" 1
+"Section Two" 1
+ "Subsection Two.One" 2
+"Section Three, with looong loooong looong title" 3
+ "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3
+ "Subsection Three.Two" 4
+ "Subsection Three.Three" 5
+"The End" 5
diff --git a/spec/files/hastoc_v.toc b/spec/files/hastoc_v.toc
new file mode 100644
index 0000000000000000000000000000000000000000..5b82fb6f6cd404540ce48102ae32336e63d78e34
--- /dev/null
+++ b/spec/files/hastoc_v.toc
@@ -0,0 +1,8 @@
+"Section One" 1 234.65998
+"Section Two" 1 562.148
+ "Subsection Two.One" 2 449.522
+"Section Three, with looong loooong looong title" 3 330.333
+ "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 616.444
+ "Subsection Three.Two" 4 509.298
+ "Subsection Three.Three" 5 124.802
+"The End" 5 361.387
diff --git a/spec/files/level2.pdf b/spec/files/level2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1abfde38371e99a859bc831c240170470fade538
Binary files /dev/null and b/spec/files/level2.pdf differ
diff --git a/spec/files/level2.tex b/spec/files/level2.tex
new file mode 100644
index 0000000000000000000000000000000000000000..9457c2ab681ecff3e0b9ba4b18fd2341819108e3
--- /dev/null
+++ b/spec/files/level2.tex
@@ -0,0 +1,41 @@
+\documentclass{article}
+
+\usepackage{lipsum}
+
+\title{2 Level Heading Test}
+\author{krasjet}
+\date{}
+
+\begin{document}
+\maketitle
+
+\section{Section One}
+
+\lipsum[2-4]
+
+\section{Section Two}
+
+\lipsum[2-5]
+
+\subsection{Subsection Two.One}
+\lipsum[2-5]
+
+\section{Section Three, with looong loooong looong title}
+
+\lipsum[1-2]
+
+\subsection{Subsection Three.One, with even loooooooooooonger title, and
+probably even more}
+\lipsum[2-5]
+
+\subsection{Subsection Three.Two}
+\lipsum[1-1]
+
+\subsection{Subsection Three.Three}
+\lipsum[2-3]
+
+\section{The End}
+
+\lipsum[2-5]
+
+\end{document}
diff --git a/spec/files/level2.toc b/spec/files/level2.toc
new file mode 100644
index 0000000000000000000000000000000000000000..ba9e8df8f2a7a3c97d7386b2ac7ed359eb4ca8e7
--- /dev/null
+++ b/spec/files/level2.toc
@@ -0,0 +1,8 @@
+"1 Section One" 1
+"2 Section Two" 1
+ "2.1 Subsection Two.One" 2
+"3 Section Three, with looong loooong looong ti- tle" 3
+ "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 3
+ "3.2 Subsection Three.Two" 4
+ "3.3 Subsection Three.Three" 5
+"4 The End" 5
diff --git a/spec/files/level2_h.toc b/spec/files/level2_h.toc
new file mode 100644
index 0000000000000000000000000000000000000000..cd65d7d2cdd2fb36a70a633ec42632d5fb0402a0
--- /dev/null
+++ b/spec/files/level2_h.toc
@@ -0,0 +1,8 @@
+1 Section One ··· 1
+2 Section Two ··· 1
+ 2.1 Subsection Two.One ··· 2
+3 Section Three, with looong loooong looong ti- tle ··· 3
+ 3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more ··· 3
+ 3.2 Subsection Three.Two ··· 4
+ 3.3 Subsection Three.Three ··· 5
+4 The End ··· 5
diff --git a/spec/files/level2_meta b/spec/files/level2_meta
new file mode 100644
index 0000000000000000000000000000000000000000..488e41923de2458207a2fedcb56a1092498d9149
--- /dev/null
+++ b/spec/files/level2_meta
@@ -0,0 +1,26 @@
+Section One:
+ font.name = "CMBX12"
+ font.size = 14.346199989318848
+ font.color = 0x000000
+ font.superscript = false
+ font.italic = false
+ font.serif = true
+ font.monospace = false
+ font.bold = true
+ bbox.left = 157.98439025878906
+ bbox.top = 237.6484375
+ bbox.right = 243.12905883789062
+ bbox.bottom = 252.00897216796875
+Section Two:
+ font.name = "CMBX12"
+ font.size = 14.346199989318848
+ font.color = 0x000000
+ font.superscript = false
+ font.italic = false
+ font.serif = true
+ font.monospace = false
+ font.bold = true
+ bbox.left = 157.98439025878906
+ bbox.top = 567.3842163085938
+ bbox.right = 245.18057250976562
+ bbox.bottom = 581.7447509765625
diff --git a/spec/files/level2_meta.toml b/spec/files/level2_meta.toml
new file mode 100644
index 0000000000000000000000000000000000000000..e7886a91f71667fecbdba97f62f5a5bfc605d710
--- /dev/null
+++ b/spec/files/level2_meta.toml
@@ -0,0 +1,38 @@
+[[heading]]
+# Section One
+level = 1
+greedy = true
+font.name = "CMBX12"
+font.size = 14.346199989318848
+# font.size_tolerance = 1e-5
+# font.color = 0x000000
+# font.superscript = false
+# font.italic = false
+# font.serif = true
+# font.monospace = false
+# font.bold = true
+# bbox.left = 157.98439025878906
+# bbox.top = 237.6484375
+# bbox.right = 243.12905883789062
+# bbox.bottom = 252.00897216796875
+# bbox.tolerance = 1e-5
+
+[[heading]]
+# Section Two
+level = 1
+greedy = true
+font.name = "CMBX12"
+font.size = 14.346199989318848
+# font.size_tolerance = 1e-5
+# font.color = 0x000000
+# font.superscript = false
+# font.italic = false
+# font.serif = true
+# font.monospace = false
+# font.bold = true
+# bbox.left = 157.98439025878906
+# bbox.top = 567.3842163085938
+# bbox.right = 245.18057250976562
+# bbox.bottom = 581.7447509765625
+# bbox.tolerance = 1e-5
+
diff --git a/spec/files/level2_recipe.toml b/spec/files/level2_recipe.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9a69ff7864e4bc658fd44bff50f01735337ff17a
--- /dev/null
+++ b/spec/files/level2_recipe.toml
@@ -0,0 +1,9 @@
+[[heading]]
+level = 1
+font.name = "CMBX12"
+font.size = 14.346199989318848
+
+[[heading]]
+level = 2
+font.name = "CMBX12"
+font.size = 11.9552001953125
diff --git a/spec/files/onepage.pdf b/spec/files/onepage.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4e4bfb3d1a90cf9d35cee459af93cb5677307eb5
Binary files /dev/null and b/spec/files/onepage.pdf differ
diff --git a/spec/files/onepage.tex b/spec/files/onepage.tex
new file mode 100644
index 0000000000000000000000000000000000000000..224b3e62a53af1ea1f4148c88e96a15dfd3d2b5b
--- /dev/null
+++ b/spec/files/onepage.tex
@@ -0,0 +1,37 @@
+\documentclass{article}
+
+\usepackage{lipsum}
+
+\title{One page Test}
+\author{krasjet}
+\date{}
+
+\begin{document}
+\maketitle
+
+\section{Section One}
+
+\section{Section Two}
+
+\subsection{Subsection Two.One}
+\subsection{Subsection Two.Two $\times 2$}
+
+\section{Section Three, with looong loooong looong title}
+
+\subsection{Subsection Three.One, with even loooooooooooonger title, and
+probably even more}
+
+\subsection{Subsection Three.Two}
+
+\subsection{Subsection Three.Three}
+\subsubsection{Subsubsection Three.Three.One}
+\subsubsection{Subsubsection Three.Three.Two}
+\subsubsection{Subsubsection Three.Three.Three}
+
+\subsection{Subsection Three.Four}
+
+\subsection{Subsection Three.Five}
+
+\section{The End}
+
+\end{document}
diff --git a/spec/files/onepage.toc b/spec/files/onepage.toc
new file mode 100644
index 0000000000000000000000000000000000000000..cab90366a9af8cc9ee4368806f84ead29dd76b19
--- /dev/null
+++ b/spec/files/onepage.toc
@@ -0,0 +1,14 @@
+"1 Section One" 1
+"2 Section Two" 1
+ "2.1 Subsection Two.One" 1
+ "2.2 Subsection Two.Two × 2" 1
+"3 Section Three, with looong loooong looong ti- tle" 1
+ "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 1
+ "3.2 Subsection Three.Two" 1
+ "3.3 Subsection Three.Three" 1
+ "3.3.1 Subsubsection Three.Three.One" 1
+ "3.3.2 Subsubsection Three.Three.Two" 1
+ "3.3.3 Subsubsection Three.Three.Three" 1
+ "3.4 Subsection Three.Four" 1
+ "3.5 Subsection Three.Five" 1
+"4 The End" 1
diff --git a/spec/files/onepage_greedy.toml b/spec/files/onepage_greedy.toml
new file mode 100644
index 0000000000000000000000000000000000000000..ff645f2f8ab30042c52e9f90c5b1e9189ff5c2d3
--- /dev/null
+++ b/spec/files/onepage_greedy.toml
@@ -0,0 +1,15 @@
+[[heading]]
+level = 1
+font.name = "CMBX12"
+font.size = 14.346199989318848
+
+[[heading]]
+level = 2
+greedy = true
+font.name = "CMBX12"
+font.size = 11.9552001953125
+
+[[heading]]
+level = 3
+font.name = "CMBX10"
+font.size = 9.962599754333496
diff --git a/spec/files/onepage_recipe.toml b/spec/files/onepage_recipe.toml
new file mode 100644
index 0000000000000000000000000000000000000000..1602f87bf67e0667012e11fe6f49e15f20e1ae2e
--- /dev/null
+++ b/spec/files/onepage_recipe.toml
@@ -0,0 +1,14 @@
+[[heading]]
+level = 1
+font.name = "CMBX12"
+font.size = 14.346199989318848
+
+[[heading]]
+level = 2
+font.name = "(CMBX12|CMSY10|CMR12)"
+font.size = 11.9552001953125
+
+[[heading]]
+level = 3
+font.name = "CMBX10"
+font.size = 9.962599754333496
diff --git a/spec/files/recipe_spec.toml b/spec/files/recipe_spec.toml
new file mode 100644
index 0000000000000000000000000000000000000000..fc9bbe76e2bbd0ac0f9780ed5f0c79011f213c1e
--- /dev/null
+++ b/spec/files/recipe_spec.toml
@@ -0,0 +1,33 @@
+[[heading]]
+level = 1
+font.name = "CMBX12"
+font.size = 14.346199989318848
+font.size_tolerance = 1e-5
+font.color = 0x000000
+font.superscript = false
+font.italic = false
+font.serif = true
+font.monospace = false
+font.bold = true
+bbox.left = 157.98439025878906
+bbox.top = 335.569580078125
+bbox.right = 477.66058349609375
+bbox.bottom = 349.93011474609375
+bbox.tolerance = 1e-5
+
+[[heading]]
+level = 2
+font.name = "CMBX10"
+font.size = 9.962599754333496
+font.size_tolerance = 1e-5
+font.color = 0x000000
+font.superscript = false
+font.italic = false
+font.serif = true
+font.monospace = false
+font.bold = true
+bbox.left = 168.76663208007812
+bbox.top = 127.2930679321289
+bbox.right = 280.66656494140625
+bbox.bottom = 137.2556610107422
+bbox.tolerance = 1e-5
diff --git a/spec/filter_spec.py b/spec/filter_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac02003c36bccd9b573e487f0915fb7383017df4
--- /dev/null
+++ b/spec/filter_spec.py
@@ -0,0 +1,642 @@
+import os
+
+from mamba import description, it, before
+from pdftocgen.filter import (
+ ToCFilter,
+ admits_float,
+ FontFilter,
+ BoundingBoxFilter
+)
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+with description("admits_float") as self:
+ with it("admits if difference is below tol"):
+ assert admits_float(1, 1.05, 0.1)
+ assert admits_float(1, 0.95, 0.1)
+
+ with it("does not admit if difference is too large"):
+ assert not admits_float(1, 1.5, 0.1)
+ assert not admits_float(1, 0.5, 0.1)
+
+ with it("admits anything if expect is unset"):
+ assert admits_float(None, 1, 0.1)
+ assert admits_float(None, None, 0.1)
+
+ with it("does not admit if expect is set but actual is None"):
+ assert not admits_float(1, None, 0.1)
+
+with description("ToCFilter") as self:
+ with before.all:
+ self.title_exact = {
+ 'level': 1,
+ 'font': {
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ },
+ 'bbox': {
+ 'left': 157.98439025878906,
+ 'top': 567.3842163085938,
+ 'right': 245.18057250976562,
+ 'bottom': 581.7447509765625,
+ 'tolerance': 0
+ }
+ }
+
+ self.text_exact = {
+ 'level': 2,
+ 'font': {
+ 'name': "CMR10",
+ 'size': 9.962599754333496,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': False
+ },
+ 'bbox': {
+ 'left': 133.76800537109375,
+ 'top': 592.492919921875,
+ 'right': 477.537353515625,
+ 'bottom': 602.4555053710938,
+ 'tolerance': 0
+ }
+ }
+
+ self.spn_title = {
+ 'size': 14.346199989318848,
+ 'flags': 20,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0,
+ 'text': 'Section Two',
+ 'bbox': (157.98439025878906,
+ 567.3842163085938,
+ 245.18057250976562,
+ 581.7447509765625)
+ }
+
+ self.spn_text = {
+ 'size': 9.962599754333496,
+ 'flags': 4,
+ 'font': 'MJDLZY+CMR10',
+ 'color': 0,
+ 'text': 'text',
+ 'bbox': (133.76800537109375,
+ 592.492919921875,
+ 477.537353515625,
+ 602.4555053710938)
+ }
+
+    with it("raises error if no toc level is specified"):
+        try:
+            fltr = ToCFilter({})
+        except ValueError:
+            pass  # expected: a recipe entry without 'level' must be rejected
+        else:  # no exception at all is a failure too, not a silent pass
+            assert False, "must raise error"
+
+    with it("raises error if toc level is invalid"):
+        # check each level in its own try so the first ValueError can't mask the second
+        for level in (0, -1):
+            try:
+                fltr = ToCFilter({'level': level})
+            except ValueError:
+                continue  # expected: this level was rejected, check the next one
+            assert False, "must raise error"
+
+ with it("does not raise error if toc level is valid"):
+ try:
+ fltr = ToCFilter({'level': 1})
+ fltr = ToCFilter({'level': 2})
+ except ValueError:
+ assert False, "must not raise error"
+
+ with it("admits exact matches"):
+ filter_title = ToCFilter(self.title_exact)
+ filter_text = ToCFilter(self.text_exact)
+ assert filter_title.admits(self.spn_title)
+ assert filter_text.admits(self.spn_text)
+
+ with it("rejects unmatched spans"):
+ filter_title = ToCFilter(self.title_exact)
+ filter_text = ToCFilter(self.text_exact)
+ assert not filter_title.admits(self.spn_text)
+ assert not filter_text.admits(self.spn_title)
+
+ with it("admits correctly without bbox"):
+ filter_title = ToCFilter({
+ 'level': 1,
+ 'font': {
+ 'name': "CMBX12",
+ }
+ })
+ assert filter_title.admits(self.spn_title)
+
+ filter_text = ToCFilter({
+ 'level': 2,
+ 'font': {
+ 'size': 9.962599754333496,
+ }
+ })
+ assert filter_text.admits(self.spn_text)
+
+ with it("rejects correctly without bbox"):
+ filter_title = ToCFilter({
+ 'level': 1,
+ 'font': {
+ 'name': "CMBX12",
+ }
+ })
+ assert not filter_title.admits(self.spn_text)
+
+ filter_text = ToCFilter({
+ 'level': 2,
+ 'font': {
+ 'size': 9.962599754333496,
+ }
+ })
+ assert not filter_text.admits(self.spn_title)
+
+ with it("admits correctly without font"):
+ filter_title = ToCFilter({
+ 'level': 1,
+ 'bbox': {
+ 'left': 157.98439025878906,
+ }
+ })
+ assert filter_title.admits(self.spn_title)
+
+ filter_text = ToCFilter({
+ 'level': 2,
+ 'bbox': {
+ 'top': 592.492919921875,
+ }
+ })
+ assert filter_text.admits(self.spn_text)
+
+ with it("rejects correctly without font"):
+ filter_title = ToCFilter({
+ 'level': 1,
+ 'bbox': {
+ 'left': 157.98439025878906,
+ }
+ })
+ assert not filter_title.admits(self.spn_text)
+
+ filter_text = ToCFilter({
+ 'level': 2,
+ 'bbox': {
+ 'top': 592.492919921875,
+ }
+ })
+ assert not filter_text.admits(self.spn_title)
+
+
+with description("FontFilter") as self:
+ with before.all:
+ self.title_exact = {
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ }
+
+ self.text_exact = {
+ 'name': "CMR10",
+ 'size': 9.962599754333496,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': False
+ }
+
+ self.spn_title = {
+ 'size': 14.346199989318848,
+ 'flags': 20,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0,
+ 'text': 'Section Two',
+ 'bbox': (157.98439025878906,
+ 567.3842163085938,
+ 245.18057250976562,
+ 581.7447509765625)
+ }
+
+ self.spn_small_title = {
+ 'size': 9.962599754333496,
+ 'flags': 4,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0,
+ 'text': 'text',
+ 'bbox': (133.76800537109375,
+ 592.492919921875,
+ 477.537353515625,
+ 602.4555053710938)
+ }
+
+ self.spn_text = {
+ 'size': 9.962599754333496,
+ 'flags': 4,
+ 'font': 'MJDLZY+CMR10',
+ 'color': 0,
+ 'text': 'text',
+ 'bbox': (133.76800537109375,
+ 592.492919921875,
+ 477.537353515625,
+ 602.4555053710938)
+ }
+
+ with it("has a working constructor"):
+ fnt = FontFilter(self.title_exact)
+ assert fnt.name.search("TZOLRB+CMBX12")
+ assert fnt.name.search("CMBX12")
+ assert not fnt.name.search("CMBX10")
+ assert fnt.flags == 0b10100
+ assert fnt.ign_mask == 0b11111
+ assert fnt.color == 0x000000
+ assert fnt.size == 14.346199989318848
+ assert fnt.size_tolerance == 0
+
+ with it("can construct if empty dict is given in the constructor"):
+ fnt = FontFilter({})
+ assert fnt.name.search("anything")
+ assert fnt.flags == 0
+ assert fnt.ign_mask == 0
+ assert fnt.color is None
+ assert fnt.size is None
+ assert fnt.size_tolerance == 1e-5
+
+ with it("admits exact matches"):
+ fnt_title = FontFilter(self.title_exact)
+ fnt_text = FontFilter(self.text_exact)
+ assert fnt_title.admits(self.spn_title)
+ assert fnt_text.admits(self.spn_text)
+
+ with it("rejects unmatched spans"):
+ fnt_title = FontFilter(self.title_exact)
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ fnt_text = FontFilter(self.text_exact)
+ assert not fnt_text.admits(self.spn_title)
+ assert not fnt_text.admits(self.spn_small_title)
+
+ with it("admits correctly without font name"):
+ fnt_title = FontFilter({
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly without font name"):
+ fnt_title = FontFilter({
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly with only font name"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12"
+ })
+ assert fnt_title.admits(self.spn_title)
+ assert fnt_title.admits(self.spn_small_title)
+
+ with it("rejects correctly with only font name"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12"
+ })
+ assert not fnt_title.admits(self.spn_text)
+
+ with it("admits correctly without size"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly without size"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size_tolerance': 0,
+ 'color': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly with only size"):
+ fnt_title = FontFilter({
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly with only size"):
+ fnt_title = FontFilter({
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly without color"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly without color"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly with only color"):
+ fnt_title = FontFilter({
+ 'color': 0x000000,
+ })
+ assert fnt_title.admits(self.spn_title)
+ assert fnt_title.admits(self.spn_text)
+ assert fnt_title.admits(self.spn_small_title)
+
+ with it("rejects correctly with only color"):
+ fnt_title = FontFilter({
+ 'color': 0x000000,
+ })
+ spn_blue = {
+ 'size': 14.346199989318848,
+ 'flags': 20,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0x0000ff,
+ 'text': 'Section Two',
+ 'bbox': (157.98439025878906,
+ 567.3842163085938,
+ 245.18057250976562,
+ 581.7447509765625)
+ }
+ assert not fnt_title.admits(spn_blue)
+
+ with it("admits correctly with only flags"):
+ fnt_title = FontFilter({
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly with only flags"):
+ fnt_title = FontFilter({
+ 'superscript': False,
+ 'italic': False,
+ 'serif': True,
+ 'monospace': False,
+ 'bold': True
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly without flags"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ })
+ assert fnt_title.admits(self.spn_title)
+
+ with it("rejects correctly without flags"):
+ fnt_title = FontFilter({
+ 'name': "CMBX12",
+ 'size': 14.346199989318848,
+ 'size_tolerance': 0,
+ 'color': 0,
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+
+ with it("admits correctly with partial flags"):
+ fnt_title = FontFilter({
+ 'serif': True,
+ 'bold': True
+ })
+ fnt_serif = FontFilter({
+ 'serif': True
+ })
+ fnt_sans = FontFilter({
+ 'serif': False
+ })
+ fnt_mono = FontFilter({
+ 'monospace': True
+ })
+ assert fnt_title.admits(self.spn_title)
+ assert fnt_serif.admits(self.spn_title)
+ assert fnt_serif.admits(self.spn_text)
+ assert fnt_sans.admits({'flags': 0b11011})
+ assert fnt_mono.admits({'flags': 0b11111})
+
+ with it("rejects correctly with partial flags"):
+ fnt_title = FontFilter({
+ 'serif': True,
+ 'bold': True
+ })
+ fnt_serif = FontFilter({
+ 'serif': True
+ })
+ fnt_sans = FontFilter({
+ 'serif': False
+ })
+ fnt_mono = FontFilter({
+ 'monospace': True
+ })
+ assert not fnt_title.admits(self.spn_text)
+ assert not fnt_title.admits(self.spn_small_title)
+ assert not fnt_sans.admits(self.spn_title)
+ assert not fnt_sans.admits(self.spn_text)
+ assert not fnt_mono.admits(self.spn_title)
+ assert not fnt_mono.admits(self.spn_text)
+
+
+with description("BoundingBoxFilter") as self:
+ with before.all:
+ self.title_exact = {
+ 'left': 157.98439025878906,
+ 'top': 567.3842163085938,
+ 'right': 245.18057250976562,
+ 'bottom': 581.7447509765625,
+ 'tolerance': 0
+ }
+
+ self.text_exact = {
+ 'left': 133.76800537109375,
+ 'top': 592.492919921875,
+ 'right': 477.537353515625,
+ 'bottom': 602.4555053710938,
+ 'tolerance': 0
+ }
+
+ self.spn_title = {
+ 'size': 14.346199989318848,
+ 'flags': 20,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0,
+ 'text': 'Section Two',
+ 'bbox': (157.98439025878906,
+ 567.3842163085938,
+ 245.18057250976562,
+ 581.7447509765625)
+ }
+
+ self.spn_title2 = {
+ 'size': 14.346199989318848,
+ 'flags': 20,
+ 'font': 'TZOLRB+CMBX12',
+ 'color': 0,
+ 'text': 'Section One',
+ 'bbox': (157.98439025878906,
+ 335.569580078125,
+ 477.66058349609375,
+ 349.93011474609375)
+ }
+
+ self.spn_text = {
+ 'size': 9.962599754333496,
+ 'flags': 4,
+ 'font': 'MJDLZY+CMR10',
+ 'color': 0,
+ 'text': 'text',
+ 'bbox': (133.76800537109375,
+ 592.492919921875,
+ 477.537353515625,
+ 602.4555053710938)
+ }
+ with it("has a working constructor"):
+ bbox = BoundingBoxFilter(self.title_exact)
+ assert bbox.left is not None
+ assert bbox.right is not None
+ assert bbox.top is not None
+ assert bbox.bottom is not None
+ assert bbox.tolerance == 0
+
+ with it("can construct if empty dict is given in the constructor"):
+ bbox = BoundingBoxFilter({})
+ assert bbox.left is None
+ assert bbox.right is None
+ assert bbox.top is None
+ assert bbox.bottom is None
+ assert bbox.tolerance == 1e-5
+
+ with it("admits exact matches"):
+ bbox_title = BoundingBoxFilter(self.title_exact)
+ bbox_text = BoundingBoxFilter(self.text_exact)
+ assert bbox_title.admits(self.spn_title)
+ assert bbox_text.admits(self.spn_text)
+
+ with it("rejects unmatched spans"):
+ bbox_title = BoundingBoxFilter(self.title_exact)
+ assert not bbox_title.admits(self.spn_text)
+ assert not bbox_title.admits(self.spn_title2)
+
+ bbox_text = BoundingBoxFilter(self.text_exact)
+ assert not bbox_text.admits(self.spn_title)
+ assert not bbox_text.admits(self.spn_title2)
+
+ with it("admits correctly with partial bbox"):
+ bbox_title = BoundingBoxFilter({
+ 'left': 157.98439025878906
+ })
+ assert bbox_title.admits(self.spn_title)
+ assert bbox_title.admits(self.spn_title2)
+
+ bbox_top = BoundingBoxFilter({
+ 'top': 567.3842163085938
+ })
+ assert bbox_top.admits(self.spn_title)
+
+ bbox_right = BoundingBoxFilter({
+ 'right': 245.18057250976562
+ })
+ assert bbox_right.admits(self.spn_title)
+
+ bbox_bottom = BoundingBoxFilter({
+ 'bottom': 581.7447509765625
+ })
+ assert bbox_bottom.admits(self.spn_title)
+
+ with it("rejects correctly with partial bbox"):
+ bbox_title = BoundingBoxFilter({
+ 'left': 157.98439025878906
+ })
+ assert not bbox_title.admits(self.spn_text)
+
+ bbox_top = BoundingBoxFilter({
+ 'top': 567.3842163085938
+ })
+ assert not bbox_top.admits(self.spn_title2)
+
+ bbox_right = BoundingBoxFilter({
+ 'right': 245.18057250976562
+ })
+ assert not bbox_right.admits(self.spn_title2)
+
+ bbox_bottom = BoundingBoxFilter({
+ 'bottom': 581.7447509765625
+ })
+ assert not bbox_bottom.admits(self.spn_title2)
diff --git a/spec/fitzutils_spec.py b/spec/fitzutils_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fb271c74bacbb6cc3d062cebcdafd3fe6671d28
--- /dev/null
+++ b/spec/fitzutils_spec.py
@@ -0,0 +1,101 @@
+import os
+import io
+
+from mamba import description, it, before
+from fitzutils import (
+ open_pdf,
+ ToCEntry,
+ dump_toc
+)
+from pdftocio.tocparser import parse_toc
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+valid_file = os.path.join(dirpath, "files/level2.pdf")
+invalid_file = os.path.join(dirpath, "files/nothing.pdf")
+
+with description("open_pdf:") as self:
+ with it("opens pdf file for reading"):
+ with open_pdf(valid_file, False) as doc:
+ assert doc is not None
+ assert doc.page_count == 6
+
+ with it("returns None if pdf file is invalid"):
+ with open_pdf(invalid_file, False) as doc:
+ assert doc is None
+
+    with it("exits if pdf file is invalid and exit_on_error is true"):
+        # open_pdf reports the failure and calls sys.exit(1); catch only
+        # SystemExit so that any other exception still fails the spec
+        try:
+            with open_pdf(invalid_file, True) as doc:
+                assert False, "should have exited"
+        except SystemExit as err:
+            assert err.code == 1
+
+with description("ToCEntry") as self:
+ with it("matches fitz's representation"):
+ fitz_entry = [1, "title", 2]
+ fitz_entry2 = [1, "title", 2, 100.0]
+
+ toc_entry = ToCEntry(level=1, title="title", pagenum=2)
+ toc_entry2 = ToCEntry(level=1, title="title", pagenum=2, vpos=100.0)
+
+ assert toc_entry.to_fitz_entry() == fitz_entry
+ assert toc_entry2.to_fitz_entry() == fitz_entry2
+
+ assert ToCEntry(*fitz_entry) == toc_entry
+ assert ToCEntry(*fitz_entry2) == toc_entry2
+
+ with it("is sorted correctly"):
+ entries = [
+ ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0),
+ ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0),
+ ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0),
+ ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0),
+ ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
+ ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
+ ]
+
+ expected = [
+ ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
+ ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0),
+ ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0),
+ ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0),
+ ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0),
+ ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
+ ]
+ assert sorted(entries, key=ToCEntry.key) == expected
+
+
+with description("dump_toc") as self:
+ with before.all:
+ self.toc = [
+ ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
+ ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0),
+ ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0),
+ ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0),
+ ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0),
+ ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
+ ]
+
+ self.toc_novpos = [
+ ToCEntry(level=1, title="title1", pagenum=1),
+ ToCEntry(level=2, title="title2", pagenum=1),
+ ToCEntry(level=3, title="title3", pagenum=2),
+ ToCEntry(level=2, title="title4", pagenum=2),
+ ToCEntry(level=2, title="title5", pagenum=3),
+ ToCEntry(level=1, title="title6", pagenum=5)
+ ]
+
+    with it("won't print vpos if vpos is False"):
+        toc_s = dump_toc(self.toc, False)
+        parsed = parse_toc(io.StringIO(toc_s))  # parse once; re-reading the same stream would resume after the first parse
+        assert parsed == self.toc_novpos
+        assert parsed != self.toc
+
+    with it("won't print vpos if vpos is missing"):
+        toc_s = dump_toc(self.toc_novpos, True)
+        parsed = parse_toc(io.StringIO(toc_s))  # parse once; re-reading the same stream would resume after the first parse
+        assert parsed == self.toc_novpos
+        assert parsed != self.toc
diff --git a/spec/parser_spec.py b/spec/parser_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..548998adef037079f0b0a3044725985d00482be7
--- /dev/null
+++ b/spec/parser_spec.py
@@ -0,0 +1,65 @@
+import os
+import io
+
+from mamba import description, it, before
+from fitzutils import (
+ dump_toc,
+ ToCEntry
+)
+from pdftocio.tocparser import parse_toc
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+valid_file = os.path.join(dirpath, "files/level2.pdf")
+invalid_file = os.path.join(dirpath, "files/nothing.pdf")
+
+with description("parse_toc") as self:
+ with before.all:
+ self.toc = [
+ ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
+ ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0),
+ ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0),
+ ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0),
+ ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0),
+ ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
+ ]
+
+ self.toc_novpos = [
+ ToCEntry(level=1, title="title1", pagenum=1),
+ ToCEntry(level=2, title="title2", pagenum=1),
+ ToCEntry(level=3, title="title3", pagenum=2),
+ ToCEntry(level=2, title="title4", pagenum=2),
+ ToCEntry(level=2, title="title5", pagenum=3),
+ ToCEntry(level=1, title="title6", pagenum=5)
+ ]
+
+
+    with it("can recover the result from dump_toc"):
+        toc_s = dump_toc(self.toc, True)
+        parsed = parse_toc(io.StringIO(toc_s))  # parse once; re-reading the same stream would resume after the first parse
+        assert parsed == self.toc
+        assert parsed != self.toc_novpos
+
+        toc_s = dump_toc(self.toc_novpos, False)
+        parsed = parse_toc(io.StringIO(toc_s))  # round trip without vpos
+        assert parsed == self.toc_novpos
+        assert parsed != self.toc
+
+ with it("escapes quotations correctly"):
+ quoted = '"a ""quoted"" title" 2\n "a single \'quoted\' title" 4'
+ expect = [
+ ToCEntry(level=1, title='a "quoted" title', pagenum=2),
+ ToCEntry(level=2, title="a single 'quoted' title", pagenum=4)
+ ]
+ f = io.StringIO(quoted)
+ assert parse_toc(f) == expect
+
+ with it("raises error when toc entry is invalid"):
+ malformed = '"entry" 1\n "error entry"'
+ f = io.StringIO(malformed)
+ try:
+ parse_toc(f)
+ except IndexError:
+ pass
+ else:
+ assert False, "must raise error"
diff --git a/spec/tocgen_spec.py b/spec/tocgen_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..aade8b67f47d66d0bf3913e473adca9a1702ac8c
--- /dev/null
+++ b/spec/tocgen_spec.py
@@ -0,0 +1,159 @@
+import os
+import fitz
+import toml
+
+from mamba import description, it, before
+from fitzutils import ToCEntry
+from pdftocgen.tocgen import gen_toc
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+with description("gen_toc") as self:
+ with before.all:
+ self.level2 = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
+ self.level2_recipe = toml.load(
+ open(os.path.join(dirpath, "files/level2_recipe.toml"))
+ )
+ self.level2_expect = [
+ ToCEntry(level=1, title='1 Section One',
+ pagenum=1, vpos=237.6484375),
+ ToCEntry(level=1, title='2 Section Two',
+ pagenum=1, vpos=567.3842163085938),
+ ToCEntry(level=2, title='2.1 Subsection Two.One',
+ pagenum=2, vpos=452.56671142578125),
+ ToCEntry(level=1,
+ title='3 Section Three, with looong loooong looong ti- tle',
+ pagenum=3, vpos=335.569580078125),
+ ToCEntry(level=2, title='3.1 Subsection Three.One, '
+ 'with even loooooooooooonger title, and probably even more',
+ pagenum=3, vpos=619.4886474609375),
+ ToCEntry(level=2, title='3.2 Subsection Three.Two',
+ pagenum=4, vpos=512.3426513671875),
+ ToCEntry(level=2, title='3.3 Subsection Three.Three',
+ pagenum=5, vpos=125.79861450195312),
+ ToCEntry(level=1, title='4 The End',
+ pagenum=5, vpos=366.62347412109375)
+ ]
+
+ self.onepage = fitz.open(os.path.join(dirpath, "files/onepage.pdf"))
+ self.onepage_recipe = toml.load(
+ open(os.path.join(dirpath, "files/onepage_recipe.toml"))
+ )
+ self.onepage_greedy = toml.load(
+ open(os.path.join(dirpath, "files/onepage_greedy.toml"))
+ )
+ self.onepage_expect = [
+ # false positive, but easy to remove in post-processing
+ ToCEntry(level=2, title='krasjet',
+ pagenum=1, vpos=196.53366088867188),
+ ToCEntry(level=1, title='1 Section One',
+ pagenum=1, vpos=237.6484375),
+ ToCEntry(level=1, title='2 Section Two',
+ pagenum=1, vpos=265.44744873046875),
+ ToCEntry(level=2, title='2.1 Subsection Two.One',
+ pagenum=1, vpos=291.0536804199219),
+ ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2',
+ pagenum=1, vpos=311.1368103027344),
+ ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle',
+ pagenum=1, vpos=334.00946044921875),
+ ToCEntry(level=2, title='3.1 Subsection Three.One, '
+ 'with even loooooooooooonger title, and probably even more',
+ pagenum=1, vpos=377.5487060546875),
+ ToCEntry(level=2, title='3.2 Subsection Three.Two',
+ pagenum=1, vpos=411.8786926269531),
+ ToCEntry(level=2, title='3.3 Subsection Three.Three',
+ pagenum=1, vpos=432.26068115234375),
+ ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One',
+ pagenum=1, vpos=452.1441345214844),
+ ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two',
+ pagenum=1, vpos=470.53314208984375),
+ ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three',
+ pagenum=1, vpos=488.9231262207031),
+ ToCEntry(level=2, title='3.4 Subsection Three.Four',
+ pagenum=1, vpos=507.8106994628906),
+ ToCEntry(level=2, title='3.5 Subsection Three.Five',
+ pagenum=1, vpos=528.191650390625),
+ ToCEntry(level=1, title='4 The End',
+ pagenum=1, vpos=550.7654418945312)
+ ]
+
+ self.onepage_greedy_expect = [
+ # hooray, no more false positives
+ ToCEntry(level=1, title='1 Section One',
+ pagenum=1, vpos=237.6484375),
+ ToCEntry(level=1, title='2 Section Two',
+ pagenum=1, vpos=265.44744873046875),
+ ToCEntry(level=2, title='2.1 Subsection Two.One',
+ pagenum=1, vpos=291.0536804199219),
+ ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2',
+ pagenum=1, vpos=311.1368103027344),
+ ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle',
+ pagenum=1, vpos=334.00946044921875),
+ ToCEntry(level=2, title='3.1 Subsection Three.One, '
+ 'with even loooooooooooonger title, and probably even more',
+ pagenum=1, vpos=377.5487060546875),
+ ToCEntry(level=2, title='3.2 Subsection Three.Two',
+ pagenum=1, vpos=411.8786926269531),
+ ToCEntry(level=2, title='3.3 Subsection Three.Three',
+ pagenum=1, vpos=432.26068115234375),
+ ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One',
+ pagenum=1, vpos=452.1441345214844),
+ ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two',
+ pagenum=1, vpos=470.53314208984375),
+ ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three',
+ pagenum=1, vpos=488.9231262207031),
+ ToCEntry(level=2, title='3.4 Subsection Three.Four',
+ pagenum=1, vpos=507.8106994628906),
+ ToCEntry(level=2, title='3.5 Subsection Three.Five',
+ pagenum=1, vpos=528.191650390625),
+ ToCEntry(level=1, title='4 The End',
+ pagenum=1, vpos=550.7654418945312)
+ ]
+
+ self.hardmode = fitz.open(os.path.join(dirpath, "files/hardmode.pdf"))
+ self.hardmode_recipe = toml.load(
+ open(os.path.join(dirpath, "files/hardmode_recipe.toml"))
+ )
+
+ self.hardmode_expect = [
+ ToCEntry(level=1, title='1 Section One',
+ pagenum=1, vpos=174.1232452392578),
+ ToCEntry(level=1, title='2 Section 1 + 1 = 2',
+ pagenum=1, vpos=584.5831909179688),
+ ToCEntry(level=2, title='2.1 Subsection Two.One',
+ pagenum=1, vpos=425.2061462402344),
+ ToCEntry(level=1, title='e ln(3)',
+ pagenum=2, vpos=516.01708984375),
+ ToCEntry(level=2, title='3.1 Subsection e ln(3) .1, '
+ 'with looo- ooooooooong title',
+ pagenum=2, vpos=302.5021057128906),
+ ToCEntry(level=2, title='3.2 S ubsection Three.Two, another long title',
+ pagenum=3, vpos=396.212158203125),
+ ToCEntry(level=2, title='3.3 Subsection Three.Three',
+ pagenum=3, vpos=68.84815979003906),
+ ToCEntry(level=1, title='4 The x → ∞ End',
+ pagenum=3, vpos=483.49920654296875)
+ ]
+
+ with it("generates 2-level toc correctly"):
+ assert gen_toc(self.level2, self.level2_recipe) == self.level2_expect
+
+ with it("handles headings on same page correctly"):
+ assert gen_toc(
+ self.onepage, self.onepage_recipe
+ ) == self.onepage_expect
+
+ with it("handles math in heading correctly"):
+ assert gen_toc(
+ self.onepage, self.onepage_recipe
+ ) == self.onepage_expect
+
+ with it("handles greedy filter correctly"):
+ assert gen_toc(
+ self.onepage, self.onepage_greedy
+ ) == self.onepage_greedy_expect
+
+ with it("passes the HARD MODE"):
+ assert gen_toc(
+ self.hardmode, self.hardmode_recipe
+ ) == self.hardmode_expect
diff --git a/spec/tocio_spec.py b/spec/tocio_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b4fff163c0a3e8f49f3832ef1c2e588d57b5120
--- /dev/null
+++ b/spec/tocio_spec.py
@@ -0,0 +1,81 @@
+import os
+import fitz
+
+from mamba import description, it, before
+from fitzutils import ToCEntry
+from pdftocio.tocio import read_toc, write_toc
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+level2 = os.path.join(dirpath, "files/level2.pdf")
+hastoc = os.path.join(dirpath, "files/hastoc.pdf")
+
+with description("read_toc") as self:
+ with before.all:
+ self.doc = fitz.open(level2)
+ self.reference = fitz.open(hastoc)
+ self.expect = [
+ ToCEntry(level=1, title='Section One', pagenum=1, vpos=234.65998),
+ ToCEntry(level=1, title='Section Two', pagenum=1, vpos=562.148),
+ ToCEntry(level=2, title='Subsection Two.One', pagenum=2, vpos=449.522),
+ ToCEntry(level=1,
+ title='Section Three, with looong loooong looong title',
+ pagenum=3,
+ vpos=330.333),
+ ToCEntry(level=2,
+ title='Subsection Three.One, '
+ 'with even loooooooooooonger title, and probably even more',
+ pagenum=3,
+ vpos=616.444),
+ ToCEntry(level=2, title='Subsection Three.Two',
+ pagenum=4, vpos=509.298),
+ ToCEntry(level=2, title='Subsection Three.Three',
+ pagenum=5, vpos=124.802),
+ ToCEntry(level=1, title='The End', pagenum=5, vpos=361.387)
+ ]
+
+ with it("reads pdf toc correctly"):
+ assert self.expect == read_toc(self.reference)
+
+ with it("makes (read_toc -> write_toc -> read_toc) an identity operation (except vpos)"):
+ toc = read_toc(self.reference)
+ write_toc(self.doc, toc)
+ toc2 = read_toc(self.doc)
+
+ assert len(toc2) == len(toc)
+ for e1, e2 in zip(toc, toc2):
+ assert e1.level == e2.level
+ assert e1.title == e2.title
+ assert e1.pagenum == e2.pagenum
+
+with description("write_toc") as self:
+ with before.all:
+ self.doc = fitz.open(level2)
+ self.reference = fitz.open(hastoc)
+ self.toc = [
+ ToCEntry(level=1, title='Section One', pagenum=1),
+ ToCEntry(level=1, title='Section Two', pagenum=1),
+ ToCEntry(level=2, title='Subsection Two.One', pagenum=2),
+ ToCEntry(level=1,
+ title='Section Three, with looong loooong looong title',
+ pagenum=3),
+ ToCEntry(level=2,
+ title='Subsection Three.One, '
+ 'with even loooooooooooonger title, and probably even more',
+ pagenum=3),
+ ToCEntry(level=2, title='Subsection Three.Two',
+ pagenum=4),
+ ToCEntry(level=2, title='Subsection Three.Three',
+ pagenum=5),
+ ToCEntry(level=1, title='The End', pagenum=5)
+ ]
+
+ with it("makes (write_toc -> read_toc) an identity operation (except vpos)"):
+ write_toc(self.doc, self.toc)
+ toc2 = read_toc(self.doc)
+
+ assert len(toc2) == len(self.toc)
+ for e1, e2 in zip(self.toc, toc2):
+ assert e1.level == e2.level
+ assert e1.title == e2.title
+ assert e1.pagenum == e2.pagenum
diff --git a/spec/xmeta_spec.py b/spec/xmeta_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1db81351c40a86ae8c5060dbf47177cd96d6f97
--- /dev/null
+++ b/spec/xmeta_spec.py
@@ -0,0 +1,188 @@
+import os
+import fitz
+import toml
+
+from mamba import description, it, before
+from pdfxmeta import extract_meta, dump_meta, dump_toml
+
+dirpath = os.path.dirname(os.path.abspath(__file__))
+
+with description("extract_meta:") as self:
+    with before.all:
+        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))  # fixture shared by every example
+
+    with it("extracts metadata from pdf"):
+        hits = extract_meta(self.doc, "Section One", 1)  # needle, then page number
+        assert len(hits) == 1
+
+        hit = hits[0]
+        assert hit['text'] == "Section One"
+        assert 'font' in hit
+        assert 'CMBX12' in hit['font']  # substring check, so a prefixed font name also passes
+
+    with it("matches lowercase when ignore case is set"):
+        hits = extract_meta(self.doc, "section one", 1, True)  # 4th positional arg: ignore-case flag
+        assert len(hits) == 1
+
+        hit = hits[0]
+        assert hit['text'] == "Section One"  # reported text keeps the document's casing
+        assert 'font' in hit
+        assert 'CMBX12' in hit['font']
+
+    with it("matches mixed case when ignore case is set"):
+        hits = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
+        assert len(hits) == 1
+
+        hit = hits[0]
+        assert hit['text'] == "Section One"
+        assert 'font' in hit
+        assert 'CMBX12' in hit['font']
+
+    with it("matches nothing if ignore case is not set"):
+        hits = extract_meta(self.doc, "section one", 1, False)
+        assert len(hits) == 0
+
+    with it("can match multiple instances of needle"):
+        hits = extract_meta(self.doc, "Section", 1)
+        assert len(hits) == 2
+
+        first = hits[0]
+        assert first['text'] == "Section One"
+        assert 'font' in first
+        assert 'CMBX12' in first['font']
+
+        second = hits[1]
+        assert second['text'] == "Section Two"
+        assert 'font' in second
+        assert 'CMBX12' in second['font']
+
+    with it("returns [] when nothing is matched"):
+        hits = extract_meta(self.doc, "Sectoin", 1, False)  # misspelled on purpose
+        assert len(hits) == 0
+
+    with it("returns [] when page number is out of range"):
+        hits = extract_meta(self.doc, "Section One", 0)  # below the page 1 used above
+        assert len(hits) == 0
+
+        hits = extract_meta(self.doc, "Section One", 7)  # presumably past the fixture's last page
+        assert len(hits) == 0
+
+    with it("can match text on any page when page number is not specified"):
+        hits = extract_meta(self.doc, "The End")  # page argument omitted
+        assert len(hits) == 1
+
+        hit = hits[0]
+        assert hit['text'] == "The End"
+        assert 'font' in hit
+        assert 'CMBX12' in hit['font']
+
+with description("dump_meta:") as self:
+    with before.all:
+        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
+        self.expected_meta = {  # what the dumped TOML must parse back into
+            'font': {
+                'name': 'CMBX12',
+                'size': 14.346199989318848,
+                'color': 0x000000,
+                'superscript': False,
+                'italic': False,
+                'serif': True,
+                'monospace': False,
+                'bold': True
+            },
+            'bbox': {
+                'left': 157.98439025878906,
+                'top': 237.6484375,
+                'right': 243.12905883789062,
+                'bottom': 252.00897216796875
+            }
+        }
+
+    with it("produces valid toml"):
+        hits = extract_meta(self.doc, "Section One", 1)
+        assert len(hits) == 1
+
+        parsed = toml.loads(dump_meta(hits[0]))  # toml.loads raises if the dump is not valid TOML
+        assert parsed == self.expected_meta
+
+
+with description("dump_toml:") as self:
+    with before.all:
+        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
+        self.expected_recipe = {  # what the dumped TOML must parse back into
+            'heading': [
+                {
+                    'level': 1,
+                    'greedy': True,
+                    'font': {
+                        'name': 'CMBX12',
+                        'size': 14.346199989318848,
+                    }
+                }
+            ]
+        }
+
+    with it("produces valid toml"):
+        hits = extract_meta(self.doc, "Section One", 1)
+        assert len(hits) == 1
+
+        parsed = toml.loads(dump_toml(hits[0], 1))  # second argument matches the expected 'level': 1
+        assert parsed == self.expected_recipe
+
+    with it("strips font subset correctly"):
+        subset_span = {
+            'font': "subset+font",  # the "subset+" prefix must not reach the recipe
+            'size': 1,
+            'flags': 20,
+            'color': 0,
+            'bbox': (1, 2, 3, 4),
+            'text': ""
+        }
+
+        plain_span = {
+            'font': "font",  # no prefix: the name must pass through unchanged
+            'size': 1,
+            'flags': 20,
+            'color': 0,
+            'bbox': (1, 2, 3, 4),
+            'text': ""
+        }
+
+        multi_plus_span = {
+            'font': "subset+font+font",  # only the text up to the first '+' may be dropped
+            'size': 1,
+            'flags': 20,
+            'color': 0,
+            'bbox': (1, 2, 3, 4),
+            'text': ""
+        }
+
+        single_recipe = {
+            'heading': [
+                {
+                    'level': 1,
+                    'greedy': True,
+                    'font': {
+                        'name': 'font',
+                        'size': 1
+                    }
+                }
+            ]
+        }
+
+        multi_plus_recipe = {
+            'heading': [
+                {
+                    'level': 1,
+                    'greedy': True,
+                    'font': {
+                        'name': 'font+font',
+                        'size': 1
+                    }
+                }
+            ]
+        }
+
+        assert toml.loads(dump_toml(subset_span, 1)) == single_recipe
+        assert toml.loads(dump_toml(plain_span, 1)) == single_recipe
+        assert toml.loads(dump_toml(multi_plus_span, 1)) == multi_plus_recipe
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/utils/find_by_font.py b/utils/find_by_font.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1a411f3f315203e1fed8f3184e7987942162aea
--- /dev/null
+++ b/utils/find_by_font.py
@@ -0,0 +1,41 @@
+import sys
+import fitz # PyMuPDF
+import math
+
+def main():
+ if len(sys.argv) < 3:
+ print("Usage: python find_by_font.py