pdf.tocgen.split / spec /tocgen_spec.py
adelevett's picture
Upload 76 files
046e3b8 verified
raw
history blame
7.94 kB
import os
import fitz
import toml
from mamba import description, it, before
from fitzutils import ToCEntry
from pdftocgen.tocgen import gen_toc
# Directory containing this spec file; fixture paths are resolved against it
# so the spec does not depend on the current working directory.
dirpath = os.path.split(os.path.abspath(__file__))[0]
def _fixture_path(name):
    """Return the path of fixture *name* inside the ``files`` directory."""
    return os.path.join(dirpath, "files", name)


def _load_recipe(name):
    """Parse the TOML recipe fixture *name* and return the parsed result.

    The file is opened in a ``with`` block so its handle is closed as soon
    as parsing finishes, instead of being left open until garbage
    collection.
    """
    # TOML documents are UTF-8 by specification, so do not rely on the
    # platform's default encoding.
    with open(_fixture_path(name), encoding="utf-8") as recipe_file:
        return toml.load(recipe_file)


with description("gen_toc") as self:
    with before.all:
        # --- two-level document spread over several pages ---
        self.level2 = fitz.open(_fixture_path("level2.pdf"))
        self.level2_recipe = _load_recipe("level2_recipe.toml")
        self.level2_expect = [
            ToCEntry(level=1, title='1 Section One',
                     pagenum=1, vpos=237.6484375),
            ToCEntry(level=1, title='2 Section Two',
                     pagenum=1, vpos=567.3842163085938),
            ToCEntry(level=2, title='2.1 Subsection Two.One',
                     pagenum=2, vpos=452.56671142578125),
            ToCEntry(level=1,
                     title='3 Section Three, with looong loooong looong ti- tle',
                     pagenum=3, vpos=335.569580078125),
            ToCEntry(level=2, title='3.1 Subsection Three.One, '
                     'with even loooooooooooonger title, and probably even more',
                     pagenum=3, vpos=619.4886474609375),
            ToCEntry(level=2, title='3.2 Subsection Three.Two',
                     pagenum=4, vpos=512.3426513671875),
            ToCEntry(level=2, title='3.3 Subsection Three.Three',
                     pagenum=5, vpos=125.79861450195312),
            ToCEntry(level=1, title='4 The End',
                     pagenum=5, vpos=366.62347412109375),
        ]

        # --- every heading on a single page, two different recipes ---
        self.onepage = fitz.open(_fixture_path("onepage.pdf"))
        self.onepage_recipe = _load_recipe("onepage_recipe.toml")
        self.onepage_greedy = _load_recipe("onepage_greedy.toml")
        self.onepage_greedy_expect = [
            # hooray, no more false positives
            ToCEntry(level=1, title='1 Section One',
                     pagenum=1, vpos=237.6484375),
            ToCEntry(level=1, title='2 Section Two',
                     pagenum=1, vpos=265.44744873046875),
            ToCEntry(level=2, title='2.1 Subsection Two.One',
                     pagenum=1, vpos=291.0536804199219),
            ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2',
                     pagenum=1, vpos=311.1368103027344),
            ToCEntry(level=1,
                     title='3 Section Three, with looong loooong looong ti- tle',
                     pagenum=1, vpos=334.00946044921875),
            ToCEntry(level=2, title='3.1 Subsection Three.One, '
                     'with even loooooooooooonger title, and probably even more',
                     pagenum=1, vpos=377.5487060546875),
            ToCEntry(level=2, title='3.2 Subsection Three.Two',
                     pagenum=1, vpos=411.8786926269531),
            ToCEntry(level=2, title='3.3 Subsection Three.Three',
                     pagenum=1, vpos=432.26068115234375),
            ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One',
                     pagenum=1, vpos=452.1441345214844),
            ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two',
                     pagenum=1, vpos=470.53314208984375),
            ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three',
                     pagenum=1, vpos=488.9231262207031),
            ToCEntry(level=2, title='3.4 Subsection Three.Four',
                     pagenum=1, vpos=507.8106994628906),
            ToCEntry(level=2, title='3.5 Subsection Three.Five',
                     pagenum=1, vpos=528.191650390625),
            ToCEntry(level=1, title='4 The End',
                     pagenum=1, vpos=550.7654418945312),
        ]
        # The non-greedy recipe is expected to yield the same entries as
        # the greedy one, preceded by a single extra entry.
        self.onepage_expect = [
            # false positive, but easy to remove in post-processing
            ToCEntry(level=2, title='krasjet',
                     pagenum=1, vpos=196.53366088867188),
        ] + self.onepage_greedy_expect

        # --- "hard mode" document ---
        self.hardmode = fitz.open(_fixture_path("hardmode.pdf"))
        self.hardmode_recipe = _load_recipe("hardmode_recipe.toml")
        self.hardmode_expect = [
            ToCEntry(level=1, title='1 Section One',
                     pagenum=1, vpos=174.1232452392578),
            ToCEntry(level=1, title='2 Section 1 + 1 = 2',
                     pagenum=1, vpos=584.5831909179688),
            ToCEntry(level=2, title='2.1 Subsection Two.One',
                     pagenum=1, vpos=425.2061462402344),
            ToCEntry(level=1, title='e ln(3)',
                     pagenum=2, vpos=516.01708984375),
            ToCEntry(level=2, title='3.1 Subsection e ln(3) .1, '
                     'with looo- ooooooooong title',
                     pagenum=2, vpos=302.5021057128906),
            ToCEntry(level=2,
                     title='3.2 S ubsection Three.Two, another long title',
                     pagenum=3, vpos=396.212158203125),
            ToCEntry(level=2, title='3.3 Subsection Three.Three',
                     pagenum=3, vpos=68.84815979003906),
            ToCEntry(level=1, title='4 The x → ∞ End',
                     pagenum=3, vpos=483.49920654296875),
        ]

    with it("generates 2-level toc correctly"):
        assert gen_toc(self.level2, self.level2_recipe) == self.level2_expect

    with it("handles headings on same page correctly"):
        assert gen_toc(
            self.onepage, self.onepage_recipe
        ) == self.onepage_expect

    # NOTE(review): this example runs the same document/recipe pair as the
    # previous one. The onepage expectations do contain a heading with a
    # multiplication sign ('\xd7'), which is presumably the math case meant
    # here -- confirm whether a dedicated fixture was intended.
    with it("handles math in heading correctly"):
        assert gen_toc(
            self.onepage, self.onepage_recipe
        ) == self.onepage_expect

    with it("handles greedy filter correctly"):
        assert gen_toc(
            self.onepage, self.onepage_greedy
        ) == self.onepage_greedy_expect

    with it("passes the HARD MODE"):
        assert gen_toc(
            self.hardmode, self.hardmode_recipe
        ) == self.hardmode_expect