# pdf.tocgen.split / spec /filter_spec.py
# adelevett's picture
# Upload 76 files
# 046e3b8 verified
import os
from mamba import description, it, before
from pdftocgen.filter import (
ToCFilter,
admits_float,
FontFilter,
BoundingBoxFilter
)
# Absolute path of the directory that holds this spec file.
dirpath = os.path.split(os.path.abspath(__file__))[0]
# Specs for admits_float. Going by the spec titles below, the call shape is
# admits_float(expect, actual, tol).
with description("admits_float") as self:
    with it("admits if difference is below tol"):
        # Both candidates lie within 0.1 of the expected value 1.
        for actual in (1.05, 0.95):
            assert admits_float(1, actual, 0.1)

    with it("does not admit if difference is too large"):
        # Both candidates are 0.5 away from 1, well outside the 0.1 tolerance.
        for actual in (1.5, 0.5):
            assert not admits_float(1, actual, 0.1)

    with it("admits anything if expect is unset"):
        # With no expectation, a number and None are both accepted.
        for actual in (1, None):
            assert admits_float(None, actual, 0.1)

    with it("does not admit if expect is set but actual is None"):
        assert not admits_float(1, None, 0.1)
# Specs for ToCFilter: validation of the 'level' key by the constructor, and
# span matching through admits() with font criteria, bbox criteria, or both.
with description("ToCFilter") as self:
    with before.all:
        # Filter specs that pin every font and bbox field with zero tolerance.
        self.title_exact = {
            'level': 1,
            'font': {
                'name': "CMBX12",
                'size': 14.346199989318848,
                'size_tolerance': 0,
                'color': 0,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 567.3842163085938,
                'right': 245.18057250976562,
                'bottom': 581.7447509765625,
                'tolerance': 0
            }
        }
        self.text_exact = {
            'level': 2,
            'font': {
                'name': "CMR10",
                'size': 9.962599754333496,
                'size_tolerance': 0,
                'color': 0,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': False
            },
            'bbox': {
                'left': 133.76800537109375,
                'top': 592.492919921875,
                'right': 477.537353515625,
                'bottom': 602.4555053710938,
                'tolerance': 0
            }
        }

        # Span dicts whose values line up exactly with the filter specs above.
        self.spn_title = {
            'size': 14.346199989318848,
            'flags': 20,
            'font': 'TZOLRB+CMBX12',
            'color': 0,
            'text': 'Section Two',
            'bbox': (157.98439025878906,
                     567.3842163085938,
                     245.18057250976562,
                     581.7447509765625)
        }
        self.spn_text = {
            'size': 9.962599754333496,
            'flags': 4,
            'font': 'MJDLZY+CMR10',
            'color': 0,
            'text': 'text',
            'bbox': (133.76800537109375,
                     592.492919921875,
                     477.537353515625,
                     602.4555053710938)
        }

    with it("raises error if no toc level is specified"):
        try:
            ToCFilter({})
        except ValueError:
            pass
        else:
            # Without this branch the spec would pass silently when the
            # constructor accepts a dict that has no 'level'.
            assert False, "must raise error"
        # Any other exception type propagates and fails the spec with its
        # own traceback.

    with it("raises error if toc level is invalid"):
        # Each invalid level gets its own try block: sharing one would skip
        # the second constructor call as soon as the first one raises.
        for level in (0, -1):
            try:
                ToCFilter({'level': level})
            except ValueError:
                pass
            else:
                assert False, "must raise error"

    with it("does not raise error if toc level is valid"):
        try:
            ToCFilter({'level': 1})
            ToCFilter({'level': 2})
        except ValueError:
            assert False, "must not raise error"

    with it("admits exact matches"):
        filter_title = ToCFilter(self.title_exact)
        filter_text = ToCFilter(self.text_exact)
        assert filter_title.admits(self.spn_title)
        assert filter_text.admits(self.spn_text)

    with it("rejects unmatched spans"):
        filter_title = ToCFilter(self.title_exact)
        filter_text = ToCFilter(self.text_exact)
        assert not filter_title.admits(self.spn_text)
        assert not filter_text.admits(self.spn_title)

    with it("admits correctly without bbox"):
        # Only a font name is constrained here.
        filter_title = ToCFilter({
            'level': 1,
            'font': {
                'name': "CMBX12",
            }
        })
        assert filter_title.admits(self.spn_title)

        # Only a font size is constrained here.
        filter_text = ToCFilter({
            'level': 2,
            'font': {
                'size': 9.962599754333496,
            }
        })
        assert filter_text.admits(self.spn_text)

    with it("rejects correctly without bbox"):
        filter_title = ToCFilter({
            'level': 1,
            'font': {
                'name': "CMBX12",
            }
        })
        assert not filter_title.admits(self.spn_text)

        filter_text = ToCFilter({
            'level': 2,
            'font': {
                'size': 9.962599754333496,
            }
        })
        assert not filter_text.admits(self.spn_title)

    with it("admits correctly without font"):
        # Only the left edge is constrained here.
        filter_title = ToCFilter({
            'level': 1,
            'bbox': {
                'left': 157.98439025878906,
            }
        })
        assert filter_title.admits(self.spn_title)

        # Only the top edge is constrained here.
        filter_text = ToCFilter({
            'level': 2,
            'bbox': {
                'top': 592.492919921875,
            }
        })
        assert filter_text.admits(self.spn_text)

    with it("rejects correctly without font"):
        filter_title = ToCFilter({
            'level': 1,
            'bbox': {
                'left': 157.98439025878906,
            }
        })
        assert not filter_title.admits(self.spn_text)

        filter_text = ToCFilter({
            'level': 2,
            'bbox': {
                'top': 592.492919921875,
            }
        })
        assert not filter_text.admits(self.spn_title)
# Specs for FontFilter: constructor attributes and admits() behaviour when
# the filter spec carries all, some, or only one kind of font criterion.
with description("FontFilter") as self:
    with before.all:
        # Bounding boxes of the two sample spans.
        title_bbox = (157.98439025878906,
                      567.3842163085938,
                      245.18057250976562,
                      581.7447509765625)
        text_bbox = (133.76800537109375,
                     592.492919921875,
                     477.537353515625,
                     602.4555053710938)

        # Fully specified filter specs with zero size tolerance.
        self.title_exact = {
            'name': "CMBX12",
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        }
        self.text_exact = {
            'name': "CMR10",
            'size': 9.962599754333496,
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': False
        }

        # Sample spans. flags 20 == 0b10100 is the value the constructor spec
        # below expects for title_exact.
        self.spn_title = {
            'size': 14.346199989318848,
            'flags': 20,
            'font': 'TZOLRB+CMBX12',
            'color': 0,
            'text': 'Section Two',
            'bbox': title_bbox
        }
        self.spn_text = {
            'size': 9.962599754333496,
            'flags': 4,
            'font': 'MJDLZY+CMR10',
            'color': 0,
            'text': 'text',
            'bbox': text_bbox
        }
        # Same as the body-text span except that it uses the title font.
        self.spn_small_title = {**self.spn_text, 'font': 'TZOLRB+CMBX12'}

    with it("has a working constructor"):
        title_filter = FontFilter(self.title_exact)
        # The name is searched as a pattern, so a prefixed font name matches.
        for font_name in ("TZOLRB+CMBX12", "CMBX12"):
            assert title_filter.name.search(font_name)
        assert not title_filter.name.search("CMBX10")
        assert title_filter.flags == 0b10100
        assert title_filter.ign_mask == 0b11111
        assert title_filter.color == 0x000000
        assert title_filter.size == 14.346199989318848
        assert title_filter.size_tolerance == 0

    with it("can construct if empty dict is given in the constructor"):
        empty_filter = FontFilter({})
        assert empty_filter.name.search("anything")
        assert empty_filter.flags == 0
        assert empty_filter.ign_mask == 0
        assert empty_filter.color is None
        assert empty_filter.size is None
        assert empty_filter.size_tolerance == 1e-5

    with it("admits exact matches"):
        title_filter = FontFilter(self.title_exact)
        text_filter = FontFilter(self.text_exact)
        assert title_filter.admits(self.spn_title)
        assert text_filter.admits(self.spn_text)

    with it("rejects unmatched spans"):
        title_filter = FontFilter(self.title_exact)
        for spn in (self.spn_text, self.spn_small_title):
            assert not title_filter.admits(spn)

        text_filter = FontFilter(self.text_exact)
        for spn in (self.spn_title, self.spn_small_title):
            assert not text_filter.admits(spn)

    with it("admits correctly without font name"):
        nameless = FontFilter({
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        assert nameless.admits(self.spn_title)

    with it("rejects correctly without font name"):
        nameless = FontFilter({
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not nameless.admits(spn)

    with it("admits correctly with only font name"):
        name_only = FontFilter({
            'name': "CMBX12"
        })
        for spn in (self.spn_title, self.spn_small_title):
            assert name_only.admits(spn)

    with it("rejects correctly with only font name"):
        name_only = FontFilter({
            'name': "CMBX12"
        })
        assert not name_only.admits(self.spn_text)

    with it("admits correctly without size"):
        sizeless = FontFilter({
            'name': "CMBX12",
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        assert sizeless.admits(self.spn_title)

    with it("rejects correctly without size"):
        sizeless = FontFilter({
            'name': "CMBX12",
            'size_tolerance': 0,
            'color': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not sizeless.admits(spn)

    with it("admits correctly with only size"):
        size_only = FontFilter({
            'size': 14.346199989318848,
            'size_tolerance': 0
        })
        assert size_only.admits(self.spn_title)

    with it("rejects correctly with only size"):
        size_only = FontFilter({
            'size': 14.346199989318848,
            'size_tolerance': 0
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not size_only.admits(spn)

    with it("admits correctly without color"):
        colorless = FontFilter({
            'name': "CMBX12",
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        assert colorless.admits(self.spn_title)

    with it("rejects correctly without color"):
        colorless = FontFilter({
            'name': "CMBX12",
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not colorless.admits(spn)

    with it("admits correctly with only color"):
        color_only = FontFilter({
            'color': 0x000000,
        })
        # All three sample spans have color 0.
        for spn in (self.spn_title, self.spn_text, self.spn_small_title):
            assert color_only.admits(spn)

    with it("rejects correctly with only color"):
        color_only = FontFilter({
            'color': 0x000000,
        })
        # The title span with only its color changed.
        spn_blue = {**self.spn_title, 'color': 0x0000ff}
        assert not color_only.admits(spn_blue)

    with it("admits correctly with only flags"):
        flags_only = FontFilter({
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        assert flags_only.admits(self.spn_title)

    with it("rejects correctly with only flags"):
        flags_only = FontFilter({
            'superscript': False,
            'italic': False,
            'serif': True,
            'monospace': False,
            'bold': True
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not flags_only.admits(spn)

    with it("admits correctly without flags"):
        flagless = FontFilter({
            'name': "CMBX12",
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'color': 0,
        })
        assert flagless.admits(self.spn_title)

    with it("rejects correctly without flags"):
        flagless = FontFilter({
            'name': "CMBX12",
            'size': 14.346199989318848,
            'size_tolerance': 0,
            'color': 0,
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not flagless.admits(spn)

    with it("admits correctly with partial flags"):
        serif_bold = FontFilter({
            'serif': True,
            'bold': True
        })
        serif_only = FontFilter({
            'serif': True
        })
        sans_only = FontFilter({
            'serif': False
        })
        mono_only = FontFilter({
            'monospace': True
        })
        assert serif_bold.admits(self.spn_title)
        for spn in (self.spn_title, self.spn_text):
            assert serif_only.admits(spn)
        # Minimal spans that carry nothing but a flags value.
        assert sans_only.admits({'flags': 0b11011})
        assert mono_only.admits({'flags': 0b11111})

    with it("rejects correctly with partial flags"):
        serif_bold = FontFilter({
            'serif': True,
            'bold': True
        })
        serif_only = FontFilter({
            'serif': True
        })
        sans_only = FontFilter({
            'serif': False
        })
        mono_only = FontFilter({
            'monospace': True
        })
        for spn in (self.spn_text, self.spn_small_title):
            assert not serif_bold.admits(spn)
        for spn in (self.spn_title, self.spn_text):
            assert not sans_only.admits(spn)
        for spn in (self.spn_title, self.spn_text):
            assert not mono_only.admits(spn)
# Specs for BoundingBoxFilter: constructor attributes and admits() behaviour
# with a full bounding box or with a single constrained edge.
with description("BoundingBoxFilter") as self:
    with before.all:
        # Fully specified boxes with zero tolerance.
        self.title_exact = {
            'left': 157.98439025878906,
            'top': 567.3842163085938,
            'right': 245.18057250976562,
            'bottom': 581.7447509765625,
            'tolerance': 0
        }
        self.text_exact = {
            'left': 133.76800537109375,
            'top': 592.492919921875,
            'right': 477.537353515625,
            'bottom': 602.4555053710938,
            'tolerance': 0
        }

        # Sample spans; 'bbox' holds (left, top, right, bottom), in the same
        # order as the values in the exact specs above.
        self.spn_title = {
            'size': 14.346199989318848,
            'flags': 20,
            'font': 'TZOLRB+CMBX12',
            'color': 0,
            'text': 'Section Two',
            'bbox': (157.98439025878906,
                     567.3842163085938,
                     245.18057250976562,
                     581.7447509765625)
        }
        # A second title: same left edge, but different text and a different
        # top, right and bottom.
        self.spn_title2 = {
            **self.spn_title,
            'text': 'Section One',
            'bbox': (157.98439025878906,
                     335.569580078125,
                     477.66058349609375,
                     349.93011474609375)
        }
        self.spn_text = {
            'size': 9.962599754333496,
            'flags': 4,
            'font': 'MJDLZY+CMR10',
            'color': 0,
            'text': 'text',
            'bbox': (133.76800537109375,
                     592.492919921875,
                     477.537353515625,
                     602.4555053710938)
        }

    with it("has a working constructor"):
        box = BoundingBoxFilter(self.title_exact)
        for edge in (box.left, box.right, box.top, box.bottom):
            assert edge is not None
        assert box.tolerance == 0

    with it("can construct if empty dict is given in the constructor"):
        box = BoundingBoxFilter({})
        for edge in (box.left, box.right, box.top, box.bottom):
            assert edge is None
        assert box.tolerance == 1e-5

    with it("admits exact matches"):
        title_box = BoundingBoxFilter(self.title_exact)
        text_box = BoundingBoxFilter(self.text_exact)
        assert title_box.admits(self.spn_title)
        assert text_box.admits(self.spn_text)

    with it("rejects unmatched spans"):
        title_box = BoundingBoxFilter(self.title_exact)
        for spn in (self.spn_text, self.spn_title2):
            assert not title_box.admits(spn)

        text_box = BoundingBoxFilter(self.text_exact)
        for spn in (self.spn_title, self.spn_title2):
            assert not text_box.admits(spn)

    with it("admits correctly with partial bbox"):
        # Both titles share the same left edge.
        left_only = BoundingBoxFilter({
            'left': 157.98439025878906
        })
        for spn in (self.spn_title, self.spn_title2):
            assert left_only.admits(spn)

        # Each remaining edge of the first title, constrained on its own.
        single_edges = (
            ('top', 567.3842163085938),
            ('right', 245.18057250976562),
            ('bottom', 581.7447509765625),
        )
        for side, coord in single_edges:
            assert BoundingBoxFilter({side: coord}).admits(self.spn_title)

    with it("rejects correctly with partial bbox"):
        # The body text has a different left edge from the title.
        left_only = BoundingBoxFilter({
            'left': 157.98439025878906
        })
        assert not left_only.admits(self.spn_text)

        # The second title differs from the first on all three of these.
        single_edges = (
            ('top', 567.3842163085938),
            ('right', 245.18057250976562),
            ('bottom', 581.7447509765625),
        )
        for side, coord in single_edges:
            assert not BoundingBoxFilter({side: coord}).admits(self.spn_title2)