# pdf.tocgen.split / spec/xmeta_spec.py
# adelevett's picture
# Upload 76 files
# 046e3b8 verified
import os

import fitz
import toml
from mamba import after, before, description, it

from pdfxmeta import extract_meta, dump_meta, dump_toml
# Absolute directory of this spec file; the PDF fixtures used by the specs
# below are resolved relative to it.
dirpath = os.path.split(os.path.abspath(__file__))[0]
with description("extract_meta:") as self:
    # Specs for pdfxmeta.extract_meta, exercised against the level2.pdf
    # fixture. Call shape used throughout:
    #   extract_meta(doc, needle[, page_number[, ignore_case]])

    with before.all:
        # Open the fixture once for the whole group; the examples only read
        # from the document.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))

    with after.all:
        # Release the document handle opened in before.all so the spec run
        # does not leak an open file.
        self.doc.close()

    with it("extracts metadata from pdf"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        # Substring check rather than equality; dump_toml's spec shows that
        # font names may carry a "subset+" prefix.
        assert 'CMBX12' in m['font']

    with it("matches lowercase when ignore case is set"):
        meta = extract_meta(self.doc, "section one", 1, True)
        assert len(meta) == 1

        m = meta[0]
        # The reported text keeps the document's original casing.
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches mixed case when ignore case is set"):
        meta = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches nothing if ignore case is not set"):
        meta = extract_meta(self.doc, "section one", 1, False)
        assert len(meta) == 0

    with it("can match multiple instances of needle"):
        meta = extract_meta(self.doc, "Section", 1)
        assert len(meta) == 2

        # Matches are expected in this order: "Section One", "Section Two".
        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

        m = meta[1]
        assert m['text'] == "Section Two"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("returns [] when nothing is matched"):
        # "Sectoin" is a deliberate misspelling that must not match.
        meta = extract_meta(self.doc, "Sectoin", 1, False)
        assert len(meta) == 0

    with it("returns [] when page number is out of range"):
        # Page 0 and page 7 are both expected to be outside the fixture's
        # valid page range (presumably 1-based numbering -- confirm against
        # pdfxmeta).
        meta = extract_meta(self.doc, "Section One", 0)
        assert len(meta) == 0

        meta = extract_meta(self.doc, "Section One", 7)
        assert len(meta) == 0

    with it("can match text on any page when page number is not specified"):
        meta = extract_meta(self.doc, "The End")
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "The End"
        assert 'font' in m
        assert 'CMBX12' in m['font']
with description("dump_meta:") as self:
    # Specs for pdfxmeta.dump_meta: its output for a single extract_meta
    # match must parse as TOML and equal the table below.

    with before.all:
        # Open the fixture once for the whole group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected TOML content for the "Section One" heading on page 1.
        # The float values are compared exactly, so they are tied to this
        # particular fixture file.
        self.expected_meta = {
            'font': {
                'name': 'CMBX12',
                'size': 14.346199989318848,
                'color': 0x000000,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 237.6484375,
                'right': 243.12905883789062,
                'bottom': 252.00897216796875
            }
        }

    with after.all:
        # Release the document handle opened in before.all so the spec run
        # does not leak an open file.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        # toml.loads raises on malformed input, so this checks validity as
        # well as content.
        meta_dict = toml.loads(dump_meta(meta[0]))
        assert meta_dict == self.expected_meta
with description("dump_toml:") as self:
    # Specs for pdfxmeta.dump_toml: its output must parse as TOML and
    # describe a single 'heading' entry.

    with before.all:
        # Open the fixture once for the whole group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected recipe for the "Section One" heading when dumped with
        # level 1.
        self.expected_recipe = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'CMBX12',
                        'size': 14.346199989318848,
                    }
                }
            ]
        }

    with after.all:
        # Release the document handle opened in before.all so the spec run
        # does not leak an open file.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        # toml.loads raises on malformed input, so this checks validity as
        # well as content.
        meta_dict = toml.loads(dump_toml(meta[0], 1))
        assert meta_dict == self.expected_recipe

    with it("strips font subset correctly"):
        # Hand-built match dicts (no PDF involved) that differ only in the
        # font name, to pin down how a "subset+" prefix is handled.
        with_subset = {
            'font': "subset+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        without_subset = {
            'font': "font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        # Both of the inputs above must dump to the bare font name.
        expected = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font',
                        'size': 1
                    }
                }
            ]
        }

        double_plus = {
            'font': "subset+font+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        # Only the part up to the first '+' is stripped; later '+' characters
        # stay in the name.
        expected2 = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font+font',
                        'size': 1
                    }
                }
            ]
        }

        assert toml.loads(dump_toml(with_subset, 1)) == expected
        assert toml.loads(dump_toml(without_subset, 1)) == expected
        assert toml.loads(dump_toml(double_plus, 1)) == expected2