Spaces:
Sleeping
Sleeping
import os

import fitz
import toml
from mamba import description, it, before, after

from pdfxmeta import extract_meta, dump_meta, dump_toml
# Absolute directory containing this spec file; fixture paths such as
# "files/level2.pdf" are joined onto it so the specs do not depend on the
# current working directory.
dirpath = os.path.split(os.path.abspath(__file__))[0]
def _assert_cmbx12_span(entry, text):
    """Assert that *entry* carries exactly *text* and a CMBX12 font name."""
    assert entry['text'] == text
    assert 'font' in entry
    assert 'CMBX12' in entry['font']


with description("extract_meta:") as self:
    with before.all:
        # Open the fixture PDF once for the whole group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))

    with after.all:
        # Close the document opened in before.all so the file handle is
        # not left dangling once the group has finished.
        self.doc.close()

    with it("extracts metadata from pdf"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1
        _assert_cmbx12_span(meta[0], "Section One")

    with it("matches lowercase when ignore case is set"):
        meta = extract_meta(self.doc, "section one", 1, True)
        assert len(meta) == 1
        _assert_cmbx12_span(meta[0], "Section One")

    with it("matches mixed case when ignore case is set"):
        meta = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
        assert len(meta) == 1
        _assert_cmbx12_span(meta[0], "Section One")

    with it("matches nothing if ignore case is not set"):
        meta = extract_meta(self.doc, "section one", 1, False)
        assert len(meta) == 0

    with it("can match multiple instances of needle"):
        meta = extract_meta(self.doc, "Section", 1)
        assert len(meta) == 2
        _assert_cmbx12_span(meta[0], "Section One")
        _assert_cmbx12_span(meta[1], "Section Two")

    with it("returns [] when nothing is matched"):
        # "Sectoin" is a deliberate misspelling that must not match.
        meta = extract_meta(self.doc, "Sectoin", 1, False)
        assert len(meta) == 0

    with it("returns [] when page number is out of range"):
        # Page 0 and page 7 are the two out-of-range values probed here.
        meta = extract_meta(self.doc, "Section One", 0)
        assert len(meta) == 0
        meta = extract_meta(self.doc, "Section One", 7)
        assert len(meta) == 0

    with it("can match text on any page when page number is not specified"):
        meta = extract_meta(self.doc, "The End")
        assert len(meta) == 1
        _assert_cmbx12_span(meta[0], "The End")
with description("dump_meta:") as self:
    with before.all:
        # Open the fixture PDF once for the whole group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected result of parsing the TOML that dump_meta emits for the
        # "Section One" match on page 1 of the fixture.
        self.expected_meta = {
            'font': {
                'name': 'CMBX12',
                'size': 14.346199989318848,
                'color': 0x000000,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True,
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 237.6484375,
                'right': 243.12905883789062,
                'bottom': 252.00897216796875,
            },
        }

    with after.all:
        # Close the document opened in before.all so the file handle is
        # not left dangling once the group has finished.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1
        # toml.loads raises on malformed input, so a successful parse is
        # itself the validity check; the comparison pins the content.
        meta_dict = toml.loads(dump_meta(meta[0]))
        assert meta_dict == self.expected_meta
with description("dump_toml:") as self:
    with before.all:
        # Open the fixture PDF once for the whole group.
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Expected result of parsing the TOML that dump_toml emits for the
        # "Section One" match at heading level 1.
        self.expected_recipe = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'CMBX12',
                        'size': 14.346199989318848,
                    },
                },
            ],
        }

    with after.all:
        # Close the document opened in before.all so the file handle is
        # not left dangling once the group has finished.
        self.doc.close()

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1
        meta_dict = toml.loads(dump_toml(meta[0], 1))
        assert meta_dict == self.expected_recipe

    with it("strips font subset correctly"):
        def make_span(font_name):
            # Hand-built span dict; only the font name varies between cases.
            return {
                'font': font_name,
                'size': 1,
                'flags': 20,
                'color': 0,
                'bbox': (1, 2, 3, 4),
                'text': "",
            }

        def make_recipe(font_name):
            # Recipe expected back from dump_toml for a level-1 heading.
            return {
                'heading': [
                    {
                        'level': 1,
                        'greedy': True,
                        'font': {
                            'name': font_name,
                            'size': 1,
                        },
                    },
                ],
            }

        # (font name in the span, font name expected in the recipe).
        # Only the prefix up to the first '+' is dropped, so a name that
        # contains a second '+' keeps everything after the first one.
        cases = [
            ("subset+font", "font"),
            ("font", "font"),
            ("subset+font+font", "font+font"),
        ]
        for raw_name, stripped_name in cases:
            parsed = toml.loads(dump_toml(make_span(raw_name), 1))
            assert parsed == make_recipe(stripped_name), raw_name