|
|
import pytest |
|
|
|
|
|
from marker.schema import BlockTypes |
|
|
from marker.schema.text.line import Line |
|
|
|
|
|
|
|
|
@pytest.mark.filename("thinkpython.pdf")
@pytest.mark.config({"page_range": [0]})
def test_document_builder(pdf_document):
    """Descend from the first page's leading block to its first span.

    Each level of the hierarchy (section header, then line, then span) is
    checked for its block type, and the span is checked for its text, font
    and formats. The markers above name ``thinkpython.pdf`` and a page range
    of ``[0]``; they are presumably consumed by the ``pdf_document`` fixture,
    which is defined outside this file.
    """
    page = pdf_document.pages[0]

    # The page structure must open with the first section header.
    header_id = page.structure[0]
    assert header_id == "/page/0/SectionHeader/0"

    header = page.get_block(header_id)
    assert header.block_type == BlockTypes.SectionHeader
    assert header.text_extraction_method == "pdftext"

    # The header's first child is expected to be a line.
    line: Line = page.get_block(header.structure[0])
    assert line.block_type == BlockTypes.Line

    # The line's first child is expected to be a span with known content.
    span = page.get_block(line.structure[0])
    assert span.block_type == BlockTypes.Span
    assert span.text == "Think Python"
    assert span.font == "URWPalladioL-Roma"
    assert span.formats == ["plain"]
|
|
|
|
|
|
|
|
@pytest.mark.config({"page_range": [0]})
def test_document_builder_inline_eq(pdf_document):
    """Check the header/line/span chain when the header reports "surya" extraction.

    No ``filename`` marker is set here, so the input document is whatever the
    ``pdf_document`` fixture uses by default (defined outside this file —
    TODO confirm which PDF that is). Only the first page is requested via the
    ``page_range`` config marker.
    """
    page = pdf_document.pages[0]

    # The page structure must open with the first section header.
    header_id = page.structure[0]
    assert header_id == "/page/0/SectionHeader/0"

    header = page.get_block(header_id)
    assert header.block_type == BlockTypes.SectionHeader
    assert header.text_extraction_method == "surya"

    # The header's first child is expected to be a line.
    line: Line = page.get_block(header.structure[0])
    assert line.block_type == BlockTypes.Line

    # The line's first child is expected to be a span.
    span = page.get_block(line.structure[0])
    assert span.block_type == BlockTypes.Span
    # Surrounding whitespace is ignored when comparing the span text.
    assert span.text.strip() == "Subspace Adversarial Training"
    # Only membership is checked: other formats may be present alongside "bold".
    assert "bold" in span.formats
|
|
|