|
|
import pytest |
|
|
|
|
|
from marker.schema import BlockTypes |
|
|
from marker.schema.text.line import Line |
|
|
|
|
|
|
|
|
def _ocr_pipeline_test(pdf_document):
    """Check block hierarchy and line placement on the first page.

    Walks the first page's structure from its first block down to that
    block's first line and first span, asserting the block type at each
    level and the span's text. It then checks that no line's ``y_end``
    exceeds the largest ``y_end`` of the layout-sourced text blocks by more
    than 2%.

    Args:
        pdf_document: Document whose ``pages[0]`` is inspected; passed
            through unchanged from the calling tests.

    Raises:
        AssertionError: If any structural, text, or position check fails,
            including when the page has no lines or no layout-sourced
            text blocks to compare.
    """
    first_page = pdf_document.pages[0]
    assert first_page.structure[0] == "/page/0/SectionHeader/0"

    # Top level: the first block must be a section header extracted by surya.
    first_block = first_page.get_block(first_page.structure[0])
    assert first_block.text_extraction_method == "surya"
    assert first_block.block_type == BlockTypes.SectionHeader

    # Second level: the header's first child must be a line.
    first_text_block: Line = first_page.get_block(first_block.structure[0])
    assert first_text_block.block_type == BlockTypes.Line

    # Third level: the line's first child must be a span with the title text.
    first_span = first_page.get_block(first_text_block.structure[0])
    assert first_span.block_type == BlockTypes.Span
    assert first_span.text.strip() == "Subspace Adversarial Training"

    text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = first_page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )

    # Materialise the coordinates first so an empty collection fails with a
    # readable assertion instead of ``max()``'s bare ValueError.
    line_ends = [line.polygon.y_end for line in text_lines]
    assert line_ends, "expected at least one Line block on the first page"

    layout_block_ends = [
        block.polygon.y_end for block in text_blocks if block.source == "layout"
    ]
    assert layout_block_ends, (
        "expected at least one Text/TextInlineMath block with source == 'layout'"
    )

    max_line_position = max(line_ends)
    max_block_position = max(layout_block_ends)

    # Lines may overshoot the lowest layout block by at most 2%.
    assert max_line_position <= (max_block_position * 1.02), (
        f"line y_end {max_line_position} exceeds layout block y_end "
        f"{max_block_position} by more than 2%"
    )
|
|
|
|
|
|
|
|
@pytest.mark.config(
    {
        "force_ocr": True,
        "page_range": [0],
    }
)
def test_ocr_pipeline(pdf_document):
    """Run the shared OCR checks with ``force_ocr`` on and ``page_range`` ``[0]``."""
    _ocr_pipeline_test(pdf_document)
|
|
|
|
|
|
|
|
@pytest.mark.config(
    {
        "force_ocr": True,
        "page_range": [0],
        "use_llm": True,
    }
)
def test_ocr_with_inline_pipeline(pdf_document):
    """Run the shared OCR checks with ``use_llm`` enabled on top of forced OCR."""
    _ocr_pipeline_test(pdf_document)
|
|
|