|
|
from typing import List |
|
|
|
|
|
import pytest |
|
|
|
|
|
from marker.renderers.markdown import MarkdownRenderer |
|
|
from marker.schema import BlockTypes |
|
|
from marker.processors.table import TableProcessor |
|
|
from marker.schema.blocks import TableCell |
|
|
|
|
|
|
|
|
@pytest.mark.config({"page_range": [5]})
def test_table_processor(
    pdf_document, recognition_model, table_rec_model, detection_model
):
    """Run the table processor and check the resulting table structure.

    After processing, every Table block among the first page's children must
    contain at least one TableCell, the document must hold exactly two Table
    blocks, and the rendered markdown must mention "Schedule".
    """
    table_processor = TableProcessor(
        recognition_model, table_rec_model, detection_model
    )
    table_processor(pdf_document)

    first_page = pdf_document.pages[0]
    for candidate in first_page.children:
        # Only Table blocks are of interest; skip everything else.
        if candidate.block_type != BlockTypes.Table:
            continue

        cells = candidate.contained_blocks(pdf_document, (BlockTypes.TableCell,))
        assert cells
        assert len(cells) > 0
        assert isinstance(cells[0], TableCell)

    all_tables = pdf_document.contained_blocks((BlockTypes.Table,))
    assert len(all_tables) == 2

    rendered = MarkdownRenderer()(pdf_document)
    assert "Schedule" in rendered.markdown
|
|
|
|
|
|
|
|
@pytest.mark.filename("table_ex.pdf")
@pytest.mark.config({"page_range": [0], "force_ocr": True})
def test_avoid_double_ocr(
    pdf_document, recognition_model, table_rec_model, detection_model
):
    """Check table processing with ``force_ocr`` on a table that has no lines.

    Before the processor runs, the first Table block must contain no Line
    blocks. After running a processor configured with ``force_ocr``, the
    rendered markdown must contain "Participants".
    """
    first_table = pdf_document.contained_blocks((BlockTypes.Table,))[0]
    existing_lines = first_table.contained_blocks(
        pdf_document, (BlockTypes.Line,)
    )
    # Precondition: the table starts out without any Line blocks.
    assert len(existing_lines) == 0

    ocr_config = {"force_ocr": True}
    table_processor = TableProcessor(
        recognition_model, table_rec_model, detection_model, config=ocr_config
    )
    table_processor(pdf_document)

    rendered = MarkdownRenderer()(pdf_document)
    assert "Participants" in rendered.markdown
|
|
|
|
|
|
|
|
@pytest.mark.filename("multicol-blocks.pdf")
@pytest.mark.config({"page_range": [3]})
def test_overlap_blocks(
    pdf_document, detection_model, recognition_model, table_rec_model
):
    """Check that table processing keeps a known phrase in the page text.

    The phrase must be present in the first page's raw text both before and
    after the table processor has run over the document.
    """
    expected_phrase = "Cascading, and the Auxiliary Problem Principle"
    first_page = pdf_document.pages[0]

    # Baseline: the phrase is present before any table processing happens.
    assert expected_phrase in first_page.raw_text(pdf_document)

    table_processor = TableProcessor(
        recognition_model, table_rec_model, detection_model
    )
    table_processor(pdf_document)

    # The same phrase must still be present once the processor has run.
    assert expected_phrase in first_page.raw_text(pdf_document)
|
|
|
|
|
|
|
|
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [4]})
def test_ocr_table(pdf_document, recognition_model, table_rec_model, detection_model):
    """Check that the processed document renders the value "1.2E-38".

    Runs the table processor over the document, renders it to markdown, and
    looks for the expected literal in the output.
    """
    table_processor = TableProcessor(
        recognition_model, table_rec_model, detection_model
    )
    table_processor(pdf_document)

    rendered = MarkdownRenderer()(pdf_document)
    assert "1.2E-38" in rendered.markdown
|
|
|
|
|
|
|
|
@pytest.mark.config({"page_range": [11]})
def test_split_rows(pdf_document, recognition_model, table_rec_model, detection_model):
    """Check that the last table in the document has six distinct rows.

    After the table processor runs, the ``row_id`` values of the TableCell
    blocks in the last Table block must take exactly six different values.
    """
    table_processor = TableProcessor(
        recognition_model, table_rec_model, detection_model
    )
    table_processor(pdf_document)

    last_table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
    cells: List[TableCell] = last_table.contained_blocks(
        pdf_document, (BlockTypes.TableCell,)
    )

    # Count rows by collapsing the cells' row ids into a set.
    distinct_row_ids = {cell.row_id for cell in cells}
    assert len(distinct_row_ids) == 6
|
|
|