Vik Paruchuri
committed on
Commit
·
9980f1e
1
Parent(s):
b730265
Fix remaining tests
Browse files
marker/builders/line.py
CHANGED
|
@@ -53,7 +53,7 @@ class LineBuilder(BaseBuilder):
|
|
| 53 |
provider_line_provider_line_min_overlap_pct: Annotated[
|
| 54 |
float,
|
| 55 |
"The percentage of a provider line that has to be covered by a detected line",
|
| 56 |
-
] = 0.
|
| 57 |
excluded_for_coverage: Annotated[
|
| 58 |
Tuple[BlockTypes],
|
| 59 |
"A list of block types to exclude from the layout coverage check.",
|
|
|
|
| 53 |
provider_line_provider_line_min_overlap_pct: Annotated[
|
| 54 |
float,
|
| 55 |
"The percentage of a provider line that has to be covered by a detected line",
|
| 56 |
+
] = 0.1
|
| 57 |
excluded_for_coverage: Annotated[
|
| 58 |
Tuple[BlockTypes],
|
| 59 |
"A list of block types to exclude from the layout coverage check.",
|
tests/builders/test_document_builder.py
CHANGED
|
@@ -38,6 +38,5 @@ def test_document_builder_inline_eq(pdf_document):
|
|
| 38 |
|
| 39 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 40 |
assert first_span.block_type == BlockTypes.Span
|
| 41 |
-
assert first_span.text == "Subspace Adversarial Training"
|
| 42 |
-
assert first_span.
|
| 43 |
-
assert first_span.formats == ["plain"]
|
|
|
|
| 38 |
|
| 39 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 40 |
assert first_span.block_type == BlockTypes.Span
|
| 41 |
+
assert first_span.text.strip() == "Subspace Adversarial Training"
|
| 42 |
+
assert "bold" in first_span.formats
|
|
|
tests/builders/test_pdf_links.py
CHANGED
|
@@ -11,6 +11,7 @@ from marker.util import classes_to_strings
|
|
| 11 |
|
| 12 |
@pytest.mark.filename("arxiv_test.pdf")
|
| 13 |
@pytest.mark.output_format("markdown")
|
|
|
|
| 14 |
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
|
| 15 |
first_page = pdf_document.pages[1]
|
| 16 |
|
|
@@ -19,26 +20,35 @@ def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_do
|
|
| 19 |
artifact_dict=model_dict,
|
| 20 |
processor_list=processors,
|
| 21 |
renderer=classes_to_strings([renderer])[0],
|
| 22 |
-
config=config
|
| 23 |
)
|
| 24 |
|
| 25 |
-
for section_header_span in first_page.contained_blocks(
|
|
|
|
|
|
|
| 26 |
if "II." in section_header_span.text:
|
| 27 |
assert section_header_span.url == "#page-1-0"
|
| 28 |
break
|
| 29 |
else:
|
| 30 |
raise ValueError("Could not find II. in the first page")
|
| 31 |
|
| 32 |
-
section_header_block = first_page.contained_blocks(
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
assert first_page.refs[0].ref == "page-1-0"
|
| 36 |
|
| 37 |
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 38 |
markdown = markdown_output.markdown
|
| 39 |
|
| 40 |
-
assert
|
| 41 |
assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown
|
| 42 |
|
| 43 |
-
for ref in set(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
assert ref in markdown, f"Reference {ref} not found in markdown"
|
|
|
|
| 11 |
|
| 12 |
@pytest.mark.filename("arxiv_test.pdf")
|
| 13 |
@pytest.mark.output_format("markdown")
|
| 14 |
+
@pytest.mark.config({"disable_ocr": True})
|
| 15 |
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
|
| 16 |
first_page = pdf_document.pages[1]
|
| 17 |
|
|
|
|
| 20 |
artifact_dict=model_dict,
|
| 21 |
processor_list=processors,
|
| 22 |
renderer=classes_to_strings([renderer])[0],
|
| 23 |
+
config=config,
|
| 24 |
)
|
| 25 |
|
| 26 |
+
for section_header_span in first_page.contained_blocks(
|
| 27 |
+
pdf_document, (BlockTypes.Span,)
|
| 28 |
+
):
|
| 29 |
if "II." in section_header_span.text:
|
| 30 |
assert section_header_span.url == "#page-1-0"
|
| 31 |
break
|
| 32 |
else:
|
| 33 |
raise ValueError("Could not find II. in the first page")
|
| 34 |
|
| 35 |
+
section_header_block = first_page.contained_blocks(
|
| 36 |
+
pdf_document, (BlockTypes.SectionHeader,)
|
| 37 |
+
)[0]
|
| 38 |
+
assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n"
|
| 39 |
|
| 40 |
assert first_page.refs[0].ref == "page-1-0"
|
| 41 |
|
| 42 |
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 43 |
markdown = markdown_output.markdown
|
| 44 |
|
| 45 |
+
assert "[II.](#page-1-0)" in markdown
|
| 46 |
assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown
|
| 47 |
|
| 48 |
+
for ref in set(
|
| 49 |
+
[
|
| 50 |
+
f'<span id="page-{m[0]}-{m[1]}">'
|
| 51 |
+
for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown)
|
| 52 |
+
]
|
| 53 |
+
):
|
| 54 |
assert ref in markdown, f"Reference {ref} not found in markdown"
|
tests/builders/test_rotated_bboxes.py
CHANGED
|
@@ -13,7 +13,7 @@ def test_rotated_bboxes(pdf_document):
|
|
| 13 |
text_blocks = first_page.contained_blocks(
|
| 14 |
pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 15 |
)
|
| 16 |
-
assert len(text_lines) ==
|
| 17 |
|
| 18 |
# Ensure the bbox sizes match up
|
| 19 |
max_line_position = max([line.polygon.x_end for line in text_lines])
|
|
|
|
| 13 |
text_blocks = first_page.contained_blocks(
|
| 14 |
pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 15 |
)
|
| 16 |
+
assert len(text_lines) == 84
|
| 17 |
|
| 18 |
# Ensure the bbox sizes match up
|
| 19 |
max_line_position = max([line.polygon.x_end for line in text_lines])
|