Vik Paruchuri
committed on
Commit
·
6951df2
1
Parent(s):
c57e32c
Fix line merging bug
Browse files
marker/builders/line.py
CHANGED
|
@@ -411,6 +411,26 @@ class LineBuilder(BaseBuilder):
|
|
| 411 |
]
|
| 412 |
)
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
for line_idx in filtered_merge_lines:
|
| 415 |
text_line = text_lines[line_idx]
|
| 416 |
for merge_section in filtered_merge_lines[line_idx]:
|
|
@@ -424,7 +444,9 @@ class LineBuilder(BaseBuilder):
|
|
| 424 |
]
|
| 425 |
# Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
|
| 426 |
# TODO Add metadata for this
|
| 427 |
-
merged_line.line.polygon =
|
|
|
|
|
|
|
| 428 |
out_provider_lines.append((out_idx, merged_line))
|
| 429 |
already_merged.add(merge_section[0])
|
| 430 |
else:
|
|
@@ -442,7 +464,9 @@ class LineBuilder(BaseBuilder):
|
|
| 442 |
already_merged.add(idx) # Prevent double merging
|
| 443 |
# Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
|
| 444 |
# TODO Add metadata for this
|
| 445 |
-
merged_line.line.polygon =
|
|
|
|
|
|
|
| 446 |
out_provider_lines.append((out_idx, merged_line))
|
| 447 |
|
| 448 |
# Sort to preserve original order
|
|
|
|
| 411 |
]
|
| 412 |
)
|
| 413 |
|
| 414 |
+
def bbox_for_merge_section(
|
| 415 |
+
merge_section: List[int],
|
| 416 |
+
all_merge_sections: List[List[int]],
|
| 417 |
+
text_line: PolygonBox,
|
| 418 |
+
):
|
| 419 |
+
# Don't just take the whole detected line if we have multiple sections inside
|
| 420 |
+
if len(all_merge_sections) == 1:
|
| 421 |
+
return text_line.rescale(image_size, page_size)
|
| 422 |
+
else:
|
| 423 |
+
poly = None
|
| 424 |
+
for section_idx in merge_section:
|
| 425 |
+
section_polygon = deepcopy(
|
| 426 |
+
horizontal_provider_lines[section_idx][1].line.polygon
|
| 427 |
+
)
|
| 428 |
+
if poly is None:
|
| 429 |
+
poly = section_polygon
|
| 430 |
+
else:
|
| 431 |
+
poly = poly.merge([section_polygon])
|
| 432 |
+
return poly
|
| 433 |
+
|
| 434 |
for line_idx in filtered_merge_lines:
|
| 435 |
text_line = text_lines[line_idx]
|
| 436 |
for merge_section in filtered_merge_lines[line_idx]:
|
|
|
|
| 444 |
]
|
| 445 |
# Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
|
| 446 |
# TODO Add metadata for this
|
| 447 |
+
merged_line.line.polygon = bbox_for_merge_section(
|
| 448 |
+
merge_section, filtered_merge_lines[line_idx], text_line
|
| 449 |
+
)
|
| 450 |
out_provider_lines.append((out_idx, merged_line))
|
| 451 |
already_merged.add(merge_section[0])
|
| 452 |
else:
|
|
|
|
| 464 |
already_merged.add(idx) # Prevent double merging
|
| 465 |
# Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
|
| 466 |
# TODO Add metadata for this
|
| 467 |
+
merged_line.line.polygon = bbox_for_merge_section(
|
| 468 |
+
merge_section, filtered_merge_lines[line_idx], text_line
|
| 469 |
+
)
|
| 470 |
out_provider_lines.append((out_idx, merged_line))
|
| 471 |
|
| 472 |
# Sort to preserve original order
|
tests/builders/test_inline_math_lines.py
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
import pytest
|
| 2 |
-
|
| 3 |
-
from marker.processors.line_merge import LineMergeProcessor
|
| 4 |
-
from marker.schema import BlockTypes
|
| 5 |
-
|
| 6 |
-
@pytest.mark.skip("We do not support this functionality anymore")
|
| 7 |
-
@pytest.mark.config({"page_range": [1]})
|
| 8 |
-
def test_inline_box_nomerging(pdf_document, config):
|
| 9 |
-
first_page = pdf_document.pages[0]
|
| 10 |
-
block = pdf_document.get_block(first_page.structure[1]) # First inline math block
|
| 11 |
-
line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
|
| 12 |
-
assert line_count == 46
|
| 13 |
-
|
| 14 |
-
merger = LineMergeProcessor(config)
|
| 15 |
-
merger(pdf_document)
|
| 16 |
-
|
| 17 |
-
line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
|
| 18 |
-
assert line_count == 46
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
@pytest.mark.skip("We do not support this functionality anymore")
|
| 22 |
-
@pytest.mark.config({"page_range": [1], "use_llm": True})
|
| 23 |
-
def test_inline_box_merging(pdf_document, config):
|
| 24 |
-
first_page = pdf_document.pages[0]
|
| 25 |
-
block = pdf_document.get_block(first_page.structure[1]) # First inline math block
|
| 26 |
-
line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
|
| 27 |
-
assert line_count == 21
|
| 28 |
-
|
| 29 |
-
merger = LineMergeProcessor(config)
|
| 30 |
-
merger(pdf_document)
|
| 31 |
-
|
| 32 |
-
line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
|
| 33 |
-
assert line_count == 21
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/builders/test_merged_lines.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from marker.schema import BlockTypes
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
|
| 7 |
+
@pytest.mark.filename("bad_math.pdf")
|
| 8 |
+
def test_keep_ocr(pdf_document):
|
| 9 |
+
contained_lines = pdf_document.pages[0].contained_blocks(
|
| 10 |
+
pdf_document, [BlockTypes.Line]
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# Check that we grabbed the right text
|
| 14 |
+
assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
|
| 15 |
+
assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
|
| 16 |
+
|
| 17 |
+
# Line 2 comes after line 1
|
| 18 |
+
assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]
|
tests/builders/test_strip_existing_ocr.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
|
| 3 |
-
from marker.builders.document import DocumentBuilder
|
| 4 |
-
from marker.builders.layout import LayoutBuilder
|
| 5 |
-
from marker.schema import BlockTypes
|
| 6 |
|
| 7 |
@pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
|
| 8 |
@pytest.mark.filename("handwritten.pdf")
|
|
@@ -10,7 +7,8 @@ def test_strip_ocr(doc_provider):
|
|
| 10 |
# Ensure that the OCR text isn't extracted
|
| 11 |
assert len(doc_provider.page_lines) == 0
|
| 12 |
|
|
|
|
| 13 |
@pytest.mark.config({"page_range": [0]})
|
| 14 |
@pytest.mark.filename("handwritten.pdf")
|
| 15 |
def test_keep_ocr(doc_provider):
|
| 16 |
-
assert len(doc_provider.page_lines) == 1
|
|
|
|
| 1 |
import pytest
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
@pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
|
| 5 |
@pytest.mark.filename("handwritten.pdf")
|
|
|
|
| 7 |
# Ensure that the OCR text isn't extracted
|
| 8 |
assert len(doc_provider.page_lines) == 0
|
| 9 |
|
| 10 |
+
|
| 11 |
@pytest.mark.config({"page_range": [0]})
|
| 12 |
@pytest.mark.filename("handwritten.pdf")
|
| 13 |
def test_keep_ocr(doc_provider):
|
| 14 |
+
assert len(doc_provider.page_lines) == 1
|
tests/processors/test_inline_math.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
from unittest.mock import Mock
|
| 2 |
-
|
| 3 |
-
import pytest
|
| 4 |
-
|
| 5 |
-
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
|
| 6 |
-
from marker.schema import BlockTypes
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
@pytest.mark.skip("We do not support this method of inline math anymore")
|
| 10 |
-
@pytest.mark.filename("adversarial.pdf")
|
| 11 |
-
@pytest.mark.config({"page_range": [0], "use_llm": True})
|
| 12 |
-
def test_llm_text_processor(pdf_document, mocker):
|
| 13 |
-
# Get all inline math lines
|
| 14 |
-
text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
|
| 15 |
-
text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
|
| 16 |
-
assert len(text_lines) == 8
|
| 17 |
-
corrected_lines = ["<math>Text</math>"] * len(text_lines)
|
| 18 |
-
|
| 19 |
-
mock_cls = Mock()
|
| 20 |
-
mock_cls.return_value = {"corrected_lines": corrected_lines}
|
| 21 |
-
|
| 22 |
-
config = {"use_llm": True, "gemini_api_key": "test"}
|
| 23 |
-
processor_lst = [LLMInlineMathLinesProcessor(config)]
|
| 24 |
-
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 25 |
-
processor(pdf_document)
|
| 26 |
-
|
| 27 |
-
contained_spans = text_lines[0].contained_blocks(pdf_document, (BlockTypes.Span,))
|
| 28 |
-
assert contained_spans[0].text == "Text\n" # Newline inserted at end of line
|
| 29 |
-
assert contained_spans[0].formats == ["math"]
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
@pytest.mark.skip("We do not support this method of inline math anymore")
|
| 33 |
-
@pytest.mark.filename("adversarial.pdf")
|
| 34 |
-
@pytest.mark.config({"page_range": [0]})
|
| 35 |
-
def test_llm_text_processor_disabled(pdf_document):
|
| 36 |
-
# Get all inline math lines
|
| 37 |
-
text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
|
| 38 |
-
text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
|
| 39 |
-
assert len(text_lines) == 0
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
@pytest.mark.skip("We do not support this method of inline math anymore")
|
| 43 |
-
@pytest.mark.filename("adversarial.pdf")
|
| 44 |
-
@pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
|
| 45 |
-
def test_llm_text_processor_texify(pdf_document):
|
| 46 |
-
# Get all inline math lines
|
| 47 |
-
text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
|
| 48 |
-
text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
|
| 49 |
-
assert len(text_lines) == 8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|