Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Apr 8

Commit

6951df2

1 Parent(s): c57e32c

Fix line merging bug

Browse files

Files changed (5) hide show

marker/builders/line.py +26 -2
tests/builders/test_inline_math_lines.py +0 -33
tests/builders/test_merged_lines.py +18 -0
tests/builders/test_strip_existing_ocr.py +2 -4
tests/processors/test_inline_math.py +0 -49

marker/builders/line.py CHANGED Viewed

@@ -411,6 +411,26 @@ class LineBuilder(BaseBuilder):
             ]
         )
         for line_idx in filtered_merge_lines:
             text_line = text_lines[line_idx]
             for merge_section in filtered_merge_lines[line_idx]:
@@ -424,7 +444,9 @@ class LineBuilder(BaseBuilder):
                     ]
                     # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
                     # TODO Add metadata for this
-                    merged_line.line.polygon = text_line.rescale(image_size, page_size)
                     out_provider_lines.append((out_idx, merged_line))
                     already_merged.add(merge_section[0])
                 else:
@@ -442,7 +464,9 @@ class LineBuilder(BaseBuilder):
                         already_merged.add(idx)  # Prevent double merging
                     # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
                     # TODO Add metadata for this
-                    merged_line.line.polygon = text_line.rescale(image_size, page_size)
                     out_provider_lines.append((out_idx, merged_line))
         # Sort to preserve original order

             ]
         )
+        def bbox_for_merge_section(
+            merge_section: List[int],
+            all_merge_sections: List[List[int]],
+            text_line: PolygonBox,
+        ):
+            # Don't just take the whole detected line if we have multiple sections inside
+            if len(all_merge_sections) == 1:
+                return text_line.rescale(image_size, page_size)
+            else:
+                poly = None
+                for section_idx in merge_section:
+                    section_polygon = deepcopy(
+                        horizontal_provider_lines[section_idx][1].line.polygon
+                    )
+                    if poly is None:
+                        poly = section_polygon
+                    else:
+                        poly = poly.merge([section_polygon])
+                return poly
         for line_idx in filtered_merge_lines:
             text_line = text_lines[line_idx]
             for merge_section in filtered_merge_lines[line_idx]:
                     ]
                     # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
                     # TODO Add metadata for this
+                    merged_line.line.polygon = bbox_for_merge_section(
+                        merge_section, filtered_merge_lines[line_idx], text_line
+                    )
                     out_provider_lines.append((out_idx, merged_line))
                     already_merged.add(merge_section[0])
                 else:
                         already_merged.add(idx)  # Prevent double merging
                     # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
                     # TODO Add metadata for this
+                    merged_line.line.polygon = bbox_for_merge_section(
+                        merge_section, filtered_merge_lines[line_idx], text_line
+                    )
                     out_provider_lines.append((out_idx, merged_line))
         # Sort to preserve original order

tests/builders/test_inline_math_lines.py DELETED Viewed

@@ -1,33 +0,0 @@
-import pytest
-from marker.processors.line_merge import LineMergeProcessor
-from marker.schema import BlockTypes
-@pytest.mark.skip("We do not support this functionality anymore")
-@pytest.mark.config({"page_range": [1]})
-def test_inline_box_nomerging(pdf_document, config):
-    first_page = pdf_document.pages[0]
-    block = pdf_document.get_block(first_page.structure[1]) # First inline math block
-    line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
-    assert line_count == 46
-    merger = LineMergeProcessor(config)
-    merger(pdf_document)
-    line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
-    assert line_count == 46
-@pytest.mark.skip("We do not support this functionality anymore")
-@pytest.mark.config({"page_range": [1], "use_llm": True})
-def test_inline_box_merging(pdf_document, config):
-    first_page = pdf_document.pages[0]
-    block = pdf_document.get_block(first_page.structure[1]) # First inline math block
-    line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
-    assert line_count == 21
-    merger = LineMergeProcessor(config)
-    merger(pdf_document)
-    line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
-    assert line_count == 21

tests/builders/test_merged_lines.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import pytest
+from marker.schema import BlockTypes
+@pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
+@pytest.mark.filename("bad_math.pdf")
+def test_keep_ocr(pdf_document):
+    contained_lines = pdf_document.pages[0].contained_blocks(
+        pdf_document, [BlockTypes.Line]
+    )
+    # Check that we grabbed the right text
+    assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
+    assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
+    # Line 2 comes after line 1
+    assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]

tests/builders/test_strip_existing_ocr.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import pytest
-from marker.builders.document import DocumentBuilder
-from marker.builders.layout import LayoutBuilder
-from marker.schema import BlockTypes
 @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
 @pytest.mark.filename("handwritten.pdf")
@@ -10,7 +7,8 @@ def test_strip_ocr(doc_provider):
     # Ensure that the OCR text isn't extracted
     assert len(doc_provider.page_lines) == 0
 @pytest.mark.config({"page_range": [0]})
 @pytest.mark.filename("handwritten.pdf")
 def test_keep_ocr(doc_provider):
-    assert len(doc_provider.page_lines) == 1

 import pytest
 @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
 @pytest.mark.filename("handwritten.pdf")
     # Ensure that the OCR text isn't extracted
     assert len(doc_provider.page_lines) == 0
 @pytest.mark.config({"page_range": [0]})
 @pytest.mark.filename("handwritten.pdf")
 def test_keep_ocr(doc_provider):
+    assert len(doc_provider.page_lines) == 1

tests/processors/test_inline_math.py DELETED Viewed

@@ -1,49 +0,0 @@
-from unittest.mock import Mock
-import pytest
-from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
-from marker.schema import BlockTypes
-@pytest.mark.skip("We do not support this method of inline math anymore")
-@pytest.mark.filename("adversarial.pdf")
-@pytest.mark.config({"page_range": [0], "use_llm": True})
-def test_llm_text_processor(pdf_document, mocker):
-    # Get all inline math lines
-    text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
-    text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
-    assert len(text_lines) == 8
-    corrected_lines = ["<math>Text</math>"] * len(text_lines)
-    mock_cls = Mock()
-    mock_cls.return_value = {"corrected_lines": corrected_lines}
-    config = {"use_llm": True, "gemini_api_key": "test"}
-    processor_lst = [LLMInlineMathLinesProcessor(config)]
-    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
-    processor(pdf_document)
-    contained_spans = text_lines[0].contained_blocks(pdf_document, (BlockTypes.Span,))
-    assert contained_spans[0].text == "Text\n" # Newline inserted at end of line
-    assert contained_spans[0].formats == ["math"]
-@pytest.mark.skip("We do not support this method of inline math anymore")
-@pytest.mark.filename("adversarial.pdf")
-@pytest.mark.config({"page_range": [0]})
-def test_llm_text_processor_disabled(pdf_document):
-    # Get all inline math lines
-    text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
-    text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
-    assert len(text_lines) == 0
-@pytest.mark.skip("We do not support this method of inline math anymore")
-@pytest.mark.filename("adversarial.pdf")
-@pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
-def test_llm_text_processor_texify(pdf_document):
-    # Get all inline math lines
-    text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
-    text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
-    assert len(text_lines) == 8