Vik Paruchuri committed on
Commit
6951df2
·
1 Parent(s): c57e32c

Fix line merging bug

Browse files
marker/builders/line.py CHANGED
@@ -411,6 +411,26 @@ class LineBuilder(BaseBuilder):
411
  ]
412
  )
413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  for line_idx in filtered_merge_lines:
415
  text_line = text_lines[line_idx]
416
  for merge_section in filtered_merge_lines[line_idx]:
@@ -424,7 +444,9 @@ class LineBuilder(BaseBuilder):
424
  ]
425
  # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
426
  # TODO Add metadata for this
427
- merged_line.line.polygon = text_line.rescale(image_size, page_size)
 
 
428
  out_provider_lines.append((out_idx, merged_line))
429
  already_merged.add(merge_section[0])
430
  else:
@@ -442,7 +464,9 @@ class LineBuilder(BaseBuilder):
442
  already_merged.add(idx) # Prevent double merging
443
  # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
444
  # TODO Add metadata for this
445
- merged_line.line.polygon = text_line.rescale(image_size, page_size)
 
 
446
  out_provider_lines.append((out_idx, merged_line))
447
 
448
  # Sort to preserve original order
 
411
  ]
412
  )
413
 
414
+ def bbox_for_merge_section(
415
+ merge_section: List[int],
416
+ all_merge_sections: List[List[int]],
417
+ text_line: PolygonBox,
418
+ ):
419
+ # Don't just take the whole detected line if we have multiple sections inside
420
+ if len(all_merge_sections) == 1:
421
+ return text_line.rescale(image_size, page_size)
422
+ else:
423
+ poly = None
424
+ for section_idx in merge_section:
425
+ section_polygon = deepcopy(
426
+ horizontal_provider_lines[section_idx][1].line.polygon
427
+ )
428
+ if poly is None:
429
+ poly = section_polygon
430
+ else:
431
+ poly = poly.merge([section_polygon])
432
+ return poly
433
+
434
  for line_idx in filtered_merge_lines:
435
  text_line = text_lines[line_idx]
436
  for merge_section in filtered_merge_lines[line_idx]:
 
444
  ]
445
  # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
446
  # TODO Add metadata for this
447
+ merged_line.line.polygon = bbox_for_merge_section(
448
+ merge_section, filtered_merge_lines[line_idx], text_line
449
+ )
450
  out_provider_lines.append((out_idx, merged_line))
451
  already_merged.add(merge_section[0])
452
  else:
 
464
  already_merged.add(idx) # Prevent double merging
465
  # Set the polygon to the detected line - This is because provider polygons are sometimes incorrect
466
  # TODO Add metadata for this
467
+ merged_line.line.polygon = bbox_for_merge_section(
468
+ merge_section, filtered_merge_lines[line_idx], text_line
469
+ )
470
  out_provider_lines.append((out_idx, merged_line))
471
 
472
  # Sort to preserve original order
tests/builders/test_inline_math_lines.py DELETED
@@ -1,33 +0,0 @@
1
- import pytest
2
-
3
- from marker.processors.line_merge import LineMergeProcessor
4
- from marker.schema import BlockTypes
5
-
6
- @pytest.mark.skip("We do not support this functionality anymore")
7
- @pytest.mark.config({"page_range": [1]})
8
- def test_inline_box_nomerging(pdf_document, config):
9
- first_page = pdf_document.pages[0]
10
- block = pdf_document.get_block(first_page.structure[1]) # First inline math block
11
- line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
12
- assert line_count == 46
13
-
14
- merger = LineMergeProcessor(config)
15
- merger(pdf_document)
16
-
17
- line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
18
- assert line_count == 46
19
-
20
-
21
- @pytest.mark.skip("We do not support this functionality anymore")
22
- @pytest.mark.config({"page_range": [1], "use_llm": True})
23
- def test_inline_box_merging(pdf_document, config):
24
- first_page = pdf_document.pages[0]
25
- block = pdf_document.get_block(first_page.structure[1]) # First inline math block
26
- line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
27
- assert line_count == 21
28
-
29
- merger = LineMergeProcessor(config)
30
- merger(pdf_document)
31
-
32
- line_count = len(block.contained_blocks(pdf_document, (BlockTypes.Line,)))
33
- assert line_count == 21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/builders/test_merged_lines.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from marker.schema import BlockTypes
4
+
5
+
6
+ @pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
7
+ @pytest.mark.filename("bad_math.pdf")
8
+ def test_keep_ocr(pdf_document):
9
+ contained_lines = pdf_document.pages[0].contained_blocks(
10
+ pdf_document, [BlockTypes.Line]
11
+ )
12
+
13
+ # Check that we grabbed the right text
14
+ assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
15
+ assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
16
+
17
+ # Line 2 comes after line 1
18
+ assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]
tests/builders/test_strip_existing_ocr.py CHANGED
@@ -1,8 +1,5 @@
1
  import pytest
2
 
3
- from marker.builders.document import DocumentBuilder
4
- from marker.builders.layout import LayoutBuilder
5
- from marker.schema import BlockTypes
6
 
7
  @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
8
  @pytest.mark.filename("handwritten.pdf")
@@ -10,7 +7,8 @@ def test_strip_ocr(doc_provider):
10
  # Ensure that the OCR text isn't extracted
11
  assert len(doc_provider.page_lines) == 0
12
 
 
13
  @pytest.mark.config({"page_range": [0]})
14
  @pytest.mark.filename("handwritten.pdf")
15
  def test_keep_ocr(doc_provider):
16
- assert len(doc_provider.page_lines) == 1
 
1
  import pytest
2
 
 
 
 
3
 
4
  @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
5
  @pytest.mark.filename("handwritten.pdf")
 
7
  # Ensure that the OCR text isn't extracted
8
  assert len(doc_provider.page_lines) == 0
9
 
10
+
11
  @pytest.mark.config({"page_range": [0]})
12
  @pytest.mark.filename("handwritten.pdf")
13
  def test_keep_ocr(doc_provider):
14
+ assert len(doc_provider.page_lines) == 1
tests/processors/test_inline_math.py DELETED
@@ -1,49 +0,0 @@
1
- from unittest.mock import Mock
2
-
3
- import pytest
4
-
5
- from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
6
- from marker.schema import BlockTypes
7
-
8
-
9
- @pytest.mark.skip("We do not support this method of inline math anymore")
10
- @pytest.mark.filename("adversarial.pdf")
11
- @pytest.mark.config({"page_range": [0], "use_llm": True})
12
- def test_llm_text_processor(pdf_document, mocker):
13
- # Get all inline math lines
14
- text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
15
- text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
16
- assert len(text_lines) == 8
17
- corrected_lines = ["<math>Text</math>"] * len(text_lines)
18
-
19
- mock_cls = Mock()
20
- mock_cls.return_value = {"corrected_lines": corrected_lines}
21
-
22
- config = {"use_llm": True, "gemini_api_key": "test"}
23
- processor_lst = [LLMInlineMathLinesProcessor(config)]
24
- processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
25
- processor(pdf_document)
26
-
27
- contained_spans = text_lines[0].contained_blocks(pdf_document, (BlockTypes.Span,))
28
- assert contained_spans[0].text == "Text\n" # Newline inserted at end of line
29
- assert contained_spans[0].formats == ["math"]
30
-
31
-
32
- @pytest.mark.skip("We do not support this method of inline math anymore")
33
- @pytest.mark.filename("adversarial.pdf")
34
- @pytest.mark.config({"page_range": [0]})
35
- def test_llm_text_processor_disabled(pdf_document):
36
- # Get all inline math lines
37
- text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
38
- text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
39
- assert len(text_lines) == 0
40
-
41
-
42
- @pytest.mark.skip("We do not support this method of inline math anymore")
43
- @pytest.mark.filename("adversarial.pdf")
44
- @pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
45
- def test_llm_text_processor_texify(pdf_document):
46
- # Get all inline math lines
47
- text_lines = pdf_document.contained_blocks((BlockTypes.Line,))
48
- text_lines = [line for line in text_lines if line.formats and "math" in line.formats]
49
- assert len(text_lines) == 8