Vik Paruchuri committed on
Commit
1daa24b
·
1 Parent(s): 518215f

Add test for partial row splitting

Browse files
marker/processors/table.py CHANGED
@@ -226,7 +226,8 @@ class TableProcessor(BaseProcessor):
226
  new_cell_count += 1
227
 
228
  # For each new row we add, shift up subsequent rows
229
- shift_up += line_lens[0] - 1
 
230
  else:
231
  for cell in row_cells:
232
  cell.row_id += shift_up
 
226
  new_cell_count += 1
227
 
228
  # For each new row we add, shift up subsequent rows
229
+ # The max is to account for partial rows
230
+ shift_up += max(line_lens) - 1
231
  else:
232
  for cell in row_cells:
233
  cell.row_id += shift_up
tests/processors/test_table_processor.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import pytest
2
  from marker.renderers.json import JSONRenderer
3
 
@@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
63
  table_output = renderer(pdf_document)
64
  assert "1.2E-38" in table_output.markdown
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
  import pytest
4
  from marker.renderers.json import JSONRenderer
5
 
 
65
  table_output = renderer(pdf_document)
66
  assert "1.2E-38" in table_output.markdown
67
 
68
+
69
+ @pytest.mark.config({"page_range": [11]})
70
+ def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
71
+ processor = TableProcessor(detection_model, recognition_model, table_rec_model)
72
+ processor(pdf_document)
73
+
74
+ table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
75
+ cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
76
+ unique_rows = len(set([cell.row_id for cell in cells]))
77
+ assert unique_rows == 6
78
+
79
+