Vik Paruchuri committed on
Commit
f712a74
·
1 Parent(s): 21a0fa9

Fix table slicing issues

Browse files
README.md CHANGED
@@ -400,8 +400,8 @@ Marker can extract tables from PDFs using `marker.converters.table.TableConverte
400
 
401
  | Avg score | Total tables | use_llm |
402
  |-----------|--------------|---------|
403
- | 0.824 | 54 | False |
404
- | 0.873 | 54 | True |
405
 
406
  The `--use_llm` flag can significantly improve table recognition performance, as you can see.
407
 
 
400
 
401
  | Avg score | Total tables | use_llm |
402
  |-----------|--------------|---------|
403
+ | 0.82 | 54 | False |
404
+ | 0.887 | 54 | True |
405
 
406
  The `--use_llm` flag can significantly improve table recognition performance, as you can see.
407
 
marker/processors/llm/llm_form.py CHANGED
@@ -17,7 +17,7 @@ Values and labels should appear in html tables, with the labels on the left side
17
  **Instructions:**
18
  1. Carefully examine the provided form block image.
19
  2. Analyze the html representation of the form.
20
- 3. If the html representation is largely correct, then write "No corrections needed."
21
  4. If the html representation contains errors, generate the corrected html representation.
22
  5. Output only either the corrected html representation or "No corrections needed."
23
  **Example:**
 
17
  **Instructions:**
18
  1. Carefully examine the provided form block image.
19
  2. Analyze the html representation of the form.
20
+ 3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
21
  4. If the html representation contains errors, generate the corrected html representation.
22
  5. Output only either the corrected html representation or "No corrections needed."
23
  **Example:**
marker/processors/llm/llm_table.py CHANGED
@@ -37,7 +37,7 @@ Some guidelines:
37
  **Instructions:**
38
  1. Carefully examine the provided text block image.
39
  2. Analyze the html representation of the table.
40
- 3. If the html representation is largely correct, then write "No corrections needed."
41
  4. If the html representation contains errors, generate the corrected html representation.
42
  5. Output only either the corrected html representation or "No corrections needed."
43
  **Example:**
 
37
  **Instructions:**
38
  1. Carefully examine the provided text block image.
39
  2. Analyze the html representation of the table.
40
+ 3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
41
  4. If the html representation contains errors, generate the corrected html representation.
42
  5. Output only either the corrected html representation or "No corrections needed."
43
  **Example:**
marker/processors/llm/llm_table_merge.py CHANGED
@@ -55,7 +55,7 @@ You'll specify your judgement in json format - first whether Table 2 should be m
55
 
56
  Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
57
 
58
- Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging.
59
 
60
  **Instructions:**
61
  1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
 
55
 
56
  Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
57
 
58
+ Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging. Only merge Table 1 and Table 2 if you can read both images properly.
59
 
60
  **Instructions:**
61
  1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
marker/processors/table.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  from collections import defaultdict
3
  from copy import deepcopy
4
  from typing import Annotated, List
 
 
5
 
6
  from ftfy import fix_text
7
  from surya.detection import DetectionPredictor
@@ -67,7 +69,7 @@ class TableProcessor(BaseProcessor):
67
  table_data = []
68
  for page in document.pages:
69
  for block in page.contained_blocks(document, self.block_types):
70
- image = block.get_image(document, highres=True, expansion=(.01, .01))
71
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
72
 
73
  table_data.append({
@@ -165,22 +167,35 @@ class TableProcessor(BaseProcessor):
165
 
166
  # Other cells that span into this row
167
  rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
168
- should_split = all([
169
- len(row_cells) > 0,
170
  len(rowspan_cells) == 0,
171
  all([r == 1 for r in rowspans]),
172
  all([l > 1 for l in line_lens]),
173
  all([l == line_lens[0] for l in line_lens])
174
  ])
 
 
 
 
 
 
 
 
 
175
  if should_split:
176
- for i in range(0, line_lens[0]):
177
  for cell in row_cells:
178
- line = cell.text_lines[i]
 
 
 
 
179
  cell_id = max_cell_id + new_cell_count
180
  new_cells.append(
181
  SuryaTableCell(
182
- polygon=line["bbox"],
183
- text_lines=[line],
184
  rowspan=1,
185
  colspan=cell.colspan,
186
  row_id=cell.row_id + shift_up + i,
 
2
  from collections import defaultdict
3
  from copy import deepcopy
4
  from typing import Annotated, List
5
+ from collections import Counter
6
+ from PIL import ImageDraw
7
 
8
  from ftfy import fix_text
9
  from surya.detection import DetectionPredictor
 
69
  table_data = []
70
  for page in document.pages:
71
  for block in page.contained_blocks(document, self.block_types):
72
+ image = block.get_image(document, highres=True)
73
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
74
 
75
  table_data.append({
 
167
 
168
  # Other cells that span into this row
169
  rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
170
+ should_split_entire_row = all([
171
+ len(row_cells) > 1,
172
  len(rowspan_cells) == 0,
173
  all([r == 1 for r in rowspans]),
174
  all([l > 1 for l in line_lens]),
175
  all([l == line_lens[0] for l in line_lens])
176
  ])
177
+ line_lens_counter = Counter(line_lens)
178
+ counter_keys = sorted(list(line_lens_counter.keys()))
179
+ should_split_partial_row = all([
180
+ len(row_cells) > 3, # Only split if there are more than 3 cells
181
+ len(rowspan_cells) == 0,
182
+ all([r == 1 for r in rowspans]),
183
+ len(line_lens_counter) == 2 and counter_keys[0] <= 1 and counter_keys[1] > 1 and line_lens_counter[counter_keys[0]] == 1, # Allow a single column with a single line - keys are the line lens, values are the counts
184
+ ])
185
+ should_split = should_split_entire_row or should_split_partial_row
186
  if should_split:
187
+ for i in range(0, max(line_lens)):
188
  for cell in row_cells:
189
+ # Calculate height based on number of splits
190
+ split_height = cell.bbox[3] - cell.bbox[1]
191
+ current_bbox = [cell.bbox[0], cell.bbox[1] + i * split_height, cell.bbox[2], cell.bbox[1] + (i + 1) * split_height]
192
+
193
+ line = [cell.text_lines[i]] if cell.text_lines and i < len(cell.text_lines) else None
194
  cell_id = max_cell_id + new_cell_count
195
  new_cells.append(
196
  SuryaTableCell(
197
+ polygon=current_bbox,
198
+ text_lines=line,
199
  rowspan=1,
200
  colspan=cell.colspan,
201
  row_id=cell.row_id + shift_up + i,
marker/scripts/streamlit_app.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
 
 
2
 
3
  from marker.settings import settings
4
  from streamlit.runtime.uploaded_file_manager import UploadedFile
5
 
6
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
7
- os.environ["IN_STREAMLIT"] = "true"
8
-
9
  import base64
10
  import io
11
  import re
 
1
  import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
3
+ os.environ["IN_STREAMLIT"] = "true"
4
 
5
  from marker.settings import settings
6
  from streamlit.runtime.uploaded_file_manager import UploadedFile
7
 
 
 
 
8
  import base64
9
  import io
10
  import re