Vik Paruchuri
committed on
Commit
·
f712a74
1
Parent(s):
21a0fa9
Fix table slicing issues
Browse files
README.md
CHANGED
|
@@ -400,8 +400,8 @@ Marker can extract tables from PDFs using `marker.converters.table.TableConverte
|
|
| 400 |
|
| 401 |
| Avg score | Total tables | use_llm |
|
| 402 |
|-----------|--------------|---------|
|
| 403 |
-
| 0.
|
| 404 |
-
| 0.
|
| 405 |
|
| 406 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
|
| 407 |
|
|
|
|
| 400 |
|
| 401 |
| Avg score | Total tables | use_llm |
|
| 402 |
|-----------|--------------|---------|
|
| 403 |
+
| 0.82 | 54 | False |
|
| 404 |
+
| 0.887 | 54 | True |
|
| 405 |
|
| 406 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
|
| 407 |
|
marker/processors/llm/llm_form.py
CHANGED
|
@@ -17,7 +17,7 @@ Values and labels should appear in html tables, with the labels on the left side
|
|
| 17 |
**Instructions:**
|
| 18 |
1. Carefully examine the provided form block image.
|
| 19 |
2. Analyze the html representation of the form.
|
| 20 |
-
3. If the html representation is largely correct, then write "No corrections needed."
|
| 21 |
4. If the html representation contains errors, generate the corrected html representation.
|
| 22 |
5. Output only either the corrected html representation or "No corrections needed."
|
| 23 |
**Example:**
|
|
|
|
| 17 |
**Instructions:**
|
| 18 |
1. Carefully examine the provided form block image.
|
| 19 |
2. Analyze the html representation of the form.
|
| 20 |
+
3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
|
| 21 |
4. If the html representation contains errors, generate the corrected html representation.
|
| 22 |
5. Output only either the corrected html representation or "No corrections needed."
|
| 23 |
**Example:**
|
marker/processors/llm/llm_table.py
CHANGED
|
@@ -37,7 +37,7 @@ Some guidelines:
|
|
| 37 |
**Instructions:**
|
| 38 |
1. Carefully examine the provided text block image.
|
| 39 |
2. Analyze the html representation of the table.
|
| 40 |
-
3. If the html representation is largely correct, then write "No corrections needed."
|
| 41 |
4. If the html representation contains errors, generate the corrected html representation.
|
| 42 |
5. Output only either the corrected html representation or "No corrections needed."
|
| 43 |
**Example:**
|
|
|
|
| 37 |
**Instructions:**
|
| 38 |
1. Carefully examine the provided text block image.
|
| 39 |
2. Analyze the html representation of the table.
|
| 40 |
+
3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
|
| 41 |
4. If the html representation contains errors, generate the corrected html representation.
|
| 42 |
5. Output only either the corrected html representation or "No corrections needed."
|
| 43 |
**Example:**
|
marker/processors/llm/llm_table_merge.py
CHANGED
|
@@ -55,7 +55,7 @@ You'll specify your judgement in json format - first whether Table 2 should be m
|
|
| 55 |
|
| 56 |
Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
|
| 57 |
|
| 58 |
-
Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging.
|
| 59 |
|
| 60 |
**Instructions:**
|
| 61 |
1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
|
|
|
|
| 55 |
|
| 56 |
Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
|
| 57 |
|
| 58 |
+
Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging. Only merge Table 1 and Table 2 if you can read both images properly.
|
| 59 |
|
| 60 |
**Instructions:**
|
| 61 |
1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
|
marker/processors/table.py
CHANGED
|
@@ -2,6 +2,8 @@ import re
|
|
| 2 |
from collections import defaultdict
|
| 3 |
from copy import deepcopy
|
| 4 |
from typing import Annotated, List
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from ftfy import fix_text
|
| 7 |
from surya.detection import DetectionPredictor
|
|
@@ -67,7 +69,7 @@ class TableProcessor(BaseProcessor):
|
|
| 67 |
table_data = []
|
| 68 |
for page in document.pages:
|
| 69 |
for block in page.contained_blocks(document, self.block_types):
|
| 70 |
-
image = block.get_image(document, highres=True
|
| 71 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
|
| 72 |
|
| 73 |
table_data.append({
|
|
@@ -165,22 +167,35 @@ class TableProcessor(BaseProcessor):
|
|
| 165 |
|
| 166 |
# Other cells that span into this row
|
| 167 |
rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
|
| 168 |
-
|
| 169 |
-
len(row_cells) >
|
| 170 |
len(rowspan_cells) == 0,
|
| 171 |
all([r == 1 for r in rowspans]),
|
| 172 |
all([l > 1 for l in line_lens]),
|
| 173 |
all([l == line_lens[0] for l in line_lens])
|
| 174 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
if should_split:
|
| 176 |
-
for i in range(0, line_lens
|
| 177 |
for cell in row_cells:
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
cell_id = max_cell_id + new_cell_count
|
| 180 |
new_cells.append(
|
| 181 |
SuryaTableCell(
|
| 182 |
-
polygon=
|
| 183 |
-
text_lines=
|
| 184 |
rowspan=1,
|
| 185 |
colspan=cell.colspan,
|
| 186 |
row_id=cell.row_id + shift_up + i,
|
|
|
|
| 2 |
from collections import defaultdict
|
| 3 |
from copy import deepcopy
|
| 4 |
from typing import Annotated, List
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from PIL import ImageDraw
|
| 7 |
|
| 8 |
from ftfy import fix_text
|
| 9 |
from surya.detection import DetectionPredictor
|
|
|
|
| 69 |
table_data = []
|
| 70 |
for page in document.pages:
|
| 71 |
for block in page.contained_blocks(document, self.block_types):
|
| 72 |
+
image = block.get_image(document, highres=True)
|
| 73 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
|
| 74 |
|
| 75 |
table_data.append({
|
|
|
|
| 167 |
|
| 168 |
# Other cells that span into this row
|
| 169 |
rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
|
| 170 |
+
should_split_entire_row = all([
|
| 171 |
+
len(row_cells) > 1,
|
| 172 |
len(rowspan_cells) == 0,
|
| 173 |
all([r == 1 for r in rowspans]),
|
| 174 |
all([l > 1 for l in line_lens]),
|
| 175 |
all([l == line_lens[0] for l in line_lens])
|
| 176 |
])
|
| 177 |
+
line_lens_counter = Counter(line_lens)
|
| 178 |
+
counter_keys = sorted(list(line_lens_counter.keys()))
|
| 179 |
+
should_split_partial_row = all([
|
| 180 |
+
len(row_cells) > 3, # Only split if there are more than 3 cells
|
| 181 |
+
len(rowspan_cells) == 0,
|
| 182 |
+
all([r == 1 for r in rowspans]),
|
| 183 |
+
len(line_lens_counter) == 2 and counter_keys[0] <= 1 and counter_keys[1] > 1 and line_lens_counter[counter_keys[0]] == 1, # Allow a single column with a single line - keys are the line lens, values are the counts
|
| 184 |
+
])
|
| 185 |
+
should_split = should_split_entire_row or should_split_partial_row
|
| 186 |
if should_split:
|
| 187 |
+
for i in range(0, max(line_lens)):
|
| 188 |
for cell in row_cells:
|
| 189 |
+
# Calculate height based on number of splits
|
| 190 |
+
split_height = cell.bbox[3] - cell.bbox[1]
|
| 191 |
+
current_bbox = [cell.bbox[0], cell.bbox[1] + i * split_height, cell.bbox[2], cell.bbox[1] + (i + 1) * split_height]
|
| 192 |
+
|
| 193 |
+
line = [cell.text_lines[i]] if cell.text_lines and i < len(cell.text_lines) else None
|
| 194 |
cell_id = max_cell_id + new_cell_count
|
| 195 |
new_cells.append(
|
| 196 |
SuryaTableCell(
|
| 197 |
+
polygon=current_bbox,
|
| 198 |
+
text_lines=line,
|
| 199 |
rowspan=1,
|
| 200 |
colspan=cell.colspan,
|
| 201 |
row_id=cell.row_id + shift_up + i,
|
marker/scripts/streamlit_app.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from marker.settings import settings
|
| 4 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 5 |
|
| 6 |
-
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 7 |
-
os.environ["IN_STREAMLIT"] = "true"
|
| 8 |
-
|
| 9 |
import base64
|
| 10 |
import io
|
| 11 |
import re
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 3 |
+
os.environ["IN_STREAMLIT"] = "true"
|
| 4 |
|
| 5 |
from marker.settings import settings
|
| 6 |
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
import base64
|
| 9 |
import io
|
| 10 |
import re
|