Vik Paruchuri
committed on
Commit
·
1deee8c
1
Parent(s):
1dfe667
Clean up table output
Browse files
- README.md +4 -1
- benchmarks/table/table.py +1 -0
- marker/processors/table.py +2 -2
- marker/schema/blocks/tablecell.py +7 -2
README.md
CHANGED
|
@@ -394,12 +394,15 @@ Marker takes about 6GB of VRAM on average per task, so you can convert 8 documen
|
|
| 394 |

|
| 395 |
|
| 396 |
## Table Conversion
|
| 397 |
-
Marker can extract tables from
|
| 398 |
|
| 399 |
| Avg score | Total tables |
|
| 400 |
|-------------|----------------|
|
| 401 |
| 0.65 | 1149 |
|
| 402 |
|
|
|
|
|
|
|
|
|
|
| 403 |
## Running your own benchmarks
|
| 404 |
|
| 405 |
You can benchmark the performance of marker on your machine. Install marker manually with:
|
|
|
|
| 394 |

|
| 395 |
|
| 396 |
## Table Conversion
|
| 397 |
+
Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:
|
| 398 |
|
| 399 |
| Avg score | Total tables |
|
| 400 |
|-------------|----------------|
|
| 401 |
| 0.65 | 1149 |
|
| 402 |
|
| 403 |
+
|
| 404 |
+
We filter out tables that we cannot align with the ground truth, since FinTabNet and our layout model have slightly different detection methods (this results in some tables being split/merged).
|
| 405 |
+
|
| 406 |
## Running your own benchmarks
|
| 407 |
|
| 408 |
You can benchmark the performance of marker on your machine. Install marker manually with:
|
benchmarks/table/table.py
CHANGED
|
@@ -152,6 +152,7 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 152 |
for th_tag in marker_table_soup.find_all('th'):
|
| 153 |
th_tag.name = 'td'
|
| 154 |
marker_table_html = str(marker_table_soup)
|
|
|
|
| 155 |
|
| 156 |
results.append({
|
| 157 |
"marker_table": marker_table_html,
|
|
|
|
| 152 |
for th_tag in marker_table_soup.find_all('th'):
|
| 153 |
th_tag.name = 'td'
|
| 154 |
marker_table_html = str(marker_table_soup)
|
| 155 |
+
marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
|
| 156 |
|
| 157 |
results.append({
|
| 158 |
"marker_table": marker_table_html,
|
marker/processors/table.py
CHANGED
|
@@ -114,8 +114,8 @@ class TableProcessor(BaseProcessor):
|
|
| 114 |
|
| 115 |
def finalize_cell_text(self, cell: SuryaTableCell):
|
| 116 |
text = "\n".join([t["text"].strip() for t in cell.text_lines]) if cell.text_lines else ""
|
| 117 |
-
text = re.sub(r"(\s\.){
|
| 118 |
-
text = re.sub(r"\.{
|
| 119 |
return self.normalize_spaces(fix_text(text))
|
| 120 |
|
| 121 |
@staticmethod
|
|
|
|
| 114 |
|
| 115 |
def finalize_cell_text(self, cell: SuryaTableCell):
|
| 116 |
text = "\n".join([t["text"].strip() for t in cell.text_lines]) if cell.text_lines else ""
|
| 117 |
+
text = re.sub(r"(\s\.){2,}", "", text) # Replace . . .
|
| 118 |
+
text = re.sub(r"\.{2,}", "", text) # Replace ..., like in table of contents
|
| 119 |
return self.normalize_spaces(fix_text(text))
|
| 120 |
|
| 121 |
@staticmethod
|
marker/schema/blocks/tablecell.py
CHANGED
|
@@ -13,5 +13,10 @@ class TableCell(Block):
|
|
| 13 |
block_description: str = "A cell in a table."
|
| 14 |
|
| 15 |
def assemble_html(self, document, child_blocks, parent_structure=None):
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
block_description: str = "A cell in a table."
|
| 14 |
|
| 15 |
def assemble_html(self, document, child_blocks, parent_structure=None):
|
| 16 |
+
tag_cls = "th" if self.is_header else "td"
|
| 17 |
+
tag = f"<{tag_cls}"
|
| 18 |
+
if self.rowspan > 1:
|
| 19 |
+
tag += f" rowspan={self.rowspan}"
|
| 20 |
+
if self.colspan > 1:
|
| 21 |
+
tag += f" colspan={self.colspan}"
|
| 22 |
+
return f"{tag}>{self.text}</{tag_cls}>"
|