Spaces:
Runtime error
Runtime error
fix: handle None values in extracted table cells when converting tables to Markdown (None cells become empty strings), and skip tables whose first (header) row is empty.
Browse files
- data_cutter.py (+7 -5)
data_cutter.py
CHANGED
|
@@ -17,13 +17,15 @@ def load_documents():
|
|
| 17 |
tables = page.extract_tables() or []
|
| 18 |
table_texts = []
|
| 19 |
for table in tables:
|
| 20 |
-
if not table:
|
| 21 |
continue
|
| 22 |
-
# Convert table to markdown
|
| 23 |
-
|
| 24 |
-
md_table
|
|
|
|
| 25 |
for row in table[1:]:
|
| 26 |
-
|
|
|
|
| 27 |
table_texts.append(md_table)
|
| 28 |
full_page = text + "\n\n" + "\n\n".join(table_texts)
|
| 29 |
documents.append(Document(page_content=full_page, metadata={"page": i+1}))
|
|
|
|
| 17 |
tables = page.extract_tables() or []
|
| 18 |
table_texts = []
|
| 19 |
for table in tables:
|
| 20 |
+
if not table or not table[0]:
|
| 21 |
continue
|
| 22 |
+
# Convert table to markdown - handle None values in cells
|
| 23 |
+
header_row = [str(cell) if cell is not None else "" for cell in table[0]]
|
| 24 |
+
md_table = "| " + " | ".join(header_row) + " |\n"
|
| 25 |
+
md_table += "| " + " | ".join(["---"]*len(header_row)) + " |\n"
|
| 26 |
for row in table[1:]:
|
| 27 |
+
row_cells = [str(cell) if cell is not None else "" for cell in row]
|
| 28 |
+
md_table += "| " + " | ".join(row_cells) + " |\n"
|
| 29 |
table_texts.append(md_table)
|
| 30 |
full_page = text + "\n\n" + "\n\n".join(table_texts)
|
| 31 |
documents.append(Document(page_content=full_page, metadata={"page": i+1}))
|