Kakarot21 committed on
Commit
1d88fd5
·
1 Parent(s): 3e09ea4

fix: handle None values in extracted table cells when converting to markdown and improve empty table detection.

Browse files
Files changed (1) hide show
  1. data_cutter.py +7 -5
data_cutter.py CHANGED
@@ -17,13 +17,15 @@ def load_documents():
17
  tables = page.extract_tables() or []
18
  table_texts = []
19
  for table in tables:
20
- if not table:
21
  continue
22
- # Convert table to markdown
23
- md_table = "| " + " | ".join(table[0]) + " |\n"
24
- md_table += "| " + " | ".join(["---"]*len(table[0])) + " |\n"
 
25
  for row in table[1:]:
26
- md_table += "| " + " | ".join([str(cell) if cell else "" for cell in row]) + " |\n"
 
27
  table_texts.append(md_table)
28
  full_page = text + "\n\n" + "\n\n".join(table_texts)
29
  documents.append(Document(page_content=full_page, metadata={"page": i+1}))
 
17
  tables = page.extract_tables() or []
18
  table_texts = []
19
  for table in tables:
20
+ if not table or not table[0]:
21
  continue
22
+ # Convert table to markdown - handle None values in cells
23
+ header_row = [str(cell) if cell is not None else "" for cell in table[0]]
24
+ md_table = "| " + " | ".join(header_row) + " |\n"
25
+ md_table += "| " + " | ".join(["---"]*len(header_row)) + " |\n"
26
  for row in table[1:]:
27
+ row_cells = [str(cell) if cell is not None else "" for cell in row]
28
+ md_table += "| " + " | ".join(row_cells) + " |\n"
29
  table_texts.append(md_table)
30
  full_page = text + "\n\n" + "\n\n".join(table_texts)
31
  documents.append(Document(page_content=full_page, metadata={"page": i+1}))