Spaces:
Paused
Paused
| import os | |
| from typing import List | |
| from langchain_core.documents import Document | |
| from docx import Document as DocxDocument | |
| from docx.oxml.table import CT_Tbl | |
| from docx.oxml.text.paragraph import CT_P | |
| from ingestion.loaders.normalization import normalize_text | |
def table_to_text(table) -> str:
    """Convert a DOCX table into plain, pipe-separated text.

    Each row that has at least one non-empty cell becomes one line whose cell
    texts are joined with `` | ``; lines are joined with newlines. No header
    or index row is added.

    Args:
        table: A python-docx table, or any object exposing ``rows`` whose
            items expose ``cells`` with a ``text`` attribute.

    Returns:
        The rendered table, or ``""`` when no row has content or the table
        cannot be read (the error is printed, not raised).
    """
    lines = []
    try:
        for row in table.rows:
            texts = []
            # Fresh sentinel per row so the first cell is never treated as a repeat.
            last_key = object()
            for cell in row.cells:
                # python-docx yields a horizontally merged cell once for every
                # grid column it spans; keep only the first occurrence so its
                # text is not duplicated in the output line.
                key = getattr(cell, "_tc", cell)
                if key is last_key:
                    continue
                last_key = key
                texts.append(normalize_text(cell.text))
            if any(texts):  # skip rows whose cells are all empty
                # Readable markdown-like row instead of CSV with numeric headers.
                lines.append(" | ".join(texts))
    except Exception as e:
        # Best-effort: an unreadable table contributes no text instead of
        # aborting the whole document load.
        print(f"Error converting table to text: {e}")
        return ""
    return "\n".join(lines)
def load_docx(file_path: str) -> List[Document]:
    """Read a DOCX file into a list of ``Document`` objects, in body order.

    The elements of the document body are visited in sequence. A paragraph
    whose normalized text is non-empty yields a document with metadata
    ``"type": "text"``; a table whose rendered text is non-empty yields one
    with ``"type": "table"``. Every document records ``file_path`` under
    ``"source"``.

    Problems are reported with ``print`` and never raised: a missing file, a
    file that cannot be opened, or an error while walking the body returns
    ``[]``; an error on a single paragraph or table skips only that item.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        docx_file = DocxDocument(file_path)
    except Exception as e:
        print(f"Failed to open DOCX ({file_path}): {e}")
        return []

    collected = []
    try:
        children = list(docx_file.element.body)
        # The wrapper objects are pulled from these iterators in step with the
        # raw body elements; this assumes ``paragraphs`` / ``tables`` list the
        # same items, in the same order, as the CT_P / CT_Tbl body elements.
        pending_paragraphs = iter(docx_file.paragraphs)
        pending_tables = iter(docx_file.tables)

        for child in children:
            if isinstance(child, CT_P):
                try:
                    content = normalize_text(next(pending_paragraphs).text)
                    if not content:
                        continue
                    text_meta = {"source": file_path, "type": "text"}
                    collected.append(
                        Document(page_content=content, metadata=text_meta)
                    )
                except StopIteration:
                    # No wrapper left for this element; nothing to emit.
                    pass
                except Exception as e:
                    print(f"Error reading paragraph: {e}")
            elif isinstance(child, CT_Tbl):
                try:
                    rendered = table_to_text(next(pending_tables))
                    if not rendered:
                        continue
                    table_meta = {"source": file_path, "type": "table"}
                    collected.append(
                        Document(page_content=rendered, metadata=table_meta)
                    )
                except StopIteration:
                    # No wrapper left for this element; nothing to emit.
                    pass
                except Exception as e:
                    print(f"Error reading table: {e}")
    except Exception as e:
        print(f"[WARN] Error processing DOCX ({file_path}): {e}")
        return []

    return collected