import os
from typing import List

from docx import Document as DocxDocument
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from langchain_core.documents import Document

from ingestion.loaders.normalization import normalize_text


def table_to_text(table) -> str:
    """Convert a DOCX table to plain, readable text without numeric headers.

    Every cell's text is passed through ``normalize_text``. Rows in which all
    normalized cells are empty are dropped. The remaining rows are rendered
    one per line, with cells joined by ``" | "``.

    Args:
        table: A python-docx table object exposing ``rows`` whose items
            expose ``cells`` with a ``text`` attribute.

    Returns:
        The rendered table, or ``""`` when the table has no non-empty rows
        or when any error occurs while reading it (the error is printed).
    """
    try:
        rendered_rows = []
        for row in table.rows:
            cells = [normalize_text(cell.text) for cell in row.cells]
            if any(cells):  # skip rows that are empty after normalization
                rendered_rows.append(" | ".join(cells))
        # Markdown-like layout instead of CSV with numeric headers; joining
        # an empty list yields "" so no separate empty check is needed.
        return "\n".join(rendered_rows)
    except Exception as e:
        print(f"Error converting table to text: {e}")
        return ""


def load_docx(file_path: str) -> List[Document]:
    """Load a DOCX file, preserving tables and skipping unreadable elements.

    The document body is walked in order. Each paragraph with non-empty
    normalized text becomes a ``Document`` with metadata ``type="text"``;
    each table with non-empty rendered text becomes a ``Document`` with
    metadata ``type="table"``. Other body elements are ignored.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        The documents in body order. An empty list is returned when the path
        does not exist, the file cannot be opened, or walking the body fails
        outside of a single element; each such failure is printed rather
        than raised.
    """
    docs: List[Document] = []

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        doc = DocxDocument(file_path)
    except Exception as e:
        print(f"Failed to open DOCX ({file_path}): {e}")
        return []

    try:
        for element in doc.element.body:
            if isinstance(element, CT_P):
                try:
                    # Wrap the element being visited directly instead of
                    # pulling from a parallel ``doc.paragraphs`` iterator,
                    # so the text always belongs to this body position.
                    cleaned = normalize_text(Paragraph(element, doc).text)
                    if cleaned:
                        docs.append(
                            Document(
                                page_content=cleaned,
                                metadata={"source": file_path, "type": "text"},
                            )
                        )
                except Exception as e:
                    # Best effort: one bad paragraph must not abort the file.
                    print(f"Error reading paragraph: {e}")
                    continue
            elif isinstance(element, CT_Tbl):
                try:
                    table_text = table_to_text(Table(element, doc))
                    if table_text:
                        docs.append(
                            Document(
                                page_content=table_text,
                                metadata={"source": file_path, "type": "table"},
                            )
                        )
                except Exception as e:
                    # Best effort: one bad table must not abort the file.
                    print(f"Error reading table: {e}")
                    continue
    except Exception as e:
        print(f"[WARN] Error processing DOCX ({file_path}): {e}")
        return []

    return docs