EXAM_RAG_API / ingestion /loaders /docx_loader.py
MinaNasser's picture
1st
1bc3f18
import os
from typing import List
from langchain_core.documents import Document
from docx import Document as DocxDocument
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from ingestion.loaders.normalization import normalize_text
def table_to_text(table) -> str:
    """Render a DOCX table as newline-separated, pipe-delimited text.

    Every cell's text is passed through ``normalize_text``; the cells of a
    row are joined with ``" | "`` and the rows are joined with newlines.
    Rows in which every normalized cell is empty are left out.

    Args:
        table: Object exposing ``rows``; each row must expose ``cells`` and
            each cell a ``text`` attribute.

    Returns:
        The rendered table, or ``""`` when no row has content or when any
        error occurs while reading the table (the error is printed).
    """
    try:
        # Lazily normalize the cells of each row; consumed once just below.
        normalized_rows = (
            [normalize_text(cell.text) for cell in table_row.cells]
            for table_row in table.rows
        )
        # Plain " | " separated lines rather than CSV with numeric headers.
        lines = [
            " | ".join(cell_texts)
            for cell_texts in normalized_rows
            if any(cell_texts)  # drop rows with no content at all
        ]
        return "\n".join(lines)
    except Exception as err:
        print(f"Error converting table to text: {err}")
        return ""
def load_docx(file_path: str) -> List[Document]:
    """Read a DOCX file into ``Document`` objects, one per paragraph or table.

    The direct children of the document body are visited in order. Each
    paragraph element yields a ``Document`` holding its normalized text
    (metadata ``type`` = ``"text"``) and each table element yields one
    holding the output of ``table_to_text`` (metadata ``type`` =
    ``"table"``). Items whose content is empty are skipped, and an item
    that raises is reported with ``print`` and skipped.

    Args:
        file_path: Path to the DOCX file; also stored as the ``source``
            metadata of every returned document.

    Returns:
        The collected documents in body order. An empty list is returned
        when the path does not exist, the file cannot be opened, or
        walking the body fails outside the per-item handlers.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        docx_file = DocxDocument(file_path)
    except Exception as open_err:
        print(f"Failed to open DOCX ({file_path}): {open_err}")
        return []

    def build_document(content, kind):
        # Both item kinds share the same metadata layout.
        return Document(
            page_content=content,
            metadata={"source": file_path, "type": kind},
        )

    collected = []
    try:
        body_children = list(docx_file.element.body)
        # NOTE(review): this relies on doc.paragraphs / doc.tables yielding
        # items in the same order as the CT_P / CT_Tbl children of the body;
        # confirm against the python-docx version in use.
        pending_paragraphs = iter(docx_file.paragraphs)
        pending_tables = iter(docx_file.tables)

        for child in body_children:
            if isinstance(child, CT_P):
                try:
                    paragraph_text = normalize_text(next(pending_paragraphs).text)
                    if paragraph_text:
                        collected.append(build_document(paragraph_text, "text"))
                except StopIteration:
                    # Iterator ran dry; nothing to pair with this element.
                    pass
                except Exception as para_err:
                    print(f"Error reading paragraph: {para_err}")
            elif isinstance(child, CT_Tbl):
                try:
                    rendered_table = table_to_text(next(pending_tables))
                    if rendered_table:
                        collected.append(build_document(rendered_table, "table"))
                except StopIteration:
                    # Iterator ran dry; nothing to pair with this element.
                    pass
                except Exception as table_err:
                    print(f"Error reading table: {table_err}")
    except Exception as walk_err:
        print(f"[WARN] Error processing DOCX ({file_path}): {walk_err}")
        return []

    return collected