Spaces:

MinaNasser
/

EXAM_RAG_API

Paused

App Files Files Community

EXAM_RAG_API / ingestion /loaders /docx_loader.py

MinaNasser

1st

1bc3f18 about 2 months ago

raw

history blame contribute delete

2.9 kB

	import os
	from typing import List
	from langchain_core.documents import Document
	from docx import Document as DocxDocument
	from docx.oxml.table import CT_Tbl
	from docx.oxml.text.paragraph import CT_P
	from ingestion.loaders.normalization import normalize_text

	def table_to_text(table) -> str:
	    """Render a DOCX table as plain text, one line per non-empty row.

	    Each cell's text is passed through ``normalize_text``. Rows in which
	    every normalized cell is empty are dropped; the cells of each remaining
	    row are joined with ``" | "`` and the rows are joined with newlines.

	    Args:
	        table: Object exposing ``rows``; each row exposes ``cells`` whose
	            items have a ``text`` attribute (presumably a python-docx
	            table -- confirm against callers).

	    Returns:
	        The formatted table text, or ``""`` when no row has content or when
	        any error occurs during conversion (the error is printed).
	    """
	    try:
	        rows = [
	            [normalize_text(cell.text) for cell in row.cells]
	            for row in table.rows
	        ]
	        # Plain " | " separator: the earlier " \| " literal was an invalid
	        # escape sequence and put a stray backslash into every output row.
	        # any(cells) filters out rows whose cells all normalized to empty.
	        return "\n".join(" | ".join(cells) for cells in rows if any(cells))
	    except Exception as e:
	        # Best-effort: report the problem and yield no text instead of raising.
	        print(f"Error converting table to text: {e}")
	        return ""




	def load_docx(file_path: str) -> List[Document]:
	    """Read a DOCX file into Documents, one per non-empty paragraph or table.

	    The children of the document body are visited in order. For each
	    paragraph element the next item of ``doc.paragraphs`` is normalized with
	    ``normalize_text``; for each table element the next item of
	    ``doc.tables`` is rendered with ``table_to_text``. Empty results are
	    skipped. An element that fails to convert is reported and skipped.

	    NOTE(review): this assumes ``doc.paragraphs`` / ``doc.tables`` yield
	    items in the same order as the CT_P / CT_Tbl children of the body --
	    confirm against the python-docx version in use.

	    Args:
	        file_path: Path of the DOCX file; also stored as ``source`` metadata.

	    Returns:
	        Documents whose metadata ``type`` is ``"text"`` or ``"table"``.
	        An empty list when the path does not exist, the file cannot be
	        opened, or the body cannot be traversed. Problems are printed.
	    """
	    if not os.path.exists(file_path):
	        print(f"File not found: {file_path}")
	        return []

	    try:
	        word_file = DocxDocument(file_path)
	    except Exception as e:
	        print(f"Failed to open DOCX ({file_path}): {e}")
	        return []

	    collected = []
	    try:
	        nodes = list(word_file.element.body)
	        pending_paragraphs = iter(word_file.paragraphs)
	        pending_tables = iter(word_file.tables)

	        for node in nodes:
	            if isinstance(node, CT_P):
	                try:
	                    paragraph = next(pending_paragraphs)
	                    text = normalize_text(paragraph.text)
	                    if not text:
	                        continue
	                    text_meta = {"source": file_path, "type": "text"}
	                    collected.append(
	                        Document(page_content=text, metadata=text_meta)
	                    )
	                except StopIteration:
	                    # More paragraph elements than paragraph objects: skip.
	                    continue
	                except Exception as e:
	                    print(f"Error reading paragraph: {e}")
	                    continue
	            elif isinstance(node, CT_Tbl):
	                try:
	                    rendered = table_to_text(next(pending_tables))
	                    if not rendered:
	                        continue
	                    table_meta = {"source": file_path, "type": "table"}
	                    collected.append(
	                        Document(page_content=rendered, metadata=table_meta)
	                    )
	                except StopIteration:
	                    # More table elements than table objects: skip.
	                    continue
	                except Exception as e:
	                    print(f"Error reading table: {e}")
	                    continue
	            # Any other body child is ignored.

	    except Exception as e:
	        print(f"[WARN] Error processing DOCX ({file_path}): {e}")
	        return []

	    return collected