iris_backend / backend /src /ingestion /docx_reader.py
Muhammed Sameer
Initial commit - Iris Full (under development)
ea9ca44
from docx import Document
from src.preprocess.cleaner import postprocess_extracted_text
def parse_docx(path: str) -> str:
    """Read a DOCX file and return its postprocessed paragraph text.

    The document at ``path`` is opened with python-docx. The text of every
    paragraph exposed by ``doc.paragraphs`` that is not empty or
    whitespace-only is collected in order and joined with newline
    characters. The joined string is then passed through
    ``postprocess_extracted_text`` and that result is returned.

    Args:
        path: Location of the DOCX file, handed directly to ``Document``.

    Returns:
        Whatever ``postprocess_extracted_text`` returns for the joined
        paragraph text.

    NOTE(review): only ``doc.paragraphs`` is read here; text that lives in
    tables is presumably not included -- confirm whether that is intended.
    """
    document = Document(path)

    kept_lines = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        # Skip paragraphs that are empty or contain only whitespace.
        if content.strip():
            kept_lines.append(content)

    joined = "\n".join(kept_lines)
    return postprocess_extracted_text(joined)