File size: 373 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
from docx import Document
from src.preprocess.cleaner import postprocess_extracted_text

def parse_docx(path: str) -> str:
    """Read a DOCX file and return its paragraph text after postprocessing.

    Paragraphs whose text is empty or whitespace-only are skipped. The
    remaining paragraph texts are joined with newlines and the result is
    passed through ``postprocess_extracted_text``.

    Args:
        path: Location of the DOCX file, handed directly to ``Document``.

    Returns:
        Whatever ``postprocess_extracted_text`` returns for the joined text.
    """
    document = Document(path)

    # Collect only paragraphs that carry visible text.
    kept_lines = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept_lines.append(content)

    joined = "\n".join(kept_lines)
    return postprocess_extracted_text(joined)