File size: 739 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import os
from .pdf_reader import parse_pdf
from .docx_reader import parse_docx
from src.preprocess.cleaner import postprocess_extracted_text
from src.preprocess.cleaner import clean_text
from src.preprocess.anonymizer import remove_pii

def parse_file(path: str) -> str:
    """Extract text from the file at *path* and run it through the cleaning pipeline.

    The parser is chosen from the file extension, compared case-insensitively:
    ``.pdf`` goes to ``parse_pdf``, ``.docx`` goes to ``parse_docx``, and
    ``.txt`` is read directly as UTF-8 with undecodable bytes ignored.

    Args:
        path: Location of the file to parse.

    Returns:
        The extracted text after ``clean_text``, ``remove_pii`` and
        ``postprocess_extracted_text`` have been applied, in that order.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    _, suffix = os.path.splitext(path)
    suffix = suffix.lower()

    if suffix == ".txt":
        # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            raw = handle.read()
    elif suffix == ".docx":
        raw = parse_docx(path)
    elif suffix == ".pdf":
        raw = parse_pdf(path)
    else:
        raise ValueError(f"Unsupported file type: {suffix}")

    # Pipeline order matters: clean first, then strip PII, then post-process.
    cleaned = clean_text(raw)
    anonymized = remove_pii(cleaned)
    return postprocess_extracted_text(anonymized)