Spaces:
Sleeping
Sleeping
File size: 739 Bytes
ea9ca44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | import os
from .pdf_reader import parse_pdf
from .docx_reader import parse_docx
from src.preprocess.cleaner import postprocess_extracted_text
from src.preprocess.cleaner import clean_text
from src.preprocess.anonymizer import remove_pii
def parse_file(path: str) -> str:
    """Extract text from a file, choosing the parser by file extension.

    The extension is compared case-insensitively. ``.pdf`` files are handed
    to ``parse_pdf``, ``.docx`` files to ``parse_docx``, and ``.txt`` files
    are read directly as UTF-8 text. The extracted text is then passed
    through ``clean_text``, ``remove_pii`` and ``postprocess_extracted_text``,
    in that order.

    Args:
        path: Location of the file to parse.

    Returns:
        The result of ``postprocess_extracted_text(remove_pii(clean_text(text)))``
        for the text extracted from the file.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``
            (including a path with no extension at all).
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        text = parse_pdf(path)
    elif ext == ".docx":
        text = parse_docx(path)
    elif ext == ".txt":
        # "utf-8-sig" decodes exactly like UTF-8 but drops a leading
        # byte-order mark, so a BOM does not leak into the text as a stray
        # U+FEFF character. Undecodable bytes are skipped (best effort)
        # instead of raising.
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            text = f.read()
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return postprocess_extracted_text(remove_pii(clean_text(text)))
|