RAG / utils.py
oluinioluwa814's picture
Update utils.py
eec8412 verified
import PyPDF2
import docx
from pathlib import Path
def load_text(path: str) -> str:
    """Load the text content of a TXT, PDF, or DOCX file.

    The file type is selected from the file extension, compared
    case-insensitively.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text. For TXT files, the decoded file content. For
        PDF files, the text of every page that yields any text, each
        followed by a newline. For DOCX files, the paragraph texts joined
        with newlines.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is not ``.txt``, ``.pdf`` or ``.docx``.
    """
    path_obj = Path(path)
    if not path_obj.exists():
        raise FileNotFoundError(f"{path} does not exist.")

    # Normalise once so ".PDF", ".Txt", etc. take the same branch.
    suffix = path_obj.suffix.lower()

    if suffix == ".txt":
        # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading
        # byte-order mark, so the returned text never starts with a stray
        # "\ufeff" character.
        return path_obj.read_text(encoding="utf-8-sig")

    if suffix == ".pdf":
        with open(path_obj, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Extraction happens while the file is still open. Pages that
            # produce no text (None or "") are skipped entirely.
            extracted = (page.extract_text() for page in reader.pages)
            return "".join(
                page_text + "\n" for page_text in extracted if page_text
            )

    if suffix == ".docx":
        doc = docx.Document(path_obj)
        return "\n".join(p.text for p in doc.paragraphs)

    raise ValueError(f"Unsupported file type: {path_obj.suffix}")