Spaces:

oluinioluwa814
/

RAG

Sleeping

RAG / utils.py

Update utils.py

eec8412 verified about 2 months ago

947 Bytes

	import PyPDF2
	import docx
	from pathlib import Path

	def load_text(path: str) -> str:
	    """Extract the text content of a TXT, PDF, or DOCX file.

	    The format is chosen from the file extension, compared
	    case-insensitively.

	    Args:
	        path: Location of the file to read.

	    Returns:
	        The extracted text. For TXT files, the decoded file content (a
	        leading UTF-8 byte-order mark, if present, is dropped). For PDF
	        files, the text of every page that yields any text, each followed
	        by a newline. For DOCX files, the paragraph texts joined by
	        newlines.

	    Raises:
	        FileNotFoundError: If ``path`` does not exist.
	        ValueError: If the extension is not ``.txt``, ``.pdf`` or ``.docx``.
	    """
	    path_obj = Path(path)
	    if not path_obj.exists():
	        raise FileNotFoundError(f"{path} does not exist.")

	    # Normalise once so ".PDF" and ".pdf" take the same branch.
	    suffix = path_obj.suffix.lower()

	    if suffix == ".txt":
	        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also
	        # strips a leading BOM, which would otherwise surface as a stray
	        # "\ufeff" at the start of the returned text.
	        return path_obj.read_text(encoding="utf-8-sig")

	    if suffix == ".pdf":
	        with open(path_obj, "rb") as f:
	            reader = PyPDF2.PdfReader(f)
	            page_texts = (page.extract_text() for page in reader.pages)
	            # Pages with no extractable text are skipped. The join is done
	            # inside the ``with`` so extraction happens while the file is
	            # still open.
	            return "".join(
	                page_text + "\n" for page_text in page_texts if page_text
	            )

	    if suffix == ".docx":
	        # python-docx documents a string path (or file-like object) as its
	        # input, so hand it a plain string rather than a Path.
	        doc = docx.Document(str(path_obj))
	        return "\n".join(p.text for p in doc.paragraphs)

	    # Report the suffix as written by the caller, not the lowercased copy.
	    raise ValueError(f"Unsupported file type: {path_obj.suffix}")