Spaces:

slxhere
/

doc_alive

Sleeping

App Files Files Community

doc_alive / rag /extract_text.py

slxhere's picture

Add audio generation

5c9f0d9 4 months ago

history blame contribute delete

530 Bytes

	from pathlib import Path
	import pdfplumber


	def extract_text(path: str) -> str:
	    """Return the plain-text content of the file at *path*.

	    The file type is chosen from the (case-insensitive) file extension:

	    * ``.txt`` / ``.md`` -- decoded as UTF-8; undecodable bytes are dropped
	      and a leading byte-order mark, if present, is stripped.
	    * ``.pdf`` -- every page is run through ``pdfplumber``; pages that
	      yield no text contribute an empty string, and pages are joined
	      with a newline.

	    Args:
	        path: Filesystem path of the document to read.

	    Returns:
	        The extracted text as a single string.

	    Raises:
	        ValueError: If the extension is not one of the supported types.
	    """
	    p = Path(path)
	    # Normalise once so ".TXT", ".Pdf", ... are matched as well.
	    suffix = p.suffix.lower()

	    if suffix in (".txt", ".md"):
	        # "utf-8-sig" decodes exactly like "utf-8" but removes a leading
	        # BOM, which would otherwise leak into the text as "\ufeff".
	        return p.read_text(encoding="utf-8-sig", errors="ignore")

	    if suffix == ".pdf":
	        with pdfplumber.open(str(p)) as pdf:
	            # extract_text() may return None for a page without text;
	            # substitute "" so the join below never sees None.
	            pages = [page.extract_text() or "" for page in pdf.pages]
	        return "\n".join(pages)

	    # TODO: docx, html, image(OCR), audio(ASR)
	    raise ValueError(f"Unsupported file type: {p.suffix}")