doc_alive / rag /extract_text.py
slxhere's picture
Add audio generation
5c9f0d9
raw
history blame contribute delete
530 Bytes
from pathlib import Path
import pdfplumber
def extract_text(path: str) -> str:
    """Return the plain-text content of the file at *path*.

    The file type is chosen from the file extension, compared
    case-insensitively:

    * ``.txt`` / ``.md`` -- the file is read as UTF-8 text.
    * ``.pdf`` -- the text of every page is extracted with ``pdfplumber``
      and the pages are joined with a newline.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text. For PDFs, a page that yields no text
        contributes an empty string, so page boundaries are still
        separated by a newline.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    p = Path(path)
    # Normalise the extension once so every branch compares the same value.
    suffix = p.suffix.lower()

    if suffix in (".txt", ".md"):
        # errors="ignore" drops bytes that are not valid UTF-8 instead of
        # raising, so the result may be lossy for files in other encodings.
        return p.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".pdf":
        with pdfplumber.open(str(p)) as pdf:
            # `or ""` substitutes an empty string when a page's
            # extract_text() result is falsy (e.g. None or "").
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {p.suffix}")