from pathlib import Path

try:
    import pdfplumber
except ImportError:
    # PDF support is optional: plain-text extraction must keep working when
    # the PDF library is not installed. The failure is reported lazily, only
    # when a .pdf file is actually requested.
    pdfplumber = None


def extract_text(path: str) -> str:
    """Return the textual content of the file at *path*.

    The file type is chosen from the (case-insensitive) filename suffix:

    * ``.txt`` / ``.md`` -- read as UTF-8; bytes that cannot be decoded are
      dropped (``errors="ignore"``).
    * ``.pdf`` -- each page is extracted with ``pdfplumber`` and the pages are
      joined with a newline. A page whose ``extract_text()`` result is falsy
      contributes an empty string.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text.

    Raises:
        ImportError: If a ``.pdf`` file is requested but ``pdfplumber`` is
            not installed.
        ValueError: If the suffix is not one of the supported types.
    """
    p = Path(path)
    suffix = p.suffix.lower()

    if suffix in {".txt", ".md"}:
        return p.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".pdf":
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required to extract text from PDF files"
            )
        with pdfplumber.open(str(p)) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {p.suffix}")