| from pathlib import Path | |
| import pdfplumber | |
def extract_text(path: str) -> str:
    """Extract plain text from a file, dispatching on its extension.

    Supported extensions (matched case-insensitively):

    * ``.txt`` / ``.md`` -- decoded as UTF-8. A leading byte-order mark is
      stripped and undecodable bytes are silently dropped.
    * ``.pdf`` -- the text of every page as extracted by ``pdfplumber``,
      joined with newlines; a page that yields no text contributes an
      empty string.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    p = Path(path)
    suffix = p.suffix.lower()

    if suffix in (".txt", ".md"):
        # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
        # BOM, so it does not leak into the result as "\ufeff".
        return p.read_text(encoding="utf-8-sig", errors="ignore")

    if suffix == ".pdf":
        with pdfplumber.open(str(p)) as pdf:
            # Fall back to "" when a page yields nothing, so every page
            # still occupies one slot in the joined output.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {p.suffix}")