File size: 530 Bytes
5c9f0d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from pathlib import Path
import pdfplumber


def extract_text(path: str) -> str:
    """Return the plain-text content of the file at *path*.

    The extractor is chosen from the file extension, compared
    case-insensitively:

    * ``.txt`` / ``.md`` -- the file is decoded as UTF-8. A leading
      byte-order mark is stripped and undecodable bytes are dropped.
    * ``.pdf`` -- the text ``pdfplumber`` reports for each page, joined with
      newlines. A page that yields no text contributes an empty string.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    p = Path(path)
    suffix = p.suffix.lower()
    if suffix in (".txt", ".md"):
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but removes a
        # leading BOM, so it does not leak into the result as "\ufeff".
        return p.read_text(encoding="utf-8-sig", errors="ignore")
    if suffix == ".pdf":
        with pdfplumber.open(str(p)) as pdf:
            # The join consumes the generator while the PDF is still open.
            # A falsy extract_text() result is normalised to "" so the page
            # still occupies its own line in the output.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {p.suffix}")