Spaces:
Running
Running
| """Extract clean text from uploaded files. | |
| Supported formats: TXT, MD, PDF, DOCX. Raw bytes pasted as text are handled | |
| by KnowledgePipeline.ingest_text() directly without going through this module. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
def parse_document(path: str | Path) -> str:
    """Return clean plain text extracted from the file at ``path``.

    The parser is selected from the file extension, compared
    case-insensitively:

    * ``.txt``, ``.md``, ``.markdown`` -- read directly as UTF-8 text.
    * ``.pdf`` -- delegated to ``_parse_pdf``.
    * ``.docx`` -- delegated to ``_parse_docx``.

    Args:
        path: Location of the file, given as a string or a ``Path``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    path = Path(path)
    suffix = path.suffix.lower()

    if suffix in {".txt", ".md", ".markdown"}:
        # "utf-8-sig" decodes exactly like "utf-8" but also drops a leading
        # byte-order mark, so the result does not start with a stray "\ufeff".
        # Undecodable bytes are still dropped rather than raising.
        return path.read_text(encoding="utf-8-sig", errors="ignore")
    if suffix == ".pdf":
        return _parse_pdf(path)
    if suffix == ".docx":
        return _parse_docx(path)

    raise ValueError(f"Unsupported file type: {suffix}")
def _parse_pdf(path: Path) -> str:
    """Return the text of every page of the PDF at ``path``.

    Each page's text is stripped of surrounding whitespace; pages that yield
    no text are omitted and the rest are joined with blank lines.

    Extraction is best-effort: a page whose extraction raises is skipped so
    that one bad page does not discard the whole document. The failure is
    logged as a warning instead of being dropped silently.

    Args:
        path: Location of the PDF file.

    Returns:
        The non-empty page texts separated by ``"\\n\\n"``; an empty string
        when no page yields text.
    """
    import logging

    from pypdf import PdfReader

    log = logging.getLogger(__name__)
    reader = PdfReader(str(path))

    pages = []
    for number, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception:
            # Keep going with the remaining pages, but leave a trace of what
            # was lost (exc_info attaches the traceback to the log record).
            log.warning(
                "Skipping page %d of %s: text extraction failed",
                number,
                path,
                exc_info=True,
            )
            continue
        text = text.strip()
        if text:
            pages.append(text)

    return "\n\n".join(pages)
def _parse_docx(path: Path) -> str:
    """Return the text of the non-blank paragraphs of the DOCX at ``path``.

    Only ``doc.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are dropped; the remaining paragraph texts are kept
    unmodified and joined with blank lines.
    """
    from docx import Document

    document = Document(str(path))

    kept = []
    for paragraph in document.paragraphs:
        # Filter on the stripped text, but keep the original text as-is.
        if not paragraph.text.strip():
            continue
        kept.append(paragraph.text)

    return "\n\n".join(kept)