"""Extract clean text from uploaded files. Supported formats: TXT, MD, PDF, DOCX. Raw bytes pasted as text are handled by KnowledgePipeline.ingest_text() directly without going through this module. """ from __future__ import annotations from pathlib import Path def parse_document(path: str | Path) -> str: """Return clean plain text extracted from the file at path.""" path = Path(path) suffix = path.suffix.lower() if suffix in {".txt", ".md", ".markdown"}: return path.read_text(encoding="utf-8", errors="ignore") if suffix == ".pdf": return _parse_pdf(path) if suffix == ".docx": return _parse_docx(path) raise ValueError(f"Unsupported file type: {suffix}") def _parse_pdf(path: Path) -> str: from pypdf import PdfReader reader = PdfReader(str(path)) pages = [] for page in reader.pages: try: pages.append(page.extract_text() or "") except Exception: continue return "\n\n".join(p.strip() for p in pages if p.strip()) def _parse_docx(path: Path) -> str: from docx import Document doc = Document(str(path)) return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())