github-actions
Sync from GitHub @ b179dfb
3694da1
"""Extract clean text from uploaded files.
Supported formats: TXT, MD, PDF, DOCX. Raw bytes pasted as text are handled
by KnowledgePipeline.ingest_text() directly without going through this module.
"""
from __future__ import annotations
from pathlib import Path
def parse_document(path: str | Path) -> str:
    """Return clean plain text extracted from the file at *path*.

    The parser is chosen from the file extension, compared
    case-insensitively:

    * ``.txt``, ``.md``, ``.markdown`` -- decoded as UTF-8; bytes that cannot
      be decoded are dropped and a leading byte-order mark is removed.
    * ``.pdf`` -- delegated to ``_parse_pdf``.
    * ``.docx`` -- delegated to ``_parse_docx``.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    path = Path(path)
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md", ".markdown"}:
        # "utf-8-sig" decodes exactly like "utf-8" but also consumes a leading
        # BOM, so the marker does not leak into the text as U+FEFF.
        return path.read_text(encoding="utf-8-sig", errors="ignore")
    if suffix == ".pdf":
        return _parse_pdf(path)
    if suffix == ".docx":
        return _parse_docx(path)
    raise ValueError(f"Unsupported file type: {suffix}")
def _parse_pdf(path: Path) -> str:
    """Return the text of every readable page of the PDF at *path*.

    Pages are visited in document order. Each page's text is stripped of
    surrounding whitespace, pages that yield no text are dropped, and the
    remaining pages are joined with a blank line between them.

    Extraction is best-effort: a page whose extraction raises is skipped so
    that one bad page does not discard the whole document. The failure is
    logged as a warning (with traceback) rather than swallowed silently.

    Args:
        path: Location of the PDF file.

    Returns:
        The concatenated page text, or ``""`` if no page produced any text.
    """
    # Imported lazily so the module can be loaded without these available
    # until a PDF is actually parsed.
    import logging

    from pypdf import PdfReader

    reader = PdfReader(str(path))
    chunks = []
    for number, page in enumerate(reader.pages, start=1):
        try:
            # extract_text() may hand back a falsy value; normalise to "".
            text = page.extract_text() or ""
        except Exception:
            # Deliberately broad: keep going on any per-page failure, but
            # leave a trace so missing content can be diagnosed.
            logging.getLogger(__name__).warning(
                "Skipping page %d of %s: text extraction failed",
                number,
                path,
                exc_info=True,
            )
            continue
        text = text.strip()
        if text:
            chunks.append(text)
    return "\n\n".join(chunks)
def _parse_docx(path: Path) -> str:
    """Return the non-blank paragraphs of the DOCX file at *path*.

    Only ``doc.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are dropped; the others are kept verbatim (not stripped)
    and joined with a blank line between them.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n\n".join(kept)