Spaces:
Running
Running
File size: 1,212 Bytes
3694da1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | """Extract clean text from uploaded files.
Supported formats: TXT, MD, PDF, DOCX. Raw bytes pasted as text are handled
by KnowledgePipeline.ingest_text() directly without going through this module.
"""
from __future__ import annotations
from pathlib import Path
def parse_document(path: str | Path) -> str:
    """Extract plain text from the file located at ``path``.

    The file type is chosen from the lower-cased file extension:

    * ``.txt``, ``.md``, ``.markdown`` -- read as UTF-8 text; bytes that do
      not decode are dropped rather than raising.
    * ``.pdf`` -- delegated to ``_parse_pdf``.
    * ``.docx`` -- delegated to ``_parse_docx``.

    Raises:
        ValueError: if the extension is none of the above (including a
            path with no extension at all).
    """
    source = Path(path)
    extension = source.suffix.lower()

    # Binary formats are handed to their dedicated parsers first.
    if extension == ".pdf":
        return _parse_pdf(source)
    if extension == ".docx":
        return _parse_docx(source)

    plain_text_extensions = (".txt", ".md", ".markdown")
    if extension not in plain_text_extensions:
        raise ValueError(f"Unsupported file type: {extension}")

    # errors="ignore" silently discards undecodable bytes.
    return source.read_text(encoding="utf-8", errors="ignore")
def _parse_pdf(path: Path) -> str:
    """Return the text of the PDF at ``path``, one chunk per page.

    Each page's text is extracted in document order, stripped of
    surrounding whitespace, and the non-empty results are joined with a
    blank line between them.

    Extraction is best-effort: if extracting a page raises, that page is
    skipped and a warning (with traceback) is logged, so one bad page does
    not lose the rest of the document. Errors raised while opening the file
    itself are not caught here and propagate to the caller.
    """
    import logging

    # Imported lazily so the dependency is only needed when a PDF is parsed.
    from pypdf import PdfReader

    logger = logging.getLogger(__name__)
    reader = PdfReader(str(path))

    chunks = []
    for number, page in enumerate(reader.pages, start=1):
        try:
            # ``or ""`` guards against a falsy (e.g. None) extraction result.
            text = page.extract_text() or ""
        except Exception:
            # Deliberately broad: any per-page failure should cost only
            # that page. Log it so partial extractions are diagnosable
            # instead of silently incomplete.
            logger.warning(
                "Skipping page %d of %s: text extraction failed",
                number,
                path,
                exc_info=True,
            )
            continue
        text = text.strip()
        if text:
            chunks.append(text)
    return "\n\n".join(chunks)
def _parse_docx(path: Path) -> str:
    """Return the text of the DOCX file at ``path``.

    Walks the paragraphs exposed by the document's ``paragraphs`` attribute,
    keeps those whose text is not empty or whitespace-only, and joins them
    with a blank line in between. Kept paragraphs are emitted exactly as
    stored (they are not stripped).
    """
    # Imported lazily so the dependency is only needed when a DOCX is parsed.
    from docx import Document

    document = Document(str(path))

    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        # Whitespace-only paragraphs are dropped; others are kept verbatim.
        if content.strip():
            kept.append(content)
    return "\n\n".join(kept)
|