Spaces:

HemanM
/

EvoTransformerV11

Running

github-actions

Sync from GitHub @ b179dfb

3694da1 5 days ago

1.21 kB

	"""Extract clean text from uploaded files.

	Supported formats: TXT, MD (also .markdown), PDF, DOCX. Raw bytes pasted as text are handled
	by KnowledgePipeline.ingest_text() directly without going through this module.
	"""

	from __future__ import annotations

	from pathlib import Path


	def parse_document(path: str | Path) -> str:
	    """Return clean plain text extracted from the file at ``path``.

	    The file extension, compared case-insensitively, selects the parser:
	    ``.txt``, ``.md`` and ``.markdown`` are read directly as UTF-8 text,
	    ``.pdf`` is handed to ``_parse_pdf`` and ``.docx`` to ``_parse_docx``.

	    Args:
	        path: Location of the file, given as a string or a ``Path``.

	    Returns:
	        The text extracted from the file.

	    Raises:
	        ValueError: If the extension is not one of the supported types.
	    """
	    path = Path(path)
	    suffix = path.suffix.lower()
	    if suffix in {".txt", ".md", ".markdown"}:
	        # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading
	        # byte-order mark, so the BOM never shows up as "\ufeff" in the text.
	        # errors="ignore" keeps the read best-effort: undecodable bytes are
	        # discarded instead of raising.
	        return path.read_text(encoding="utf-8-sig", errors="ignore")
	    if suffix == ".pdf":
	        return _parse_pdf(path)
	    if suffix == ".docx":
	        return _parse_docx(path)
	    raise ValueError(f"Unsupported file type: {suffix}")


	def _parse_pdf(path: Path) -> str:
	    """Extract the text of every readable page of a PDF file.

	    Pages are visited in document order. Each page's text is stripped of
	    surrounding whitespace, pages that yield no text are dropped, and the
	    remaining pages are joined with a blank line between them. A page whose
	    extraction raises is skipped and reported through ``logging`` so the
	    loss is visible, while the rest of the document is still returned.

	    Args:
	        path: Location of the PDF file.

	    Returns:
	        The concatenated page texts; an empty string if no page has text.
	    """
	    import logging

	    # Imported inside the function, so pypdf is only needed when a PDF is
	    # actually parsed.
	    from pypdf import PdfReader

	    reader = PdfReader(str(path))
	    pages = []
	    for number, page in enumerate(reader.pages, start=1):
	        try:
	            text = page.extract_text() or ""
	        except Exception:
	            # Best-effort on purpose: one unreadable page must not discard
	            # the whole document. The skip is logged instead of being silent.
	            logging.getLogger(__name__).warning(
	                "Skipping page %d of %s: text extraction failed",
	                number,
	                path,
	                exc_info=True,
	            )
	            continue
	        text = text.strip()
	        if text:
	            pages.append(text)
	    return "\n\n".join(pages)


	def _parse_docx(path: Path) -> str:
	    """Collect the non-blank paragraphs of a DOCX file into one string.

	    Paragraphs whose text is empty or whitespace-only are left out; the
	    others are kept exactly as stored and separated by a blank line.

	    Args:
	        path: Location of the DOCX file.

	    Returns:
	        The kept paragraph texts joined with ``"\\n\\n"``.
	    """
	    from docx import Document

	    document = Document(str(path))
	    kept = []
	    for paragraph in document.paragraphs:
	        content = paragraph.text
	        # Only the emptiness check uses strip(); the stored text is unchanged.
	        if content.strip():
	            kept.append(content)
	    return "\n\n".join(kept)