Spaces:

aditya-joshi-05
/

Cortex

Running

App Files Files Community

Cortex / ingestion /document_loader.py

aditya-joshi-05

First commit

79ca3d4 2 months ago

raw

history blame contribute delete

6.64 kB

	"""
	Cortex RAG — Document Loader
	Handles PDF, HTML, and plain-text ingestion.
	Returns a list of Document dataclasses ready for chunking.
	"""
	from __future__ import annotations

	import hashlib
	import logging
	import re
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Optional

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	@dataclass
	class Document:
	    """Raw document before chunking.

	    Field order is part of the interface: instances may be built
	    positionally as ``Document(doc_id, source, doc_type, title, text)``.
	    """

	    doc_id: str  # identifier, e.g. the value returned by make_id(source)
	    source: str  # original file path / URL
	    doc_type: str  # "pdf" | "html" | "text"
	    title: str
	    text: str  # full cleaned text
	    # default_factory gives every instance its own dict (no shared mutable default).
	    metadata: dict = field(default_factory=dict)

	    @staticmethod
	    def make_id(source: str) -> str:
	        """Return a deterministic 16-character hex id for ``source``.

	        The id is the first 16 hex digits of the SHA-256 digest of the
	        UTF-8 encoded string, so equal sources always map to the same id.

	        Args:
	            source: File path or URL identifying the document.

	        Returns:
	            A 16-character lowercase hexadecimal string.
	        """
	        digest = hashlib.sha256(source.encode("utf-8")).hexdigest()
	        return digest[:16]


	class DocumentLoader:
	    """
	    Load documents from disk and wrap them in ``Document`` records.

	    Supported suffixes (matched case-insensitively):
	    - PDF  → pdfplumber (better layout), PyPDF2 when pdfplumber is not installed
	    - HTML → BeautifulSoup main-content extraction
	    - TXT / MD → read as UTF-8; undecodable bytes are replaced, not detected
	    """

	    def __init__(self) -> None:
	        # Suffix → bound loader. The keys double as the set of supported types.
	        self._loaders = {
	            ".pdf": self._load_pdf,
	            ".html": self._load_html,
	            ".htm": self._load_html,
	            ".txt": self._load_text,
	            ".md": self._load_text,
	        }

	    # ── Public ────────────────────────────────────────────────

	    def load_file(self, path: str | Path) -> Document:
	        """Load a single file and return a Document.

	        Args:
	            path: Location of the file; the loader is chosen by its suffix.

	        Raises:
	            FileNotFoundError: If ``path`` does not exist.
	            ValueError: If the suffix has no registered loader.
	        """
	        path = Path(path)
	        if not path.exists():
	            raise FileNotFoundError(f"File not found: {path}")

	        suffix = path.suffix.lower()
	        loader = self._loaders.get(suffix)
	        if loader is None:
	            raise ValueError(f"Unsupported file type: {suffix}")

	        logger.info("Loading %s (%s)", path.name, suffix)
	        return loader(path)

	    def load_directory(
	        self,
	        directory: str | Path,
	        recursive: bool = True,
	    ) -> list[Document]:
	        """Load all supported files from a directory.

	        Files that fail to load are logged and skipped, so one bad file does
	        not abort the whole ingestion run.

	        Args:
	            directory: Directory to scan.
	            recursive: When true, descend into every sub-directory; otherwise
	                only direct children are considered.

	        Returns:
	            The successfully loaded documents, in sorted path order.
	        """
	        directory = Path(directory)
	        docs: list[Document] = []
	        for path in self._find_supported_files(directory, recursive):
	            try:
	                docs.append(self.load_file(path))
	            except Exception as exc:
	                # Deliberate best-effort: report the failure and keep going.
	                logger.warning("Skipping %s — %s", path, exc)
	        logger.info("Loaded %d documents from %s", len(docs), directory)
	        return docs

	    def _find_supported_files(self, directory: Path, recursive: bool) -> list[Path]:
	        """Return the supported regular files under ``directory``, sorted by path."""
	        # rglob walks every nesting level; a single-level pattern would miss
	        # nested files entirely.
	        candidates = directory.rglob("*") if recursive else directory.glob("*")
	        return sorted(
	            entry
	            for entry in candidates
	            if entry.suffix.lower() in self._loaders and entry.is_file()
	        )

	    # ── Private loaders ───────────────────────────────────────

	    @staticmethod
	    def _title_from_path(path: Path) -> str:
	        """Derive a display title from the file stem (separators → spaces, title case)."""
	        return path.stem.replace("_", " ").replace("-", " ").title()

	    def _load_pdf(self, path: Path) -> Document:
	        """Build a Document from a PDF; ``metadata["pages"]`` is the real page count."""
	        pages = self._extract_pdf_pages(path)
	        # Blank pages count towards "pages" but contribute no text.
	        text = "\n\n".join(page for page in pages if page)
	        return Document(
	            doc_id=Document.make_id(str(path)),
	            source=str(path),
	            doc_type="pdf",
	            title=self._title_from_path(path),
	            text=self._clean_text(text),
	            metadata={"filename": path.name, "pages": len(pages)},
	        )

	    def _load_html(self, path: Path) -> Document:
	        """Build a Document from an HTML file, using <title> when present."""
	        raw = path.read_text(encoding="utf-8", errors="replace")
	        text, title = self._extract_html_content(raw)
	        return Document(
	            doc_id=Document.make_id(str(path)),
	            source=str(path),
	            doc_type="html",
	            # Missing or empty <title> falls back to the bare file stem.
	            title=title or path.stem,
	            text=self._clean_text(text),
	            metadata={"filename": path.name},
	        )

	    def _load_text(self, path: Path) -> Document:
	        """Build a Document from a plain-text or Markdown file."""
	        raw = path.read_text(encoding="utf-8", errors="replace")
	        return Document(
	            doc_id=Document.make_id(str(path)),
	            source=str(path),
	            doc_type="text",
	            title=self._title_from_path(path),
	            text=self._clean_text(raw),
	            metadata={"filename": path.name},
	        )

	    # ── Text extraction helpers ────────────────────────────────

	    @staticmethod
	    def _extract_pdf_pages(path: Path) -> list[str]:
	        """Return the extracted text of every page ("" for pages without text).

	        Tries pdfplumber first and uses PyPDF2 only when pdfplumber is not
	        installed.

	        Raises:
	            RuntimeError: If neither pdfplumber nor PyPDF2 can be imported.
	        """
	        try:
	            import pdfplumber  # type: ignore
	        except ImportError:
	            pdfplumber = None

	        if pdfplumber is not None:
	            with pdfplumber.open(path) as pdf:
	                return [page.extract_text() or "" for page in pdf.pages]

	        try:
	            import PyPDF2  # type: ignore
	        except ImportError as exc:
	            raise RuntimeError(
	                "Install pdfplumber or PyPDF2: pip install pdfplumber"
	            ) from exc

	        with open(path, "rb") as fh:
	            reader = PyPDF2.PdfReader(fh)
	            return [page.extract_text() or "" for page in reader.pages]

	    @staticmethod
	    def _extract_pdf_text(path: Path) -> str:
	        """Try pdfplumber first, fall back to PyPDF2.

	        Returns the non-empty page texts joined by blank lines.
	        """
	        pages = DocumentLoader._extract_pdf_pages(path)
	        return "\n\n".join(page for page in pages if page)

	    @staticmethod
	    def _extract_html_content(html: str) -> tuple[str, Optional[str]]:
	        """Extract main text content and title from HTML.

	        Returns:
	            ``(text, title)`` where ``title`` is ``None`` when the document
	            has no ``<title>`` element.

	        Raises:
	            RuntimeError: If beautifulsoup4 is not installed.
	        """
	        try:
	            from bs4 import BeautifulSoup  # type: ignore
	        except ImportError as exc:
	            raise RuntimeError(
	                "Install beautifulsoup4: pip install beautifulsoup4"
	            ) from exc

	        soup = BeautifulSoup(html, "html.parser")

	        # Extract title
	        title_tag = soup.find("title")
	        title = title_tag.get_text(strip=True) if title_tag else None

	        # Remove boilerplate
	        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
	            tag.decompose()

	        # Prefer <main> or <article>, fall back to <body>
	        main = soup.find("main") or soup.find("article") or soup.find("body")
	        text = (main or soup).get_text(separator="\n", strip=True)
	        return text, title

	    @staticmethod
	    def _clean_text(text: str) -> str:
	        """Normalise whitespace, remove null bytes and common PDF artefacts."""
	        text = text.replace("\x00", "")
	        # Collapse runs of 3+ newlines to a single blank line.
	        text = re.sub(r"\n{3,}", "\n\n", text)
	        # Collapse runs of spaces/tabs to one space.
	        text = re.sub(r"[ \t]{2,}", " ", text)
	        # Remove lone hyphenation artefacts from PDF line-breaks
	        text = re.sub(r"(?<=[a-z])-\n(?=[a-z])", "", text)
	        return text.strip()