Spaces:
Running
Running
| """ | |
| Cortex RAG — Document Loader | |
| Handles PDF, HTML, and plain-text ingestion. | |
| Returns a list of Document dataclasses ready for chunking. | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import logging | |
| import re | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
@dataclass
class Document:
    """Raw document before chunking.

    Instances are created by the loaders with keyword arguments, so the class
    must be a dataclass: that is what generates ``__init__`` and gives every
    instance its own ``metadata`` dict via ``default_factory``.
    """

    doc_id: str  # first 16 hex chars of the sha256 of the source path
    source: str  # original file path / URL
    doc_type: str  # "pdf" | "html" | "text"
    title: str
    text: str  # full cleaned text
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def make_id(source: str) -> str:
        """Return a short, deterministic identifier for *source*.

        Args:
            source: File path or URL the document was loaded from.

        Returns:
            The first 16 hexadecimal characters of the SHA-256 digest of the
            UTF-8 encoded *source* string.
        """
        # A static method: it needs no instance state and is called both as
        # ``Document.make_id(...)`` and, potentially, on an instance.
        return hashlib.sha256(source.encode()).hexdigest()[:16]
class DocumentLoader:
    """
    Load documents from disk into ``Document`` records.

    Supports:
    - PDF  → pdfplumber (better layout) with PyPDF2 fallback
    - HTML → BeautifulSoup main-content extraction
    - TXT / MD → direct UTF-8 read (BOM stripped, undecodable bytes replaced)
    """

    def __init__(self) -> None:
        """Register one loader method per supported lower-case file suffix."""
        self._loaders = {
            ".pdf": self._load_pdf,
            ".html": self._load_html,
            ".htm": self._load_html,
            ".txt": self._load_text,
            ".md": self._load_text,
        }

    # ── Public ────────────────────────────────────────────────

    def load_file(self, path: str | Path) -> Document:
        """Load a single file and return a Document.

        Args:
            path: Location of the file; its suffix (case-insensitive) selects
                the loader.

        Returns:
            The loaded and cleaned document.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ValueError: If the suffix has no registered loader.
            RuntimeError: If the optional parsing library needed for the file
                type is not installed.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        suffix = path.suffix.lower()
        loader = self._loaders.get(suffix)
        if loader is None:
            raise ValueError(f"Unsupported file type: {suffix}")
        logger.info("Loading %s (%s)", path.name, suffix)
        return loader(path)

    def load_directory(
        self,
        directory: str | Path,
        recursive: bool = True,
    ) -> list[Document]:
        """Load all supported files from a directory.

        Files that fail to load are logged at WARNING level and skipped, so
        one bad file does not abort the whole ingestion run.

        Args:
            directory: Directory to scan.
            recursive: When true, descend into sub-directories as well.

        Returns:
            The successfully loaded documents, ordered by path.
        """
        directory = Path(directory)
        pattern = "**/*" if recursive else "*"
        docs: list[Document] = []
        # glob() order depends on the filesystem; sorting keeps the result
        # (and anything derived from it) reproducible across runs.
        for path in sorted(directory.glob(pattern)):
            if path.suffix.lower() not in self._loaders or not path.is_file():
                continue
            try:
                docs.append(self.load_file(path))
            except Exception as exc:
                # Deliberate best-effort boundary: report and carry on.
                logger.warning("Skipping %s — %s", path, exc)
        logger.info("Loaded %d documents from %s", len(docs), directory)
        return docs

    # ── Private loaders ───────────────────────────────────────

    def _load_pdf(self, path: Path) -> Document:
        """Build a Document from a PDF, recording its page count."""
        pages = self._extract_pdf_pages(path)
        text = "\n\n".join(page for page in pages if page)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="pdf",
            title=self._title_from_path(path),
            text=self._clean_text(text),
            # Count the pages themselves: the joined text contains no form
            # feeds, so counting "\f" would report 1 for every PDF.
            metadata={"filename": path.name, "pages": len(pages)},
        )

    def _load_html(self, path: Path) -> Document:
        """Build a Document from an HTML file, titled by its <title> tag."""
        raw = self._read_text(path)
        text, title = self._extract_html_content(raw)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="html",
            # Fall back to the file stem when <title> is missing or empty.
            title=title or path.stem,
            text=self._clean_text(text),
            metadata={"filename": path.name},
        )

    def _load_text(self, path: Path) -> Document:
        """Build a Document from a plain-text or Markdown file."""
        raw = self._read_text(path)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="text",
            title=self._title_from_path(path),
            text=self._clean_text(raw),
            metadata={"filename": path.name},
        )

    # ── Text extraction helpers ───────────────────────────────

    @staticmethod
    def _read_text(path: Path) -> str:
        """Read *path* as UTF-8, dropping a leading BOM if present.

        Undecodable bytes become U+FFFD instead of raising, so a stray byte
        does not make the whole file unloadable.
        """
        return path.read_text(encoding="utf-8-sig", errors="replace")

    @staticmethod
    def _title_from_path(path: Path) -> str:
        """Derive a display title from the file stem ("my_file-x" → "My File X")."""
        return path.stem.replace("_", " ").replace("-", " ").title()

    @staticmethod
    def _extract_pdf_pages(path: Path) -> list[str]:
        """Return the text of every page, one entry per page.

        pdfplumber is preferred; PyPDF2 is used only when pdfplumber is not
        installed. Pages without extractable text yield an empty string so
        that the list length equals the page count.

        Raises:
            RuntimeError: If neither pdfplumber nor PyPDF2 is installed.
        """
        try:
            import pdfplumber  # type: ignore
        except ImportError:
            pdfplumber = None

        if pdfplumber is not None:
            with pdfplumber.open(path) as pdf:
                return [page.extract_text() or "" for page in pdf.pages]

        try:
            import PyPDF2  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install pdfplumber or PyPDF2: pip install pdfplumber"
            ) from exc

        with open(path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            return [page.extract_text() or "" for page in reader.pages]

    @staticmethod
    def _extract_pdf_text(path: Path) -> str:
        """Try pdfplumber first, fall back to PyPDF2.

        Returns:
            The text of all non-empty pages joined by blank lines.

        Raises:
            RuntimeError: If neither pdfplumber nor PyPDF2 is installed.
        """
        pages = DocumentLoader._extract_pdf_pages(path)
        return "\n\n".join(page for page in pages if page)

    @staticmethod
    def _extract_html_content(html: str) -> tuple[str, Optional[str]]:
        """Extract main text content and title from HTML.

        Args:
            html: The raw HTML markup.

        Returns:
            ``(text, title)`` where *title* is ``None`` when the markup has no
            ``<title>`` element.

        Raises:
            RuntimeError: If beautifulsoup4 is not installed.
        """
        try:
            from bs4 import BeautifulSoup  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install beautifulsoup4: pip install beautifulsoup4"
            ) from exc

        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else None

        # Remove boilerplate
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # Prefer <main> or <article>, fall back to <body>
        main = soup.find("main") or soup.find("article") or soup.find("body")
        text = (main or soup).get_text(separator="\n", strip=True)
        return text, title

    @staticmethod
    def _clean_text(text: str) -> str:
        """Normalise whitespace, remove null bytes and common PDF artefacts.

        Args:
            text: Raw extracted text.

        Returns:
            The text with NUL bytes removed, runs of three or more newlines
            collapsed to one blank line, runs of spaces/tabs collapsed to a
            single space, lower-case words re-joined across a hyphenated line
            break, and leading/trailing whitespace stripped.
        """
        text = text.replace("\x00", "")
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]{2,}", " ", text)
        # Remove lone hyphenation artefacts from PDF line-breaks
        text = re.sub(r"(?<=[a-z])-\n(?=[a-z])", "", text)
        return text.strip()