Spaces:
Running
Running
File size: 6,642 Bytes
79ca3d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | """
Cortex RAG — Document Loader
Handles PDF, HTML, and plain-text ingestion.
Returns a list of Document dataclasses ready for chunking.
"""
from __future__ import annotations
import hashlib
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
@dataclass
class Document:
    """A raw source document, as produced before any chunking happens."""

    # Identifier for the document (``make_id`` derives one from a source string).
    doc_id: str
    # Original file path or URL the document came from.
    source: str
    # One of "pdf", "html" or "text".
    doc_type: str
    # Human-readable title.
    title: str
    # Full cleaned text of the document.
    text: str
    # Free-form extra information; every instance gets its own fresh dict.
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def make_id(source: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of *source*."""
        digest = hashlib.sha256(source.encode())
        hex_digest = digest.hexdigest()
        return hex_digest[:16]
class DocumentLoader:
    """
    Load documents from disk into ``Document`` records.

    The loader is chosen by file extension (compared case-insensitively):
    - ``.pdf``: pdfplumber when it is installed, otherwise PyPDF2
    - ``.html`` / ``.htm``: BeautifulSoup main-content extraction
    - ``.txt`` / ``.md``: read as UTF-8; a leading BOM is dropped and
      undecodable bytes are replaced rather than raising
    """

    def __init__(self) -> None:
        # Dispatch table: lower-cased file suffix -> bound loader method.
        self._loaders = {
            ".pdf": self._load_pdf,
            ".html": self._load_html,
            ".htm": self._load_html,
            ".txt": self._load_text,
            ".md": self._load_text,
        }

    # -- Public ------------------------------------------------------------

    def load_file(self, path: str | Path) -> Document:
        """Load a single file and return a Document.

        Raises:
            FileNotFoundError: if *path* does not exist.
            ValueError: if the file suffix has no registered loader.

        Any error raised by the selected loader propagates unchanged.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        suffix = path.suffix.lower()
        loader = self._loaders.get(suffix)
        if loader is None:
            raise ValueError(f"Unsupported file type: {suffix}")
        logger.info("Loading %s (%s)", path.name, suffix)
        return loader(path)

    def load_directory(
        self,
        directory: str | Path,
        recursive: bool = True,
    ) -> list[Document]:
        """Load all supported files from a directory.

        Files are visited in sorted path order so the result is
        deterministic. A file that fails to load is logged and skipped
        rather than aborting the whole run. If *directory* is not an
        existing directory, a warning is logged and an empty list is
        returned.
        """
        directory = Path(directory)
        if not directory.is_dir():
            logger.warning("Not a directory, nothing loaded: %s", directory)
            return []
        pattern = "**/*" if recursive else "*"
        docs: list[Document] = []
        # glob() yields entries in arbitrary filesystem order; sort for
        # reproducible output.
        for path in sorted(directory.glob(pattern)):
            if path.suffix.lower() not in self._loaders or not path.is_file():
                continue
            try:
                docs.append(self.load_file(path))
            except Exception as exc:
                # Deliberate best-effort: one bad file must not stop the batch.
                logger.warning("Skipping %s — %s", path, exc)
        logger.info("Loaded %d documents from %s", len(docs), directory)
        return docs

    # -- Private loaders ---------------------------------------------------

    def _load_pdf(self, path: Path) -> Document:
        """Build a Document from a PDF file.

        ``metadata["pages"]`` is the page count reported by the PDF
        library, including pages that yielded no text.
        """
        page_texts, page_count = self._read_pdf_pages(path)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="pdf",
            title=self._title_from_stem(path),
            text=self._clean_text("\n\n".join(page_texts)),
            metadata={"filename": path.name, "pages": page_count},
        )

    def _load_html(self, path: Path) -> Document:
        """Build a Document from an HTML file, using its <title> if present."""
        raw = self._read_text(path)
        text, title = self._extract_html_content(raw)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="html",
            # An absent or empty <title> falls back to the bare file stem.
            title=title or path.stem,
            text=self._clean_text(text),
            metadata={"filename": path.name},
        )

    def _load_text(self, path: Path) -> Document:
        """Build a Document from a plain-text or Markdown file."""
        raw = self._read_text(path)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="text",
            title=self._title_from_stem(path),
            text=self._clean_text(raw),
            metadata={"filename": path.name},
        )

    # -- Small shared helpers ----------------------------------------------

    @staticmethod
    def _title_from_stem(path: Path) -> str:
        """Derive a title from the file stem: separators to spaces, title-cased."""
        return path.stem.replace("_", " ").replace("-", " ").title()

    @staticmethod
    def _read_text(path: Path) -> str:
        """Read *path* as UTF-8 text, replacing undecodable bytes."""
        # "utf-8-sig" decodes plain UTF-8 identically but also drops a
        # leading byte-order mark, which would otherwise end up in the text.
        return path.read_text(encoding="utf-8-sig", errors="replace")

    # -- Text extraction helpers -------------------------------------------

    @staticmethod
    def _read_pdf_pages(path: Path) -> tuple[list[str], int]:
        """Return ``(page_texts, page_count)`` for the PDF at *path*.

        pdfplumber is used when importable; pages without text are left
        out of ``page_texts`` but still counted. Otherwise PyPDF2 is used
        and every page contributes an entry (possibly empty).

        Raises:
            RuntimeError: if neither pdfplumber nor PyPDF2 is installed.
        """
        # Only the import is guarded, so an ImportError raised while
        # parsing cannot silently switch libraries.
        try:
            import pdfplumber  # type: ignore
        except ImportError:
            pdfplumber = None
        if pdfplumber is not None:
            with pdfplumber.open(path) as pdf:
                pdf_pages = pdf.pages
                extracted = (page.extract_text() for page in pdf_pages)
                texts = [page_text for page_text in extracted if page_text]
                return texts, len(pdf_pages)
        try:
            import PyPDF2  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install pdfplumber or PyPDF2: pip install pdfplumber"
            ) from exc
        with open(path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            texts = [page.extract_text() or "" for page in reader.pages]
        return texts, len(texts)

    @staticmethod
    def _extract_pdf_text(path: Path) -> str:
        """Try pdfplumber first, fall back to PyPDF2 if it is not installed.

        Returns the page texts joined by a blank line.
        """
        page_texts, _ = DocumentLoader._read_pdf_pages(path)
        return "\n\n".join(page_texts)

    @staticmethod
    def _extract_html_content(html: str) -> tuple[str, Optional[str]]:
        """Extract main text content and title from HTML.

        Returns ``(text, title)``; *title* is ``None`` when the markup
        has no ``<title>`` element.

        Raises:
            RuntimeError: if beautifulsoup4 is not installed.
        """
        try:
            from bs4 import BeautifulSoup  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install beautifulsoup4: pip install beautifulsoup4"
            ) from exc
        soup = BeautifulSoup(html, "html.parser")
        # Read the title before any elements are removed from the tree.
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else None
        # Remove boilerplate elements that are not document content.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # Prefer <main> or <article>, fall back to <body>, then the whole tree.
        main = soup.find("main") or soup.find("article") or soup.find("body")
        text = (main or soup).get_text(separator="\n", strip=True)
        return text, title

    @staticmethod
    def _clean_text(text: str) -> str:
        """Normalise whitespace, remove null bytes and common PDF artefacts."""
        text = text.replace("\x00", "")
        # Collapse runs of three or more newlines to a single blank line.
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Collapse runs of spaces/tabs to one space.
        text = re.sub(r"[ \t]{2,}", " ", text)
        # Re-join words hyphenated across a line break; only lower-case
        # letters on both sides count, so "Word-\nBreak" is left alone.
        text = re.sub(r"(?<=[a-z])-\n(?=[a-z])", "", text)
        return text.strip()
|