File size: 6,642 Bytes
79ca3d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Cortex RAG — Document Loader
Handles PDF, HTML, and plain-text ingestion.
Returns a list of Document dataclasses ready for chunking.
"""
from __future__ import annotations

import hashlib
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

# Module-level logger named after this module; DocumentLoader uses it to
# report files being loaded, files skipped on error, and per-directory totals.
logger = logging.getLogger(__name__)


@dataclass
class Document:
    """A loaded source document, held as one piece of text before chunking.

    Fields are positional in the order declared below; ``metadata`` is
    optional and defaults to a fresh empty dict for every instance.
    """
    # Identifier for the document; ``make_id`` produces a 16-hex-char
    # prefix of the SHA-256 of the source string.
    doc_id: str
    # Where the document came from (original file path / URL).
    source: str
    # Kind of document: "pdf" | "html" | "text".
    doc_type: str
    # Human-readable title.
    title: str
    # Full cleaned text of the document.
    text: str
    # Free-form extra information about the document.
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def make_id(source: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of *source*.

        *source* is encoded with the default ``str.encode`` codec (UTF-8)
        before hashing, so equal source strings always give equal ids.
        """
        id_length = 16
        hex_digest = hashlib.sha256(source.encode()).hexdigest()
        return hex_digest[:id_length]


class DocumentLoader:
    """
    Load documents from disk.

    Supports:
      - PDF  → pdfplumber (better layout) with PyPDF2 fallback when
               pdfplumber is not installed
      - HTML → BeautifulSoup main-content extraction
      - TXT  → direct UTF-8 read (BOM stripped, undecodable bytes replaced);
               ``.md`` files are handled the same way

    The loader for a file is chosen from its lower-cased suffix.
    """

    def __init__(self) -> None:
        # Suffix (lower-case, including the dot) → bound loader method.
        self._loaders = {
            ".pdf":  self._load_pdf,
            ".html": self._load_html,
            ".htm":  self._load_html,
            ".txt":  self._load_text,
            ".md":   self._load_text,
        }

    # ── Public ────────────────────────────────────────────────

    def load_file(self, path: str | Path) -> Document:
        """Load a single file and return a Document.

        Raises:
            FileNotFoundError: if *path* does not exist.
            ValueError: if the file suffix has no registered loader.
            RuntimeError: if the optional parsing library needed for the
                file type is not installed.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        suffix = path.suffix.lower()
        loader = self._loaders.get(suffix)
        if loader is None:
            raise ValueError(f"Unsupported file type: {suffix}")

        logger.info("Loading %s (%s)", path.name, suffix)
        return loader(path)

    def load_directory(
        self,
        directory: str | Path,
        recursive: bool = True,
    ) -> list[Document]:
        """Load all supported files from a directory.

        Files are visited in sorted path order so the result does not depend
        on filesystem enumeration order. A file that fails to load is logged
        and skipped; it never aborts the whole run.

        Args:
            directory: Directory to scan.
            recursive: When true, descend into sub-directories.

        Returns:
            The successfully loaded documents; an empty list when
            *directory* is not an existing directory.
        """
        directory = Path(directory)
        if not directory.is_dir():
            # Previously this case silently produced an empty result.
            logger.warning("Not a directory, nothing to load: %s", directory)
            return []

        pattern = "**/*" if recursive else "*"
        docs: list[Document] = []
        for path in sorted(directory.glob(pattern)):
            if path.suffix.lower() not in self._loaders or not path.is_file():
                continue
            try:
                docs.append(self.load_file(path))
            except Exception as exc:  # best-effort: one bad file must not stop ingestion
                logger.warning("Skipping %s — %s", path, exc)
        logger.info("Loaded %d documents from %s", len(docs), directory)
        return docs

    # ── Private loaders ───────────────────────────────────────

    @staticmethod
    def _title_from_path(path: Path) -> str:
        """Derive a title from the file stem: separators to spaces, Title Case."""
        return path.stem.replace("_", " ").replace("-", " ").title()

    def _load_pdf(self, path: Path) -> Document:
        """Build a Document from a PDF; ``metadata["pages"]`` is the page count."""
        pages = self._extract_pdf_pages(path)
        # Pages without extractable text contribute nothing to the body but
        # still count towards the page total.
        text = "\n\n".join(page for page in pages if page)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="pdf",
            title=self._title_from_path(path),
            text=self._clean_text(text),
            metadata={"filename": path.name, "pages": len(pages)},
        )

    def _load_html(self, path: Path) -> Document:
        """Build a Document from an HTML file, titled by <title> or the file stem."""
        # utf-8-sig decodes plain UTF-8 too, and drops a leading BOM if present.
        raw = path.read_text(encoding="utf-8-sig", errors="replace")
        text, title = self._extract_html_content(raw)
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="html",
            title=title or path.stem,
            text=self._clean_text(text),
            metadata={"filename": path.name},
        )

    def _load_text(self, path: Path) -> Document:
        """Build a Document from a plain-text (or Markdown) file."""
        # utf-8-sig decodes plain UTF-8 too, and drops a leading BOM that
        # would otherwise survive as "\ufeff" at the start of the text.
        raw = path.read_text(encoding="utf-8-sig", errors="replace")
        return Document(
            doc_id=Document.make_id(str(path)),
            source=str(path),
            doc_type="text",
            title=self._title_from_path(path),
            text=self._clean_text(raw),
            metadata={"filename": path.name},
        )

    # ── Text extraction helpers ────────────────────────────────

    @staticmethod
    def _extract_pdf_pages(path: Path) -> list[str]:
        """Return the text of every page, in order ('' for pages with no text).

        Tries pdfplumber first and uses PyPDF2 only when pdfplumber is not
        installed.

        Raises:
            RuntimeError: if neither pdfplumber nor PyPDF2 can be imported.
        """
        # Only the import is guarded, so an ImportError raised during
        # extraction itself is not mistaken for a missing library.
        try:
            import pdfplumber  # type: ignore
        except ImportError:
            pdfplumber = None

        if pdfplumber is not None:
            with pdfplumber.open(path) as pdf:
                return [page.extract_text() or "" for page in pdf.pages]

        try:
            import PyPDF2  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install pdfplumber or PyPDF2: pip install pdfplumber"
            ) from exc

        with open(path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            return [page.extract_text() or "" for page in reader.pages]

    @staticmethod
    def _extract_pdf_text(path: Path) -> str:
        """Try pdfplumber first, fall back to PyPDF2.

        Returns the non-empty page texts joined by a blank line.
        """
        pages = DocumentLoader._extract_pdf_pages(path)
        return "\n\n".join(page for page in pages if page)

    @staticmethod
    def _extract_html_content(html: str) -> tuple[str, Optional[str]]:
        """Extract main text content and title from HTML.

        Returns:
            ``(text, title)`` where *title* is the stripped ``<title>`` text,
            or ``None`` when the document has no ``<title>`` tag.

        Raises:
            RuntimeError: if beautifulsoup4 is not installed.
        """
        try:
            from bs4 import BeautifulSoup  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "Install beautifulsoup4: pip install beautifulsoup4"
            ) from exc

        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else None

        # Remove boilerplate
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # Prefer <main> or <article>, fall back to <body>
        main = soup.find("main") or soup.find("article") or soup.find("body")
        text = (main or soup).get_text(separator="\n", strip=True)
        return text, title

    @staticmethod
    def _clean_text(text: str) -> str:
        """Normalise whitespace, remove null bytes and common PDF artefacts.

        Steps, in order: drop NUL bytes, strip trailing spaces/tabs from
        every line, collapse runs of three or more newlines to one blank
        line, collapse runs of spaces/tabs to a single space, re-join
        lower-case words hyphenated across a line break, then strip the
        result.
        """
        text = text.replace("\x00", "")
        # Strip trailing blanks first so whitespace-only lines count as empty
        # when blank lines are collapsed, and "-  \n" is seen as a line-break hyphen.
        text = "\n".join(line.rstrip(" \t") for line in text.split("\n"))
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]{2,}", " ", text)
        # Remove lone hyphenation artefacts from PDF line-breaks
        text = re.sub(r"(?<=[a-z])-\n(?=[a-z])", "", text)
        return text.strip()