File size: 7,188 Bytes
914e970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0db9f89
914e970
0db9f89
914e970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""Document ingestion and retrieval for the DRIFT companion."""

import re
import uuid
from pathlib import Path
from typing import List, Optional

import chromadb

from infj_bot.core.config import PROJECT_ROOT, DATA_DIR
from infj_bot.core.embeddings import (
    get_default_embedding_function,
    LocalEmbeddingFunction,
)

# Suffixes (lower-case, with the leading dot) that directory ingestion accepts
# as text documents; ".pdf" is added separately where the directory is scanned.
SUPPORTED_TEXT = {
    # Prose and notes
    ".txt", ".md",
    # Scripting and web sources
    ".py", ".js", ".ts", ".jsx", ".tsx", ".sh", ".html", ".css",
    # Structured data and configuration
    ".json", ".yaml", ".yml", ".csv",
    # Compiled-language sources
    ".rs", ".go", ".java", ".c", ".cpp", ".h",
}
# Largest on-disk size, in bytes, of a single file that may be read for ingestion.
MAX_INGEST_FILE_BYTES = 2_000_000
# Largest number of supported files a single directory ingestion will process.
MAX_DIRECTORY_FILES = 300


def _is_relative_to(child: Path, parent: Path) -> bool:
    """Report whether *child* is *parent* itself or lies somewhere beneath it.

    The comparison is purely lexical (``Path.relative_to``); neither path is
    resolved or checked for existence here.
    """
    try:
        child.relative_to(parent)
    except ValueError:
        # relative_to signals "not a sub-path" by raising ValueError.
        return False
    else:
        return True


def _resolve_ingest_path(path: str) -> Path:
    """Turn a user-supplied path into a resolved path inside an allowed root.

    ``~`` is expanded first; a path that is still relative is anchored at
    PROJECT_ROOT. The result is then resolved and must lie under either
    PROJECT_ROOT or the current user's home directory.

    Raises:
        PermissionError: If the resolved path is outside both allowed roots.
    """
    candidate = Path(path).expanduser()
    anchored = candidate if candidate.is_absolute() else PROJECT_ROOT / candidate
    resolved = anchored.resolve()
    # Both roots are resolved too, so the containment check compares like with like.
    for root in (PROJECT_ROOT.resolve(), Path.home().resolve()):
        if _is_relative_to(resolved, root):
            return resolved
    raise PermissionError(f"Path {path} is outside the allowed ingestion roots.")


def _split_oversized(paragraph: str, limit: int) -> List[str]:
    """Cut *paragraph* into pieces of at most *limit* characters (limit >= 1).

    Each cut lands on the last space, newline or tab inside the current window
    when there is one, so words and lines stay intact; text without any such
    separator is cut at exactly *limit* characters.
    """
    pieces: List[str] = []
    rest = paragraph
    while len(rest) > limit:
        # A separator at index <= limit leaves a piece of at most `limit` chars.
        window = rest[: limit + 1]
        cut = max(window.rfind(sep) for sep in (" ", "\n", "\t"))
        if cut <= 0:
            # No usable separator: fall back to a hard cut.
            cut = limit
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _chunk_text(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks by paragraphs.

    Paragraphs are separated by blank lines and stripped; empty ones are
    dropped. Paragraphs are packed into a chunk, joined by a blank line, until
    adding the next one would push the summed paragraph lengths past
    *chunk_size*. The next chunk then starts with as many trailing paragraphs
    of the previous chunk as fit within *overlap* characters in total.

    A single paragraph longer than *chunk_size* is first cut into pieces of at
    most *chunk_size* characters, so input without blank lines no longer ends
    up as one arbitrarily large chunk.

    Lengths are summed without the blank-line separators, so a chunk holding
    several paragraphs can exceed *chunk_size* by the separator characters.

    Args:
        text: Raw document text.
        chunk_size: Target maximum chunk length in characters. Oversized
            paragraphs are only split when this is positive.
        overlap: Maximum combined length of paragraphs repeated at the start
            of the following chunk.

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    units: List[str] = []
    for raw in re.split(r"\n\s*\n", text):
        para = raw.strip()
        if not para:
            continue
        if 0 < chunk_size < len(para):
            units.extend(_split_oversized(para, chunk_size))
        else:
            units.append(para)

    chunks: List[str] = []
    current: List[str] = []
    current_len = 0
    for unit in units:
        unit_len = len(unit)
        if current and current_len + unit_len > chunk_size:
            chunks.append("\n\n".join(current))
            # Walk backwards collecting whole paragraphs for the overlap.
            carried: List[str] = []
            carried_len = 0
            for previous in reversed(current):
                if carried_len + len(previous) > overlap:
                    break
                carried.append(previous)
                carried_len += len(previous)
            carried.reverse()
            current = carried
            current_len = carried_len
        current.append(unit)
        current_len += unit_len
    if current:
        chunks.append("\n\n".join(current))
    return chunks


def _read_pdf(path: Path) -> str:
    """Extract the text of a PDF, with page texts separated by blank lines.

    Pages for which no text is extracted are left out of the result.

    Args:
        path: Location of the PDF file.

    Returns:
        The concatenated page texts; an empty string if no page yields text.

    Raises:
        RuntimeError: If pypdf cannot be imported or the file cannot be read.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        # Imported here rather than at module level, so pypdf is only needed
        # when a PDF is actually read.
        from pypdf import PdfReader

        reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n\n".join(text for text in page_texts if text)
    except Exception as exc:
        # Chain explicitly so the original traceback stays attached as the cause.
        raise RuntimeError(f"PDF read failed: {exc}") from exc


def _read_file(path: Path) -> str:
    """Return the text content of *path*, enforcing the ingestion size limit.

    Files with a ``.pdf`` suffix (case-insensitive) go through ``_read_pdf``;
    everything else is decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        ValueError: If the file is larger than MAX_INGEST_FILE_BYTES.
    """
    size = path.stat().st_size
    if size > MAX_INGEST_FILE_BYTES:
        raise ValueError(
            f"File too large for ingestion: {path} ({size} bytes)"
        )
    if path.suffix.lower() != ".pdf":
        return path.read_text(encoding="utf-8", errors="replace")
    return _read_pdf(path)


class DocumentStore:
    """Chroma-backed store for ingesting documents and searching their chunks.

    Chunks live in the persistent ``infj_documents`` collection. Every chunk
    written by this class carries metadata with its source path, filename,
    chunk position, chunk total and a comma-joined tag string.
    """

    def __init__(
        self, persist_directory=None, embedding_function=None, use_semantic=True
    ):
        """Open the persistent client and get or create the collection.

        Args:
            persist_directory: Directory of the Chroma database. Defaults to
                ``DATA_DIR / "chroma_db"``.
            embedding_function: Embedding function passed to the collection.
                When omitted, one is chosen according to *use_semantic*.
            use_semantic: With no explicit embedding function, selects the
                result of ``get_default_embedding_function()`` when true and
                a ``LocalEmbeddingFunction()`` otherwise.
        """
        if persist_directory is None:
            persist_directory = str(DATA_DIR / "chroma_db")
        if embedding_function is None:
            if use_semantic:
                embedding_function = get_default_embedding_function()
            else:
                embedding_function = LocalEmbeddingFunction()
        self.embedding_function = embedding_function
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection = self.client.get_or_create_collection(
            name="infj_documents",
            embedding_function=embedding_function,
        )

    def ingest(self, file_path: str, tags: Optional[List[str]] = None) -> int:
        """Chunk a file (or every supported file in a directory) into the store.

        Args:
            file_path: File or directory to ingest; validated and resolved by
                ``_resolve_ingest_path``.
            tags: Optional tags, stored comma-joined in each chunk's metadata.

        Returns:
            The number of chunks added; 0 for an empty or blank file.

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            PermissionError: If the path is outside the allowed roots.
        """
        path = _resolve_ingest_path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Not found: {path}")
        if path.is_dir():
            return self.ingest_directory(path, tags=tags)

        text = _read_file(path)
        if not text.strip():
            return 0

        chunks = _chunk_text(text)
        if not chunks:
            return 0

        # IDs are random, so ingesting the same file again adds new entries
        # rather than replacing the earlier ones.
        ids = [f"doc-{uuid.uuid4().hex[:12]}" for _ in chunks]
        source = str(path)
        tag_string = ",".join(tags or [])
        metadatas = [
            {
                "source": source,
                "filename": path.name,
                "chunk_index": index,
                "total_chunks": len(chunks),
                "tags": tag_string,
            }
            for index in range(len(chunks))
        ]

        self.collection.add(
            documents=chunks,
            ids=ids,
            metadatas=metadatas,
        )
        return len(chunks)

    def ingest_directory(
        self, dir_path: Path, tags: Optional[List[str]] = None, recursive: bool = True
    ) -> int:
        """Ingest every supported file found under a directory.

        Supported files are those whose lower-cased suffix is in
        SUPPORTED_TEXT or is ``.pdf``. A file that fails to ingest is reported
        on stdout and skipped.

        Args:
            dir_path: Directory to scan.
            tags: Optional tags applied to every ingested chunk.
            recursive: Scan subdirectories as well when true.

        Returns:
            The total number of chunks added.

        Raises:
            ValueError: When more than MAX_DIRECTORY_FILES supported files are
                encountered. Files ingested before that point stay stored.
        """
        total = 0
        scanned = 0
        pattern = "**/*" if recursive else "*"
        # Built once instead of on every loop iteration.
        allowed_suffixes = SUPPORTED_TEXT | {".pdf"}
        for child in Path(dir_path).glob(pattern):
            if not (child.is_file() and child.suffix.lower() in allowed_suffixes):
                continue
            scanned += 1
            if scanned > MAX_DIRECTORY_FILES:
                raise ValueError(
                    f"Directory ingestion stopped after {MAX_DIRECTORY_FILES} supported files."
                )
            try:
                total += self.ingest(str(child), tags=tags)
            except Exception as exc:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"[ingest skip] {child}: {exc}")
        return total

    def search(self, query: str, n_results: int = 5) -> List[dict]:
        """Query the collection and return the hits as plain dictionaries.

        Args:
            query: Text to search for.
            n_results: Number of results requested from the collection.

        Returns:
            One dict per hit with the keys ``document``, ``source``,
            ``filename`` and ``chunk_index``. Missing metadata falls back to
            ``"?"`` for the two strings and ``0`` for the index.
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        hits = []
        for index, doc in enumerate(documents):
            # An entry stored without metadata comes back as None; treat it
            # like an empty mapping, as list_sources does, instead of crashing.
            meta = metadatas[index] or {}
            hits.append(
                {
                    "document": doc,
                    "source": meta.get("source", "?"),
                    "filename": meta.get("filename", "?"),
                    "chunk_index": meta.get("chunk_index", 0),
                }
            )
        return hits

    def list_sources(self) -> List[str]:
        """Return the sorted distinct ``source`` values of all stored chunks.

        Entries without metadata are ignored; metadata lacking a ``source``
        key contributes ``"?"``.
        """
        results = self.collection.get(include=["metadatas"])
        # `or []` also covers a result whose "metadatas" value is None.
        metadatas = results.get("metadatas") or []
        return sorted({meta.get("source", "?") for meta in metadatas if meta})

    def delete_source(self, source_path: str) -> int:
        """Delete every chunk whose ``source`` metadata equals *source_path*.

        The match is an exact string comparison against the value stored at
        ingestion time, which is the string form of the resolved file path.

        Returns:
            The number of chunks deleted.
        """
        results = self.collection.get(
            where={"source": source_path},
            include=[],
        )
        ids = results.get("ids") or []
        if ids:
            self.collection.delete(ids=ids)
        return len(ids)

    def count(self) -> int:
        """Return the number of entries currently in the collection."""
        return self.collection.count()


def format_doc_results(results: List[dict]) -> str:
    """Render search hits as a readable block of text.

    Each hit becomes a ``[filename chunk N]`` header line followed by at most
    the first 600 characters of its document text; hits are separated by a
    ``---`` line. An empty result list yields a fixed "no matches" message.
    """
    if results:
        return "\n---\n".join(
            f"[{hit['filename']} chunk {hit['chunk_index']}]\n{hit['document'][:600]}"
            for hit in results
        )
    return "No matching documents found."


if __name__ == "__main__":
    # Smoke check when run as a script: open the default store and report
    # how many entries it currently holds.
    store = DocumentStore()
    total = store.count()
    print(f"Document store initialized. Documents: {total}")