File size: 9,179 Bytes
939a9f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
"""
Vector Store Module
===================

Purpose: Store embeddings and retrieve similar ones

This module uses Chroma for persistent, efficient vector storage.
Chroma is free, local, and production-ready.

Key Concepts:
  • Vector storage: Persistent storage mapping chunk_id → embedding
  • Metadata: Source info, text preview, etc.
  • Retrieval: Find top-k most similar vectors using cosine similarity
  • Persistence: Data survives application restarts
"""

import logging
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import chromadb

logger = logging.getLogger(__name__)


@dataclass
class RetrievalResult:
    """
    One chunk returned by a similarity search.

    Attributes:
        chunk_id: Identifier of the chunk in the store.
        text: Text content stored for the chunk.
        similarity: Similarity score between the query and the chunk.
        metadata: Free-form key/value data attached to the chunk;
            each instance gets its own empty dict when none is given.
    """

    chunk_id: str
    text: str
    similarity: float
    # default_factory avoids sharing one dict object between instances
    metadata: Dict[str, Any] = field(default_factory=dict)


class ChromaVectorStore:
    """
    Vector store using Chroma (persistent, free, production-ready).

    Wraps a single Chroma collection opened through
    ``chromadb.PersistentClient`` and exposes a small API:

    • ``add``      – store one chunk (id, text, embedding, metadata)
    • ``retrieve`` – top-k similarity search for a query embedding
    • ``size`` / ``delete`` / ``clear`` – housekeeping helpers

    The collection is created with ``{"hnsw:space": "cosine"}``, and
    ``retrieve`` reports ``1 - distance`` as the similarity score.

    The instance can be used as a context manager; leaving the ``with``
    block performs a best-effort release of the underlying client.
    """

    def __init__(self, persist_directory: str = ".chromadb", collection_name: str = "rag"):
        """
        Initialize Chroma vector store.

        Creates ``persist_directory`` if it does not exist, opens a
        persistent client on it and gets (or creates) the collection.

        Args:
            persist_directory: Where to store vectors on disk
            collection_name: Name of the collection (namespace)

        Raises:
            Exception: Whatever Chroma raises while opening the client or
                collection; the error is logged and re-raised unchanged.

        Example:
            >>> store = ChromaVectorStore(persist_directory="./data/vectors")
        """
        self.persist_directory = persist_directory
        self.collection_name = collection_name

        # Ensure persist directory exists
        os.makedirs(persist_directory, exist_ok=True)

        try:
            # Create persistent client
            self.client = chromadb.PersistentClient(path=persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"},  # Use cosine similarity
            )

            logger.info(
                f"✓ Initialized Chroma vector store at {persist_directory} "
                f"(collection: {collection_name})"
            )
        except Exception as e:
            logger.error(f"Failed to initialize Chroma: {e}")
            raise

    def __enter__(self):
        """Return the store itself so it can be bound with ``with ... as store``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Best-effort release of the Chroma client when leaving a ``with`` block.

        Calls ``persist()`` and ``shutdown()`` on the client, each only if the
        installed client actually exposes it, and each independently so a
        failure (or absence) of one does not skip the other. Failures are
        logged at debug level instead of being silently discarded.

        Returns None, so an exception raised inside the ``with`` body is
        never suppressed.
        """
        client = getattr(self, "client", None)
        for method_name in ("persist", "shutdown"):
            release = getattr(client, method_name, None)
            if not callable(release):
                continue
            try:
                release()
            except Exception as e:
                # Cleanup must not mask the caller's own exception.
                logger.debug(f"Chroma client {method_name}() failed during exit: {e}")

    def add(
        self,
        chunk_id: str,
        text: str,
        embedding: List[float],
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Add a chunk with its embedding to the store.

        Args:
            chunk_id: Unique identifier for chunk
            text: Original text content
            embedding: Vector representation (list of floats)
            metadata: Optional metadata (source, page number, etc.).
                ``None`` or an empty dict stores the chunk without metadata.

        Raises:
            Exception: Whatever Chroma raises for the insert; the error is
                logged and re-raised unchanged.

        Example:
            >>> store.add(
            ...     "doc1_chunk_0",
            ...     "Machine learning is AI",
            ...     [0.1, 0.2, ..., 0.384],
            ...     metadata={"doc_id": "doc1", "page": 1}
            ... )
        """
        try:
            self.collection.add(
                ids=[chunk_id],
                documents=[text],
                embeddings=[embedding],
                # Chroma's metadata validation rejects empty dicts, so omit
                # the field entirely when there is nothing to attach.
                metadatas=[metadata] if metadata else None,
            )
            logger.debug(f"Added chunk {chunk_id} ({len(text)} chars)")
        except Exception as e:
            logger.error(f"Failed to add chunk {chunk_id}: {e}")
            raise

    def retrieve(
        self,
        query_embedding: List[float],
        top_k: int = 5
    ) -> List[RetrievalResult]:
        """
        Find most similar chunks to query.

        Args:
            query_embedding: Query vector
            top_k: Maximum number of results to return. Values larger than
                the number of stored chunks are capped to that number.

        Returns:
            List of RetrievalResult objects in the order Chroma returns them
            (expected: most similar first). Empty list if the store is empty
            or the query yields nothing.

        Raises:
            Exception: Whatever Chroma raises for the query; the error is
                logged and re-raised unchanged.

        Example:
            >>> results = store.retrieve(query_embedding, top_k=3)
            >>> for r in results:
            ...     print(f"{r.similarity:.3f} | {r.text[:60]}")
        """
        try:
            stored = self.collection.count()
            if stored == 0:
                logger.warning("Vector store is empty")
                return []

            # Never ask the index for more neighbours than it holds.
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=min(top_k, stored),
            )

            ids = results["ids"][0] if results["ids"] else []
            if not ids:
                logger.debug("No results found for query")
                return []

            # Results are nested per query; one query was sent, so take [0].
            documents = results["documents"][0]
            distances = results["distances"][0]
            metadatas = results["metadatas"][0]

            retrieval_results = [
                RetrievalResult(
                    chunk_id=chunk_id,
                    text=document,
                    # Chroma returns distances; with the cosine space the
                    # similarity is 1 - distance.
                    similarity=1 - distance,
                    # Chunks stored without metadata may come back as None.
                    metadata=metadata or {},
                )
                for chunk_id, document, distance, metadata in zip(
                    ids, documents, distances, metadatas
                )
            ]

            logger.debug(f"Retrieved {len(retrieval_results)} chunks")
            return retrieval_results

        except Exception as e:
            logger.error(f"Retrieval failed: {e}")
            raise

    def size(self) -> int:
        """
        Return number of chunks in store.

        Best-effort: if the count cannot be obtained the error is logged
        and 0 is returned instead of raising.
        """
        try:
            return self.collection.count()
        except Exception as e:
            logger.error(f"Failed to get store size: {e}")
            return 0

    def delete(self, chunk_id: str) -> bool:
        """
        Delete a chunk from the store.

        Args:
            chunk_id: ID of chunk to delete

        Returns:
            True if deleted, False if not found or if the deletion failed
            (failures are logged, not raised).
        """
        try:
            # Look the id up first so a missing chunk is reported as False
            # rather than as a successful deletion.
            existing = self.collection.get(ids=[chunk_id])
            if not existing["ids"]:
                logger.debug(f"Chunk {chunk_id} not found; nothing to delete")
                return False

            self.collection.delete(ids=[chunk_id])
            logger.debug(f"Deleted chunk {chunk_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete chunk {chunk_id}: {e}")
            return False

    def clear(self) -> None:
        """
        Clear all vectors from store.

        Raises:
            Exception: Whatever Chroma raises; the error is logged and
                re-raised unchanged.
        """
        try:
            # Only the ids are needed; skip loading documents and metadata.
            all_ids = self.collection.get(include=[])["ids"]
            if all_ids:
                self.collection.delete(ids=all_ids)
            logger.info("Cleared vector store")
        except Exception as e:
            logger.error(f"Failed to clear store: {e}")
            raise





# ============ TESTS ============

import tempfile
import shutil
import time

def test_chroma_vector_store():
    """
    Smoke-test ChromaVectorStore against a throwaway directory.

    Adds three 3-dimensional vectors, queries with the first one and checks
    that two results come back with "chunk1" ranked first. The temporary
    directory is removed afterwards, including when constructing the store
    itself fails.
    """
    temp_dir = tempfile.mkdtemp()
    store = None

    try:
        # Constructed inside the try so the temp directory is cleaned up
        # even if opening the store raises.
        store = ChromaVectorStore(persist_directory=temp_dir)

        # Add chunks
        vec1 = [1.0, 0.0, 0.0]
        vec2 = [0.9, 0.1, 0.0]
        vec3 = [0.0, 1.0, 0.0]

        store.add("chunk1", "Machine learning", vec1, metadata={"source": "test"})
        store.add("chunk2", "Deep learning networks", vec2, metadata={"source": "test"})
        store.add("chunk3", "Cooking recipes", vec3, metadata={"source": "test"})

        # Retrieve
        results = store.retrieve(vec1, top_k=2)
        assert len(results) == 2
        assert results[0].chunk_id == "chunk1"
        print("✓ Chroma test passed!")

    finally:
        # Cleanup Chroma resources
        try:
            if store is not None and hasattr(store, "client"):
                store.client.close()
                del store.client
                del store.collection
        except Exception as e:
            logger.warning(f"Error closing Chroma client: {e}")

        # Give Windows time to release file handles
        time.sleep(1.0)

        # Retry logic for Windows file deletion
        max_retries = 5
        for attempt in range(1, max_retries + 1):
            try:
                shutil.rmtree(temp_dir)
                break
            except PermissionError:
                if attempt < max_retries:
                    time.sleep(0.5)
                else:
                    logger.warning(f"Could not delete temp directory {temp_dir}, skipping")



if __name__ == "__main__":
    # Script entry point: configure logging and run the module's self-test.
    logging.basicConfig(level=logging.INFO)

    # Test Chroma
    try:
        test_chroma_vector_store()
    except ImportError:
        print("Chroma not installed, skipping test")

    # The former call to test_simple_vector_store() was removed: no such
    # function is defined in this module, so it raised NameError on every run.