import os
import json
import faiss
import numpy as np
from typing import List, Dict, Optional
import uuid
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from retriever.embeddings import get_embedding_model
from config import Config

class DocumentStore:
    """Vector store for document storage and retrieval"""
    
    def __init__(self, vector_db_path: Optional[str] = None):
        """Initialize the document store"""
        self.vector_db_path = vector_db_path or Config.VECTOR_DB_PATH
        print(f"Using vector DB path: {self.vector_db_path}")
        
        self.embeddings = get_embedding_model()
        print("Embedding model loaded")
        
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
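        # (1000-character chunks with a 200-character overlap help keep
        # related sentences together across chunk boundaries)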
        
        # Create directory if it doesn't exist
        os.makedirs(self.vector_db_path, exist_ok=True)
        
        # Check if index exists, otherwise create it
        self.index_path = os.path.join(self.vector_db_path, "faiss_index")
        self.documents_path = os.path.join(self.vector_db_path, "documents.json")
        
        print(f"Index path: {self.index_path}")
        print(f"Documents path: {self.documents_path}")
        
        # Load or create index
        if os.path.exists(self.index_path) and os.path.exists(self.documents_path):
            print("Found existing index and documents, loading...")
            self.load()
        else:
            print("No existing index found, initializing empty one...")
            # Initialize an empty index
            self.documents = {}
            self.document_embeddings = {}
            self.initialize_index()
            
    def initialize_index(self):
        """Initialize an empty FAISS index"""
        # Get embedding dimension from the model
        test_embedding = self.embeddings.encode("test")
        dimension = len(test_embedding)
        
        # Create empty index
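        # IndexFlatL2 does exact (brute-force) L2 search and needs no
        # training step, which suits small collections like this one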
        self.index = faiss.IndexFlatL2(dimension)
        self.save()
    
    def add_text(self, content: str, title: str = "Untitled") -> str:
        """

        Add text content to the document store

        

        Args:

            content (str): The text content to add

            title (str): Title for the content

            

        Returns:

            str: Document ID

        """
        # Generate a unique ID for the document
        doc_id = str(uuid.uuid4())
        
        # Split text into chunks
        chunks = self.text_splitter.split_text(content)
        
        # Store document metadata
        self.documents[doc_id] = {
            "title": title,
            "chunks": chunks,
            "type": "text"
        }
        
        # Compute and store embeddings for each chunk
        chunk_embeddings = []
        for i, chunk in enumerate(chunks):
            embedding = self.embeddings.encode(chunk)
            chunk_id = f"{doc_id}_{i}"
            self.document_embeddings[chunk_id] = {
                "doc_id": doc_id,
                "chunk_index": i
            }
            chunk_embeddings.append(embedding)
        
        # Add embeddings to FAISS index
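        # (rows are appended in the same order as document_embeddings keys;
        # search() relies on this alignment to map rows back to chunks)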
        if chunk_embeddings:
            self.index.add(np.array(chunk_embeddings, dtype=np.float32))
            self.save()
        
        return doc_id
    
    def add_document(self, file_path: str) -> str:
        """

        Process and add a document file to the store

        

        Args:

            file_path (str): Path to the document file

            

        Returns:

            str: Document ID

        """
        # Determine file type and use appropriate loader
        if file_path.lower().endswith('.pdf'):
            loader = PyPDFLoader(file_path)
            docs = loader.load()
        elif file_path.lower().endswith('.txt'):
            loader = TextLoader(file_path)
            docs = loader.load()
        else:
            raise ValueError(f"Unsupported file type: {file_path}")
        
        # Extract text from documents
        content = "\n\n".join([doc.page_content for doc in docs])
        title = os.path.basename(file_path)
        
        # Add text to document store
        return self.add_text(content, title)
    
    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """

        Search for relevant document chunks

        

        Args:

            query (str): The search query

            top_k (int): Number of results to return

            

        Returns:

            List[Dict]: List of document chunks with metadata

        """
        # Check if there are any documents first
        if not self.documents:
            print("No documents in store during search")
            return []
            
        # Print debug information
        print(f"Searching for: {query}")
        print(f"Document count: {len(self.documents)}")
        print(f"Document embeddings count: {len(self.document_embeddings)}")
        
        # Encode the query
        query_vector = self.embeddings.encode(query)
        query_vector = np.array([query_vector], dtype=np.float32)
        
        # Search the index
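        # (IndexFlatL2 returns squared L2 distances, smallest first; results
        # are padded with index -1 when fewer than top_k vectors exist)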
        distances, indices = self.index.search(query_vector, top_k)
        print(f"Search returned {len(indices[0])} results")
        print(f"Indices: {indices[0]}")
        print(f"Distances: {distances[0]}")
        
        results = []
        for i, idx in enumerate(indices[0]):
            # Skip invalid indices
            if idx == -1:
                continue
                
            # Skip results with distance above threshold - TEMPORARILY DISABLED FOR DEBUGGING
            # if distances[0][i] > Config.SIMILARITY_THRESHOLD:
            #     print(f"Skipping result with distance {distances[0][i]} (above threshold {Config.SIMILARITY_THRESHOLD})")
            #     continue
            print(f"Processing result with distance {distances[0][i]}")
                
            # Map the FAISS row back to a chunk ID; this relies on the
            # insertion order of document_embeddings matching the order in
            # which rows were added to the index
            chunk_ids = list(self.document_embeddings.keys())
            if idx >= len(chunk_ids):
                print(f"Index {idx} out of range for chunk_ids (len: {len(chunk_ids)})")
                continue
                
            chunk_id = chunk_ids[idx]
            chunk_info = self.document_embeddings[chunk_id]
            doc_id = chunk_info["doc_id"]
            chunk_index = chunk_info["chunk_index"]
            
            # Get document content
            if doc_id not in self.documents:
                print(f"Document ID {doc_id} not found in documents")
                continue
                
            document = self.documents[doc_id]
            if chunk_index >= len(document["chunks"]):
                print(f"Chunk index {chunk_index} out of range for document {doc_id}")
                continue
                
            chunk_content = document["chunks"][chunk_index]
            
            print(f"Found relevant chunk: {chunk_content[:50]}...")
            
            results.append({
                "content": chunk_content,
                "title": document["title"],
                "similarity": float(1 - distances[0][i] / 2),  # Normalize similarity score
                "doc_id": doc_id
            })
        
        print(f"Returning {len(results)} results")
        return results

    def save(self):
        """Save the index and documents to disk"""
        # Save FAISS index
        faiss.write_index(self.index, self.index_path)
        
        # Save documents and mappings
        data = {
            "documents": self.documents,
            "document_embeddings": self.document_embeddings
        }
        with open(self.documents_path, 'w') as f:
            json.dump(data, f)
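        # (raw vectors are persisted only in the FAISS index file; the JSON
        # holds the text chunks and the row-to-chunk mapping)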
    
    def load(self):
        """Load the index and documents from disk"""
        try:
            # Load FAISS index
            self.index = faiss.read_index(self.index_path)
            
            # Load documents and mappings
            with open(self.documents_path, 'r') as f:
                data = json.load(f)
                self.documents = data.get("documents", {})
                self.document_embeddings = data.get("document_embeddings", {})
                
            print(f"Loaded {len(self.documents)} documents and {len(self.document_embeddings)} embeddings")
            
            # Verify document structure
            for doc_id, doc in self.documents.items():
                if "chunks" not in doc:
                    print(f"Warning: Document {doc_id} missing 'chunks' field")
                elif not doc["chunks"]:
                    print(f"Warning: Document {doc_id} has empty 'chunks' list")
                
            # Verify embedding-document relationships
            for chunk_id, chunk_info in self.document_embeddings.items():
                doc_id = chunk_info.get("doc_id")
                if doc_id not in self.documents:
                    print(f"Warning: Embedding {chunk_id} refers to non-existent document {doc_id}")
                    continue
                    
                chunk_index = chunk_info.get("chunk_index")
                if chunk_index is None:
                    print(f"Warning: Embedding {chunk_id} missing 'chunk_index'")
                    continue
                    
                doc = self.documents[doc_id]
                if "chunks" not in doc or chunk_index >= len(doc["chunks"]):
                    print(f"Warning: Embedding {chunk_id} refers to non-existent chunk {chunk_index} in document {doc_id}")
        
        except Exception as e:
            print(f"Error loading document store: {e}")
            # Initialize empty collections
            self.documents = {}
            self.document_embeddings = {}
            self.initialize_index()
            
    def rebuild_index(self):
        """Rebuild the index from all documents"""
        # Get embedding dimension
        test_embedding = self.embeddings.encode("test")
        dimension = len(test_embedding)
        
        # Create a new index
        self.index = faiss.IndexFlatL2(dimension)
        
        # Re-embed all chunks and regenerate the row-to-chunk mapping, so the
        # order of document_embeddings keys always matches the order of rows
        # in the new index (search() depends on this alignment)
        self.document_embeddings = {}
        all_embeddings = []
        
        for doc_id, doc_info in self.documents.items():
            chunks = doc_info.get("chunks", [])
            for i, chunk in enumerate(chunks):
                embedding = self.embeddings.encode(chunk)
                all_embeddings.append(embedding)
                self.document_embeddings[f"{doc_id}_{i}"] = {
                    "doc_id": doc_id,
                    "chunk_index": i
                }
        
        if all_embeddings:
            self.index.add(np.array(all_embeddings, dtype=np.float32))
            
        self.save()

    def load_from_json(self, json_data):
        """Load documents from provided JSON data"""
        self.documents = json_data.get("documents", {})
        self.document_embeddings = json_data.get("document_embeddings", {})
        
        # Rebuild the index (this also regenerates the row-to-chunk mapping)
        self.rebuild_index()

    def rebuild_index_from_scratch(self):
        """Completely rebuild the index from the documents"""
        print("Rebuilding search index from scratch...")
        
        # Get embedding dimension
        test_embedding = self.embeddings.encode("test")
        dimension = len(test_embedding)
        
        # Create a new index
        self.index = faiss.IndexFlatL2(dimension)
        
        # Track mappings between index positions and document chunks
        self.document_embeddings = {}
        
        # Re-embed and add all chunks
        all_embeddings = []
        
        for doc_id, doc_info in self.documents.items():
            chunks = doc_info.get("chunks", [])
            print(f"Processing document {doc_id} with {len(chunks)} chunks")
            
            for i, chunk in enumerate(chunks):
                embedding = self.embeddings.encode(chunk)
                all_embeddings.append(embedding)
                
                # Store mapping
                chunk_id = f"{doc_id}_{i}"
                self.document_embeddings[chunk_id] = {
                    "doc_id": doc_id,
                    "chunk_index": i
                }
        
        # Add all embeddings to index at once
        if all_embeddings:
            print(f"Adding {len(all_embeddings)} embeddings to index")
            self.index.add(np.array(all_embeddings, dtype=np.float32))
        else:
            print("No embeddings to add to index")
            
        self.save()
        print("Index rebuild complete")