Rajhuggingface4253 commited on
Commit
7af1947
·
verified ·
1 Parent(s): 11b6956

Update vector.py

Browse files
Files changed (1) hide show
  1. vector.py +1281 -376
vector.py CHANGED
@@ -11,6 +11,25 @@ import ast
11
  import re
12
  from filelock import FileLock
13
  import atexit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Configure Logging
16
  logging.basicConfig(
@@ -25,23 +44,1108 @@ class VectorDatabase:
25
  self.metadata_path = metadata_path
26
  self.lock_path = index_path + ".lock"
27
 
28
- # File lock for multi-process safety
29
- self.file_lock = FileLock(self.lock_path, timeout=60)
30
- self.memory_lock = threading.Lock()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- logger.info("🧠 Initializing Production Vector Engine (Multi-Worker Safe)...")
 
 
33
 
34
- # Load models
35
- self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
36
- self.ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="./flashrank_cache")
 
 
 
 
 
 
37
 
38
- # Load or create index with file locking
39
- self._load_or_create_index()
40
 
41
- # Register cleanup
42
- atexit.register(self._cleanup)
 
 
 
 
 
 
 
43
 
44
- logger.info(f"✅ Vector Engine Ready. Index: {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
 
 
 
 
 
45
 
46
  def _load_or_create_index(self):
47
  """Thread-safe and process-safe index loading/creation"""
@@ -50,8 +1154,18 @@ class VectorDatabase:
50
  try:
51
  logger.info("📂 Loading existing vector index...")
52
  self.index = faiss.read_index(self.index_path)
 
 
 
 
53
  with open(self.metadata_path, "rb") as f:
54
  self.metadata = pickle.load(f)
 
 
 
 
 
 
55
  logger.info(f"✅ Loaded index with {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
56
  except Exception as e:
57
  logger.error(f"⚠️ Failed to load index: {e}. Creating new one.")
@@ -61,298 +1175,148 @@ class VectorDatabase:
61
  self._create_new_index()
62
 
63
  def _create_new_index(self):
64
- """Create fresh IndexFlatL2 for reliable performance"""
65
  dimension = 384
66
- # Use IndexFlatIP (inner product) for cosine similarity
67
- # Or IndexFlatL2 for Euclidean distance
68
- self.index = faiss.IndexFlatIP(dimension) # Cosine similarity
69
  self.metadata = []
70
  logger.info(f"🆕 Created new IndexFlatIP with dimension {dimension}")
71
 
72
  def _save_index(self):
73
  """Thread-safe and process-safe index saving with atomic writes"""
74
  with self.file_lock:
75
- # Create temporary files
76
  temp_index = f"{self.index_path}.tmp"
77
  temp_meta = f"{self.metadata_path}.tmp"
78
 
79
  try:
80
- # Save to temporary files
81
  faiss.write_index(self.index, temp_index)
82
  with open(temp_meta, "wb") as f:
83
  pickle.dump(self.metadata, f)
84
 
85
- # Atomic rename (POSIX operation)
86
  os.replace(temp_index, self.index_path)
87
  os.replace(temp_meta, self.metadata_path)
88
 
89
- logger.info(f"💾 Saved index: {self.index.ntotal} vectors, {len(self.metadata)} metadata")
90
  except Exception as e:
91
  logger.error(f"❌ Failed to save index: {e}")
92
- # Clean up temp files on failure
93
  for f in [temp_index, temp_meta]:
94
  if os.path.exists(f):
95
  try:
96
  os.remove(f)
97
- except:
98
- pass
 
 
99
 
100
- def _chunk_smart_code(self, text, filename):
101
- """
102
- Structure-aware chunker for JS, HTML, CSS, etc.
103
- Splits by logical boundaries (tags, functions) instead of random characters.
104
- """
105
- ext = os.path.splitext(filename)[1].lower()
106
- chunks = []
107
-
108
- # Define split patterns for different languages
109
- patterns = {
110
- # HTML/XML: Split before opening tags, effectively keeping tags grouped
111
- '.html': r'(?=\n\s*<[^/])',
112
- '.htm': r'(?=\n\s*<[^/])',
113
- '.xml': r'(?=\n\s*<[^/])',
114
- '.vue': r'(?=\n\s*<[^/])',
115
- # JS/TS: Split before major keywords
116
- '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
117
- '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
118
- '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
119
- '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
120
- # CSS: Split before selectors
121
- '.css': r'(?=\n\s*[.#@a-zA-Z])',
122
- '.scss': r'(?=\n\s*[.#@a-zA-Z])',
123
- }
124
-
125
- pattern = patterns.get(ext)
126
-
127
- # Fallback to standard if no pattern matches or regex fails
128
- if not pattern:
129
- return self._chunk_text_standard(text)
130
-
131
  try:
132
- # 1. Split by pattern
133
- segments = re.split(pattern, text)
134
 
135
- # 2. Re-group segments into chunks of appropriate size (e.g., 1000 chars)
136
- current_chunk = ""
137
- TARGET_SIZE = 1000
 
138
 
139
- for seg in segments:
140
- if not seg.strip(): continue
141
-
142
- # If adding this segment exceeds target, save current and start new
143
- if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 100:
144
- chunks.append({
145
- "text": current_chunk.strip(),
146
- "type": "code_block",
147
- "name": f"block_{len(chunks)}"
148
- })
149
- current_chunk = seg
150
- else:
151
- current_chunk += seg
152
 
153
- # Add final chunk
154
- if current_chunk:
155
- chunks.append({
156
- "text": current_chunk.strip(),
157
- "type": "code_block",
158
- "name": f"block_{len(chunks)}"
159
- })
160
 
161
- return chunks
 
 
 
 
162
 
163
- except Exception as e:
164
- logger.warning(f"Smart chunking failed for {filename}: {e}. Falling back.")
165
- return self._chunk_text_standard(text)
166
-
167
- def _chunk_python_code(self, text, filename):
168
- """Improved AST chunker that captures EVERYTHING (not just functions)"""
169
- chunks = []
170
- try:
171
- tree = ast.parse(text)
172
- lines = text.splitlines()
173
 
174
- # 1. Global Context (Imports & Assignments)
175
- global_context = []
176
 
177
- # 2. Iterate nodes to find blocks
178
- for node in tree.body:
179
- if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
180
- # Extract the block
181
- start = node.lineno - 1
182
- end = node.end_lineno
183
- block_text = "\n".join(lines[start:end])
184
-
185
- chunks.append({
186
- "text": block_text,
187
- "type": "code_function",
188
- "name": node.name
189
- })
190
- elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
191
- # Group top-level scripts/imports together
192
- # We approximate by grabbing the line
193
- if hasattr(node, 'end_lineno'):
194
- start = node.lineno - 1
195
- end = node.end_lineno
196
- global_context.append("\n".join(lines[start:end]))
197
 
198
- # Add the collected global context as the first chunk
199
- if global_context:
200
- # Group globals into chunks of 1000 chars
201
- full_global = "\n".join(global_context)
202
- if len(full_global) > 100:
203
- chunks.insert(0, {
204
- "text": full_global[:1500], # Cap context size
205
- "type": "code_context",
206
- "name": "imports_and_globals"
207
- })
208
-
209
  except Exception as e:
210
- logger.warning(f"AST parsing failed for {filename}: {e}")
211
- return self._chunk_text_standard(text)
212
-
213
- # Fallback: if AST yielded nothing (e.g. empty file), use standards
214
- if not chunks:
215
- return self._chunk_text_standard(text)
216
-
217
- return chunks
218
 
219
- def _chunk_text_standard(self, text, chunk_size=500, overlap=50):
220
- """Standard text chunking with sliding window"""
221
- chunks = []
222
-
223
- # Handle very short text
224
- if len(text) <= chunk_size:
225
- return [{
226
- "text": text,
227
- "type": "text_block",
228
- "name": "full_content"
229
- }]
230
 
231
- # Create overlapping chunks
232
- for i in range(0, len(text), chunk_size - overlap):
233
- chunk = text[i:i + chunk_size]
234
- if len(chunk) > 100: # Minimum chunk size
235
- chunks.append({
236
- "text": chunk,
237
- "type": "text_block",
238
- "name": f"chunk_{i//chunk_size}"
239
- })
240
 
241
- return chunks
 
242
 
243
- def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str):
244
- """Store extracted file content with 'Whole File' capability & Verification"""
245
- if not text or len(text) < 10 or not user_id:
246
- logger.warning(f"Invalid input for {filename}")
247
- return False
248
-
249
- logger.info(f"📥 Storing {filename} ({len(text)} chars) for user {user_id[:8]}...")
250
-
251
- # Determine chunking strategy
252
- chunks_data = []
253
- ext = os.path.splitext(filename)[1].lower()
254
-
255
- try:
256
- if ext == '.py':
257
- chunks_data = self._chunk_python_code(text, filename)
258
- elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml']:
259
- # Use Smart Regex Chunking
260
- chunks_data = self._chunk_smart_code(text, filename)
261
- else:
262
- chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
263
- except Exception as e:
264
- logger.error(f"Chunking failed for {filename}: {e}")
265
- chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
266
-
267
- # Ensure we have chunks
268
- if not chunks_data and text:
269
- chunks_data = [{
270
- "text": text[:2000],
271
- "type": "fallback",
272
- "name": "full_document"
273
- }]
274
-
275
- if not chunks_data:
276
- logger.error(f"No chunks generated for {filename}")
277
- return False
278
 
279
- # Prepare for indexing
280
- final_texts = []
281
- final_meta = []
282
 
283
- # 1. Process Standard Chunks
284
- for chunk in chunks_data:
285
- final_texts.append(chunk["text"])
286
- final_meta.append({
287
- "text": chunk["text"],
288
- "source": filename,
289
- "type": "file",
290
- "subtype": chunk.get("type", "general"),
291
- "name": chunk.get("name", "unknown"),
292
- "user_id": user_id,
293
- "chat_id": chat_id,
294
- "timestamp": time.time(),
295
- "chunk_index": len(final_texts)
296
  })
297
-
298
- # 2. Add "Whole File" Entry (FIXED FOR INTENT SEPARATION)
299
- marker_text = f"Entire full content of file {filename} code"
300
- final_texts.append(marker_text)
301
- final_meta.append({
302
- "text": marker_text,
303
- "actual_content": text, # The full content
304
- "source": filename,
305
-
306
- # --- THE FIX ---
307
- "type": "file", # Visible to 'file' searches
308
- "subtype": "whole_file", # Identified by Ranking Logic
309
- # ----------------
310
-
311
- "user_id": user_id,
312
- "chat_id": chat_id,
313
- "timestamp": time.time(),
314
- "chunk_index": -1
315
- })
316
 
317
- # Embed and add to index
318
- try:
319
- embeddings = self.embedder.encode(final_texts)
320
- faiss.normalize_L2(embeddings)
321
-
322
- with self.memory_lock:
323
- self.index.add(np.array(embeddings).astype('float32'))
324
- self.metadata.extend(final_meta)
325
- self._save_index()
326
 
327
- logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
328
-
329
- # Verify storage
330
- self._verify_storage(user_id, len(final_texts))
331
-
332
- return True
333
 
334
- except Exception as e:
335
- logger.error(f"❌ Failed to store vectors for {filename}: {e}")
336
- return False
337
-
338
- def _verify_storage(self, user_id, expected_count):
339
- """Verify vectors were stored correctly"""
340
- with self.memory_lock:
341
- user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
342
-
343
- logger.info(f"🔍 Storage verification: User {user_id[:8]} has {user_vectors} vectors (expected: {expected_count})")
344
 
345
- if user_vectors < expected_count:
346
- logger.warning(f"⚠️ Storage mismatch for user {user_id[:8]}")
347
 
348
- def store_chat_context(self, messages: list, user_id: str, chat_id: str):
349
  """Store chat history as session memory"""
350
  if not messages or not user_id:
351
  return False
352
 
353
- # Format conversation
354
  conversation = ""
355
- for msg in messages[-10:]: # Last 10 messages
356
  role = msg.get("role", "unknown")
357
  content = msg.get("content", "")
358
  if content:
@@ -361,13 +1325,11 @@ class VectorDatabase:
361
  if len(conversation) < 50:
362
  return False
363
 
364
- # Chunk conversation
365
- chunks = self._chunk_text_standard(conversation, chunk_size=800, overlap=100)
366
 
367
  if not chunks:
368
  return False
369
 
370
- # Prepare for indexing
371
  texts = [c["text"] for c in chunks]
372
  metadata_list = []
373
 
@@ -382,9 +1344,8 @@ class VectorDatabase:
382
  "chunk_index": i
383
  })
384
 
385
- # Store in index
386
  try:
387
- embeddings = self.embedder.encode(texts)
388
  faiss.normalize_L2(embeddings)
389
 
390
  with self.memory_lock:
@@ -392,165 +1353,94 @@ class VectorDatabase:
392
  self.metadata.extend(metadata_list)
393
  self._save_index()
394
 
 
 
 
395
  logger.info(f"💭 Stored {len(texts)} chat history chunks for user {user_id[:8]}")
396
  return True
397
 
398
  except Exception as e:
399
- logger.error(f"Failed to store chat history: {e}")
400
  return False
401
 
402
- def retrieve_session_context(self, query: str, user_id: str, chat_id: str, filter_type: str = None, top_k=100, final_k=5, min_score=0.25):
403
- """
404
- Retrieve context with 'Whole File' capability.
405
- MAINTAINS SEPARATION: Files vs. History
406
- """
407
- if self.index.ntotal == 0 or not user_id:
408
- return []
409
-
410
- # Debug info
411
- with self.memory_lock:
412
- total_vectors = self.index.ntotal
413
- user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
414
-
415
- logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
416
-
417
- # Encode query
418
- query_vec = self.embedder.encode([query])
419
- faiss.normalize_L2(query_vec)
420
-
421
- # Search
422
- search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
423
- with self.memory_lock:
424
- D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
425
-
426
- candidates = []
427
- valid_count = 0
428
- query_lower = query.lower()
429
-
430
- for i, idx in enumerate(I[0]):
431
- if idx == -1 or idx >= len(self.metadata): continue
432
-
433
- item = self.metadata[idx]
434
-
435
- # 1. STRICT ISOLATION (User & Session)
436
- if item.get("user_id") != user_id: continue
437
- if item.get("chat_id") != chat_id: continue
438
-
439
- # 2. INTENT SEPARATION (File vs. History)
440
- # If front-end asks for 'file', we return 'file' (which now includes whole_files).
441
- # If front-end asks for 'history', we return 'history'.
442
- if filter_type and item.get("type") != filter_type:
443
- continue
444
-
445
- score = D[0][i]
446
-
447
- # 3. WHOLE FILE RANKING LOGIC
448
- # We now check 'subtype' instead of 'type'
449
- filename = item.get("source", "").lower()
450
- is_whole_file = item.get("subtype") == "whole_file" # <--- UPDATED CHECK
451
-
452
- if is_whole_file:
453
- if filename in query_lower:
454
- score = 2.0 # Force to top
455
-
456
- if item.get("actual_content"):
457
- item = item.copy()
458
- item["text"] = item["actual_content"]
459
-
460
- # 4. GATEKEEPER
461
- if score < min_score: continue
462
-
463
- candidates.append({
464
- "id": int(idx),
465
- "text": item.get("text", ""),
466
- "meta": item,
467
- "score": score
468
- })
469
- valid_count += 1
470
-
471
- if not candidates: return []
472
-
473
- candidates.sort(key=lambda x: x["score"], reverse=True)
474
-
475
- if candidates[0]["score"] >= 2.0:
476
- logger.info(f"🎯 Returning Whole File: {candidates[0]['meta'].get('source')}")
477
- return candidates[:1]
478
-
479
- try:
480
- rerank_request = RerankRequest(query=query, passages=candidates)
481
- results = self.ranker.rerank(rerank_request)
482
- final_results = [r for r in results[:final_k] if r['score'] > min_score]
483
- return final_results
484
- except Exception as e:
485
- logger.error(f"Reranking failed: {e}")
486
- return candidates[:final_k]
487
-
488
- def delete_session(self, user_id: str, chat_id: str):
489
  """Surgical Strike: Permanently remove ONLY one specific session"""
490
  with self.memory_lock:
491
- # 1. Filter: Keep everything that is NOT this specific chat
492
  new_metadata = []
493
  removed_count = 0
494
 
495
  for meta in self.metadata:
496
- # Check strict ownership and ID match
497
  if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
498
  removed_count += 1
499
  else:
500
  new_metadata.append(meta)
501
 
502
  if removed_count == 0:
503
- return False # Nothing to delete
 
504
 
505
  logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")
506
 
507
- # 2. Rebuild Index (Required for FAISS IndexFlatIP)
508
  if not new_metadata:
509
- self.index = faiss.IndexFlatIP(384) # Reset empty
510
  else:
511
- # Re-embed surviving text to rebuild index
512
- # (Optimization: In a huge DB, use IndexIDMap, but for now this is safe)
513
  surviving_texts = [m["text"] for m in new_metadata]
514
  try:
515
- embeddings = self.embedder.encode(surviving_texts)
516
  faiss.normalize_L2(embeddings)
517
 
518
  new_index = faiss.IndexFlatIP(384)
519
  new_index.add(np.array(embeddings).astype('float32'))
520
  self.index = new_index
521
  except Exception as e:
522
- logger.error(f"Rebuild failed: {e}")
523
  return False
524
 
525
  self.metadata = new_metadata
526
  self._save_index()
 
 
 
 
 
527
  return True
528
-
529
- def get_user_stats(self, user_id: str):
530
  """Get statistics for a user's session"""
531
  with self.memory_lock:
532
  user_vectors = []
533
- for meta in self.metadata:
534
- if meta.get("user_id") == user_id:
535
  user_vectors.append(meta)
536
 
537
  stats = {
538
  "user_id": user_id,
539
  "total_vectors": len(user_vectors),
540
  "by_type": {},
541
- "by_source": {}
 
 
542
  }
543
 
544
- for vec in user_vectors:
545
  vec_type = vec.get("type", "unknown")
546
  source = vec.get("source", "unknown")
 
547
 
548
  stats["by_type"][vec_type] = stats["by_type"].get(vec_type, 0) + 1
549
  stats["by_source"][source] = stats["by_source"].get(source, 0) + 1
 
 
 
 
 
 
 
 
550
 
551
  return stats
552
 
553
- def cleanup_old_sessions(self, max_age_hours=24):
554
  """Clean up old session data"""
555
  current_time = time.time()
556
  cutoff = current_time - (max_age_hours * 3600)
@@ -558,35 +1448,46 @@ class VectorDatabase:
558
  with self.memory_lock:
559
  old_metadata = []
560
  new_metadata = []
 
561
 
562
- for i, meta in enumerate(self.metadata):
563
  if meta.get("timestamp", 0) < cutoff:
564
- old_metadata.append((i, meta))
 
 
 
 
565
  else:
566
  new_metadata.append(meta)
567
 
568
  if not old_metadata:
569
  return 0
570
 
571
- # Rebuild index with only recent vectors
572
  logger.info(f"🧹 Cleaning up {len(old_metadata)} old vectors...")
573
 
574
- # Extract recent texts
575
  recent_texts = [m["text"] for m in new_metadata]
576
 
577
  if recent_texts:
578
- embeddings = self.embedder.encode(recent_texts)
579
- faiss.normalize_L2(embeddings)
580
-
581
- # Create new index
582
- self.index = faiss.IndexFlatIP(384)
583
- self.index.add(np.array(embeddings).astype('float32'))
 
 
 
584
  else:
585
  self.index = faiss.IndexFlatIP(384)
586
 
587
  self.metadata = new_metadata
588
  self._save_index()
589
 
 
 
 
 
 
590
  return len(old_metadata)
591
 
592
  def _cleanup(self):
@@ -594,17 +1495,21 @@ class VectorDatabase:
594
  try:
595
  if hasattr(self, 'file_lock'):
596
  self.file_lock.release()
597
- except:
598
- pass
 
599
 
600
  # Global instance (singleton pattern)
601
  _vdb_instance = None
 
602
 
603
- def get_vector_db():
604
- """Singleton factory for VectorDatabase"""
605
  global _vdb_instance
606
  if _vdb_instance is None:
607
- _vdb_instance = VectorDatabase()
 
 
608
  return _vdb_instance
609
 
610
  # For backward compatibility
 
11
  import re
12
  from filelock import FileLock
13
  import atexit
14
+ import gc
15
+ from typing import List, Dict, Any, Optional, Tuple, Union
16
+ from collections import defaultdict, OrderedDict # <-- FIX 1: Add OrderedDict
17
+
18
+ # === NEW IMPORTS FOR HYBRID SEARCH ===
19
+ try:
20
+ from rank_bm25 import BM25Okapi
21
+ BM25_AVAILABLE = True
22
+ except ImportError:
23
+ BM25_AVAILABLE = False
24
+ logging.warning("BM25 not available. Install: pip install rank-bm25")
25
+
26
+ try:
27
+ import nltk
28
+ from nltk.tokenize import word_tokenize, sent_tokenize
29
+ NLTK_AVAILABLE = True
30
+ except ImportError:
31
+ NLTK_AVAILABLE = False
32
+ logging.warning("NLTK not available. Install: pip install nltk")
33
 
34
  # Configure Logging
35
  logging.basicConfig(
 
44
  self.metadata_path = metadata_path
45
  self.lock_path = index_path + ".lock"
46
 
47
+ # File lock for multi-process safety
48
+ self.file_lock = FileLock(self.lock_path, timeout=60)
49
+ self.memory_lock = threading.RLock()
50
+
51
+ logger.info("🧠 Initializing Production Vector Engine with Hybrid Search...")
52
+
53
+ # Load models with error handling
54
+ try:
55
+ self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
56
+ self.ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="./flashrank_cache")
57
+ except Exception as e:
58
+ logger.error(f"❌ Failed to load models: {e}")
59
+ raise RuntimeError(f"Model initialization failed: {e}")
60
+
61
+ # Load or create index with file locking
62
+ self._load_or_create_index()
63
+
64
+ # === FIX 1: LAZY LOADING & LRU CACHE (Memory Safe) ===
65
+ # REMOVED: self._initialize_bm25_from_metadata() - No OOM on startup!
66
+ # Instead, use LRU Cache to load sessions only when searched
67
+ self.bm25_cache_size = 50 # Limit concurrent BM25 indices in memory
68
+ self.bm25_indices = OrderedDict() # {(user_id, chat_id): BM25Okapi} with LRU
69
+ self.bm25_docs = {} # {(user_id, chat_id): [tokenized_documents]}
70
+ self.bm25_doc_to_vector = {} # {(user_id, chat_id): [vector_ids]}
71
+ self.bm25_lock = threading.RLock()
72
+
73
+ # Performance tracking
74
+ self.query_history = []
75
+ self.performance_stats = {
76
+ "exact_matches": 0,
77
+ "semantic_matches": 0,
78
+ "bm25_matches": 0,
79
+ "hybrid_matches": 0,
80
+ "fallback_matches": 0,
81
+ "avg_retrieval_time": 0
82
+ }
83
+
84
+ # Query type classification stats
85
+ self.query_types = defaultdict(int)
86
+
87
+ # Register cleanup
88
+ atexit.register(self._cleanup)
89
+
90
+ logger.info(f"✅ Vector Engine Ready. Index: {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
91
+ logger.info(f"✅ BM25 LRU Cache: {self.bm25_cache_size} sessions max, BM25 Available: {BM25_AVAILABLE}")
92
+
93
+ # ==================== FIX 2: LAZY BM25 LOADING ====================
94
+
95
+ def _get_or_build_bm25(self, user_id: str, chat_id: str) -> Optional[BM25Okapi]:
96
+ """
97
+ Retrieve BM25 index from cache or build it on-demand (Lazy Load).
98
+ Uses LRU eviction to prevent memory explosion.
99
+ """
100
+ if not BM25_AVAILABLE:
101
+ return None
102
+
103
+ key = (user_id, chat_id)
104
+
105
+ with self.bm25_lock:
106
+ # 1. CACHE HIT: Move to end (mark as recently used)
107
+ if key in self.bm25_indices:
108
+ self.bm25_indices.move_to_end(key)
109
+ return self.bm25_indices[key]
110
+
111
+ # 2. CACHE MISS: Build index on the fly
112
+ logger.debug(f"🔄 Building BM25 index on-demand for session {key}")
113
+
114
+ tokenized_corpus = []
115
+ vector_ids = []
116
+
117
+ # Filter documents for this user only (session isolation)
118
+ with self.memory_lock:
119
+ for idx, meta in enumerate(self.metadata):
120
+ if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
121
+ text = meta.get("text", "")
122
+ tokens = self._tokenize_for_bm25(text)
123
+ if tokens: # Only add non-empty tokenized docs
124
+ tokenized_corpus.append(tokens)
125
+ vector_ids.append(idx)
126
+
127
+ if not tokenized_corpus:
128
+ logger.debug(f"⚠️ No documents found for BM25 index {key}")
129
+ return None
130
+
131
+ # Build BM25 index
132
+ try:
133
+ bm25 = BM25Okapi(tokenized_corpus)
134
+
135
+ # Store additional metadata for scoring
136
+ self.bm25_docs[key] = tokenized_corpus
137
+ self.bm25_doc_to_vector[key] = vector_ids
138
+
139
+ # 3. STORE IN CACHE with LRU EVICTION POLICY
140
+ if len(self.bm25_indices) >= self.bm25_cache_size:
141
+ # Remove oldest entry
142
+ oldest_key, _ = self.bm25_indices.popitem(last=False)
143
+ # Clean up associated data
144
+ if oldest_key in self.bm25_docs:
145
+ del self.bm25_docs[oldest_key]
146
+ if oldest_key in self.bm25_doc_to_vector:
147
+ del self.bm25_doc_to_vector[oldest_key]
148
+ logger.debug(f"🧹 Evicted BM25 cache for session {oldest_key}")
149
+
150
+ self.bm25_indices[key] = bm25
151
+ logger.debug(f"✅ Built BM25 index for session {key}: {len(tokenized_corpus)} docs")
152
+
153
+ return bm25
154
+
155
+ except Exception as e:
156
+ logger.error(f"❌ Failed to build BM25 index for {key}: {e}")
157
+ return None
158
+
159
+ def _invalidate_bm25_cache(self, user_id: str, chat_id: str):
160
+ """
161
+ Invalidate BM25 cache for a session (fast, no rebuild).
162
+ Called when new documents are added.
163
+ """
164
+ key = (user_id, chat_id)
165
+ with self.bm25_lock:
166
+ if key in self.bm25_indices:
167
+ del self.bm25_indices[key]
168
+ if key in self.bm25_docs:
169
+ del self.bm25_docs[key]
170
+ if key in self.bm25_doc_to_vector:
171
+ del self.bm25_doc_to_vector[key]
172
+ logger.debug(f"🧹 Invalidated BM25 cache for session {key}")
173
+
174
+ def _tokenize_for_bm25(self, text: str) -> List[str]:
175
+ """Tokenize text for BM25 with proper handling"""
176
+ if not text or not isinstance(text, str):
177
+ return []
178
+
179
+ # Simple tokenization if NLTK not available
180
+ if not NLTK_AVAILABLE:
181
+ # Basic regex tokenization (fallback)
182
+ tokens = re.findall(r'\b\w{2,}\b', text.lower())
183
+ return tokens
184
+
185
+ try:
186
+ # Use NLTK for better tokenization
187
+ tokens = word_tokenize(text.lower())
188
+ # Filter out very short tokens and keep alphanumeric
189
+ tokens = [t for t in tokens if len(t) >= 2 and re.match(r'^[a-z0-9]+$', t)]
190
+ return tokens
191
+ except Exception as e:
192
+ logger.warning(f"Tokenization failed: {e}, using fallback")
193
+ tokens = re.findall(r'\b\w{2,}\b', text.lower())
194
+ return tokens
195
+
196
+ # ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
197
+
198
+ def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str) -> bool:
199
+ """Store extracted file content with enhanced chunking and cache invalidation"""
200
+ if not text or len(text) < 10 or not user_id:
201
+ logger.warning(f"Invalid input for {filename}")
202
+ return False
203
+
204
+ logger.info(f"📥 Storing {filename} ({len(text)} chars) for user {user_id[:8]}...")
205
+
206
+ chunks_data = []
207
+ ext = os.path.splitext(filename)[1].lower()
208
+
209
+ try:
210
+ # ===== FIX 4: CORRECT METHOD NAMES =====
211
+ if ext == '.py':
212
+ chunks_data = self._chunk_python_ast(text, filename) # <-- Fixed name
213
+ elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml', '.scss']:
214
+ chunks_data = self._chunk_smart_code(text, filename)
215
+ else:
216
+ chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
217
+ except Exception as e:
218
+ logger.error(f"Chunking failed for {filename}: {e}")
219
+ chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
220
+
221
+ if not chunks_data and text:
222
+ chunks_data = [{
223
+ "text": text[:2000],
224
+ "type": "fallback",
225
+ "name": "full_document"
226
+ }]
227
+
228
+ if not chunks_data:
229
+ logger.error(f"No chunks generated for {filename}")
230
+ return False
231
+
232
+ final_texts = []
233
+ final_meta = []
234
+
235
+ for chunk in chunks_data:
236
+ final_texts.append(chunk["text"])
237
+ final_meta.append({
238
+ "text": chunk["text"],
239
+ "source": filename,
240
+ "type": "file",
241
+ "subtype": chunk.get("type", "general"),
242
+ "name": chunk.get("name", "unknown"),
243
+ "user_id": user_id,
244
+ "chat_id": chat_id,
245
+ "timestamp": time.time(),
246
+ "chunk_index": len(final_texts)
247
+ })
248
+
249
+ # Whole file embedding for comprehensive answers
250
+ whole_file_text = text[:4000] if len(text) > 4000 else text
251
+ final_texts.append(f"Complete File: {filename} | Full Content: {whole_file_text}")
252
+ final_meta.append({
253
+ "text": whole_file_text,
254
+ "actual_content": text,
255
+ "source": filename,
256
+ "type": "file",
257
+ "subtype": "whole_file",
258
+ "is_whole_file": True,
259
+ "user_id": user_id,
260
+ "chat_id": chat_id,
261
+ "timestamp": time.time(),
262
+ "chunk_index": -1
263
+ })
264
+
265
+ try:
266
+ # Optimized embedding
267
+ embeddings = self.embedder.encode(
268
+ final_texts,
269
+ show_progress_bar=False,
270
+ batch_size=32,
271
+ convert_to_numpy=True,
272
+ normalize_embeddings=True
273
+ )
274
+
275
+ faiss.normalize_L2(embeddings)
276
+
277
+ with self.memory_lock:
278
+ self.index.add(np.array(embeddings).astype('float32'))
279
+ self.metadata.extend(final_meta)
280
+ self._save_index()
281
+
282
+ logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
283
+
284
+ # ===== FIX 4: CACHE INVALIDATION instead of Immediate Rebuild =====
285
+ # When new files arrive, just invalidate the old cache.
286
+ # It will auto-rebuild (including the new file) on next search.
287
+ self._invalidate_bm25_cache(user_id, chat_id)
288
+
289
+ self._verify_storage(user_id, chat_id, len(final_texts))
290
+
291
+ return True
292
+
293
+ except Exception as e:
294
+ logger.error(f"❌ Failed to store vectors for {filename}: {e}")
295
+ # Clean up partial storage
296
+ with self.memory_lock:
297
+ if self.index.ntotal >= len(final_texts):
298
+ logger.warning("Rolling back partial storage...")
299
+ self._rollback_partial_storage(user_id, chat_id)
300
+ return False
301
+
302
+ # ==================== UPDATED BM25 SEARCH WITH LAZY LOADING ====================
303
+
304
+ def bm25_search(self, query: str, user_id: str, chat_id: str,
305
+ top_k: int = 50, min_score: float = 0.0) -> List[Dict[str, Any]]:
306
+ """
307
+ Pure BM25 search within a session with lazy loading.
308
+ Returns ranked results with BM25 scores.
309
+ """
310
+ if not BM25_AVAILABLE:
311
+ logger.warning("BM25 not available. Falling back to semantic search.")
312
+ return []
313
+
314
+ start_time = time.time()
315
+
316
+ # ===== FIX 3: USE LAZY LOADER =====
317
+ bm25_index = self._get_or_build_bm25(user_id, chat_id)
318
+
319
+ if not bm25_index:
320
+ logger.warning(f"No BM25 index for session {(user_id[:8], chat_id[:8])}")
321
+ return []
322
+
323
+ # Tokenize query
324
+ query_tokens = self._tokenize_for_bm25(query)
325
+ if not query_tokens:
326
+ return []
327
+
328
+ try:
329
+ # Get BM25 scores from the lazy-loaded index
330
+ key = (user_id, chat_id)
331
+ bm25_scores = bm25_index.get_scores(query_tokens)
332
+
333
+ # Get top-k indices
334
+ top_indices = np.argsort(bm25_scores)[::-1][:top_k * 2]
335
+
336
+ results = []
337
+ for idx in top_indices:
338
+ score = float(bm25_scores[idx])
339
+
340
+ # Apply minimum score threshold
341
+ if score < min_score:
342
+ continue
343
+
344
+ # Map BM25 doc index to vector index
345
+ if (key in self.bm25_doc_to_vector and
346
+ idx < len(self.bm25_doc_to_vector[key])):
347
+
348
+ vector_idx = self.bm25_doc_to_vector[key][idx]
349
+ if vector_idx < len(self.metadata):
350
+ meta = self.metadata[vector_idx]
351
+
352
+ # Calculate normalized score (0-1 range)
353
+ normalized_score = min(score / 10.0, 1.0) if score > 0 else 0.0
354
+
355
+ results.append({
356
+ "id": int(vector_idx),
357
+ "text": meta.get("text", ""),
358
+ "meta": meta,
359
+ "score": normalized_score,
360
+ "match_type": "bm25",
361
+ "bm25_raw_score": score,
362
+ "is_whole_file": meta.get("is_whole_file", False)
363
+ })
364
+
365
+ # Sort by BM25 score
366
+ results.sort(key=lambda x: x["score"], reverse=True)
367
+
368
+ elapsed = time.time() - start_time
369
+ logger.debug(f"BM25 search completed in {elapsed:.3f}s: {len(results)} results")
370
+
371
+ return results[:top_k]
372
+
373
+ except Exception as e:
374
+ logger.error(f"BM25 search failed: {e}")
375
+ return []
376
+
377
+ # ==================== HYBRID RETRIEVAL ENGINE (UPDATED) ====================
378
+
379
+ def hybrid_retrieve(self, query: str, user_id: str, chat_id: str,
380
+ filter_type: str = None, top_k: int = 100,
381
+ final_k: int = 5, strategy: str = "smart") -> List[Dict[str, Any]]:
382
+ """
383
+ HYBRID RETRIEVAL: BM25 + Semantic + Exact Fusion
384
+ Now with lazy-loaded BM25 indices for memory safety.
385
+ """
386
+ logger.info(f"🤖 HYBRID SEARCH: '{query[:80]}...' | Strategy: {strategy}")
387
+
388
+ # Classify query type
389
+ query_category = self._classify_query(query)
390
+ self.query_types[query_category] += 1
391
+
392
+ # Choose strategy based on query type if "smart"
393
+ if strategy == "smart":
394
+ if query_category == "code":
395
+ strategy = "bm25_first"
396
+ elif query_category == "natural":
397
+ strategy = "semantic_first"
398
+ else:
399
+ strategy = "fusion"
400
+
401
+ start_time = time.time()
402
+
403
+ # === PHASE 1: GET RESULTS FROM BOTH METHODS ===
404
+ bm25_results = []
405
+ semantic_results = []
406
+
407
+ if strategy in ["bm25_first", "fusion", "weighted", "smart"]:
408
+ bm25_results = self.bm25_search(
409
+ query=query,
410
+ user_id=user_id,
411
+ chat_id=chat_id,
412
+ top_k=top_k * 2,
413
+ min_score=0.1
414
+ )
415
+
416
+ if strategy in ["semantic_first", "fusion", "weighted", "smart"]:
417
+ semantic_results = self._semantic_search(
418
+ query=query,
419
+ user_id=user_id,
420
+ chat_id=chat_id,
421
+ filter_type=filter_type,
422
+ top_k=top_k * 2,
423
+ min_score=0.1,
424
+ final_k=top_k
425
+ )
426
+
427
+ # === PHASE 2: APPLY STRATEGY ===
428
+ if strategy == "bm25_first":
429
+ results = self._bm25_first_fusion(bm25_results, semantic_results, final_k)
430
+ elif strategy == "semantic_first":
431
+ results = self._semantic_first_fusion(semantic_results, bm25_results, final_k)
432
+ elif strategy == "fusion":
433
+ results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
434
+ elif strategy == "weighted":
435
+ results = self._weighted_fusion(bm25_results, semantic_results, final_k)
436
+ else:
437
+ # Default to fusion
438
+ results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
439
+
440
+ # === PHASE 3: EXACT FALLBACK IF NO RESULTS ===
441
+ if not results:
442
+ logger.info("🔄 No hybrid results, trying exact fallback...")
443
+ results = self.retrieve_exact(
444
+ query=query,
445
+ user_id=user_id,
446
+ chat_id=chat_id,
447
+ filter_type=filter_type,
448
+ aggressive=True
449
+ )
450
+ if results:
451
+ self.performance_stats["fallback_matches"] += 1
452
+ return results[:final_k]
453
+
454
+ # === PHASE 4: SMART RERANKING ===
455
+ if results and len(results) > 1:
456
+ try:
457
+ results = self._smart_rerank(query, results, final_k)
458
+ except Exception as e:
459
+ logger.warning(f"Reranking failed: {e}")
460
+
461
+ # === PHASE 5: FINAL PROCESSING ===
462
+ elapsed = time.time() - start_time
463
+
464
+ # Boost whole files for complete answers
465
+ for result in results:
466
+ if result.get("is_whole_file"):
467
+ result["score"] = min(result["score"] * 1.2, 1.0)
468
+
469
+ # Ensure scores are in 0-1 range
470
+ for result in results:
471
+ result["score"] = min(max(result["score"], 0.0), 1.0)
472
+
473
+ # Sort by final score
474
+ results.sort(key=lambda x: x["score"], reverse=True)
475
+
476
+ # Update performance stats
477
+ if results:
478
+ self.performance_stats["hybrid_matches"] += 1
479
+ logger.info(f"✅ Hybrid search found {len(results)} results in {elapsed:.3f}s")
480
+ logger.info(f"🏆 Top score: {results[0]['score']:.3f}, Type: {results[0].get('match_type', 'unknown')}")
481
+ else:
482
+ logger.warning(f"❌ Hybrid search found no results")
483
+
484
+ return results[:final_k]
485
+
486
+ # ==================== CORE METHODS (PRESERVED WITH FIXES) ====================
487
+
488
+ def _chunk_python_ast(self, text: str, filename: str) -> List[Dict[str, Any]]:
489
+ """Enhanced AST chunker with better context preservation"""
490
+ chunks = []
491
+ try:
492
+ tree = ast.parse(text)
493
+ lines = text.splitlines()
494
+
495
+ global_context = []
496
+
497
+ for node in tree.body:
498
+ if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
499
+ start = max(0, node.lineno - 4)
500
+ end = node.end_lineno + 2
501
+ block_text = "\n".join(lines[start:end])
502
+
503
+ chunks.append({
504
+ "text": f"File: {filename} | Type: {type(node).__name__} | Name: {node.name} | Content: {block_text}",
505
+ "type": "code_function",
506
+ "name": node.name,
507
+ "line_start": start,
508
+ "line_end": end
509
+ })
510
+ elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
511
+ if hasattr(node, 'end_lineno'):
512
+ start = node.lineno - 1
513
+ end = node.end_lineno
514
+ global_context.append("\n".join(lines[start:end]))
515
+
516
+ # Add global context as a separate chunk
517
+ if global_context:
518
+ full_global = "\n".join(global_context)
519
+ if len(full_global) > 50:
520
+ chunks.insert(0, {
521
+ "text": f"File: {filename} | Type: imports_and_globals | Content: {full_global[:2000]}",
522
+ "type": "code_context",
523
+ "name": "imports_and_globals"
524
+ })
525
+
526
+ except Exception as e:
527
+ logger.warning(f"AST parsing failed for {filename}: {e}")
528
+ return self._chunk_text_enhanced(text)
529
+
530
+ if not chunks:
531
+ return self._chunk_text_enhanced(text)
532
+
533
+ return chunks
534
+
535
+ def _chunk_smart_code(self, text: str, filename: str) -> List[Dict[str, Any]]:
536
+ """ENHANCED Structure-aware chunker with context preservation"""
537
+ ext = os.path.splitext(filename)[1].lower()
538
+ chunks = []
539
+
540
+ # Define split patterns for different languages
541
+ patterns = {
542
+ '.html': r'(?=\n\s*<[^/])',
543
+ '.htm': r'(?=\n\s*<[^/])',
544
+ '.xml': r'(?=\n\s*<[^/])',
545
+ '.vue': r'(?=\n\s*<[^/])',
546
+ '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
547
+ '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
548
+ '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
549
+ '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
550
+ '.css': r'(?=\n\s*[.#@a-zA-Z])',
551
+ '.scss': r'(?=\n\s*[.#@a-zA-Z])',
552
+ '.java': r'(?=\n\s*(?:public|private|protected|class|interface|enum|@))',
553
+ '.cpp': r'(?=\n\s*(?:#include|using|namespace|class|struct|enum|template))',
554
+ }
555
+
556
+ pattern = patterns.get(ext)
557
+
558
+ # Fallback to standard if no pattern matches or regex fails
559
+ if not pattern:
560
+ return self._chunk_text_enhanced(text)
561
+
562
+ try:
563
+ segments = re.split(pattern, text)
564
+
565
+ # Process with CONTEXT OVERLAP for better retrieval
566
+ current_chunk = ""
567
+ TARGET_SIZE = 800
568
+ OVERLAP_SIZE = 100
569
+
570
+ for seg_idx, seg in enumerate(segments):
571
+ if not seg.strip():
572
+ continue
573
+
574
+ # Check if adding this segment would exceed target
575
+ if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 50:
576
+ # Save current chunk
577
+ chunk_text = current_chunk.strip()
578
+ if chunk_text:
579
+ chunks.append({
580
+ "text": f"File: {filename} | Content: {chunk_text}",
581
+ "type": "code_block",
582
+ "name": f"block_{len(chunks)}",
583
+ "context_id": seg_idx
584
+ })
585
+
586
+ # Start new chunk with overlap from previous
587
+ current_chunk = current_chunk[-OVERLAP_SIZE:] + "\n" + seg if OVERLAP_SIZE > 0 else seg
588
+ else:
589
+ current_chunk += seg
590
+
591
+ # Add final chunk
592
+ if current_chunk:
593
+ chunks.append({
594
+ "text": f"File: {filename} | Content: {current_chunk.strip()}",
595
+ "type": "code_block",
596
+ "name": f"block_{len(chunks)}",
597
+ "context_id": len(segments)
598
+ })
599
+
600
+ return chunks
601
+ except Exception as e:
602
+ logger.warning(f"Smart chunking failed for {filename}: {e}. Falling back.")
603
+ return self._chunk_text_enhanced(text)
604
+
605
+ def _chunk_text_enhanced(self, text: str, chunk_size: int = 600, overlap: int = 100) -> List[Dict[str, Any]]:
606
+ """Enhanced text chunking that preserves natural boundaries"""
607
+ chunks = []
608
+
609
+ # Try to split by paragraphs first
610
+ paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
611
+
612
+ if not paragraphs:
613
+ # Fallback to standard chunking
614
+ return self._chunk_text_standard(text, chunk_size, overlap)
615
+
616
+ current_chunk = ""
617
+ for para in paragraphs:
618
+ if len(current_chunk) + len(para) > chunk_size and current_chunk:
619
+ chunks.append({
620
+ "text": current_chunk.strip(),
621
+ "type": "text_paragraph",
622
+ "name": f"para_{len(chunks)}"
623
+ })
624
+ # Keep last overlap portion
625
+ current_chunk = current_chunk[-overlap:] + "\n\n" + para if overlap > 0 else para
626
+ else:
627
+ current_chunk += "\n\n" + para if current_chunk else para
628
+
629
+ if current_chunk:
630
+ chunks.append({
631
+ "text": current_chunk.strip(),
632
+ "type": "text_paragraph",
633
+ "name": f"para_{len(chunks)}"
634
+ })
635
+
636
+ return chunks
637
+
638
+ def _chunk_text_standard(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict[str, Any]]:
639
+ """Standard text chunking with sliding window"""
640
+ chunks = []
641
+
642
+ if len(text) <= chunk_size:
643
+ return [{
644
+ "text": text,
645
+ "type": "text_block",
646
+ "name": "full_content"
647
+ }]
648
+
649
+ for i in range(0, len(text), chunk_size - overlap):
650
+ chunk = text[i:i + chunk_size]
651
+ if len(chunk) > 100:
652
+ chunks.append({
653
+ "text": chunk,
654
+ "type": "text_block",
655
+ "name": f"chunk_{i//chunk_size}"
656
+ })
657
+
658
+ return chunks
659
+
660
+ # ==================== HELPER METHODS FOR HYBRID SEARCH ====================
661
+
662
+ def _classify_query(self, query: str) -> str:
663
+ """Classify query type to determine best search strategy"""
664
+ query_lower = query.lower()
665
+
666
+ # Code/technical query indicators
667
+ code_indicators = [
668
+ r'def\s+\w+\(', r'class\s+\w+', r'function\s+\w+',
669
+ r'import\s+', r'from\s+', r'\.py$', r'\.js$', r'\.java$',
670
+ r'\w+\(.*\)', r'\{.*\}', r'\[.*\]', r'=\s*\w+',
671
+ r'const\s+', r'let\s+', r'var\s+', r'type\s+',
672
+ r'interface\s+', r'export\s+', r'async\s+', r'await\s+',
673
+ r'SELECT\s+', r'FROM\s+', r'WHERE\s+', r'JOIN\s+',
674
+ r'#include', r'using\s+', r'namespace\s+', r'template\s+'
675
+ ]
676
+
677
+ for pattern in code_indicators:
678
+ if re.search(pattern, query_lower):
679
+ return "code"
680
+
681
+ # Natural language query indicators
682
+ natural_indicators = [
683
+ r'^how\s+', r'^what\s+', r'^why\s+', r'^explain\s+',
684
+ r'^describe\s+', r'^summarize\s+', r'^tell\s+me\s+about',
685
+ r'\?$', r'please', r'could you', r'would you',
686
+ r'understand', r'meaning', r'concept', r'idea'
687
+ ]
688
+
689
+ for pattern in natural_indicators:
690
+ if re.search(pattern, query_lower):
691
+ return "natural"
692
+
693
+ # Short keyword query (good for BM25)
694
+ words = query.split()
695
+ if len(words) <= 4 and len(query) < 30:
696
+ return "keyword"
697
+
698
+ # Mixed query
699
+ return "mixed"
700
+
701
+ def _bm25_first_fusion(self, bm25_results: List[Dict], semantic_results: List[Dict],
702
+ final_k: int) -> List[Dict]:
703
+ """BM25 first, supplement with semantic if needed"""
704
+ results = bm25_results.copy()
705
+
706
+ # If BM25 results are weak, add semantic results
707
+ if not results or (results[0]["score"] < 0.3):
708
+ seen_ids = set(r["id"] for r in results)
709
+ for sem in semantic_results:
710
+ if sem["id"] not in seen_ids and len(results) < final_k * 2:
711
+ seen_ids.add(sem["id"])
712
+ sem["match_type"] = "semantic_supplement"
713
+ results.append(sem)
714
+
715
+ return results[:final_k]
716
+
717
+ def _semantic_first_fusion(self, semantic_results: List[Dict], bm25_results: List[Dict],
718
+ final_k: int) -> List[Dict]:
719
+ """Semantic first, supplement with BM25 if needed"""
720
+ results = semantic_results.copy()
721
+
722
+ # If semantic results are weak, add BM25 results
723
+ if not results or (results[0]["score"] < 0.3):
724
+ seen_ids = set(r["id"] for r in results)
725
+ for bm in bm25_results:
726
+ if bm["id"] not in seen_ids and len(results) < final_k * 2:
727
+ seen_ids.add(bm["id"])
728
+ bm["match_type"] = "bm25_supplement"
729
+ results.append(bm)
730
+
731
+ return results[:final_k]
732
+
733
+ def _reciprocal_rank_fusion(self, results1: List[Dict], results2: List[Dict],
734
+ final_k: int, k: int = 60) -> List[Dict]:
735
+ """Combine results using Reciprocal Rank Fusion (RRF)"""
736
+ # Create rank dictionaries
737
+ rank_map1 = {r["id"]: rank + 1 for rank, r in enumerate(results1)}
738
+ rank_map2 = {r["id"]: rank + 1 for rank, r in enumerate(results2)}
739
+
740
+ # Get all unique IDs
741
+ all_ids = set(rank_map1.keys()) | set(rank_map2.keys())
742
+
743
+ # Calculate RRF scores
744
+ rrf_scores = []
745
+ for doc_id in all_ids:
746
+ score = 0.0
747
+ if doc_id in rank_map1:
748
+ score += 1.0 / (rank_map1[doc_id] + k)
749
+ if doc_id in rank_map2:
750
+ score += 1.0 / (rank_map2[doc_id] + k)
751
+ rrf_scores.append((doc_id, score))
752
+
753
+ # Sort by RRF score
754
+ rrf_scores.sort(key=lambda x: x[1], reverse=True)
755
+
756
+ # Create result mapping for quick lookup
757
+ results_map = {}
758
+ for r in results1 + results2:
759
+ if r["id"] not in results_map:
760
+ results_map[r["id"]] = r
761
+
762
+ # Build final results
763
+ combined_results = []
764
+ for doc_id, rrf_score in rrf_scores:
765
+ if doc_id in results_map:
766
+ result = results_map[doc_id].copy()
767
+ result["score"] = rrf_score
768
+ result["match_type"] = "rrf_fusion"
769
+ combined_results.append(result)
770
+
771
+ return combined_results[:final_k]
772
+
773
+ def _weighted_fusion(self, bm25_results: List[Dict], semantic_results: List[Dict],
774
+ final_k: int, bm25_weight: float = 0.4,
775
+ semantic_weight: float = 0.6) -> List[Dict]:
776
+ """Weighted combination of BM25 and semantic scores"""
777
+ # Normalize scores within each result set
778
+ def normalize_scores(results):
779
+ if not results:
780
+ return {}
781
+ max_score = max(r["score"] for r in results) if results else 1.0
782
+ if max_score == 0:
783
+ max_score = 1.0
784
+ return {r["id"]: r["score"] / max_score for r in results}
785
+
786
+ bm25_scores = normalize_scores(bm25_results)
787
+ semantic_scores = normalize_scores(semantic_results)
788
+
789
+ # Get all unique IDs
790
+ all_ids = set(bm25_scores.keys()) | set(semantic_scores.keys())
791
+
792
+ # Calculate weighted scores
793
+ weighted_scores = []
794
+ for doc_id in all_ids:
795
+ bm25_score = bm25_scores.get(doc_id, 0.0)
796
+ semantic_score = semantic_scores.get(doc_id, 0.0)
797
+ weighted = (bm25_score * bm25_weight) + (semantic_score * semantic_weight)
798
+ weighted_scores.append((doc_id, weighted))
799
+
800
+ # Sort by weighted score
801
+ weighted_scores.sort(key=lambda x: x[1], reverse=True)
802
+
803
+ # Create result mapping
804
+ results_map = {}
805
+ for r in bm25_results + semantic_results:
806
+ if r["id"] not in results_map:
807
+ results_map[r["id"]] = r
808
+
809
+ # Build final results
810
+ combined_results = []
811
+ for doc_id, weighted_score in weighted_scores:
812
+ if doc_id in results_map:
813
+ result = results_map[doc_id].copy()
814
+ result["score"] = weighted_score
815
+ result["match_type"] = "weighted_fusion"
816
+ combined_results.append(result)
817
+
818
+ return combined_results[:final_k]
819
+
820
+ def _smart_rerank(self, query: str, candidates: List[Dict], final_k: int) -> List[Dict]:
821
+ """Smart reranking using cross-encoder"""
822
+ if len(candidates) <= 1:
823
+ return candidates
824
+
825
+ try:
826
+ # Prepare passages for reranking
827
+ passages = []
828
+ for cand in candidates[:30]:
829
+ text = cand.get("text", "")
830
+ if len(text) > 1000:
831
+ text = text[:1000] + "..."
832
+
833
+ source = cand.get("meta", {}).get("source", "unknown")
834
+ subtype = cand.get("meta", {}).get("subtype", "general")
835
+
836
+ passages.append({
837
+ "id": cand["id"],
838
+ "text": f"File: {source} | Type: {subtype} | Content: {text}"
839
+ })
840
+
841
+ if not passages:
842
+ return candidates
843
+
844
+ # Rerank with FlashRank
845
+ rerank_request = RerankRequest(query=query, passages=passages)
846
+ reranked = self.ranker.rerank(rerank_request)
847
+
848
+ # Update scores based on reranking
849
+ rerank_map = {r["id"]: r["score"] for r in reranked}
850
+
851
+ for cand in candidates:
852
+ if cand["id"] in rerank_map:
853
+ cand["score"] = (cand["score"] * 0.3) + (rerank_map[cand["id"]] * 0.7)
854
+ cand["match_type"] = cand.get("match_type", "unknown") + "_reranked"
855
+
856
+ candidates.sort(key=lambda x: x["score"], reverse=True)
857
+
858
+ logger.debug(f"Smart reranking applied to {len(candidates)} candidates")
859
+
860
+ except Exception as e:
861
+ logger.warning(f"Reranking error: {e}")
862
+
863
+ return candidates[:final_k]
864
+
865
+ # ==================== COMPATIBILITY METHODS (UPDATED) ====================
866
+
867
+ def retrieve_session_context(self, query: str, user_id: str, chat_id: str,
868
+ filter_type: str = None, top_k: int = 100,
869
+ final_k: int = 5, min_score: float = 0.25,
870
+ use_hybrid: bool = True) -> List[Dict[str, Any]]:
871
+ """
872
+ Enhanced retrieval with hybrid capabilities
873
+ use_hybrid: Whether to use hybrid search (BM25 + semantic)
874
+ """
875
+ # Use hybrid search by default if available
876
+ if use_hybrid and BM25_AVAILABLE:
877
+ return self.hybrid_retrieve(
878
+ query=query,
879
+ user_id=user_id,
880
+ chat_id=chat_id,
881
+ filter_type=filter_type,
882
+ top_k=top_k,
883
+ final_k=final_k,
884
+ strategy="smart"
885
+ )
886
+
887
+ # Fall back to original semantic search
888
+ return self._semantic_search(
889
+ query=query,
890
+ user_id=user_id,
891
+ chat_id=chat_id,
892
+ filter_type=filter_type,
893
+ top_k=top_k,
894
+ min_score=min_score,
895
+ final_k=final_k
896
+ )
897
+
898
+ def _semantic_search(self, query: str, user_id: str, chat_id: str,
899
+ filter_type: str = None, top_k: int = 100,
900
+ min_score: float = 0.25, final_k: int = 10) -> List[Dict[str, Any]]:
901
+ """Core semantic search engine"""
902
+ with self.memory_lock:
903
+ total_vectors = self.index.ntotal
904
+ user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id and m.get("chat_id") == chat_id)
905
+
906
+ if total_vectors == 0 or user_vectors == 0:
907
+ return []
908
+
909
+ try:
910
+ query_vec = self.embedder.encode([query], show_progress_bar=False)
911
+ faiss.normalize_L2(query_vec)
912
+ except Exception as e:
913
+ logger.error(f"❌ Failed to encode query: {e}")
914
+ return []
915
+
916
+ search_k = min(top_k * 2, total_vectors)
917
+ if search_k == 0:
918
+ search_k = min(10, total_vectors)
919
+
920
+ try:
921
+ with self.memory_lock:
922
+ if self.index.ntotal == 0:
923
+ return []
924
+ D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
925
+ except Exception as e:
926
+ logger.error(f"❌ Search failed: {e}")
927
+ return []
928
+
929
+ candidates = []
930
+ query_lower = query.lower()
931
+
932
+ for i, idx in enumerate(I[0]):
933
+ if idx == -1 or idx >= len(self.metadata):
934
+ continue
935
+
936
+ item = self.metadata[idx]
937
+
938
+ # Filter by user and chat
939
+ if item.get("user_id") != user_id or item.get("chat_id") != chat_id:
940
+ continue
941
+
942
+ # Filter by type if specified
943
+ if filter_type and item.get("type") != filter_type:
944
+ continue
945
+
946
+ score = float(D[0][i])
947
+
948
+ if np.isnan(score) or np.isinf(score):
949
+ continue
950
+
951
+ # Whole file boosting
952
+ is_whole_file = item.get("is_whole_file", False) or item.get("subtype") == "whole_file"
953
+ if is_whole_file:
954
+ filename = item.get("source", "").lower()
955
+ if filename in query_lower or any(word in filename for word in query_lower.split()):
956
+ score = 2.5
957
+
958
+ if item.get("actual_content"):
959
+ item = item.copy()
960
+ item["text"] = item["actual_content"]
961
+
962
+ if score < min_score:
963
+ continue
964
+
965
+ candidates.append({
966
+ "id": int(idx),
967
+ "text": item.get("text", ""),
968
+ "meta": item,
969
+ "score": score
970
+ })
971
+
972
+ return candidates
973
+
974
+ def retrieve_exact(self, query: str, user_id: str, chat_id: str,
975
+ filter_type: str = None, aggressive: bool = True) -> List[Dict[str, Any]]:
976
+ """PRIMARY EXACT MATCH RETRIEVAL - Accuracy First!"""
977
+ start_time = time.time()
978
+ query_lower = query.lower().strip()
979
+
980
+ if self.index.ntotal == 0 or not user_id:
981
+ logger.warning(f"❌ Empty index or invalid user_id")
982
+ return []
983
+
984
+ logger.info(f"🎯 EXACT MODE: Searching for '{query[:80]}...'")
985
+
986
+ all_candidates = []
987
+ exact_matches = []
988
+
989
+ # TACTIC 1: BRUTE FORCE SUBSTRING SEARCH
990
+ logger.debug("🔍 Tactic 1: Brute force substring search...")
991
+ with self.memory_lock:
992
+ for idx, meta in enumerate(self.metadata):
993
+ if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
994
+ continue
995
+
996
+ if filter_type and meta.get("type") != filter_type:
997
+ continue
998
+
999
+ text = meta.get("text", "").lower()
1000
+ actual_content = meta.get("actual_content", "").lower()
1001
+
1002
+ if query_lower in text or query_lower in actual_content:
1003
+ score = 3.0
1004
+ match_type = "exact_substring"
1005
+
1006
+ display_text = meta.get("actual_content", meta.get("text", ""))
1007
+
1008
+ exact_matches.append({
1009
+ "id": idx,
1010
+ "text": display_text,
1011
+ "meta": meta,
1012
+ "score": score,
1013
+ "match_type": match_type,
1014
+ "confidence": "perfect"
1015
+ })
1016
+
1017
+ if exact_matches:
1018
+ logger.info(f"✨ Found {len(exact_matches)} PERFECT exact matches!")
1019
+ self.performance_stats["exact_matches"] += 1
1020
+
1021
+ exact_matches.sort(key=lambda x: (
1022
+ 1 if x["meta"].get("is_whole_file") else 0,
1023
+ x["score"]
1024
+ ), reverse=True)
1025
+
1026
+ elapsed = time.time() - start_time
1027
+ logger.info(f"⚡ Exact match retrieval took {elapsed:.3f}s")
1028
+ return exact_matches[:3]
1029
+
1030
+ # TACTIC 2: KEYWORD MATCHING
1031
+ if aggressive:
1032
+ logger.debug("🔍 Tactic 2: Aggressive keyword matching...")
1033
+ keywords = [w for w in re.findall(r'\b\w{3,}\b', query_lower) if len(w) > 2]
1034
+
1035
+ if keywords:
1036
+ with self.memory_lock:
1037
+ for idx, meta in enumerate(self.metadata):
1038
+ if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
1039
+ continue
1040
+ if filter_type and meta.get("type") != filter_type:
1041
+ continue
1042
+
1043
+ text = meta.get("text", "").lower()
1044
+ keyword_matches = sum(1 for kw in keywords if kw in text)
1045
+
1046
+ if keyword_matches >= max(1, len(keywords) * 0.6):
1047
+ score = 2.0 + (keyword_matches / len(keywords)) * 0.5
1048
+ all_candidates.append({
1049
+ "id": idx,
1050
+ "text": meta.get("actual_content", meta.get("text", "")),
1051
+ "meta": meta,
1052
+ "score": score,
1053
+ "match_type": "keyword_explosion",
1054
+ "keyword_match_ratio": keyword_matches / len(keywords)
1055
+ })
1056
+
1057
+ # TACTIC 3: SEMANTIC SEARCH WITH LOW THRESHOLD
1058
+ logger.debug("🔍 Tactic 3: Semantic search with low threshold...")
1059
+ semantic_results = self._semantic_search(
1060
+ query=query,
1061
+ user_id=user_id,
1062
+ chat_id=chat_id,
1063
+ filter_type=filter_type,
1064
+ top_k=200,
1065
+ min_score=0.1,
1066
+ final_k=30
1067
+ )
1068
+
1069
+ for res in semantic_results:
1070
+ res["match_type"] = "semantic_low_threshold"
1071
+ all_candidates.append(res)
1072
+
1073
+ # DEDUPLICATE AND RANK
1074
+ seen_ids = set()
1075
+ unique_candidates = []
1076
+
1077
+ for candidate in all_candidates:
1078
+ if candidate["id"] not in seen_ids:
1079
+ seen_ids.add(candidate["id"])
1080
+ unique_candidates.append(candidate)
1081
+
1082
+ unique_candidates.sort(key=lambda x: x["score"], reverse=True)
1083
+
1084
+ # Apply reranking if available
1085
+ if unique_candidates:
1086
+ try:
1087
+ passages = []
1088
+ for cand in unique_candidates[:50]:
1089
+ text_for_rerank = cand["text"]
1090
+ if len(text_for_rerank) > 1000:
1091
+ text_for_rerank = text_for_rerank[:1000] + "..."
1092
+
1093
+ passages.append({
1094
+ "id": cand["id"],
1095
+ "text": text_for_rerank
1096
+ })
1097
+
1098
+ if passages:
1099
+ rerank_request = RerankRequest(query=query, passages=passages)
1100
+ reranked = self.ranker.rerank(rerank_request)
1101
+
1102
+ rerank_map = {r["id"]: r["score"] for r in reranked}
1103
+ for cand in unique_candidates:
1104
+ if cand["id"] in rerank_map:
1105
+ cand["score"] = cand["score"] * 0.3 + rerank_map[cand["id"]] * 0.7
1106
+
1107
+ unique_candidates.sort(key=lambda x: x["score"], reverse=True)
1108
+
1109
+ except Exception as e:
1110
+ logger.warning(f"⚠️ Reranking failed: {e}")
1111
+
1112
+ # FINAL SELECTION
1113
+ final_results = []
1114
+ confidence_threshold = 0.4 if aggressive else 0.5
1115
 
1116
+ for cand in unique_candidates[:10]:
1117
+ if cand["score"] >= confidence_threshold:
1118
+ final_results.append(cand)
1119
 
1120
+ if final_results:
1121
+ self.performance_stats["semantic_matches"] += 1
1122
+ logger.info(f"✅ Found {len(final_results)} relevant results")
1123
+
1124
+ top_match = final_results[0]
1125
+ logger.info(f"🏆 Top match: Score={top_match['score']:.3f}, Type={top_match.get('match_type', 'unknown')}")
1126
+
1127
+ if top_match["meta"].get("is_whole_file"):
1128
+ logger.info(f"📄 Returning whole file: {top_match['meta'].get('source', 'unknown')}")
1129
 
1130
+ elapsed = time.time() - start_time
1131
+ logger.info(f"⏱️ Exact retrieval completed in {elapsed:.3f}s")
1132
 
1133
+ # Store in query history
1134
+ self.query_history.append({
1135
+ "query": query[:100],
1136
+ "timestamp": time.time(),
1137
+ "results_count": len(final_results),
1138
+ "top_score": final_results[0]["score"] if final_results else 0,
1139
+ "elapsed_time": elapsed,
1140
+ "method": "exact"
1141
+ })
1142
 
1143
+ if len(self.query_history) > 1000:
1144
+ self.query_history = self.query_history[-500:]
1145
+
1146
+ return final_results[:5]
1147
+
1148
+ # ==================== INFRASTRUCTURE METHODS ====================
1149
 
1150
  def _load_or_create_index(self):
1151
  """Thread-safe and process-safe index loading/creation"""
 
1154
  try:
1155
  logger.info("📂 Loading existing vector index...")
1156
  self.index = faiss.read_index(self.index_path)
1157
+
1158
+ if self.index.ntotal < 0:
1159
+ raise ValueError("Corrupt index: negative vector count")
1160
+
1161
  with open(self.metadata_path, "rb") as f:
1162
  self.metadata = pickle.load(f)
1163
+
1164
+ if len(self.metadata) != self.index.ntotal:
1165
+ logger.error(f"⚠️ Metadata mismatch: {len(self.metadata)} entries vs {self.index.ntotal} vectors. Rebuilding...")
1166
+ self._create_new_index()
1167
+ return
1168
+
1169
  logger.info(f"✅ Loaded index with {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
1170
  except Exception as e:
1171
  logger.error(f"⚠️ Failed to load index: {e}. Creating new one.")
 
1175
  self._create_new_index()
1176
 
1177
  def _create_new_index(self):
1178
+ """Create fresh IndexFlatIP for cosine similarity"""
1179
  dimension = 384
1180
+ self.index = faiss.IndexFlatIP(dimension)
 
 
1181
  self.metadata = []
1182
  logger.info(f"🆕 Created new IndexFlatIP with dimension {dimension}")
1183
 
1184
  def _save_index(self):
1185
  """Thread-safe and process-safe index saving with atomic writes"""
1186
  with self.file_lock:
 
1187
  temp_index = f"{self.index_path}.tmp"
1188
  temp_meta = f"{self.metadata_path}.tmp"
1189
 
1190
  try:
 
1191
  faiss.write_index(self.index, temp_index)
1192
  with open(temp_meta, "wb") as f:
1193
  pickle.dump(self.metadata, f)
1194
 
 
1195
  os.replace(temp_index, self.index_path)
1196
  os.replace(temp_meta, self.metadata_path)
1197
 
1198
+ logger.info(f"💾 Saved index: {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
1199
  except Exception as e:
1200
  logger.error(f"❌ Failed to save index: {e}")
 
1201
  for f in [temp_index, temp_meta]:
1202
  if os.path.exists(f):
1203
  try:
1204
  os.remove(f)
1205
+ except Exception:
1206
+ logger.warning(f"Failed to remove temp file: {f}")
1207
+ finally:
1208
+ gc.collect()
1209
 
1210
+ def _rollback_partial_storage(self, user_id: str, chat_id: str):
1211
+ """Remove partially stored vectors for a session"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
  try:
1213
+ new_metadata = []
1214
+ surviving_texts = []
1215
 
1216
+ for meta in self.metadata:
1217
+ if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
1218
+ new_metadata.append(meta)
1219
+ surviving_texts.append(meta["text"])
1220
 
1221
+ if len(new_metadata) == len(self.metadata):
1222
+ return
 
 
 
 
 
 
 
 
 
 
 
1223
 
1224
+ if surviving_texts:
1225
+ embeddings = self.embedder.encode(surviving_texts, show_progress_bar=False)
1226
+ faiss.normalize_L2(embeddings)
 
 
 
 
1227
 
1228
+ new_index = faiss.IndexFlatIP(384)
1229
+ new_index.add(np.array(embeddings).astype('float32'))
1230
+ self.index = new_index
1231
+ else:
1232
+ self.index = faiss.IndexFlatIP(384)
1233
 
1234
+ self.metadata = new_metadata
1235
+ self._save_index()
 
 
 
 
 
 
 
 
1236
 
1237
+ # Invalidate BM25 cache
1238
+ self._invalidate_bm25_cache(user_id, chat_id)
1239
 
1240
+ logger.info(f"🔄 Rolled back partial storage for user {user_id[:8]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1241
 
 
 
 
 
 
 
 
 
 
 
 
1242
  except Exception as e:
1243
+ logger.error(f" Rollback failed: {e}")
1244
+ self._create_new_index()
 
 
 
 
 
 
1245
 
1246
+ def _verify_storage(self, user_id: str, chat_id: str, expected_count: int):
1247
+ """Verify vectors were stored correctly"""
1248
+ with self.memory_lock:
1249
+ user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id and m.get("chat_id") == chat_id)
 
 
 
 
 
 
 
1250
 
1251
+ logger.info(f"🔍 Storage verification: User {user_id[:8]} has {user_vectors} vectors (expected: {expected_count})")
 
 
 
 
 
 
 
 
1252
 
1253
+ if user_vectors < expected_count:
1254
+ logger.warning(f"⚠️ Storage mismatch for user {user_id[:8]}")
1255
 
1256
+ # ==================== ANALYTICS & ADMIN METHODS ====================
1257
+
1258
+ def get_retrieval_analytics(self, query: str = None) -> Dict[str, Any]:
1259
+ """Get detailed analytics about retrieval performance"""
1260
+ analytics = {
1261
+ "performance_stats": self.performance_stats.copy(),
1262
+ "query_types": dict(self.query_types),
1263
+ "query_history_count": len(self.query_history),
1264
+ "index_stats": {
1265
+ "total_vectors": self.index.ntotal,
1266
+ "metadata_count": len(self.metadata),
1267
+ "avg_metadata_size": 0,
1268
+ "bm25_cache_size": len(self.bm25_indices),
1269
+ "bm25_cache_capacity": self.bm25_cache_size,
1270
+ "bm25_available": BM25_AVAILABLE,
1271
+ "nltk_available": NLTK_AVAILABLE
1272
+ },
1273
+ "recent_queries": [],
1274
+ "cache_stats": {
1275
+ "bm25_cache_hits": 0, # Could be tracked with more instrumentation
1276
+ "bm25_cache_misses": 0
1277
+ }
1278
+ }
 
 
 
 
 
 
 
 
 
 
 
 
1279
 
1280
+ if self.metadata:
1281
+ total_text_size = sum(len(m.get("text", "")) for m in self.metadata)
1282
+ analytics["index_stats"]["avg_metadata_size"] = total_text_size / len(self.metadata)
1283
 
1284
+ for qh in self.query_history[-10:]:
1285
+ analytics["recent_queries"].append({
1286
+ "query_preview": qh.get("query", "")[:50],
1287
+ "results": qh.get("results_count", 0),
1288
+ "top_score": qh.get("top_score", 0),
1289
+ "elapsed": qh.get("elapsed_time", 0),
1290
+ "method": qh.get("method", "unknown")
 
 
 
 
 
 
1291
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1292
 
1293
+ if query:
1294
+ query_lower = query.lower()
1295
+ keyword_matches = defaultdict(int)
 
 
 
 
 
 
1296
 
1297
+ for meta in self.metadata:
1298
+ text = meta.get("text", "").lower()
1299
+ for word in re.findall(r'\b\w{3,}\b', query_lower):
1300
+ if word in text:
1301
+ keyword_matches[word] += 1
 
1302
 
1303
+ analytics["query_analysis"] = {
1304
+ "query_length": len(query),
1305
+ "word_count": len(query.split()),
1306
+ "keyword_frequency": dict(keyword_matches),
1307
+ "has_file_reference": bool(re.search(r'\.(?:py|js|html|css|ts|java|cpp)', query, re.I)),
1308
+ "classified_as": self._classify_query(query)
1309
+ }
 
 
 
1310
 
1311
+ return analytics
 
1312
 
1313
+ def store_chat_context(self, messages: list, user_id: str, chat_id: str) -> bool:
1314
  """Store chat history as session memory"""
1315
  if not messages or not user_id:
1316
  return False
1317
 
 
1318
  conversation = ""
1319
+ for msg in messages[-10:]:
1320
  role = msg.get("role", "unknown")
1321
  content = msg.get("content", "")
1322
  if content:
 
1325
  if len(conversation) < 50:
1326
  return False
1327
 
1328
+ chunks = self._chunk_text_enhanced(conversation, chunk_size=800, overlap=100)
 
1329
 
1330
  if not chunks:
1331
  return False
1332
 
 
1333
  texts = [c["text"] for c in chunks]
1334
  metadata_list = []
1335
 
 
1344
  "chunk_index": i
1345
  })
1346
 
 
1347
  try:
1348
+ embeddings = self.embedder.encode(texts, show_progress_bar=False)
1349
  faiss.normalize_L2(embeddings)
1350
 
1351
  with self.memory_lock:
 
1353
  self.metadata.extend(metadata_list)
1354
  self._save_index()
1355
 
1356
+ # Invalidate BM25 cache for this session
1357
+ self._invalidate_bm25_cache(user_id, chat_id)
1358
+
1359
  logger.info(f"💭 Stored {len(texts)} chat history chunks for user {user_id[:8]}")
1360
  return True
1361
 
1362
  except Exception as e:
1363
+ logger.error(f"Failed to store chat history: {e}")
1364
  return False
1365
 
1366
+ def delete_session(self, user_id: str, chat_id: str) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1367
  """Surgical Strike: Permanently remove ONLY one specific session"""
1368
  with self.memory_lock:
 
1369
  new_metadata = []
1370
  removed_count = 0
1371
 
1372
  for meta in self.metadata:
 
1373
  if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
1374
  removed_count += 1
1375
  else:
1376
  new_metadata.append(meta)
1377
 
1378
  if removed_count == 0:
1379
+ logger.info(f"ℹ️ No vectors to delete for session {chat_id}")
1380
+ return False
1381
 
1382
  logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")
1383
 
 
1384
  if not new_metadata:
1385
+ self.index = faiss.IndexFlatIP(384)
1386
  else:
 
 
1387
  surviving_texts = [m["text"] for m in new_metadata]
1388
  try:
1389
+ embeddings = self.embedder.encode(surviving_texts, show_progress_bar=False)
1390
  faiss.normalize_L2(embeddings)
1391
 
1392
  new_index = faiss.IndexFlatIP(384)
1393
  new_index.add(np.array(embeddings).astype('float32'))
1394
  self.index = new_index
1395
  except Exception as e:
1396
+ logger.error(f"Rebuild failed: {e}")
1397
  return False
1398
 
1399
  self.metadata = new_metadata
1400
  self._save_index()
1401
+
1402
+ # Invalidate BM25 cache for this session
1403
+ self._invalidate_bm25_cache(user_id, chat_id)
1404
+
1405
+ logger.info(f"✅ Successfully deleted session {chat_id}")
1406
  return True
1407
+
1408
+ def get_user_stats(self, user_id: str) -> Dict[str, Any]:
1409
  """Get statistics for a user's session"""
1410
  with self.memory_lock:
1411
  user_vectors = []
1412
+ for meta in enumerate(self.metadata):
1413
+ if meta[1].get("user_id") == user_id:
1414
  user_vectors.append(meta)
1415
 
1416
  stats = {
1417
  "user_id": user_id,
1418
  "total_vectors": len(user_vectors),
1419
  "by_type": {},
1420
+ "by_source": {},
1421
+ "sessions": {},
1422
+ "bm25_cached": False
1423
  }
1424
 
1425
+ for vec_id, vec in user_vectors:
1426
  vec_type = vec.get("type", "unknown")
1427
  source = vec.get("source", "unknown")
1428
+ chat_id = vec.get("chat_id", "unknown")
1429
 
1430
  stats["by_type"][vec_type] = stats["by_type"].get(vec_type, 0) + 1
1431
  stats["by_source"][source] = stats["by_source"].get(source, 0) + 1
1432
+ stats["sessions"][chat_id] = stats["sessions"].get(chat_id, 0) + 1
1433
+
1434
+ # Check if any session has BM25 in cache
1435
+ for chat_id in stats["sessions"]:
1436
+ key = (user_id, chat_id)
1437
+ if key in self.bm25_indices:
1438
+ stats["bm25_cached"] = True
1439
+ break
1440
 
1441
  return stats
1442
 
1443
+ def cleanup_old_sessions(self, max_age_hours: int = 24) -> int:
1444
  """Clean up old session data"""
1445
  current_time = time.time()
1446
  cutoff = current_time - (max_age_hours * 3600)
 
1448
  with self.memory_lock:
1449
  old_metadata = []
1450
  new_metadata = []
1451
+ affected_sessions = set()
1452
 
1453
+ for meta in self.metadata:
1454
  if meta.get("timestamp", 0) < cutoff:
1455
+ old_metadata.append(meta)
1456
+ user_id = meta.get("user_id")
1457
+ chat_id = meta.get("chat_id")
1458
+ if user_id and chat_id:
1459
+ affected_sessions.add((user_id, chat_id))
1460
  else:
1461
  new_metadata.append(meta)
1462
 
1463
  if not old_metadata:
1464
  return 0
1465
 
 
1466
  logger.info(f"🧹 Cleaning up {len(old_metadata)} old vectors...")
1467
 
 
1468
  recent_texts = [m["text"] for m in new_metadata]
1469
 
1470
  if recent_texts:
1471
+ try:
1472
+ embeddings = self.embedder.encode(recent_texts, show_progress_bar=False)
1473
+ faiss.normalize_L2(embeddings)
1474
+
1475
+ self.index = faiss.IndexFlatIP(384)
1476
+ self.index.add(np.array(embeddings).astype('float32'))
1477
+ except Exception as e:
1478
+ logger.error(f"❌ Failed to rebuild index: {e}")
1479
+ return 0
1480
  else:
1481
  self.index = faiss.IndexFlatIP(384)
1482
 
1483
  self.metadata = new_metadata
1484
  self._save_index()
1485
 
1486
+ # Remove affected sessions from BM25 cache
1487
+ for key in affected_sessions:
1488
+ self._invalidate_bm25_cache(*key)
1489
+
1490
+ logger.info(f"✅ Cleanup complete. Removed {len(old_metadata)} vectors.")
1491
  return len(old_metadata)
1492
 
1493
  def _cleanup(self):
 
1495
  try:
1496
  if hasattr(self, 'file_lock'):
1497
  self.file_lock.release()
1498
+ gc.collect()
1499
+ except Exception as e:
1500
+ logger.warning(f"Cleanup warning: {e}")
1501
 
1502
  # Global instance (singleton pattern)
1503
  _vdb_instance = None
1504
+ _vdb_lock = threading.Lock()
1505
 
1506
+ def get_vector_db(index_path: str = "faiss_session_index.bin", metadata_path: str = "session_metadata.pkl") -> VectorDatabase:
1507
+ """Singleton factory for VectorDatabase with thread-safe initialization"""
1508
  global _vdb_instance
1509
  if _vdb_instance is None:
1510
+ with _vdb_lock:
1511
+ if _vdb_instance is None:
1512
+ _vdb_instance = VectorDatabase(index_path, metadata_path)
1513
  return _vdb_instance
1514
 
1515
  # For backward compatibility