Rajhuggingface4253 committed on
Commit
adbab41
·
verified ·
1 Parent(s): 05eba13

Update vector.py

Browse files
Files changed (1) hide show
  1. vector.py +199 -56
vector.py CHANGED
@@ -97,45 +97,123 @@ class VectorDatabase:
97
  except:
98
  pass
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def _chunk_python_code(self, text, filename):
101
- """Smart Python code chunking using AST"""
102
  chunks = []
103
  try:
104
  tree = ast.parse(text)
105
  lines = text.splitlines()
106
 
107
- # Extract global context
108
  global_context = []
109
- for node in tree.body:
110
- if isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign)):
111
- start = node.lineno - 1
112
- end = node.end_lineno
113
- global_context.append("\n".join(lines[start:end]))
114
-
115
- if global_context:
116
- chunks.append({
117
- "text": "\n".join(global_context),
118
- "type": "code_context",
119
- "name": "Imports & Globals"
120
- })
121
 
122
- # Extract functions & classes
123
  for node in tree.body:
124
  if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
 
125
  start = node.lineno - 1
126
  end = node.end_lineno
127
- block_content = "\n".join(lines[start:end])
128
 
129
  chunks.append({
130
- "text": block_content,
131
  "type": "code_function",
132
  "name": node.name
133
  })
 
 
 
 
 
 
 
134
 
 
 
 
 
 
 
 
 
 
 
 
135
  except Exception as e:
136
  logger.warning(f"AST parsing failed for {filename}: {e}")
137
  return self._chunk_text_standard(text)
138
 
 
 
 
 
139
  return chunks
140
 
141
  def _chunk_text_standard(self, text, chunk_size=500, overlap=50):
@@ -163,7 +241,7 @@ class VectorDatabase:
163
  return chunks
164
 
165
  def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str):
166
- """Store extracted file content with user session isolation"""
167
  if not text or len(text) < 10 or not user_id:
168
  logger.warning(f"Invalid input for {filename}")
169
  return False
@@ -177,13 +255,13 @@ class VectorDatabase:
177
  try:
178
  if ext == '.py':
179
  chunks_data = self._chunk_python_code(text, filename)
180
- elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx']:
181
- chunks_data = self._chunk_text_standard(text, chunk_size=800, overlap=100)
 
182
  else:
183
  chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
184
  except Exception as e:
185
  logger.error(f"Chunking failed for {filename}: {e}")
186
- # Fallback to simple chunking
187
  chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
188
 
189
  # Ensure we have chunks
@@ -202,6 +280,7 @@ class VectorDatabase:
202
  final_texts = []
203
  final_meta = []
204
 
 
205
  for chunk in chunks_data:
206
  final_texts.append(chunk["text"])
207
  final_meta.append({
@@ -215,6 +294,21 @@ class VectorDatabase:
215
  "timestamp": time.time(),
216
  "chunk_index": len(final_texts)
217
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  # Embed and add to index
220
  try:
@@ -232,7 +326,7 @@ class VectorDatabase:
232
 
233
  logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
234
 
235
- # Verify storage
236
  self._verify_storage(user_id, len(final_texts))
237
 
238
  return True
@@ -307,17 +401,16 @@ class VectorDatabase:
307
 
308
  def retrieve_session_context(self, query: str, user_id: str, chat_id: str, filter_type: str = None, top_k=100, final_k=5, min_score=0.25):
309
  """
310
- Retrieve context only from the user's SPECIFIC session.
311
- Includes a 'min_score' threshold to filter out irrelevant noise.
312
  """
313
  if self.index.ntotal == 0 or not user_id:
314
  logger.warning(f"Empty index or missing user_id. Index size: {self.index.ntotal}")
315
  return []
316
 
317
- # Debug: show current state
318
  with self.memory_lock:
319
  total_vectors = self.index.ntotal
320
- total_metadata = len(self.metadata)
321
  user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
322
 
323
  logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
@@ -326,39 +419,45 @@ class VectorDatabase:
326
  query_vec = self.embedder.encode([query])
327
  faiss.normalize_L2(query_vec)
328
 
329
- # Search (we search more than needed to account for filtering)
330
  search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
331
-
332
  with self.memory_lock:
333
  D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
334
 
335
- # Process results
336
  candidates = []
337
  valid_count = 0
 
338
 
339
  for i, idx in enumerate(I[0]):
340
- if idx == -1 or idx >= len(self.metadata):
341
- continue
342
 
343
  item = self.metadata[idx]
344
 
345
- # 1. STRICT ISOLATION FILTERING
346
- if item.get("user_id") != user_id:
347
- continue
348
- if item.get("chat_id") != chat_id:
349
- continue
350
- if filter_type and item.get("type") != filter_type:
351
- continue
352
-
353
- # 2. SCORE CORRECTION (CRITICAL FIX)
354
- # Since we use IndexFlatIP with normalized vectors, D[0][i] IS the cosine similarity.
355
- # Do NOT subtract from 1.0.
356
  score = D[0][i]
357
 
358
- # 3. THE GATEKEEPER (Prevents Hallucinations)
359
- # If the similarity is too low (noise), we discard it immediately.
360
- if score < min_score:
361
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
  candidates.append({
364
  "id": int(idx),
@@ -367,32 +466,76 @@ class VectorDatabase:
367
  "score": score
368
  })
369
  valid_count += 1
370
-
371
  logger.info(f"📊 Found {valid_count} candidates above threshold {min_score}")
372
 
373
- if not candidates:
374
- return []
 
 
375
 
 
 
 
 
 
 
376
  # Rerank with FlashRank
377
  try:
378
  rerank_request = RerankRequest(query=query, passages=candidates)
379
  results = self.ranker.rerank(rerank_request)
380
 
381
- # Filter Reranked Results (Double Safety)
382
- # Sometimes vectors are okay but semantic meaning is still weak.
383
- # We keep only the top K that also pass the score check.
384
  final_results = [r for r in results[:final_k] if r['score'] > min_score]
385
 
386
- logger.info(f"🎯 Reranked to {len(final_results)} results (scores: {[round(r['score'], 3) for r in final_results]})")
387
-
388
  return final_results
389
 
390
  except Exception as e:
391
  logger.error(f"Reranking failed: {e}")
392
- # Fallback: return top candidates by vector similarity
393
- candidates.sort(key=lambda x: x["score"], reverse=True)
394
  return candidates[:final_k]
395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  def get_user_stats(self, user_id: str):
397
  """Get statistics for a user's session"""
398
  with self.memory_lock:
 
97
  except:
98
  pass
99
 
100
+ def _chunk_smart_code(self, text, filename):
101
+ """
102
+ Structure-aware chunker for JS, HTML, CSS, etc.
103
+ Splits by logical boundaries (tags, functions) instead of random characters.
104
+ """
105
+ ext = os.path.splitext(filename)[1].lower()
106
+ chunks = []
107
+
108
+ # Define split patterns for different languages
109
+ patterns = {
110
+ # HTML/XML: Split before opening tags, effectively keeping tags grouped
111
+ '.html': r'(?=\n\s*<[^/])',
112
+ '.htm': r'(?=\n\s*<[^/])',
113
+ '.xml': r'(?=\n\s*<[^/])',
114
+ '.vue': r'(?=\n\s*<[^/])',
115
+ # JS/TS: Split before major keywords
116
+ '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
117
+ '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
118
+ '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
119
+ '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
120
+ # CSS: Split before selectors
121
+ '.css': r'(?=\n\s*[.#@a-zA-Z])',
122
+ '.scss': r'(?=\n\s*[.#@a-zA-Z])',
123
+ }
124
+
125
+ pattern = patterns.get(ext)
126
+
127
+ # Fallback to standard if no pattern matches or regex fails
128
+ if not pattern:
129
+ return self._chunk_text_standard(text)
130
+
131
+ try:
132
+ # 1. Split by pattern
133
+ segments = re.split(pattern, text)
134
+
135
+ # 2. Re-group segments into chunks of appropriate size (e.g., 1000 chars)
136
+ current_chunk = ""
137
+ TARGET_SIZE = 1000
138
+
139
+ for seg in segments:
140
+ if not seg.strip(): continue
141
+
142
+ # If adding this segment exceeds target, save current and start new
143
+ if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 100:
144
+ chunks.append({
145
+ "text": current_chunk.strip(),
146
+ "type": "code_block",
147
+ "name": f"block_{len(chunks)}"
148
+ })
149
+ current_chunk = seg
150
+ else:
151
+ current_chunk += seg
152
+
153
+ # Add final chunk
154
+ if current_chunk:
155
+ chunks.append({
156
+ "text": current_chunk.strip(),
157
+ "type": "code_block",
158
+ "name": f"block_{len(chunks)}"
159
+ })
160
+
161
+ return chunks
162
+
163
+ except Exception as e:
164
+ logger.warning(f"Smart chunking failed for {filename}: {e}. Falling back.")
165
+ return self._chunk_text_standard(text)
166
+
167
  def _chunk_python_code(self, text, filename):
168
+ """Improved AST chunker that captures EVERYTHING (not just functions)"""
169
  chunks = []
170
  try:
171
  tree = ast.parse(text)
172
  lines = text.splitlines()
173
 
174
+ # 1. Global Context (Imports & Assignments)
175
  global_context = []
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
+ # 2. Iterate nodes to find blocks
178
  for node in tree.body:
179
  if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
180
+ # Extract the block
181
  start = node.lineno - 1
182
  end = node.end_lineno
183
+ block_text = "\n".join(lines[start:end])
184
 
185
  chunks.append({
186
+ "text": block_text,
187
  "type": "code_function",
188
  "name": node.name
189
  })
190
+ elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
191
+ # Group top-level scripts/imports together
192
+ # We approximate by grabbing the line
193
+ if hasattr(node, 'end_lineno'):
194
+ start = node.lineno - 1
195
+ end = node.end_lineno
196
+ global_context.append("\n".join(lines[start:end]))
197
 
198
+ # Add the collected global context as the first chunk
199
+ if global_context:
200
+ # Group globals into chunks of 1000 chars
201
+ full_global = "\n".join(global_context)
202
+ if len(full_global) > 100:
203
+ chunks.insert(0, {
204
+ "text": full_global[:1500], # Cap context size
205
+ "type": "code_context",
206
+ "name": "imports_and_globals"
207
+ })
208
+
209
  except Exception as e:
210
  logger.warning(f"AST parsing failed for {filename}: {e}")
211
  return self._chunk_text_standard(text)
212
 
213
+ # Fallback: if AST yielded nothing (e.g. empty file), use standard
214
+ if not chunks:
215
+ return self._chunk_text_standard(text)
216
+
217
  return chunks
218
 
219
  def _chunk_text_standard(self, text, chunk_size=500, overlap=50):
 
241
  return chunks
242
 
243
  def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str):
244
+ """Store extracted file content with 'Whole File' capability & Verification"""
245
  if not text or len(text) < 10 or not user_id:
246
  logger.warning(f"Invalid input for {filename}")
247
  return False
 
255
  try:
256
  if ext == '.py':
257
  chunks_data = self._chunk_python_code(text, filename)
258
+ elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml']:
259
+ # Use Smart Regex Chunking
260
+ chunks_data = self._chunk_smart_code(text, filename)
261
  else:
262
  chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
263
  except Exception as e:
264
  logger.error(f"Chunking failed for {filename}: {e}")
 
265
  chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
266
 
267
  # Ensure we have chunks
 
280
  final_texts = []
281
  final_meta = []
282
 
283
+ # 1. Process Standard Chunks
284
  for chunk in chunks_data:
285
  final_texts.append(chunk["text"])
286
  final_meta.append({
 
294
  "timestamp": time.time(),
295
  "chunk_index": len(final_texts)
296
  })
297
+
298
+ # 2. Add "Whole File" Entry (CRITICAL FOR FULL FILE RETRIEVAL)
299
+ # We embed a marker text, but store the ACTUAL content in metadata.
300
+ marker_text = f"Entire full content of file {filename} code"
301
+ final_texts.append(marker_text)
302
+ final_meta.append({
303
+ "text": marker_text, # Marker for search
304
+ "actual_content": text, # <<< THE FULL CONTENT
305
+ "source": filename,
306
+ "type": "whole_file", # Special type
307
+ "user_id": user_id,
308
+ "chat_id": chat_id,
309
+ "timestamp": time.time(),
310
+ "chunk_index": -1
311
+ })
312
 
313
  # Embed and add to index
314
  try:
 
326
 
327
  logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
328
 
329
+ # Verify storage (Self-Check)
330
  self._verify_storage(user_id, len(final_texts))
331
 
332
  return True
 
401
 
402
  def retrieve_session_context(self, query: str, user_id: str, chat_id: str, filter_type: str = None, top_k=100, final_k=5, min_score=0.25):
403
  """
404
+ Retrieve context with Filename Ranking Logic.
405
+ If user asks for a specific file, returns the WHOLE content.
406
  """
407
  if self.index.ntotal == 0 or not user_id:
408
  logger.warning(f"Empty index or missing user_id. Index size: {self.index.ntotal}")
409
  return []
410
 
411
+ # Debug info
412
  with self.memory_lock:
413
  total_vectors = self.index.ntotal
 
414
  user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
415
 
416
  logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
 
419
  query_vec = self.embedder.encode([query])
420
  faiss.normalize_L2(query_vec)
421
 
422
+ # Search
423
  search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
 
424
  with self.memory_lock:
425
  D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
426
 
 
427
  candidates = []
428
  valid_count = 0
429
+ query_lower = query.lower()
430
 
431
  for i, idx in enumerate(I[0]):
432
+ if idx == -1 or idx >= len(self.metadata): continue
 
433
 
434
  item = self.metadata[idx]
435
 
436
+ # 1. STRICT ISOLATION
437
+ if item.get("user_id") != user_id: continue
438
+ if item.get("chat_id") != chat_id: continue
439
+ if filter_type and item.get("type") != filter_type: continue
440
+
 
 
 
 
 
 
441
  score = D[0][i]
442
 
443
+ # 2. WHOLE FILE RANKING (The Missing Piece)
444
+ # If this is a "whole_file" marker AND the filename is in the query...
445
+ filename = item.get("source", "").lower()
446
+ is_whole_file = item.get("type") == "whole_file"
447
+
448
+ if is_whole_file:
449
+ # If user specifically asked for this file (e.g. "read index.html")
450
+ if filename in query_lower:
451
+ score = 2.0 # Force to top (override similarity)
452
+
453
+ # Replace the "marker text" with the ACTUAL full content
454
+ # This ensures the LLM gets the real code
455
+ if item.get("actual_content"):
456
+ item = item.copy() # Don't mutate original metadata
457
+ item["text"] = item["actual_content"]
458
+
459
+ # 3. GATEKEEPER (Noise Filter)
460
+ if score < min_score: continue
461
 
462
  candidates.append({
463
  "id": int(idx),
 
466
  "score": score
467
  })
468
  valid_count += 1
469
+
470
  logger.info(f"📊 Found {valid_count} candidates above threshold {min_score}")
471
 
472
+ if not candidates: return []
473
+
474
+ # Sort manually first (to handle our forced 2.0 scores)
475
+ candidates.sort(key=lambda x: x["score"], reverse=True)
476
 
477
+ # Optimization: If we found a forced match (Whole File), return immediately
478
+ # We don't need to rerank if we know exactly what the user wanted.
479
+ if candidates[0]["score"] >= 2.0:
480
+ logger.info(f"🎯 Returning Whole File: {candidates[0]['meta'].get('source')}")
481
+ return candidates[:1]
482
+
483
  # Rerank with FlashRank
484
  try:
485
  rerank_request = RerankRequest(query=query, passages=candidates)
486
  results = self.ranker.rerank(rerank_request)
487
 
488
+ # Filter low quality rerank results
 
 
489
  final_results = [r for r in results[:final_k] if r['score'] > min_score]
490
 
491
+ logger.info(f"🎯 Reranked to {len(final_results)} results")
 
492
  return final_results
493
 
494
  except Exception as e:
495
  logger.error(f"Reranking failed: {e}")
 
 
496
  return candidates[:final_k]
497
 
498
+ def delete_session(self, user_id: str, chat_id: str):
499
+ """Surgical Strike: Permanently remove ONLY one specific session"""
500
+ with self.memory_lock:
501
+ # 1. Filter: Keep everything that is NOT this specific chat
502
+ new_metadata = []
503
+ removed_count = 0
504
+
505
+ for meta in self.metadata:
506
+ # Check strict ownership and ID match
507
+ if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
508
+ removed_count += 1
509
+ else:
510
+ new_metadata.append(meta)
511
+
512
+ if removed_count == 0:
513
+ return False # Nothing to delete
514
+
515
+ logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")
516
+
517
+ # 2. Rebuild Index (Required for FAISS IndexFlatIP)
518
+ if not new_metadata:
519
+ self.index = faiss.IndexFlatIP(384) # Reset empty
520
+ else:
521
+ # Re-embed surviving text to rebuild index
522
+ # (Optimization: In a huge DB, use IndexIDMap, but for now this is safe)
523
+ surviving_texts = [m["text"] for m in new_metadata]
524
+ try:
525
+ embeddings = self.embedder.encode(surviving_texts)
526
+ faiss.normalize_L2(embeddings)
527
+
528
+ new_index = faiss.IndexFlatIP(384)
529
+ new_index.add(np.array(embeddings).astype('float32'))
530
+ self.index = new_index
531
+ except Exception as e:
532
+ logger.error(f"Rebuild failed: {e}")
533
+ return False
534
+
535
+ self.metadata = new_metadata
536
+ self._save_index()
537
+ return True
538
+
539
  def get_user_stats(self, user_id: str):
540
  """Get statistics for a user's session"""
541
  with self.memory_lock: