Update vector.py

vector.py CHANGED
@@ -11,6 +11,25 @@ import ast
 import re
 from filelock import FileLock
 import atexit

 # Configure Logging
 logging.basicConfig(
@@ -25,23 +44,1108 @@ class VectorDatabase:
         self.metadata_path = metadata_path
         self.lock_path = index_path + ".lock"

-        # File lock for multi-process safety
-        self.file_lock = FileLock(self.lock_path, timeout=60)
-        self.memory_lock = threading.
-        [further removed lines of the old __init__ body are not recoverable in this view]

     def _load_or_create_index(self):
         """Thread-safe and process-safe index loading/creation"""
@@ -50,8 +1154,18 @@ class VectorDatabase:
         try:
             logger.info("📂 Loading existing vector index...")
             self.index = faiss.read_index(self.index_path)
             with open(self.metadata_path, "rb") as f:
                 self.metadata = pickle.load(f)
             logger.info(f"✅ Loaded index with {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
         except Exception as e:
             logger.error(f"⚠️ Failed to load index: {e}. Creating new one.")
@@ -61,298 +1175,148 @@ class VectorDatabase:
         self._create_new_index()

     def _create_new_index(self):
-        """Create fresh
         dimension = 384
-
-        # Or IndexFlatL2 for Euclidean distance
-        self.index = faiss.IndexFlatIP(dimension)  # Cosine similarity
         self.metadata = []
         logger.info(f"🆕 Created new IndexFlatIP with dimension {dimension}")

     def _save_index(self):
         """Thread-safe and process-safe index saving with atomic writes"""
         with self.file_lock:
-            # Create temporary files
             temp_index = f"{self.index_path}.tmp"
             temp_meta = f"{self.metadata_path}.tmp"

             try:
-                # Save to temporary files
                 faiss.write_index(self.index, temp_index)
                 with open(temp_meta, "wb") as f:
                     pickle.dump(self.metadata, f)

-                # Atomic rename (POSIX operation)
                 os.replace(temp_index, self.index_path)
                 os.replace(temp_meta, self.metadata_path)

-                logger.info(f"💾 Saved index: {self.index.ntotal} vectors, {len(self.metadata)} metadata")
             except Exception as e:
                 logger.error(f"❌ Failed to save index: {e}")
-                # Clean up temp files on failure
                 for f in [temp_index, temp_meta]:
                     if os.path.exists(f):
                         try:
                             os.remove(f)
-                        except:
-                            pass

-    def
-        """
-        Structure-aware chunker for JS, HTML, CSS, etc.
-        Splits by logical boundaries (tags, functions) instead of random characters.
-        """
-        ext = os.path.splitext(filename)[1].lower()
-        chunks = []
-
-        # Define split patterns for different languages
-        patterns = {
-            # HTML/XML: Split before opening tags, effectively keeping tags grouped
-            '.html': r'(?=\n\s*<[^/])',
-            '.htm': r'(?=\n\s*<[^/])',
-            '.xml': r'(?=\n\s*<[^/])',
-            '.vue': r'(?=\n\s*<[^/])',
-            # JS/TS: Split before major keywords
-            '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
-            '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
-            '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
-            '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
-            # CSS: Split before selectors
-            '.css': r'(?=\n\s*[.#@a-zA-Z])',
-            '.scss': r'(?=\n\s*[.#@a-zA-Z])',
-        }
-
-        pattern = patterns.get(ext)
-
-        # Fallback to standard if no pattern matches or regex fails
-        if not pattern:
-            return self._chunk_text_standard(text)
-
         try:
-            [several removed lines are not recoverable in this view]
-            # If adding this segment exceeds target, save current and start new
-            if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 100:
-                chunks.append({
-                    "text": current_chunk.strip(),
-                    "type": "code_block",
-                    "name": f"block_{len(chunks)}"
-                })
-                current_chunk = seg
-            else:
-                current_chunk += seg

-            [removed lines not recoverable]
-                "text": current_chunk.strip(),
-                "type": "code_block",
-                "name": f"block_{len(chunks)}"
-            })

-            [removed lines not recoverable]
-            return self._chunk_text_standard(text)
-
-    def _chunk_python_code(self, text, filename):
-        """Improved AST chunker that captures EVERYTHING (not just functions)"""
-        chunks = []
-        try:
-            tree = ast.parse(text)
-            lines = text.splitlines()

-            #
-            [removed lines not recoverable]
-            for node in tree.body:
-                if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
-                    # Extract the block
-                    start = node.lineno - 1
-                    end = node.end_lineno
-                    block_text = "\n".join(lines[start:end])
-
-                    chunks.append({
-                        "text": block_text,
-                        "type": "code_function",
-                        "name": node.name
-                    })
-                elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
-                    # Group top-level scripts/imports together
-                    # We approximate by grabbing the line
-                    if hasattr(node, 'end_lineno'):
-                        start = node.lineno - 1
-                        end = node.end_lineno
-                        global_context.append("\n".join(lines[start:end]))

-            # Add the collected global context as the first chunk
-            if global_context:
-                # Group globals into chunks of 1000 chars
-                full_global = "\n".join(global_context)
-                if len(full_global) > 100:
-                    chunks.insert(0, {
-                        "text": full_global[:1500],  # Cap context size
-                        "type": "code_context",
-                        "name": "imports_and_globals"
-                    })
-
         except Exception as e:
-            logger.
-            [removed lines not recoverable]
-            # Fallback: if AST yielded nothing (e.g. empty file), use standards
-            if not chunks:
-                return self._chunk_text_standard(text)
-
-            return chunks

-    def
-        """
-        [removed lines not recoverable]
-        # Handle very short text
-        if len(text) <= chunk_size:
-            return [{
-                "text": text,
-                "type": "text_block",
-                "name": "full_content"
-            }]

-        for i in range(0, len(text), chunk_size - overlap):
-            chunk = text[i:i + chunk_size]
-            if len(chunk) > 100:  # Minimum chunk size
-                chunks.append({
-                    "text": chunk,
-                    "type": "text_block",
-                    "name": f"chunk_{i//chunk_size}"
-                })

-        [about two dozen removed lines of the old store_session_document are not recoverable in this view]
-        # Ensure we have chunks
-        if not chunks_data and text:
-            chunks_data = [{
-                "text": text[:2000],
-                "type": "fallback",
-                "name": "full_document"
-            }]
-
-        if not chunks_data:
-            logger.error(f"No chunks generated for {filename}")
-            return False

-            [removed lines not recoverable]
-                "subtype": chunk.get("type", "general"),
-                "name": chunk.get("name", "unknown"),
-                "user_id": user_id,
-                "chat_id": chat_id,
-                "timestamp": time.time(),
-                "chunk_index": len(final_texts)
             })
-
-        # 2. Add "Whole File" Entry (FIXED FOR INTENT SEPARATION)
-        marker_text = f"Entire full content of file {filename} code"
-        final_texts.append(marker_text)
-        final_meta.append({
-            "text": marker_text,
-            "actual_content": text,  # The full content
-            "source": filename,
-
-            # --- THE FIX ---
-            "type": "file",  # Visible to 'file' searches
-            "subtype": "whole_file",  # Identified by Ranking Logic
-            # ----------------
-
-            "user_id": user_id,
-            "chat_id": chat_id,
-            "timestamp": time.time(),
-            "chunk_index": -1
-        })

-            [removed lines not recoverable]
-            faiss.normalize_L2(embeddings)
-
-            with self.memory_lock:
-                self.index.add(np.array(embeddings).astype('float32'))
-                self.metadata.extend(final_meta)
-                self._save_index()

-            [removed lines not recoverable]
-            return True

-        [removed lines not recoverable]
-        user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
-
-        logger.info(f"🔍 Storage verification: User {user_id[:8]} has {user_vectors} vectors (expected: {expected_count})")

-            logger.warning(f"⚠️ Storage mismatch for user {user_id[:8]}")

-    def store_chat_context(self, messages: list, user_id: str, chat_id: str):
         """Store chat history as session memory"""
         if not messages or not user_id:
             return False

-        # Format conversation
         conversation = ""
-        for msg in messages[-10:]:
             role = msg.get("role", "unknown")
             content = msg.get("content", "")
             if content:
@@ -361,13 +1325,11 @@ class VectorDatabase:
         if len(conversation) < 50:
             return False

-
-        chunks = self._chunk_text_standard(conversation, chunk_size=800, overlap=100)

         if not chunks:
             return False

-        # Prepare for indexing
         texts = [c["text"] for c in chunks]
         metadata_list = []
@@ -382,9 +1344,8 @@ class VectorDatabase:
                 "chunk_index": i
             })

-        # Store in index
         try:
-            embeddings = self.embedder.encode(texts)
             faiss.normalize_L2(embeddings)

             with self.memory_lock:
@@ -392,165 +1353,94 @@ class VectorDatabase:
                 self.metadata.extend(metadata_list)
                 self._save_index()

             logger.info(f"💭 Stored {len(texts)} chat history chunks for user {user_id[:8]}")
             return True

         except Exception as e:
-            logger.error(f"Failed to store chat history: {e}")
             return False

-    def
-        """
-        Retrieve context with 'Whole File' capability.
-        MAINTAINS SEPARATION: Files vs. History
-        """
-        if self.index.ntotal == 0 or not user_id:
-            return []
-
-        # Debug info
-        with self.memory_lock:
-            total_vectors = self.index.ntotal
-            user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
-
-        logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
-
-        # Encode query
-        query_vec = self.embedder.encode([query])
-        faiss.normalize_L2(query_vec)
-
-        # Search
-        search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
-        with self.memory_lock:
-            D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
-
-        candidates = []
-        valid_count = 0
-        query_lower = query.lower()
-
-        for i, idx in enumerate(I[0]):
-            if idx == -1 or idx >= len(self.metadata): continue
-
-            item = self.metadata[idx]
-
-            # 1. STRICT ISOLATION (User & Session)
-            if item.get("user_id") != user_id: continue
-            if item.get("chat_id") != chat_id: continue
-
-            # 2. INTENT SEPARATION (File vs. History)
-            # If front-end asks for 'file', we return 'file' (which now includes whole_files).
-            # If front-end asks for 'history', we return 'history'.
-            if filter_type and item.get("type") != filter_type:
-                continue
-
-            score = D[0][i]
-
-            # 3. WHOLE FILE RANKING LOGIC
-            # We now check 'subtype' instead of 'type'
-            filename = item.get("source", "").lower()
-            is_whole_file = item.get("subtype") == "whole_file"  # <--- UPDATED CHECK
-
-            if is_whole_file:
-                if filename in query_lower:
-                    score = 2.0  # Force to top
-
-                if item.get("actual_content"):
-                    item = item.copy()
-                    item["text"] = item["actual_content"]
-
-            # 4. GATEKEEPER
-            if score < min_score: continue
-
-            candidates.append({
-                "id": int(idx),
-                "text": item.get("text", ""),
-                "meta": item,
-                "score": score
-            })
-            valid_count += 1
-
-        if not candidates: return []
-
-        candidates.sort(key=lambda x: x["score"], reverse=True)
-
-        if candidates[0]["score"] >= 2.0:
-            logger.info(f"🎯 Returning Whole File: {candidates[0]['meta'].get('source')}")
-            return candidates[:1]
-
-        try:
-            rerank_request = RerankRequest(query=query, passages=candidates)
-            results = self.ranker.rerank(rerank_request)
-            final_results = [r for r in results[:final_k] if r['score'] > min_score]
-            return final_results
-        except Exception as e:
-            logger.error(f"Reranking failed: {e}")
-            return candidates[:final_k]
-
-    def delete_session(self, user_id: str, chat_id: str):
         """Surgical Strike: Permanently remove ONLY one specific session"""
         with self.memory_lock:
-            # 1. Filter: Keep everything that is NOT this specific chat
             new_metadata = []
             removed_count = 0

             for meta in self.metadata:
-                # Check strict ownership and ID match
                 if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
                     removed_count += 1
                 else:
                     new_metadata.append(meta)

             if removed_count == 0:
-

             logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")

-            # 2. Rebuild Index (Required for FAISS IndexFlatIP)
             if not new_metadata:
-                self.index = faiss.IndexFlatIP(384)
             else:
-                # Re-embed surviving text to rebuild index
-                # (Optimization: In a huge DB, use IndexIDMap, but for now this is safe)
                 surviving_texts = [m["text"] for m in new_metadata]
                 try:
-                    embeddings = self.embedder.encode(surviving_texts)
                     faiss.normalize_L2(embeddings)

                     new_index = faiss.IndexFlatIP(384)
                     new_index.add(np.array(embeddings).astype('float32'))
                     self.index = new_index
                 except Exception as e:
-                    logger.error(f"Rebuild failed: {e}")
                     return False

             self.metadata = new_metadata
             self._save_index()
             return True
-
-    def get_user_stats(self, user_id: str):
         """Get statistics for a user's session"""
         with self.memory_lock:
             user_vectors = []
-            for meta in self.metadata:
-                if meta.get("user_id") == user_id:
                     user_vectors.append(meta)

             stats = {
                 "user_id": user_id,
                 "total_vectors": len(user_vectors),
                 "by_type": {},
-                "by_source": {}
             }

-            for vec in user_vectors:
                 vec_type = vec.get("type", "unknown")
                 source = vec.get("source", "unknown")

                 stats["by_type"][vec_type] = stats["by_type"].get(vec_type, 0) + 1
                 stats["by_source"][source] = stats["by_source"].get(source, 0) + 1

             return stats

-    def cleanup_old_sessions(self, max_age_hours=24):
         """Clean up old session data"""
         current_time = time.time()
         cutoff = current_time - (max_age_hours * 3600)
@@ -558,35 +1448,46 @@ class VectorDatabase:
         with self.memory_lock:
             old_metadata = []
             new_metadata = []

-            for meta in self.metadata:
                 if meta.get("timestamp", 0) < cutoff:
-                    old_metadata.append(meta)
                 else:
                     new_metadata.append(meta)

             if not old_metadata:
                 return 0

-            # Rebuild index with only recent vectors
             logger.info(f"🧹 Cleaning up {len(old_metadata)} old vectors...")

-            # Extract recent texts
             recent_texts = [m["text"] for m in new_metadata]

             if recent_texts:
-                [the old re-embed-and-rebuild block is not recoverable in this view]
             else:
                 self.index = faiss.IndexFlatIP(384)

             self.metadata = new_metadata
             self._save_index()

             return len(old_metadata)

     def _cleanup(self):
@@ -594,17 +1495,21 @@ class VectorDatabase:
         try:
             if hasattr(self, 'file_lock'):
                 self.file_lock.release()
-
-

 # Global instance (singleton pattern)
 _vdb_instance = None

-def get_vector_db():
-    """Singleton factory for VectorDatabase"""
     global _vdb_instance
     if _vdb_instance is None:
-
     return _vdb_instance

 # For backward compatibility
@@ -11,6 +11,25 @@ import ast
 import re
 from filelock import FileLock
 import atexit
+import gc
+from typing import List, Dict, Any, Optional, Tuple, Union
+from collections import defaultdict, OrderedDict  # <-- FIX 1: Add OrderedDict
+
+# === NEW IMPORTS FOR HYBRID SEARCH ===
+try:
+    from rank_bm25 import BM25Okapi
+    BM25_AVAILABLE = True
+except ImportError:
+    BM25_AVAILABLE = False
+    logging.warning("BM25 not available. Install: pip install rank-bm25")
+
+try:
+    import nltk
+    from nltk.tokenize import word_tokenize, sent_tokenize
+    NLTK_AVAILABLE = True
+except ImportError:
+    NLTK_AVAILABLE = False
+    logging.warning("NLTK not available. Install: pip install nltk")

 # Configure Logging
 logging.basicConfig(

@@ -25,23 +44,1108 @@ class VectorDatabase:
         self.metadata_path = metadata_path
         self.lock_path = index_path + ".lock"

+        # File lock for multi-process safety
+        self.file_lock = FileLock(self.lock_path, timeout=60)
+        self.memory_lock = threading.RLock()
+
+        logger.info("🧠 Initializing Production Vector Engine with Hybrid Search...")
+
+        # Load models with error handling
+        try:
+            self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
+            self.ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="./flashrank_cache")
+        except Exception as e:
+            logger.error(f"❌ Failed to load models: {e}")
+            raise RuntimeError(f"Model initialization failed: {e}")
+
+        # Load or create index with file locking
+        self._load_or_create_index()
+
+        # === FIX 1: LAZY LOADING & LRU CACHE (Memory Safe) ===
+        # REMOVED: self._initialize_bm25_from_metadata() - No OOM on startup!
+        # Instead, use LRU Cache to load sessions only when searched
+        self.bm25_cache_size = 50  # Limit concurrent BM25 indices in memory
+        self.bm25_indices = OrderedDict()  # {(user_id, chat_id): BM25Okapi} with LRU
+        self.bm25_docs = {}  # {(user_id, chat_id): [tokenized_documents]}
+        self.bm25_doc_to_vector = {}  # {(user_id, chat_id): [vector_ids]}
+        self.bm25_lock = threading.RLock()
+
+        # Performance tracking
+        self.query_history = []
+        self.performance_stats = {
+            "exact_matches": 0,
+            "semantic_matches": 0,
+            "bm25_matches": 0,
+            "hybrid_matches": 0,
+            "fallback_matches": 0,
+            "avg_retrieval_time": 0
+        }
+
+        # Query type classification stats
+        self.query_types = defaultdict(int)
+
+        # Register cleanup
+        atexit.register(self._cleanup)
+
+        logger.info(f"✅ Vector Engine Ready. Index: {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
+        logger.info(f"✅ BM25 LRU Cache: {self.bm25_cache_size} sessions max, BM25 Available: {BM25_AVAILABLE}")
+
+    # ==================== FIX 2: LAZY BM25 LOADING ====================
+
+    def _get_or_build_bm25(self, user_id: str, chat_id: str) -> Optional[BM25Okapi]:
+        """
+        Retrieve BM25 index from cache or build it on-demand (Lazy Load).
+        Uses LRU eviction to prevent memory explosion.
+        """
+        if not BM25_AVAILABLE:
+            return None
+
+        key = (user_id, chat_id)
+
+        with self.bm25_lock:
+            # 1. CACHE HIT: Move to end (mark as recently used)
+            if key in self.bm25_indices:
+                self.bm25_indices.move_to_end(key)
+                return self.bm25_indices[key]
+
+            # 2. CACHE MISS: Build index on the fly
+            logger.debug(f"🔄 Building BM25 index on-demand for session {key}")
+
+            tokenized_corpus = []
+            vector_ids = []
+
+            # Filter documents for this user only (session isolation)
+            with self.memory_lock:
+                for idx, meta in enumerate(self.metadata):
+                    if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
+                        text = meta.get("text", "")
+                        tokens = self._tokenize_for_bm25(text)
+                        if tokens:  # Only add non-empty tokenized docs
+                            tokenized_corpus.append(tokens)
+                            vector_ids.append(idx)
+
+            if not tokenized_corpus:
+                logger.debug(f"⚠️ No documents found for BM25 index {key}")
+                return None
+
+            # Build BM25 index
+            try:
+                bm25 = BM25Okapi(tokenized_corpus)
+
+                # Store additional metadata for scoring
+                self.bm25_docs[key] = tokenized_corpus
+                self.bm25_doc_to_vector[key] = vector_ids
+
+                # 3. STORE IN CACHE with LRU EVICTION POLICY
+                if len(self.bm25_indices) >= self.bm25_cache_size:
+                    # Remove oldest entry
+                    oldest_key, _ = self.bm25_indices.popitem(last=False)
+                    # Clean up associated data
+                    if oldest_key in self.bm25_docs:
+                        del self.bm25_docs[oldest_key]
+                    if oldest_key in self.bm25_doc_to_vector:
+                        del self.bm25_doc_to_vector[oldest_key]
+                    logger.debug(f"🧹 Evicted BM25 cache for session {oldest_key}")
+
+                self.bm25_indices[key] = bm25
+                logger.debug(f"✅ Built BM25 index for session {key}: {len(tokenized_corpus)} docs")
+
+                return bm25
+
+            except Exception as e:
+                logger.error(f"❌ Failed to build BM25 index for {key}: {e}")
+                return None
+
+    def _invalidate_bm25_cache(self, user_id: str, chat_id: str):
+        """
+        Invalidate BM25 cache for a session (fast, no rebuild).
+        Called when new documents are added.
+        """
+        key = (user_id, chat_id)
+        with self.bm25_lock:
+            if key in self.bm25_indices:
+                del self.bm25_indices[key]
+            if key in self.bm25_docs:
+                del self.bm25_docs[key]
+            if key in self.bm25_doc_to_vector:
+                del self.bm25_doc_to_vector[key]
+            logger.debug(f"🧹 Invalidated BM25 cache for session {key}")
+
+    def _tokenize_for_bm25(self, text: str) -> List[str]:
+        """Tokenize text for BM25 with proper handling"""
+        if not text or not isinstance(text, str):
+            return []
+
+        # Simple tokenization if NLTK not available
+        if not NLTK_AVAILABLE:
+            # Basic regex tokenization (fallback)
+            tokens = re.findall(r'\b\w{2,}\b', text.lower())
+            return tokens
+
+        try:
+            # Use NLTK for better tokenization
+            tokens = word_tokenize(text.lower())
+            # Filter out very short tokens and keep alphanumeric
+            tokens = [t for t in tokens if len(t) >= 2 and re.match(r'^[a-z0-9]+$', t)]
+            return tokens
+        except Exception as e:
+            logger.warning(f"Tokenization failed: {e}, using fallback")
+            tokens = re.findall(r'\b\w{2,}\b', text.lower())
+            return tokens
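
A quick illustration of the two paths (my example string, not from the commit). Note that the regex fallback keeps underscored identifiers whole, while the NLTK path's ^[a-z0-9]+$ filter drops tokens containing underscores, so the two paths can tokenize code differently:

import re

text = "def get_user_stats(user_id): return stats"
print(re.findall(r'\b\w{2,}\b', text.lower()))
# ['def', 'get_user_stats', 'user_id', 'return', 'stats']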
+
+    # ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
+
+    def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str) -> bool:
+        """Store extracted file content with enhanced chunking and cache invalidation"""
+        if not text or len(text) < 10 or not user_id:
+            logger.warning(f"Invalid input for {filename}")
+            return False
+
+        logger.info(f"📥 Storing {filename} ({len(text)} chars) for user {user_id[:8]}...")
+
+        chunks_data = []
+        ext = os.path.splitext(filename)[1].lower()
+
+        try:
+            # ===== FIX 4: CORRECT METHOD NAMES =====
+            if ext == '.py':
+                chunks_data = self._chunk_python_ast(text, filename)  # <-- Fixed name
+            elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml', '.scss']:
+                chunks_data = self._chunk_smart_code(text, filename)
+            else:
+                chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
+        except Exception as e:
+            logger.error(f"Chunking failed for {filename}: {e}")
+            chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
+
+        if not chunks_data and text:
+            chunks_data = [{
+                "text": text[:2000],
+                "type": "fallback",
+                "name": "full_document"
+            }]
+
+        if not chunks_data:
+            logger.error(f"No chunks generated for {filename}")
+            return False
+
+        final_texts = []
+        final_meta = []
+
+        for chunk in chunks_data:
+            final_texts.append(chunk["text"])
+            final_meta.append({
+                "text": chunk["text"],
+                "source": filename,
+                "type": "file",
+                "subtype": chunk.get("type", "general"),
+                "name": chunk.get("name", "unknown"),
+                "user_id": user_id,
+                "chat_id": chat_id,
+                "timestamp": time.time(),
+                "chunk_index": len(final_texts)
+            })
+
+        # Whole file embedding for comprehensive answers
+        whole_file_text = text[:4000] if len(text) > 4000 else text
+        final_texts.append(f"Complete File: {filename} | Full Content: {whole_file_text}")
+        final_meta.append({
+            "text": whole_file_text,
+            "actual_content": text,
+            "source": filename,
+            "type": "file",
+            "subtype": "whole_file",
+            "is_whole_file": True,
+            "user_id": user_id,
+            "chat_id": chat_id,
+            "timestamp": time.time(),
+            "chunk_index": -1
+        })
+
+        try:
+            # Optimized embedding
+            embeddings = self.embedder.encode(
+                final_texts,
+                show_progress_bar=False,
+                batch_size=32,
+                convert_to_numpy=True,
+                normalize_embeddings=True
+            )
+
+            faiss.normalize_L2(embeddings)
+
+            with self.memory_lock:
+                self.index.add(np.array(embeddings).astype('float32'))
+                self.metadata.extend(final_meta)
+                self._save_index()
+
+            logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
+
+            # ===== FIX 4: CACHE INVALIDATION instead of Immediate Rebuild =====
+            # When new files arrive, just invalidate the old cache.
+            # It will auto-rebuild (including the new file) on next search.
+            self._invalidate_bm25_cache(user_id, chat_id)
+
+            self._verify_storage(user_id, chat_id, len(final_texts))
+
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to store vectors for {filename}: {e}")
+            # Clean up partial storage
+            with self.memory_lock:
+                if self.index.ntotal >= len(final_texts):
+                    logger.warning("Rolling back partial storage...")
+                    self._rollback_partial_storage(user_id, chat_id)
+            return False
+
+    # ==================== UPDATED BM25 SEARCH WITH LAZY LOADING ====================
+
+    def bm25_search(self, query: str, user_id: str, chat_id: str,
+                    top_k: int = 50, min_score: float = 0.0) -> List[Dict[str, Any]]:
+        """
+        Pure BM25 search within a session with lazy loading.
+        Returns ranked results with BM25 scores.
+        """
+        if not BM25_AVAILABLE:
+            logger.warning("BM25 not available. Falling back to semantic search.")
+            return []
+
+        start_time = time.time()
+
+        # ===== FIX 3: USE LAZY LOADER =====
+        bm25_index = self._get_or_build_bm25(user_id, chat_id)
+
+        if not bm25_index:
+            logger.warning(f"No BM25 index for session {(user_id[:8], chat_id[:8])}")
+            return []
+
+        # Tokenize query
+        query_tokens = self._tokenize_for_bm25(query)
+        if not query_tokens:
+            return []
+
+        try:
+            # Get BM25 scores from the lazy-loaded index
+            key = (user_id, chat_id)
+            bm25_scores = bm25_index.get_scores(query_tokens)
+
+            # Get top-k indices
+            top_indices = np.argsort(bm25_scores)[::-1][:top_k * 2]
+
+            results = []
+            for idx in top_indices:
+                score = float(bm25_scores[idx])
+
+                # Apply minimum score threshold
+                if score < min_score:
+                    continue
+
+                # Map BM25 doc index to vector index
+                if (key in self.bm25_doc_to_vector and
+                        idx < len(self.bm25_doc_to_vector[key])):
+
+                    vector_idx = self.bm25_doc_to_vector[key][idx]
+                    if vector_idx < len(self.metadata):
+                        meta = self.metadata[vector_idx]
+
+                        # Calculate normalized score (0-1 range)
+                        normalized_score = min(score / 10.0, 1.0) if score > 0 else 0.0
+
+                        results.append({
+                            "id": int(vector_idx),
+                            "text": meta.get("text", ""),
+                            "meta": meta,
+                            "score": normalized_score,
+                            "match_type": "bm25",
+                            "bm25_raw_score": score,
+                            "is_whole_file": meta.get("is_whole_file", False)
+                        })
+
+            # Sort by BM25 score
+            results.sort(key=lambda x: x["score"], reverse=True)
+
+            elapsed = time.time() - start_time
+            logger.debug(f"BM25 search completed in {elapsed:.3f}s: {len(results)} results")
+
+            return results[:top_k]
+
+        except Exception as e:
+            logger.error(f"BM25 search failed: {e}")
+            return []
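
A standalone sketch of the per-session scoring this method performs, using rank_bm25 directly (toy corpus of mine; assumes rank-bm25 is installed):

from rank_bm25 import BM25Okapi

corpus = [
    ["vector", "index", "faiss"],
    ["bm25", "keyword", "search"],
    ["chat", "history", "session"],
]
bm25 = BM25Okapi(corpus)
scores = bm25.get_scores(["keyword", "search"])
best = max(range(len(corpus)), key=lambda i: scores[i])
print(best)  # 1 -- the keyword/search document scores highest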
+
+    # ==================== HYBRID RETRIEVAL ENGINE (UPDATED) ====================
+
+    def hybrid_retrieve(self, query: str, user_id: str, chat_id: str,
+                        filter_type: str = None, top_k: int = 100,
+                        final_k: int = 5, strategy: str = "smart") -> List[Dict[str, Any]]:
+        """
+        HYBRID RETRIEVAL: BM25 + Semantic + Exact Fusion
+        Now with lazy-loaded BM25 indices for memory safety.
+        """
+        logger.info(f"🤖 HYBRID SEARCH: '{query[:80]}...' | Strategy: {strategy}")
+
+        # Classify query type
+        query_category = self._classify_query(query)
+        self.query_types[query_category] += 1
+
+        # Choose strategy based on query type if "smart"
+        if strategy == "smart":
+            if query_category == "code":
+                strategy = "bm25_first"
+            elif query_category == "natural":
+                strategy = "semantic_first"
+            else:
+                strategy = "fusion"
+
+        start_time = time.time()
+
+        # === PHASE 1: GET RESULTS FROM BOTH METHODS ===
+        bm25_results = []
+        semantic_results = []
+
+        if strategy in ["bm25_first", "fusion", "weighted", "smart"]:
+            bm25_results = self.bm25_search(
+                query=query,
+                user_id=user_id,
+                chat_id=chat_id,
+                top_k=top_k * 2,
+                min_score=0.1
+            )
+
+        if strategy in ["semantic_first", "fusion", "weighted", "smart"]:
+            semantic_results = self._semantic_search(
+                query=query,
+                user_id=user_id,
+                chat_id=chat_id,
+                filter_type=filter_type,
+                top_k=top_k * 2,
+                min_score=0.1,
+                final_k=top_k
+            )
+
+        # === PHASE 2: APPLY STRATEGY ===
+        if strategy == "bm25_first":
+            results = self._bm25_first_fusion(bm25_results, semantic_results, final_k)
+        elif strategy == "semantic_first":
+            results = self._semantic_first_fusion(semantic_results, bm25_results, final_k)
+        elif strategy == "fusion":
+            results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
+        elif strategy == "weighted":
+            results = self._weighted_fusion(bm25_results, semantic_results, final_k)
+        else:
+            # Default to fusion
+            results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
+
+        # === PHASE 3: EXACT FALLBACK IF NO RESULTS ===
+        if not results:
+            logger.info("🔄 No hybrid results, trying exact fallback...")
+            results = self.retrieve_exact(
+                query=query,
+                user_id=user_id,
+                chat_id=chat_id,
+                filter_type=filter_type,
+                aggressive=True
+            )
+            if results:
+                self.performance_stats["fallback_matches"] += 1
+                return results[:final_k]
+
+        # === PHASE 4: SMART RERANKING ===
+        if results and len(results) > 1:
+            try:
+                results = self._smart_rerank(query, results, final_k)
+            except Exception as e:
+                logger.warning(f"Reranking failed: {e}")
+
+        # === PHASE 5: FINAL PROCESSING ===
+        elapsed = time.time() - start_time
+
+        # Boost whole files for complete answers
+        for result in results:
+            if result.get("is_whole_file"):
+                result["score"] = min(result["score"] * 1.2, 1.0)
+
+        # Ensure scores are in 0-1 range
+        for result in results:
+            result["score"] = min(max(result["score"], 0.0), 1.0)
+
+        # Sort by final score
+        results.sort(key=lambda x: x["score"], reverse=True)
+
+        # Update performance stats
+        if results:
+            self.performance_stats["hybrid_matches"] += 1
+            logger.info(f"✅ Hybrid search found {len(results)} results in {elapsed:.3f}s")
+            logger.info(f"🏆 Top score: {results[0]['score']:.3f}, Type: {results[0].get('match_type', 'unknown')}")
+        else:
+            logger.warning(f"❌ Hybrid search found no results")
+
+        return results[:final_k]
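
An illustrative call (the IDs and query are hypothetical; get_vector_db is the module-level singleton factory shown earlier):

db = get_vector_db()
results = db.hybrid_retrieve(
    query="how does delete_session rebuild the index?",
    user_id="user-123",
    chat_id="chat-456",
    filter_type="file",  # restrict to stored file chunks
    final_k=5,
    strategy="smart",    # routes to bm25_first / semantic_first / fusion
)
for r in results:
    print(f"{r['score']:.3f}", r.get("match_type"), r["meta"].get("source"))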
+
+    # ==================== CORE METHODS (PRESERVED WITH FIXES) ====================
+
+    def _chunk_python_ast(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """Enhanced AST chunker with better context preservation"""
+        chunks = []
+        try:
+            tree = ast.parse(text)
+            lines = text.splitlines()
+
+            global_context = []
+
+            for node in tree.body:
+                if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
+                    start = max(0, node.lineno - 4)
+                    end = node.end_lineno + 2
+                    block_text = "\n".join(lines[start:end])
+
+                    chunks.append({
+                        "text": f"File: {filename} | Type: {type(node).__name__} | Name: {node.name} | Content: {block_text}",
+                        "type": "code_function",
+                        "name": node.name,
+                        "line_start": start,
+                        "line_end": end
+                    })
+                elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
+                    if hasattr(node, 'end_lineno'):
+                        start = node.lineno - 1
+                        end = node.end_lineno
+                        global_context.append("\n".join(lines[start:end]))
+
+            # Add global context as a separate chunk
+            if global_context:
+                full_global = "\n".join(global_context)
+                if len(full_global) > 50:
+                    chunks.insert(0, {
+                        "text": f"File: {filename} | Type: imports_and_globals | Content: {full_global[:2000]}",
+                        "type": "code_context",
+                        "name": "imports_and_globals"
+                    })
+
+        except Exception as e:
+            logger.warning(f"AST parsing failed for {filename}: {e}")
+            return self._chunk_text_enhanced(text)
+
+        if not chunks:
+            return self._chunk_text_enhanced(text)
+
+        return chunks
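
A minimal demonstration of the ast facts this chunker relies on: each top-level node carries lineno/end_lineno (Python 3.8+), so a def can be cut out of the source by line span (toy source of mine, not from the commit):

import ast

src = "import os\n\ndef greet(name):\n    return f'hi {name}'\n"
lines = src.splitlines()
for node in ast.parse(src).body:
    if isinstance(node, ast.FunctionDef):
        block = "\n".join(lines[node.lineno - 1:node.end_lineno])
        print(node.name, "->", repr(block))
# greet -> "def greet(name):\n    return f'hi {name}'"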
+
+    def _chunk_smart_code(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """ENHANCED Structure-aware chunker with context preservation"""
+        ext = os.path.splitext(filename)[1].lower()
+        chunks = []
+
+        # Define split patterns for different languages
+        patterns = {
+            '.html': r'(?=\n\s*<[^/])',
+            '.htm': r'(?=\n\s*<[^/])',
+            '.xml': r'(?=\n\s*<[^/])',
+            '.vue': r'(?=\n\s*<[^/])',
+            '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
+            '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
+            '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
+            '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
+            '.css': r'(?=\n\s*[.#@a-zA-Z])',
+            '.scss': r'(?=\n\s*[.#@a-zA-Z])',
+            '.java': r'(?=\n\s*(?:public|private|protected|class|interface|enum|@))',
+            '.cpp': r'(?=\n\s*(?:#include|using|namespace|class|struct|enum|template))',
+        }
+
+        pattern = patterns.get(ext)
+
+        # Fallback to standard if no pattern matches or regex fails
+        if not pattern:
+            return self._chunk_text_enhanced(text)
+
+        try:
+            segments = re.split(pattern, text)
+
+            # Process with CONTEXT OVERLAP for better retrieval
+            current_chunk = ""
+            TARGET_SIZE = 800
+            OVERLAP_SIZE = 100
+
+            for seg_idx, seg in enumerate(segments):
+                if not seg.strip():
+                    continue
+
+                # Check if adding this segment would exceed target
+                if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 50:
+                    # Save current chunk
+                    chunk_text = current_chunk.strip()
+                    if chunk_text:
+                        chunks.append({
+                            "text": f"File: {filename} | Content: {chunk_text}",
+                            "type": "code_block",
+                            "name": f"block_{len(chunks)}",
+                            "context_id": seg_idx
+                        })
+
+                    # Start new chunk with overlap from previous
+                    current_chunk = current_chunk[-OVERLAP_SIZE:] + "\n" + seg if OVERLAP_SIZE > 0 else seg
+                else:
+                    current_chunk += seg
+
+            # Add final chunk
+            if current_chunk:
+                chunks.append({
+                    "text": f"File: {filename} | Content: {current_chunk.strip()}",
+                    "type": "code_block",
+                    "name": f"block_{len(chunks)}",
+                    "context_id": len(segments)
+                })
+
+            return chunks
+        except Exception as e:
+            logger.warning(f"Smart chunking failed for {filename}: {e}. Falling back.")
+            return self._chunk_text_enhanced(text)
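
The split patterns above are zero-width lookaheads, so re.split cuts immediately before each keyword without consuming any text; a small demonstration (toy input of mine):

import re

src = "const a = 1;\nfunction f() {}\nclass C {}\n"
parts = re.split(r'(?=\n\s*(?:function|class|const|let|var))', src)
print(parts)
# ['const a = 1;', '\nfunction f() {}', '\nclass C {}\n']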
+
+    def _chunk_text_enhanced(self, text: str, chunk_size: int = 600, overlap: int = 100) -> List[Dict[str, Any]]:
+        """Enhanced text chunking that preserves natural boundaries"""
+        chunks = []
+
+        # Try to split by paragraphs first
+        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
+
+        if not paragraphs:
+            # Fallback to standard chunking
+            return self._chunk_text_standard(text, chunk_size, overlap)
+
+        current_chunk = ""
+        for para in paragraphs:
+            if len(current_chunk) + len(para) > chunk_size and current_chunk:
+                chunks.append({
+                    "text": current_chunk.strip(),
+                    "type": "text_paragraph",
+                    "name": f"para_{len(chunks)}"
+                })
+                # Keep last overlap portion
+                current_chunk = current_chunk[-overlap:] + "\n\n" + para if overlap > 0 else para
+            else:
+                current_chunk += "\n\n" + para if current_chunk else para
+
+        if current_chunk:
+            chunks.append({
+                "text": current_chunk.strip(),
+                "type": "text_paragraph",
+                "name": f"para_{len(chunks)}"
+            })
+
+        return chunks
+
+    def _chunk_text_standard(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict[str, Any]]:
+        """Standard text chunking with sliding window"""
+        chunks = []
+
+        if len(text) <= chunk_size:
+            return [{
+                "text": text,
+                "type": "text_block",
+                "name": "full_content"
+            }]
+
+        for i in range(0, len(text), chunk_size - overlap):
+            chunk = text[i:i + chunk_size]
+            if len(chunk) > 100:
+                chunks.append({
+                    "text": chunk,
+                    "type": "text_block",
+                    "name": f"chunk_{i//chunk_size}"
+                })
+
+        return chunks
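
The sliding window advances by chunk_size - overlap, so consecutive chunks share `overlap` characters; a worked instance with the default values from the signature:

text = "x" * 1200
chunk_size, overlap = 500, 50
starts = list(range(0, len(text), chunk_size - overlap))
print(starts)  # [0, 450, 900]
print([len(text[i:i + chunk_size]) for i in starts])  # [500, 500, 300]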
+
+    # ==================== HELPER METHODS FOR HYBRID SEARCH ====================
+
+    def _classify_query(self, query: str) -> str:
+        """Classify query type to determine best search strategy"""
+        query_lower = query.lower()
+
+        # Code/technical query indicators
+        code_indicators = [
+            r'def\s+\w+\(', r'class\s+\w+', r'function\s+\w+',
+            r'import\s+', r'from\s+', r'\.py$', r'\.js$', r'\.java$',
+            r'\w+\(.*\)', r'\{.*\}', r'\[.*\]', r'=\s*\w+',
+            r'const\s+', r'let\s+', r'var\s+', r'type\s+',
+            r'interface\s+', r'export\s+', r'async\s+', r'await\s+',
+            r'SELECT\s+', r'FROM\s+', r'WHERE\s+', r'JOIN\s+',
+            r'#include', r'using\s+', r'namespace\s+', r'template\s+'
+        ]
+
+        for pattern in code_indicators:
+            if re.search(pattern, query_lower):
+                return "code"
+
+        # Natural language query indicators
+        natural_indicators = [
+            r'^how\s+', r'^what\s+', r'^why\s+', r'^explain\s+',
+            r'^describe\s+', r'^summarize\s+', r'^tell\s+me\s+about',
+            r'\?$', r'please', r'could you', r'would you',
+            r'understand', r'meaning', r'concept', r'idea'
+        ]
+
+        for pattern in natural_indicators:
+            if re.search(pattern, query_lower):
+                return "natural"
+
+        # Short keyword query (good for BM25)
+        words = query.split()
+        if len(words) <= 4 and len(query) < 30:
+            return "keyword"
+
+        # Mixed query
+        return "mixed"
+
+    def _bm25_first_fusion(self, bm25_results: List[Dict], semantic_results: List[Dict],
+                           final_k: int) -> List[Dict]:
+        """BM25 first, supplement with semantic if needed"""
+        results = bm25_results.copy()
+
+        # If BM25 results are weak, add semantic results
+        if not results or (results[0]["score"] < 0.3):
+            seen_ids = set(r["id"] for r in results)
+            for sem in semantic_results:
+                if sem["id"] not in seen_ids and len(results) < final_k * 2:
+                    seen_ids.add(sem["id"])
+                    sem["match_type"] = "semantic_supplement"
+                    results.append(sem)
+
+        return results[:final_k]
+
+    def _semantic_first_fusion(self, semantic_results: List[Dict], bm25_results: List[Dict],
+                               final_k: int) -> List[Dict]:
+        """Semantic first, supplement with BM25 if needed"""
+        results = semantic_results.copy()
+
+        # If semantic results are weak, add BM25 results
+        if not results or (results[0]["score"] < 0.3):
+            seen_ids = set(r["id"] for r in results)
+            for bm in bm25_results:
+                if bm["id"] not in seen_ids and len(results) < final_k * 2:
+                    seen_ids.add(bm["id"])
+                    bm["match_type"] = "bm25_supplement"
+                    results.append(bm)
+
+        return results[:final_k]
+
+    def _reciprocal_rank_fusion(self, results1: List[Dict], results2: List[Dict],
+                                final_k: int, k: int = 60) -> List[Dict]:
+        """Combine results using Reciprocal Rank Fusion (RRF)"""
+        # Create rank dictionaries
+        rank_map1 = {r["id"]: rank + 1 for rank, r in enumerate(results1)}
+        rank_map2 = {r["id"]: rank + 1 for rank, r in enumerate(results2)}
+
+        # Get all unique IDs
+        all_ids = set(rank_map1.keys()) | set(rank_map2.keys())
+
+        # Calculate RRF scores
+        rrf_scores = []
+        for doc_id in all_ids:
+            score = 0.0
+            if doc_id in rank_map1:
+                score += 1.0 / (rank_map1[doc_id] + k)
+            if doc_id in rank_map2:
+                score += 1.0 / (rank_map2[doc_id] + k)
+            rrf_scores.append((doc_id, score))
+
+        # Sort by RRF score
+        rrf_scores.sort(key=lambda x: x[1], reverse=True)
+
+        # Create result mapping for quick lookup
+        results_map = {}
+        for r in results1 + results2:
+            if r["id"] not in results_map:
+                results_map[r["id"]] = r
+
+        # Build final results
+        combined_results = []
+        for doc_id, rrf_score in rrf_scores:
+            if doc_id in results_map:
+                result = results_map[doc_id].copy()
+                result["score"] = rrf_score
+                result["match_type"] = "rrf_fusion"
+                combined_results.append(result)
+
+        return combined_results[:final_k]
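
A worked instance of the RRF formula with k = 60: a document ranked 1st by BM25 and 3rd by semantic search scores 1/(1+60) + 1/(3+60) ≈ 0.0164 + 0.0159 = 0.0323, while a document found only by BM25 at rank 2 scores 1/(2+60) ≈ 0.0161 -- agreement between the two retrievers outweighs one slightly better rank.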
+
+    def _weighted_fusion(self, bm25_results: List[Dict], semantic_results: List[Dict],
+                         final_k: int, bm25_weight: float = 0.4,
+                         semantic_weight: float = 0.6) -> List[Dict]:
+        """Weighted combination of BM25 and semantic scores"""
+        # Normalize scores within each result set
+        def normalize_scores(results):
+            if not results:
+                return {}
+            max_score = max(r["score"] for r in results) if results else 1.0
+            if max_score == 0:
+                max_score = 1.0
+            return {r["id"]: r["score"] / max_score for r in results}
+
+        bm25_scores = normalize_scores(bm25_results)
+        semantic_scores = normalize_scores(semantic_results)
+
+        # Get all unique IDs
+        all_ids = set(bm25_scores.keys()) | set(semantic_scores.keys())
+
+        # Calculate weighted scores
+        weighted_scores = []
+        for doc_id in all_ids:
+            bm25_score = bm25_scores.get(doc_id, 0.0)
+            semantic_score = semantic_scores.get(doc_id, 0.0)
+            weighted = (bm25_score * bm25_weight) + (semantic_score * semantic_weight)
+            weighted_scores.append((doc_id, weighted))
+
+        # Sort by weighted score
+        weighted_scores.sort(key=lambda x: x[1], reverse=True)
+
+        # Create result mapping
+        results_map = {}
+        for r in bm25_results + semantic_results:
+            if r["id"] not in results_map:
+                results_map[r["id"]] = r
+
+        # Build final results
+        combined_results = []
+        for doc_id, weighted_score in weighted_scores:
+            if doc_id in results_map:
+                result = results_map[doc_id].copy()
+                result["score"] = weighted_score
+                result["match_type"] = "weighted_fusion"
+                combined_results.append(result)
+
+        return combined_results[:final_k]
|
| 819 |
+
|
| 820 |
+
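    # Worked example (illustrative numbers): with the default weights, a doc at
    # normalized BM25 1.0 but semantic 0.0 scores 0.4 * 1.0 = 0.40, while a doc
    # at BM25 0.5 and semantic 0.8 scores 0.4 * 0.5 + 0.6 * 0.8 = 0.68, so the
    # semantic side dominates unless keyword evidence is overwhelming.
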
    def _smart_rerank(self, query: str, candidates: List[Dict], final_k: int) -> List[Dict]:
        """Smart reranking using cross-encoder"""
        if len(candidates) <= 1:
            return candidates

        try:
            # Prepare passages for reranking (cap at 30 candidates, 1000 chars each)
            passages = []
            for cand in candidates[:30]:
                text = cand.get("text", "")
                if len(text) > 1000:
                    text = text[:1000] + "..."

                source = cand.get("meta", {}).get("source", "unknown")
                subtype = cand.get("meta", {}).get("subtype", "general")

                passages.append({
                    "id": cand["id"],
                    "text": f"File: {source} | Type: {subtype} | Content: {text}"
                })

            if not passages:
                return candidates

            # Rerank with FlashRank
            rerank_request = RerankRequest(query=query, passages=passages)
            reranked = self.ranker.rerank(rerank_request)

            # Update scores based on reranking (30% original, 70% reranker)
            rerank_map = {r["id"]: r["score"] for r in reranked}

            for cand in candidates:
                if cand["id"] in rerank_map:
                    cand["score"] = (cand["score"] * 0.3) + (rerank_map[cand["id"]] * 0.7)
                    cand["match_type"] = cand.get("match_type", "unknown") + "_reranked"

            candidates.sort(key=lambda x: x["score"], reverse=True)

            logger.debug(f"Smart reranking applied to {len(candidates)} candidates")

        except Exception as e:
            logger.warning(f"Reranking error: {e}")

        return candidates[:final_k]

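    # Illustration of the 0.3/0.7 blend: a candidate with retrieval score 0.80
    # that the cross-encoder rates 0.95 ends up at 0.8 * 0.3 + 0.95 * 0.7 =
    # 0.905, so the reranker's relevance judgment outweighs the original score
    # without discarding it entirely.
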
    # ==================== COMPATIBILITY METHODS (UPDATED) ====================

    def retrieve_session_context(self, query: str, user_id: str, chat_id: str,
                                 filter_type: str = None, top_k: int = 100,
                                 final_k: int = 5, min_score: float = 0.25,
                                 use_hybrid: bool = True) -> List[Dict[str, Any]]:
        """
        Enhanced retrieval with hybrid capabilities.

        use_hybrid: whether to use hybrid search (BM25 + semantic)
        """
        # Use hybrid search by default if available
        if use_hybrid and BM25_AVAILABLE:
            return self.hybrid_retrieve(
                query=query,
                user_id=user_id,
                chat_id=chat_id,
                filter_type=filter_type,
                top_k=top_k,
                final_k=final_k,
                strategy="smart"
            )

        # Fall back to original semantic search
        return self._semantic_search(
            query=query,
            user_id=user_id,
            chat_id=chat_id,
            filter_type=filter_type,
            top_k=top_k,
            min_score=min_score,
            final_k=final_k
        )

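    # Usage sketch (hypothetical identifiers): existing callers keep this
    # signature; hybrid search kicks in automatically whenever BM25_AVAILABLE
    # is True.
    #
    #     hits = vdb.retrieve_session_context(
    #         "summarize utils.py", user_id="user-123", chat_id="chat-456"
    #     )
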
    def _semantic_search(self, query: str, user_id: str, chat_id: str,
                         filter_type: str = None, top_k: int = 100,
                         min_score: float = 0.25, final_k: int = 10) -> List[Dict[str, Any]]:
        """Core semantic search engine"""
        with self.memory_lock:
            total_vectors = self.index.ntotal
            user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id and m.get("chat_id") == chat_id)

        if total_vectors == 0 or user_vectors == 0:
            return []

        try:
            query_vec = self.embedder.encode([query], show_progress_bar=False)
            faiss.normalize_L2(query_vec)
        except Exception as e:
            logger.error(f"❌ Failed to encode query: {e}")
            return []

        search_k = min(top_k * 2, total_vectors)
        if search_k == 0:
            search_k = min(10, total_vectors)

        try:
            with self.memory_lock:
                if self.index.ntotal == 0:
                    return []
                D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
        except Exception as e:
            logger.error(f"❌ Search failed: {e}")
            return []

        candidates = []
        query_lower = query.lower()

        for i, idx in enumerate(I[0]):
            if idx == -1 or idx >= len(self.metadata):
                continue

            item = self.metadata[idx]

            # Filter by user and chat
            if item.get("user_id") != user_id or item.get("chat_id") != chat_id:
                continue

            # Filter by type if specified
            if filter_type and item.get("type") != filter_type:
                continue

            score = float(D[0][i])

            if np.isnan(score) or np.isinf(score):
                continue

            # Whole-file boosting: force a high score when the query names the file
            is_whole_file = item.get("is_whole_file", False) or item.get("subtype") == "whole_file"
            if is_whole_file:
                filename = item.get("source", "").lower()
                if filename in query_lower or any(word in filename for word in query_lower.split()):
                    score = 2.5

            if item.get("actual_content"):
                item = item.copy()
                item["text"] = item["actual_content"]

            if score < min_score:
                continue

            candidates.append({
                "id": int(idx),
                "text": item.get("text", ""),
                "meta": item,
                "score": score
            })

        return candidates

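    # Design note: the search over-fetches (search_k = 2 * top_k, capped at the
    # index size) because FAISS scans the shared index for all users; hits from
    # other user/chat pairs are filtered out afterwards, so the headroom keeps
    # enough same-session candidates alive.
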
    def retrieve_exact(self, query: str, user_id: str, chat_id: str,
                       filter_type: str = None, aggressive: bool = True) -> List[Dict[str, Any]]:
        """PRIMARY EXACT MATCH RETRIEVAL - Accuracy First!"""
        start_time = time.time()
        query_lower = query.lower().strip()

        if self.index.ntotal == 0 or not user_id:
            logger.warning("❌ Empty index or invalid user_id")
            return []

        logger.info(f"🎯 EXACT MODE: Searching for '{query[:80]}...'")

        all_candidates = []
        exact_matches = []

        # TACTIC 1: BRUTE FORCE SUBSTRING SEARCH
        logger.debug("🔍 Tactic 1: Brute force substring search...")
        with self.memory_lock:
            for idx, meta in enumerate(self.metadata):
                if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
                    continue

                if filter_type and meta.get("type") != filter_type:
                    continue

                text = meta.get("text", "").lower()
                actual_content = meta.get("actual_content", "").lower()

                if query_lower in text or query_lower in actual_content:
                    score = 3.0
                    match_type = "exact_substring"

                    display_text = meta.get("actual_content", meta.get("text", ""))

                    exact_matches.append({
                        "id": idx,
                        "text": display_text,
                        "meta": meta,
                        "score": score,
                        "match_type": match_type,
                        "confidence": "perfect"
                    })

        if exact_matches:
            logger.info(f"✨ Found {len(exact_matches)} PERFECT exact matches!")
            self.performance_stats["exact_matches"] += 1

            # Prefer whole-file chunks, then higher scores
            exact_matches.sort(key=lambda x: (
                1 if x["meta"].get("is_whole_file") else 0,
                x["score"]
            ), reverse=True)

            elapsed = time.time() - start_time
            logger.info(f"⚡ Exact match retrieval took {elapsed:.3f}s")
            return exact_matches[:3]

        # TACTIC 2: KEYWORD MATCHING
        if aggressive:
            logger.debug("🔍 Tactic 2: Aggressive keyword matching...")
            keywords = re.findall(r'\b\w{3,}\b', query_lower)

            if keywords:
                with self.memory_lock:
                    for idx, meta in enumerate(self.metadata):
                        if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
                            continue
                        if filter_type and meta.get("type") != filter_type:
                            continue

                        text = meta.get("text", "").lower()
                        keyword_matches = sum(1 for kw in keywords if kw in text)

                        if keyword_matches >= max(1, len(keywords) * 0.6):
                            score = 2.0 + (keyword_matches / len(keywords)) * 0.5
                            all_candidates.append({
                                "id": idx,
                                "text": meta.get("actual_content", meta.get("text", "")),
                                "meta": meta,
                                "score": score,
                                "match_type": "keyword_explosion",
                                "keyword_match_ratio": keyword_matches / len(keywords)
                            })

        # TACTIC 3: SEMANTIC SEARCH WITH LOW THRESHOLD
        logger.debug("🔍 Tactic 3: Semantic search with low threshold...")
        semantic_results = self._semantic_search(
            query=query,
            user_id=user_id,
            chat_id=chat_id,
            filter_type=filter_type,
            top_k=200,
            min_score=0.1,
            final_k=30
        )

        for res in semantic_results:
            res["match_type"] = "semantic_low_threshold"
            all_candidates.append(res)

        # DEDUPLICATE AND RANK
        seen_ids = set()
        unique_candidates = []

        for candidate in all_candidates:
            if candidate["id"] not in seen_ids:
                seen_ids.add(candidate["id"])
                unique_candidates.append(candidate)

        unique_candidates.sort(key=lambda x: x["score"], reverse=True)

        # Apply reranking if available
        if unique_candidates:
            try:
                passages = []
                for cand in unique_candidates[:50]:
                    text_for_rerank = cand["text"]
                    if len(text_for_rerank) > 1000:
                        text_for_rerank = text_for_rerank[:1000] + "..."

                    passages.append({
                        "id": cand["id"],
                        "text": text_for_rerank
                    })

                if passages:
                    rerank_request = RerankRequest(query=query, passages=passages)
                    reranked = self.ranker.rerank(rerank_request)

                    rerank_map = {r["id"]: r["score"] for r in reranked}
                    for cand in unique_candidates:
                        if cand["id"] in rerank_map:
                            cand["score"] = cand["score"] * 0.3 + rerank_map[cand["id"]] * 0.7

                    unique_candidates.sort(key=lambda x: x["score"], reverse=True)

            except Exception as e:
                logger.warning(f"⚠️ Reranking failed: {e}")

        # FINAL SELECTION
        final_results = []
        confidence_threshold = 0.4 if aggressive else 0.5

        for cand in unique_candidates[:10]:
            if cand["score"] >= confidence_threshold:
                final_results.append(cand)

        if final_results:
            self.performance_stats["semantic_matches"] += 1
            logger.info(f"✅ Found {len(final_results)} relevant results")

            top_match = final_results[0]
            logger.info(f"🏆 Top match: Score={top_match['score']:.3f}, Type={top_match.get('match_type', 'unknown')}")

            if top_match["meta"].get("is_whole_file"):
                logger.info(f"📄 Returning whole file: {top_match['meta'].get('source', 'unknown')}")

        elapsed = time.time() - start_time
        logger.info(f"⏱️ Exact retrieval completed in {elapsed:.3f}s")

        # Store in query history
        self.query_history.append({
            "query": query[:100],
            "timestamp": time.time(),
            "results_count": len(final_results),
            "top_score": final_results[0]["score"] if final_results else 0,
            "elapsed_time": elapsed,
            "method": "exact"
        })

        if len(self.query_history) > 1000:
            self.query_history = self.query_history[-500:]

        return final_results[:5]

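    # Usage sketch (hypothetical identifiers): exact mode short-circuits on a
    # literal substring hit and only then falls back to keyword and semantic
    # tactics.
    #
    #     hits = vdb.retrieve_exact("def _save_index", user_id="u1", chat_id="c1")
    #     # hits[0]["match_type"] == "exact_substring" when a verbatim match exists
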
    # ==================== INFRASTRUCTURE METHODS ====================

    def _load_or_create_index(self):
        """Thread-safe and process-safe index loading/creation"""
        # … (unchanged lines collapsed in the diff view)
        try:
            logger.info("📂 Loading existing vector index...")
            self.index = faiss.read_index(self.index_path)

            if self.index.ntotal < 0:
                raise ValueError("Corrupt index: negative vector count")

            with open(self.metadata_path, "rb") as f:
                self.metadata = pickle.load(f)

            # A count mismatch means index and metadata files are out of sync,
            # so rebuild rather than serve misaligned results
            if len(self.metadata) != self.index.ntotal:
                logger.error(f"⚠️ Metadata mismatch: {len(self.metadata)} entries vs {self.index.ntotal} vectors. Rebuilding...")
                self._create_new_index()
                return

            logger.info(f"✅ Loaded index with {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
        except Exception as e:
            logger.error(f"⚠️ Failed to load index: {e}. Creating new one.")
            # … (unchanged lines collapsed in the diff view)
            self._create_new_index()

    def _create_new_index(self):
        """Create fresh IndexFlatIP for cosine similarity"""
        dimension = 384
        self.index = faiss.IndexFlatIP(dimension)
        self.metadata = []
        logger.info(f"🆕 Created new IndexFlatIP with dimension {dimension}")

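    # Note: IndexFlatIP scores by raw inner product; because both stored vectors
    # and query vectors are first run through faiss.normalize_L2, that inner
    # product equals cosine similarity (identical embeddings score 1.0,
    # orthogonal ones 0.0).
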
    def _save_index(self):
        """Thread-safe and process-safe index saving with atomic writes"""
        with self.file_lock:
            temp_index = f"{self.index_path}.tmp"
            temp_meta = f"{self.metadata_path}.tmp"

            try:
                faiss.write_index(self.index, temp_index)
                with open(temp_meta, "wb") as f:
                    pickle.dump(self.metadata, f)

                os.replace(temp_index, self.index_path)
                os.replace(temp_meta, self.metadata_path)

                logger.info(f"💾 Saved index: {self.index.ntotal} vectors, {len(self.metadata)} metadata entries")
            except Exception as e:
                logger.error(f"❌ Failed to save index: {e}")
                # Clean up any leftover temp files
                for f in [temp_index, temp_meta]:
                    if os.path.exists(f):
                        try:
                            os.remove(f)
                        except Exception:
                            logger.warning(f"Failed to remove temp file: {f}")
            finally:
                gc.collect()

    def _rollback_partial_storage(self, user_id: str, chat_id: str):
        """Remove partially stored vectors for a session"""
        try:
            new_metadata = []
            surviving_texts = []

            for meta in self.metadata:
                if meta.get("user_id") != user_id or meta.get("chat_id") != chat_id:
                    new_metadata.append(meta)
                    surviving_texts.append(meta["text"])

            if len(new_metadata) == len(self.metadata):
                return

            if surviving_texts:
                embeddings = self.embedder.encode(surviving_texts, show_progress_bar=False)
                faiss.normalize_L2(embeddings)

                new_index = faiss.IndexFlatIP(384)
                new_index.add(np.array(embeddings).astype('float32'))
                self.index = new_index
            else:
                self.index = faiss.IndexFlatIP(384)

            self.metadata = new_metadata
            self._save_index()

            # Invalidate BM25 cache
            self._invalidate_bm25_cache(user_id, chat_id)

            logger.info(f"🔄 Rolled back partial storage for user {user_id[:8]}")

        except Exception as e:
            logger.error(f"❌ Rollback failed: {e}")
            self._create_new_index()

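    # Design note: rollback rebuilds the whole index from the surviving texts
    # instead of deleting vectors in place; re-encoding costs time, but it keeps
    # FAISS row order and self.metadata aligned by construction.
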
    def _verify_storage(self, user_id: str, chat_id: str, expected_count: int):
        """Verify vectors were stored correctly"""
        with self.memory_lock:
            user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id and m.get("chat_id") == chat_id)

        logger.info(f"🔍 Storage verification: User {user_id[:8]} has {user_vectors} vectors (expected: {expected_count})")

        if user_vectors < expected_count:
            logger.warning(f"⚠️ Storage mismatch for user {user_id[:8]}")

    # ==================== ANALYTICS & ADMIN METHODS ====================

    def get_retrieval_analytics(self, query: str = None) -> Dict[str, Any]:
        """Get detailed analytics about retrieval performance"""
        analytics = {
            "performance_stats": self.performance_stats.copy(),
            "query_types": dict(self.query_types),
            "query_history_count": len(self.query_history),
            "index_stats": {
                "total_vectors": self.index.ntotal,
                "metadata_count": len(self.metadata),
                "avg_metadata_size": 0,
                "bm25_cache_size": len(self.bm25_indices),
                "bm25_cache_capacity": self.bm25_cache_size,
                "bm25_available": BM25_AVAILABLE,
                "nltk_available": NLTK_AVAILABLE
            },
            "recent_queries": [],
            "cache_stats": {
                "bm25_cache_hits": 0,  # could be tracked with more instrumentation
                "bm25_cache_misses": 0
            }
        }

        if self.metadata:
            total_text_size = sum(len(m.get("text", "")) for m in self.metadata)
            analytics["index_stats"]["avg_metadata_size"] = total_text_size / len(self.metadata)

        for qh in self.query_history[-10:]:
            analytics["recent_queries"].append({
                "query_preview": qh.get("query", "")[:50],
                "results": qh.get("results_count", 0),
                "top_score": qh.get("top_score", 0),
                "elapsed": qh.get("elapsed_time", 0),
                "method": qh.get("method", "unknown")
            })

        if query:
            query_lower = query.lower()
            keyword_matches = defaultdict(int)

            for meta in self.metadata:
                text = meta.get("text", "").lower()
                for word in re.findall(r'\b\w{3,}\b', query_lower):
                    if word in text:
                        keyword_matches[word] += 1

            analytics["query_analysis"] = {
                "query_length": len(query),
                "word_count": len(query.split()),
                "keyword_frequency": dict(keyword_matches),
                "has_file_reference": bool(re.search(r'\.(?:py|js|html|css|ts|java|cpp)', query, re.I)),
                "classified_as": self._classify_query(query)
            }

        return analytics

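    # Usage sketch: passing a query adds a per-query breakdown on top of the
    # global stats.
    #
    #     stats = vdb.get_retrieval_analytics("vector.py error")
    #     print(stats["index_stats"]["total_vectors"],
    #           stats["query_analysis"]["classified_as"])
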
    def store_chat_context(self, messages: list, user_id: str, chat_id: str) -> bool:
        """Store chat history as session memory"""
        if not messages or not user_id:
            return False

        conversation = ""
        for msg in messages[-10:]:
            role = msg.get("role", "unknown")
            content = msg.get("content", "")
            if content:
                conversation += f"{role}: {content}\n"  # reconstructed; exact format collapsed in diff view

        if len(conversation) < 50:
            return False

        chunks = self._chunk_text_enhanced(conversation, chunk_size=800, overlap=100)

        if not chunks:
            return False

        texts = [c["text"] for c in chunks]
        metadata_list = []
        for i, chunk in enumerate(chunks):  # loop head reconstructed; other fields collapsed in diff view
            metadata_list.append({
                # … (remaining metadata fields collapsed in the diff view)
                "chunk_index": i
            })

        try:
            embeddings = self.embedder.encode(texts, show_progress_bar=False)
            faiss.normalize_L2(embeddings)

            with self.memory_lock:
                # assumed from the other storage paths; original line collapsed in diff view
                self.index.add(np.array(embeddings).astype('float32'))
                self.metadata.extend(metadata_list)
                self._save_index()

            # Invalidate BM25 cache for this session
            self._invalidate_bm25_cache(user_id, chat_id)

            logger.info(f"💭 Stored {len(texts)} chat history chunks for user {user_id[:8]}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to store chat history: {e}")
            return False

    def delete_session(self, user_id: str, chat_id: str) -> bool:
        """Surgical Strike: Permanently remove ONLY one specific session"""
        with self.memory_lock:
            new_metadata = []
            removed_count = 0

            for meta in self.metadata:
                if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
                    removed_count += 1
                else:
                    new_metadata.append(meta)

            if removed_count == 0:
                logger.info(f"ℹ️ No vectors to delete for session {chat_id}")
                return False

            logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")

            if not new_metadata:
                self.index = faiss.IndexFlatIP(384)
            else:
                surviving_texts = [m["text"] for m in new_metadata]
                try:
                    embeddings = self.embedder.encode(surviving_texts, show_progress_bar=False)
                    faiss.normalize_L2(embeddings)

                    new_index = faiss.IndexFlatIP(384)
                    new_index.add(np.array(embeddings).astype('float32'))
                    self.index = new_index
                except Exception as e:
                    logger.error(f"❌ Rebuild failed: {e}")
                    return False

            self.metadata = new_metadata
            self._save_index()

            # Invalidate BM25 cache for this session
            self._invalidate_bm25_cache(user_id, chat_id)

            logger.info(f"✅ Successfully deleted session {chat_id}")
            return True

    def get_user_stats(self, user_id: str) -> Dict[str, Any]:
        """Get statistics for a user's sessions"""
        with self.memory_lock:
            user_vectors = []
            for vec_id, meta in enumerate(self.metadata):
                if meta.get("user_id") == user_id:
                    user_vectors.append((vec_id, meta))

            stats = {
                "user_id": user_id,
                "total_vectors": len(user_vectors),
                "by_type": {},
                "by_source": {},
                "sessions": {},
                "bm25_cached": False
            }

            for vec_id, vec in user_vectors:
                vec_type = vec.get("type", "unknown")
                source = vec.get("source", "unknown")
                chat_id = vec.get("chat_id", "unknown")

                stats["by_type"][vec_type] = stats["by_type"].get(vec_type, 0) + 1
                stats["by_source"][source] = stats["by_source"].get(source, 0) + 1
                stats["sessions"][chat_id] = stats["sessions"].get(chat_id, 0) + 1

            # Check if any of the user's sessions has a BM25 index in cache
            for chat_id in stats["sessions"]:
                key = (user_id, chat_id)
                if key in self.bm25_indices:
                    stats["bm25_cached"] = True
                    break

            return stats

    def cleanup_old_sessions(self, max_age_hours: int = 24) -> int:
        """Clean up old session data"""
        current_time = time.time()
        cutoff = current_time - (max_age_hours * 3600)

        with self.memory_lock:
            old_metadata = []
            new_metadata = []
            affected_sessions = set()

            for meta in self.metadata:
                if meta.get("timestamp", 0) < cutoff:
                    old_metadata.append(meta)
                    user_id = meta.get("user_id")
                    chat_id = meta.get("chat_id")
                    if user_id and chat_id:
                        affected_sessions.add((user_id, chat_id))
                else:
                    new_metadata.append(meta)

            if not old_metadata:
                return 0

            logger.info(f"🧹 Cleaning up {len(old_metadata)} old vectors...")

            recent_texts = [m["text"] for m in new_metadata]

            if recent_texts:
                try:
                    embeddings = self.embedder.encode(recent_texts, show_progress_bar=False)
                    faiss.normalize_L2(embeddings)

                    self.index = faiss.IndexFlatIP(384)
                    self.index.add(np.array(embeddings).astype('float32'))
                except Exception as e:
                    logger.error(f"❌ Failed to rebuild index: {e}")
                    return 0
            else:
                self.index = faiss.IndexFlatIP(384)

            self.metadata = new_metadata
            self._save_index()

            # Remove affected sessions from the BM25 cache
            for key in affected_sessions:
                self._invalidate_bm25_cache(*key)

            logger.info(f"✅ Cleanup complete. Removed {len(old_metadata)} vectors.")
            return len(old_metadata)

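    # Usage sketch: a periodic maintenance job might call
    # cleanup_old_sessions(max_age_hours=24); the return value is the number of
    # vectors dropped, and affected sessions also lose their cached BM25 index.
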
    def _cleanup(self):
        # … (unchanged line collapsed in the diff view)
        try:
            if hasattr(self, 'file_lock'):
                self.file_lock.release()
            gc.collect()
        except Exception as e:
            logger.warning(f"Cleanup warning: {e}")


# Global instance (singleton pattern)
_vdb_instance = None
_vdb_lock = threading.Lock()


def get_vector_db(index_path: str = "faiss_session_index.bin", metadata_path: str = "session_metadata.pkl") -> VectorDatabase:
    """Singleton factory for VectorDatabase with thread-safe initialization"""
    global _vdb_instance
    if _vdb_instance is None:
        with _vdb_lock:
            if _vdb_instance is None:
                _vdb_instance = VectorDatabase(index_path, metadata_path)
    return _vdb_instance
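
# Design note: get_vector_db() uses double-checked locking; the unlocked fast
# path keeps repeat calls cheap, and the second check inside _vdb_lock ensures
# only one VectorDatabase is ever constructed across threads.
#
# Usage sketch (hypothetical identifiers):
#
#     vdb = get_vector_db()
#     hits = vdb.retrieve_session_context(
#         "summarize utils.py", user_id="user-123", chat_id="chat-456", final_k=5
#     )
#     for h in hits:
#         print(round(h["score"], 3), h["meta"].get("source"))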

# For backward compatibility