Spaces:
Running
Running
Update vector.py
Browse files
vector.py
CHANGED
|
@@ -191,7 +191,7 @@ class VectorDatabase:
|
|
| 191 |
|
| 192 |
# ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
|
| 193 |
|
| 194 |
-
def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str) -> bool:
|
| 195 |
"""Store extracted file content with enhanced chunking and cache invalidation"""
|
| 196 |
if not text or len(text) < 10 or not user_id:
|
| 197 |
logger.warning(f"Invalid input for {filename}")
|
|
@@ -233,6 +233,7 @@ class VectorDatabase:
|
|
| 233 |
final_meta.append({
|
| 234 |
"text": chunk["text"],
|
| 235 |
"source": filename,
|
|
|
|
| 236 |
"type": "file",
|
| 237 |
"subtype": chunk.get("type", "general"),
|
| 238 |
"name": chunk.get("name", "unknown"),
|
|
@@ -249,6 +250,7 @@ class VectorDatabase:
|
|
| 249 |
"text": whole_file_text,
|
| 250 |
"actual_content": text,
|
| 251 |
"source": filename,
|
|
|
|
| 252 |
"type": "file",
|
| 253 |
"subtype": "whole_file",
|
| 254 |
"is_whole_file": True,
|
|
@@ -295,6 +297,52 @@ class VectorDatabase:
|
|
| 295 |
self._rollback_partial_storage(user_id, chat_id)
|
| 296 |
return False
|
| 297 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
# ==================== UPDATED BM25 SEARCH WITH LAZY LOADING ====================
|
| 299 |
|
| 300 |
def bm25_search(self, query: str, user_id: str, chat_id: str,
|
|
|
|
| 191 |
|
| 192 |
# ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
|
| 193 |
|
| 194 |
+
def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str, file_id: str = None) -> bool:
|
| 195 |
"""Store extracted file content with enhanced chunking and cache invalidation"""
|
| 196 |
if not text or len(text) < 10 or not user_id:
|
| 197 |
logger.warning(f"Invalid input for {filename}")
|
|
|
|
| 233 |
final_meta.append({
|
| 234 |
"text": chunk["text"],
|
| 235 |
"source": filename,
|
| 236 |
+
"file_id": file_id,
|
| 237 |
"type": "file",
|
| 238 |
"subtype": chunk.get("type", "general"),
|
| 239 |
"name": chunk.get("name", "unknown"),
|
|
|
|
| 250 |
"text": whole_file_text,
|
| 251 |
"actual_content": text,
|
| 252 |
"source": filename,
|
| 253 |
+
"file_id": file_id,
|
| 254 |
"type": "file",
|
| 255 |
"subtype": "whole_file",
|
| 256 |
"is_whole_file": True,
|
|
|
|
| 297 |
self._rollback_partial_storage(user_id, chat_id)
|
| 298 |
return False
|
| 299 |
|
| 300 |
+
def delete_file(self, user_id: str, chat_id: str, file_id: str) -> bool:
    """Remove every stored chunk that belongs to one specific file.

    An entry is removed only when its ``user_id``, ``chat_id`` and
    ``file_id`` metadata values all equal the arguments. The FAISS index is
    then rebuilt from the surviving entries' ``"text"`` fields, the index is
    saved, and the BM25 cache for this user/chat is invalidated.

    Args:
        user_id: Owner of the chunks to remove.
        chat_id: Chat session the chunks were stored under.
        file_id: Identifier stored in each chunk's ``"file_id"`` field.
            Must be non-empty.

    Returns:
        True if at least one chunk was removed and the index was rebuilt.
        False if ``file_id`` is empty, nothing matched, or the rebuild
        failed (in which case index and metadata are left untouched).
    """
    # A missing file_id must never behave like a wildcard: entries stored
    # without a "file_id" key yield None from .get(), so comparing against
    # None would match (and delete) all of them for this user/chat.
    if not file_id:
        logger.warning(f"Invalid file_id for deletion: {file_id!r}")
        return False

    with self.memory_lock:
        # Keep everything that does NOT match User + Chat + FileID.
        new_metadata = [
            meta for meta in self.metadata
            if not (meta.get("user_id") == user_id and
                    meta.get("chat_id") == chat_id and
                    meta.get("file_id") == file_id)
        ]
        removed_count = len(self.metadata) - len(new_metadata)

        if removed_count == 0:
            logger.info(f"ℹ️ No vectors found for file_id {file_id}")
            return False

        logger.info(f"🧹 Surgically removing {removed_count} vectors for file {file_id}...")

        # Rebuild the index from scratch using only the surviving entries.
        try:
            if not new_metadata:
                # Nothing survives: start over with an empty index of the
                # same dimension as the current one (384 as a fallback).
                new_index = faiss.IndexFlatIP(getattr(self.index, "d", 384))
            else:
                # Reading "text" inside the try means a malformed entry
                # takes the same logged failure path as any other rebuild
                # error instead of raising out of the method.
                surviving_texts = [m["text"] for m in new_metadata]
                embeddings = self.embedder.encode(surviving_texts, show_progress_bar=False)

                # faiss.normalize_L2 works in place on a C-contiguous
                # float32 matrix, so coerce before normalizing.
                vectors = np.ascontiguousarray(embeddings, dtype="float32")
                faiss.normalize_L2(vectors)

                new_index = faiss.IndexFlatIP(vectors.shape[1])
                new_index.add(vectors)
        except Exception as e:
            # Index and metadata are only swapped after a successful
            # rebuild, so a failure here leaves them consistent.
            logger.error(f"❌ Rebuild failed during file deletion: {e}")
            return False

        self.index = new_index
        self.metadata = new_metadata
        # NOTE(review): persistence and cache invalidation are kept under
        # memory_lock here — confirm this matches the intended nesting.
        self._save_index()

        # Invalidate Cache
        self._invalidate_bm25_cache(user_id, chat_id)

        logger.info(f"✅ Successfully deleted file {file_id}")
        return True
|
| 346 |
# ==================== UPDATED BM25 SEARCH WITH LAZY LOADING ====================
|
| 347 |
|
| 348 |
def bm25_search(self, query: str, user_id: str, chat_id: str,
|