skillsync-backend / app /vectorstore /faiss_store.py
GitHub Actions
sync: github commit e4109213b5cedf256d6e30f65518976b7d530541 to HF Space
19dc325
Raw
History Blame Contribute Delete
4.67 kB
import faiss
import numpy as np
import pickle
import os
from typing import List, Dict, Optional
class FAISSStore:
    """Singleton wrapper around a flat L2 FAISS index with per-vector metadata.

    Vectors live in ``self.index`` (a ``faiss.IndexFlatL2``); metadata lives in
    ``self.doc_map``, keyed by the sequential row id FAISS assigns on ``add``.
    Both are persisted by :meth:`save` -- the index at ``index_path`` and the
    pickled metadata map at ``index_path + ".meta"`` -- and restored lazily on
    first use via :meth:`initialize`.

    Deletion is "soft": :meth:`delete_by_resume_id` only drops metadata, the
    vectors stay in the index and are skipped by :meth:`search`.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Every construction returns the same object; __init__ guards against
        # re-initialising it.
        if cls._instance is None:
            cls._instance = super(FAISSStore, cls).__new__(cls)
        return cls._instance

    def __init__(self, dimension: int = 384, index_path: str = "faiss_index.bin"):
        """Set up an empty in-memory index (only on the first construction).

        Args:
            dimension: Dimensionality of the vectors stored in the index.
            index_path: File the FAISS index is written to / read from; the
                metadata sidecar uses the same path with a ``.meta`` suffix.
        """
        # Singleton init check: a second FAISSStore(...) call must not wipe
        # the state of the already-initialised instance.
        if hasattr(self, 'index'):
            return

        self.dimension = dimension
        self.index_path = index_path
        self.index = faiss.IndexFlatL2(dimension)
        self.doc_map: Dict[int, Dict] = {}  # Map FAISS ID to metadata
        self.current_id = 0  # Next FAISS row id; mirrors index.ntotal
        self._loaded = False

    def initialize(self):
        """Explicitly load the persisted index (no-op once loaded)."""
        if not self._loaded:
            self.load()
            self._loaded = True

    def _ensure_loaded(self):
        """Load persisted state on first use."""
        if not self._loaded:
            self.initialize()

    def add_vectors(self, vectors: List[List[float]], metadatas: Optional[List[dict]] = None):
        """Append vectors to the index, record their metadata, and persist.

        Args:
            vectors: One vector per row. A single flat vector is treated as
                one row. ``None`` or an empty sequence is a no-op.
            metadatas: Either a list with one dict per vector, or a single
                dict / one-element list that is shared by every vector in
                this call (e.g. one resume split into N chunks). If the list
                length does not match the number of vectors, its first
                element is used for all of them. When falsy, the vectors are
                stored without metadata and are never returned by ``search``.
        """
        self._ensure_loaded()
        if vectors is None:
            return

        vectors_np = np.atleast_2d(np.asarray(vectors, dtype='float32'))
        if vectors_np.size == 0:
            return
        num_vectors = vectors_np.shape[0]

        # IndexFlatL2 numbers rows sequentially, so the id of the first new
        # row is the current row count. Reading it from the index (rather
        # than trusting self.current_id) keeps doc_map aligned with FAISS
        # even if the counter is stale, e.g. right after a reload.
        first_id = int(self.index.ntotal)

        # Add to index
        self.index.add(np.ascontiguousarray(vectors_np))

        # Map IDs to metadata
        if metadatas:
            if isinstance(metadatas, dict):
                # A bare dict is shared by every vector in this call.
                metadatas = [metadatas]
            one_per_vector = len(metadatas) == num_vectors
            for offset in range(num_vectors):
                meta = metadatas[offset] if one_per_vector else metadatas[0]
                self.doc_map[first_id + offset] = meta

        self.current_id = first_id + num_vectors
        self.save()  # Persist changes

    def get_all_resumes(self) -> List[Dict]:
        """Returns a list of unique resumes stored in the index.

        Uniqueness is by the ``"resume_id"`` metadata key; the first metadata
        dict seen for each id is returned. Entries without a truthy
        ``"resume_id"`` are skipped.
        """
        self._ensure_loaded()
        unique_resumes = {}
        for meta in self.doc_map.values():
            r_id = meta.get("resume_id")
            if r_id and r_id not in unique_resumes:
                unique_resumes[r_id] = meta
        return list(unique_resumes.values())

    def delete_by_resume_id(self, resume_id: str) -> bool:
        """
        Soft delete by removing from doc_map.

        Note: FAISS IndexFlatL2 doesn't support easy row deletion without rebuilding.
        For a prototype, we just remove metadata so it won't be returned in search results or lists.
        Ideally we would rebuild the index, but that requires storing raw vectors separately.

        Returns:
            True if at least one entry was removed (and the store was saved),
            False if nothing matched.
        """
        self._ensure_loaded()
        keys_to_remove = [k for k, v in self.doc_map.items() if v.get("resume_id") == resume_id]
        for k in keys_to_remove:
            del self.doc_map[k]

        if keys_to_remove:
            self.save()  # Persist changes
        return len(keys_to_remove) > 0

    def _collect_hits(self, distances, indices, filter_resume_id: Optional[str]) -> List[tuple]:
        """Turn one row of FAISS results into (distance, id, metadata) tuples.

        Skips padding ids (-1), ids whose metadata is missing or empty
        (soft-deleted or never given metadata), and -- when a filter is
        set -- entries belonging to another resume.
        """
        hits = []
        for dist, idx in zip(distances, indices):
            if idx == -1:
                continue
            metadata = self.doc_map.get(int(idx))
            if not metadata:
                continue
            if filter_resume_id and metadata.get("resume_id") != filter_resume_id:
                continue
            hits.append((dist, idx, metadata))
        return hits

    def search(self, query_vector: List[float], k: int = 5, filter_resume_id: Optional[str] = None):
        """Return up to ``k`` nearest stored vectors that still have metadata.

        Args:
            query_vector: The query embedding (a single vector).
            k: Maximum number of results. ``k <= 0`` yields an empty list.
            filter_resume_id: When set, only entries whose ``"resume_id"``
                metadata equals this value are returned.

        Returns:
            A list of ``(distance, faiss_id, metadata)`` tuples in the order
            FAISS returned them (nearest first), at most ``k`` long.
        """
        self._ensure_loaded()
        total = int(self.index.ntotal)
        if k <= 0 or total == 0:
            return []

        query_np = np.asarray(query_vector, dtype='float32').reshape(1, -1)

        # Soft-deleted rows and filtered-out resumes still occupy slots in the
        # FAISS result, so start by over-fetching (k * 3) and keep widening
        # until we have k valid hits or have scanned the whole index.
        fetch = min(total, k * 3)
        while True:
            distances, indices = self.index.search(query_np, fetch)
            results = self._collect_hits(distances[0], indices[0], filter_resume_id)
            if len(results) >= k or fetch >= total:
                # Return only top k valid results
                return results[:k]
            fetch = min(total, fetch * 2)

    def save(self):
        """Write the FAISS index and the pickled metadata map to disk."""
        faiss.write_index(self.index, self.index_path)
        with open(self.index_path + ".meta", "wb") as f:
            pickle.dump(self.doc_map, f)

    def load(self):
        """Restore the index and metadata from disk if the files exist.

        Missing files are not an error: whatever is absent keeps its current
        in-memory value. The id counter is re-synchronised with the number of
        rows in the (possibly just loaded) index.
        """
        if os.path.exists(self.index_path):
            self.index = faiss.read_index(self.index_path)

        meta_path = self.index_path + ".meta"
        if os.path.exists(meta_path):
            # NOTE(security): pickle.load executes arbitrary code from the
            # file. This is only safe while the .meta file is written solely
            # by save(); do not point index_path at untrusted data.
            with open(meta_path, "rb") as f:
                self.doc_map = pickle.load(f)

        # Without this, a freshly loaded store would restart ids at 0 and
        # overwrite the metadata of already-indexed vectors on the next add.
        self.current_id = int(self.index.ntotal)