import os
import pickle
from typing import Dict, List, Optional

import faiss
import numpy as np


class FAISSStore:
    """Process-wide singleton wrapping a FAISS ``IndexFlatL2`` plus metadata.

    Vectors live in the FAISS index; ``doc_map`` maps each FAISS row id to the
    metadata dict supplied when the vector was added.  Both are persisted to
    disk (``index_path`` and ``index_path + ".meta"``) after every mutation
    and are loaded lazily on first use.

    Because the class is a singleton, constructor arguments passed after the
    first instantiation are ignored.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first call."""
        if cls._instance is None:
            cls._instance = super(FAISSStore, cls).__new__(cls)
        return cls._instance

    def __init__(self, dimension: int = 384, index_path: str = "faiss_index.bin"):
        """Set up an empty in-memory index (only on first construction).

        Args:
            dimension: Dimensionality of the vectors stored in the index.
            index_path: File the FAISS index is written to / read from; the
                metadata map is stored next to it with a ``.meta`` suffix.
        """
        # __init__ runs on every FAISSStore(...) call; skip re-initialisation
        # so the shared instance's state is not wiped.
        if hasattr(self, 'index'):
            return

        self.dimension = dimension
        self.index_path = index_path
        self.index = faiss.IndexFlatL2(dimension)
        self.doc_map: Dict[int, Dict] = {}  # Map FAISS ID to metadata
        self.current_id = 0  # Next FAISS row id to be assigned
        self._loaded = False

    def initialize(self):
        """Explicitly load the persisted index (no-op if already loaded)."""
        if not self._loaded:
            self.load()
            self._loaded = True

    def _ensure_loaded(self):
        """Lazily load persisted state before any read or write."""
        if not self._loaded:
            self.initialize()

    def add_vectors(self, vectors: List[List[float]],
                    metadatas: Optional[List[dict]] = None):
        """Append vectors to the index and record their metadata.

        Args:
            vectors: Sequence of equally sized vectors (list of lists or a
                2-D array).  Nothing happens if it is ``None`` or empty.
            metadatas: Either one dict per vector, a shorter list whose first
                element is applied to every vector, or a single dict applied
                to every vector.  If omitted, the vectors are indexed without
                metadata (and therefore never returned by ``search``).
        """
        self._ensure_loaded()
        # Explicit length check: ``not vectors`` is ambiguous for NumPy arrays.
        if vectors is None or len(vectors) == 0:
            return

        vectors_np = np.array(vectors).astype('float32')
        num_vectors = vectors_np.shape[0]

        # A single dict (one resume -> N chunks) is shared by every vector.
        if isinstance(metadatas, dict):
            metadatas = [metadatas]

        # IndexFlatL2 assigns sequential ids, so the first new row id is the
        # current row count.  Deriving it from the index (instead of trusting
        # a counter) keeps ids correct after the index is reloaded from disk.
        start_id = self.index.ntotal
        self.index.add(vectors_np)

        if metadatas:
            one_per_vector = len(metadatas) == num_vectors
            for offset in range(num_vectors):
                # Fewer metadata items than vectors: reuse the first one.
                meta = metadatas[offset] if one_per_vector else metadatas[0]
                self.doc_map[start_id + offset] = meta

        self.current_id = start_id + num_vectors
        self.save()  # Persist changes

    def get_all_resumes(self) -> List[Dict]:
        """Return one metadata dict per distinct ``resume_id`` in the store.

        The first metadata entry encountered for each ``resume_id`` is kept;
        entries without a truthy ``resume_id`` are skipped.
        """
        self._ensure_loaded()
        unique_resumes = {}
        for meta in self.doc_map.values():
            r_id = meta.get("resume_id")
            if r_id and r_id not in unique_resumes:
                unique_resumes[r_id] = meta
        return list(unique_resumes.values())

    def delete_by_resume_id(self, resume_id: str):
        """Soft-delete every vector belonging to ``resume_id``.

        Only the metadata entries are removed; the vectors stay in the FAISS
        index (``IndexFlatL2`` has no cheap row deletion) but are filtered out
        of search results and listings because they no longer have metadata.

        Returns:
            ``True`` if at least one entry was removed, ``False`` otherwise.
        """
        self._ensure_loaded()
        keys_to_remove = [k for k, v in self.doc_map.items()
                          if v.get("resume_id") == resume_id]
        for k in keys_to_remove:
            del self.doc_map[k]

        if keys_to_remove:
            self.save()  # Persist changes
        return len(keys_to_remove) > 0

    def search(self, query_vector: List[float], k: int = 5,
               filter_resume_id: Optional[str] = None):
        """Return up to ``k`` nearest stored vectors that still have metadata.

        Args:
            query_vector: The query embedding.
            k: Maximum number of results to return.
            filter_resume_id: If given, only hits whose metadata ``resume_id``
                equals this value are returned.

        Returns:
            A list of ``(distance, faiss_id, metadata)`` tuples in the order
            FAISS returned them (nearest first), at most ``k`` long.
        """
        self._ensure_loaded()
        total = self.index.ntotal
        if k <= 0 or total == 0:
            return []

        query_np = np.array([query_vector]).astype('float32')

        # Soft-deleted (or metadata-less) rows are still in the index, so we
        # must over-fetch.  Without a filter, fetching k plus the number of
        # rows lacking metadata is enough; with a filter the matching rows
        # may rank anywhere, so scan the whole index.  Never fetch fewer
        # candidates than the previous fixed ``k * 3`` heuristic.
        if filter_resume_id:
            fetch = total
        else:
            dead_rows = max(0, total - len(self.doc_map))
            fetch = min(total, max(k * 3, k + dead_rows))

        distances, indices = self.index.search(query_np, fetch)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            if idx == -1:  # FAISS pads missing results with -1
                continue
            # Skip rows whose metadata was deleted (or never provided).
            metadata = self.doc_map.get(int(idx))
            if not metadata:
                continue
            if filter_resume_id and metadata.get("resume_id") != filter_resume_id:
                continue
            results.append((dist, idx, metadata))
            if len(results) == k:
                break

        return results

    def save(self):
        """Write the FAISS index and the metadata map to disk."""
        faiss.write_index(self.index, self.index_path)
        with open(self.index_path + ".meta", "wb") as f:
            pickle.dump(self.doc_map, f)

    def load(self):
        """Load the FAISS index and metadata map from disk if they exist.

        Missing files are not an error: the corresponding in-memory structure
        is simply left as it is.
        """
        if os.path.exists(self.index_path):
            self.index = faiss.read_index(self.index_path)

        if os.path.exists(self.index_path + ".meta"):
            # SECURITY NOTE: pickle can execute arbitrary code on load; the
            # .meta file must only ever come from a trusted location.
            with open(self.index_path + ".meta", "rb") as f:
                self.doc_map = pickle.load(f)

        # Keep the id counter in step with the rows already in the index so
        # that vectors added after a reload do not reuse existing ids.
        self.current_id = self.index.ntotal