import os

import faiss
import numpy as np


# ==========================================================
# BUILD FAISS INDEX (Cosine Similarity Safe Version)
# ==========================================================
def build_faiss_index(embeddings):
    """
    📘 Builds a FAISS index optimized for cosine similarity
    (float32-safe, dimension-aware).

    Args:
        embeddings (list | np.ndarray): List or array of embedding vectors.
            A single 1-D vector (list or ndarray) is accepted and treated
            as a batch of one.

    Returns:
        faiss.IndexFlatIP: FAISS index for cosine similarity search.

    Raises:
        ValueError: If embeddings is None or empty.
        TypeError: If embeddings is neither a list nor an ndarray.
    """
    # 🧩 Validation
    if embeddings is None:
        raise ValueError("❌ No embeddings provided to build FAISS index.")

    # ✅ Convert to a float32 ndarray (FAISS requires float32)
    if isinstance(embeddings, np.ndarray):
        vectors = embeddings.astype("float32")
    elif isinstance(embeddings, list):
        vectors = np.array(embeddings, dtype="float32")
    else:
        raise TypeError(f"❌ Unexpected embeddings type: {type(embeddings)}")

    # ✅ Promote a single flat vector to a (1, d) batch.
    # Fix: the original only reshaped ndarray input, so a flat list
    # (e.g. [0.1, 0.2, 0.3]) crashed below on vectors.shape[1].
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    # ✅ Ensure there are embeddings to index
    if vectors.size == 0:
        raise ValueError("❌ Empty embeddings array provided.")

    # ✅ Normalize in place so inner product == cosine similarity
    faiss.normalize_L2(vectors)

    # ✅ Build FAISS index (Inner Product = Cosine Similarity)
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built successfully — {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index


# ==========================================================
# SEARCH FAISS INDEX (with sanity checks)
# ==========================================================
def search_faiss(query_embedding, index, chunks, top_k=3):
    """
    🔍 Searches FAISS index for semantically similar chunks.

    Args:
        query_embedding (list | np.ndarray): Query vector (1D or 2D).
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks.
        top_k (int): Number of results to return.

    Returns:
        list[str]: Top-matching chunks (may be fewer than top_k).

    Raises:
        ValueError: If the index is empty/uninitialized, or the query
            dimension does not match the index dimension.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")

    # ✅ Convert query to float32 and normalize.
    # Fix: the original called query_embedding.ndim before conversion,
    # which raised AttributeError for plain-list queries. np.array always
    # copies, so normalize_L2 never mutates the caller's array.
    query_vector = np.array(query_embedding, dtype="float32")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    faiss.normalize_L2(query_vector)

    # ✅ Dimension check
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    # 🔍 Run search
    distances, indices = index.search(query_vector, top_k)

    # FAISS pads with -1 when fewer than top_k vectors exist;
    # the bounds check filters those out.
    results = []
    for idx in indices[0]:
        if 0 <= idx < len(chunks):
            results.append(chunks[idx])

    print(f"🔍 FAISS search completed — retrieved {len(results)} chunks (top_k={top_k})")
    return results


# ==========================================================
# SAVE / LOAD INDEX (Optional Utility)
# ==========================================================
def save_faiss_index(index, path="faiss_index.bin"):
    """💾 Save FAISS index to disk."""
    faiss.write_index(index, path)
    print(f"💾 FAISS index saved to {path}")


def load_faiss_index(path="faiss_index.bin"):
    """📂 Load FAISS index from disk.

    Raises:
        FileNotFoundError: If no index file exists at *path*.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ No FAISS index found at {path}")
    index = faiss.read_index(path)
    print(f"📂 FAISS index loaded from {path}")
    return index


# ==========================================================
# DEBUG / LOCAL TEST
# ==========================================================
if __name__ == "__main__":
    # Example embeddings (3 vectors, dim=3)
    sample_embeddings = np.array([
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7]
    ], dtype="float32")

    query_vec = np.array([0.15, 0.18, 0.35], dtype="float32")

    # ✅ Build and search
    idx = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)

    print("🔎 Top Results:")
    for r in results:
        print("-", r)