import faiss
import numpy as np


# -----------------------------
# BUILD FAISS INDEX (Optimized for Cosine Similarity)
# -----------------------------
def build_faiss_index(embeddings: list):
    """
    Build a FAISS inner-product index over L2-normalized embeddings.

    Normalizing every vector to unit length makes the inner product computed
    by ``IndexFlatIP`` equal to the cosine similarity.

    Args:
        embeddings (list): Non-empty collection of equal-length embedding
            vectors (list of lists of floats; tuples and 2-D numpy arrays
            are accepted as well). The input is copied, never modified.

    Returns:
        faiss.IndexFlatIP: Index containing one normalized vector per
        embedding, in input order.

    Raises:
        ValueError: If ``embeddings`` is empty, or cannot be interpreted as a
            2-D array of floats (flat or ragged input, non-numeric values).
    """
    try:
        # np.array always copies, so the in-place normalization below never
        # touches the caller's data; FAISS needs C-contiguous float32 rows.
        vectors = np.array(embeddings, dtype="float32", order="C")
    except (TypeError, ValueError) as err:
        raise ValueError(
            "❌ Invalid input: embeddings must be equal-length vectors of numbers."
        ) from err

    # ndim == 0 covers scalars and None; size == 0 covers [] and [[]].
    if vectors.ndim == 0 or vectors.size == 0:
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
    if vectors.ndim != 2:
        raise ValueError(
            "❌ Invalid input: embeddings must be a 2-D collection "
            "(one vector per row)."
        )

    dim = vectors.shape[1]

    # Normalize in place (turns dot product into cosine similarity).
    faiss.normalize_L2(vectors)

    # Exact (brute-force) inner-product index.
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim} (cosine similarity)")
    return index


# -----------------------------
# SEARCH FAISS INDEX
# -----------------------------
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3) -> list:
    """
    Return the text chunks whose indexed vectors best match the query.

    Args:
        query_embedding (list): Embedding vector for the user query.
        index: Pre-built FAISS index, e.g. the ``faiss.IndexFlatIP`` returned
            by ``build_faiss_index``.
        chunks (list[str]): Original text chunks; ``chunks[i]`` must
            correspond to the i-th vector added to ``index``.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching chunks, best match first. Fewer than ``top_k``
        items are returned when the index holds fewer vectors, or when a hit
        has no corresponding entry in ``chunks``.

    Raises:
        ValueError: If ``index`` is ``None`` or contains no vectors.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")

    # FAISS expects a (n_queries, dim) float32 matrix.
    # The query is intentionally not normalized: scaling it by a positive
    # constant scales every inner product equally, so the ranking of the
    # returned chunks is unchanged.
    query_vector = np.array([query_embedding], dtype="float32")

    # search() returns (scores, labels); only the labels are needed here.
    _, indices = index.search(query_vector, top_k)

    # FAISS pads missing results with label -1 (e.g. top_k > index.ntotal).
    # Without the lower bound, -1 would silently select chunks[-1].
    n_chunks = len(chunks)
    return [chunks[int(idx)] for idx in indices[0] if 0 <= idx < n_chunks]


# -----------------------------
# OPTIONAL: DEBUG / DEMO
# -----------------------------
if __name__ == "__main__":
    # Example usage test
    sample_embeddings = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    query_vec = [0.15, 0.18, 0.35]

    demo_index = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, demo_index, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)

    print("🔎 Top Results:")
    for r in results:
        print("-", r)