import faiss
import numpy as np


# -----------------------------
# BUILD FAISS INDEX
# -----------------------------
def build_faiss_index(embeddings: list):
    """
    📘 Builds a FAISS index for similarity search.

    Args:
        embeddings (list): Non-empty list of equal-length embedding vectors
            (lists of floats).

    Returns:
        faiss.IndexFlatL2: Index holding every vector from ``embeddings``,
        compared by Euclidean (L2) distance.

    Raises:
        ValueError: If ``embeddings`` is empty, is not a list, or does not
            convert to a 2-D ``(n_vectors, dim)`` array.
    """
    if not embeddings or not isinstance(embeddings, list):
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")

    # FAISS operates on float32 matrices.
    vectors = np.asarray(embeddings, dtype="float32")

    # A flat list of floats (1-D) or empty inner vectors would otherwise fail
    # later with an opaque IndexError / FAISS error; reject them up front.
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "❌ Invalid input: embeddings must be a list of non-empty, "
            "equal-length vectors."
        )

    dim = vectors.shape[1]  # Embedding dimension (e.g., 384 or 768)

    # Exact (brute-force) index using Euclidean (L2) distance.
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim}")
    return index


# -----------------------------
# SEARCH FAISS INDEX
# -----------------------------
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
    """
    🔍 Searches FAISS index for most similar chunks to query.

    Args:
        query_embedding (list): Embedding for user query (a single vector).
        index (faiss.IndexFlatL2): Pre-built FAISS index.
        chunks (list[str]): Original text chunks; position ``i`` is looked up
            for a hit on index row ``i``.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching text chunks in the order the index returned them.
        May hold fewer than ``top_k`` items when the index has fewer vectors
        or a returned row has no corresponding entry in ``chunks``.

    Raises:
        ValueError: If the index is ``None`` or empty, or ``top_k`` < 1.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError("❌ top_k must be a positive integer.")

    # Shape (1, dim): a batch containing the single query vector.
    query_vector = np.array([query_embedding]).astype("float32")

    # Asking for more neighbours than the index holds only yields padding.
    k = min(top_k, index.ntotal)
    _, indices = index.search(query_vector, k)

    # FAISS marks missing neighbours with label -1. The lower bound matters:
    # without it, -1 would silently select chunks[-1] (the last chunk).
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]


# -----------------------------
# OPTIONAL: DEBUG / DEMO
# -----------------------------
if __name__ == "__main__":
    # Example usage test
    sample_embeddings = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    query_vec = [0.15, 0.18, 0.35]

    idx = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)

    print("🔎 Top Results:")
    for r in results:
        print("-", r)