# Page header captured with this file: author "Shubham170793",
# commit 8e133cf (verified) - "Update src/vectorstore.py".
import faiss
import numpy as np
import os
# ==========================================================
# BUILD FAISS INDEX (Cosine Similarity Safe Version)
# ==========================================================
def build_faiss_index(embeddings):
    """
    📘 Build a FAISS index optimized for cosine similarity.

    The input is copied into a C-contiguous float32 matrix, each row is
    L2-normalized in place on that copy, and the rows are added to an
    inner-product index. On unit-length vectors the inner product equals
    cosine similarity.

    Args:
        embeddings (list | np.ndarray): Embedding vectors shaped (n, dim).
            A single flat vector (1-D array or flat list) is treated as
            one row.

    Returns:
        faiss.IndexFlatIP: Index containing the normalized vectors.

    Raises:
        ValueError: If ``embeddings`` is None, empty, or not 1-D/2-D.
        TypeError: If ``embeddings`` is neither a list nor an ndarray.
    """
    # 🧩 Validation
    if embeddings is None:
        raise ValueError("❌ No embeddings provided to build FAISS index.")
    if not isinstance(embeddings, (np.ndarray, list)):
        raise TypeError(f"❌ Unexpected embeddings type: {type(embeddings)}")

    # ✅ One conversion path for lists and arrays. np.array copies, so the
    # in-place normalization below never touches the caller's data, and
    # order="C" gives normalize_L2 the contiguous buffer it works on even
    # when the input was transposed or Fortran-ordered.
    vectors = np.array(embeddings, dtype="float32", order="C")

    # ✅ Ensure there are embeddings to index
    if vectors.size == 0:
        raise ValueError("❌ Empty embeddings array provided.")

    # ✅ A lone flat vector becomes a single row; anything that is still
    # not a matrix is rejected here instead of failing inside FAISS.
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)
    elif vectors.ndim != 2:
        raise ValueError(
            f"❌ Embeddings must be 1-D or 2-D, got {vectors.ndim} dimensions."
        )

    # ✅ Normalize for cosine similarity
    faiss.normalize_L2(vectors)

    # ✅ Build FAISS index (Inner Product = Cosine Similarity)
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built successfully — {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index
# ==========================================================
# SEARCH FAISS INDEX (with sanity checks)
# ==========================================================
def search_faiss(query_embedding, index, chunks, top_k=3):
    """
    🔍 Search the FAISS index for the chunks most similar to a query.

    Args:
        query_embedding (np.ndarray | list): Query vector, 1-D or 2-D.
            For a 2-D input only the first row's matches are returned.
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks; the id FAISS returns
            is used directly as a position in this list.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching chunks, in the order FAISS returned them.
        Ids that fall outside ``chunks`` are skipped.

    Raises:
        ValueError: If the index is None or empty, ``top_k`` is below 1,
            the query is empty or not 1-D/2-D, or the query dimension
            differs from the index dimension.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError(f"❌ top_k must be a positive integer, got {top_k}.")

    # ✅ Convert query to a fresh C-contiguous float32 array. Accepts plain
    # lists as well as arrays, and the copy keeps the in-place
    # normalization away from the caller's data.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.size == 0:
        raise ValueError("❌ Empty query embedding provided.")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    if query_vector.ndim != 2:
        raise ValueError(
            f"❌ Query embedding must be 1-D or 2-D, got {query_vector.ndim} dimensions."
        )

    # ✅ Dimension check (before any FAISS call, so a mismatch produces
    # this message rather than a low-level FAISS error)
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    # ✅ Normalize for cosine similarity
    faiss.normalize_L2(query_vector)

    # 🔍 Run search. Asking for more neighbours than the index holds only
    # yields placeholder ids that the range filter would drop anyway.
    _, indices = index.search(query_vector, min(top_k, index.ntotal))
    results = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    print(f"🔍 FAISS search completed — retrieved {len(results)} chunks (top_k={top_k})")
    return results
# ==========================================================
# SAVE / LOAD INDEX (Optional Utility)
# ==========================================================
def save_faiss_index(index, path="faiss_index.bin"):
    """
    💾 Save FAISS index to disk.

    Args:
        index: FAISS index to serialize.
        path (str | os.PathLike): Destination file. Missing parent
            directories are created first.
    """
    # faiss.write_index is handed a plain path string, so normalise
    # pathlib-style arguments up front.
    path = os.fspath(path)

    # Create the target directory if needed so the write does not fail
    # just because the folder is not there yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    faiss.write_index(index, path)
    print(f"💾 FAISS index saved to {path}")
def load_faiss_index(path="faiss_index.bin"):
    """
    📂 Load FAISS index from disk.

    Args:
        path (str | os.PathLike): File previously written by
            ``faiss.write_index``.

    Returns:
        The index object returned by ``faiss.read_index``.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    path = os.fspath(path)

    # isfile rather than exists: a directory at this location is not a
    # loadable index and should be reported the same way as a missing file.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"❌ No FAISS index found at {path}")

    index = faiss.read_index(path)
    print(f"📂 FAISS index loaded from {path}")
    return index
# ==========================================================
# DEBUG / LOCAL TEST
# ==========================================================
if __name__ == "__main__":
    # Local smoke test: index three toy 3-dimensional vectors, then look up
    # the two chunks closest to a query vector.
    sample_embeddings = np.array([
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ], dtype="float32")
    query_vec = np.array([0.15, 0.18, 0.35], dtype="float32")

    # ✅ Build and search
    idx = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)

    print("🔎 Top Results:")
    for r in results:
        print("-", r)