Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 16

Commit

a5c876d

verified ·

1 Parent(s): 6df27c5

Update src/vectorstore.py

Browse files

Files changed (1) hide show

src/vectorstore.py +35 -29

src/vectorstore.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import faiss
 import numpy as np
-# -----------------------------
-# BUILD FAISS INDEX (Optimized for Cosine Similarity)
-# -----------------------------
 def build_faiss_index(embeddings: list):
     """
-    📘 Builds a FAISS index optimized for cosine similarity (fast + accurate).
     Args:
         embeddings (list): List of embedding vectors (lists of floats).
@@ -16,14 +16,13 @@ def build_faiss_index(embeddings: list):
     if not embeddings or not isinstance(embeddings, list):
         raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
-    # Convert to numpy float32 array
-    vectors = np.array(embeddings).astype("float32")
-    dim = vectors.shape[1]
-    # ✅ Normalize embeddings (turns dot product into cosine similarity)
     faiss.normalize_L2(vectors)
-    # ✅ Use Inner Product index (fast cosine similarity)
     index = faiss.IndexFlatIP(dim)
     index.add(vectors)
@@ -31,46 +30,53 @@ def build_faiss_index(embeddings: list):
     return index
-# -----------------------------
-# SEARCH FAISS INDEX
-# -----------------------------
-def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
     """
-    🔍 Searches FAISS index for most similar chunks to query.
     Args:
-        query_embedding (list): Embedding for user query.
-        index (faiss.IndexFlatL2): Pre-built FAISS index.
-        chunks (list[str]): Original text chunks.
-        top_k (int): Number of most similar results to return.
     Returns:
-        list[str]: Top-matching text chunks.
     """
     if index is None or index.ntotal == 0:
         raise ValueError("❌ FAISS index is empty or not initialized.")
-    # Convert query embedding to correct format
-    query_vector = np.array([query_embedding]).astype("float32")
-    # Search the index (returns distances + indices)
     distances, indices = index.search(query_vector, top_k)
-    # Extract matched chunks with their distances (sorted)
     results = []
     for i, idx in enumerate(indices[0]):
-        if idx < len(chunks):
             results.append(chunks[idx])
     return results
-# -----------------------------
-# OPTIONAL: DEBUG / DEMO
-# -----------------------------
 if __name__ == "__main__":
-    # Example usage test
     sample_embeddings = [
         [0.1, 0.2, 0.3],
         [0.2, 0.1, 0.4],

 import faiss
 import numpy as np
+# ==========================================================
+# BUILD FAISS INDEX (Cosine Similarity Safe Version)
+# ==========================================================
 def build_faiss_index(embeddings: list):
     """
+    📘 Builds a FAISS index optimized for cosine similarity (float32-safe, dimension-aware).
     Args:
         embeddings (list): List of embedding vectors (lists of floats).
     if not embeddings or not isinstance(embeddings, list):
         raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
+    # ✅ Convert to numpy float32
+    vectors = np.array(embeddings, dtype="float32")
+    # ✅ Normalize (so cosine == inner product)
     faiss.normalize_L2(vectors)
+    dim = vectors.shape[1]
     index = faiss.IndexFlatIP(dim)
     index.add(vectors)
     return index
+# ==========================================================
+# SEARCH FAISS INDEX (with sanity checks)
+# ==========================================================
+def search_faiss(query_embedding: np.ndarray, index, chunks: list, top_k: int = 3):
     """
+    🔍 Searches FAISS index for the most semantically similar chunks to query.
     Args:
+        query_embedding (np.ndarray): Query embedding vector.
+        index (faiss.IndexFlatIP): Pre-built FAISS index.
+        chunks (list[str]): Text chunks used to build the index.
+        top_k (int): Number of results to return.
     Returns:
+        list[str]: Top matching text chunks.
     """
     if index is None or index.ntotal == 0:
         raise ValueError("❌ FAISS index is empty or not initialized.")
+    # ✅ Ensure query vector is correct dtype and shape
+    query_vector = np.array([query_embedding], dtype="float32")
+    faiss.normalize_L2(query_vector)
+    # ✅ Dimension check
+    if query_vector.shape[1] != index.d:
+        raise ValueError(
+            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
+            "Please rebuild the FAISS index with the current embedding model."
+        )
+    # Search
     distances, indices = index.search(query_vector, top_k)
+    # ✅ Return sorted top-k chunks
     results = []
     for i, idx in enumerate(indices[0]):
+        if 0 <= idx < len(chunks):
             results.append(chunks[idx])
+    print(f"🔍 FAISS search completed — retrieved {len(results)} chunks (top_k={top_k})")
     return results
+# ==========================================================
+# DEBUG / LOCAL TEST
+# ==========================================================
 if __name__ == "__main__":
     sample_embeddings = [
         [0.1, 0.2, 0.3],
         [0.2, 0.1, 0.4],