Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 5

Commit

4da661f

verified ·

1 Parent(s): 6b0c8b8

Update src/vectorstore.py

Browse files

Files changed (1) hide show

src/vectorstore.py +59 -22

src/vectorstore.py CHANGED Viewed

@@ -1,47 +1,84 @@
-import faiss            # 📌 Library: specialized for vector search
-import numpy as np      # 📌 Library: handles numeric arrays efficiently
 def build_faiss_index(embeddings: list):
     """
-    📌 Build a FAISS index from embeddings.
     Args:
         embeddings (list): List of embedding vectors (lists of floats).
     Returns:
-        index (faiss.IndexFlatL2): FAISS index for similarity search.
     """
-    # Step 1: Get vector size (e.g., 384 for Hugging Face embeddings)
-    dimension = len(embeddings[0])
-    # Step 2: Create FAISS index that uses Euclidean (L2) distance
-    index = faiss.IndexFlatL2(dimension)
-    # Step 3: Add all embeddings to the index (convert to float32 array first)
-    index.add(np.array(embeddings).astype('float32'))
     return index
 def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
     """
-    📌 Search FAISS index to find most similar chunks.
     Args:
         query_embedding (list): Embedding for user query.
-        index: FAISS index object.
-        chunks (list): Original text chunks.
-        top_k (int): Number of results to return (default = 3).
     Returns:
-        list: Top matching text chunks.
     """
-    # Step 1: Turn query embedding into shape (1, dim) numpy array
-    query_vector = np.array([query_embedding]).astype('float32')
-    # Step 2: Search FAISS for nearest neighbors
     distances, indices = index.search(query_vector, top_k)
-    # Step 3: Map result indices back to actual text chunks
-    return [chunks[i] for i in indices[0]]

+import faiss
+import numpy as np
+# -----------------------------
+# BUILD FAISS INDEX
+# -----------------------------
 def build_faiss_index(embeddings: list):
     """
+    📘 Builds a FAISS index for similarity search.
     Args:
         embeddings (list): List of embedding vectors (lists of floats).
     Returns:
+        faiss.IndexFlatL2: FAISS index for vector similarity.
     """
+    if not embeddings or not isinstance(embeddings, list):
+        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
+    # Convert to numpy float32 array
+    vectors = np.array(embeddings).astype("float32")
+    dim = vectors.shape[1]  # Get embedding dimension (e.g., 384 or 768)
+    # Create index using Euclidean (L2) distance
+    index = faiss.IndexFlatL2(dim)
+    # Add vectors to index
+    index.add(vectors)
+    print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim}")
     return index
+# -----------------------------
+# SEARCH FAISS INDEX
+# -----------------------------
 def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
     """
+    🔍 Searches FAISS index for most similar chunks to query.
     Args:
         query_embedding (list): Embedding for user query.
+        index (faiss.IndexFlatL2): Pre-built FAISS index.
+        chunks (list[str]): Original text chunks.
+        top_k (int): Number of most similar results to return.
     Returns:
+        list[str]: Top-matching text chunks.
     """
+    if index is None or index.ntotal == 0:
+        raise ValueError("❌ FAISS index is empty or not initialized.")
+    # Convert query embedding to correct format
+    query_vector = np.array([query_embedding]).astype("float32")
+    # Search the index (returns distances + indices)
     distances, indices = index.search(query_vector, top_k)
+    # Extract matched chunks with their distances (sorted)
+    results = []
+    for i, idx in enumerate(indices[0]):
+        if idx < len(chunks):
+            results.append(chunks[idx])
+    return results
+# -----------------------------
+# OPTIONAL: DEBUG / DEMO
+# -----------------------------
if __name__ == "__main__":
    # Smoke test: index three toy vectors, then fetch the two chunks
    # closest to a query vector.
    demo_vectors = [[0.1, 0.2, 0.3], [0.2, 0.1, 0.4], [0.9, 0.8, 0.7]]
    demo_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    demo_query = [0.15, 0.18, 0.35]

    demo_index = build_faiss_index(demo_vectors)
    top_matches = search_faiss(demo_query, demo_index, demo_chunks, top_k=2)

    print("🔎 Top Results:")
    for match in top_matches:
        print("-", match)