Shubham170793 committed on
Commit
a5c876d
·
verified ·
1 Parent(s): 6df27c5

Update src/vectorstore.py

Browse files
Files changed (1) hide show
  1. src/vectorstore.py +35 -29
src/vectorstore.py CHANGED
@@ -1,12 +1,12 @@
1
  import faiss
2
  import numpy as np
3
 
4
- # -----------------------------
5
- # BUILD FAISS INDEX (Optimized for Cosine Similarity)
6
- # -----------------------------
7
  def build_faiss_index(embeddings: list):
8
  """
9
- 📘 Builds a FAISS index optimized for cosine similarity (fast + accurate).
10
 
11
  Args:
12
  embeddings (list): List of embedding vectors (lists of floats).
@@ -16,14 +16,13 @@ def build_faiss_index(embeddings: list):
16
  if not embeddings or not isinstance(embeddings, list):
17
  raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
18
 
19
- # Convert to numpy float32 array
20
- vectors = np.array(embeddings).astype("float32")
21
- dim = vectors.shape[1]
22
 
23
- # ✅ Normalize embeddings (turns dot product into cosine similarity)
24
  faiss.normalize_L2(vectors)
25
 
26
- # ✅ Use Inner Product index (fast cosine similarity)
27
  index = faiss.IndexFlatIP(dim)
28
  index.add(vectors)
29
 
@@ -31,46 +30,53 @@ def build_faiss_index(embeddings: list):
31
  return index
32
 
33
 
34
-
35
- # -----------------------------
36
- # SEARCH FAISS INDEX
37
- # -----------------------------
38
- def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
39
  """
40
- πŸ” Searches FAISS index for most similar chunks to query.
41
 
42
  Args:
43
- query_embedding (list): Embedding for user query.
44
- index (faiss.IndexFlatL2): Pre-built FAISS index.
45
- chunks (list[str]): Original text chunks.
46
- top_k (int): Number of most similar results to return.
47
 
48
  Returns:
49
- list[str]: Top-matching text chunks.
50
  """
51
  if index is None or index.ntotal == 0:
52
  raise ValueError("❌ FAISS index is empty or not initialized.")
53
 
54
- # Convert query embedding to correct format
55
- query_vector = np.array([query_embedding]).astype("float32")
 
 
 
 
 
 
 
 
56
 
57
- # Search the index (returns distances + indices)
58
  distances, indices = index.search(query_vector, top_k)
59
 
60
- # Extract matched chunks with their distances (sorted)
61
  results = []
62
  for i, idx in enumerate(indices[0]):
63
- if idx < len(chunks):
64
  results.append(chunks[idx])
65
 
 
66
  return results
67
 
68
 
69
- # -----------------------------
70
- # OPTIONAL: DEBUG / DEMO
71
- # -----------------------------
72
  if __name__ == "__main__":
73
- # Example usage test
74
  sample_embeddings = [
75
  [0.1, 0.2, 0.3],
76
  [0.2, 0.1, 0.4],
 
1
  import faiss
2
  import numpy as np
3
 
4
+ # ==========================================================
5
+ # BUILD FAISS INDEX (Cosine Similarity Safe Version)
6
+ # ==========================================================
7
  def build_faiss_index(embeddings: list):
8
  """
9
+ 📘 Builds a FAISS index optimized for cosine similarity (float32-safe, dimension-aware).
10
 
11
  Args:
12
  embeddings (list): List of embedding vectors (lists of floats).
 
16
  if not embeddings or not isinstance(embeddings, list):
17
  raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
18
 
19
+ # ✅ Convert to numpy float32
20
+ vectors = np.array(embeddings, dtype="float32")
 
21
 
22
+ # ✅ Normalize (so cosine == inner product)
23
  faiss.normalize_L2(vectors)
24
 
25
+ dim = vectors.shape[1]
26
  index = faiss.IndexFlatIP(dim)
27
  index.add(vectors)
28
 
 
30
  return index
31
 
32
 
33
+ # ==========================================================
34
+ # SEARCH FAISS INDEX (with sanity checks)
35
+ # ==========================================================
36
+ def search_faiss(query_embedding: np.ndarray, index, chunks: list, top_k: int = 3):
 
37
  """
38
+ πŸ” Searches FAISS index for the most semantically similar chunks to query.
39
 
40
  Args:
41
+ query_embedding (np.ndarray): Query embedding vector.
42
+ index (faiss.IndexFlatIP): Pre-built FAISS index.
43
+ chunks (list[str]): Text chunks used to build the index.
44
+ top_k (int): Number of results to return.
45
 
46
  Returns:
47
+ list[str]: Top matching text chunks.
48
  """
49
  if index is None or index.ntotal == 0:
50
  raise ValueError("❌ FAISS index is empty or not initialized.")
51
 
52
+ # ✅ Ensure query vector is correct dtype and shape
53
+ query_vector = np.array([query_embedding], dtype="float32")
54
+ faiss.normalize_L2(query_vector)
55
+
56
+ # ✅ Dimension check
57
+ if query_vector.shape[1] != index.d:
58
+ raise ValueError(
59
+ f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
60
+ "Please rebuild the FAISS index with the current embedding model."
61
+ )
62
 
63
+ # Search
64
  distances, indices = index.search(query_vector, top_k)
65
 
66
+ # ✅ Return sorted top-k chunks
67
  results = []
68
  for i, idx in enumerate(indices[0]):
69
+ if 0 <= idx < len(chunks):
70
  results.append(chunks[idx])
71
 
72
+ print(f"🔍 FAISS search completed — retrieved {len(results)} chunks (top_k={top_k})")
73
  return results
74
 
75
 
76
+ # ==========================================================
77
+ # DEBUG / LOCAL TEST
78
+ # ==========================================================
79
  if __name__ == "__main__":
 
80
  sample_embeddings = [
81
  [0.1, 0.2, 0.3],
82
  [0.2, 0.1, 0.4],