Shubham170793 committed on
Commit
8e133cf
·
verified ·
1 Parent(s): 585fec8

Update src/vectorstore.py

Browse files
Files changed (1) hide show
  1. src/vectorstore.py +60 -26
src/vectorstore.py CHANGED
@@ -1,71 +1,84 @@
1
  import faiss
2
  import numpy as np
 
3
 
4
  # ==========================================================
5
  # BUILD FAISS INDEX (Cosine Similarity Safe Version)
6
  # ==========================================================
7
def build_faiss_index(embeddings: list):
    """
    📘 Builds a FAISS index optimized for cosine similarity (float32-safe, dimension-aware).

    The input is copied into a C-contiguous float32 matrix, L2-normalized in
    place on that copy, and added to an inner-product index. On unit-length
    vectors the inner product equals cosine similarity.

    Args:
        embeddings (list): List of embedding vectors (lists of floats). A NumPy
            array is accepted as well, and a single flat vector is treated as a
            batch of one. The caller's data is never modified.
    Returns:
        faiss.IndexFlatIP: FAISS index for cosine similarity search.
    Raises:
        ValueError: If embeddings is missing, empty, of an unsupported type,
            ragged / non-numeric, or not 1-D / 2-D.
    """
    if not isinstance(embeddings, (list, np.ndarray)):
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")

    # np.array always copies here, so normalize_L2 (which works in place)
    # cannot touch the caller's data; order="C" gives FAISS the layout it needs.
    try:
        vectors = np.array(embeddings, dtype="float32", order="C")
    except (TypeError, ValueError) as err:
        raise ValueError(
            "❌ Invalid input: embeddings must be numeric vectors of equal length."
        ) from err

    if vectors.size == 0:
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")

    if vectors.ndim == 1:
        # A single flat vector: index it as one row instead of failing on shape[1].
        vectors = vectors.reshape(1, -1)
    elif vectors.ndim != 2:
        raise ValueError(
            f"❌ Invalid input: expected 1-D or 2-D embeddings, got {vectors.ndim}-D."
        )

    # ✅ Normalize (so cosine == inner product)
    faiss.normalize_L2(vectors)

    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim} (cosine similarity)")
    return index
31
 
32
 
33
  # ==========================================================
34
  # SEARCH FAISS INDEX (with sanity checks)
35
  # ==========================================================
36
def search_faiss(query_embedding: np.ndarray, index, chunks: list, top_k: int = 3):
    """
    🔍 Searches FAISS index for the most semantically similar chunks to query.

    Args:
        query_embedding (np.ndarray): Query embedding vector. A plain list is
            accepted too; the vector may be 1-D or a 2-D batch, in which case
            only the first row is searched.
        index (faiss.IndexFlatIP): Pre-built FAISS index.
        chunks (list[str]): Text chunks used to build the index.
        top_k (int): Number of results to return.

    Returns:
        list[str]: Top matching text chunks, best match first. May contain
        fewer than top_k items when the index or chunk list is smaller.

    Raises:
        ValueError: If the index is empty/uninitialized, top_k is not positive,
            the query is not 1-D/2-D, or its dimension differs from the index.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError("❌ top_k must be a positive integer.")

    # ✅ Ensure query vector is correct dtype and shape.
    # np.array copies, so the in-place normalization below never mutates the
    # caller's vector; order="C" is the layout FAISS expects.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    elif query_vector.ndim != 2:
        raise ValueError(
            f"❌ Invalid query embedding: expected 1-D or 2-D, got {query_vector.ndim}-D."
        )

    # ✅ Dimension check (done before touching FAISS so the error is explicit)
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Please rebuild the FAISS index with the current embedding model."
        )

    faiss.normalize_L2(query_vector)

    # Asking for more neighbours than stored vectors only yields -1 padding.
    k = min(int(top_k), index.ntotal)
    _, indices = index.search(query_vector, k)

    # ✅ Return sorted top-k chunks; skip padding (-1) and ids with no chunk.
    results = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

    # NOTE(review): the diff this function was recovered from elides one
    # unchanged line just before the return - confirm against the repository
    # that nothing (e.g. a log statement) was dropped here.
    return results
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # ==========================================================
77
  # DEBUG / LOCAL TEST
78
  # ==========================================================
79
  if __name__ == "__main__":
80
- sample_embeddings = [
 
81
  [0.1, 0.2, 0.3],
82
  [0.2, 0.1, 0.4],
83
  [0.9, 0.8, 0.7]
84
- ]
85
- query_vec = [0.15, 0.18, 0.35]
 
86
 
 
87
  idx = build_faiss_index(sample_embeddings)
88
  results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)
89
 
 
1
  import faiss
2
  import numpy as np
3
+ import os
4
 
5
  # ==========================================================
6
  # BUILD FAISS INDEX (Cosine Similarity Safe Version)
7
  # ==========================================================
8
def build_faiss_index(embeddings):
    """
    📘 Builds a FAISS index optimized for cosine similarity (float32-safe, dimension-aware).

    The input is copied into a C-contiguous float32 matrix, L2-normalized in
    place on that copy, and added to an inner-product index. On unit-length
    vectors the inner product equals cosine similarity.

    Args:
        embeddings (list | np.ndarray): List or array of embedding vectors.
            A single 1-D vector is treated as a batch of one. The caller's
            data is never modified.
    Returns:
        faiss.IndexFlatIP: FAISS index for cosine similarity search.
    Raises:
        ValueError: If embeddings is None, empty, ragged / non-numeric, or
            not 1-D / 2-D.
        TypeError: If embeddings is neither a list nor a NumPy array.
    """
    # 🧩 Validation
    if embeddings is None:
        raise ValueError("❌ No embeddings provided to build FAISS index.")
    if not isinstance(embeddings, (list, np.ndarray)):
        raise TypeError(f"❌ Unexpected embeddings type: {type(embeddings)}")

    # ✅ One conversion path for lists and arrays. np.array always copies, so
    # the in-place normalization cannot touch the caller's data, and order="C"
    # guarantees the contiguous layout FAISS requires (astype would preserve a
    # transposed or sliced layout).
    try:
        vectors = np.array(embeddings, dtype="float32", order="C")
    except (TypeError, ValueError) as err:
        raise ValueError(
            "❌ Embeddings must be numeric vectors of equal length."
        ) from err

    # ✅ Ensure there are embeddings to index
    if vectors.size == 0:
        raise ValueError("❌ Empty embeddings array provided.")

    if vectors.ndim == 1:
        # Single vector (list or array): index it as one row.
        vectors = vectors.reshape(1, -1)
    elif vectors.ndim != 2:
        raise ValueError(
            f"❌ Expected 1-D or 2-D embeddings, got {vectors.ndim}-D."
        )

    # ✅ Normalize for cosine similarity
    faiss.normalize_L2(vectors)

    # ✅ Build FAISS index (Inner Product = Cosine Similarity)
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built successfully — {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index
46
 
47
 
48
  # ==========================================================
49
  # SEARCH FAISS INDEX (with sanity checks)
50
  # ==========================================================
51
def search_faiss(query_embedding, index, chunks, top_k=3):
    """
    🔍 Searches FAISS index for semantically similar chunks.

    Args:
        query_embedding (np.ndarray | list): Query vector (1D or 2D). For a
            2D batch only the first row is searched.
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks.
        top_k (int): Number of results to return.

    Returns:
        list[str]: Top-matching chunks, best match first. May contain fewer
        than top_k items when the index or chunk list is smaller.

    Raises:
        ValueError: If the index is empty/uninitialized, top_k is not positive,
            the query is not 1D/2D, or its dimension differs from the index.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError("❌ top_k must be a positive integer.")

    # ✅ Convert query to a float32, C-contiguous copy. Going through np.array
    # also accepts plain lists (which have no .ndim) and keeps the in-place
    # normalization below from mutating the caller's vector.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    elif query_vector.ndim != 2:
        raise ValueError(
            f"❌ Invalid query embedding: expected 1D or 2D, got {query_vector.ndim}D."
        )

    # ✅ Dimension check (done before touching FAISS so the error is explicit)
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    faiss.normalize_L2(query_vector)

    # 🔍 Run search. Asking for more neighbours than stored vectors only
    # yields -1 padding, so cap k at the index size.
    k = min(int(top_k), index.ntotal)
    _, indices = index.search(query_vector, k)

    # Skip padding (-1) and ids that have no corresponding chunk.
    results = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

    # NOTE(review): the diff this function was recovered from elides one
    # unchanged line just before the return - confirm against the repository
    # that nothing (e.g. a log statement) was dropped here.
    return results
87
 
88
 
89
+ # ==========================================================
90
+ # SAVE / LOAD INDEX (Optional Utility)
91
+ # ==========================================================
92
def save_faiss_index(index, path="faiss_index.bin"):
    """
    💾 Save FAISS index to disk.

    Args:
        index: FAISS index to serialize.
        path (str | os.PathLike): Destination file. Missing parent
            directories are created.

    Raises:
        ValueError: If index is None.
    """
    if index is None:
        raise ValueError("❌ Cannot save: FAISS index is not initialized.")

    # faiss.write_index needs a plain string, so unwrap pathlib.Path objects.
    path = os.fspath(path)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    faiss.write_index(index, path)
    print(f"💾 FAISS index saved to {path}")
96
+
97
+
98
def load_faiss_index(path="faiss_index.bin"):
    """
    📂 Load FAISS index from disk.

    Args:
        path (str | os.PathLike): File previously written with
            faiss.write_index.

    Returns:
        The deserialized FAISS index.

    Raises:
        FileNotFoundError: If path does not point to an existing regular file
            (a directory at that path is rejected as well).
    """
    # faiss.read_index needs a plain string, so unwrap pathlib.Path objects.
    path = os.fspath(path)

    # isfile rather than exists: a directory would pass an existence check
    # and then fail inside FAISS with a much less helpful error.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"❌ No FAISS index found at {path}")

    index = faiss.read_index(path)
    print(f"📂 FAISS index loaded from {path}")
    return index
105
+
106
+
107
  # ==========================================================
108
  # DEBUG / LOCAL TEST
109
  # ==========================================================
110
  if __name__ == "__main__":
111
+ # Example embeddings (3 vectors, dim=3)
112
+ sample_embeddings = np.array([
113
  [0.1, 0.2, 0.3],
114
  [0.2, 0.1, 0.4],
115
  [0.9, 0.8, 0.7]
116
+ ], dtype="float32")
117
+
118
+ query_vec = np.array([0.15, 0.18, 0.35], dtype="float32")
119
 
120
+ # βœ… Build and search
121
  idx = build_faiss_index(sample_embeddings)
122
  results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)
123