Shubham170793 committed on
Commit
4da661f
·
verified ·
1 Parent(s): 6b0c8b8

Update src/vectorstore.py

Browse files
Files changed (1) hide show
  1. src/vectorstore.py +59 -22
src/vectorstore.py CHANGED
@@ -1,47 +1,84 @@
1
- import faiss # 📌 Library: specialized for vector search
2
- import numpy as np # 📌 Library: handles numeric arrays efficiently
3
-
4
 
 
 
 
5
  def build_faiss_index(embeddings: list):
6
  """
7
- 📌 Build a FAISS index from embeddings.
8
 
9
  Args:
10
  embeddings (list): List of embedding vectors (lists of floats).
11
-
12
  Returns:
13
- index (faiss.IndexFlatL2): FAISS index for similarity search.
14
  """
15
- # Step 1: Get vector size (e.g., 384 for Hugging Face embeddings)
16
- dimension = len(embeddings[0])
 
 
 
 
17
 
18
- # Step 2: Create FAISS index that uses Euclidean (L2) distance
19
- index = faiss.IndexFlatL2(dimension)
20
 
21
- # Step 3: Add all embeddings to the index (convert to float32 array first)
22
- index.add(np.array(embeddings).astype('float32'))
 
23
 
24
  return index
25
 
26
 
 
 
 
27
  def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
28
  """
29
- 📌 Search FAISS index to find most similar chunks.
30
 
31
  Args:
32
  query_embedding (list): Embedding for user query.
33
- index: FAISS index object.
34
- chunks (list): Original text chunks.
35
- top_k (int): Number of results to return (default = 3).
36
 
37
  Returns:
38
- list: Top matching text chunks.
39
  """
40
- # Step 1: Turn query embedding into shape (1, dim) numpy array
41
- query_vector = np.array([query_embedding]).astype('float32')
42
 
43
- # Step 2: Search FAISS for nearest neighbors
 
 
 
44
  distances, indices = index.search(query_vector, top_k)
45
 
46
- # Step 3: Map result indices back to actual text chunks
47
- return [chunks[i] for i in indices[0]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
 
3
 
4
+ # -----------------------------
5
+ # BUILD FAISS INDEX
6
+ # -----------------------------
7
  def build_faiss_index(embeddings: list):
8
  """
9
+ 📘 Builds a FAISS index for similarity search.
10
 
11
  Args:
12
  embeddings (list): List of embedding vectors (lists of floats).
 
13
  Returns:
14
+ faiss.IndexFlatL2: FAISS index for vector similarity.
15
  """
16
+ if not embeddings or not isinstance(embeddings, list):
17
+ raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")
18
+
19
+ # Convert to numpy float32 array
20
+ vectors = np.array(embeddings).astype("float32")
21
+ dim = vectors.shape[1] # Get embedding dimension (e.g., 384 or 768)
22
 
23
+ # Create index using Euclidean (L2) distance
24
+ index = faiss.IndexFlatL2(dim)
25
 
26
+ # Add vectors to index
27
+ index.add(vectors)
28
+ print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim}")
29
 
30
  return index
31
 
32
 
33
+ # -----------------------------
34
+ # SEARCH FAISS INDEX
35
+ # -----------------------------
36
  def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
37
  """
38
+ 🔍 Searches FAISS index for most similar chunks to query.
39
 
40
  Args:
41
  query_embedding (list): Embedding for user query.
42
+ index (faiss.IndexFlatL2): Pre-built FAISS index.
43
+ chunks (list[str]): Original text chunks.
44
+ top_k (int): Number of most similar results to return.
45
 
46
  Returns:
47
+ list[str]: Top-matching text chunks.
48
  """
49
+ if index is None or index.ntotal == 0:
50
+ raise ValueError("❌ FAISS index is empty or not initialized.")
51
 
52
+ # Convert query embedding to correct format
53
+ query_vector = np.array([query_embedding]).astype("float32")
54
+
55
+ # Search the index (returns distances + indices)
56
  distances, indices = index.search(query_vector, top_k)
57
 
58
+ # Extract matched chunks with their distances (sorted)
59
+ results = []
60
+ for i, idx in enumerate(indices[0]):
61
+ if idx < len(chunks):
62
+ results.append(chunks[idx])
63
+
64
+ return results
65
+
66
+
67
+ # -----------------------------
68
+ # OPTIONAL: DEBUG / DEMO
69
+ # -----------------------------
70
+ if __name__ == "__main__":
71
+ # Example usage test
72
+ sample_embeddings = [
73
+ [0.1, 0.2, 0.3],
74
+ [0.2, 0.1, 0.4],
75
+ [0.9, 0.8, 0.7]
76
+ ]
77
+ query_vec = [0.15, 0.18, 0.35]
78
+
79
+ idx = build_faiss_index(sample_embeddings)
80
+ results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)
81
+
82
+ print("🔎 Top Results:")
83
+ for r in results:
84
+ print("-", r)