# Listing metadata from the web file viewer: file size 2,519 bytes (blame hashes and line-number gutter omitted).
import faiss
import numpy as np
# -----------------------------
# BUILD FAISS INDEX
# -----------------------------
def build_faiss_index(embeddings: list):
    """Build a flat L2 FAISS index from a batch of embedding vectors.

    Args:
        embeddings: Non-empty list (a tuple or ``numpy.ndarray`` is also
            accepted) of equal-length numeric vectors, i.e. data that
            converts to a 2-D ``float32`` array of shape
            ``(n_vectors, dim)``.

    Returns:
        faiss.IndexFlatL2: Index containing every vector, in input order.

    Raises:
        ValueError: If ``embeddings`` is not a list/tuple/array, is empty,
            holds non-numeric or ragged rows, or is not two-dimensional
            (for example a single flat vector).
    """
    if not isinstance(embeddings, (list, tuple, np.ndarray)):
        raise ValueError("Invalid input: embeddings must be a non-empty list.")

    # Convert once, up front, so malformed input fails here with a clear
    # message instead of deep inside NumPy or FAISS.
    try:
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "Invalid input: embeddings must be numeric vectors of equal length."
        ) from exc

    if vectors.size == 0:
        raise ValueError("Invalid input: embeddings must be a non-empty list.")
    if vectors.ndim != 2:
        # A flat vector would otherwise crash on ``shape[1]`` with IndexError.
        raise ValueError(
            "Invalid input: embeddings must be a 2-D collection of vectors "
            f"(got {vectors.ndim} dimension(s))."
        )

    dim = vectors.shape[1]  # embedding dimension, e.g. 384 or 768

    # Exact (brute-force) index using Euclidean (L2) distance.
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)

    print(f"FAISS index built with {index.ntotal} vectors of dimension {dim}")
    return index
# -----------------------------
# SEARCH FAISS INDEX
# -----------------------------
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
    """Return the text chunks whose embeddings are closest to the query.

    Args:
        query_embedding: A single embedding vector for the user query
            (a flat sequence of floats, or a ``(1, dim)`` array).
        index: Pre-built FAISS index (e.g. ``faiss.IndexFlatL2``). Only
            ``ntotal``, ``search`` and, when present, ``d`` are used.
        chunks: Original text chunks; position ``i`` is assumed to match
            the ``i``-th vector added to ``index``.
        top_k: Maximum number of results to return.

    Returns:
        list[str]: Matching chunks in the order returned by ``index.search``
        (nearest first for a FAISS index). May hold fewer than ``top_k``
        items.

    Raises:
        ValueError: If the index is missing or empty, ``top_k`` is not
            positive, or the query is not a single vector of the index's
            dimension.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError("top_k must be a positive integer.")

    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_vector = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if query_vector.ndim == 1:
        query_vector = query_vector[np.newaxis, :]
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise ValueError("query_embedding must be a single embedding vector.")

    expected_dim = getattr(index, "d", None)
    if expected_dim is not None and query_vector.shape[1] != expected_dim:
        raise ValueError(
            f"query_embedding has dimension {query_vector.shape[1]}, "
            f"but the index expects {expected_dim}."
        )

    # Asking for more neighbours than stored vectors only yields padding.
    k = min(int(top_k), int(index.ntotal))
    _, labels = index.search(query_vector, k)

    # FAISS marks missing neighbours with label -1. Without the lower bound
    # that -1 would index ``chunks`` from the end and return a bogus match.
    return [chunks[label] for label in labels[0] if 0 <= label < len(chunks)]
# -----------------------------
# OPTIONAL: DEBUG / DEMO
# -----------------------------
if __name__ == "__main__":
    # Smoke test: index three toy 3-D vectors, then look up the two chunks
    # closest to a query that sits next to the first two vectors.
    sample_embeddings = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    sample_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    query_vec = [0.15, 0.18, 0.35]

    demo_index = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, demo_index, sample_chunks, top_k=2)

    print("Top Results:")
    for r in results:
        print("-", r)