File size: 2,652 Bytes
4da661f dcde1ea 4da661f d1ca01c 4da661f dcde1ea d1ca01c dcde1ea d1ca01c dcde1ea 4da661f d1ca01c dcde1ea d1ca01c dcde1ea d1ca01c 4da661f dcde1ea d1ca01c dcde1ea d1ca01c 4da661f dcde1ea 4da661f dcde1ea 4da661f dcde1ea 4da661f dcde1ea 4da661f dcde1ea 4da661f dcde1ea 4da661f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import faiss
import numpy as np
# -----------------------------
# BUILD FAISS INDEX (Optimized for Cosine Similarity)
# -----------------------------
def build_faiss_index(embeddings: list):
    """
    Build a FAISS inner-product index over L2-normalized embeddings.

    Every vector is scaled to unit length before it is added, so the inner
    product the index computes between stored vectors equals their cosine
    similarity.

    Args:
        embeddings (list): Non-empty list of embedding vectors (lists of
            floats). All vectors must have the same, non-zero length.

    Returns:
        faiss.IndexFlatIP: Index holding one normalized vector per input
        embedding, in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or not a list, or if it does
            not form a 2-D array with at least one column.
    """
    if not embeddings or not isinstance(embeddings, list):
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")

    # FAISS works on float32 matrices of shape (n_vectors, dim).
    vectors = np.array(embeddings).astype("float32")

    # A flat list of floats (1-D) or a list of empty vectors has no usable
    # dimension; reject it here instead of failing later with an IndexError.
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "❌ Invalid input: embeddings must be a list of equal-length, "
            "non-empty vectors."
        )
    dim = vectors.shape[1]

    # In-place normalization turns the dot product into cosine similarity.
    faiss.normalize_L2(vectors)

    # Exact (brute-force) inner-product index.
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(
        f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim} "
        "(cosine similarity)"
    )
    return index
# -----------------------------
# SEARCH FAISS INDEX
# -----------------------------
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
    """
    Return the text chunks that the index ranks closest to a query.

    Args:
        query_embedding (list): Embedding vector for the user query.
        index: Pre-built FAISS index (for example the ``faiss.IndexFlatIP``
            produced by ``build_faiss_index``). Only ``ntotal`` and
            ``search`` are used.
        chunks (list[str]): Original text chunks; the id returned by the
            index is used as the position in this list.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching chunks in the order returned by
        ``index.search``. The list is shorter than ``top_k`` when the index
        yields fewer valid ids.

    Raises:
        ValueError: If ``index`` is ``None`` or holds no vectors.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")

    # The index searches a batch of queries, so wrap the single query in a
    # list to obtain a (1, dim) float32 matrix.
    query_vector = np.array([query_embedding]).astype("float32")

    # Only the ids are needed; the scores are discarded.
    _, indices = index.search(query_vector, top_k)

    # FAISS pads the id row with -1 when fewer than top_k neighbours exist.
    # Without the lower bound, -1 would pass the range check and silently
    # return chunks[-1] as a bogus match.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
# -----------------------------
# OPTIONAL: DEBUG / DEMO
# -----------------------------
if __name__ == "__main__":
    # Smoke demo: index three toy vectors, then fetch the two best chunks
    # for a query that lies close to the first vector.
    demo_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    demo_vectors = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    demo_index = build_faiss_index(demo_vectors)
    top_matches = search_faiss(
        [0.15, 0.18, 0.35], demo_index, demo_chunks, top_k=2
    )
    print("π Top Results:")
    for match in top_matches:
        print("-", match)
|