|
|
import faiss |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_faiss_index(embeddings: list):
    """
    Build a FAISS inner-product index over L2-normalized embeddings.

    The embeddings are copied into a float32 matrix, L2-normalized in place,
    and added to a ``faiss.IndexFlatIP``. Because the stored vectors have unit
    length, inner-product search against this index ranks by cosine
    similarity.

    Args:
        embeddings (list): Non-empty list of equal-length embedding vectors
            (lists of floats).

    Returns:
        faiss.IndexFlatIP: FAISS index for cosine similarity search, holding
        one normalized vector per input embedding.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty list, or cannot be
            converted to a 2-D numeric matrix with at least one column.
    """
    if not isinstance(embeddings, list) or not embeddings:
        raise ValueError("❌ Invalid input: embeddings must be a non-empty list.")

    # Build the float32 matrix in one step; np.array copies the list data, so
    # the in-place normalization below never touches the caller's objects.
    try:
        vectors = np.array(embeddings, dtype="float32")
    except (TypeError, ValueError) as err:
        # Ragged rows or non-numeric entries cannot form a float32 matrix.
        raise ValueError(
            "❌ Invalid input: embeddings must be equal-length lists of numbers."
        ) from err

    # A flat list of floats (1-D) or rows of length zero would otherwise fail
    # later with an unrelated IndexError / FAISS assertion.
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "❌ Invalid input: embeddings must be a 2-D list of non-empty vectors."
        )

    dim = vectors.shape[1]

    # Unit-length vectors make inner product equal to cosine similarity.
    faiss.normalize_L2(vectors)

    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"✅ FAISS index built with {index.ntotal} vectors of dimension {dim} (cosine similarity)")
    return index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3) -> list:
    """
    Search a FAISS index for the chunks most similar to a query.

    Args:
        query_embedding (list): Embedding for the user query (list of floats).
        index: Pre-built FAISS index, e.g. the ``faiss.IndexFlatIP`` returned
            by ``build_faiss_index``. Only ``ntotal`` and ``search`` are used.
        chunks (list[str]): Original text chunks; position ``i`` must
            correspond to vector ``i`` in the index.
        top_k (int): Maximum number of results to return. Must be >= 1.

    Returns:
        list[str]: Matching chunks in the order the index returned them
        (best match first). May contain fewer than ``top_k`` items.

    Raises:
        ValueError: If the index is ``None`` or empty, or ``top_k`` is < 1.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")
    if top_k < 1:
        raise ValueError("❌ Invalid input: top_k must be a positive integer.")

    # FAISS expects a 2-D float32 batch; wrap the single query as one row.
    # The query is not normalized here: for an inner-product index, scaling
    # the query scales every score equally, so the ranking is unchanged.
    query_vector = np.array([query_embedding], dtype="float32")

    _scores, indices = index.search(query_vector, top_k)

    # FAISS pads the id row with -1 when fewer than top_k neighbours exist.
    # A bare `idx < len(chunks)` test would accept -1 and silently return the
    # last chunk, so require a non-negative, in-range position.
    n_chunks = len(chunks)
    return [chunks[idx] for idx in indices[0] if 0 <= idx < n_chunks]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: index three toy 3-D embeddings, then print the two
    # chunks whose vectors are most similar to the query vector.
    sample_embeddings = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    query_vec = [0.15, 0.18, 0.35]

    idx = build_faiss_index(sample_embeddings)
    results = search_faiss(query_vec, idx, ["Chunk A", "Chunk B", "Chunk C"], top_k=2)

    # Header emoji restored from its mis-decoded form in the source text.
    print("🔍 Top Results:")
    for r in results:
        print("-", r)
|
|
|