File size: 4,398 Bytes
4da661f 8e133cf dcde1ea a5c876d 8e133cf dcde1ea a5c876d dcde1ea 8e133cf dcde1ea d1ca01c dcde1ea 8e133cf d1ca01c dcde1ea 8e133cf a5c876d d1ca01c 4da661f dcde1ea 8e133cf dcde1ea a5c876d 8e133cf dcde1ea 8e133cf dcde1ea 8e133cf a5c876d dcde1ea 8e133cf dcde1ea 4da661f dcde1ea 8e133cf a5c876d 8e133cf a5c876d 4da661f 8e133cf dcde1ea 4da661f 8e133cf a5c876d 4da661f a5c876d 4da661f 8e133cf a5c876d 4da661f 8e133cf 4da661f 8e133cf 4da661f 8e133cf 4da661f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import faiss
import numpy as np
import os
# ==========================================================
# BUILD FAISS INDEX (Cosine Similarity Safe Version)
# ==========================================================
def build_faiss_index(embeddings):
    """Build a FAISS inner-product index over L2-normalized embeddings.

    The input is copied into a C-contiguous float32 matrix and L2-normalized
    before being added to the index, so inner-product scores returned by the
    index correspond to cosine similarity. The caller's data is not modified.

    Args:
        embeddings (list | np.ndarray): A single vector (1-D) or a batch of
            vectors (2-D, one vector per row).

    Returns:
        faiss.IndexFlatIP: Index containing the normalized vectors.

    Raises:
        ValueError: If ``embeddings`` is None, empty, or not 1-D/2-D.
        TypeError: If ``embeddings`` is neither a list nor a NumPy array.
    """
    if embeddings is None:
        raise ValueError("β No embeddings provided to build FAISS index.")
    if not isinstance(embeddings, (np.ndarray, list)):
        raise TypeError(f"β Unexpected embeddings type: {type(embeddings)}")

    # np.array() always copies here, and order="C" guarantees the contiguous
    # float32 layout that faiss.normalize_L2 needs (a transposed or sliced
    # ndarray would otherwise keep its non-contiguous layout).
    vectors = np.array(embeddings, dtype="float32", order="C")

    if vectors.size == 0:
        raise ValueError("β Empty embeddings array provided.")

    # Treat a lone vector as a batch of one, for lists and arrays alike.
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)
    elif vectors.ndim != 2:
        raise ValueError(
            f"Embeddings must be 1-D or 2-D, got {vectors.ndim}-D array."
        )

    # In-place normalization: inner product of unit vectors == cosine similarity.
    faiss.normalize_L2(vectors)

    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)
    print(f"β FAISS index built successfully β {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index
# ==========================================================
# SEARCH FAISS INDEX (with sanity checks)
# ==========================================================
def search_faiss(query_embedding, index, chunks, top_k=3):
    """Return the chunks whose indexed vectors best match a query embedding.

    The query is copied into a C-contiguous float32 matrix and L2-normalized
    before searching, so the caller's array is not modified. When a 2-D batch
    of queries is passed, only the results for the first row are returned.

    Args:
        query_embedding (list | np.ndarray): Query vector (1-D) or batch (2-D).
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks, positionally aligned
            with the vectors stored in ``index``.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching chunks, in the order FAISS returned them.

    Raises:
        ValueError: If the index is missing or empty, ``top_k`` is not
            positive, the query is empty or not 1-D/2-D, or the query
            dimension differs from the index dimension.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("β FAISS index is empty or not initialized.")
    if top_k <= 0:
        raise ValueError(f"top_k must be a positive integer, got {top_k}.")

    # Accept lists as well as arrays; the copy keeps normalize_L2 from
    # touching the caller's data and guarantees a contiguous float32 buffer.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.size == 0:
        raise ValueError("Empty query embedding provided.")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    elif query_vector.ndim != 2:
        raise ValueError(
            f"Query embedding must be 1-D or 2-D, got {query_vector.ndim}-D array."
        )

    # Validate the dimension before doing any FAISS work.
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"β Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    faiss.normalize_L2(query_vector)
    _, indices = index.search(query_vector, top_k)

    # Skip ids that do not map to a chunk (FAISS reports -1 when it has fewer
    # than top_k neighbours to return).
    results = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    print(f"π FAISS search completed β retrieved {len(results)} chunks (top_k={top_k})")
    return results
# ==========================================================
# SAVE / LOAD INDEX (Optional Utility)
# ==========================================================
def save_faiss_index(index, path="faiss_index.bin"):
    """Save a FAISS index to disk.

    Args:
        index: FAISS index to serialize.
        path (str | os.PathLike): Destination file. Path-like objects are
            converted with ``os.fspath`` because ``faiss.write_index``
            expects a plain string filename.
    """
    faiss.write_index(index, os.fspath(path))
    print(f"πΎ FAISS index saved to {path}")
def load_faiss_index(path="faiss_index.bin"):
    """Load a FAISS index from disk.

    Args:
        path (str | os.PathLike): File previously written by
            ``faiss.write_index``.

    Returns:
        The index object returned by ``faiss.read_index``.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing regular
            file (a directory is rejected here rather than inside FAISS).
    """
    # isfile() rather than exists(): a directory "exists" but cannot be read
    # as an index, and would otherwise fail later with a less helpful error.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"β No FAISS index found at {path}")
    index = faiss.read_index(os.fspath(path))
    print(f"π FAISS index loaded from {path}")
    return index
# ==========================================================
# DEBUG / LOCAL TEST
# ==========================================================
if __name__ == "__main__":
    # Smoke test: index three 3-dimensional vectors, then fetch the two
    # best-matching chunks for a sample query.
    chunk_labels = ["Chunk A", "Chunk B", "Chunk C"]
    sample_embeddings = np.array(
        [
            [0.1, 0.2, 0.3],
            [0.2, 0.1, 0.4],
            [0.9, 0.8, 0.7],
        ],
        dtype="float32",
    )
    query_vec = np.array([0.15, 0.18, 0.35], dtype="float32")

    # Build the index, then search it.
    demo_index = build_faiss_index(sample_embeddings)
    top_chunks = search_faiss(query_vec, demo_index, chunk_labels, top_k=2)

    print("π Top Results:")
    for chunk in top_chunks:
        print("-", chunk)
|