File size: 4,398 Bytes
4da661f
 
8e133cf
dcde1ea
a5c876d
 
 
8e133cf
dcde1ea
a5c876d
dcde1ea
 
8e133cf
dcde1ea
d1ca01c
dcde1ea
8e133cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1ca01c
dcde1ea
8e133cf
a5c876d
d1ca01c
4da661f
dcde1ea
8e133cf
dcde1ea
 
 
a5c876d
 
 
8e133cf
dcde1ea
8e133cf
dcde1ea
 
8e133cf
 
 
a5c876d
dcde1ea
 
8e133cf
dcde1ea
4da661f
 
dcde1ea
8e133cf
 
a5c876d
 
 
 
 
 
8e133cf
a5c876d
4da661f
8e133cf
dcde1ea
4da661f
8e133cf
a5c876d
4da661f
 
a5c876d
4da661f
 
 
8e133cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5c876d
 
 
4da661f
8e133cf
 
4da661f
 
 
8e133cf
 
 
4da661f
8e133cf
4da661f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import faiss
import numpy as np
import os

# ==========================================================
# BUILD FAISS INDEX (Cosine Similarity Safe Version)
# ==========================================================
def build_faiss_index(embeddings):
    """
    Build a flat inner-product FAISS index over L2-normalized embeddings.

    Every vector is scaled to unit length before it is added, so the inner
    product scores returned by the index are cosine similarities.

    Args:
        embeddings (list | np.ndarray): Embedding vectors, either a 2D
            collection of shape (n, dim) or a single flat vector of length
            dim (treated as one row). The input is copied, never modified.

    Returns:
        faiss.IndexFlatIP: Index containing the normalized vectors.

    Raises:
        ValueError: If ``embeddings`` is None, contains no values, or is not
            1D/2D after conversion.
        TypeError: If ``embeddings`` is neither a list nor a NumPy array.
    """
    # NOTE(review): the symbol prefixes in the messages below look mis-encoded;
    # they are runtime text and are intentionally left unchanged here.
    if embeddings is None:
        raise ValueError("❌ No embeddings provided to build FAISS index.")

    if not isinstance(embeddings, (list, np.ndarray)):
        raise TypeError(f"❌ Unexpected embeddings type: {type(embeddings)}")

    # np.array always copies, so the in-place normalization below cannot
    # touch the caller's data; order="C" guarantees the contiguous float32
    # buffer that FAISS reads through a raw pointer (a transposed or strided
    # input would otherwise be rejected).
    vectors = np.array(embeddings, dtype="float32", order="C")

    if vectors.size == 0:
        raise ValueError("❌ Empty embeddings array provided.")

    # A single flat vector is accepted from both lists and arrays.
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)
    elif vectors.ndim != 2:
        raise ValueError(
            f"Embeddings must form a 2D array of shape (n, dim); got ndim={vectors.ndim}."
        )

    # Unit-length rows make inner product equal to cosine similarity.
    faiss.normalize_L2(vectors)

    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"βœ… FAISS index built successfully β€” {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index


# ==========================================================
# SEARCH FAISS INDEX (with sanity checks)
# ==========================================================
def search_faiss(query_embedding, index, chunks, top_k=3):
    """
    Return the chunks whose indexed vectors are most similar to a query.

    The query is converted to a contiguous float32 copy, checked against the
    index dimension, L2-normalized, and then searched. Only the results for
    the first query row are returned, even if several rows are supplied.

    Args:
        query_embedding (list | np.ndarray): Query vector, flat (dim,) or
            2D (rows, dim). The input is copied, never modified.
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks, positionally aligned
            with the vectors stored in ``index``.
        top_k (int): Maximum number of results to return.

    Returns:
        list[str]: Matching chunks in the order returned by the index. Ids
            that fall outside ``chunks`` (including the -1 padding FAISS uses
            for missing neighbours) are skipped, so fewer than ``top_k``
            items may be returned.

    Raises:
        ValueError: If the index is None or empty, ``top_k`` is not positive,
            the query is not 1D/2D, or its dimension differs from the index.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")

    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer; got {top_k}.")

    # Copy into a C-contiguous float32 buffer: accepts plain lists as well as
    # arrays, and keeps the in-place normalization away from the caller's data.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    elif query_vector.ndim != 2:
        raise ValueError(
            f"Query embedding must be 1D or 2D; got ndim={query_vector.ndim}."
        )

    # Validate the dimension before doing any FAISS work.
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"❌ Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    faiss.normalize_L2(query_vector)

    # Asking for more neighbours than stored vectors only yields -1 padding.
    k = min(top_k, index.ntotal)
    _, indices = index.search(query_vector, k)

    results = [chunks[int(i)] for i in indices[0] if 0 <= i < len(chunks)]

    print(f"πŸ” FAISS search completed β€” retrieved {len(results)} chunks (top_k={top_k})")
    return results


# ==========================================================
# SAVE / LOAD INDEX (Optional Utility)
# ==========================================================
def save_faiss_index(index, path="faiss_index.bin"):
    """
    Write a FAISS index to disk.

    Args:
        index: FAISS index to serialize.
        path (str | os.PathLike): Destination file. Path-like objects such as
            ``pathlib.Path`` are converted to ``str`` first, because the FAISS
            writer is a SWIG binding that expects a plain string filename.
    """
    if isinstance(path, os.PathLike):
        path = os.fspath(path)
    faiss.write_index(index, path)
    print(f"πŸ’Ύ FAISS index saved to {path}")


def load_faiss_index(path="faiss_index.bin"):
    """
    Read a FAISS index from disk.

    Args:
        path (str | os.PathLike): File to read. Path-like objects such as
            ``pathlib.Path`` are converted with ``os.fspath`` before use.

    Returns:
        The index object produced by ``faiss.read_index``.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing regular
            file (a directory at that location is also rejected).
    """
    path = os.fspath(path)
    # isfile rather than exists: a directory would pass an existence check
    # and only fail later inside the FAISS reader.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"❌ No FAISS index found at {path}")
    index = faiss.read_index(path)
    print(f"πŸ“‚ FAISS index loaded from {path}")
    return index


# ==========================================================
# DEBUG / LOCAL TEST
# ==========================================================
if __name__ == "__main__":
    # Smoke test: index three 3-dimensional vectors, then look up the two
    # chunks closest to a query vector and print them.
    demo_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    demo_vectors = np.array(
        [
            [0.1, 0.2, 0.3],
            [0.2, 0.1, 0.4],
            [0.9, 0.8, 0.7],
        ],
        dtype="float32",
    )
    demo_query = np.array([0.15, 0.18, 0.35], dtype="float32")

    demo_index = build_faiss_index(demo_vectors)
    top_hits = search_faiss(demo_query, demo_index, demo_chunks, top_k=2)

    print("πŸ”Ž Top Results:")
    for hit in top_hits:
        print("-", hit)