File size: 2,519 Bytes
4da661f
 
dcde1ea
4da661f
 
 
dcde1ea
 
4da661f
dcde1ea
 
 
 
4da661f
dcde1ea
4da661f
 
 
 
 
 
dcde1ea
4da661f
 
dcde1ea
4da661f
 
 
dcde1ea
 
 
 
4da661f
 
 
dcde1ea
 
4da661f
dcde1ea
 
 
4da661f
 
 
dcde1ea
 
4da661f
dcde1ea
4da661f
 
dcde1ea
4da661f
 
 
 
dcde1ea
 
4da661f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import faiss
import numpy as np

# -----------------------------
# BUILD FAISS INDEX
# -----------------------------
def build_faiss_index(embeddings: list):
    """
    Build a flat L2 (Euclidean) FAISS index from embedding vectors.

    Args:
        embeddings (list): Non-empty collection of equal-length embedding
            vectors (lists of floats). A tuple or a 2-D NumPy array is
            accepted as well.

    Returns:
        faiss.IndexFlatL2: Index containing every input vector, added in
            input order.

    Raises:
        ValueError: If ``embeddings`` is None, empty, not a list/tuple/array,
            not numeric, ragged, or not 2-D with a non-zero vector length.
    """
    invalid_msg = "❌ Invalid input: embeddings must be a non-empty list."

    if not isinstance(embeddings, (list, tuple, np.ndarray)):
        raise ValueError(invalid_msg)

    # Convert before validating so shape problems are reported as ValueError
    # rather than surfacing later as an IndexError or a FAISS-level failure.
    try:
        vectors = np.ascontiguousarray(embeddings, dtype="float32")
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "embeddings must contain numeric vectors of equal length."
        ) from exc

    if vectors.size == 0:
        raise ValueError(invalid_msg)

    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be 2-D with shape (n_vectors, dim); "
            f"got shape {vectors.shape}."
        )

    dim = vectors.shape[1]  # Embedding dimension (e.g., 384 or 768)

    # Exact (brute-force) index using Euclidean (L2) distance
    index = faiss.IndexFlatL2(dim)

    # Add vectors to index
    index.add(vectors)
    print(f"βœ… FAISS index built with {index.ntotal} vectors of dimension {dim}")

    return index


# -----------------------------
# SEARCH FAISS INDEX
# -----------------------------
def search_faiss(query_embedding: list, index, chunks: list, top_k: int = 3):
    """
    Return the text chunks whose vectors are nearest to the query embedding.

    Args:
        query_embedding (list): Embedding for the user query. A flat vector
            or a single-row 2-D array is accepted.
        index (faiss.IndexFlatL2): Pre-built FAISS index.
        chunks (list[str]): Original text chunks; the id returned by the
            index for a vector is used as the position in this list.
        top_k (int): Number of neighbours to request from the index.

    Returns:
        list[str]: Matching chunks in the order the index returned them.
            May contain fewer than ``top_k`` items when the index has fewer
            neighbours or an id has no corresponding chunk.

    Raises:
        ValueError: If ``index`` is None or contains no vectors.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("❌ FAISS index is empty or not initialized.")

    # FAISS expects a float32 batch of shape (n_queries, dim)
    query_vector = np.asarray(query_embedding, dtype="float32").reshape(1, -1)

    # Only the neighbour ids are needed; distances are discarded
    _, indices = index.search(query_vector, top_k)

    # FAISS pads missing neighbours with -1. A bare `idx < len(chunks)` check
    # would let -1 through and wrongly return the last chunk, so enforce the
    # lower bound as well.
    n_chunks = len(chunks)
    return [chunks[int(idx)] for idx in indices[0] if 0 <= idx < n_chunks]


# -----------------------------
# OPTIONAL: DEBUG / DEMO
# -----------------------------
if __name__ == "__main__":
    # Smoke test: index three toy 3-dimensional vectors (one per chunk)
    # and print the two chunks returned for a sample query.
    demo_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    demo_vectors = [
        [0.1, 0.2, 0.3],
        [0.2, 0.1, 0.4],
        [0.9, 0.8, 0.7],
    ]
    demo_query = [0.15, 0.18, 0.35]

    demo_index = build_faiss_index(demo_vectors)
    top_matches = search_faiss(demo_query, demo_index, demo_chunks, top_k=2)

    print("πŸ”Ž Top Results:")
    for match in top_matches:
        print("-", match)