File size: 5,478 Bytes
c6df419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24d4205
 
 
 
c6df419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55a492c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import time
import uuid
import pickle
import hashlib
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class VectorStore:
    """FAISS-backed vector store with sentence-transformer embeddings.

    The FAISS index and a parallel list of document metadata dicts are
    persisted to disk so the store survives process restarts.
    """

    def __init__(self, index_path: str = "faiss_index.bin",
                 docs_path: str = "faiss_docs.pkl"):
        """Initialize the FAISS vector store and embedding model.

        Args:
            index_path: File used to persist the FAISS index.
            docs_path: File used to persist document metadata (pickle).
        """
        model_name = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
        self.embedding_model = SentenceTransformer(model_name)

        # Derive the dimension from the model instead of hard-coding 384,
        # so changing EMBEDDING_MODEL cannot silently mismatch the index.
        # (all-MiniLM-L6-v2 yields 384; fall back to that if unknown.)
        self.embedding_dim = (
            self.embedding_model.get_sentence_embedding_dimension() or 384
        )
        self.index_path = index_path
        self.docs_path = docs_path

        if os.path.exists(self.index_path) and os.path.exists(self.docs_path):
            self.index = faiss.read_index(self.index_path)
            # NOTE(security): pickle.load executes arbitrary code if the file
            # was tampered with — only load files this process wrote itself.
            with open(self.docs_path, "rb") as f:
                self.docs = pickle.load(f)
            print(f"βœ… Loaded existing FAISS index with {len(self.docs)} documents")
        else:
            self.index = faiss.IndexFlatL2(self.embedding_dim)
            self.docs = []
            print("πŸ†• Created new FAISS index")

    def add_documents(self, chunks: List[Dict]) -> bool:
        """Embed `chunks` (dicts with at least a 'text' key), add them to the
        index, append their metadata to self.docs, and persist both.

        Returns:
            True on success, False on empty input or any error.
        """
        try:
            if not chunks:
                print("⚠️ No chunks to add")
                return False

            print(f"πŸ“₯ Adding {len(chunks)} chunks to FAISS vector store...")
            texts = [chunk['text'] for chunk in chunks]
            vectors = self.embedding_model.encode(texts, show_progress_bar=True)
            # FAISS requires float32; encode() may return float64/float32.
            self.index.add(np.array(vectors).astype("float32"))

            # Record each chunk's position in the index so self.docs stays
            # parallel to the FAISS vector ordering.
            for i, chunk in enumerate(chunks):
                chunk['vector_index'] = len(self.docs) + i
                chunk['chunk_id'] = chunk.get('chunk_id', i)
                self.docs.append(chunk)

            # Persist index and metadata together so they cannot drift apart.
            faiss.write_index(self.index, self.index_path)
            with open(self.docs_path, "wb") as f:
                pickle.dump(self.docs, f)

            print(f"βœ… Successfully added and saved {len(chunks)} documents.")
            return True
        except Exception as e:
            print(f"❌ Error adding documents: {str(e)}")
            return False

    def search_similar(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to `top_k` stored chunks closest to `query`.

        Results are sorted by ascending L2 distance (smaller = closer).
        Returns an empty list on error.
        """
        try:
            query_vec = self.embedding_model.encode([query])
            D, I = self.index.search(np.array(query_vec).astype("float32"), top_k)

            similar_docs = []
            for i, idx in enumerate(I[0]):
                # FAISS pads with -1 when the index holds fewer than top_k
                # vectors; the old `idx < len(self.docs)` check let -1 through
                # and Python's negative indexing returned the LAST document.
                if 0 <= idx < len(self.docs):
                    doc = self.docs[idx]
                    score = float(D[0][i])  # FAISS L2 distance
                    similar_docs.append({
                        'id': self._create_chunk_id(doc, idx),
                        'score': score,
                        'text': doc.get('text', ''),
                        'url': doc.get('url', ''),
                        'title': doc.get('title', ''),
                        'chunk_id': doc.get('chunk_id', 0)
                    })

            # Ensure sorted by closest match (smallest L2 distance)
            similar_docs = sorted(similar_docs, key=lambda x: x['score'])

            print("\n🧠 Retrieved Chunks:")
            for doc in similar_docs:
                print(f"- Score: {doc['score']:.2f} | Text: {doc['text'][:120]}...\n")

            return similar_docs
        except Exception as e:
            print(f"❌ Error searching: {str(e)}")
            return []

    def get_index_stats(self) -> Dict:
        """Return the number of stored vectors and their dimensionality."""
        return {
            'total_vectors': self.index.ntotal,
            'dimension': self.embedding_dim
        }

    def delete_all(self) -> bool:
        """Reset the in-memory index/docs and remove the persisted files.

        Returns:
            True on success, False on any error.
        """
        try:
            self.index = faiss.IndexFlatL2(self.embedding_dim)
            self.docs = []
            if os.path.exists(self.index_path): os.remove(self.index_path)
            if os.path.exists(self.docs_path): os.remove(self.docs_path)
            print("πŸ—‘οΈ All FAISS vectors and docs deleted")
            return True
        except Exception as e:
            print(f"❌ Error deleting vectors: {str(e)}")
            return False

    def _create_chunk_id(self, chunk: Dict, index: int) -> str:
        """Build a human-readable, unique id: <url-slug>_<index>_<8-hex-uuid>."""
        url = chunk.get('url', 'unknown')
        url_base = url.replace('https://', '').replace('http://', '').replace('/', '_')
        return f"{url_base}_{index}_{str(uuid.uuid4())[:8]}"



# Test run / demo: index two sample chunks, query them, and show stats.
if __name__ == "__main__":
    vs = VectorStore()

    sample_chunks = [
        {
            'text': 'Machine learning is a subset of artificial intelligence that focuses on algorithms.',
            'url': 'https://cloud.google.com/learn/artificial-intelligence-vs-machine-learning?hl=en',
            'title': 'Machine Learning Basics',
            'chunk_id': 0
        },
        {
            'text': 'Deep learning uses neural networks with multiple layers to learn complex patterns.',
            'url': 'https://www.ibm.com/think/topics/deep-learning',
            'title': 'Deep Learning Guide',
            'chunk_id': 1
        }
    ]

    # Add the chunks ONCE: the original demo called add_documents twice with
    # the same chunks, appending duplicate vectors to the persisted index on
    # every run.
    if vs.add_documents(sample_chunks):
        results = vs.search_similar("What is machine learning?", top_k=2)
        for r in results:
            print(f"Score: {r['score']:.3f} | Text: {r['text'][:80]}...")

        print("\nβœ… Added chunks:")
        for i, chunk in enumerate(vs.docs[-len(sample_chunks):]):
            print(f"\nπŸ“„ Chunk {i}:")
            print(chunk['text'])  # full text of the chunk

    print("πŸ“Š Stats:", vs.get_index_stats())