# Ali_Chatbot / vector_store.py
# Author: Ali Abdullah
import os
import time
import uuid
import pickle
import hashlib
from typing import List, Dict
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
class VectorStore:
    """FAISS-backed vector store using sentence-transformer embeddings.

    The index is persisted to ``faiss_index.bin`` and the parallel chunk
    metadata list to ``faiss_docs.pkl`` in the current working directory,
    so the store survives process restarts.
    """

    def __init__(self):
        """Load an existing index/metadata pair from disk, or create a new one."""
        model_name = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
        self.embedding_model = SentenceTransformer(model_name)
        # Derive the dimension from the model instead of hard-coding 384 so
        # alternative EMBEDDING_MODEL values work out of the box (384 is the
        # correct value for the default all-MiniLM-L6-v2, so behavior is
        # unchanged for the default configuration).
        self.embedding_dim = (
            self.embedding_model.get_sentence_embedding_dimension() or 384
        )
        self.index_path = "faiss_index.bin"
        self.docs_path = "faiss_docs.pkl"
        if os.path.exists(self.index_path) and os.path.exists(self.docs_path):
            self.index = faiss.read_index(self.index_path)
            # NOTE(review): pickle.load is unsafe on untrusted files — only
            # load metadata written by this (or a trusted) process.
            with open(self.docs_path, "rb") as f:
                self.docs = pickle.load(f)
            print(f"βœ… Loaded existing FAISS index with {len(self.docs)} documents")
        else:
            self.index = faiss.IndexFlatL2(self.embedding_dim)
            self.docs = []
            print("πŸ†• Created new FAISS index")

    def add_documents(self, chunks: List[Dict]) -> bool:
        """Embed ``chunks``, add them to the index, and persist to disk.

        Each chunk dict must have a 'text' key; 'vector_index' (row in the
        FAISS index) and 'chunk_id' are filled in on the dicts in place.
        Returns True on success, False on empty input or any error.
        """
        try:
            if not chunks:
                print("⚠️ No chunks to add")
                return False
            print(f"πŸ“₯ Adding {len(chunks)} chunks to FAISS vector store...")
            texts = [chunk['text'] for chunk in chunks]
            vectors = self.embedding_model.encode(texts, show_progress_bar=True)
            self.index.add(np.asarray(vectors, dtype="float32"))
            # Record metadata; vector_index mirrors the row just added above.
            for i, chunk in enumerate(chunks):
                chunk['vector_index'] = len(self.docs) + i
                chunk['chunk_id'] = chunk.get('chunk_id', i)
                self.docs.append(chunk)
            # Persist both halves so index and metadata stay in sync on disk.
            faiss.write_index(self.index, self.index_path)
            with open(self.docs_path, "wb") as f:
                pickle.dump(self.docs, f)
            print(f"βœ… Successfully added and saved {len(chunks)} documents.")
            return True
        except Exception as e:
            print(f"❌ Error adding documents: {str(e)}")
            return False

    def search_similar(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` stored chunks closest to ``query``.

        Results are dicts with id/score/text/url/title/chunk_id, sorted by
        ascending L2 distance (smaller score = closer match). Returns [] on
        error or when the index is empty.
        """
        try:
            query_vec = self.embedding_model.encode([query])
            D, I = self.index.search(np.asarray(query_vec, dtype="float32"), top_k)
            similar_docs = []
            for i, idx in enumerate(I[0]):
                # BUG FIX: FAISS pads the result with -1 when fewer than
                # top_k vectors exist; the old `idx < len(self.docs)` check
                # let -1 through and silently returned the *last* document.
                if 0 <= idx < len(self.docs):
                    doc = self.docs[idx]
                    score = float(D[0][i])  # L2 distance
                    similar_docs.append({
                        'id': self._create_chunk_id(doc, idx),
                        'score': score,
                        'text': doc.get('text', ''),
                        'url': doc.get('url', ''),
                        'title': doc.get('title', ''),
                        'chunk_id': doc.get('chunk_id', 0)
                    })
            # Ensure sorted by closest match (smallest L2 distance).
            similar_docs.sort(key=lambda d: d['score'])
            print("\n🧠 Retrieved Chunks:")
            for doc in similar_docs:
                print(f"- Score: {doc['score']:.2f} | Text: {doc['text'][:120]}...\n")
            return similar_docs
        except Exception as e:
            print(f"❌ Error searching: {str(e)}")
            return []

    def get_index_stats(self) -> Dict:
        """Return the number of stored vectors and the embedding dimension."""
        return {
            'total_vectors': self.index.ntotal,
            'dimension': self.embedding_dim
        }

    def delete_all(self) -> bool:
        """Reset the in-memory index/docs and remove the persisted files."""
        try:
            self.index = faiss.IndexFlatL2(self.embedding_dim)
            self.docs = []
            if os.path.exists(self.index_path):
                os.remove(self.index_path)
            if os.path.exists(self.docs_path):
                os.remove(self.docs_path)
            print("πŸ—‘οΈ All FAISS vectors and docs deleted")
            return True
        except Exception as e:
            print(f"❌ Error deleting vectors: {str(e)}")
            return False

    def _create_chunk_id(self, chunk: Dict, index: int) -> str:
        """Build a readable, unique id: <sanitized-url>_<row>_<8 uuid hex chars>."""
        url = chunk.get('url', 'unknown')
        url_base = url.replace('https://', '').replace('http://', '').replace('/', '_')
        return f"{url_base}_{index}_{str(uuid.uuid4())[:8]}"
# Smoke test: index two sample chunks, run a query, and print stats.
if __name__ == "__main__":
    vs = VectorStore()
    sample_chunks = [
        {
            'text': 'Machine learning is a subset of artificial intelligence that focuses on algorithms.',
            'url': 'https://cloud.google.com/learn/artificial-intelligence-vs-machine-learning?hl=en',
            'title': 'Machine Learning Basics',
            'chunk_id': 0
        },
        {
            'text': 'Deep learning uses neural networks with multiple layers to learn complex patterns.',
            'url': 'https://www.ibm.com/think/topics/deep-learning',
            'title': 'Deep Learning Guide',
            'chunk_id': 1
        }
    ]
    # Add the chunks ONCE — the original called add_documents twice with the
    # same list, re-embedding the chunks and doubling the vectors stored in
    # the persisted index on every run.
    if vs.add_documents(sample_chunks):
        results = vs.search_similar("What is machine learning?", top_k=2)
        for r in results:
            print(f"Score: {r['score']:.3f} | Text: {r['text'][:80]}...")
        print("πŸ“Š Stats:", vs.get_index_stats())
        print("\nβœ… Added chunks:")
        for i, chunk in enumerate(vs.docs[-len(sample_chunks):]):
            print(f"\nπŸ“„ Chunk {i}:")
            print(chunk['text'])  # full text of the chunk