FYP-Dashboard / src /vector_database.py
DevLujain
Deploy FYP dashboard
068aa4e
import json
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path
class VectorDatabase:
def __init__(self, db_path="data/vectordb", model_name="all-MiniLM-L6-v2"):
"""
Initialize vector database
Args:
db_path: Path to store vector database
model_name: Sentence transformer model to use
"""
# Initialize ChromaDB
self.client = chromadb.PersistentClient(path=db_path)
self.collection = self.client.get_or_create_collection(
name="documents",
metadata={"hnsw:space": "cosine"}
)
# Load embedding model
print(f"πŸ“¦ Loading embedding model: {model_name}")
self.model = SentenceTransformer(model_name)
print("βœ… Model loaded!")
def load_documents(self, json_path):
"""Load documents from JSON file"""
print(f"\nπŸ“‚ Loading documents from {json_path}")
with open(json_path, 'r', encoding='utf-8') as f:
documents = json.load(f)
print(f"βœ… Loaded {len(documents)} documents")
return documents
def create_embeddings(self, documents):
"""Create embeddings for all documents"""
print(f"\nπŸ”„ Creating embeddings for {len(documents)} documents...")
texts = [doc['content'] for doc in documents]
embeddings = self.model.encode(texts, show_progress_bar=True)
print(f"βœ… Created {len(embeddings)} embeddings")
return embeddings
def store_documents(self, documents, embeddings):
"""Store documents and embeddings in ChromaDB"""
print(f"\nπŸ’Ύ Storing documents in ChromaDB...")
# Prepare data for ChromaDB
ids = [doc['doc_id'] for doc in documents]
texts = [doc['content'] for doc in documents]
metadatas = [
{
'title': doc['title'],
'word_count': str(doc['word_count']),
'source_file': doc['source_file']
}
for doc in documents
]
# Convert embeddings to list format
embeddings_list = [emb.tolist() for emb in embeddings]
# Add to collection
self.collection.add(
ids=ids,
embeddings=embeddings_list,
documents=texts,
metadatas=metadatas
)
print(f"βœ… Stored {len(documents)} documents in ChromaDB")
def search(self, query, top_k=5):
"""Search for similar documents"""
print(f"\nπŸ” Searching for: '{query}'")
# Create embedding for query
query_embedding = self.model.encode([query])[0]
# Search in collection
results = self.collection.query(
query_embeddings=[query_embedding.tolist()],
n_results=top_k
)
return results
def display_results(self, results):
"""Display search results in readable format"""
if not results or not results['documents'] or len(results['documents'][0]) == 0:
print("❌ No results found")
return
print(f"\nβœ… Found {len(results['documents'][0])} results:\n")
for i, (doc, distance, metadata) in enumerate(
zip(
results['documents'][0],
results['distances'][0],
results['metadatas'][0]
)
):
print(f"--- Result {i+1} ---")
print(f"Title: {metadata['title']}")
print(f"Source: {metadata['source_file']}")
print(f"Similarity Score: {1 - distance:.3f}")
print(f"Preview: {doc[:200]}...")
print()
# Main execution
if __name__ == "__main__":
print("=" * 60)
print("πŸš€ VECTOR DATABASE SETUP")
print("=" * 60)
# Initialize vector database
vdb = VectorDatabase()
# Load documents
documents = vdb.load_documents("data/processed/processed_documents.json")
# Create embeddings
embeddings = vdb.create_embeddings(documents)
# Store in database
vdb.store_documents(documents, embeddings)
# Test search
print("\n" + "=" * 60)
print("πŸ§ͺ TESTING SEARCH")
print("=" * 60)
test_queries = [
"How do I create a FastAPI endpoint?",
"What is employee leave policy?",
"How do I work remotely?"
]
for query in test_queries:
results = vdb.search(query, top_k=3)
vdb.display_results(results)
print("\n" + "=" * 60)
print("βœ… VECTOR DATABASE SETUP COMPLETE!")
print("=" * 60)