# Fraud-Chatbot / src/rag/vector_store.py
# ahmzakif's picture
# feat: add new project
# fd99b61 verified
"""Vector store for document embeddings."""
import logging
from typing import List, Optional
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.retrievers import BaseRetriever
from src.config.config import settings
# Module-level logger, named after this module, shared by VectorStore's methods.
logger = logging.getLogger(__name__)
class VectorStore:
    """Chroma-backed vector store for document embeddings and retrieval.

    Embeddings are produced by ``HuggingFaceEmbeddings``. The underlying
    Chroma store and its retriever are created on the first non-empty
    ``add_documents`` call; until then ``similarity_search`` and
    ``get_retriever`` raise ``ValueError``.
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = None,
        *,
        device: str = "cpu",
        retriever_k: int = 5,
    ) -> None:
        """Initialize vector store.

        Args:
            embedding_model: Name of the embedding model.
            persist_directory: Directory to persist the vector store. When
                omitted (or empty), ``settings.vector_store_path`` is used.
            device: Device handed to the embedding model through
                ``model_kwargs``. Defaults to ``"cpu"``.
            retriever_k: Number of documents the retriever built by
                ``add_documents`` returns per query. Defaults to 5.

        Raises:
            ValueError: If ``retriever_k`` is less than 1.
        """
        if retriever_k < 1:
            raise ValueError("retriever_k must be a positive integer")

        self.embedding_model = embedding_model
        self.persist_directory = persist_directory or settings.vector_store_path
        self.retriever_k = retriever_k

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": device},
        )

        # Both stay None until the first successful add_documents() call.
        self.vector_store: Optional[Chroma] = None
        self.retriever: Optional[BaseRetriever] = None

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store.

        The first call creates the Chroma store from ``documents``; later
        calls append to the existing store. The retriever is (re)built after
        every successful add.

        Args:
            documents: List of documents to add. An empty list only logs a
                warning and leaves the store untouched.

        Raises:
            Exception: Any error raised while creating or updating the store
                is logged with its traceback and re-raised unchanged.
        """
        if not documents:
            logger.warning("No documents to add")
            return

        try:
            if self.vector_store is None:
                # Create new vector store
                self.vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory,
                )
            else:
                # Add to existing vector store
                self.vector_store.add_documents(documents)

            # Create retriever
            self.retriever = self.vector_store.as_retriever(
                search_kwargs={"k": self.retriever_k}
            )
            logger.info("Added %d documents to vector store", len(documents))
        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("Error adding documents to vector store: %s", e)
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 5,
    ) -> List[Document]:
        """Search for similar documents.

        Args:
            query: Search query.
            k: Number of results to return. Must be at least 1.

        Returns:
            List of similar documents.

        Raises:
            ValueError: If no documents have been added yet, or if ``k`` is
                less than 1.
            Exception: Any error raised by the underlying store is logged
                with its traceback and re-raised unchanged.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")
        if k < 1:
            # Fail early with a clear message instead of a backend error.
            raise ValueError("k must be a positive integer")

        try:
            results = self.vector_store.similarity_search(query, k=k)
            # Only the first 50 characters of the query are logged.
            logger.info(
                "Found %d similar documents for query: %s...",
                len(results),
                query[:50],
            )
            return results
        except Exception as e:
            logger.exception("Error in similarity search: %s", e)
            raise

    def get_retriever(self) -> BaseRetriever:
        """Get the retriever for RAG.

        Returns:
            Base retriever instance built by the last ``add_documents`` call.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Add documents first.")
        return self.retriever