"""Vector store for document embeddings.""" import logging from typing import List, Optional from langchain_core.documents import Document from langchain_chroma import Chroma from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_core.retrievers import BaseRetriever from src.config.config import settings logger = logging.getLogger(__name__) class VectorStore: """Vector store for document embeddings and retrieval.""" def __init__( self, embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", persist_directory: Optional[str] = None, ) -> None: """Initialize vector store. Args: embedding_model: Name of the embedding model. persist_directory: Directory to persist the vector store. """ self.embedding_model = embedding_model self.persist_directory = persist_directory or settings.vector_store_path # Initialize embeddings self.embeddings = HuggingFaceEmbeddings( model_name=embedding_model, model_kwargs={"device": "cpu"}, ) self.vector_store: Optional[Chroma] = None self.retriever: Optional[BaseRetriever] = None def add_documents(self, documents: List[Document]) -> None: """Add documents to the vector store. Args: documents: List of documents to add. """ if not documents: logger.warning("No documents to add") return try: if self.vector_store is None: # Create new vector store self.vector_store = Chroma.from_documents( documents=documents, embedding=self.embeddings, persist_directory=self.persist_directory, ) else: # Add to existing vector store self.vector_store.add_documents(documents) # Create retriever self.retriever = self.vector_store.as_retriever( search_kwargs={"k": 5} ) logger.info(f"Added {len(documents)} documents to vector store") except Exception as e: logger.error(f"Error adding documents to vector store: {str(e)}") raise def similarity_search( self, query: str, k: int = 5, ) -> List[Document]: """Search for similar documents. Args: query: Search query. k: Number of results to return. Returns: List of similar documents. """ if self.vector_store is None: raise ValueError("Vector store not initialized. Add documents first.") try: results = self.vector_store.similarity_search(query, k=k) logger.info(f"Found {len(results)} similar documents for query: {query[:50]}...") return results except Exception as e: logger.error(f"Error in similarity search: {str(e)}") raise def get_retriever(self) -> BaseRetriever: """Get the retriever for RAG. Returns: Base retriever instance. """ if self.retriever is None: raise ValueError("Retriever not initialized. Add documents first.") return self.retriever