Spaces:
Sleeping
Sleeping
"""Vector store for document embeddings."""

import logging
from typing import List, Optional

from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

from src.config.config import settings

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)
class VectorStore:
    """Vector store for document embeddings and retrieval.

    Holds a HuggingFace embedding model and a Chroma vector store. The
    Chroma store is created lazily by the first non-empty
    ``add_documents`` call; until then ``similarity_search`` and
    ``get_retriever`` raise ``ValueError``.
    """

    # Number of documents the retriever returns when none is requested.
    DEFAULT_RETRIEVER_K = 5
    # Longest query prefix that is echoed into the log.
    _QUERY_LOG_CHARS = 50

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = None,
        *,
        retriever_k: int = DEFAULT_RETRIEVER_K,
    ) -> None:
        """Initialize vector store.

        Args:
            embedding_model: Name of the embedding model.
            persist_directory: Directory to persist the vector store. When
                falsy, ``settings.vector_store_path`` is used instead.
            retriever_k: Number of documents the retriever built by
                ``add_documents`` returns per query.

        Raises:
            ValueError: If ``retriever_k`` is smaller than 1.
        """
        # Validate first so a bad argument fails before the embedding
        # model is constructed.
        if retriever_k < 1:
            raise ValueError("retriever_k must be a positive integer")

        self.embedding_model = embedding_model
        self.persist_directory = persist_directory or settings.vector_store_path
        self.retriever_k = retriever_k

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": "cpu"},
        )

        # Both stay None until documents are added.
        self.vector_store: Optional[Chroma] = None
        self.retriever: Optional[BaseRetriever] = None

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store.

        The first call creates the Chroma store from ``documents``; later
        calls append to the existing store. The retriever is rebuilt after
        every successful call.

        Args:
            documents: List of documents to add. An empty list is ignored
                with a warning.

        Raises:
            Exception: Any error raised by the underlying store is logged
                with its traceback and re-raised unchanged.
        """
        if not documents:
            logger.warning("No documents to add")
            return

        try:
            if self.vector_store is None:
                # Create new vector store
                self.vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory,
                )
            else:
                # Add to existing vector store
                self.vector_store.add_documents(documents)

            # Create retriever
            self.retriever = self.vector_store.as_retriever(
                search_kwargs={"k": self.retriever_k}
            )
            logger.info("Added %d documents to vector store", len(documents))
        except Exception:
            # logger.exception records the traceback as well as the message.
            logger.exception("Error adding documents to vector store")
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 5,
    ) -> List[Document]:
        """Search for similar documents.

        Args:
            query: Search query.
            k: Number of results to return.

        Returns:
            List of similar documents.

        Raises:
            ValueError: If no documents have been added yet.
            Exception: Any error raised by the underlying store is logged
                with its traceback and re-raised unchanged.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")

        try:
            results = self.vector_store.similarity_search(query, k=k)
        except Exception:
            logger.exception("Error in similarity search")
            raise

        # Only mark the logged query as truncated when it really was.
        limit = self._QUERY_LOG_CHARS
        preview = query if len(query) <= limit else query[:limit] + "..."
        logger.info(
            "Found %d similar documents for query: %s", len(results), preview
        )
        return results

    def get_retriever(self) -> BaseRetriever:
        """Get the retriever for RAG.

        Returns:
            The retriever built by the most recent ``add_documents`` call.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Add documents first.")
        return self.retriever