File size: 3,471 Bytes
fd99b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""Vector store for document embeddings."""

import logging
from typing import List, Optional

from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.retrievers import BaseRetriever

from src.config.config import settings

logger = logging.getLogger(__name__)


class VectorStore:
    """Vector store for document embeddings and retrieval.

    Pairs a HuggingFace embedding model with a Chroma store. The Chroma store
    and the default retriever are created by the first call to
    ``add_documents`` that receives a non-empty list; until then both
    ``vector_store`` and ``retriever`` are ``None`` and the query methods
    raise ``ValueError``.

    NOTE(review): nothing in this class reopens a store previously persisted
    at ``persist_directory`` before ``add_documents`` is called -- confirm
    whether that is intended.
    """

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = None,
        *,
        device: str = "cpu",
        retriever_k: int = 5,
    ) -> None:
        """Initialize vector store.

        Args:
            embedding_model: Name of the embedding model.
            persist_directory: Directory to persist the vector store. Falls
                back to ``settings.vector_store_path`` when omitted or empty.
            device: Device name handed to the embedding model through
                ``model_kwargs``.
            retriever_k: Value of ``k`` placed in the default retriever's
                ``search_kwargs``.

        Raises:
            ValueError: If ``retriever_k`` is less than 1.
        """
        if retriever_k < 1:
            raise ValueError("retriever_k must be a positive integer")

        self.embedding_model = embedding_model
        self.persist_directory = persist_directory or settings.vector_store_path
        self.device = device
        self.retriever_k = retriever_k

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": device},
        )

        # Both stay None until add_documents() has stored at least one batch.
        self.vector_store: Optional[Chroma] = None
        self.retriever: Optional[BaseRetriever] = None

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store.

        The first non-empty batch creates the Chroma store; later batches are
        appended to it. The default retriever is (re)built after every
        successful add.

        Args:
            documents: List of documents to add. An empty list only logs a
                warning and leaves the store untouched.

        Raises:
            Exception: Any error raised while storing the documents or
                building the retriever is logged and re-raised unchanged.
        """
        if not documents:
            logger.warning("No documents to add")
            return

        try:
            if self.vector_store is None:
                # Create new vector store
                self.vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory,
                )
            else:
                # Add to existing vector store
                self.vector_store.add_documents(documents)

            # Create retriever
            self.retriever = self.vector_store.as_retriever(
                search_kwargs={"k": self.retriever_k}
            )

            logger.info("Added %d documents to vector store", len(documents))
        except Exception as e:
            # logger.exception keeps the traceback in the log record.
            logger.exception("Error adding documents to vector store: %s", e)
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 5,
    ) -> List[Document]:
        """Search for similar documents.

        Args:
            query: Search query.
            k: Number of results to return.

        Returns:
            List of similar documents.

        Raises:
            ValueError: If no documents have been added yet, or if ``k`` is
                less than 1.
            Exception: Any error raised by the underlying store is logged and
                re-raised unchanged.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")
        if k < 1:
            raise ValueError("k must be a positive integer")

        try:
            results = self.vector_store.similarity_search(query, k=k)
            # Only the first 50 characters of the query are logged.
            logger.info(
                "Found %d similar documents for query: %s...",
                len(results),
                query[:50],
            )
            return results
        except Exception as e:
            logger.exception("Error in similarity search: %s", e)
            raise

    def get_retriever(self, k: Optional[int] = None) -> BaseRetriever:
        """Get the retriever for RAG.

        Args:
            k: Optional override for the retriever's ``k``. When omitted, the
                default retriever built by ``add_documents`` is returned.
                When given, a new retriever over the same store is returned
                and the default retriever is left unchanged.

        Returns:
            Base retriever instance.

        Raises:
            ValueError: If no documents have been added yet, or if ``k`` is
                given and is less than 1.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Add documents first.")

        if k is None:
            return self.retriever

        if k < 1:
            raise ValueError("k must be a positive integer")
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")

        return self.vector_store.as_retriever(search_kwargs={"k": k})