File size: 7,632 Bytes
b35e487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""

Embedding Manager Module

Handles text embeddings and vector database operations using sentence-transformers and FAISS

"""

import logging
import os
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

class EmbeddingManager:
    """Manages text embeddings and FAISS vector-store operations.

    Wraps a sentence-transformers model (through LangChain's
    ``HuggingFaceEmbeddings``) and a FAISS vector store, providing
    knowledge-base creation, similarity search, and persistence.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager and eagerly load the model.

        Args:
            model_name: Name of the sentence-transformers model to use;
                resolved as ``sentence-transformers/<model_name>``.

        Raises:
            Exception: Propagated unchanged if the embedding model fails
                to load (e.g. model not available locally or remotely).
        """
        self.model_name = model_name
        self.embeddings = None    # HuggingFaceEmbeddings, set by _initialize_embeddings()
        self.vector_store = None  # FAISS store, set by create/load methods

        # NOTE(review): basicConfig configures the *root* logger as a side
        # effect of construction; kept for backward compatibility with
        # callers that rely on it, but applications usually own logging setup.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Fail fast: load the model at construction time rather than on
        # first use, so configuration errors surface immediately.
        self._initialize_embeddings()

    def _initialize_embeddings(self):
        """Load the sentence-transformers model behind a LangChain wrapper.

        Raises:
            Exception: Propagated after logging if model loading fails.
        """
        try:
            self.logger.info("Loading embedding model: %s", self.model_name)

            # CPU device keeps this portable; normalized embeddings make
            # inner-product / cosine similarity equivalent.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=f"sentence-transformers/{self.model_name}",
                model_kwargs={'device': 'cpu'},
                encode_kwargs={'normalize_embeddings': True}
            )

            self.logger.info("Embedding model loaded successfully")

        except Exception as e:
            self.logger.error("Error loading embedding model: %s", e)
            raise

    def create_knowledge_base(self, documents: List[Document]) -> FAISS:
        """Create a FAISS knowledge base from documents.

        Args:
            documents: List of LangChain ``Document`` objects to embed.

        Returns:
            The newly created FAISS vector store (also kept on
            ``self.vector_store``).

        Raises:
            ValueError: If ``documents`` is empty.
        """
        try:
            if not documents:
                raise ValueError("No documents provided")

            self.logger.info("Creating knowledge base with %d documents", len(documents))

            # Split content and metadata so FAISS can index texts while
            # preserving per-document metadata for retrieval.
            texts = [doc.page_content for doc in documents]
            metadatas = [doc.metadata for doc in documents]

            self.vector_store = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings,
                metadatas=metadatas
            )

            self.logger.info("Knowledge base created successfully")
            return self.vector_store

        except Exception as e:
            self.logger.error("Error creating knowledge base: %s", e)
            raise

    def similarity_search(self, query: str, k: int = 4) -> List[Document]:
        """Perform similarity search on the knowledge base.

        Args:
            query: Search query text.
            k: Number of similar documents to return.

        Returns:
            Up to ``k`` documents most similar to ``query``.

        Raises:
            ValueError: If no knowledge base has been created or loaded.
        """
        try:
            if not self.vector_store:
                raise ValueError("Knowledge base not initialized")

            # Truncate the query in logs to keep lines readable.
            self.logger.info("Performing similarity search for: %s...", query[:50])

            similar_docs = self.vector_store.similarity_search(query, k=k)

            self.logger.info("Found %d similar documents", len(similar_docs))
            return similar_docs

        except Exception as e:
            self.logger.error("Error in similarity search: %s", e)
            raise

    def similarity_search_with_score(self, query: str, k: int = 4) -> List[tuple]:
        """Perform similarity search and return relevance scores.

        Args:
            query: Search query text.
            k: Number of similar documents to return.

        Returns:
            List of ``(document, score)`` tuples; score semantics depend on
            the FAISS index (lower L2 distance = more similar by default).

        Raises:
            ValueError: If no knowledge base has been created or loaded.
        """
        try:
            if not self.vector_store:
                raise ValueError("Knowledge base not initialized")

            self.logger.info(
                "Performing similarity search with scores for: %s...", query[:50]
            )

            similar_docs_with_scores = self.vector_store.similarity_search_with_score(query, k=k)

            self.logger.info("Found %d similar documents", len(similar_docs_with_scores))
            return similar_docs_with_scores

        except Exception as e:
            self.logger.error("Error in similarity search with scores: %s", e)
            raise

    def save_knowledge_base(self, path: str = "knowledge_base"):
        """Save the knowledge base to disk.

        Args:
            path: Directory path to save the knowledge base into
                (created if it does not exist).

        Raises:
            ValueError: If no knowledge base has been created or loaded.
        """
        try:
            if not self.vector_store:
                raise ValueError("Knowledge base not initialized")

            self.logger.info("Saving knowledge base to: %s", path)

            os.makedirs(path, exist_ok=True)
            self.vector_store.save_local(path)

            self.logger.info("Knowledge base saved successfully")

        except Exception as e:
            self.logger.error("Error saving knowledge base: %s", e)
            raise

    def load_knowledge_base(self, path: str = "knowledge_base") -> FAISS:
        """Load a previously saved knowledge base from disk.

        Args:
            path: Directory path to load the knowledge base from.

        Returns:
            The loaded FAISS vector store (also kept on ``self.vector_store``).

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        try:
            self.logger.info("Loading knowledge base from: %s", path)

            if not os.path.exists(path):
                raise FileNotFoundError(f"Knowledge base not found at: {path}")

            # SECURITY: FAISS persists its docstore via pickle, so current
            # langchain_community releases require explicitly opting in to
            # deserialization. Without this flag load_local() raises
            # ValueError. Only load indexes produced by this application.
            self.vector_store = FAISS.load_local(
                path, self.embeddings, allow_dangerous_deserialization=True
            )

            self.logger.info("Knowledge base loaded successfully")
            return self.vector_store

        except Exception as e:
            self.logger.error("Error loading knowledge base: %s", e)
            raise

    def get_knowledge_base_info(self) -> dict:
        """Get summary information about the knowledge base.

        Returns:
            Dict with a ``status`` key (``"not_initialized"``,
            ``"initialized"``, or ``"error"``) plus document count,
            embedding model name, and FAISS index type when available.
        """
        if not self.vector_store:
            return {"status": "not_initialized", "documents": 0}

        try:
            index = self.vector_store.index
            # ntotal is the FAISS vector count; guard in case the index
            # implementation does not expose it.
            num_docs = index.ntotal if hasattr(index, 'ntotal') else "unknown"

            return {
                "status": "initialized",
                "documents": num_docs,
                "embedding_model": self.model_name,
                "index_type": type(index).__name__
            }

        except Exception as e:
            self.logger.error("Error getting knowledge base info: %s", e)
            return {"status": "error", "error": str(e)}