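"""
Vector search: FAISS-based semantic similarity search over document chunks,
built on the module's FAISSManager, BGE embedder, and metadata store.
"""
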
# DEPENDENCIES
import numpy as np
from typing import List
from typing import Optional
from config.models import ChunkWithScore
from config.models import DocumentChunk
from config.settings import get_settings
from config.logging_config import get_logger
from utils.error_handler import handle_errors
from embeddings.bge_embedder import get_embedder
from utils.error_handler import VectorSearchError
from vector_store.faiss_manager import get_faiss_manager
from vector_store.metadata_store import get_metadata_store


# Setup Settings and Logging
settings = get_settings()
logger   = get_logger(__name__)


class VectorSearch:
    """
    FAISS-based vector similarity search: Uses existing FAISSManager from vector_store module
    Performs semantic search using embedding similarity
    """
    def __init__(self):
        """
        Initialize vector search
        """
        self.logger         = logger
        self.faiss_manager  = get_faiss_manager()
        self.embedder       = get_embedder()
        self.metadata_store = get_metadata_store()
        
        # Search statistics
        self.search_count   = 0
        self.total_results  = 0
        
        self.logger.info("Initialized VectorSearch")
    

    @handle_errors(error_type = VectorSearchError, log_error = True, reraise = True)
    def search(self, query: str, top_k: int = 10, min_score: float = 0.0) -> List[ChunkWithScore]:
        """
        Perform vector similarity search
        
        Arguments:
        ----------
            query     { str }   : Search query
            
            top_k     { int }   : Number of results to return
            
            min_score { float } : Minimum similarity score threshold
        
        Returns:
        --------
               { list }         : List of ChunkWithScore objects
        """
        if not query or not query.strip():
            self.logger.warning("Empty query provided to vector search")
            return []
        
        self.logger.debug(f"Performing vector search: '{query}' (top_k={top_k})")
        
        try:
            # Generate query embedding
            query_embedding = self.embedder.embed_text(text      = query, 
                                                       normalize = True,
                                                      )
            
            # Search FAISS index (returns List[Tuple[str, float]] = [(chunk_id, score), ...])
            faiss_results   = self.faiss_manager.search(query_embedding = query_embedding, 
                                                        top_k           = top_k,
                                                       )
            
            if not faiss_results:
                self.logger.info(f"No results found for query: '{query}'")
                return []
            
            # Convert to ChunkWithScore objects
            chunks_with_scores = list()
            
            for rank, (chunk_id, score) in enumerate(faiss_results, 1):
                # Filter by minimum score (note: rank reflects the position in the
                # raw FAISS results, so filtered-out entries leave gaps in the ranks)
                if (score < min_score):
                    continue
                
                # Get chunk metadata
                chunk_metadata = self.metadata_store.get_chunk_metadata(chunk_id)
                
                if not chunk_metadata:
                    self.logger.warning(f"Chunk metadata not found for: {chunk_id}")
                    continue
                
                # Create DocumentChunk
                chunk = self._metadata_to_chunk(chunk_metadata)
                
                # Create ChunkWithScore
                cws   = ChunkWithScore(chunk            = chunk,
                                       score            = score,
                                       rank             = rank,
                                       retrieval_method = 'vector',
                                      )
                
                chunks_with_scores.append(cws)
            
            # Update statistics
            self.search_count  += 1
            self.total_results += len(chunks_with_scores)
            
            self.logger.info(f"Vector search returned {len(chunks_with_scores)} results")
            
            return chunks_with_scores
            
        except Exception as e:
            self.logger.error(f"Vector search failed: {repr(e)}")
            raise VectorSearchError(f"Vector search failed: {repr(e)}") from e
    

    def search_with_embedding(self, query_embedding: np.ndarray, top_k: int = 10, min_score: float = 0.0) -> List[ChunkWithScore]:
        """
        Search using pre-computed query embedding
        
        Arguments:
        ----------
            query_embedding { np.ndarray } : Query embedding vector
            
            top_k           { int }        : Number of results
            
            min_score       { float }      : Minimum score threshold
        
        Returns:
        --------
                     { list }              : List of ChunkWithScore objects
        """
        self.logger.debug(f"Performing vector search with pre-computed embedding (top_k={top_k})")
        
        try:
            # Search FAISS index
            faiss_results      = self.faiss_manager.search(query_embedding=query_embedding, top_k=top_k)
            
            # Convert to ChunkWithScore objects
            chunks_with_scores = list()
            
            for rank, (chunk_id, score) in enumerate(faiss_results, 1):
                # Filter by minimum score; rank keeps its raw FAISS position
                if (score < min_score):
                    continue
                
                chunk_metadata = self.metadata_store.get_chunk_metadata(chunk_id)
                
                if not chunk_metadata:
                    continue
                
                chunk = self._metadata_to_chunk(chunk_metadata)
                cws   = ChunkWithScore(chunk            = chunk,
                                       score            = score,
                                       rank             = rank,
                                       retrieval_method = 'vector',
                                      )
                
                chunks_with_scores.append(cws)
            
            self.search_count  += 1
            self.total_results += len(chunks_with_scores)
            
            return chunks_with_scores
            
        except Exception as e:
            self.logger.error(f"Vector search with embedding failed: {repr(e)}")
            raise VectorSearchError(f"Vector search with embedding failed: {repr(e)}") from e
    

    def _metadata_to_chunk(self, metadata: dict) -> DocumentChunk:
        """
        Convert metadata dictionary to DocumentChunk object
        
        Arguments:
        ----------
            metadata { dict } : Chunk metadata from store
        
        Returns:
        --------
            { DocumentChunk } : DocumentChunk object
        """
        return DocumentChunk(chunk_id      = metadata['chunk_id'],
                             document_id   = metadata['document_id'],
                             text          = metadata['text'],
                             embedding     = metadata.get('embedding'),
                             chunk_index   = metadata['chunk_index'],
                             start_char    = metadata['start_char'],
                             end_char      = metadata['end_char'],
                             page_number   = metadata.get('page_number'),
                             section_title = metadata.get('section_title'),
                             token_count   = metadata['token_count'],
                             metadata      = metadata.get('metadata', {}),
                            )
    

    def search_with_filters(self, query: str, top_k: int = 10, document_ids: Optional[List[str]] = None, 
                            min_score: float = 0.0) -> List[ChunkWithScore]:
        """
        Search with document filters
        
        Arguments:
        ----------
            query        { str }   : Search query
            
            top_k        { int }   : Number of results
            
            document_ids { list }  : Filter by specific documents
            
            min_score    { float } : Minimum score threshold
        
        Returns:
        --------
                  { list }        : Filtered ChunkWithScore objects
        """
        # Over-fetch (2x top_k) so the result set can still fill top_k after
        # filtering; heavy filtering may still yield fewer than top_k results
        results = self.search(query     = query, 
                              top_k     = top_k * 2, 
                              min_score = min_score,
                             )
        
        # Filter by document IDs if provided
        if document_ids:
            results = [r for r in results if r.chunk.document_id in document_ids]
        
        # Return top_k after filtering
        return results[:top_k]
    

    def batch_search(self, queries: List[str], top_k: int = 10) -> List[List[ChunkWithScore]]:
        """
        Perform batch vector search for multiple queries
        
        Arguments:
        ----------
            queries { list } : List of query strings
            
            top_k   { int }  : Number of results per query
        
        Returns:
        --------
                 { list }    : List of result lists
        """
        self.logger.info(f"Performing batch vector search for {len(queries)} queries")
        
        results = list()
        
        for query in queries:
            query_results = self.search(query = query, 
                                        top_k = top_k,
                                       )
            
            results.append(query_results)
        
        return results
    

    def get_search_statistics(self) -> dict:
        """
        Get vector search statistics
        
        Returns:
        --------
            { dict }    : Search statistics
        """
        avg_results = (self.total_results / self.search_count) if (self.search_count > 0) else 0.0
        
        return {"search_count"           : self.search_count,
                "total_results"          : self.total_results,
                "avg_results_per_query"  : avg_results,
                "faiss_index_stats"      : self.faiss_manager.get_index_stats(),
                "embedding_model"        : self.embedder.model_name,
                "embedding_dimension"    : self.embedder.embedding_dim,
               }


# Global vector search instance
_vector_search: Optional[VectorSearch] = None


def get_vector_search() -> VectorSearch:
    """
    Get global vector search instance
    
    Returns:
    --------
        { VectorSearch } : VectorSearch instance
    """
    global _vector_search

    if _vector_search is None:
        _vector_search = VectorSearch()
    
    return _vector_search


def search_vectors(query: str, top_k: int = 10, **kwargs) -> List[ChunkWithScore]:
    """
    Convenience function for vector search
    
    Arguments:
    ----------
        query  { str } : Search query

        top_k  { int } : Number of results
        
        **kwargs       : Additional keyword arguments forwarded to VectorSearch.search (e.g. min_score)
    
    Returns:
    --------
            { list }   : ChunkWithScore results
    """
    searcher = get_vector_search()
    
    return searcher.search(query, top_k, **kwargs)
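

# A minimal usage sketch, assuming the FAISS index and metadata store have
# already been populated by the ingestion pipeline. The query string and
# document ID below are illustrative placeholders, not values from this module.
if __name__ == "__main__":
    searcher = get_vector_search()
    
    # Single query with a similarity threshold
    for result in searcher.search(query = "example query", top_k = 5, min_score = 0.3):
        print(result.rank, round(result.score, 3), result.chunk.chunk_id)
    
    # Same query, restricted to specific documents (hypothetical ID)
    filtered = searcher.search_with_filters(query        = "example query",
                                            top_k        = 5,
                                            document_ids = ["doc_001"],
                                           )
    print(f"Filtered results: {len(filtered)}")
    
    # Aggregate statistics across searches performed by this instance
    print(searcher.get_search_statistics())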