File size: 2,968 Bytes
e820a8a
 
 
5017c51
e820a8a
 
 
 
 
 
 
 
 
5017c51
e820a8a
 
5017c51
e820a8a
 
5017c51
 
 
 
 
 
 
 
 
 
e820a8a
5017c51
e820a8a
 
5017c51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e820a8a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# search_engine.py
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from utils import setup_logger
from config import Config

# Module-level logger used throughout SearchEngine (configured by utils.setup_logger).
logger = setup_logger('search_engine')

class SearchEngine:
    """Semantic search over document chunks using a FAISS cosine-similarity index.

    Embeddings and the query vector are L2-normalized so that the inner
    product computed by ``IndexFlatIP`` equals cosine similarity.
    """

    def __init__(self, documents, embeddings):
        """
        Args:
            documents: pandas DataFrame of document chunks; rows align
                positionally with the rows of `embeddings` (looked up via .iloc).
            embeddings: 2-D array of shape (n_docs, dim) of precomputed
                document embeddings.
        """
        self.documents = documents
        self.embeddings = embeddings  # kept un-normalized for external reference
        self.index = self._build_faiss_index(embeddings)
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL)

    def _build_faiss_index(self, embeddings):
        """Build an inner-product FAISS index over L2-normalized copies of `embeddings`.

        Returns:
            A populated ``faiss.IndexFlatIP`` whose scores are cosine similarities.
        """
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)

        # Copy before normalizing: faiss.normalize_L2 works in place and must
        # not mutate the caller's array (or self.embeddings).
        embeddings_normalized = embeddings.astype('float32').copy()
        faiss.normalize_L2(embeddings_normalized)
        index.add(embeddings_normalized)

        # FIX: lazy %-style logging args instead of an eager f-string.
        logger.info("FAISS index built with %d vectors (cosine similarity)",
                    embeddings.shape[0])
        return index

    def search(self, query):
        """Return up to Config.TOP_K document chunks most similar to `query`.

        Hits scoring below Config.MIN_SIMILARITY_SCORE are dropped. The
        result DataFrame carries a 'similarity_score' column, sorted best
        first. Returns an empty DataFrame when nothing matches or on error.
        """
        # FIX: guard the empty-corpus case — otherwise search_k == 0 and
        # FAISS rejects a k=0 search.
        if len(self.documents) == 0:
            logger.warning("Search called with an empty document set")
            return pd.DataFrame()

        try:
            # Encode and normalize the query so inner product == cosine sim.
            query_embedding = self.model.encode([query]).astype('float32')
            faiss.normalize_L2(query_embedding)

            # Over-fetch so threshold filtering still leaves TOP_K candidates.
            search_k = min(Config.TOP_K * 2, len(self.documents))
            scores, indices = self.index.search(query_embedding, search_k)

            # Keep only hits at or above the similarity threshold.
            valid_mask = scores[0] >= Config.MIN_SIMILARITY_SCORE
            filtered_indices = indices[0][valid_mask]
            filtered_scores = scores[0][valid_mask]

            # FAISS returns hits already sorted by score, so a plain slice
            # keeps the best TOP_K (slicing past the end is a no-op).
            filtered_indices = filtered_indices[:Config.TOP_K]
            filtered_scores = filtered_scores[:Config.TOP_K]

            if len(filtered_indices) == 0:
                logger.warning("No results above similarity threshold %s",
                               Config.MIN_SIMILARITY_SCORE)
                return pd.DataFrame()

            # Attach scores; .copy() avoids SettingWithCopy on the iloc view.
            results = self.documents.iloc[filtered_indices].copy()
            results['similarity_score'] = filtered_scores

            # Defensive re-sort: cheap, and makes the ordering contract explicit.
            results = results.sort_values('similarity_score', ascending=False)

            logger.info("Found %d chunks (scores: %.2f - %.2f)",
                        len(results), filtered_scores.min(), filtered_scores.max())
            return results

        except Exception:
            # FIX: logger.exception records the full traceback, which
            # logger.error(f"...{e}") silently dropped.
            logger.exception("Error searching documents")
            return pd.DataFrame()