Spaces:
Sleeping
Sleeping
Guilherme Favaron
Major update: Add hybrid search, reranking, multiple LLMs, and UI improvements
1b447de | """ | |
| Hybrid search: combina busca vetorial + BM25 | |
| """ | |
| from typing import List, Dict, Any, Optional | |
| from .database import DatabaseManager | |
| from .embeddings import EmbeddingManager | |
| from .bm25_search import BM25Searcher | |
class HybridSearcher:
    """Hybrid retrieval that blends vector-similarity search with BM25.

    Vector candidates come from ``db_manager.search_similar`` and lexical
    candidates from a ``BM25Searcher`` index built over the documents returned
    by ``db_manager.get_all_documents``. The two result lists are merged with a
    max-normalised weighted sum (see ``_weighted_fusion``).
    """

    def __init__(
        self,
        db_manager: "DatabaseManager",
        embedding_manager: "EmbeddingManager",
    ):
        """Store the collaborators and create an empty BM25 searcher.

        Args:
            db_manager: Object providing ``get_all_documents`` and
                ``search_similar``.
            embedding_manager: Object providing ``encode_single``.
        """
        self.db = db_manager
        self.embeddings = embedding_manager
        self.bm25 = BM25Searcher()
        self.index_built = False
        # Session filter the current BM25 index was built with. Tracked so a
        # search for another session does not reuse a mismatching index.
        self._index_session_id: Optional[str] = None

    def build_bm25_index(self, session_id: Optional[str] = None) -> bool:
        """Build the BM25 index from the documents stored in the database.

        Args:
            session_id: Session filter (``None`` = all documents).

        Returns:
            ``True`` if the index was built, ``False`` if there were no
            documents or the build raised. On ``False`` the searcher is marked
            as not built so a previous index is never queried by ``search``.
        """
        try:
            all_docs = self.db.get_all_documents(session_id)
            if not all_docs:
                self.index_built = False
                return False
            self.bm25.build_index(all_docs)
        except Exception:
            # Best-effort: callers rely on a boolean result, but the failure
            # is logged instead of being silently discarded.
            import logging

            logging.getLogger(__name__).exception(
                "Failed to build BM25 index (session_id=%r)", session_id
            )
            self.index_built = False
            return False

        self.index_built = True
        self._index_session_id = session_id
        return True

    def search(
        self,
        query: str,
        top_k: int = 10,
        alpha: float = 0.5,
        session_id: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Run a hybrid search using weighted score fusion.

        Note: this is a weighted sum of max-normalised scores, not Reciprocal
        Rank Fusion.

        Args:
            query: User query.
            top_k: Number of final results. Values <= 0 yield ``[]``.
            alpha: Vector weight (0-1); ``1 - alpha`` is the BM25 weight.
                0.0 = BM25 only, 0.5 = balanced, 1.0 = vector only.
            session_id: Session filter.

        Returns:
            Fused results sorted by ``hybrid_score`` (descending), at most
            ``top_k`` items.
        """
        if top_k <= 0:
            return []

        # Over-fetch (2x) from both retrievers so fusion has some margin.
        candidate_count = top_k * 2

        # 1. Vector search
        query_embedding = self.embeddings.encode_single(query)
        vector_results = self.db.search_similar(
            query_embedding,
            k=candidate_count,
            session_id=session_id,
        )

        # 2. BM25 search. (Re)build the index when it is missing or was built
        # for a different session filter; otherwise documents from another
        # session would be returned.
        built_for = getattr(self, "_index_session_id", None)
        if not self.index_built or built_for != session_id:
            self.build_bm25_index(session_id)

        if self.index_built:
            bm25_results = self.bm25.search(query, top_k=candidate_count)
        else:
            # No usable index (no documents or build failure): vector only.
            bm25_results = []

        # 3. Weighted fusion
        return self._weighted_fusion(vector_results, bm25_results, top_k, alpha)

    @staticmethod
    def _normalize_by_max(scores: Dict[Any, float]) -> Dict[Any, float]:
        """Scale scores by their maximum; all zeros if the maximum is <= 0."""
        if not scores:
            return {}
        peak = max(scores.values())
        if peak <= 0:
            # Dividing by zero would raise, and dividing by a negative peak
            # would flip the ordering, so treat the list as uninformative.
            return dict.fromkeys(scores, 0.0)
        return {key: value / peak for key, value in scores.items()}

    def _weighted_fusion(
        self,
        vector_results: List[Dict[str, Any]],
        bm25_results: List[Dict[str, Any]],
        top_k: int,
        alpha: float,
    ) -> List[Dict[str, Any]]:
        """Combine both result lists with a weighted sum of normalised scores.

        Args:
            vector_results: Vector hits; each dict has ``'id'`` and ``'score'``.
            bm25_results: BM25 hits; each dict has ``'id'`` and ``'bm25_score'``.
            top_k: Number of results to keep. Values <= 0 yield ``[]``.
            alpha: Vector weight (``1 - alpha`` = BM25 weight).

        Returns:
            Copies of the source documents with ``hybrid_score``,
            ``vector_score`` and ``bm25_score`` (normalised values) set,
            sorted by ``hybrid_score`` descending. A document missing from one
            list scores 0.0 for that list.
        """
        if top_k <= 0:
            return []

        vector_scores = self._normalize_by_max(
            {doc['id']: doc['score'] for doc in vector_results}
        )
        bm25_scores = self._normalize_by_max(
            {doc['id']: doc['bm25_score'] for doc in bm25_results}
        )

        # One lookup table instead of scanning both lists per id. The vector
        # document wins when an id is in both lists, and insertion order makes
        # the ordering of tied scores deterministic.
        docs_by_id: Dict[Any, Dict[str, Any]] = {}
        for doc in vector_results:
            docs_by_id.setdefault(doc['id'], doc)
        for doc in bm25_results:
            docs_by_id.setdefault(doc['id'], doc)

        fused = []
        for doc_id, source_doc in docs_by_id.items():
            vec_score = vector_scores.get(doc_id, 0.0)
            bm_score = bm25_scores.get(doc_id, 0.0)

            merged = source_doc.copy()
            merged['hybrid_score'] = alpha * vec_score + (1 - alpha) * bm_score
            merged['vector_score'] = vec_score
            merged['bm25_score'] = bm_score
            fused.append(merged)

        # list.sort is stable, so ties keep the insertion order from above.
        fused.sort(key=lambda item: item['hybrid_score'], reverse=True)
        return fused[:top_k]

    def get_searcher_info(self) -> Dict[str, Any]:
        """Return the BM25 build flag and the BM25 searcher's index info."""
        return {
            "bm25_index_built": self.index_built,
            "bm25_info": self.bm25.get_index_info(),
        }