Upload app.py
Browse files
app.py
CHANGED
|
@@ -30,8 +30,71 @@ from sklearn.preprocessing import StandardScaler
|
|
| 30 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 31 |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 32 |
from scipy.stats import entropy
|
|
|
|
| 33 |
import faiss
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# NER & Language Detection
|
| 36 |
import spacy
|
| 37 |
from langdetect import detect, LangDetectException
|
|
@@ -697,19 +760,25 @@ async def process_api(
|
|
| 697 |
custom_min_samples=min_samples_int
|
| 698 |
)
|
| 699 |
|
| 700 |
-
# Criar índice FAISS para busca rápida
|
| 701 |
embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 702 |
faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
|
| 703 |
faiss_index.add(embeddings_normalized.astype('float32'))
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
job_id = str(uuid.uuid4())
|
| 706 |
cache[job_id] = {
|
| 707 |
"embeddings": embeddings,
|
| 708 |
"embeddings_normalized": embeddings_normalized,
|
| 709 |
"faiss_index": faiss_index,
|
|
|
|
| 710 |
"df": df
|
| 711 |
}
|
| 712 |
-
logging.info(f"Job criado: {job_id} (FAISS
|
| 713 |
|
| 714 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 715 |
analise_de_duplicados = encontrar_duplicados(df, embeddings)
|
|
@@ -752,47 +821,91 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
|
|
| 752 |
cached_data = cache[job_id]
|
| 753 |
df = cached_data["df"]
|
| 754 |
faiss_index = cached_data.get("faiss_index")
|
|
|
|
| 755 |
|
| 756 |
-
#
|
|
|
|
|
|
|
| 757 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
| 758 |
query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
|
| 759 |
|
| 760 |
-
top_k_retrieval = min(
|
| 761 |
|
| 762 |
-
#
|
| 763 |
-
|
|
|
|
| 764 |
|
| 765 |
if faiss_index:
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
|
|
|
| 773 |
if idx >= 0:
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
candidate_docs = []
|
| 787 |
candidate_indices = []
|
|
|
|
| 788 |
|
| 789 |
-
for
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
candidate_docs.append([query, doc_text])
|
| 795 |
-
candidate_indices.append(int(idx))
|
| 796 |
|
| 797 |
if not candidate_docs:
|
| 798 |
return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
|
|
|
|
| 30 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 31 |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 32 |
from scipy.stats import entropy
|
| 33 |
+
from scipy.sparse import csr_matrix
|
| 34 |
import faiss
|
| 35 |
|
| 36 |
+
# ==============================================================================
# BM25 - simple implementation (no external rank_bm25 dependency)
# ==============================================================================
class SimpleBM25:
    """Lightweight BM25 (Okapi) ranking over a list of document strings.

    Tokenization is a plain lowercase whitespace split, matching the rest
    of this module. Per-document term-frequency tables and lengths are
    precomputed once here, so each `get_scores` call is O(|query| * N)
    instead of re-tokenizing the whole corpus per query.

    Args:
        corpus: Raw document strings to index.
        k1: Term-frequency saturation parameter (typical range 1.2-2.0).
        b: Document-length normalization strength in [0, 1].
    """

    def __init__(self, corpus: List[str], k1: float = 1.5, b: float = 0.75):
        self.k1 = k1
        self.b = b
        self.corpus = corpus
        self.corpus_size = len(corpus)

        # Simple tokenization: lowercase + whitespace split.
        self.tokenized_corpus = [doc.lower().split() for doc in corpus]

        # Document frequency of each term (how many docs contain it).
        self.doc_freqs = {}
        for doc in self.tokenized_corpus:
            for term in set(doc):
                self.doc_freqs[term] = self.doc_freqs.get(term, 0) + 1

        # Precompute per-document term frequencies and lengths once
        # (previously rebuilt on every get_scores call).
        self.doc_term_freqs = []
        for doc in self.tokenized_corpus:
            tf = {}
            for term in doc:
                tf[term] = tf.get(term, 0) + 1
            self.doc_term_freqs.append(tf)
        self.doc_lens = [len(doc) for doc in self.tokenized_corpus]

        # Average document length; guard against an empty corpus
        # (a bare division here would raise ZeroDivisionError).
        self.avgdl = (sum(self.doc_lens) / self.corpus_size) if self.corpus_size else 0.0

        # Pre-computed IDF (Robertson/Sparck-Jones variant; the +1 inside
        # the log keeps every IDF non-negative).
        self.idf = {}
        for term, freq in self.doc_freqs.items():
            self.idf[term] = np.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_scores(self, query: str) -> np.ndarray:
        """Return BM25 scores of *query* against every document (shape: corpus_size)."""
        query_terms = query.lower().split()
        scores = np.zeros(self.corpus_size)
        if not query_terms or not self.corpus_size:
            return scores

        for i, term_freqs in enumerate(self.doc_term_freqs):
            doc_len = self.doc_lens[i]
            score = 0.0
            for term in query_terms:
                tf = term_freqs.get(term, 0)
                if tf:
                    idf = self.idf.get(term, 0)
                    # BM25 formula
                    numerator = tf * (self.k1 + 1)
                    denominator = tf + self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))
                    score += idf * (numerator / denominator)
            scores[i] = score

        return scores

    def get_top_n(self, query: str, n: int = 50) -> Tuple[np.ndarray, np.ndarray]:
        """Return the top *n* documents as (indices, scores), best first.

        Note: sorting descending then slicing `[:n]` (instead of `[-n:][::-1]`)
        makes n=0 correctly return empty arrays rather than the whole corpus.
        """
        scores = self.get_scores(query)
        top_indices = np.argsort(scores)[::-1][:n]
        return top_indices, scores[top_indices]
|
| 97 |
+
|
| 98 |
# NER & Language Detection
|
| 99 |
import spacy
|
| 100 |
from langdetect import detect, LangDetectException
|
|
|
|
| 760 |
custom_min_samples=min_samples_int
|
| 761 |
)
|
| 762 |
|
| 763 |
+
# Criar índice FAISS para busca rápida (semântica)
|
| 764 |
embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 765 |
faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
|
| 766 |
faiss_index.add(embeddings_normalized.astype('float32'))
|
| 767 |
|
| 768 |
+
# Criar índice BM25 para busca lexical (Hybrid Search)
|
| 769 |
+
corpus_texts = df["full_text"].tolist()
|
| 770 |
+
bm25_index = SimpleBM25(corpus_texts)
|
| 771 |
+
logging.info(f"BM25 index criado com {len(corpus_texts)} documentos")
|
| 772 |
+
|
| 773 |
job_id = str(uuid.uuid4())
|
| 774 |
cache[job_id] = {
|
| 775 |
"embeddings": embeddings,
|
| 776 |
"embeddings_normalized": embeddings_normalized,
|
| 777 |
"faiss_index": faiss_index,
|
| 778 |
+
"bm25_index": bm25_index, # Novo: índice BM25
|
| 779 |
"df": df
|
| 780 |
}
|
| 781 |
+
logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
|
| 782 |
|
| 783 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 784 |
analise_de_duplicados = encontrar_duplicados(df, embeddings)
|
|
|
|
| 821 |
cached_data = cache[job_id]
|
| 822 |
df = cached_data["df"]
|
| 823 |
faiss_index = cached_data.get("faiss_index")
|
| 824 |
+
bm25_index = cached_data.get("bm25_index")
|
| 825 |
|
| 826 |
+
# ==================================================================
|
| 827 |
+
# FASE 1: HYBRID SEARCH (FAISS Semântico + BM25 Lexical)
|
| 828 |
+
# ==================================================================
|
| 829 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
| 830 |
query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
|
| 831 |
|
| 832 |
+
top_k_retrieval = min(100, len(df)) # Aumentado para hybrid search
|
| 833 |
|
| 834 |
+
# --- 1A: Busca Semântica (FAISS) ---
|
| 835 |
+
semantic_scores = {}
|
| 836 |
+
semantic_ranks = {}
|
| 837 |
|
| 838 |
if faiss_index:
|
| 839 |
+
faiss_scores, faiss_indices = faiss_index.search(
|
| 840 |
+
query_normalized.astype('float32'),
|
| 841 |
+
min(top_k_retrieval, faiss_index.ntotal)
|
| 842 |
+
)
|
| 843 |
+
faiss_scores = faiss_scores[0]
|
| 844 |
+
faiss_indices = faiss_indices[0]
|
| 845 |
+
|
| 846 |
+
for rank, (idx, score) in enumerate(zip(faiss_indices, faiss_scores)):
|
| 847 |
if idx >= 0:
|
| 848 |
+
semantic_scores[int(idx)] = float(score)
|
| 849 |
+
semantic_ranks[int(idx)] = rank + 1 # 1-indexed rank
|
| 850 |
+
|
| 851 |
+
logging.info(f"FAISS: top score = {faiss_scores[0]:.3f}")
|
| 852 |
+
|
| 853 |
+
# --- 1B: Busca Lexical (BM25) ---
|
| 854 |
+
lexical_scores = {}
|
| 855 |
+
lexical_ranks = {}
|
| 856 |
+
|
| 857 |
+
if bm25_index:
|
| 858 |
+
bm25_indices, bm25_scores = bm25_index.get_top_n(query, n=top_k_retrieval)
|
| 859 |
+
|
| 860 |
+
for rank, (idx, score) in enumerate(zip(bm25_indices, bm25_scores)):
|
| 861 |
+
if score > 0: # Só inclui se teve match
|
| 862 |
+
lexical_scores[int(idx)] = float(score)
|
| 863 |
+
lexical_ranks[int(idx)] = rank + 1
|
| 864 |
+
|
| 865 |
+
if bm25_scores[0] > 0:
|
| 866 |
+
logging.info(f"BM25: top score = {bm25_scores[0]:.3f}, docs matched = {len(lexical_scores)}")
|
| 867 |
+
else:
|
| 868 |
+
logging.info("BM25: nenhum match léxico encontrado")
|
| 869 |
+
|
| 870 |
+
# --- 1C: Reciprocal Rank Fusion (RRF) ---
|
| 871 |
+
# Combina rankings de ambas as buscas
|
| 872 |
+
# RRF score = sum(1 / (k + rank)) onde k é constante (tipicamente 60)
|
| 873 |
+
k = 60 # Constante de suavização do RRF
|
| 874 |
+
all_indices = set(semantic_ranks.keys()) | set(lexical_ranks.keys())
|
| 875 |
|
| 876 |
+
hybrid_scores = {}
|
| 877 |
+
for idx in all_indices:
|
| 878 |
+
rrf_score = 0.0
|
| 879 |
+
|
| 880 |
+
# Contribuição semântica
|
| 881 |
+
if idx in semantic_ranks:
|
| 882 |
+
rrf_score += 1.0 / (k + semantic_ranks[idx])
|
| 883 |
+
|
| 884 |
+
# Contribuição lexical
|
| 885 |
+
if idx in lexical_ranks:
|
| 886 |
+
rrf_score += 1.0 / (k + lexical_ranks[idx])
|
| 887 |
+
|
| 888 |
+
hybrid_scores[idx] = rrf_score
|
| 889 |
+
|
| 890 |
+
# Ordenar por RRF score
|
| 891 |
+
sorted_candidates = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
|
| 892 |
+
|
| 893 |
+
# Log de debug
|
| 894 |
+
n_semantic_only = len(semantic_ranks.keys() - lexical_ranks.keys())
|
| 895 |
+
n_lexical_only = len(lexical_ranks.keys() - semantic_ranks.keys())
|
| 896 |
+
n_both = len(semantic_ranks.keys() & lexical_ranks.keys())
|
| 897 |
+
logging.info(f"Hybrid: {n_both} em ambos, {n_semantic_only} só semântico, {n_lexical_only} só léxico")
|
| 898 |
+
|
| 899 |
+
# Preparar candidatos para reranking
|
| 900 |
candidate_docs = []
|
| 901 |
candidate_indices = []
|
| 902 |
+
retrieval_scores = {}
|
| 903 |
|
| 904 |
+
for idx, rrf_score in sorted_candidates[:top_k_retrieval]:
|
| 905 |
+
doc_text = df.iloc[idx]["full_text"]
|
| 906 |
+
candidate_docs.append([query, doc_text])
|
| 907 |
+
candidate_indices.append(idx)
|
| 908 |
+
retrieval_scores[idx] = rrf_score # Guardar RRF score
|
|
|
|
|
|
|
| 909 |
|
| 910 |
if not candidate_docs:
|
| 911 |
return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
|