Upload app.py
Browse files
app.py
CHANGED
|
@@ -30,8 +30,71 @@ from sklearn.preprocessing import StandardScaler
|
|
| 30 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 31 |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 32 |
from scipy.stats import entropy
|
|
|
|
| 33 |
import faiss
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# NER & Language Detection
|
| 36 |
import spacy
|
| 37 |
from langdetect import detect, LangDetectException
|
|
@@ -697,19 +760,25 @@ async def process_api(
|
|
| 697 |
custom_min_samples=min_samples_int
|
| 698 |
)
|
| 699 |
|
| 700 |
-
# Criar índice FAISS para busca rápida
|
| 701 |
embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 702 |
faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
|
| 703 |
faiss_index.add(embeddings_normalized.astype('float32'))
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
job_id = str(uuid.uuid4())
|
| 706 |
cache[job_id] = {
|
| 707 |
"embeddings": embeddings,
|
| 708 |
"embeddings_normalized": embeddings_normalized,
|
| 709 |
"faiss_index": faiss_index,
|
|
|
|
| 710 |
"df": df
|
| 711 |
}
|
| 712 |
-
logging.info(f"Job criado: {job_id} (FAISS
|
| 713 |
|
| 714 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 715 |
analise_de_duplicados = encontrar_duplicados(df, embeddings)
|
|
@@ -752,47 +821,91 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
|
|
| 752 |
cached_data = cache[job_id]
|
| 753 |
df = cached_data["df"]
|
| 754 |
faiss_index = cached_data.get("faiss_index")
|
|
|
|
| 755 |
|
| 756 |
-
#
|
|
|
|
|
|
|
| 757 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
| 758 |
query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
|
| 759 |
|
| 760 |
-
top_k_retrieval = min(
|
| 761 |
|
| 762 |
-
#
|
| 763 |
-
|
|
|
|
| 764 |
|
| 765 |
if faiss_index:
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
|
|
|
| 773 |
if idx >= 0:
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
candidate_docs = []
|
| 787 |
candidate_indices = []
|
|
|
|
| 788 |
|
| 789 |
-
for
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
candidate_docs.append([query, doc_text])
|
| 795 |
-
candidate_indices.append(int(idx))
|
| 796 |
|
| 797 |
if not candidate_docs:
|
| 798 |
return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
|
|
|
|
| 30 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 31 |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 32 |
from scipy.stats import entropy
|
| 33 |
+
from scipy.sparse import csr_matrix
|
| 34 |
import faiss
|
| 35 |
|
| 36 |
+
# ==============================================================================
# BM25 - simple implementation (no external rank_bm25 dependency)
# ==============================================================================
class SimpleBM25:
    """Lightweight BM25 (Okapi) ranking over a list of document strings.

    Tokenization is a plain lowercase whitespace split, matching the rest
    of this module. Per-document term-frequency tables and lengths are
    precomputed once here, so each `get_scores` call is O(|query| * N)
    instead of re-tokenizing the whole corpus per query.

    Args:
        corpus: Raw document strings to index.
        k1: Term-frequency saturation parameter (typical range 1.2-2.0).
        b: Document-length normalization strength in [0, 1].
    """

    def __init__(self, corpus: List[str], k1: float = 1.5, b: float = 0.75):
        self.k1 = k1
        self.b = b
        self.corpus = corpus
        self.corpus_size = len(corpus)

        # Simple tokenization: lowercase + whitespace split.
        self.tokenized_corpus = [doc.lower().split() for doc in corpus]

        # Document frequency of each term (how many docs contain it).
        self.doc_freqs = {}
        for doc in self.tokenized_corpus:
            for term in set(doc):
                self.doc_freqs[term] = self.doc_freqs.get(term, 0) + 1

        # Precompute per-document term frequencies and lengths once
        # (previously rebuilt on every get_scores call).
        self.doc_term_freqs = []
        for doc in self.tokenized_corpus:
            tf = {}
            for term in doc:
                tf[term] = tf.get(term, 0) + 1
            self.doc_term_freqs.append(tf)
        self.doc_lens = [len(doc) for doc in self.tokenized_corpus]

        # Average document length; guard against an empty corpus
        # (a bare division here would raise ZeroDivisionError).
        self.avgdl = (sum(self.doc_lens) / self.corpus_size) if self.corpus_size else 0.0

        # Pre-computed IDF (Robertson/Sparck-Jones variant; the +1 inside
        # the log keeps every IDF non-negative).
        self.idf = {}
        for term, freq in self.doc_freqs.items():
            self.idf[term] = np.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_scores(self, query: str) -> np.ndarray:
        """Return BM25 scores of *query* against every document (shape: corpus_size)."""
        query_terms = query.lower().split()
        scores = np.zeros(self.corpus_size)
        if not query_terms or not self.corpus_size:
            return scores

        for i, term_freqs in enumerate(self.doc_term_freqs):
            doc_len = self.doc_lens[i]
            score = 0.0
            for term in query_terms:
                tf = term_freqs.get(term, 0)
                if tf:
                    idf = self.idf.get(term, 0)
                    # BM25 formula
                    numerator = tf * (self.k1 + 1)
                    denominator = tf + self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))
                    score += idf * (numerator / denominator)
            scores[i] = score

        return scores

    def get_top_n(self, query: str, n: int = 50) -> Tuple[np.ndarray, np.ndarray]:
        """Return the top *n* documents as (indices, scores), best first.

        Note: sorting descending then slicing `[:n]` (instead of `[-n:][::-1]`)
        makes n=0 correctly return empty arrays rather than the whole corpus.
        """
        scores = self.get_scores(query)
        top_indices = np.argsort(scores)[::-1][:n]
        return top_indices, scores[top_indices]
|
| 97 |
+
|
| 98 |
# NER & Language Detection
|
| 99 |
import spacy
|
| 100 |
from langdetect import detect, LangDetectException
|
|
|
|
| 760 |
custom_min_samples=min_samples_int
|
| 761 |
)
|
| 762 |
|
| 763 |
+
# Criar índice FAISS para busca rápida (semântica)
|
| 764 |
embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 765 |
faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
|
| 766 |
faiss_index.add(embeddings_normalized.astype('float32'))
|
| 767 |
|
| 768 |
+
# Criar índice BM25 para busca lexical (Hybrid Search)
|
| 769 |
+
corpus_texts = df["full_text"].tolist()
|
| 770 |
+
bm25_index = SimpleBM25(corpus_texts)
|
| 771 |
+
logging.info(f"BM25 index criado com {len(corpus_texts)} documentos")
|
| 772 |
+
|
| 773 |
job_id = str(uuid.uuid4())
|
| 774 |
cache[job_id] = {
|
| 775 |
"embeddings": embeddings,
|
| 776 |
"embeddings_normalized": embeddings_normalized,
|
| 777 |
"faiss_index": faiss_index,
|
| 778 |
+
"bm25_index": bm25_index, # Novo: índice BM25
|
| 779 |
"df": df
|
| 780 |
}
|
| 781 |
+
logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
|
| 782 |
|
| 783 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 784 |
analise_de_duplicados = encontrar_duplicados(df, embeddings)
|
|
|
|
| 821 |
cached_data = cache[job_id]
|
| 822 |
df = cached_data["df"]
|
| 823 |
faiss_index = cached_data.get("faiss_index")
|
| 824 |
+
bm25_index = cached_data.get("bm25_index")
|
| 825 |
|
| 826 |
+
# ==================================================================
|
| 827 |
+
# FASE 1: HYBRID SEARCH (FAISS Semântico + BM25 Lexical)
|
| 828 |
+
# ==================================================================
|
| 829 |
query_embedding = model.encode([query], convert_to_numpy=True)
|
| 830 |
query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
|
| 831 |
|
| 832 |
+
top_k_retrieval = min(100, len(df)) # Aumentado para hybrid search
|
| 833 |
|
| 834 |
+
# --- 1A: Busca Semântica (FAISS) ---
|
| 835 |
+
semantic_scores = {}
|
| 836 |
+
semantic_ranks = {}
|
| 837 |
|
| 838 |
if faiss_index:
|
| 839 |
+
faiss_scores, faiss_indices = faiss_index.search(
|
| 840 |
+
query_normalized.astype('float32'),
|
| 841 |
+
min(top_k_retrieval, faiss_index.ntotal)
|
| 842 |
+
)
|
| 843 |
+
faiss_scores = faiss_scores[0]
|
| 844 |
+
faiss_indices = faiss_indices[0]
|
| 845 |
+
|
| 846 |
+
for rank, (idx, score) in enumerate(zip(faiss_indices, faiss_scores)):
|
| 847 |
if idx >= 0:
|
| 848 |
+
semantic_scores[int(idx)] = float(score)
|
| 849 |
+
semantic_ranks[int(idx)] = rank + 1 # 1-indexed rank
|
| 850 |
+
|
| 851 |
+
logging.info(f"FAISS: top score = {faiss_scores[0]:.3f}")
|
| 852 |
+
|
| 853 |
+
# --- 1B: Busca Lexical (BM25) ---
|
| 854 |
+
lexical_scores = {}
|
| 855 |
+
lexical_ranks = {}
|
| 856 |
+
|
| 857 |
+
if bm25_index:
|
| 858 |
+
bm25_indices, bm25_scores = bm25_index.get_top_n(query, n=top_k_retrieval)
|
| 859 |
+
|
| 860 |
+
for rank, (idx, score) in enumerate(zip(bm25_indices, bm25_scores)):
|
| 861 |
+
if score > 0: # Só inclui se teve match
|
| 862 |
+
lexical_scores[int(idx)] = float(score)
|
| 863 |
+
lexical_ranks[int(idx)] = rank + 1
|
| 864 |
+
|
| 865 |
+
if bm25_scores[0] > 0:
|
| 866 |
+
logging.info(f"BM25: top score = {bm25_scores[0]:.3f}, docs matched = {len(lexical_scores)}")
|
| 867 |
+
else:
|
| 868 |
+
logging.info("BM25: nenhum match léxico encontrado")
|
| 869 |
+
|
| 870 |
+
# --- 1C: Reciprocal Rank Fusion (RRF) ---
|
| 871 |
+
# Combina rankings de ambas as buscas
|
| 872 |
+
# RRF score = sum(1 / (k + rank)) onde k é constante (tipicamente 60)
|
| 873 |
+
k = 60 # Constante de suavização do RRF
|
| 874 |
+
all_indices = set(semantic_ranks.keys()) | set(lexical_ranks.keys())
|
| 875 |
|
| 876 |
+
hybrid_scores = {}
|
| 877 |
+
for idx in all_indices:
|
| 878 |
+
rrf_score = 0.0
|
| 879 |
+
|
| 880 |
+
# Contribuição semântica
|
| 881 |
+
if idx in semantic_ranks:
|
| 882 |
+
rrf_score += 1.0 / (k + semantic_ranks[idx])
|
| 883 |
+
|
| 884 |
+
# Contribuição lexical
|
| 885 |
+
if idx in lexical_ranks:
|
| 886 |
+
rrf_score += 1.0 / (k + lexical_ranks[idx])
|
| 887 |
+
|
| 888 |
+
hybrid_scores[idx] = rrf_score
|
| 889 |
+
|
| 890 |
+
# Ordenar por RRF score
|
| 891 |
+
sorted_candidates = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
|
| 892 |
+
|
| 893 |
+
# Log de debug
|
| 894 |
+
n_semantic_only = len(semantic_ranks.keys() - lexical_ranks.keys())
|
| 895 |
+
n_lexical_only = len(lexical_ranks.keys() - semantic_ranks.keys())
|
| 896 |
+
n_both = len(semantic_ranks.keys() & lexical_ranks.keys())
|
| 897 |
+
logging.info(f"Hybrid: {n_both} em ambos, {n_semantic_only} só semântico, {n_lexical_only} só léxico")
|
| 898 |
+
|
| 899 |
+
# Preparar candidatos para reranking
|
| 900 |
candidate_docs = []
|
| 901 |
candidate_indices = []
|
| 902 |
+
retrieval_scores = {}
|
| 903 |
|
| 904 |
+
for idx, rrf_score in sorted_candidates[:top_k_retrieval]:
|
| 905 |
+
doc_text = df.iloc[idx]["full_text"]
|
| 906 |
+
candidate_docs.append([query, doc_text])
|
| 907 |
+
candidate_indices.append(idx)
|
| 908 |
+
retrieval_scores[idx] = rrf_score # Guardar RRF score
|
|
|
|
|
|
|
| 909 |
|
| 910 |
if not candidate_docs:
|
| 911 |
return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
|