Madras1 commited on
Commit
3b0df15
·
verified ·
1 Parent(s): 367c868

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -31
app.py CHANGED
@@ -30,8 +30,71 @@ from sklearn.preprocessing import StandardScaler
30
  from sklearn.metrics.pairwise import cosine_similarity
31
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
32
  from scipy.stats import entropy
 
33
  import faiss
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # NER & Language Detection
36
  import spacy
37
  from langdetect import detect, LangDetectException
@@ -697,19 +760,25 @@ async def process_api(
697
  custom_min_samples=min_samples_int
698
  )
699
 
700
- # Criar índice FAISS para busca rápida
701
  embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
702
  faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
703
  faiss_index.add(embeddings_normalized.astype('float32'))
704
 
 
 
 
 
 
705
  job_id = str(uuid.uuid4())
706
  cache[job_id] = {
707
  "embeddings": embeddings,
708
  "embeddings_normalized": embeddings_normalized,
709
  "faiss_index": faiss_index,
 
710
  "df": df
711
  }
712
- logging.info(f"Job criado: {job_id} (FAISS index com {faiss_index.ntotal} vetores)")
713
 
714
  metricas_globais = calcular_metricas(df["full_text"].tolist())
715
  analise_de_duplicados = encontrar_duplicados(df, embeddings)
@@ -752,47 +821,91 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
752
  cached_data = cache[job_id]
753
  df = cached_data["df"]
754
  faiss_index = cached_data.get("faiss_index")
 
755
 
756
- # FASE 1: Busca FAISS (O(log N) ao invés de O(N))
 
 
757
  query_embedding = model.encode([query], convert_to_numpy=True)
758
  query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
759
 
760
- top_k_retrieval = min(50, faiss_index.ntotal) if faiss_index else 50
761
 
762
- # Criar mapeamento de scores por índice
763
- retrieval_scores = {}
 
764
 
765
  if faiss_index:
766
- # FAISS search - retorna (distances, indices)
767
- faiss_scores, top_indices = faiss_index.search(query_normalized.astype('float32'), top_k_retrieval)
768
- faiss_scores = faiss_scores[0] # Flatten
769
- top_indices = top_indices[0]
770
- scores = faiss_scores
771
- # Mapear índices para scores
772
- for i, idx in enumerate(top_indices):
 
773
  if idx >= 0:
774
- retrieval_scores[int(idx)] = float(faiss_scores[i])
775
- logging.info(f"FAISS search: top score = {scores[0]:.3f}")
776
- else:
777
- # Fallback para busca bruta se não tiver FAISS
778
- corpus_embeddings = cached_data["embeddings"]
779
- similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
780
- top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
781
- scores = similarities[top_indices]
782
- # Mapear índices para scores
783
- for i, idx in enumerate(top_indices):
784
- retrieval_scores[int(idx)] = float(similarities[idx])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  candidate_docs = []
787
  candidate_indices = []
 
788
 
789
- for i, idx in enumerate(top_indices):
790
- if idx < 0: # FAISS pode retornar -1 se não tiver resultados suficientes
791
- continue
792
- if scores[i] > 0.15:
793
- doc_text = df.iloc[int(idx)]["full_text"]
794
- candidate_docs.append([query, doc_text])
795
- candidate_indices.append(int(idx))
796
 
797
  if not candidate_docs:
798
  return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
 
30
  from sklearn.metrics.pairwise import cosine_similarity
31
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
32
  from scipy.stats import entropy
33
+ from scipy.sparse import csr_matrix
34
  import faiss
35
 
36
# ==============================================================================
# SIMPLE BM25 IMPLEMENTATION (no external dependency)
# ==============================================================================
class SimpleBM25:
    """Minimal Okapi BM25 ranking over an in-memory corpus.

    Avoids the external ``rank_bm25`` dependency. Tokenization is a plain
    lowercase whitespace split (no stemming / stopword removal).

    Parameters
    ----------
    corpus : List[str]
        Documents to index. Must be non-empty.
    k1 : float
        Term-frequency saturation parameter (typical range 1.2-2.0).
    b : float
        Document-length normalization strength (0 = none, 1 = full).

    Raises
    ------
    ValueError
        If ``corpus`` is empty (would otherwise divide by zero on avgdl).
    """
    def __init__(self, corpus: List[str], k1: float = 1.5, b: float = 0.75):
        if not corpus:
            # Fail fast with a clear message instead of a ZeroDivisionError
            # when computing the average document length below.
            raise ValueError("SimpleBM25 requires a non-empty corpus")

        self.k1 = k1
        self.b = b
        self.corpus = corpus
        self.corpus_size = len(corpus)

        # Simple tokenization: lowercase + whitespace split.
        self.tokenized_corpus = [doc.lower().split() for doc in corpus]

        # Per-document term frequencies and lengths, precomputed ONCE here so
        # get_scores() does not re-count every document on every query.
        self.doc_term_freqs = []
        self.doc_lens = []
        for doc in self.tokenized_corpus:
            tf = {}
            for term in doc:
                tf[term] = tf.get(term, 0) + 1
            self.doc_term_freqs.append(tf)
            self.doc_lens.append(len(doc))

        # Document frequency: number of documents containing each term.
        self.doc_freqs = {}
        for doc in self.tokenized_corpus:
            for term in set(doc):
                self.doc_freqs[term] = self.doc_freqs.get(term, 0) + 1

        # Average document length, used for length normalization.
        self.avgdl = sum(self.doc_lens) / self.corpus_size

        # Precomputed IDF; the trailing "+ 1" keeps IDF positive even for
        # terms that appear in more than half the corpus.
        self.idf = {}
        for term, freq in self.doc_freqs.items():
            self.idf[term] = np.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_scores(self, query: str) -> np.ndarray:
        """Return BM25 scores for *query* against every document.

        Parameters
        ----------
        query : str
            Free-text query; tokenized the same way as the corpus.

        Returns
        -------
        np.ndarray
            Array of shape ``(corpus_size,)``; 0.0 for documents with no
            query-term overlap.
        """
        query_terms = query.lower().split()
        scores = np.zeros(self.corpus_size)

        for i, term_freqs in enumerate(self.doc_term_freqs):
            doc_len = self.doc_lens[i]
            # Length-normalized denominator component; constant per document,
            # so hoist it out of the per-term loop.
            norm = self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))

            score = 0.0
            for term in query_terms:
                tf = term_freqs.get(term, 0)
                if tf:
                    # Okapi BM25: idf * tf*(k1+1) / (tf + k1*(1 - b + b*len/avgdl))
                    score += self.idf.get(term, 0) * (tf * (self.k1 + 1)) / (tf + norm)

            scores[i] = score

        return scores

    def get_top_n(self, query: str, n: int = 50) -> Tuple[np.ndarray, np.ndarray]:
        """Return the top *n* documents for *query*, best first.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            ``(indices, scores)`` sorted by descending BM25 score. If the
            corpus has fewer than *n* documents, all of them are returned.
        """
        scores = self.get_scores(query)
        top_indices = np.argsort(scores)[-n:][::-1]
        return top_indices, scores[top_indices]
97
+
98
  # NER & Language Detection
99
  import spacy
100
  from langdetect import detect, LangDetectException
 
760
  custom_min_samples=min_samples_int
761
  )
762
 
763
+ # Criar índice FAISS para busca rápida (semântica)
764
  embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
765
  faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
766
  faiss_index.add(embeddings_normalized.astype('float32'))
767
 
768
+ # Criar índice BM25 para busca lexical (Hybrid Search)
769
+ corpus_texts = df["full_text"].tolist()
770
+ bm25_index = SimpleBM25(corpus_texts)
771
+ logging.info(f"BM25 index criado com {len(corpus_texts)} documentos")
772
+
773
  job_id = str(uuid.uuid4())
774
  cache[job_id] = {
775
  "embeddings": embeddings,
776
  "embeddings_normalized": embeddings_normalized,
777
  "faiss_index": faiss_index,
778
+ "bm25_index": bm25_index, # Novo: índice BM25
779
  "df": df
780
  }
781
+ logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
782
 
783
  metricas_globais = calcular_metricas(df["full_text"].tolist())
784
  analise_de_duplicados = encontrar_duplicados(df, embeddings)
 
821
  cached_data = cache[job_id]
822
  df = cached_data["df"]
823
  faiss_index = cached_data.get("faiss_index")
824
+ bm25_index = cached_data.get("bm25_index")
825
 
826
+ # ==================================================================
827
+ # FASE 1: HYBRID SEARCH (FAISS Semântico + BM25 Lexical)
828
+ # ==================================================================
829
  query_embedding = model.encode([query], convert_to_numpy=True)
830
  query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
831
 
832
+ top_k_retrieval = min(100, len(df)) # Aumentado para hybrid search
833
 
834
+ # --- 1A: Busca Semântica (FAISS) ---
835
+ semantic_scores = {}
836
+ semantic_ranks = {}
837
 
838
  if faiss_index:
839
+ faiss_scores, faiss_indices = faiss_index.search(
840
+ query_normalized.astype('float32'),
841
+ min(top_k_retrieval, faiss_index.ntotal)
842
+ )
843
+ faiss_scores = faiss_scores[0]
844
+ faiss_indices = faiss_indices[0]
845
+
846
+ for rank, (idx, score) in enumerate(zip(faiss_indices, faiss_scores)):
847
  if idx >= 0:
848
+ semantic_scores[int(idx)] = float(score)
849
+ semantic_ranks[int(idx)] = rank + 1 # 1-indexed rank
850
+
851
+ logging.info(f"FAISS: top score = {faiss_scores[0]:.3f}")
852
+
853
+ # --- 1B: Busca Lexical (BM25) ---
854
+ lexical_scores = {}
855
+ lexical_ranks = {}
856
+
857
+ if bm25_index:
858
+ bm25_indices, bm25_scores = bm25_index.get_top_n(query, n=top_k_retrieval)
859
+
860
+ for rank, (idx, score) in enumerate(zip(bm25_indices, bm25_scores)):
861
+ if score > 0: # Só inclui se teve match
862
+ lexical_scores[int(idx)] = float(score)
863
+ lexical_ranks[int(idx)] = rank + 1
864
+
865
+ if bm25_scores[0] > 0:
866
+ logging.info(f"BM25: top score = {bm25_scores[0]:.3f}, docs matched = {len(lexical_scores)}")
867
+ else:
868
+ logging.info("BM25: nenhum match léxico encontrado")
869
+
870
+ # --- 1C: Reciprocal Rank Fusion (RRF) ---
871
+ # Combina rankings de ambas as buscas
872
+ # RRF score = sum(1 / (k + rank)) onde k é constante (tipicamente 60)
873
+ k = 60 # Constante de suavização do RRF
874
+ all_indices = set(semantic_ranks.keys()) | set(lexical_ranks.keys())
875
 
876
+ hybrid_scores = {}
877
+ for idx in all_indices:
878
+ rrf_score = 0.0
879
+
880
+ # Contribuição semântica
881
+ if idx in semantic_ranks:
882
+ rrf_score += 1.0 / (k + semantic_ranks[idx])
883
+
884
+ # Contribuição lexical
885
+ if idx in lexical_ranks:
886
+ rrf_score += 1.0 / (k + lexical_ranks[idx])
887
+
888
+ hybrid_scores[idx] = rrf_score
889
+
890
+ # Ordenar por RRF score
891
+ sorted_candidates = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
892
+
893
+ # Log de debug
894
+ n_semantic_only = len(semantic_ranks.keys() - lexical_ranks.keys())
895
+ n_lexical_only = len(lexical_ranks.keys() - semantic_ranks.keys())
896
+ n_both = len(semantic_ranks.keys() & lexical_ranks.keys())
897
+ logging.info(f"Hybrid: {n_both} em ambos, {n_semantic_only} só semântico, {n_lexical_only} só léxico")
898
+
899
+ # Preparar candidatos para reranking
900
  candidate_docs = []
901
  candidate_indices = []
902
+ retrieval_scores = {}
903
 
904
+ for idx, rrf_score in sorted_candidates[:top_k_retrieval]:
905
+ doc_text = df.iloc[idx]["full_text"]
906
+ candidate_docs.append([query, doc_text])
907
+ candidate_indices.append(idx)
908
+ retrieval_scores[idx] = rrf_score # Guardar RRF score
 
 
909
 
910
  if not candidate_docs:
911
  return {"summary": "Não foram encontrados documentos relevantes.", "results": []}