Madras1 committed
Commit 1a2524e · verified · Parent: 446d368

Upload 6 files

Files changed (2)
  1. app.py +48 -12
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,6 +1,6 @@
 # ==============================================================================
-# AetherMap API — VERSION 7.3 (KNOWLEDGE GRAPH EDITION)
-# Backend with Hybrid RAG, CSV, Tavily, NER Entity Graph
+# AetherMap API — VERSION 7.4 (FAISS EDITION)
+# Backend with Hybrid RAG, CSV, Tavily, NER Entity Graph, FAISS vector search
 # ==============================================================================

 import numpy as np
@@ -30,6 +30,7 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from scipy.stats import entropy
+import faiss

 # NER & Language Detection
 import spacy
@@ -641,9 +642,19 @@ async def process_api(

     df, embeddings = processar_pipeline(textos)

+    # Build a FAISS index for fast retrieval
+    embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1])  # inner product = cosine similarity for unit-norm vectors
+    faiss_index.add(embeddings_normalized.astype('float32'))
+
     job_id = str(uuid.uuid4())
-    cache[job_id] = {"embeddings": embeddings, "df": df}
-    logging.info(f"Job created: {job_id}")
+    cache[job_id] = {
+        "embeddings": embeddings,
+        "embeddings_normalized": embeddings_normalized,
+        "faiss_index": faiss_index,
+        "df": df
+    }
+    logging.info(f"Job created: {job_id} (FAISS index with {faiss_index.ntotal} vectors)")

     metricas_globais = calcular_metricas(df["full_text"].tolist())
     analise_de_duplicados = encontrar_duplicados(df, embeddings)
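
The index built in this hunk relies on a standard identity: the inner product that faiss.IndexFlatIP computes equals cosine similarity once the vectors are unit-normalized. A minimal standalone sketch checking that equivalence against scikit-learn (the random data and the 384-dimensional size are assumptions for the demo, not values from the commit):

import faiss
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 384)).astype('float32')  # hypothetical corpus

normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
index = faiss.IndexFlatIP(normalized.shape[1])  # exact inner-product index
index.add(normalized)

query = normalized[:1]                    # reuse row 0 as the query
scores, indices = index.search(query, 5)

expected = cosine_similarity(query, embeddings)[0]
assert np.allclose(scores[0], np.sort(expected)[::-1][:5], atol=1e-5)
print(indices[0], scores[0])  # row 0 comes back first with score ~ 1.0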
@@ -685,20 +696,34 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):

     cached_data = cache[job_id]
     df = cached_data["df"]
-    corpus_embeddings = cached_data["embeddings"]
+    faiss_index = cached_data.get("faiss_index")

-    # PHASE 1: Broad Sweep
+    # PHASE 1: FAISS search (IndexFlatIP is an exact scan: still O(N), but SIMD-optimized)
     query_embedding = model.encode([query], convert_to_numpy=True)
-    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
+    query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

-    top_k_retrieval = 50
-    top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
+    top_k_retrieval = min(50, faiss_index.ntotal) if faiss_index is not None else 50
+
+    if faiss_index is not None:
+        # FAISS search returns (scores, indices)
+        scores, top_indices = faiss_index.search(query_normalized.astype('float32'), top_k_retrieval)
+        scores = scores[0]  # flatten the (1, k) result
+        top_indices = top_indices[0]
+        logging.info(f"FAISS search: top score = {scores[0]:.3f}")
+    else:
+        # Fall back to a brute-force scan when no FAISS index is cached
+        corpus_embeddings = cached_data["embeddings"]
+        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
+        top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
+        scores = similarities[top_indices]

     candidate_docs = []
     candidate_indices = []

-    for idx in top_indices:
-        if similarities[idx] > 0.15:
+    for i, idx in enumerate(top_indices):
+        if idx < 0:  # FAISS pads with -1 when there are fewer than k results
+            continue
+        if scores[i] > 0.15:
             doc_text = df.iloc[int(idx)]["full_text"]
             candidate_docs.append([query, doc_text])
             candidate_indices.append(int(idx))
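
The idx < 0 guard in this hunk matters because faiss pads its result arrays with -1 when the index holds fewer than k vectors, and search returns (1, k)-shaped arrays even for a single query, which is why the code flattens with [0]. A toy sketch of both behaviors (three unit vectors; the sizes are invented for illustration):

import faiss
import numpy as np

vecs = np.eye(3, dtype='float32')   # three orthonormal toy vectors
index = faiss.IndexFlatIP(3)
index.add(vecs)

scores, indices = index.search(vecs[:1], 5)  # ask for more neighbours than exist
print(indices.shape)  # (1, 5): one row per query
print(indices[0])     # e.g. [ 0  1  2 -1 -1]; the -1 slots must be skipped
valid = [(int(i), float(s)) for i, s in zip(indices[0], scores[0]) if i >= 0]
print(valid)          # only the three real hits survive the guard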
@@ -886,13 +911,24 @@ async def search_web_api(
     # Run the texts through the pipeline with small-dataset parameters
     df, embeddings = processar_pipeline(textos, small_dataset=True)

+    # Build a FAISS index for fast retrieval
+    embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1])
+    faiss_index.add(embeddings_normalized.astype('float32'))
+
     # Attach the source URL to each point
     df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
     df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]

     # Create the job and cache it
     job_id = str(uuid.uuid4())
-    cache[job_id] = {"embeddings": embeddings, "df": df, "sources": sources}
+    cache[job_id] = {
+        "embeddings": embeddings,
+        "embeddings_normalized": embeddings_normalized,
+        "faiss_index": faiss_index,
+        "df": df,
+        "sources": sources
+    }
     logging.info(f"Tavily job created: {job_id}")

     # Compute metrics and analyses
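
After this change the same four index-building lines appear in both process_api and search_web_api. A hypothetical refactor (the helper name build_cache_entry and its keyword passthrough are mine, not part of the commit) would keep the cache layout in one place:

import faiss
import numpy as np

def build_cache_entry(df, embeddings, **extra):
    """Normalize embeddings, build an exact FAISS index, return the cache dict."""
    normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    index = faiss.IndexFlatIP(normalized.shape[1])
    index.add(normalized.astype('float32'))
    return {
        "embeddings": embeddings,
        "embeddings_normalized": normalized,
        "faiss_index": index,
        "df": df,
        **extra,  # e.g. sources=... for the Tavily endpoint
    }

# cache[job_id] = build_cache_entry(df, embeddings)
# cache[job_id] = build_cache_entry(df, embeddings, sources=sources)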
 
requirements.txt CHANGED
@@ -15,6 +15,7 @@ scikit-learn
 scipy
 umap-learn
 hdbscan
+faiss-cpu
 nltk
 spacy
 langdetect