Upload 6 files
- app.py +48 -12
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,6 @@
 # ==============================================================================
-# AetherMap API — VERSION 7.
-# Backend with Hybrid RAG, CSV, Tavily, NER Entity Graph
+# AetherMap API — VERSION 7.4 (FAISS EDITION)
+# Backend with Hybrid RAG, CSV, Tavily, NER Entity Graph, FAISS ANN
 # ==============================================================================
 
 import numpy as np

@@ -30,6 +30,7 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from scipy.stats import entropy
+import faiss
 
 # NER & Language Detection
 import spacy
@@ -641,9 +642,19 @@ async def process_api(
 
     df, embeddings = processar_pipeline(textos)
 
+    # Create a FAISS index for fast retrieval
+    embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1])  # inner product = cosine similarity for normalized vectors
+    faiss_index.add(embeddings_normalized.astype('float32'))
+
     job_id = str(uuid.uuid4())
-    cache[job_id] = {
-        …
+    cache[job_id] = {
+        "embeddings": embeddings,
+        "embeddings_normalized": embeddings_normalized,
+        "faiss_index": faiss_index,
+        "df": df
+    }
+    logging.info(f"Job created: {job_id} (FAISS index with {faiss_index.ntotal} vectors)")
 
     metricas_globais = calcular_metricas(df["full_text"].tolist())
     analise_de_duplicados = encontrar_duplicados(df, embeddings)
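The hunk above caches both the raw and the L2-normalized embeddings because IndexFlatIP computes inner products, and an inner product over unit-norm vectors equals cosine similarity. A standalone sanity check of that invariant; the random data and the 384-dimensional embedding size are illustrative assumptions, not values from the commit:

```python
# Sanity check (illustrative): IndexFlatIP over L2-normalized vectors
# reproduces sklearn's cosine_similarity ranking and scores.
import numpy as np
import faiss
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 384)).astype("float32")  # assumed shape

normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
index = faiss.IndexFlatIP(normalized.shape[1])  # exact inner-product index
index.add(normalized)

query = normalized[:1]                # reuse corpus vector 0 as the query
scores, ids = index.search(query, 5)  # top-5 neighbors by inner product

expected = cosine_similarity(query, embeddings)[0]
order = np.argsort(expected)[::-1][:5]
assert ids[0][0] == 0                 # the query matches itself first
assert np.allclose(scores[0], expected[order], atol=1e-5)
```

`faiss.normalize_L2(x)` does the same normalization in place, which avoids the extra array copy if memory ever becomes a concern.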
@@ -685,20 +696,34 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
 
     cached_data = cache[job_id]
     df = cached_data["df"]
-
+    faiss_index = cached_data.get("faiss_index")
 
-    # PHASE 1: …
+    # PHASE 1: FAISS retrieval (exact IndexFlatIP: optimized kernels, still O(N))
     query_embedding = model.encode([query], convert_to_numpy=True)
-
+    query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
 
-    top_k_retrieval = 50
-    …
+    top_k_retrieval = min(50, faiss_index.ntotal) if faiss_index else 50
+
+    if faiss_index:
+        # FAISS search returns (distances, indices)
+        scores, top_indices = faiss_index.search(query_normalized.astype('float32'), top_k_retrieval)
+        scores = scores[0]  # flatten to 1-D
+        top_indices = top_indices[0]
+        logging.info(f"FAISS search: top score = {scores[0]:.3f}")
+    else:
+        # Fall back to brute-force search when no FAISS index exists
+        corpus_embeddings = cached_data["embeddings"]
+        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
+        top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
+        scores = similarities[top_indices]
 
     candidate_docs = []
     candidate_indices = []
 
-    for idx in top_indices:
-        if …
+    for i, idx in enumerate(top_indices):
+        if idx < 0:  # FAISS may return -1 when there are not enough results
+            continue
+        if scores[i] > 0.15:
             doc_text = df.iloc[int(idx)]["full_text"]
             candidate_docs.append([query, doc_text])
             candidate_indices.append(int(idx))
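`IndexFlatIP` is an exact index that scans every stored vector, so the win over the NumPy fallback path is a constant factor from FAISS's optimized kernels rather than an asymptotic one. Sublinear search requires an approximate index, which slots in behind the same `add`/`search` interface. A hedged sketch; the HNSW parameters (`M=32`, `efSearch=64`) are illustrative choices, not from this commit:

```python
# Sketch: optional swap to an approximate index; parameters are illustrative.
import numpy as np
import faiss

def build_index(vectors: np.ndarray, approximate: bool = False) -> faiss.Index:
    """Build an inner-product index over already L2-normalized float32 vectors."""
    d = vectors.shape[1]
    if approximate:
        # HNSW graph index: approximate, roughly logarithmic query time
        index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efSearch = 64  # higher = better recall, slower queries
    else:
        index = faiss.IndexFlatIP(d)  # exact, as in the commit
    index.add(vectors.astype("float32"))
    return index
```

The `idx < 0` guard in the candidate loop never fires for `IndexFlatIP` once `k` is clamped to `ntotal`, but approximate indexes really do pad missing results with -1, so the guard earns its keep if this swap ever happens.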
@@ -886,13 +911,24 @@ async def search_web_api(
     # Process through the pipeline with small-dataset parameters
     df, embeddings = processar_pipeline(textos, small_dataset=True)
 
+    # Create a FAISS index for fast retrieval
+    embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1])
+    faiss_index.add(embeddings_normalized.astype('float32'))
+
     # Add the source URL to each point
     df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
     df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]
 
     # Create the job and cache it
     job_id = str(uuid.uuid4())
-    cache[job_id] = { … }
+    cache[job_id] = {
+        "embeddings": embeddings,
+        "embeddings_normalized": embeddings_normalized,
+        "faiss_index": faiss_index,
+        "df": df,
+        "sources": sources
+    }
     logging.info(f"Tavily job created: {job_id}")
 
     # Compute metrics and analyses
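The normalize-and-index block now appears verbatim in both `process_api` and `search_web_api`. A small helper would keep the two call sites in sync; the name and signature below are suggestions, and the zero-norm clamp is an addition beyond what the commit does:

```python
# Suggested helper for the duplicated normalize-and-index block (hypothetical
# name; the zero-norm clamp is an extra safeguard, not in the commit).
import numpy as np
import faiss

def build_faiss_index(embeddings: np.ndarray):
    """Return (L2-normalized embeddings, exact inner-product index over them)."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    normalized = embeddings / np.maximum(norms, 1e-12)  # avoid NaNs for zero vectors
    index = faiss.IndexFlatIP(normalized.shape[1])
    index.add(normalized.astype("float32"))
    return normalized, index
```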
requirements.txt
CHANGED
@@ -15,6 +15,7 @@ scikit-learn
 scipy
 umap-learn
 hdbscan
+faiss-cpu
 nltk
 spacy
 langdetect
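Note: `faiss-cpu` is the CPU-only wheel and is sufficient for the exact `IndexFlatIP` used here; a separate `faiss-gpu` package exists for CUDA environments but is a much heavier install.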