Spaces:

Madras1
/

AetherMap

Sleeping

App Files Files Community

Madras1 committed on Nov 30, 2025

Commit

7ba80b9

verified ·

1 Parent(s): 1250f45

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -42

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # ==============================================================================
-#  API do AetherMap — VERSÃO 6.5 GOLD (THE COMMAND KILLER + NLTK CLEANUP)
-#  Backend com RAG Híbrido (Bi-Encoder + Cross-Encoder), Citações Nativas
-#  e Stopwords Dinâmicas (PT/EN).
 # ==============================================================================
 import numpy as np
@@ -63,37 +62,53 @@ except Exception as e:
 # ==============================================================================
-# GERENCIAMENTO INTELIGENTE DE STOP WORDS (NLTK)
 # ==============================================================================
 def carregar_stopwords():
     """
-    Load Portuguese and English stop words using NLTK.
-    Removes the need for huge hardcoded lists.
-
-    Returns a list holding the union of the NLTK Portuguese and English
-    stop words and a small hardcoded set of domain/web words. The list is
-    built from a set, so it has no duplicates and no guaranteed order.
-    If nltk.data.find raises LookupError for 'corpora/stopwords', the
-    corpus is downloaded with nltk.download before the lists are read.
     """
-    logging.info("Verificando dicionários de Stop Words (NLTK)...")
     try:
         nltk.data.find('corpora/stopwords')
     except LookupError:
         logging.info("Baixando corpus de stopwords...")
         nltk.download('stopwords')
-    # Load the official lists
-    pt_stops = set(stopwords.words('portuguese'))
-    en_stops = set(stopwords.words('english'))
-    # Custom words for the AetherMap/Web domain
-    custom_stops = {
-        'dá', 'pergunta', 'resposta', 'aethermap', 'documento',
-        'id', 'sobre', 'texto', 'análise', 'dados', 'cluster',
-        'http', 'https', 'www', 'com', 'br', 'html', 'org'
-    }
-    # Union of all the sets
-    final_stops = list(pt_stops | en_stops | custom_stops)
-    logging.info(f"Total de Stop Words carregadas: {len(final_stops)}")
-    return final_stops
-# Global variable to be used by the Vectorizers
 # Computed once, when this module-level statement runs; passed as stop_words=
 # to the CountVectorizer/TfidfVectorizer instances created further down.
 STOP_WORDS_MULTILINGUAL = carregar_stopwords()
@@ -117,7 +132,6 @@ def load_reranker():
 # PIPELINE DE PROCESSAMENTO DE DADOS
 # ==============================================================================
 def preparar_textos(file_bytes: bytes, n_samples: int) -> List[str]:
-    # Decode and drop lines that are empty or too short
     # Bytes are decoded as UTF-8; undecodable bytes are dropped (errors="ignore").
     linhas = file_bytes.decode("utf-8", errors="ignore").splitlines()
     # Keep each stripped line only if it has more than 3 whitespace-separated words.
     textos = [s for line in linhas if (s := line.strip()) and len(s.split()) > 3]
     # Cap the result at the first n_samples kept lines.
     return textos[:n_samples]
@@ -126,23 +140,23 @@ def processar_pipeline(textos: List[str]) -> (pd.DataFrame, np.ndarray):
     logging.info(f"Iniciando pipeline para {len(textos)} textos...")
     model = load_retriever()
-    # 1. Gerar Embeddings
     embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
-    # 2. Redução Dimensional (UMAP)
     reducer = umap.UMAP(n_components=3, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, metric="cosine", random_state=42)
     emb_3d = reducer.fit_transform(embeddings)
     emb_3d = StandardScaler().fit_transform(emb_3d)
-    # 3. Clustering (HDBSCAN Dinâmico)
     num_textos = len(textos)
     min_size = max(10, int(num_textos * 0.02))
-    logging.info(f"HDBSCAN min_cluster_size definido para: {min_size}")
     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
     clusters = clusterer.fit_predict(emb_3d)
-    # 4. Criar DataFrame
     df = pd.DataFrame({
         "x": emb_3d[:, 0], "y": emb_3d[:, 1], "z": emb_3d[:, 2],
         "full_text": textos, "cluster": clusters.astype(str)
@@ -152,10 +166,10 @@ def processar_pipeline(textos: List[str]) -> (pd.DataFrame, np.ndarray):
     return df, embeddings
 def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
-    logging.info("Calculando métricas globais com Stopwords NLTK...")
     if not textos: return {}
-    # Usando a nova lista STOP_WORDS_MULTILINGUAL
     vectorizer_count = CountVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
     vectorizer_tfidf = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
@@ -202,14 +216,14 @@ def encontrar_duplicados(df: pd.DataFrame, embeddings: np.ndarray) -> Dict[str,
     return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
 def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
-    logging.info("Analisando clusters (TF-IDF NLTK)...")
     analise = {}
     ids_clusters_validos = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
     for cid in ids_clusters_validos:
         textos_cluster = df[df["cluster"] == cid]["full_text"].tolist()
         if len(textos_cluster) < 2: continue
         try:
-            # Usando a nova lista aqui também
             vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
             tfidf_matrix = vectorizer.fit_transform(textos_cluster)
             vocab = vectorizer.get_feature_names_out()
@@ -225,16 +239,15 @@ def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
 # ==============================================================================
 # API FASTAPI
 # ==============================================================================
-app = FastAPI(title="AetherMap API 6.5", version="6.5.0", description="Backend Semantic Search with Reranking & Citations")
-# Root route, to avoid the ugly "Not Found"
 # Answers GET "/" with a small JSON status message that points to /docs.
 @app.get("/")
 async def root():
-    return {"status": "online", "message": "Aether Map API 6.5 está operante. Use /docs para testar."}
 @app.post("/process/")
 async def process_api(n_samples: int = Form(10000), file: UploadFile = File(...)):
-    logging.info(f"Requisição recebida para {file.filename}.")
     try:
         file_bytes = await file.read()
         textos = preparar_textos(file_bytes, n_samples)
@@ -274,10 +287,10 @@ async def process_api(n_samples: int = Form(10000), file: UploadFile = File(...)
 @app.post("/search/")
 async def search_api(query: str = Form(...), job_id: str = Form(...)):
     """
-    ENDPOINT DE BUSCA (RAG Híbrido com Citações)
     1. Retrieval (Bi-Encoder) -> Top 50
     2. Reranking (Cross-Encoder) -> Top 5
-    3. Generation (Kimi K2) -> Resposta citada
     """
     logging.info(f"Busca: '{query}' [Job: {job_id}]")
     if job_id not in cache:
@@ -295,15 +308,16 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
         query_embedding = model.encode([query], convert_to_numpy=True)
         similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
-        # Pega Top 50 candidatos (com filtro mínimo de relevância)
         top_k_retrieval = 50
         top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
         candidate_docs = []
         candidate_indices = []
         for idx in top_indices:
-            if similarities[idx] > 0.15: # Filtro de ruído básico
                 doc_text = df.iloc[int(idx)]["full_text"]
                 candidate_docs.append([query, doc_text])
                 candidate_indices.append(int(idx))
@@ -311,7 +325,7 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
         if not candidate_docs:
              return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
-        # FASE 2: Reranking (O Juiz)
         logging.info(f"Reranking {len(candidate_docs)} documentos...")
         rerank_scores = reranker.predict(candidate_docs)
@@ -328,7 +342,7 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
         for rank, (idx, score) in enumerate(rerank_results[:final_top_k]):
             doc_text = df.iloc[idx]["full_text"]
-            # Montagem do Contexto com ID para Citação
             context_parts.append(f"[ID: {rank+1}] DOCUMENTO:\n{doc_text}\n---------------------")
             final_results.append({
@@ -338,7 +352,7 @@ async def search_api(query: str = Form(...), job_id: str = Form(...)):
                 "citation_id": rank + 1
             })
-        # FASE 3: Geração com Citações (Kimi K2)
         summary = ""
         if groq_client:
             context_str = "\n".join(context_parts)

 # ==============================================================================
+#  API do AetherMap — VERSÃO 7.0 (THE CONFIGURABLE COMMAND KILLER)
+#  Backend com RAG Híbrido, Citações Nativas e Stopwords via Arquivo Externo.
 # ==============================================================================
 import numpy as np
 # ==============================================================================
+# GERENCIAMENTO HÍBRIDO DE STOP WORDS (NLTK + ARQUIVO TXT)
 # ==============================================================================
 def carregar_stopwords():
     """
+    Load the NLTK stop words and merge them with an external 'stopwords.txt' file.
+    This lets the list of ignored words be edited without touching the code.
+
+    The base set is the union of the NLTK Portuguese and English lists; if
+    nltk.data.find raises LookupError for 'corpora/stopwords', the corpus is
+    downloaded first. 'stopwords.txt' is a relative path, so it is looked up
+    relative to the current working directory. Each line of the file gives
+    one entry: anything after '#' is discarded, the rest is stripped and
+    lowercased, and entries shorter than 2 characters are skipped.
+
+    A missing file only logs a warning. An error while reading the file is
+    logged and not re-raised; entries added before the error stay in the set.
+
+    Returns the combined stop words as a list (built from a set, so it has
+    no duplicates and no guaranteed order).
     """
+    logging.info("Iniciando carregamento de Stop Words...")
+    # 1. Grammatical base (NLTK - English and Portuguese)
     try:
         nltk.data.find('corpora/stopwords')
     except LookupError:
         logging.info("Baixando corpus de stopwords...")
         nltk.download('stopwords')
+    # Build a single set with the PT and EN words
+    final_stops = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
+    logging.info(f"Stopwords base (NLTK) carregadas: {len(final_stops)}")
+    # 2. Custom base (read from the stopwords.txt file if it exists)
+    arquivo_custom = "stopwords.txt"
+    if os.path.exists(arquivo_custom):
+        logging.info(f"Arquivo '{arquivo_custom}' encontrado. Lendo palavras customizadas...")
+        try:
+            count_custom = 0
+            with open(arquivo_custom, "r", encoding="utf-8") as f:
+                for linha in f:
                     # Drop trailing comments (#) and surrounding whitespace, then lowercase
+                    palavra = linha.split('#')[0].strip().lower()
                     # Only add entries that are non-empty and longer than 1 character
+                    if palavra and len(palavra) > 1:
+                        final_stops.add(palavra)
                         # Counts accepted lines, including words already present in the set
+                        count_custom += 1
+            logging.info(f"{count_custom} stop words customizadas importadas do arquivo.")
+        except Exception as e:
+            logging.error(f"Erro ao ler '{arquivo_custom}': {e}")
+    else:
+        logging.warning(f"Arquivo '{arquivo_custom}' não encontrado no diretório. Usando apenas NLTK.")
+    # Convert to a list for compatibility with Scikit-Learn
+    lista_final = list(final_stops)
+    logging.info(f"Total final de Stop Words ativas: {len(lista_final)}")
+    return lista_final
+# Global variable loaded at startup
 # Computed once, when this module-level statement runs; passed as stop_words=
 # to the CountVectorizer/TfidfVectorizer instances created further down.
 STOP_WORDS_MULTILINGUAL = carregar_stopwords()
 # PIPELINE DE PROCESSAMENTO DE DADOS
 # ==============================================================================
 def preparar_textos(file_bytes: bytes, n_samples: int) -> List[str]:
     """Turn raw file bytes into a list of text lines worth processing.

     Args:
         file_bytes: Raw file content; decoded as UTF-8, with undecodable
             bytes dropped (errors="ignore").
         n_samples: Maximum number of lines to return (applied as a slice).

     Returns:
         The stripped lines, in file order, that contain more than 3
         whitespace-separated words, truncated to the first n_samples.
     """
     linhas = file_bytes.decode("utf-8", errors="ignore").splitlines()
     # Blank lines and lines with 3 words or fewer are discarded.
     textos = [s for line in linhas if (s := line.strip()) and len(s.split()) > 3]
     return textos[:n_samples]
     logging.info(f"Iniciando pipeline para {len(textos)} textos...")
     model = load_retriever()
+    # 1. Embeddings
     embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
+    # 2. UMAP
     reducer = umap.UMAP(n_components=3, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, metric="cosine", random_state=42)
     emb_3d = reducer.fit_transform(embeddings)
     emb_3d = StandardScaler().fit_transform(emb_3d)
+    # 3. HDBSCAN
     num_textos = len(textos)
     min_size = max(10, int(num_textos * 0.02))
+    logging.info(f"HDBSCAN min_cluster_size: {min_size}")
     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
     clusters = clusterer.fit_predict(emb_3d)
+    # 4. DataFrame
     df = pd.DataFrame({
         "x": emb_3d[:, 0], "y": emb_3d[:, 1], "z": emb_3d[:, 2],
         "full_text": textos, "cluster": clusters.astype(str)
     return df, embeddings
 def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
+    logging.info("Calculando métricas globais...")
     if not textos: return {}
+    # Usando a lista global que combinou NLTK + Arquivo TXT
     vectorizer_count = CountVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
     vectorizer_tfidf = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
     return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
 def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
+    logging.info("Analisando clusters...")
     analise = {}
     ids_clusters_validos = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
     for cid in ids_clusters_validos:
         textos_cluster = df[df["cluster"] == cid]["full_text"].tolist()
         if len(textos_cluster) < 2: continue
         try:
+            # Usando a lista global aqui também
             vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
             tfidf_matrix = vectorizer.fit_transform(textos_cluster)
             vocab = vectorizer.get_feature_names_out()
 # ==============================================================================
 # API FASTAPI
 # ==============================================================================
+app = FastAPI(title="AetherMap API 7.0", version="7.0.0", description="Backend Semantic Search with Reranking & Configurable Stopwords")
 # Root route: answers GET "/" with a small JSON status message that points to /docs.
 @app.get("/")
 async def root():
+    return {"status": "online", "message": "Aether Map API 7.0 Operacional. Use /docs para interagir."}
 @app.post("/process/")
 async def process_api(n_samples: int = Form(10000), file: UploadFile = File(...)):
+    logging.info(f"Processando arquivo: {file.filename}")
     try:
         file_bytes = await file.read()
         textos = preparar_textos(file_bytes, n_samples)
 @app.post("/search/")
 async def search_api(query: str = Form(...), job_id: str = Form(...)):
     """
+    ENDPOINT DE BUSCA (RAG Híbrido)
     1. Retrieval (Bi-Encoder) -> Top 50
     2. Reranking (Cross-Encoder) -> Top 5
+    3. Generation (Kimi K2) -> Resposta citada [ID: X]
     """
     logging.info(f"Busca: '{query}' [Job: {job_id}]")
     if job_id not in cache:
         query_embedding = model.encode([query], convert_to_numpy=True)
         similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
+        # Pega Top 50 candidatos
         top_k_retrieval = 50
         top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
         candidate_docs = []
         candidate_indices = []
+        # Filtro de ruído (Cosseno > 0.15)
         for idx in top_indices:
+            if similarities[idx] > 0.15:
                 doc_text = df.iloc[int(idx)]["full_text"]
                 candidate_docs.append([query, doc_text])
                 candidate_indices.append(int(idx))
         if not candidate_docs:
              return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
+        # FASE 2: Reranking (Cross-Encoder)
         logging.info(f"Reranking {len(candidate_docs)} documentos...")
         rerank_scores = reranker.predict(candidate_docs)
         for rank, (idx, score) in enumerate(rerank_results[:final_top_k]):
             doc_text = df.iloc[idx]["full_text"]
+            # Montagem do Contexto para Citação
             context_parts.append(f"[ID: {rank+1}] DOCUMENTO:\n{doc_text}\n---------------------")
             final_results.append({
                 "citation_id": rank + 1
             })
+        # FASE 3: Geração (Kimi K2)
         summary = ""
         if groq_client:
             context_str = "\n".join(context_parts)