Spaces:

Madras1
/

AetherMap

Sleeping

App Files Files Community

Madras1 committed on Dec 19, 2025

Commit

7613840

verified ·

1 Parent(s): 5a2a4d1

Upload 6 files

Browse files

Files changed (6) hide show

.gitattributes +35 -35
Dockerfile +33 -33
README.md +11 -11
app.py +608 -461
requirements.txt +22 -21
stopwords.txt +545 -545

.gitattributes CHANGED Viewed

@@ -1,35 +1,35 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -1,33 +1,33 @@
-# ==============================================================================
-# Dockerfile — AetherMap API (versão profissional)
-# ==============================================================================
-# Imagem Python robusta (não slim → evita erros de build)
-FROM python:3.10
-# Define diretório da aplicação
-WORKDIR /app
-# --- INSTALAR TORCH CPU ANTES (CRÍTICO!) ---
-# Isso garante que a versão certa (CPU) seja instalada
-RUN pip install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cpu
-# Copiar requirements
-COPY requirements.txt .
-# Instalar dependências restantes
-RUN pip install --no-cache-dir -r requirements.txt
-# Copiar código da aplicação
-COPY . .
-# Expor porta usada pelo Hugging Face Spaces
-EXPOSE 7860
-# Comando padrão para executar FastAPI
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+# ==============================================================================
+# Dockerfile — AetherMap API (professional version)
+# ==============================================================================
+# Full (non-slim) Python image, chosen to avoid build errors
+FROM python:3.10
+# Set the application directory
+WORKDIR /app
+# --- INSTALL CPU TORCH FIRST (CRITICAL!) ---
+# Installing from the CPU wheel index ensures the CPU build is the one installed
+# NOTE(review): torch/torchvision/torchaudio are unpinned here — consider pinning versions for reproducible builds
+RUN pip install --no-cache-dir \
+    torch \
+    torchvision \
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cpu
+# Copy requirements first so the dependency layer is cached independently of the source
+COPY requirements.txt .
+# Install the remaining dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+# NOTE(review): no USER instruction is set in this file — confirm which user the container is expected to run as
+COPY . .
+# Expose the port used by Hugging Face Spaces
+EXPOSE 7860
+# Default command: serve the FastAPI app (app:app) with uvicorn on port 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,11 +1,11 @@
----
-title: AetherMap
-emoji: 🦀
-colorFrom: indigo
-colorTo: pink
-sdk: docker
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: AetherMap
+emoji: 🦀
+colorFrom: indigo
+colorTo: pink
+sdk: docker
+pinned: false
+license: apache-2.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,462 +1,609 @@
-# ==============================================================================
-#  API do AetherMap — VERSÃO 7.1 (OBSERVABILITY EDITION)
-#  Backend com RAG Híbrido, Citações Nativas e Monitoramento Prometheus
-# ==============================================================================
-import numpy as np
-import pandas as pd
-import torch
-import gc
-import uuid
-import os
-import json
-import logging
-import time  # Adicionado para medir tempo
-import nltk
-from nltk.corpus import stopwords
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
-from fastapi.responses import JSONResponse
-from typing import List, Dict, Any
-from functools import lru_cache
-# Ferramentas de Alquimia (ML & NLP)
-from sentence_transformers import SentenceTransformer, CrossEncoder
-import umap
-import hdbscan
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-from scipy.stats import entropy
-# Monitoramento (O Toque da Berta)
-from prometheus_fastapi_instrumentator import Instrumentator
-from prometheus_client import Histogram
-# A Conexão com o Oráculo
-from groq import Groq
-# ==============================================================================
-# CONFIGURAÇÕES GERAIS E LOGGING
-# ==============================================================================
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-# Modelos de IA
-RETRIEVAL_MODEL = "all-MiniLM-L6-v2"                # Rápido para varredura inicial
-RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" # Preciso para reordenação
-# Parâmetros de Processamento
-BATCH_SIZE = 256
-UMAP_N_NEIGHBORS = 30
-# Cache de Sessão (Na memória RAM)
-cache: Dict[str, Any] = {}
-# Definição de Métricas Customizadas do Prometheus
-# Isso permite separar a latência da sua lógica vs a latência da API externa
-GROQ_LATENCY = Histogram(
-    "groq_api_latency_seconds",
-    "Tempo de resposta da API externa Groq (LLM Generation)",
-    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]
-)
-# Inicialização do Cliente Groq
-GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
-try:
-    if not GROQ_API_KEY:
-        logging.warning("GROQ_API_KEY não encontrada. Funcionalidades de LLM estarão indisponíveis.")
-        groq_client = None
-    else:
-        groq_client = Groq(api_key=GROQ_API_KEY)
-        logging.info("Cliente Groq inicializado com sucesso.")
-except Exception as e:
-    logging.error(f"FALHA AO INICIALIZAR GROQ: {e}")
-    groq_client = None
-# ==============================================================================
-# GERENCIAMENTO HÍBRIDO DE STOP WORDS (NLTK + ARQUIVO TXT)
-# ==============================================================================
-def carregar_stopwords():
-    """
-    Carrega stop words do NLTK e combina com um arquivo externo 'stopwords.txt'.
-    """
-    logging.info("Iniciando carregamento de Stop Words...")
-    # 1. Base Gramatical (NLTK - Inglês e Português)
-    try:
-        nltk.data.find('corpora/stopwords')
-    except LookupError:
-        logging.info("Baixando corpus de stopwords...")
-        nltk.download('stopwords')
-    # Cria um conjunto único com PT e EN
-    final_stops = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
-    logging.info(f"Stopwords base (NLTK) carregadas: {len(final_stops)}")
-    # 2. Base Customizada
-    arquivo_custom = "stopwords.txt"
-    if os.path.exists(arquivo_custom):
-        logging.info(f"Arquivo '{arquivo_custom}' encontrado. Lendo palavras customizadas...")
-        try:
-            count_custom = 0
-            with open(arquivo_custom, "r", encoding="utf-8") as f:
-                for linha in f:
-                    palavra = linha.split('#')[0].strip().lower()
-                    if palavra and len(palavra) > 1:
-                        final_stops.add(palavra)
-                        count_custom += 1
-            logging.info(f"{count_custom} stop words customizadas importadas do arquivo.")
-        except Exception as e:
-            logging.error(f"Erro ao ler '{arquivo_custom}': {e}")
-    else:
-        logging.warning(f"Arquivo '{arquivo_custom}' não encontrado. Usando apenas NLTK.")
-    lista_final = list(final_stops)
-    logging.info(f"Total final de Stop Words ativas: {len(lista_final)}")
-    return lista_final
-# Variável global carregada na inicialização
-STOP_WORDS_MULTILINGUAL = carregar_stopwords()
-# ==============================================================================
-# CARREGAMENTO DE MODELOS (COM CACHE)
-# ==============================================================================
-@lru_cache(maxsize=1)
-def load_retriever():
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    logging.info(f"Carregando Retriever '{RETRIEVAL_MODEL}' em: {device}")
-    return SentenceTransformer(RETRIEVAL_MODEL, device=device)
-@lru_cache(maxsize=1)
-def load_reranker():
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
-    return CrossEncoder(RERANKER_MODEL, device=device)
-# ==============================================================================
-# PIPELINE DE PROCESSAMENTO DE DADOS
-# ==============================================================================
-def preparar_textos(file_bytes: bytes, n_samples: int) -> List[str]:
-    linhas = file_bytes.decode("utf-8", errors="ignore").splitlines()
-    textos = [s for line in linhas if (s := line.strip()) and len(s.split()) > 3]
-    return textos[:n_samples]
-def processar_pipeline(textos: List[str]) -> (pd.DataFrame, np.ndarray):
-    logging.info(f"Iniciando pipeline para {len(textos)} textos...")
-    model = load_retriever()
-    # 1. Embeddings
-    embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
-    # 2. UMAP
-    reducer = umap.UMAP(n_components=3, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, metric="cosine", random_state=42)
-    emb_3d = reducer.fit_transform(embeddings)
-    emb_3d = StandardScaler().fit_transform(emb_3d)
-    # 3. HDBSCAN
-    num_textos = len(textos)
-    min_size = max(10, int(num_textos * 0.02))
-    logging.info(f"HDBSCAN min_cluster_size: {min_size}")
-    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
-    clusters = clusterer.fit_predict(emb_3d)
-    # 4. DataFrame
-    df = pd.DataFrame({
-        "x": emb_3d[:, 0], "y": emb_3d[:, 1], "z": emb_3d[:, 2],
-        "full_text": textos, "cluster": clusters.astype(str)
-    })
-    del reducer, clusterer, emb_3d; gc.collect()
-    return df, embeddings
-def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
-    logging.info("Calculando métricas globais...")
-    if not textos: return {}
-    vectorizer_count = CountVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
-    vectorizer_tfidf = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
-    try:
-        counts_matrix = vectorizer_count.fit_transform(textos)
-        tfidf_matrix = vectorizer_tfidf.fit_transform(textos)
-    except ValueError:
-        return {"riqueza_lexical": 0, "top_tfidf_palavras": [], "entropia": 0.0}
-    vocab_count = vectorizer_count.get_feature_names_out()
-    contagens = counts_matrix.sum(axis=0).A1
-    vocab_tfidf = vectorizer_tfidf.get_feature_names_out()
-    soma_tfidf = tfidf_matrix.sum(axis=0).A1
-    top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
-    top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
-    return {
-        "riqueza_lexical": len(vocab_count),
-        "top_tfidf_palavras": top_tfidf,
-        "entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
-    }
-def encontrar_duplicados(df: pd.DataFrame, embeddings: np.ndarray) -> Dict[str, Any]:
-    logging.info("Detectando duplicados...")
-    mask = df["full_text"].duplicated(keep=False)
-    grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
-    pares_semanticos = []
-    if 2 < len(embeddings) < 5000:
-        sim = cosine_similarity(embeddings)
-        triu_indices = np.triu_indices_from(sim, k=1)
-        sim_vetor = sim[triu_indices]
-        pares_idx = np.where(sim_vetor > 0.98)[0]
-        top_pares_idx = pares_idx[np.argsort(sim_vetor[pares_idx])[-5:][::-1]]
-        for i in top_pares_idx:
-            idx1, idx2 = triu_indices[0][i], triu_indices[1][i]
-            if df["full_text"].iloc[idx1] != df["full_text"].iloc[idx2]:
-                pares_semanticos.append({
-                    "similaridade": float(sim[idx1, idx2]),
-                    "texto1": df["full_text"].iloc[idx1],
-                    "texto2": df["full_text"].iloc[idx2]
-                })
-    return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
-def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
-    logging.info("Analisando clusters...")
-    analise = {}
-    ids_clusters_validos = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
-    for cid in ids_clusters_validos:
-        textos_cluster = df[df["cluster"] == cid]["full_text"].tolist()
-        if len(textos_cluster) < 2: continue
-        try:
-            vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
-            tfidf_matrix = vectorizer.fit_transform(textos_cluster)
-            vocab = vectorizer.get_feature_names_out()
-            soma = tfidf_matrix.sum(axis=0).A1
-            top_idx = np.argsort(soma)[-5:][::-1]
-            top_palavras = [{"palavra": vocab[i], "score": round(float(soma[i]), 4)} for i in top_idx]
-        except ValueError:
-            top_palavras = []
-        analise[cid] = {"num_documentos": len(textos_cluster), "top_palavras": top_palavras}
-    return analise
-# ==============================================================================
-# API FASTAPI & INSTRUMENTAÇÃO
-# ==============================================================================
-app = FastAPI(title="AetherMap API 7.1", version="7.1.0", description="Backend Semantic Search + Prometheus Metrics")
-# --- A MÁGICA ACONTECE AQUI ---
-# Isso expõe automaticamente o endpoint /metrics para o Prometheus/Grafana
-Instrumentator().instrument(app).expose(app)
-# ------------------------------
-@app.get("/")
-async def root():
-    return {"status": "online", "message": "Aether Map API 7.1 (Observability Ready)."}
-@app.post("/process/")
-async def process_api(n_samples: int = Form(10000), file: UploadFile = File(...)):
-    logging.info(f"Processando arquivo: {file.filename}")
-    try:
-        file_bytes = await file.read()
-        textos = preparar_textos(file_bytes, n_samples)
-        if not textos: raise HTTPException(status_code=400, detail="Nenhum texto válido encontrado.")
-        df, embeddings = processar_pipeline(textos)
-        job_id = str(uuid.uuid4())
-        cache[job_id] = {"embeddings": embeddings, "df": df}
-        logging.info(f"Job criado: {job_id}")
-        metricas_globais = calcular_metricas(df["full_text"].tolist())
-        analise_de_duplicados = encontrar_duplicados(df, embeddings)
-        analise_por_cluster_tfidf = analisar_clusters(df)
-        n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
-        n_ruido = int((df["cluster"] == "-1").sum())
-        return {
-            "job_id": job_id,
-            "metadata": {
-                "filename": file.filename,
-                "num_documents_processed": len(df),
-                "num_clusters_found": n_clusters,
-                "num_noise_points": n_ruido
-            },
-            "metrics": metricas_globais,
-            "duplicates": analise_de_duplicados,
-            "cluster_analysis": analise_por_cluster_tfidf,
-            "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
-        }
-    except Exception as e:
-        logging.error(f"ERRO EM /process/: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/search/")
-async def search_api(query: str = Form(...), job_id: str = Form(...)):
-    """
-    ENDPOINT DE BUSCA (RAG Híbrido) com Monitoramento de Latência
-    """
-    logging.info(f"Busca: '{query}' [Job: {job_id}]")
-    if job_id not in cache:
-        raise HTTPException(status_code=404, detail="Job ID não encontrado.")
-    try:
-        model = load_retriever()
-        reranker = load_reranker()
-        cached_data = cache[job_id]
-        df = cached_data["df"]
-        corpus_embeddings = cached_data["embeddings"]
-        # FASE 1: Varredura Ampla
-        query_embedding = model.encode([query], convert_to_numpy=True)
-        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
-        top_k_retrieval = 50
-        top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
-        candidate_docs = []
-        candidate_indices = []
-        for idx in top_indices:
-            if similarities[idx] > 0.15:
-                doc_text = df.iloc[int(idx)]["full_text"]
-                candidate_docs.append([query, doc_text])
-                candidate_indices.append(int(idx))
-        if not candidate_docs:
-             return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
-        # FASE 2: Reranking
-        logging.info(f"Reranking {len(candidate_docs)} documentos...")
-        rerank_scores = reranker.predict(candidate_docs)
-        rerank_results = sorted(
-            zip(candidate_indices, rerank_scores),
-            key=lambda x: x[1],
-            reverse=True
-        )
-        final_top_k = 5
-        final_results = []
-        context_parts = []
-        for rank, (idx, score) in enumerate(rerank_results[:final_top_k]):
-            doc_text = df.iloc[idx]["full_text"]
-            context_parts.append(f"[ID: {rank+1}] DOCUMENTO:\n{doc_text}\n---------------------")
-            final_results.append({
-                "index": idx,
-                "score": float(score),
-                "cosine_score": float(similarities[idx]),
-                "citation_id": rank + 1
-            })
-        # FASE 3: Geração (Groq) com TELEMETRIA
-        summary = ""
-        if groq_client:
-            context_str = "\n".join(context_parts)
-            rag_prompt = (
-                "INSTRUÇÃO DE SISTEMA:\n"
-                "Você é o Aetherius, um motor de busca semântica de alta precisão.\n"
-                "Sua missão é responder à pergunta do usuário baseando-se ESTRITAMENTE nos documentos fornecidos.\n\n"
-                "REGRAS OBRIGATÓRIAS:\n"
-                "1. CITAÇÕES: Toda afirmação deve ter fonte [ID: x]. Ex: 'O lucro subiu [ID: 1].'\n"
-                "2. HONESTIDADE: Se não estiver no texto, diga que não encontrou.\n"
-                "3. IDIOMA: Português do Brasil.\n\n"
-                f"CONTEXTO RECUPERADO:\n{context_str}\n\n"
-                f"PERGUNTA DO USUÁRIO: \"{query}\"\n\n"
-                "RESPOSTA:"
-            )
-            try:
-                # --- INÍCIO DA MEDIÇÃO DA API EXTERNA ---
-                start_time_groq = time.time()
-                chat_completion = groq_client.chat.completions.create(
-                    messages=[{"role": "user", "content": rag_prompt}],
-                    model="moonshotai/kimi-k2-instruct-0905",
-                    temperature=0.1,
-                    max_tokens=1024
-                )
-                # Registra o tempo gasto apenas na chamada da API
-                duration = time.time() - start_time_groq
-                GROQ_LATENCY.observe(duration)
-                # --- FIM DA MEDIÇÃO ---
-                summary = chat_completion.choices[0].message.content.strip()
-            except Exception as e:
-                logging.warning(f"Erro na geração do LLM: {e}")
-                summary = "Não foi possível gerar o resumo automático, mas os documentos estão listados abaixo."
-        return {"summary": summary, "results": final_results}
-    except Exception as e:
-        logging.error(f"ERRO EM /search/: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/describe_clusters/")
-async def describe_clusters_api(job_id: str = Form(...)):
-    logging.info(f"Descrevendo clusters para Job: {job_id}")
-    if not groq_client: raise HTTPException(status_code=503, detail="Groq indisponível.")
-    if job_id not in cache: raise HTTPException(status_code=404, detail="Job não encontrado.")
-    try:
-        cached_data = cache[job_id]
-        df = cached_data["df"]
-        embeddings = cached_data["embeddings"]
-        champion_docs_by_cluster = {}
-        cluster_ids = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
-        for cid in cluster_ids:
-            mask = df["cluster"] == cid
-            cluster_embeddings = embeddings[mask]
-            cluster_texts = df[mask]["full_text"].tolist()
-            if len(cluster_texts) < 3: continue
-            centroid = np.mean(cluster_embeddings, axis=0)
-            similarities = cosine_similarity([centroid], cluster_embeddings)[0]
-            top_indices = np.argsort(similarities)[-3:][::-1]
-            champion_docs_by_cluster[cid] = [cluster_texts[i] for i in top_indices]
-        if not champion_docs_by_cluster: return {"insights": {}}
-        prompt_sections = []
-        for cid, docs in champion_docs_by_cluster.items():
-            doc_list = "\n".join([f"- {doc[:300]}..." for doc in docs])
-            prompt_sections.append(f"Grupo {cid}:\n{doc_list}")
-        master_prompt = (
-            "Analise os grupos de texto abaixo. Para cada grupo, retorne um JSON com 'topic_name' e 'core_insight'.\n"
-            "Responda APENAS o JSON válido.\n\n" + "\n\n".join(prompt_sections)
-        )
-        # --- INÍCIO DA MEDIÇÃO DA API EXTERNA ---
-        start_time_groq = time.time()
-        chat_completion = groq_client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "JSON Output Only."},
-                {"role": "user", "content": master_prompt},
-            ], model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0.2,
-        )
-        duration = time.time() - start_time_groq
-        GROQ_LATENCY.observe(duration)
-        # --- FIM DA MEDIÇÃO ---
-        response_content = chat_completion.choices[0].message.content
-        insights = json.loads(response_content.strip().replace("```json", "").replace("```", ""))
-        return {"insights": insights}
-    except Exception as e:
-        logging.error(f"ERRO EM /describe_clusters/: {e}", exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))

+# ==============================================================================
+#  API do AetherMap — VERSÃO 7.2 (CSV + TAVILY EDITION)
+#  Backend com RAG Híbrido, CSV Support, Web Search via Tavily
+# ==============================================================================
+import numpy as np
+import pandas as pd
+import torch
+import gc
+import uuid
+import os
+import io
+import json
+import logging
+import time
+import nltk
+from nltk.corpus import stopwords
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.responses import JSONResponse
+from typing import List, Dict, Any
+from functools import lru_cache
+# Ferramentas de Alquimia (ML & NLP)
+from sentence_transformers import SentenceTransformer, CrossEncoder
+import umap
+import hdbscan
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from scipy.stats import entropy
+# Monitoramento (O Toque da Berta)
+from prometheus_fastapi_instrumentator import Instrumentator
+from prometheus_client import Histogram
+# A Conexão com o Oráculo
+from groq import Groq
+# ==============================================================================
+# GENERAL SETTINGS AND LOGGING
+# ==============================================================================
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# AI models
+RETRIEVAL_MODEL = "all-MiniLM-L6-v2"                # Fast, used for the initial sweep
+RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" # Accurate, used for reordering
+# Processing parameters
+BATCH_SIZE = 256
+UMAP_N_NEIGHBORS = 30
+# Session cache (held in RAM)
+cache: Dict[str, Any] = {}
+# Custom Prometheus metric definitions
+# This makes it possible to separate the latency of the app's own logic from the latency of the external API
+GROQ_LATENCY = Histogram(
+    "groq_api_latency_seconds",
+    "Tempo de resposta da API externa Groq (LLM Generation)",
+    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]
+)
+# Groq client initialization
+# groq_client ends up as None when the key is missing or construction fails.
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+try:
+    if not GROQ_API_KEY:
+        logging.warning("GROQ_API_KEY não encontrada. Funcionalidades de LLM estarão indisponíveis.")
+        groq_client = None
+    else:
+        groq_client = Groq(api_key=GROQ_API_KEY)
+        logging.info("Cliente Groq inicializado com sucesso.")
+except Exception as e:
+    logging.error(f"FALHA AO INICIALIZAR GROQ: {e}")
+    groq_client = None
+# Tavily client initialization (web search)
+# tavily_client stays None when the key is missing or initialization fails.
+TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
+tavily_client = None
+try:
+    if TAVILY_API_KEY:
+        # Imported only when a key is configured; any failure here (including the import)
+        # is caught by the except below and logged.
+        from tavily import TavilyClient
+        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
+        logging.info("Cliente Tavily inicializado com sucesso.")
+    else:
+        logging.warning("TAVILY_API_KEY não encontrada. Busca web estará indisponível.")
+except Exception as e:
+    logging.error(f"FALHA AO INICIALIZAR TAVILY: {e}")
+    tavily_client = None
+# ==============================================================================
+# GERENCIAMENTO HÍBRIDO DE STOP WORDS (NLTK + ARQUIVO TXT)
+# ==============================================================================
+def carregar_stopwords():
+    """
+    Load stop words from NLTK and merge them with an external 'stopwords.txt' file.
+
+    Returns:
+        A list containing the union of the NLTK Portuguese and English stop words
+        and any custom words read from 'stopwords.txt' (order is not defined,
+        since the words are collected in a set).
+    """
+    logging.info("Iniciando carregamento de Stop Words...")
+    # 1. Grammatical base (NLTK - English and Portuguese)
+    try:
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        # Corpus not available locally: download it.
+        logging.info("Baixando corpus de stopwords...")
+        nltk.download('stopwords')
+    # Build a single set with both PT and EN words
+    final_stops = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
+    logging.info(f"Stopwords base (NLTK) carregadas: {len(final_stops)}")
+    # 2. Custom base
+    # Relative path: resolved against the process's current working directory.
+    arquivo_custom = "stopwords.txt"
+    if os.path.exists(arquivo_custom):
+        logging.info(f"Arquivo '{arquivo_custom}' encontrado. Lendo palavras customizadas...")
+        try:
+            # Counts accepted lines, including words that were already in the set.
+            count_custom = 0
+            with open(arquivo_custom, "r", encoding="utf-8") as f:
+                for linha in f:
+                    # Drop anything after '#' (inline comment), trim, and lowercase.
+                    palavra = linha.split('#')[0].strip().lower()
+                    # Skip empty lines and single-character entries.
+                    if palavra and len(palavra) > 1:
+                        final_stops.add(palavra)
+                        count_custom += 1
+            logging.info(f"{count_custom} stop words customizadas importadas do arquivo.")
+        except Exception as e:
+            # A read failure is logged and the words gathered so far are kept.
+            logging.error(f"Erro ao ler '{arquivo_custom}': {e}")
+    else:
+        logging.warning(f"Arquivo '{arquivo_custom}' não encontrado. Usando apenas NLTK.")
+    lista_final = list(final_stops)
+    logging.info(f"Total final de Stop Words ativas: {len(lista_final)}")
+    return lista_final
+# Variável global carregada na inicialização
+STOP_WORDS_MULTILINGUAL = carregar_stopwords()
+# ==============================================================================
+# CARREGAMENTO DE MODELOS (COM CACHE)
+# ==============================================================================
+@lru_cache(maxsize=1)
+def load_retriever():
+    """Return the SentenceTransformer for RETRIEVAL_MODEL; lru_cache reuses the instance after the first call."""
+    # Use the GPU when torch reports CUDA as available, otherwise the CPU.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logging.info(f"Carregando Retriever '{RETRIEVAL_MODEL}' em: {device}")
+    return SentenceTransformer(RETRIEVAL_MODEL, device=device)
+@lru_cache(maxsize=1)
+def load_reranker():
+    """Return the CrossEncoder for RERANKER_MODEL; lru_cache reuses the instance after the first call."""
+    # Use the GPU when torch reports CUDA as available, otherwise the CPU.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
+    return CrossEncoder(RERANKER_MODEL, device=device)
+# ==============================================================================
+# PIPELINE DE PROCESSAMENTO DE DADOS
+# ==============================================================================
+def preparar_textos(file_bytes: bytes, n_samples: int) -> List[str]:
+    """Prepare texts from a TXT file (one document per line).
+
+    Returns at most n_samples stripped lines that contain more than 3
+    whitespace-separated tokens.
+    """
+    # Bytes that are not valid UTF-8 are dropped rather than raising.
+    linhas = file_bytes.decode("utf-8", errors="ignore").splitlines()
+    # Keep only non-empty stripped lines with more than 3 tokens.
+    textos = [s for line in linhas if (s := line.strip()) and len(s.split()) > 3]
+    return textos[:n_samples]
+def preparar_textos_csv(file_bytes: bytes, text_column: str, n_samples: int) -> List[str]:
+    """Prepare texts from a CSV file by extracting the specified column.
+
+    Returns at most n_samples stripped, non-null values of text_column that
+    contain more than 3 whitespace-separated tokens.
+
+    Raises:
+        ValueError: if text_column is not one of the CSV's columns.
+    """
+    # Try UTF-8 first; on a decode error, re-read the same bytes as latin-1.
+    try:
+        df = pd.read_csv(io.BytesIO(file_bytes), encoding="utf-8")
+    except UnicodeDecodeError:
+        df = pd.read_csv(io.BytesIO(file_bytes), encoding="latin-1")
+    if text_column not in df.columns:
+        # Only the first 10 column names are listed in the error message.
+        available = ", ".join(df.columns.tolist()[:10])
+        raise ValueError(f"Coluna '{text_column}' não encontrada. Colunas disponíveis: {available}")
+    textos = df[text_column].dropna().astype(str).tolist()
+    # Filter out very short texts
+    textos = [t.strip() for t in textos if len(t.strip().split()) > 3]
+    return textos[:n_samples]
+def get_csv_columns(file_bytes: bytes) -> List[str]:
+    """Return the list of column names of a CSV file."""
+    # nrows=0 asks pandas for no data rows; only the column labels are used below.
+    # Try UTF-8 first; on a decode error, re-read the same bytes as latin-1.
+    try:
+        df = pd.read_csv(io.BytesIO(file_bytes), nrows=0, encoding="utf-8")
+    except UnicodeDecodeError:
+        df = pd.read_csv(io.BytesIO(file_bytes), nrows=0, encoding="latin-1")
+    return df.columns.tolist()
+def processar_pipeline(textos: List[str]) -> (pd.DataFrame, np.ndarray):
+    logging.info(f"Iniciando pipeline para {len(textos)} textos...")
+    model = load_retriever()
+    # 1. Embeddings
+    embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
+    # 2. UMAP
+    reducer = umap.UMAP(n_components=3, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, metric="cosine", random_state=42)
+    emb_3d = reducer.fit_transform(embeddings)
+    emb_3d = StandardScaler().fit_transform(emb_3d)
+    # 3. HDBSCAN
+    num_textos = len(textos)
+    min_size = max(10, int(num_textos * 0.02))
+    logging.info(f"HDBSCAN min_cluster_size: {min_size}")
+    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
+    clusters = clusterer.fit_predict(emb_3d)
+    # 4. DataFrame
+    df = pd.DataFrame({
+        "x": emb_3d[:, 0], "y": emb_3d[:, 1], "z": emb_3d[:, 2],
+        "full_text": textos, "cluster": clusters.astype(str)
+    })
+    del reducer, clusterer, emb_3d; gc.collect()
+    return df, embeddings
+def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
+    logging.info("Calculando métricas globais...")
+    if not textos: return {}
+    vectorizer_count = CountVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
+    vectorizer_tfidf = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
+    try:
+        counts_matrix = vectorizer_count.fit_transform(textos)
+        tfidf_matrix = vectorizer_tfidf.fit_transform(textos)
+    except ValueError:
+        return {"riqueza_lexical": 0, "top_tfidf_palavras": [], "entropia": 0.0}
+    vocab_count = vectorizer_count.get_feature_names_out()
+    contagens = counts_matrix.sum(axis=0).A1
+    vocab_tfidf = vectorizer_tfidf.get_feature_names_out()
+    soma_tfidf = tfidf_matrix.sum(axis=0).A1
+    top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
+    top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
+    return {
+        "riqueza_lexical": len(vocab_count),
+        "top_tfidf_palavras": top_tfidf,
+        "entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
+    }
+def encontrar_duplicados(df: pd.DataFrame, embeddings: np.ndarray) -> Dict[str, Any]:
+    logging.info("Detectando duplicados...")
+    mask = df["full_text"].duplicated(keep=False)
+    grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
+    pares_semanticos = []
+    if 2 < len(embeddings) < 5000:
+        sim = cosine_similarity(embeddings)
+        triu_indices = np.triu_indices_from(sim, k=1)
+        sim_vetor = sim[triu_indices]
+        pares_idx = np.where(sim_vetor > 0.98)[0]
+        top_pares_idx = pares_idx[np.argsort(sim_vetor[pares_idx])[-5:][::-1]]
+        for i in top_pares_idx:
+            idx1, idx2 = triu_indices[0][i], triu_indices[1][i]
+            if df["full_text"].iloc[idx1] != df["full_text"].iloc[idx2]:
+                pares_semanticos.append({
+                    "similaridade": float(sim[idx1, idx2]),
+                    "texto1": df["full_text"].iloc[idx1],
+                    "texto2": df["full_text"].iloc[idx2]
+                })
+    return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
+def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
+    logging.info("Analisando clusters...")
+    analise = {}
+    ids_clusters_validos = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
+    for cid in ids_clusters_validos:
+        textos_cluster = df[df["cluster"] == cid]["full_text"].tolist()
+        if len(textos_cluster) < 2: continue
+        try:
+            vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
+            tfidf_matrix = vectorizer.fit_transform(textos_cluster)
+            vocab = vectorizer.get_feature_names_out()
+            soma = tfidf_matrix.sum(axis=0).A1
+            top_idx = np.argsort(soma)[-5:][::-1]
+            top_palavras = [{"palavra": vocab[i], "score": round(float(soma[i]), 4)} for i in top_idx]
+        except ValueError:
+            top_palavras = []
+        analise[cid] = {"num_documentos": len(textos_cluster), "top_palavras": top_palavras}
+    return analise
+# ==============================================================================
+# API FASTAPI & INSTRUMENTAÇÃO
+# ==============================================================================
+# FastAPI application object; every route below is registered on it.
+app = FastAPI(title="AetherMap API 7.2", version="7.2.0", description="Backend Semantic Search + CSV + Tavily Web Search")
+# --- Prometheus instrumentation ---
+# Instruments the app and automatically exposes the /metrics endpoint for Prometheus/Grafana.
+Instrumentator().instrument(app).expose(app)
+# ------------------------------
+@app.get("/")
+async def root():
+    return {"status": "online", "message": "Aether Map API 7.2 (CSV + Tavily Ready)."}
+@app.post("/csv_columns/")
+async def get_columns_api(file: UploadFile = File(...)):
+    """Retorna as colunas de um arquivo CSV para preview."""
+    if not file.filename.lower().endswith('.csv'):
+        raise HTTPException(status_code=400, detail="Arquivo deve ser CSV.")
+    try:
+        file_bytes = await file.read()
+        columns = get_csv_columns(file_bytes)
+        return {"columns": columns, "filename": file.filename}
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Erro ao ler CSV: {str(e)}")
+@app.post("/process/")
+async def process_api(
+    n_samples: int = Form(10000),
+    file: UploadFile = File(...),
+    text_column: str = Form(None)  # Coluna de texto para CSV
+):
+    logging.info(f"Processando arquivo: {file.filename}")
+    try:
+        file_bytes = await file.read()
+        # Detectar tipo de arquivo e processar
+        if file.filename.lower().endswith('.csv'):
+            if not text_column:
+                raise HTTPException(status_code=400, detail="Para CSV, informe 'text_column'.")
+            textos = preparar_textos_csv(file_bytes, text_column, n_samples)
+        else:
+            textos = preparar_textos(file_bytes, n_samples)
+        if not textos: raise HTTPException(status_code=400, detail="Nenhum texto válido encontrado.")
+        df, embeddings = processar_pipeline(textos)
+        job_id = str(uuid.uuid4())
+        cache[job_id] = {"embeddings": embeddings, "df": df}
+        logging.info(f"Job criado: {job_id}")
+        metricas_globais = calcular_metricas(df["full_text"].tolist())
+        analise_de_duplicados = encontrar_duplicados(df, embeddings)
+        analise_por_cluster_tfidf = analisar_clusters(df)
+        n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
+        n_ruido = int((df["cluster"] == "-1").sum())
+        return {
+            "job_id": job_id,
+            "metadata": {
+                "filename": file.filename,
+                "num_documents_processed": len(df),
+                "num_clusters_found": n_clusters,
+                "num_noise_points": n_ruido
+            },
+            "metrics": metricas_globais,
+            "duplicates": analise_de_duplicados,
+            "cluster_analysis": analise_por_cluster_tfidf,
+            "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
+        }
+    except Exception as e:
+        logging.error(f"ERRO EM /process/: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/search/")
+async def search_api(query: str = Form(...), job_id: str = Form(...)):
+    """
+    ENDPOINT DE BUSCA (RAG Híbrido) com Monitoramento de Latência
+    """
+    logging.info(f"Busca: '{query}' [Job: {job_id}]")
+    if job_id not in cache:
+        raise HTTPException(status_code=404, detail="Job ID não encontrado.")
+    try:
+        model = load_retriever()
+        reranker = load_reranker()
+        cached_data = cache[job_id]
+        df = cached_data["df"]
+        corpus_embeddings = cached_data["embeddings"]
+        # FASE 1: Varredura Ampla
+        query_embedding = model.encode([query], convert_to_numpy=True)
+        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
+        top_k_retrieval = 50
+        top_indices = np.argsort(similarities)[-top_k_retrieval:][::-1]
+        candidate_docs = []
+        candidate_indices = []
+        for idx in top_indices:
+            if similarities[idx] > 0.15:
+                doc_text = df.iloc[int(idx)]["full_text"]
+                candidate_docs.append([query, doc_text])
+                candidate_indices.append(int(idx))
+        if not candidate_docs:
+             return {"summary": "Não foram encontrados documentos relevantes.", "results": []}
+        # FASE 2: Reranking
+        logging.info(f"Reranking {len(candidate_docs)} documentos...")
+        rerank_scores = reranker.predict(candidate_docs)
+        rerank_results = sorted(
+            zip(candidate_indices, rerank_scores),
+            key=lambda x: x[1],
+            reverse=True
+        )
+        final_top_k = 5
+        final_results = []
+        context_parts = []
+        for rank, (idx, score) in enumerate(rerank_results[:final_top_k]):
+            doc_text = df.iloc[idx]["full_text"]
+            context_parts.append(f"[ID: {rank+1}] DOCUMENTO:\n{doc_text}\n---------------------")
+            final_results.append({
+                "index": idx,
+                "score": float(score),
+                "cosine_score": float(similarities[idx]),
+                "citation_id": rank + 1
+            })
+        # FASE 3: Geração (Groq) com TELEMETRIA
+        summary = ""
+        if groq_client:
+            context_str = "\n".join(context_parts)
+            rag_prompt = (
+                "INSTRUÇÃO DE SISTEMA:\n"
+                "Você é o Aetherius, um motor de busca semântica de alta precisão.\n"
+                "Sua missão é responder à pergunta do usuário baseando-se ESTRITAMENTE nos documentos fornecidos.\n\n"
+                "REGRAS OBRIGATÓRIAS:\n"
+                "1. CITAÇÕES: Toda afirmação deve ter fonte [ID: x]. Ex: 'O lucro subiu [ID: 1].'\n"
+                "2. HONESTIDADE: Se não estiver no texto, diga que não encontrou.\n"
+                "3. IDIOMA: Português do Brasil.\n\n"
+                f"CONTEXTO RECUPERADO:\n{context_str}\n\n"
+                f"PERGUNTA DO USUÁRIO: \"{query}\"\n\n"
+                "RESPOSTA:"
+            )
+            try:
+                # --- INÍCIO DA MEDIÇÃO DA API EXTERNA ---
+                start_time_groq = time.time()
+                chat_completion = groq_client.chat.completions.create(
+                    messages=[{"role": "user", "content": rag_prompt}],
+                    model="moonshotai/kimi-k2-instruct-0905",
+                    temperature=0.1,
+                    max_tokens=1024
+                )
+                # Registra o tempo gasto apenas na chamada da API
+                duration = time.time() - start_time_groq
+                GROQ_LATENCY.observe(duration)
+                # --- FIM DA MEDIÇÃO ---
+                summary = chat_completion.choices[0].message.content.strip()
+            except Exception as e:
+                logging.warning(f"Erro na geração do LLM: {e}")
+                summary = "Não foi possível gerar o resumo automático, mas os documentos estão listados abaixo."
+        return {"summary": summary, "results": final_results}
+    except Exception as e:
+        logging.error(f"ERRO EM /search/: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/describe_clusters/")
+async def describe_clusters_api(job_id: str = Form(...)):
+    logging.info(f"Descrevendo clusters para Job: {job_id}")
+    if not groq_client: raise HTTPException(status_code=503, detail="Groq indisponível.")
+    if job_id not in cache: raise HTTPException(status_code=404, detail="Job não encontrado.")
+    try:
+        cached_data = cache[job_id]
+        df = cached_data["df"]
+        embeddings = cached_data["embeddings"]
+        champion_docs_by_cluster = {}
+        cluster_ids = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
+        for cid in cluster_ids:
+            mask = df["cluster"] == cid
+            cluster_embeddings = embeddings[mask]
+            cluster_texts = df[mask]["full_text"].tolist()
+            if len(cluster_texts) < 3: continue
+            centroid = np.mean(cluster_embeddings, axis=0)
+            similarities = cosine_similarity([centroid], cluster_embeddings)[0]
+            top_indices = np.argsort(similarities)[-3:][::-1]
+            champion_docs_by_cluster[cid] = [cluster_texts[i] for i in top_indices]
+        if not champion_docs_by_cluster: return {"insights": {}}
+        prompt_sections = []
+        for cid, docs in champion_docs_by_cluster.items():
+            doc_list = "\n".join([f"- {doc[:300]}..." for doc in docs])
+            prompt_sections.append(f"Grupo {cid}:\n{doc_list}")
+        master_prompt = (
+            "Analise os grupos de texto abaixo. Para cada grupo, retorne um JSON com 'topic_name' e 'core_insight'.\n"
+            "Responda APENAS o JSON válido.\n\n" + "\n\n".join(prompt_sections)
+        )
+        # --- INÍCIO DA MEDIÇÃO DA API EXTERNA ---
+        start_time_groq = time.time()
+        chat_completion = groq_client.chat.completions.create(
+            messages=[
+                {"role": "system", "content": "JSON Output Only."},
+                {"role": "user", "content": master_prompt},
+            ], model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0.2,
+        )
+        duration = time.time() - start_time_groq
+        GROQ_LATENCY.observe(duration)
+        # --- FIM DA MEDIÇÃO ---
+        response_content = chat_completion.choices[0].message.content
+        insights = json.loads(response_content.strip().replace("```json", "").replace("```", ""))
+        return {"insights": insights}
+    except Exception as e:
+        logging.error(f"ERRO EM /describe_clusters/: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+# ==============================================================================
+# ENDPOINT TAVILY WEB SEARCH
+# ==============================================================================
+@app.post("/search_web/")
+async def search_web_api(
+    query: str = Form(...),
+    max_results: int = Form(20),
+    search_depth: str = Form("basic")  # "basic" ou "advanced"
+):
+    """
+    Busca na web via Tavily e processa resultados para visualização.
+    """
+    if not tavily_client:
+        raise HTTPException(status_code=503, detail="Tavily não configurado. Defina TAVILY_API_KEY.")
+    logging.info(f"Tavily Search: '{query}' (max: {max_results})")
+    try:
+        # Buscar via Tavily
+        search_result = tavily_client.search(
+            query=query,
+            max_results=max_results,
+            search_depth=search_depth,
+            include_answer=False
+        )
+        results = search_result.get("results", [])
+        if not results:
+            return {"error": "Nenhum resultado encontrado.", "results_count": 0}
+        # Extrair textos dos resultados
+        textos = []
+        sources = []
+        for r in results:
+            title = r.get("title", "")
+            content = r.get("content", "")
+            url = r.get("url", "")
+            # Combinar título + conteúdo
+            full_text = f"{title}: {content}" if title else content
+            if len(full_text.strip().split()) > 5:
+                textos.append(full_text.strip())
+                sources.append({"title": title, "url": url})
+        if not textos:
+            return {"error": "Resultados sem conteúdo válido.", "results_count": 0}
+        # Processar através do pipeline existente
+        df, embeddings = processar_pipeline(textos)
+        # Criar job e cachear
+        job_id = str(uuid.uuid4())
+        cache[job_id] = {"embeddings": embeddings, "df": df, "sources": sources}
+        logging.info(f"Tavily Job criado: {job_id}")
+        # Calcular métricas e análises
+        metricas_globais = calcular_metricas(df["full_text"].tolist())
+        analise_por_cluster_tfidf = analisar_clusters(df)
+        n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
+        n_ruido = int((df["cluster"] == "-1").sum())
+        return {
+            "job_id": job_id,
+            "metadata": {
+                "query": query,
+                "source": "tavily_web_search",
+                "num_documents_processed": len(df),
+                "num_clusters_found": n_clusters,
+                "num_noise_points": n_ruido
+            },
+            "metrics": metricas_globais,
+            "cluster_analysis": analise_por_cluster_tfidf,
+            "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
+            "sources": sources  # URLs originais
+        }
+    except Exception as e:
+        logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))

requirements.txt CHANGED Viewed

@@ -1,22 +1,23 @@
-# --- SERVIDOR E API ---
-fastapi
-uvicorn[standard]
-python-multipart
-groq
-prometheus-fastapi-instrumentator
-prometheus-client
-# --- MACHINE LEARNING E NLP ---
-sentence-transformers
-numpy
-pandas
-scikit-learn
-scipy
-umap-learn
-hdbscan
-nltk
-# --- TORCH CPU ---
-torch
-torchvision
 torchaudio

+# --- SERVIDOR E API ---
+fastapi
+uvicorn[standard]
+python-multipart
+groq
+prometheus-fastapi-instrumentator
+prometheus-client
+tavily-python
+# --- MACHINE LEARNING E NLP ---
+sentence-transformers
+numpy
+pandas
+scikit-learn
+scipy
+umap-learn
+hdbscan
+nltk
+# --- TORCH CPU ---
+torch
+torchvision
 torchaudio

stopwords.txt CHANGED Viewed

@@ -1,546 +1,546 @@
-# ==============================================================================
-# STOPWORDS DO AETHER MAP - LISTA MESTRA (PT + EN)
-# ==============================================================================
-# --- TERMOS DE SISTEMA E WEB ---
-http
-https
-www
-com
-br
-org
-net
-html
-php
-jsp
-asp
-pdf
-docx
-xlsx
-json
-api
-id
-url
-email
-site
-website
-page
-pagina
-link
-click
-login
-# --- TERMOS GENÉRICOS DE DOCUMENTOS ---
-document
-documento
-texto
-text
-file
-arquivo
-data
-dados
-database
-base
-dataset
-sample
-amostra
-example
-exemplo
-case
-caso
-study
-estudo
-analysis
-analise
-análise
-report
-relatorio
-relatório
-paper
-artigo
-results
-resultados
-conclusion
-conclusão
-introduction
-introdução
-abstract
-resumo
-chapter
-capitulo
-capítulo
-section
-seção
-part
-parte
-figure
-figura
-fig
-table
-tabela
-tab
-chart
-grafico
-gráfico
-image
-imagem
-source
-fonte
-reference
-referencia
-referência
-bibliography
-bibliografia
-et
-al
-citation
-citação
-# --- INGLÊS ACADÊMICO E "FILLER WORDS" ---
-the
-be
-to
-of
-and
-a
-in
-that
-have
-i
-it
-for
-not
-on
-with
-he
-as
-you
-do
-at
-this
-but
-his
-by
-from
-they
-we
-say
-her
-she
-or
-an
-will
-my
-one
-all
-would
-there
-their
-what
-so
-up
-out
-if
-about
-who
-get
-which
-go
-me
-when
-make
-can
-like
-time
-no
-just
-him
-know
-take
-people
-into
-year
-your
-good
-some
-could
-them
-see
-other
-than
-then
-now
-look
-only
-come
-its
-over
-think
-also
-back
-after
-use
-two
-how
-our
-work
-first
-well
-way
-even
-new
-want
-because
-any
-these
-give
-day
-most
-us
-is
-are
-was
-were
-been
-has
-had
-did
-does
-may
-might
-should
-must
-shall
-used
-using
-uses
-based
-found
-show
-shown
-shows
-suggest
-suggests
-however
-therefore
-thus
-hence
-although
-though
-whereas
-while
-meanwhile
-furthermore
-moreover
-additionally
-besides
-indeed
-fact
-overall
-general
-specific
-significantly
-associated
-related
-various
-several
-many
-much
-less
-more
-high
-low
-increase
-decrease
-positive
-negative
-# --- PORTUGUÊS ACADÊMICO E "PALAVRAS VAZIAS" ---
-de
-a
-o
-que
-e
-do
-da
-em
-um
-para
-é
-com
-não
-uma
-os
-no
-se
-na
-por
-mais
-as
-dos
-como
-mas
-foi
-ao
-ele
-das
-tem
-à
-seu
-sua
-ou
-ser
-quando
-muito
-há
-nos
-já
-está
-eu
-também
-só
-pelo
-pela
-até
-isso
-ela
-entre
-era
-depois
-sem
-mesmo
-aos
-ter
-seus
-quem
-nas
-me
-esse
-eles
-estão
-você
-tinha
-foram
-essa
-num
-nem
-suas
-meu
-às
-minha
-numa
-pelos
-elas
-havia
-seja
-qual
-será
-nós
-tenho
-lhe
-deles
-essas
-esses
-pelas
-este
-fosse
-dele
-tu
-te
-vocês
-vos
-lhes
-meus
-minhas
-teu
-tua
-teus
-tuas
-nosso
-nossa
-nossos
-nossas
-dela
-delas
-esta
-estes
-estas
-aquele
-aquela
-aqueles
-aquelas
-isto
-aquilo
-estou
-está
-estamos
-estão
-estive
-esteve
-estivemos
-estiveram
-estava
-estávamos
-estavam
-estivera
-estivéramos
-esteja
-estejamos
-estejam
-estivesse
-estivéssemos
-estivessem
-estiver
-estivermos
-estiverem
-hei
-há
-havemos
-hão
-houve
-houvemos
-houveram
-houvera
-houvéramos
-haja
-hajamos
-hajam
-houvesse
-houvéssemos
-houvessem
-houver
-houvermos
-houverem
-houverei
-houverá
-houveremos
-houverão
-houveria
-houveríamos
-houveriam
-sou
-somos
-são
-era
-éramos
-eram
-fui
-foi
-fomos
-foram
-fora
-fôramos
-seja
-sejamos
-sejam
-fosse
-fôssemos
-fossem
-for
-formos
-forem
-serei
-será
-seremos
-serão
-seria
-seríamos
-seriam
-tenho
-tem
-temos
-tém
-tinha
-tínhamos
-tinham
-tive
-teve
-tivemos
-tiveram
-tivera
-tivéramos
-tenha
-tenhamos
-tenham
-tivesse
-tivéssemos
-tivessem
-tiver
-tivermos
-tiverem
-terei
-terá
-teremos
-terão
-teria
-teríamos
-teriam
-dá
-pode
-poder
-podem
-poderia
-poderiam
-fazer
-feito
-faz
-fazem
-dizer
-diz
-disse
-dizem
-coisa
-coisas
-tudo
-todo
-toda
-todos
-todas
-algo
-alguém
-algum
-alguma
-alguns
-algumas
-nada
-ninguém
-nenhum
-nenhuma
-cada
-onde
-aonde
-qualquer
-vários
-várias
-apenas
-somente
-através
-mediante
-conforme
-segundo
-visto
-dado
-sendo
-tendo
-havendo
-ficando
-geral
-grande
-pequeno
-novo
-nova
-velho
-velha
-bom
-boa
-mau
-má
-alto
-baixo
-primeiro
-segundo
-terceiro
-último
-próximo
-anterior
 seguinte

+# ==============================================================================
+# STOPWORDS DO AETHER MAP - LISTA MESTRA (PT + EN)
+# ==============================================================================
+# --- TERMOS DE SISTEMA E WEB ---
+http
+https
+www
+com
+br
+org
+net
+html
+php
+jsp
+asp
+pdf
+docx
+xlsx
+json
+api
+id
+url
+email
+site
+website
+page
+pagina
+link
+click
+login
+# --- TERMOS GENÉRICOS DE DOCUMENTOS ---
+document
+documento
+texto
+text
+file
+arquivo
+data
+dados
+database
+base
+dataset
+sample
+amostra
+example
+exemplo
+case
+caso
+study
+estudo
+analysis
+analise
+análise
+report
+relatorio
+relatório
+paper
+artigo
+results
+resultados
+conclusion
+conclusão
+introduction
+introdução
+abstract
+resumo
+chapter
+capitulo
+capítulo
+section
+seção
+part
+parte
+figure
+figura
+fig
+table
+tabela
+tab
+chart
+grafico
+gráfico
+image
+imagem
+source
+fonte
+reference
+referencia
+referência
+bibliography
+bibliografia
+et
+al
+citation
+citação
+# --- INGLÊS ACADÊMICO E "FILLER WORDS" ---
+the
+be
+to
+of
+and
+a
+in
+that
+have
+i
+it
+for
+not
+on
+with
+he
+as
+you
+do
+at
+this
+but
+his
+by
+from
+they
+we
+say
+her
+she
+or
+an
+will
+my
+one
+all
+would
+there
+their
+what
+so
+up
+out
+if
+about
+who
+get
+which
+go
+me
+when
+make
+can
+like
+time
+no
+just
+him
+know
+take
+people
+into
+year
+your
+good
+some
+could
+them
+see
+other
+than
+then
+now
+look
+only
+come
+its
+over
+think
+also
+back
+after
+use
+two
+how
+our
+work
+first
+well
+way
+even
+new
+want
+because
+any
+these
+give
+day
+most
+us
+is
+are
+was
+were
+been
+has
+had
+did
+does
+may
+might
+should
+must
+shall
+used
+using
+uses
+based
+found
+show
+shown
+shows
+suggest
+suggests
+however
+therefore
+thus
+hence
+although
+though
+whereas
+while
+meanwhile
+furthermore
+moreover
+additionally
+besides
+indeed
+fact
+overall
+general
+specific
+significantly
+associated
+related
+various
+several
+many
+much
+less
+more
+high
+low
+increase
+decrease
+positive
+negative
+# --- PORTUGUÊS ACADÊMICO E "PALAVRAS VAZIAS" ---
+de
+a
+o
+que
+e
+do
+da
+em
+um
+para
+é
+com
+não
+uma
+os
+no
+se
+na
+por
+mais
+as
+dos
+como
+mas
+foi
+ao
+ele
+das
+tem
+à
+seu
+sua
+ou
+ser
+quando
+muito
+há
+nos
+já
+está
+eu
+também
+só
+pelo
+pela
+até
+isso
+ela
+entre
+era
+depois
+sem
+mesmo
+aos
+ter
+seus
+quem
+nas
+me
+esse
+eles
+estão
+você
+tinha
+foram
+essa
+num
+nem
+suas
+meu
+às
+minha
+numa
+pelos
+elas
+havia
+seja
+qual
+será
+nós
+tenho
+lhe
+deles
+essas
+esses
+pelas
+este
+fosse
+dele
+tu
+te
+vocês
+vos
+lhes
+meus
+minhas
+teu
+tua
+teus
+tuas
+nosso
+nossa
+nossos
+nossas
+dela
+delas
+esta
+estes
+estas
+aquele
+aquela
+aqueles
+aquelas
+isto
+aquilo
+estou
+está
+estamos
+estão
+estive
+esteve
+estivemos
+estiveram
+estava
+estávamos
+estavam
+estivera
+estivéramos
+esteja
+estejamos
+estejam
+estivesse
+estivéssemos
+estivessem
+estiver
+estivermos
+estiverem
+hei
+há
+havemos
+hão
+houve
+houvemos
+houveram
+houvera
+houvéramos
+haja
+hajamos
+hajam
+houvesse
+houvéssemos
+houvessem
+houver
+houvermos
+houverem
+houverei
+houverá
+houveremos
+houverão
+houveria
+houveríamos
+houveriam
+sou
+somos
+são
+era
+éramos
+eram
+fui
+foi
+fomos
+foram
+fora
+fôramos
+seja
+sejamos
+sejam
+fosse
+fôssemos
+fossem
+for
+formos
+forem
+serei
+será
+seremos
+serão
+seria
+seríamos
+seriam
+tenho
+tem
+temos
+têm
+tinha
+tínhamos
+tinham
+tive
+teve
+tivemos
+tiveram
+tivera
+tivéramos
+tenha
+tenhamos
+tenham
+tivesse
+tivéssemos
+tivessem
+tiver
+tivermos
+tiverem
+terei
+terá
+teremos
+terão
+teria
+teríamos
+teriam
+dá
+pode
+poder
+podem
+poderia
+poderiam
+fazer
+feito
+faz
+fazem
+dizer
+diz
+disse
+dizem
+coisa
+coisas
+tudo
+todo
+toda
+todos
+todas
+algo
+alguém
+algum
+alguma
+alguns
+algumas
+nada
+ninguém
+nenhum
+nenhuma
+cada
+onde
+aonde
+qualquer
+vários
+várias
+apenas
+somente
+através
+mediante
+conforme
+segundo
+visto
+dado
+sendo
+tendo
+havendo
+ficando
+geral
+grande
+pequeno
+novo
+nova
+velho
+velha
+bom
+boa
+mau
+má
+alto
+baixo
+primeiro
+segundo
+terceiro
+último
+próximo
+anterior
 seguinte