Upload app.py
Browse files
app.py
CHANGED
|
@@ -3,23 +3,24 @@
|
|
| 3 |
# Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph, FAISS ANN
|
| 4 |
# ==============================================================================
|
| 5 |
|
| 6 |
-
import numpy as np
|
| 7 |
-
import pandas as pd
|
| 8 |
-
import torch
|
| 9 |
-
import gc
|
| 10 |
-
import uuid
|
| 11 |
-
import os
|
| 12 |
-
import io
|
| 13 |
-
import json
|
| 14 |
-
import logging
|
| 15 |
-
import time
|
| 16 |
-
import
|
|
|
|
| 17 |
from nltk.corpus import stopwords
|
| 18 |
from collections import defaultdict
|
| 19 |
|
| 20 |
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
| 21 |
from fastapi.responses import JSONResponse
|
| 22 |
-
from typing import List, Dict, Any, Tuple
|
| 23 |
from functools import lru_cache
|
| 24 |
|
| 25 |
# Ferramentas de Alquimia (ML & NLP)
|
|
@@ -115,9 +116,19 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
|
|
| 115 |
RETRIEVAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # Leve e bom PT-BR
|
| 116 |
RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1" # Reranker nativo PT-BR
|
| 117 |
|
| 118 |
-
# Parâmetros de Processamento
|
| 119 |
-
BATCH_SIZE = 256
|
| 120 |
-
UMAP_N_NEIGHBORS = 30
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
# Cache de Sessão (Na memória RAM)
|
| 123 |
cache: Dict[str, Any] = {}
|
|
@@ -687,9 +698,9 @@ def processar_pipeline(
|
|
| 687 |
del reducer, clusterer, emb_3d; gc.collect()
|
| 688 |
return df, embeddings
|
| 689 |
|
| 690 |
-
def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
|
| 691 |
-
logging.info("Calculando métricas globais...")
|
| 692 |
-
if not textos: return {}
|
| 693 |
|
| 694 |
# Token pattern: só palavras alfabéticas com 3+ caracteres (ignora números)
|
| 695 |
token_pattern = r'\b[a-zA-ZÀ-ÿ]{3,}\b'
|
|
@@ -719,33 +730,73 @@ def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
|
|
| 719 |
top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
|
| 720 |
top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
|
| 721 |
|
| 722 |
-
return {
|
| 723 |
-
"riqueza_lexical": len(vocab_count),
|
| 724 |
-
"top_tfidf_palavras": top_tfidf,
|
| 725 |
-
"entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
|
| 726 |
-
}
|
| 727 |
-
|
| 728 |
-
def
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
|
| 750 |
def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
|
| 751 |
logging.info("Analisando clusters...")
|
|
@@ -832,10 +883,9 @@ async def process_api(
|
|
| 832 |
custom_min_samples=min_samples_int
|
| 833 |
)
|
| 834 |
|
| 835 |
-
# Criar índice FAISS para busca rápida (semântica)
|
| 836 |
-
embeddings_normalized =
|
| 837 |
-
faiss_index =
|
| 838 |
-
faiss_index.add(embeddings_normalized.astype('float32'))
|
| 839 |
|
| 840 |
# Criar índice BM25 para busca lexical (Hybrid Search)
|
| 841 |
corpus_texts = df["full_text"].tolist()
|
|
@@ -853,7 +903,7 @@ async def process_api(
|
|
| 853 |
logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
|
| 854 |
|
| 855 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 856 |
-
analise_de_duplicados = encontrar_duplicados(df,
|
| 857 |
analise_por_cluster_tfidf = analisar_clusters(df)
|
| 858 |
|
| 859 |
n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
|
|
@@ -1389,4 +1439,4 @@ Use os dados concretos fornecidos, não generalize. Seja específico citando ent
|
|
| 1389 |
|
| 1390 |
except Exception as e:
|
| 1391 |
logging.error(f"ERRO EM /analyze_graph/: {e}", exc_info=True)
|
| 1392 |
-
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
| 3 |
# Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph, FAISS ANN
|
| 4 |
# ==============================================================================
|
| 5 |
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import torch
|
| 9 |
+
import gc
|
| 10 |
+
import uuid
|
| 11 |
+
import os
|
| 12 |
+
import io
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
import time
|
| 16 |
+
import heapq
|
| 17 |
+
import nltk
|
| 18 |
from nltk.corpus import stopwords
|
| 19 |
from collections import defaultdict
|
| 20 |
|
| 21 |
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
| 22 |
from fastapi.responses import JSONResponse
|
| 23 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 24 |
from functools import lru_cache
|
| 25 |
|
| 26 |
# Ferramentas de Alquimia (ML & NLP)
|
|
|
|
| 116 |
RETRIEVAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # Leve e bom PT-BR
|
| 117 |
RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1" # Reranker nativo PT-BR
|
| 118 |
|
| 119 |
+
# Parâmetros de Processamento
|
| 120 |
+
BATCH_SIZE = 256
|
| 121 |
+
UMAP_N_NEIGHBORS = 30
|
| 122 |
+
|
| 123 |
+
# Parâmetros FAISS (HNSW) e Duplicados
|
| 124 |
+
FAISS_HNSW_MIN_SIZE = int(os.environ.get("FAISS_HNSW_MIN_SIZE", 2000))
|
| 125 |
+
FAISS_HNSW_M = int(os.environ.get("FAISS_HNSW_M", 32))
|
| 126 |
+
FAISS_HNSW_EF_CONSTRUCTION = int(os.environ.get("FAISS_HNSW_EF_CONSTRUCTION", 80))
|
| 127 |
+
FAISS_HNSW_EF_SEARCH = int(os.environ.get("FAISS_HNSW_EF_SEARCH", 64))
|
| 128 |
+
|
| 129 |
+
DUPLICATE_KNN_K = int(os.environ.get("DUPLICATE_KNN_K", 10))
|
| 130 |
+
DUPLICATE_SIM_THRESHOLD = float(os.environ.get("DUPLICATE_SIM_THRESHOLD", 0.98))
|
| 131 |
+
DUPLICATE_TOP_K = int(os.environ.get("DUPLICATE_TOP_K", 5))
|
| 132 |
|
| 133 |
# Cache de Sessão (Na memória RAM)
|
| 134 |
cache: Dict[str, Any] = {}
|
|
|
|
| 698 |
del reducer, clusterer, emb_3d; gc.collect()
|
| 699 |
return df, embeddings
|
| 700 |
|
| 701 |
+
def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
|
| 702 |
+
logging.info("Calculando métricas globais...")
|
| 703 |
+
if not textos: return {}
|
| 704 |
|
| 705 |
# Token pattern: só palavras alfabéticas com 3+ caracteres (ignora números)
|
| 706 |
token_pattern = r'\b[a-zA-ZÀ-ÿ]{3,}\b'
|
|
|
|
| 730 |
top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
|
| 731 |
top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
|
| 732 |
|
| 733 |
+
return {
|
| 734 |
+
"riqueza_lexical": len(vocab_count),
|
| 735 |
+
"top_tfidf_palavras": top_tfidf,
|
| 736 |
+
"entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
|
| 737 |
+
}
|
| 738 |
+
|
| 739 |
+
def normalize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    """L2-normalize each row of a 2-D embedding matrix.

    Rows with zero norm are left untouched (divided by 1.0) so the
    function never produces NaNs on all-zero vectors.
    """
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Avoid division by zero: substitute 1.0 where a row norm is exactly 0.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return embeddings / safe_norms
|
| 743 |
+
|
| 744 |
+
def build_faiss_index(embeddings_normalized: np.ndarray) -> faiss.Index:
    """Build a FAISS inner-product index over pre-normalized embeddings.

    Collections with at least FAISS_HNSW_MIN_SIZE vectors get an
    approximate HNSW index (tuned by FAISS_HNSW_M / efConstruction /
    efSearch); smaller collections use an exact IndexFlatIP, where
    inner product equals cosine similarity for unit-norm rows.
    """
    n_vectors, dim = embeddings_normalized.shape
    use_hnsw = n_vectors >= FAISS_HNSW_MIN_SIZE
    if use_hnsw:
        index = faiss.IndexHNSWFlat(dim, FAISS_HNSW_M, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efConstruction = FAISS_HNSW_EF_CONSTRUCTION
        index.hnsw.efSearch = FAISS_HNSW_EF_SEARCH
    else:
        index = faiss.IndexFlatIP(dim)
    # FAISS requires float32 input.
    index.add(embeddings_normalized.astype('float32'))
    index_type = "HNSW" if use_hnsw else "FlatIP"
    logging.info(f"FAISS index criado ({index_type}) com {index.ntotal} vetores")
    return index
|
| 757 |
+
|
| 758 |
+
def encontrar_duplicados(
    df: pd.DataFrame,
    embeddings_normalized: np.ndarray,
    faiss_index: Optional[faiss.Index] = None
) -> Dict[str, Any]:
    """Detect exact and near-duplicate texts in df["full_text"].

    Exact duplicates are grouped by identical text; semantic near-duplicates
    are found via FAISS k-NN search over the (row-normalized) embeddings,
    keeping the DUPLICATE_TOP_K highest-similarity pairs above
    DUPLICATE_SIM_THRESHOLD.

    Returns a dict with:
      - "grupos_exatos": {text: [row positions]} for texts appearing 2+ times
      - "pares_semanticos": top-K list of {"similaridade", "texto1", "texto2"}

    NOTE(review): assumes df row order matches embeddings_normalized row
    order, so FAISS result indices can be used with .iloc — confirm at caller.
    """
    logging.info("Detectando duplicados...")
    # keep=False marks every occurrence of a duplicated text, not just repeats.
    mask = df["full_text"].duplicated(keep=False)
    grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
    pares_semanticos = []

    if len(embeddings_normalized) > 2:
        # Reuse an existing index when provided; otherwise build one here.
        if faiss_index is None:
            faiss_index = build_faiss_index(embeddings_normalized)
        # Only HNSW indexes have .hnsw; widen efSearch so recall is not
        # limited by a smaller value set at construction time.
        if hasattr(faiss_index, "hnsw"):
            faiss_index.hnsw.efSearch = max(faiss_index.hnsw.efSearch, FAISS_HNSW_EF_SEARCH)

        # +1 because each vector's nearest neighbor is itself.
        k = min(DUPLICATE_KNN_K + 1, len(embeddings_normalized))
        scores, indices = faiss_index.search(embeddings_normalized.astype('float32'), k)
        # Min-heap of (score, i, j) keeping the DUPLICATE_TOP_K best pairs.
        top_pairs = []

        for i in range(len(embeddings_normalized)):
            for rank, j in enumerate(indices[i]):
                # Skip self-matches, already-seen pairs (j < i), and FAISS's
                # -1 padding for missing neighbors.
                if j <= i:
                    continue
                score = float(scores[i][rank])
                if score < DUPLICATE_SIM_THRESHOLD:
                    continue
                # Exact duplicates are already reported in grupos_exatos.
                if df["full_text"].iloc[i] == df["full_text"].iloc[j]:
                    continue

                if len(top_pairs) < DUPLICATE_TOP_K:
                    heapq.heappush(top_pairs, (score, i, j))
                elif score > top_pairs[0][0]:
                    # Heap is full: replace the current weakest pair.
                    heapq.heapreplace(top_pairs, (score, i, j))

        # Emit pairs from highest to lowest similarity.
        for score, idx1, idx2 in sorted(top_pairs, reverse=True):
            pares_semanticos.append({
                "similaridade": float(score),
                "texto1": df["full_text"].iloc[idx1],
                "texto2": df["full_text"].iloc[idx2]
            })
    return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
|
| 800 |
|
| 801 |
def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
|
| 802 |
logging.info("Analisando clusters...")
|
|
|
|
| 883 |
custom_min_samples=min_samples_int
|
| 884 |
)
|
| 885 |
|
| 886 |
+
# Criar índice FAISS para busca rápida (semântica)
|
| 887 |
+
embeddings_normalized = normalize_embeddings(embeddings)
|
| 888 |
+
faiss_index = build_faiss_index(embeddings_normalized)
|
|
|
|
| 889 |
|
| 890 |
# Criar índice BM25 para busca lexical (Hybrid Search)
|
| 891 |
corpus_texts = df["full_text"].tolist()
|
|
|
|
| 903 |
logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
|
| 904 |
|
| 905 |
metricas_globais = calcular_metricas(df["full_text"].tolist())
|
| 906 |
+
analise_de_duplicados = encontrar_duplicados(df, embeddings_normalized, faiss_index)
|
| 907 |
analise_por_cluster_tfidf = analisar_clusters(df)
|
| 908 |
|
| 909 |
n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
|
|
|
|
| 1439 |
|
| 1440 |
except Exception as e:
|
| 1441 |
logging.error(f"ERRO EM /analyze_graph/: {e}", exc_info=True)
|
| 1442 |
+
raise HTTPException(status_code=500, detail=str(e))
|