Madras1 commited on
Commit
dcdca2d
·
verified ·
1 Parent(s): b59c07a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -51
app.py CHANGED
@@ -3,23 +3,24 @@
3
  # Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph, FAISS ANN
4
  # ==============================================================================
5
 
6
- import numpy as np
7
- import pandas as pd
8
- import torch
9
- import gc
10
- import uuid
11
- import os
12
- import io
13
- import json
14
- import logging
15
- import time
16
- import nltk
 
17
  from nltk.corpus import stopwords
18
  from collections import defaultdict
19
 
20
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
21
  from fastapi.responses import JSONResponse
22
- from typing import List, Dict, Any, Tuple
23
  from functools import lru_cache
24
 
25
  # Ferramentas de Alquimia (ML & NLP)
@@ -115,9 +116,19 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
115
  RETRIEVAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # Leve e bom PT-BR
116
  RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1" # Reranker nativo PT-BR
117
 
118
- # Parâmetros de Processamento
119
- BATCH_SIZE = 256
120
- UMAP_N_NEIGHBORS = 30
 
 
 
 
 
 
 
 
 
 
121
 
122
  # Cache de Sessão (Na memória RAM)
123
  cache: Dict[str, Any] = {}
@@ -687,9 +698,9 @@ def processar_pipeline(
687
  del reducer, clusterer, emb_3d; gc.collect()
688
  return df, embeddings
689
 
690
- def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
691
- logging.info("Calculando métricas globais...")
692
- if not textos: return {}
693
 
694
  # Token pattern: só palavras alfabéticas com 3+ caracteres (ignora números)
695
  token_pattern = r'\b[a-zA-ZÀ-ÿ]{3,}\b'
@@ -719,33 +730,73 @@ def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
719
  top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
720
  top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
721
 
722
- return {
723
- "riqueza_lexical": len(vocab_count),
724
- "top_tfidf_palavras": top_tfidf,
725
- "entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
726
- }
727
-
728
- def encontrar_duplicados(df: pd.DataFrame, embeddings: np.ndarray) -> Dict[str, Any]:
729
- logging.info("Detectando duplicados...")
730
- mask = df["full_text"].duplicated(keep=False)
731
- grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
732
- pares_semanticos = []
733
-
734
- if 2 < len(embeddings) < 5000:
735
- sim = cosine_similarity(embeddings)
736
- triu_indices = np.triu_indices_from(sim, k=1)
737
- sim_vetor = sim[triu_indices]
738
- pares_idx = np.where(sim_vetor > 0.98)[0]
739
- top_pares_idx = pares_idx[np.argsort(sim_vetor[pares_idx])[-5:][::-1]]
740
- for i in top_pares_idx:
741
- idx1, idx2 = triu_indices[0][i], triu_indices[1][i]
742
- if df["full_text"].iloc[idx1] != df["full_text"].iloc[idx2]:
743
- pares_semanticos.append({
744
- "similaridade": float(sim[idx1, idx2]),
745
- "texto1": df["full_text"].iloc[idx1],
746
- "texto2": df["full_text"].iloc[idx2]
747
- })
748
- return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749
 
750
  def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
751
  logging.info("Analisando clusters...")
@@ -832,10 +883,9 @@ async def process_api(
832
  custom_min_samples=min_samples_int
833
  )
834
 
835
- # Criar índice FAISS para busca rápida (semântica)
836
- embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
837
- faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1]) # Inner Product = Cosine sim para vetores normalizados
838
- faiss_index.add(embeddings_normalized.astype('float32'))
839
 
840
  # Criar índice BM25 para busca lexical (Hybrid Search)
841
  corpus_texts = df["full_text"].tolist()
@@ -853,7 +903,7 @@ async def process_api(
853
  logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
854
 
855
  metricas_globais = calcular_metricas(df["full_text"].tolist())
856
- analise_de_duplicados = encontrar_duplicados(df, embeddings)
857
  analise_por_cluster_tfidf = analisar_clusters(df)
858
 
859
  n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
@@ -1389,4 +1439,4 @@ Use os dados concretos fornecidos, não generalize. Seja específico citando ent
1389
 
1390
  except Exception as e:
1391
  logging.error(f"ERRO EM /analyze_graph/: {e}", exc_info=True)
1392
- raise HTTPException(status_code=500, detail=str(e))
 
3
  # Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph, FAISS ANN
4
  # ==============================================================================
5
 
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+ import gc
10
+ import uuid
11
+ import os
12
+ import io
13
+ import json
14
+ import logging
15
+ import time
16
+ import heapq
17
+ import nltk
18
  from nltk.corpus import stopwords
19
  from collections import defaultdict
20
 
21
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
22
  from fastapi.responses import JSONResponse
23
+ from typing import List, Dict, Any, Tuple, Optional
24
  from functools import lru_cache
25
 
26
  # Ferramentas de Alquimia (ML & NLP)
 
116
  RETRIEVAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # Leve e bom PT-BR
117
  RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1" # Reranker nativo PT-BR
118
 
119
# Processing parameters
BATCH_SIZE = 256
UMAP_N_NEIGHBORS = 30

# FAISS (HNSW) and duplicate-detection parameters.
# All are overridable via environment variables of the same name.
FAISS_HNSW_MIN_SIZE = int(os.environ.get("FAISS_HNSW_MIN_SIZE", 2000))  # corpus size at which HNSW is used instead of a flat index
FAISS_HNSW_M = int(os.environ.get("FAISS_HNSW_M", 32))  # HNSW graph connectivity (M)
FAISS_HNSW_EF_CONSTRUCTION = int(os.environ.get("FAISS_HNSW_EF_CONSTRUCTION", 80))  # build-time candidate-list size
FAISS_HNSW_EF_SEARCH = int(os.environ.get("FAISS_HNSW_EF_SEARCH", 64))  # query-time candidate-list size

DUPLICATE_KNN_K = int(os.environ.get("DUPLICATE_KNN_K", 10))  # neighbors fetched per document when scanning for duplicates
DUPLICATE_SIM_THRESHOLD = float(os.environ.get("DUPLICATE_SIM_THRESHOLD", 0.98))  # cosine-similarity cutoff for a "semantic duplicate"
DUPLICATE_TOP_K = int(os.environ.get("DUPLICATE_TOP_K", 5))  # maximum number of semantic-duplicate pairs reported
132
 
133
  # Cache de Sessão (Na memória RAM)
134
  cache: Dict[str, Any] = {}
 
698
  del reducer, clusterer, emb_3d; gc.collect()
699
  return df, embeddings
700
 
701
+ def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
702
+ logging.info("Calculando métricas globais...")
703
+ if not textos: return {}
704
 
705
  # Token pattern: só palavras alfabéticas com 3+ caracteres (ignora números)
706
  token_pattern = r'\b[a-zA-ZÀ-ÿ]{3,}\b'
 
730
  top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
731
  top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]
732
 
733
+ return {
734
+ "riqueza_lexical": len(vocab_count),
735
+ "top_tfidf_palavras": top_tfidf,
736
+ "entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
737
+ }
738
+
739
def normalize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    """Return a row-wise L2-normalized copy of *embeddings*.

    Rows whose norm is exactly zero are left unchanged (divided by 1
    instead of 0) so the result never contains NaN/inf.
    """
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Substitute 1.0 for zero norms to avoid division by zero.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return embeddings / safe_norms
743
+
744
def build_faiss_index(embeddings_normalized: np.ndarray) -> faiss.Index:
    """Build a FAISS inner-product index over L2-normalized embeddings.

    Inner product on normalized vectors equals cosine similarity. Corpora
    with at least FAISS_HNSW_MIN_SIZE vectors get an approximate HNSW
    index; smaller corpora get an exact flat index.

    Args:
        embeddings_normalized: 2-D array of row-normalized embeddings
            (assumed already normalized — TODO confirm callers always
            pass the output of normalize_embeddings).

    Returns:
        A populated faiss.Index ready for .search().
    """
    dim = embeddings_normalized.shape[1]
    if embeddings_normalized.shape[0] >= FAISS_HNSW_MIN_SIZE:
        # Approximate nearest-neighbor graph index for larger corpora.
        index = faiss.IndexHNSWFlat(dim, FAISS_HNSW_M, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efConstruction = FAISS_HNSW_EF_CONSTRUCTION
        index.hnsw.efSearch = FAISS_HNSW_EF_SEARCH
        index_type = "HNSW"
    else:
        # Exact brute-force inner-product index for small corpora.
        index = faiss.IndexFlatIP(dim)
        index_type = "FlatIP"
    # FAISS requires float32 input.
    index.add(embeddings_normalized.astype('float32'))
    logging.info(f"FAISS index criado ({index_type}) com {index.ntotal} vetores")
    return index
757
+
758
def encontrar_duplicados(
    df: pd.DataFrame,
    embeddings_normalized: np.ndarray,
    faiss_index: Optional[faiss.Index] = None
) -> Dict[str, Any]:
    """Detect exact and near-duplicate texts in *df*.

    Exact duplicates are found by grouping identical "full_text" values.
    Semantic near-duplicates are found via a FAISS k-NN search over the
    normalized embeddings (cosine similarity >= DUPLICATE_SIM_THRESHOLD),
    keeping only the DUPLICATE_TOP_K highest-similarity pairs whose texts
    are not byte-identical.

    Args:
        df: DataFrame with a "full_text" column aligned row-for-row with
            *embeddings_normalized* — TODO confirm this alignment holds
            for every caller.
        embeddings_normalized: L2-normalized embedding matrix.
        faiss_index: optional prebuilt index over the same embeddings;
            built on the fly when None.

    Returns:
        {"grupos_exatos": {text: [row indices]}, "pares_semanticos": [...]}.
    """
    logging.info("Detectando duplicados...")
    # Exact duplicates: every row whose text appears more than once.
    mask = df["full_text"].duplicated(keep=False)
    grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
    pares_semanticos = []

    if len(embeddings_normalized) > 2:
        if faiss_index is None:
            faiss_index = build_faiss_index(embeddings_normalized)
        # For HNSW indexes, make sure the query-time beam is at least the
        # configured efSearch (a shared index may have been tuned lower).
        if hasattr(faiss_index, "hnsw"):
            faiss_index.hnsw.efSearch = max(faiss_index.hnsw.efSearch, FAISS_HNSW_EF_SEARCH)

        # +1 because each vector's nearest neighbor is itself.
        k = min(DUPLICATE_KNN_K + 1, len(embeddings_normalized))
        scores, indices = faiss_index.search(embeddings_normalized.astype('float32'), k)
        top_pairs = []  # min-heap of (score, i, j), capped at DUPLICATE_TOP_K

        for i in range(len(embeddings_normalized)):
            for rank, j in enumerate(indices[i]):
                # Skip self-matches and lower-indexed neighbors so each
                # unordered pair is considered once (this also discards
                # FAISS's -1 "no neighbor" sentinel, if returned).
                if j <= i:
                    continue
                score = float(scores[i][rank])
                if score < DUPLICATE_SIM_THRESHOLD:
                    continue
                # Byte-identical texts are already reported as exact duplicates.
                if df["full_text"].iloc[i] == df["full_text"].iloc[j]:
                    continue

                # Bounded min-heap keeps only the top-scoring pairs.
                if len(top_pairs) < DUPLICATE_TOP_K:
                    heapq.heappush(top_pairs, (score, i, j))
                elif score > top_pairs[0][0]:
                    heapq.heapreplace(top_pairs, (score, i, j))

        # Emit pairs in descending similarity order.
        for score, idx1, idx2 in sorted(top_pairs, reverse=True):
            pares_semanticos.append({
                "similaridade": float(score),
                "texto1": df["full_text"].iloc[idx1],
                "texto2": df["full_text"].iloc[idx2]
            })
    return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}
800
 
801
  def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
802
  logging.info("Analisando clusters...")
 
883
  custom_min_samples=min_samples_int
884
  )
885
 
886
+ # Criar índice FAISS para busca rápida (semântica)
887
+ embeddings_normalized = normalize_embeddings(embeddings)
888
+ faiss_index = build_faiss_index(embeddings_normalized)
 
889
 
890
  # Criar índice BM25 para busca lexical (Hybrid Search)
891
  corpus_texts = df["full_text"].tolist()
 
903
  logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")
904
 
905
  metricas_globais = calcular_metricas(df["full_text"].tolist())
906
+ analise_de_duplicados = encontrar_duplicados(df, embeddings_normalized, faiss_index)
907
  analise_por_cluster_tfidf = analisar_clusters(df)
908
 
909
  n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
 
1439
 
1440
  except Exception as e:
1441
  logging.error(f"ERRO EM /analyze_graph/: {e}", exc_info=True)
1442
+ raise HTTPException(status_code=500, detail=str(e))