Madras1 commited on
Commit
3c9cd4c
·
verified ·
1 Parent(s): 8cf9cdd

Upload 6 files

Browse files
Files changed (2) hide show
  1. app.py +137 -3
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # ==============================================================================
2
- # API do AetherMap — VERSÃO 7.2 (CSV + TAVILY EDITION)
3
- # Backend com RAG Híbrido, CSV Support, Web Search via Tavily
4
  # ==============================================================================
5
 
6
  import numpy as np
@@ -15,10 +15,11 @@ import logging
15
  import time
16
  import nltk
17
  from nltk.corpus import stopwords
 
18
 
19
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
20
  from fastapi.responses import JSONResponse
21
- from typing import List, Dict, Any
22
  from functools import lru_cache
23
 
24
  # Ferramentas de Alquimia (ML & NLP)
@@ -30,6 +31,10 @@ from sklearn.metrics.pairwise import cosine_similarity
30
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
31
  from scipy.stats import entropy
32
 
 
 
 
 
33
  # Monitoramento (O Toque da Berta)
34
  from prometheus_fastapi_instrumentator import Instrumentator
35
  from prometheus_client import Histogram
@@ -151,6 +156,88 @@ def load_reranker():
151
  logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
152
  return CrossEncoder(RERANKER_MODEL, device=device)
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  # ==============================================================================
156
  # PIPELINE DE PROCESSAMENTO DE DADOS
@@ -643,4 +730,51 @@ async def search_web_api(
643
 
644
  except Exception as e:
645
  logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  raise HTTPException(status_code=500, detail=str(e))
 
1
  # ==============================================================================
2
+ # API do AetherMap — VERSÃO 7.3 (KNOWLEDGE GRAPH EDITION)
3
+ # Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph
4
  # ==============================================================================
5
 
6
  import numpy as np
 
15
  import time
16
  import nltk
17
  from nltk.corpus import stopwords
18
+ from collections import defaultdict
19
 
20
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
21
  from fastapi.responses import JSONResponse
22
+ from typing import List, Dict, Any, Tuple
23
  from functools import lru_cache
24
 
25
  # Ferramentas de Alquimia (ML & NLP)
 
31
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
32
  from scipy.stats import entropy
33
 
34
+ # NER & Language Detection
35
+ import spacy
36
+ from langdetect import detect, LangDetectException
37
+
38
  # Monitoramento (O Toque da Berta)
39
  from prometheus_fastapi_instrumentator import Instrumentator
40
  from prometheus_client import Histogram
 
156
  logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
157
  return CrossEncoder(RERANKER_MODEL, device=device)
158
 
159
# Cache of loaded spaCy pipelines, keyed by language code ("pt" / "en").
_spacy_models = {}

def load_spacy_model(lang: str):
    """Load (and cache) the spaCy NER model for the given language.

    Args:
        lang: "pt" selects the Portuguese model; any other value falls
            back to the English model.

    Returns:
        The loaded spaCy Language pipeline for `lang`.

    Raises:
        OSError: if the model is missing and the download also fails.
    """
    if lang not in _spacy_models:
        model_name = "pt_core_news_sm" if lang == "pt" else "en_core_web_sm"
        try:
            _spacy_models[lang] = spacy.load(model_name)
            logging.info(f"Modelo spaCy '{model_name}' carregado.")
        except OSError:
            logging.warning(f"Modelo {model_name} não encontrado. Baixando...")
            # BUG FIX: spacy.cli.download expects the exact package name
            # (e.g. "pt_core_news_sm"). The original mangled it with
            # .replace("_", "-").replace("-sm", ""), producing an invalid
            # name ("pt-core-news") that can never be downloaded.
            spacy.cli.download(model_name)
            _spacy_models[lang] = spacy.load(model_name)
    return _spacy_models[lang]
174
+
175
def detect_language(texts: List[str]) -> str:
    """Detect the predominant language of a corpus sample.

    Joins the first 10 texts (capped at 1000 characters) into a single
    probe string. Returns "pt" when Portuguese is detected; every other
    outcome — including detection failure — maps to "en".
    """
    sample = " ".join(texts[:10])[:1000]
    try:
        detected = detect(sample)
    except LangDetectException:
        # Empty/ambiguous sample: default to English.
        return "en"
    return "pt" if detected == "pt" else "en"
183
+
184
def extract_entities(textos: List[str]) -> List[List[Tuple[str, str]]]:
    """Run NER over each text, returning (entity_text, label) pairs per doc.

    Only PERSON/PER/ORG/GPE/LOC entities longer than 2 characters are
    kept; entity text is lowercased and stripped. Each input is truncated
    to 2000 characters to bound processing time.
    """
    nlp = load_spacy_model(detect_language(textos))
    kept_labels = ("PERSON", "PER", "ORG", "GPE", "LOC")

    entities_by_doc = []
    for text in textos:
        doc = nlp(text[:2000])  # cap length for performance
        found = []
        for ent in doc.ents:
            if len(ent.text.strip()) > 2 and ent.label_ in kept_labels:
                found.append((ent.text.lower().strip(), ent.label_))
        entities_by_doc.append(found)

    return entities_by_doc
198
+
199
def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
                       positions: List[Dict]) -> Dict[str, Any]:
    """Build a document-to-document graph from shared named entities.

    Args:
        entities_by_doc: per-document list of (entity_text, entity_type)
            pairs, as produced by extract_entities.
        positions: per-document point positions. Kept for interface
            compatibility; not consumed here — the caller attaches
            positions to the result itself.

    Returns:
        Dict with:
        - "edges": one dict per unordered doc pair sharing an entity
          (source/target indices plus the first shared entity found);
        - "edge_count": number of edges;
        - "connected_pairs": number of distinct connected doc pairs;
        - "top_entities": up to 20 entities shared by >= 2 documents,
          ordered by how many documents mention them.
    """
    # Invert: (entity_text, entity_type) -> set of document indices.
    entity_to_docs = defaultdict(set)
    for doc_idx, entities in enumerate(entities_by_doc):
        for entity_text, entity_type in entities:
            entity_to_docs[(entity_text, entity_type)].add(doc_idx)

    # Build edges between documents that share an entity; each unordered
    # pair is emitted at most once.
    edges = []
    seen_pairs = set()

    for (entity_text, entity_type), doc_indices in entity_to_docs.items():
        if len(doc_indices) < 2:
            continue

        doc_list = sorted(doc_indices)
        # BUG FIX: the original used range(i + 1, doc_list) — passing the
        # list itself as the range stop — which raises TypeError as soon
        # as an entity is shared by 2+ documents. len(doc_list) is intended.
        for i in range(len(doc_list)):
            for j in range(i + 1, len(doc_list)):
                pair = (doc_list[i], doc_list[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    edges.append({
                        "source": doc_list[i],
                        "target": doc_list[j],
                        "entity": entity_text,
                        "entity_type": entity_type
                    })

    # Most widespread entities (shared by at least 2 documents).
    entity_counts = [(k, len(v)) for k, v in entity_to_docs.items() if len(v) >= 2]
    top_entities = sorted(entity_counts, key=lambda x: x[1], reverse=True)[:20]

    return {
        "edges": edges,
        "edge_count": len(edges),
        "connected_pairs": len(seen_pairs),
        "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
    }
240
+
241
 
242
  # ==============================================================================
243
  # PIPELINE DE PROCESSAMENTO DE DADOS
 
730
 
731
  except Exception as e:
732
  logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
733
+ raise HTTPException(status_code=500, detail=str(e))
734
+
735
+
736
# ==============================================================================
# ENDPOINT KNOWLEDGE GRAPH (NER)
# ==============================================================================
@app.post("/entity_graph/")
async def entity_graph_api(job_id: str = Form(...)):
    """
    Extract named entities from a cached job's documents and build a
    graph of cross-document connections based on shared entities.
    """
    if job_id not in cache:
        raise HTTPException(status_code=404, detail="Job ID não encontrado.")

    logging.info(f"Construindo Knowledge Graph para Job: {job_id}")

    try:
        df = cache[job_id]["df"]
        textos = df["full_text"].tolist()

        # NER pass over every document.
        logging.info(f"Extraindo entidades de {len(textos)} documentos...")
        entities_by_doc = extract_entities(textos)

        # 3D point positions for the frontend rendering.
        positions = df[["x", "y", "z"]].to_dict("records")

        # Shared-entity connection graph, enriched with metadata.
        graph_data = build_entity_graph(entities_by_doc, positions)
        graph_data["positions"] = positions
        graph_data["num_documents"] = len(textos)

        # Per-document entity lists (used by frontend tooltips).
        graph_data["entities_by_doc"] = [
            [{"text": text, "type": label} for text, label in ents]
            for ents in entities_by_doc
        ]

        logging.info(f"Grafo construído: {graph_data['edge_count']} arestas, {len(graph_data['top_entities'])} entidades principais")

        return graph_data

    except Exception as e:
        logging.error(f"ERRO EM /entity_graph/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
requirements.txt CHANGED
@@ -16,6 +16,8 @@ scipy
16
  umap-learn
17
  hdbscan
18
  nltk
 
 
19
 
20
  # --- TORCH CPU ---
21
  torch
 
16
  umap-learn
17
  hdbscan
18
  nltk
19
+ spacy
20
+ langdetect
21
 
22
  # --- TORCH CPU ---
23
  torch