Upload 6 files

- app.py +137 -3
- requirements.txt +2 -0

app.py
CHANGED
@@ -1,6 +1,6 @@
 # ==============================================================================
-# API do AetherMap — VERSÃO 7.
-# Backend com RAG Híbrido, CSV
+# API do AetherMap — VERSÃO 7.3 (KNOWLEDGE GRAPH EDITION)
+# Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph
 # ==============================================================================

 import numpy as np

@@ -15,10 +15,11 @@ import logging
 import time
 import nltk
 from nltk.corpus import stopwords
+from collections import defaultdict

 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple
 from functools import lru_cache

 # Ferramentas de Alquimia (ML & NLP)

@@ -30,6 +31,10 @@ from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from scipy.stats import entropy

+# NER & Language Detection
+import spacy
+from langdetect import detect, LangDetectException
+
 # Monitoramento (O Toque da Berta)
 from prometheus_fastapi_instrumentator import Instrumentator
 from prometheus_client import Histogram
@@ -151,6 +156,88 @@ def load_reranker():
     logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
     return CrossEncoder(RERANKER_MODEL, device=device)

+# Cache for spaCy models
+_spacy_models = {}
+
+def load_spacy_model(lang: str):
+    """Carrega modelo spaCy com cache."""
+    if lang not in _spacy_models:
+        model_name = "pt_core_news_sm" if lang == "pt" else "en_core_web_sm"
+        try:
+            _spacy_models[lang] = spacy.load(model_name)
+            logging.info(f"Modelo spaCy '{model_name}' carregado.")
+        except OSError:
+            logging.warning(f"Modelo {model_name} não encontrado. Baixando...")
+            spacy.cli.download(model_name)
+            _spacy_models[lang] = spacy.load(model_name)
+    return _spacy_models[lang]
+
+def detect_language(texts: List[str]) -> str:
+    """Detecta idioma predominante nos textos."""
+    sample = " ".join(texts[:10])[:1000]
+    try:
+        lang = detect(sample)
+        return "pt" if lang == "pt" else "en"
+    except LangDetectException:
+        return "en"
+
+def extract_entities(textos: List[str]) -> List[List[Tuple[str, str]]]:
+    """Extrai entidades nomeadas de cada texto."""
+    lang = detect_language(textos)
+    nlp = load_spacy_model(lang)
+
+    entities_by_doc = []
+    for text in textos:
+        # Limitar tamanho do texto para performance
+        doc = nlp(text[:2000])
+        entities = [(ent.text.lower().strip(), ent.label_) for ent in doc.ents
+                    if len(ent.text.strip()) > 2 and ent.label_ in ("PERSON", "PER", "ORG", "GPE", "LOC")]
+        entities_by_doc.append(entities)
+
+    return entities_by_doc
+
+def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
+                       positions: List[Dict]) -> Dict[str, Any]:
+    """Constrói grafo de conexões baseado em entidades compartilhadas."""
+    # Inverter: entidade -> lista de doc indices
+    entity_to_docs = defaultdict(set)
+
+    for doc_idx, entities in enumerate(entities_by_doc):
+        for entity_text, entity_type in entities:
+            entity_to_docs[(entity_text, entity_type)].add(doc_idx)
+
+    # Construir arestas (conexões entre docs que compartilham entidades)
+    edges = []
+    seen_pairs = set()
+
+    for (entity_text, entity_type), doc_indices in entity_to_docs.items():
+        if len(doc_indices) < 2:
+            continue
+
+        doc_list = sorted(doc_indices)
+        for i in range(len(doc_list)):
+            for j in range(i + 1, len(doc_list)):
+                pair = (doc_list[i], doc_list[j])
+                if pair not in seen_pairs:
+                    seen_pairs.add(pair)
+                    edges.append({
+                        "source": doc_list[i],
+                        "target": doc_list[j],
+                        "entity": entity_text,
+                        "entity_type": entity_type
+                    })
+
+    # Contar entidades mais frequentes
+    entity_counts = [(k, len(v)) for k, v in entity_to_docs.items() if len(v) >= 2]
+    top_entities = sorted(entity_counts, key=lambda x: x[1], reverse=True)[:20]
+
+    return {
+        "edges": edges,
+        "edge_count": len(edges),
+        "connected_pairs": len(seen_pairs),
+        "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
+    }
+

 # ==============================================================================
 # PIPELINE DE PROCESSAMENTO DE DADOS
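To make the graph builder's contract concrete, here is a small worked example. It is an illustrative sketch, not part of the commit, and it assumes app.py imports cleanly enough that build_entity_graph can be pulled in directly:

# Illustrative only: assumes `from app import build_entity_graph` works
# without triggering heavy import-time setup in app.py.
from app import build_entity_graph

# Three documents; docs 0 and 2 both mention the ORG "petrobras".
entities_by_doc = [
    [("petrobras", "ORG"), ("rio de janeiro", "GPE")],
    [("ibama", "ORG")],
    [("petrobras", "ORG")],
]
positions = [{"x": 0.0, "y": 0.0, "z": 0.0} for _ in range(3)]  # accepted but unused

graph = build_entity_graph(entities_by_doc, positions)
print(graph["edge_count"])    # 1: only docs 0 and 2 share an entity
print(graph["edges"][0])      # {'source': 0, 'target': 2, 'entity': 'petrobras', 'entity_type': 'ORG'}
print(graph["top_entities"])  # [{'entity': 'petrobras', 'type': 'ORG', 'docs': 2}]

Since build_entity_graph accepts positions but never reads it, the endpoint in the next hunk attaches the coordinates to the response itself.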
@@ -643,4 +730,51 @@ async def search_web_api(

     except Exception as e:
         logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# ==============================================================================
+# ENDPOINT KNOWLEDGE GRAPH (NER)
+# ==============================================================================
+@app.post("/entity_graph/")
+async def entity_graph_api(job_id: str = Form(...)):
+    """
+    Extrai entidades nomeadas e constrói grafo de conexões entre documentos.
+    """
+    if job_id not in cache:
+        raise HTTPException(status_code=404, detail="Job ID não encontrado.")
+
+    logging.info(f"Construindo Knowledge Graph para Job: {job_id}")
+
+    try:
+        cached_data = cache[job_id]
+        df = cached_data["df"]
+        textos = df["full_text"].tolist()
+
+        # Extrair entidades
+        logging.info(f"Extraindo entidades de {len(textos)} documentos...")
+        entities_by_doc = extract_entities(textos)
+
+        # Construir posições dos pontos
+        positions = df[["x", "y", "z"]].to_dict("records")
+
+        # Construir grafo
+        graph_data = build_entity_graph(entities_by_doc, positions)
+
+        # Adicionar posições ao resultado
+        graph_data["positions"] = positions
+        graph_data["num_documents"] = len(textos)
+
+        # Entidades por documento (para tooltip)
+        graph_data["entities_by_doc"] = [
+            [{"text": e[0], "type": e[1]} for e in ents]
+            for ents in entities_by_doc
+        ]
+
+        logging.info(f"Grafo construído: {graph_data['edge_count']} arestas, {len(graph_data['top_entities'])} entidades principais")
+
+        return graph_data
+
+    except Exception as e:
+        logging.error(f"ERRO EM /entity_graph/: {e}", exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))
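For completeness, a client-side sketch of exercising the new endpoint. The base URL is a placeholder for wherever the API is deployed, and the job_id must reference a dataset already processed and cached by an earlier call:

import requests

# Placeholder deployment URL; adjust as needed.
BASE_URL = "http://localhost:8000"

# job_id must come from a prior processing job cached by the API.
resp = requests.post(f"{BASE_URL}/entity_graph/", data={"job_id": "my-job-id"})
resp.raise_for_status()

graph = resp.json()
print(graph["num_documents"], "documents,", graph["edge_count"], "edges")
for ent in graph["top_entities"][:5]:
    print(f"{ent['entity']} ({ent['type']}): appears in {ent['docs']} docs")

Note that a document pair connected by several shared entities still yields a single edge, since seen_pairs keeps only the first entity found for each pair.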
requirements.txt
CHANGED

@@ -16,6 +16,8 @@ scipy
 umap-learn
 hdbscan
 nltk
+spacy
+langdetect

 # --- TORCH CPU ---
 torch
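One deployment caveat: the spacy package does not bundle the language models that load_spacy_model expects, so the first call to /entity_graph/ may trigger a download at runtime. A sketch of pre-fetching them instead, for example in a build step (the helper file name is hypothetical):

# prefetch_models.py: hypothetical build-time helper, not part of this commit.
import spacy.cli

# The two models load_spacy_model() in app.py can request.
for model_name in ("pt_core_news_sm", "en_core_web_sm"):
    spacy.cli.download(model_name)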