import numpy as np
import pandas as pd
import torch
import gc
import uuid
import os
import io
import json
import logging
import time
import heapq
import nltk
from nltk.corpus import stopwords
from collections import defaultdict

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse
from typing import List, Dict, Any, Tuple, Optional
from functools import lru_cache

from sentence_transformers import SentenceTransformer, CrossEncoder
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.stats import entropy
from scipy.sparse import csr_matrix
import faiss

class SimpleBM25:
    """
    Simple BM25 implementation based on raw term frequencies with the usual k1/b adjustments.
    Avoids the external rank_bm25 dependency.
    """
    def __init__(self, corpus: List[str], k1: float = 1.5, b: float = 0.75):
        self.k1 = k1
        self.b = b
        self.corpus = corpus
        self.corpus_size = len(corpus)

        self.tokenized_corpus = [doc.lower().split() for doc in corpus]

        # Document frequency of each term
        self.doc_freqs = {}
        for doc in self.tokenized_corpus:
            for term in set(doc):
                self.doc_freqs[term] = self.doc_freqs.get(term, 0) + 1

        self.avgdl = sum(len(doc) for doc in self.tokenized_corpus) / self.corpus_size

        # IDF with the standard BM25 smoothing
        self.idf = {}
        for term, freq in self.doc_freqs.items():
            self.idf[term] = np.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_scores(self, query: str) -> np.ndarray:
        """Returns BM25 scores for every document in the corpus."""
        query_terms = query.lower().split()
        scores = np.zeros(self.corpus_size)

        for i, doc in enumerate(self.tokenized_corpus):
            doc_len = len(doc)
            term_freqs = {}
            for term in doc:
                term_freqs[term] = term_freqs.get(term, 0) + 1

            score = 0.0
            for term in query_terms:
                if term in term_freqs:
                    tf = term_freqs[term]
                    idf = self.idf.get(term, 0)

                    numerator = tf * (self.k1 + 1)
                    denominator = tf + self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))
                    score += idf * (numerator / denominator)

            scores[i] = score

        return scores

    def get_top_n(self, query: str, n: int = 50) -> Tuple[np.ndarray, np.ndarray]:
        """Returns the top N documents as (indices, scores)."""
        scores = self.get_scores(query)
        top_indices = np.argsort(scores)[-n:][::-1]
        return top_indices, scores[top_indices]

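# Usage sketch (illustrative toy corpus, not part of the pipeline):
#
#     bm25 = SimpleBM25(["o gato subiu no telhado", "dogs chase cats", "queda do mercado"])
#     idx, scores = bm25.get_top_n("gato telhado", n=2)   # indices of the 2 best matches, best first
#
# For a query term with frequency tf in a document of length dl, the score contribution is
#     idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl)),
# i.e. the standard BM25 term-saturation and length-normalization formula implemented above.
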
import spacy
from langdetect import detect, LangDetectException

from prometheus_fastapi_instrumentator import Instrumentator
from prometheus_client import Histogram

from openai import OpenAI

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

RETRIEVAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"

BATCH_SIZE = 256
UMAP_N_NEIGHBORS = 30

FAISS_HNSW_MIN_SIZE = int(os.environ.get("FAISS_HNSW_MIN_SIZE", 2000))
FAISS_HNSW_M = int(os.environ.get("FAISS_HNSW_M", 32))
FAISS_HNSW_EF_CONSTRUCTION = int(os.environ.get("FAISS_HNSW_EF_CONSTRUCTION", 80))
FAISS_HNSW_EF_SEARCH = int(os.environ.get("FAISS_HNSW_EF_SEARCH", 64))

DUPLICATE_KNN_K = int(os.environ.get("DUPLICATE_KNN_K", 10))
DUPLICATE_SIM_THRESHOLD = float(os.environ.get("DUPLICATE_SIM_THRESHOLD", 0.98))
DUPLICATE_TOP_K = int(os.environ.get("DUPLICATE_TOP_K", 5))

cache: Dict[str, Any] = {}

LLM_LATENCY = Histogram(
    "llm_api_latency_seconds",
    "Tempo de resposta da API externa LLM (OpenRouter)",
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]
)

LLM_MODEL = os.environ.get("LLM_MODEL", "nex-agi/deepseek-v3.1-nex-n1:free")

OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
try:
    if not OPENROUTER_API_KEY:
        logging.warning("OPENROUTER_API_KEY não encontrada. Funcionalidades de LLM estarão indisponíveis.")
        llm_client = None
    else:
        llm_client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=OPENROUTER_API_KEY
        )
        logging.info(f"Cliente OpenRouter inicializado com modelo: {LLM_MODEL}")
except Exception as e:
    logging.error(f"FALHA AO INICIALIZAR OPENROUTER: {e}")
    llm_client = None

TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
tavily_client = None
try:
    if TAVILY_API_KEY:
        from tavily import TavilyClient
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logging.info("Cliente Tavily inicializado com sucesso.")
    else:
        logging.warning("TAVILY_API_KEY não encontrada. Busca web estará indisponível.")
except Exception as e:
    logging.error(f"FALHA AO INICIALIZAR TAVILY: {e}")
    tavily_client = None

def expand_query(original_query: str) -> str:
    """
    Uses the LLM to expand the query with related terms.
    Increases recall without giving up precision.
    """
    if not llm_client:
        return original_query

    try:
        expansion_prompt = f"""Expanda esta query de busca adicionando sinônimos e termos relacionados.
Mantenha conciso (máximo 50 palavras). Retorne APENAS a query expandida.

Query original: {original_query}

Query expandida:"""

        response = llm_client.chat.completions.create(
            messages=[{"role": "user", "content": expansion_prompt}],
            model=LLM_MODEL,
            temperature=0.3,
            max_tokens=100
        )

        expanded = response.choices[0].message.content.strip()
        logging.info(f"Query expandida: '{original_query}' → '{expanded[:50]}...'")
        return expanded

    except Exception as e:
        logging.warning(f"Erro na expansão de query: {e}")
        return original_query

def chunk_text(text: str, chunk_size: int = 300, overlap: int = 100) -> List[str]:
    """
    Splits text into smaller overlapping chunks.
    Improves retrieval precision.
    """
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # Prefer breaking at a word boundary instead of mid-word
        if end < len(text):
            space_idx = text.rfind(' ', start, end)
            if space_idx > start:
                end = space_idx

        chunks.append(text[start:end].strip())
        # Guarantee forward progress even when the window shrinks to a single word
        start = max(end - overlap, start + 1)

        if start >= len(text):
            break

    return chunks if chunks else [text]

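# Usage sketch (tiny sizes for readability; the defaults above are 300/100):
#
#     chunk_text("the quick brown fox jumps over the lazy dog", chunk_size=20, overlap=5)
#
# Each chunk is cut at the last space before the size limit and the next chunk starts
# `overlap` characters earlier, so consecutive chunks share a bit of context.
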
def carregar_stopwords():
    """
    Loads NLTK stop words and merges them with an external 'stopwords.txt' file.
    """
    logging.info("Iniciando carregamento de Stop Words...")

    # Make sure the NLTK stopwords corpus is available
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        logging.info("Baixando corpus de stopwords...")
        nltk.download('stopwords')

    final_stops = set(stopwords.words('portuguese')) | set(stopwords.words('english'))
    logging.info(f"Stopwords base (NLTK) carregadas: {len(final_stops)}")

    arquivo_custom = "stopwords.txt"

    if os.path.exists(arquivo_custom):
        logging.info(f"Arquivo '{arquivo_custom}' encontrado. Lendo palavras customizadas...")
        try:
            count_custom = 0
            with open(arquivo_custom, "r", encoding="utf-8") as f:
                for linha in f:
                    # Lines may carry inline '#' comments; keep only the word itself
                    palavra = linha.split('#')[0].strip().lower()
                    if palavra and len(palavra) > 1:
                        final_stops.add(palavra)
                        count_custom += 1
            logging.info(f"{count_custom} stop words customizadas importadas do arquivo.")
        except Exception as e:
            logging.error(f"Erro ao ler '{arquivo_custom}': {e}")
    else:
        logging.warning(f"Arquivo '{arquivo_custom}' não encontrado. Usando apenas NLTK.")

    lista_final = list(final_stops)
    logging.info(f"Total final de Stop Words ativas: {len(lista_final)}")
    return lista_final


STOP_WORDS_MULTILINGUAL = carregar_stopwords()

@lru_cache(maxsize=1)
def load_retriever():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logging.info(f"Carregando Retriever '{RETRIEVAL_MODEL}' em: {device}")
    return SentenceTransformer(RETRIEVAL_MODEL, device=device)


@lru_cache(maxsize=1)
def load_reranker():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
    return CrossEncoder(RERANKER_MODEL, device=device)

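# Retrieve-then-rerank sketch (function names are from this module; the query and document
# strings are invented for illustration):
#
#     retriever = load_retriever()    # bi-encoder: embeds query and documents independently
#     reranker = load_reranker()      # cross-encoder: scores (query, document) pairs jointly
#     q_emb = retriever.encode(["minha busca"], convert_to_numpy=True)
#     pair_score = reranker.predict([["minha busca", "um documento candidato"]])
#
# The bi-encoder keeps the first pass cheap over the whole corpus; the cross-encoder is
# slower but more precise, so it is applied only to the shortlisted candidates.
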
_spacy_models = {}
_spacy_available = True


def load_spacy_model(lang: str):
    """Loads a spaCy model, caching it per language."""
    global _spacy_available

    if not _spacy_available:
        return None

    if lang not in _spacy_models:
        model_name = "pt_core_news_sm" if lang == "pt" else "en_core_web_sm"
        try:
            _spacy_models[lang] = spacy.load(model_name)
            logging.info(f"Modelo spaCy '{model_name}' carregado.")
        except OSError:
            logging.warning(f"Modelo {model_name} não encontrado. Tentando baixar...")
            try:
                import subprocess
                subprocess.run(["python", "-m", "spacy", "download", model_name], check=True)
                _spacy_models[lang] = spacy.load(model_name)
            except Exception as e:
                logging.error(f"Falha ao baixar modelo spaCy: {e}")
                _spacy_available = False
                return None
    return _spacy_models[lang]


def detect_language(texts: List[str]) -> str:
    """Detects the predominant language of the texts."""
    sample = " ".join(texts[:10])[:1000]
    try:
        lang = detect(sample)
        return "pt" if lang == "pt" else "en"
    except LangDetectException:
        return "en"


def extract_entities(textos: List[str]) -> List[List[Tuple[str, str]]]:
    """Extracts named entities from each text."""
    lang = detect_language(textos)
    nlp = load_spacy_model(lang)

    if nlp is None:
        logging.warning("spaCy não disponível. Retornando entidades vazias.")
        return [[] for _ in textos]

    entities_by_doc = []
    for text in textos:
        # Cap each document at 2000 characters to keep NER fast
        doc = nlp(text[:2000])
        entities = [(ent.text.lower().strip(), ent.label_) for ent in doc.ents
                    if len(ent.text.strip()) > 2 and ent.label_ in ("PERSON", "PER", "ORG", "GPE", "LOC")]
        entities_by_doc.append(entities)

    return normalize_entities(entities_by_doc)


def normalize_entities(entities_by_doc: List[List[Tuple[str, str]]]) -> List[List[Tuple[str, str]]]:
    """Normalizes entities so that variations of the same name are grouped together."""

    # Collect every surface form seen for each entity type
    all_entities = defaultdict(set)
    for entities in entities_by_doc:
        for text, etype in entities:
            all_entities[etype].add(text)

    # Map each (surface form, type) to a canonical (surface form, type)
    normalization_map = {}

    for etype, entity_set in all_entities.items():
        entities_list = sorted(entity_set, key=len, reverse=True)

        for entity in entities_list:
            if (entity, etype) in normalization_map:
                continue

            canonical = entity
            for other in entities_list:
                if other == entity:
                    continue

                # Substring containment, e.g. "obama" vs "barack obama"
                if other in entity or entity in other:
                    # The longer surface form wins as the canonical name
                    if len(entity) >= len(other):
                        normalization_map[(other, etype)] = (entity, etype)
                    else:
                        normalization_map[(entity, etype)] = (other, etype)
                        canonical = other

            if (entity, etype) not in normalization_map:
                normalization_map[(entity, etype)] = (canonical, etype)

    # Rewrite each document's entity list using the canonical forms, dropping duplicates
    normalized_docs = []
    for entities in entities_by_doc:
        normalized = []
        seen = set()
        for text, etype in entities:
            canonical = normalization_map.get((text, etype), (text, etype))
            if canonical not in seen:
                seen.add(canonical)
                normalized.append(canonical)
        normalized_docs.append(normalized)

    logging.info(f"Normalização: {len(all_entities)} tipos, mapa com {len(normalization_map)} entradas")
    return normalized_docs

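# Illustrative input/output for normalize_entities (hypothetical entities):
#
#     docs = [[("barack obama", "PERSON"), ("obama", "PERSON")], [("obama", "PERSON")]]
#     normalize_entities(docs)
#     # -> [[("barack obama", "PERSON")], [("barack obama", "PERSON")]]
#
# Shorter surface forms are folded into the longest containing form of the same type, so
# "obama" and "barack obama" are counted as one entity across the corpus.
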
def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
                       positions: List[Dict]) -> Dict[str, Any]:
    """Builds a document-to-document graph whose edges are shared entities."""

    entity_to_docs = defaultdict(set)

    for doc_idx, entities in enumerate(entities_by_doc):
        for entity_text, entity_type in entities:
            entity_to_docs[(entity_text, entity_type)].add(doc_idx)

    # One edge per pair of documents that share at least one entity
    edges = []
    seen_pairs = set()

    for (entity_text, entity_type), doc_indices in entity_to_docs.items():
        if len(doc_indices) < 2:
            continue

        doc_list = sorted(doc_indices)
        for i in range(len(doc_list)):
            for j in range(i + 1, len(doc_list)):
                pair = (doc_list[i], doc_list[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    edges.append({
                        "source": doc_list[i],
                        "target": doc_list[j],
                        "entity": entity_text,
                        "entity_type": entity_type
                    })

    # Entities that appear in two or more documents, ranked by reach
    entity_counts = [(k, len(v)) for k, v in entity_to_docs.items() if len(v) >= 2]
    top_entities = sorted(entity_counts, key=lambda x: x[1], reverse=True)[:20]

    return {
        "edges": edges,
        "edge_count": len(edges),
        "connected_pairs": len(seen_pairs),
        "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
    }

def build_entity_to_entity_graph(entities_by_doc: List[List[Tuple[str, str]]]) -> Dict[str, Any]:
    """
    Builds a graph whose NODES are entities and whose EDGES are co-occurrences.
    Entities that appear in the same document are connected.
    """

    cooccurrence = defaultdict(int)
    entity_doc_count = defaultdict(int)

    for entities in entities_by_doc:
        unique_entities = list(set(entities))

        for ent in unique_entities:
            entity_doc_count[ent] += 1

        for i in range(len(unique_entities)):
            for j in range(i + 1, len(unique_entities)):
                pair = tuple(sorted([unique_entities[i], unique_entities[j]], key=str))
                cooccurrence[pair] += 1

    nodes = []
    entity_to_id = {}

    # Keep only entities seen in at least 2 documents, capped at the 50 most frequent
    valid_entities = [(e, c) for e, c in entity_doc_count.items() if c >= 2]
    valid_entities.sort(key=lambda x: x[1], reverse=True)
    valid_entities = valid_entities[:50]

    node_count = len(valid_entities)

    node_idx = 0
    for entity, count in valid_entities:
        entity_to_id[entity] = node_idx

        # Fibonacci-sphere layout: spread nodes quasi-uniformly over a sphere of radius 3
        golden_angle = np.pi * (3 - np.sqrt(5))
        theta = node_idx * golden_angle
        phi = np.arccos(1 - 2 * (node_idx + 0.5) / max(node_count, 1))

        radius = 3.0
        x = radius * np.sin(phi) * np.cos(theta)
        y = radius * np.sin(phi) * np.sin(theta)
        z = radius * np.cos(phi)

        nodes.append({
            "id": node_idx,
            "entity": entity[0],
            "type": entity[1],
            "docs": count,
            "x": float(x),
            "y": float(y),
            "z": float(z)
        })
        node_idx += 1

    # Edges between the kept nodes, weighted by co-occurrence count
    entity_edges = []
    for (ent1, ent2), weight in cooccurrence.items():
        if ent1 in entity_to_id and ent2 in entity_to_id and weight >= 1:
            entity_edges.append({
                "source": entity_to_id[ent1],
                "target": entity_to_id[ent2],
                "weight": weight,
                "source_entity": ent1[0],
                "target_entity": ent2[0],
                "reason": f"Aparecem juntos em {weight} documento(s)"
            })

    entity_edges.sort(key=lambda x: x["weight"], reverse=True)

    # Weighted degree per node, used as a simple centrality measure
    degree = defaultdict(int)
    for edge in entity_edges:
        degree[edge["source"]] += edge["weight"]
        degree[edge["target"]] += edge["weight"]

    max_degree = max(degree.values()) if degree else 1

    hubs = []
    for node in nodes:
        node_degree = degree.get(node["id"], 0)
        node["degree"] = node_degree
        node["centrality"] = round(node_degree / max_degree, 3)

        if node["centrality"] > 0.7:
            node["role"] = "hub"
            hubs.append(node)
        elif node["centrality"] > 0.3:
            node["role"] = "connector"
        else:
            node["role"] = "peripheral"

    hubs.sort(key=lambda x: x["degree"], reverse=True)

    return {
        "nodes": nodes,
        "edges": entity_edges[:200],
        "node_count": len(nodes),
        "edge_count": len(entity_edges),
        "hubs": [{"entity": h["entity"], "type": h["type"], "degree": h["degree"]} for h in hubs[:5]],
        "insights": {
            "total_connections": sum(degree.values()) // 2,
            "avg_degree": round(sum(degree.values()) / len(degree), 1) if degree else 0,
            "hub_count": len(hubs)
        }
    }

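# Layout note: the node coordinates above follow the Fibonacci-sphere construction,
#     theta_i = i * pi * (3 - sqrt(5))            (golden angle)
#     phi_i   = arccos(1 - 2 * (i + 0.5) / N)
#     (x, y, z) = r * (sin(phi) * cos(theta), sin(phi) * sin(theta), cos(phi))
# which spreads N points roughly evenly over a sphere of radius r = 3 without any
# force-directed iteration; the frontend only has to render the precomputed positions.
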
def preparar_textos(file_bytes: bytes, n_samples: int) -> List[str]:
    """Prepares texts from a TXT file (one document per line)."""
    linhas = file_bytes.decode("utf-8", errors="ignore").splitlines()
    textos = [s for line in linhas if (s := line.strip()) and len(s.split()) > 3]
    return textos[:n_samples]


def detect_csv_separator(file_bytes: bytes) -> str:
    """Detects the CSV separator (comma or semicolon)."""
    sample = file_bytes[:4096].decode("utf-8", errors="ignore")
    first_line = sample.split('\n')[0]

    commas = first_line.count(',')
    semicolons = first_line.count(';')

    return ';' if semicolons > commas else ','


def read_csv_smart(file_bytes: bytes, nrows=None) -> pd.DataFrame:
    """Reads a CSV with automatic separator and encoding detection."""
    sep = detect_csv_separator(file_bytes)

    try:
        df = pd.read_csv(io.BytesIO(file_bytes), sep=sep, encoding="utf-8", nrows=nrows)
    except UnicodeDecodeError:
        df = pd.read_csv(io.BytesIO(file_bytes), sep=sep, encoding="latin-1", nrows=nrows)

    return df


def preparar_textos_csv(file_bytes: bytes, text_column: str, n_samples: int) -> List[str]:
    """Prepares texts from a CSV file by extracting the given column."""
    df = read_csv_smart(file_bytes)

    if text_column not in df.columns:
        available = ", ".join(df.columns.tolist()[:10])
        raise ValueError(f"Coluna '{text_column}' não encontrada. Colunas disponíveis: {available}")

    textos = df[text_column].dropna().astype(str).tolist()

    # Keep only texts with more than 3 words
    textos = [t.strip() for t in textos if len(t.strip().split()) > 3]
    return textos[:n_samples]


def get_csv_columns(file_bytes: bytes) -> List[str]:
    """Returns the list of columns of a CSV file."""
    df = read_csv_smart(file_bytes, nrows=0)
    return df.columns.tolist()

def processar_pipeline(
    textos: List[str],
    small_dataset: bool = False,
    fast_mode: bool = False,
    custom_min_cluster_size: int = 0,
    custom_min_samples: int = 0
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Runs the texts through the embedding, dimensionality-reduction and clustering pipeline.

    Args:
        textos: List of texts
        small_dataset: If True, uses parameters tuned for small datasets (Tavily)
        fast_mode: If True, uses PCA instead of UMAP (10-100x faster)
        custom_min_cluster_size: Minimum cluster size (0 = auto)
        custom_min_samples: Minimum number of samples (0 = auto)
    """
    num_textos = len(textos)
    logging.info(f"Iniciando pipeline para {num_textos} textos (fast_mode={fast_mode})...")
    model = load_retriever()

    embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)

    if fast_mode:
        # PCA: linear projection, much faster than UMAP on large datasets
        from sklearn.decomposition import PCA
        logging.info("Usando PCA (modo rápido)...")
        reducer = PCA(n_components=3, random_state=42)
        emb_3d = reducer.fit_transform(embeddings)
    else:
        # UMAP with a neighbourhood size adapted to small datasets
        n_neighbors = min(15, max(3, num_textos - 1)) if small_dataset else UMAP_N_NEIGHBORS
        logging.info(f"Usando UMAP (n_neighbors={n_neighbors})...")
        reducer = umap.UMAP(n_components=3, n_neighbors=n_neighbors, min_dist=0.0, metric="cosine", random_state=42)
        emb_3d = reducer.fit_transform(embeddings)

    emb_3d = StandardScaler().fit_transform(emb_3d)

    # HDBSCAN parameters: explicit values win, otherwise heuristics based on corpus size
    if custom_min_cluster_size > 0:
        min_size = custom_min_cluster_size
        min_samples_val = custom_min_samples if custom_min_samples > 0 else None
        logging.info(f"Usando parâmetros customizados: min_cluster_size={min_size}, min_samples={min_samples_val}")
    elif small_dataset:
        min_size = max(2, int(num_textos * 0.1))
        min_samples_val = 1
    elif fast_mode:
        min_size = max(8, int(num_textos * 0.01))
        min_samples_val = 3
    else:
        min_size = max(10, int(num_textos * 0.02))
        min_samples_val = None

    logging.info(f"HDBSCAN: min_cluster_size={min_size}, min_samples={min_samples_val}")

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, min_samples=min_samples_val)
    clusters = clusterer.fit_predict(emb_3d)

    df = pd.DataFrame({
        "x": emb_3d[:, 0], "y": emb_3d[:, 1], "z": emb_3d[:, 2],
        "full_text": textos, "cluster": clusters.astype(str)
    })

    del reducer, clusterer, emb_3d; gc.collect()
    return df, embeddings

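# Usage sketch ("docs" stands for any list of strings with a few sentences each):
#
#     df, embeddings = processar_pipeline(docs, fast_mode=True)
#     # df has columns x, y, z (3D coordinates), full_text and cluster ("-1" = noise);
#     # embeddings keeps the full-dimensional vectors for search and deduplication.
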
def calcular_metricas(textos: List[str]) -> Dict[str, Any]:
    logging.info("Calculando métricas globais...")
    if not textos: return {}

    # Words of at least 3 letters, accented characters included
    token_pattern = r'\b[a-zA-ZÀ-ÿ]{3,}\b'

    vectorizer_count = CountVectorizer(
        stop_words=STOP_WORDS_MULTILINGUAL,
        max_features=1000,
        token_pattern=token_pattern
    )
    vectorizer_tfidf = TfidfVectorizer(
        stop_words=STOP_WORDS_MULTILINGUAL,
        max_features=1000,
        token_pattern=token_pattern
    )

    try:
        counts_matrix = vectorizer_count.fit_transform(textos)
        tfidf_matrix = vectorizer_tfidf.fit_transform(textos)
    except ValueError:
        return {"riqueza_lexical": 0, "top_tfidf_palavras": [], "entropia": 0.0}

    vocab_count = vectorizer_count.get_feature_names_out()
    contagens = counts_matrix.sum(axis=0).A1

    vocab_tfidf = vectorizer_tfidf.get_feature_names_out()
    soma_tfidf = tfidf_matrix.sum(axis=0).A1
    top_idx_tfidf = np.argsort(soma_tfidf)[-10:][::-1]
    top_tfidf = [{"palavra": vocab_tfidf[i], "score": round(float(soma_tfidf[i]), 4)} for i in top_idx_tfidf]

    return {
        "riqueza_lexical": len(vocab_count),
        "top_tfidf_palavras": top_tfidf,
        "entropia": float(entropy(contagens / contagens.sum(), base=2)) if contagens.sum() > 0 else 0.0
    }

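# Metric note: "entropia" is the Shannon entropy (base 2) of the corpus-wide term
# distribution, H = -sum_i p_i * log2(p_i) with p_i = count_i / total_count; higher
# values mean the vocabulary is spread more evenly across many terms.
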
def normalize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return embeddings / norms


def build_faiss_index(embeddings_normalized: np.ndarray) -> faiss.Index:
    dim = embeddings_normalized.shape[1]
    if embeddings_normalized.shape[0] >= FAISS_HNSW_MIN_SIZE:
        index = faiss.IndexHNSWFlat(dim, FAISS_HNSW_M, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efConstruction = FAISS_HNSW_EF_CONSTRUCTION
        index.hnsw.efSearch = FAISS_HNSW_EF_SEARCH
        index_type = "HNSW"
    else:
        index = faiss.IndexFlatIP(dim)
        index_type = "FlatIP"
    index.add(embeddings_normalized.astype('float32'))
    logging.info(f"FAISS index criado ({index_type}) com {index.ntotal} vetores")
    return index

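# Index note: since the vectors are L2-normalized, maximum inner product equals maximum
# cosine similarity, so both FlatIP (exact, fine for small corpora) and HNSW (approximate,
# used above FAISS_HNSW_MIN_SIZE vectors) rank documents by cosine. Search sketch
# (illustrative, assuming query_vec is an already normalized 2D float array):
#
#     index = build_faiss_index(normalize_embeddings(embeddings))
#     scores, ids = index.search(query_vec.astype('float32'), 10)   # top-10 neighbours
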
def encontrar_duplicados(
    df: pd.DataFrame,
    embeddings_normalized: np.ndarray,
    faiss_index: Optional[faiss.Index] = None
) -> Dict[str, Any]:
    logging.info("Detectando duplicados...")
    mask = df["full_text"].duplicated(keep=False)
    grupos_exatos = {t: [int(i) for i in idxs] for t, idxs in df[mask].groupby("full_text").groups.items()}
    pares_semanticos = []

    if len(embeddings_normalized) > 2:
        if faiss_index is None:
            faiss_index = build_faiss_index(embeddings_normalized)
        if hasattr(faiss_index, "hnsw"):
            faiss_index.hnsw.efSearch = max(faiss_index.hnsw.efSearch, FAISS_HNSW_EF_SEARCH)

        # k nearest neighbours per document (the first hit is the document itself)
        k = min(DUPLICATE_KNN_K + 1, len(embeddings_normalized))
        scores, indices = faiss_index.search(embeddings_normalized.astype('float32'), k)
        top_pairs = []

        for i in range(len(embeddings_normalized)):
            for rank, j in enumerate(indices[i]):
                if j <= i:
                    continue
                score = float(scores[i][rank])
                if score < DUPLICATE_SIM_THRESHOLD:
                    continue
                if df["full_text"].iloc[i] == df["full_text"].iloc[j]:
                    continue

                # Keep only the DUPLICATE_TOP_K most similar pairs using a min-heap
                if len(top_pairs) < DUPLICATE_TOP_K:
                    heapq.heappush(top_pairs, (score, i, j))
                elif score > top_pairs[0][0]:
                    heapq.heapreplace(top_pairs, (score, i, j))

        for score, idx1, idx2 in sorted(top_pairs, reverse=True):
            pares_semanticos.append({
                "similaridade": float(score),
                "texto1": df["full_text"].iloc[idx1],
                "texto2": df["full_text"].iloc[idx2]
            })
    return {"grupos_exatos": grupos_exatos, "pares_semanticos": pares_semanticos}

def analisar_clusters(df: pd.DataFrame) -> Dict[str, Any]:
    logging.info("Analisando clusters...")
    analise = {}
    ids_clusters_validos = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)
    for cid in ids_clusters_validos:
        textos_cluster = df[df["cluster"] == cid]["full_text"].tolist()
        if len(textos_cluster) < 2: continue
        try:
            vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_MULTILINGUAL, max_features=1000)
            tfidf_matrix = vectorizer.fit_transform(textos_cluster)
            vocab = vectorizer.get_feature_names_out()
            soma = tfidf_matrix.sum(axis=0).A1
            top_idx = np.argsort(soma)[-5:][::-1]
            top_palavras = [{"palavra": vocab[i], "score": round(float(soma[i]), 4)} for i in top_idx]
        except ValueError:
            top_palavras = []
        analise[cid] = {"num_documentos": len(textos_cluster), "top_palavras": top_palavras}
    return analise

app = FastAPI(title="AetherMap API 7.2", version="7.2.0", description="Backend Semantic Search + CSV + Tavily Web Search")

Instrumentator().instrument(app).expose(app)


@app.get("/")
async def root():
    return {"status": "online", "message": "Aether Map API 7.2 (CSV + Tavily Ready)."}


@app.post("/csv_columns/")
async def get_columns_api(file: UploadFile = File(...)):
    """Returns the columns of a CSV file for preview."""
    if not file.filename.lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="Arquivo deve ser CSV.")
    try:
        file_bytes = await file.read()
        columns = get_csv_columns(file_bytes)
        return {"columns": columns, "filename": file.filename}
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Erro ao ler CSV: {str(e)}")

@app.post("/process/")
async def process_api(
    n_samples: int = Form(10000),
    file: UploadFile = File(...),
    text_column: str = Form(None),
    fast_mode: str = Form("false"),
    min_cluster_size: str = Form("0"),
    min_samples: str = Form("0")
):
    # Form fields arrive as strings; coerce them explicitly
    fast_mode_bool = fast_mode.lower() in ("true", "1", "yes")
    min_cluster_size_int = int(min_cluster_size) if min_cluster_size.isdigit() else 0
    min_samples_int = int(min_samples) if min_samples.isdigit() else 0

    logging.info(f"Processando: {file.filename} (fast_mode={fast_mode_bool}, min_cluster={min_cluster_size_int}, min_samples={min_samples_int})")
    try:
        file_bytes = await file.read()

        if file.filename.lower().endswith('.csv'):
            if not text_column:
                raise HTTPException(status_code=400, detail="Para CSV, informe 'text_column'.")
            textos = preparar_textos_csv(file_bytes, text_column, n_samples)
        else:
            textos = preparar_textos(file_bytes, n_samples)

        if not textos: raise HTTPException(status_code=400, detail="Nenhum texto válido encontrado.")

        if len(textos) > 5000 and not fast_mode_bool:
            logging.warning(f"Dataset grande ({len(textos)} docs) - considere usar fast_mode=True")

        df, embeddings = processar_pipeline(
            textos,
            fast_mode=fast_mode_bool,
            custom_min_cluster_size=min_cluster_size_int,
            custom_min_samples=min_samples_int
        )

        # Dense index (FAISS over normalized embeddings)
        embeddings_normalized = normalize_embeddings(embeddings)
        faiss_index = build_faiss_index(embeddings_normalized)

        # Lexical index (BM25 over the raw texts)
        corpus_texts = df["full_text"].tolist()
        bm25_index = SimpleBM25(corpus_texts)
        logging.info(f"BM25 index criado com {len(corpus_texts)} documentos")

        job_id = str(uuid.uuid4())
        cache[job_id] = {
            "embeddings": embeddings,
            "embeddings_normalized": embeddings_normalized,
            "faiss_index": faiss_index,
            "bm25_index": bm25_index,
            "df": df
        }
        logging.info(f"Job criado: {job_id} (FAISS + BM25 hybrid search)")

        metricas_globais = calcular_metricas(df["full_text"].tolist())
        analise_de_duplicados = encontrar_duplicados(df, embeddings_normalized, faiss_index)
        analise_por_cluster_tfidf = analisar_clusters(df)

        n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
        n_ruido = int((df["cluster"] == "-1").sum())

        return {
            "job_id": job_id,
            "metadata": {
                "filename": file.filename,
                "num_documents_processed": len(df),
                "num_clusters_found": n_clusters,
                "num_noise_points": n_ruido
            },
            "metrics": metricas_globais,
            "duplicates": analise_de_duplicados,
            "cluster_analysis": analise_por_cluster_tfidf,
            "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
        }
    except Exception as e:
        logging.error(f"ERRO EM /process/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

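# Usage sketch (hypothetical host and file names; the form fields match the parameters above):
#
#     curl -X POST http://localhost:8000/process/ \
#          -F "file=@reviews.csv" -F "text_column=review_text" \
#          -F "n_samples=5000" -F "fast_mode=true"
#
# The response carries a job_id that /search/, /entity_graph/ and the other endpoints
# expect back as a form field.
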
@app.post("/search/")
async def search_api(
    query: str = Form(...),
    job_id: str = Form(...),
    turbo_mode: str = Form("false")
):
    """
    Search endpoint (hybrid RAG) with latency monitoring.
    """
    logging.info(f"Busca: '{query}' [Job: {job_id}]")
    if job_id not in cache:
        raise HTTPException(status_code=404, detail="Job ID não encontrado.")

    try:
        turbo_mode_bool = turbo_mode.lower() in ("true", "1", "yes")
        if turbo_mode_bool:
            logging.info("TURBO MODE ATIVADO - Skip query expansion, menos candidatos")

        model = load_retriever()
        reranker = load_reranker()

        cached_data = cache[job_id]
        df = cached_data["df"]
        faiss_index = cached_data.get("faiss_index")
        bm25_index = cached_data.get("bm25_index")

        # Query expansion (skipped in turbo mode to save an LLM round-trip)
        if turbo_mode_bool:
            expanded_query = query
        else:
            expanded_query = expand_query(query)

        # Stage 1a: dense retrieval (FAISS over normalized embeddings)
        query_embedding = model.encode([expanded_query], convert_to_numpy=True)
        query_normalized = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

        top_k_retrieval = min(30 if turbo_mode_bool else 100, len(df))

        semantic_scores = {}
        semantic_ranks = {}

        if faiss_index:
            faiss_scores, faiss_indices = faiss_index.search(
                query_normalized.astype('float32'),
                min(top_k_retrieval, faiss_index.ntotal)
            )
            faiss_scores = faiss_scores[0]
            faiss_indices = faiss_indices[0]

            for rank, (idx, score) in enumerate(zip(faiss_indices, faiss_scores)):
                if idx >= 0:
                    semantic_scores[int(idx)] = float(score)
                    semantic_ranks[int(idx)] = rank + 1

            logging.info(f"FAISS: top score = {faiss_scores[0]:.3f}")

        # Stage 1b: lexical retrieval (BM25)
        lexical_scores = {}
        lexical_ranks = {}

        if bm25_index:
            bm25_indices, bm25_scores = bm25_index.get_top_n(expanded_query, n=top_k_retrieval)

            for rank, (idx, score) in enumerate(zip(bm25_indices, bm25_scores)):
                if score > 0:
                    lexical_scores[int(idx)] = float(score)
                    lexical_ranks[int(idx)] = rank + 1

            if bm25_scores[0] > 0:
                logging.info(f"BM25: top score = {bm25_scores[0]:.3f}, docs matched = {len(lexical_scores)}")
            else:
                logging.info("BM25: nenhum match léxico encontrado")

        # Fusion: Reciprocal Rank Fusion (RRF) over the two ranked lists
        k = 60
        all_indices = set(semantic_ranks.keys()) | set(lexical_ranks.keys())

        hybrid_scores = {}
        for idx in all_indices:
            rrf_score = 0.0

            if idx in semantic_ranks:
                rrf_score += 1.0 / (k + semantic_ranks[idx])

            if idx in lexical_ranks:
                rrf_score += 1.0 / (k + lexical_ranks[idx])

            hybrid_scores[idx] = rrf_score

        sorted_candidates = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

        n_semantic_only = len(semantic_ranks.keys() - lexical_ranks.keys())
        n_lexical_only = len(lexical_ranks.keys() - semantic_ranks.keys())
        n_both = len(semantic_ranks.keys() & lexical_ranks.keys())
        logging.info(f"Hybrid: {n_both} em ambos, {n_semantic_only} só semântico, {n_lexical_only} só léxico")

        # Stage 2: cross-encoder reranking of the fused candidates
        candidate_docs = []
        candidate_indices = []
        retrieval_scores = {}

        for idx, rrf_score in sorted_candidates[:top_k_retrieval]:
            doc_text = df.iloc[idx]["full_text"]
            candidate_docs.append([query, doc_text])
            candidate_indices.append(idx)
            retrieval_scores[idx] = rrf_score

        if not candidate_docs:
            return {"summary": "Não foram encontrados documentos relevantes.", "results": []}

        logging.info(f"Reranking {len(candidate_docs)} documentos...")
        rerank_scores = reranker.predict(candidate_docs)

        rerank_results = sorted(
            zip(candidate_indices, rerank_scores),
            key=lambda x: x[1],
            reverse=True
        )

        final_top_k = 5 if turbo_mode_bool else 10
        final_results = []
        context_parts = []

        for rank, (idx, score) in enumerate(rerank_results[:final_top_k]):
            doc_text = df.iloc[idx]["full_text"]
            context_parts.append(f"[ID: {rank+1}] DOCUMENTO:\n{doc_text}\n---------------------")

            final_results.append({
                "index": idx,
                "score": float(score),
                "cosine_score": retrieval_scores.get(idx, 0.0),
                "citation_id": rank + 1
            })

        # Stage 3: grounded answer generation with inline citations
        summary = ""
        if llm_client:
            context_str = "\n".join(context_parts)
            rag_prompt = (
                "INSTRUÇÃO DE SISTEMA:\n"
                "Você é o Aetherius, um motor de busca semântica de alta precisão.\n"
                "Sua missão é responder à pergunta do usuário baseando-se ESTRITAMENTE nos documentos fornecidos.\n\n"
                "REGRAS OBRIGATÓRIAS:\n"
                "1. CITAÇÕES: Toda afirmação deve ter fonte [ID: x]. Ex: 'O lucro subiu [ID: 1].'\n"
                "2. HONESTIDADE: Se não estiver no texto, diga que não encontrou.\n"
                "3. IDIOMA: Português do Brasil.\n\n"
                f"CONTEXTO RECUPERADO:\n{context_str}\n\n"
                f"PERGUNTA DO USUÁRIO: \"{query}\"\n\n"
                "RESPOSTA:"
            )

            try:
                start_time_llm = time.time()

                chat_completion = llm_client.chat.completions.create(
                    messages=[{"role": "user", "content": rag_prompt}],
                    model=LLM_MODEL,
                    temperature=0.1,
                    max_tokens=1024
                )

                duration = time.time() - start_time_llm
                LLM_LATENCY.observe(duration)

                summary = chat_completion.choices[0].message.content.strip()
            except Exception as e:
                logging.warning(f"Erro na geração do LLM: {e}")
                summary = "Não foi possível gerar o resumo automático, mas os documentos estão listados abaixo."

        return {"summary": summary, "results": final_results}

    except Exception as e:
        logging.error(f"ERRO EM /search/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

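# Fusion note: with k = 60, Reciprocal Rank Fusion scores each candidate document as
#     RRF(d) = sum over the ranked lists of 1 / (k + rank_list(d)).
# For example, a document ranked 1st by FAISS and 3rd by BM25 gets 1/61 + 1/63 ≈ 0.0323,
# while one ranked 2nd in only one list gets 1/62 ≈ 0.0161, so agreement between the dense
# and lexical rankings is rewarded without having to calibrate their raw score scales.
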
@app.post("/describe_clusters/")
async def describe_clusters_api(job_id: str = Form(...)):
    logging.info(f"Descrevendo clusters para Job: {job_id}")
    if not llm_client: raise HTTPException(status_code=503, detail="LLM indisponível.")
    if job_id not in cache: raise HTTPException(status_code=404, detail="Job não encontrado.")

    try:
        cached_data = cache[job_id]
        df = cached_data["df"]
        embeddings = cached_data["embeddings"]

        # For each cluster, pick the 3 documents closest to the centroid as "champions"
        champion_docs_by_cluster = {}
        cluster_ids = sorted([c for c in df["cluster"].unique() if c != "-1"], key=int)

        for cid in cluster_ids:
            mask = df["cluster"] == cid
            cluster_embeddings = embeddings[mask]
            cluster_texts = df[mask]["full_text"].tolist()
            if len(cluster_texts) < 3: continue

            centroid = np.mean(cluster_embeddings, axis=0)
            similarities = cosine_similarity([centroid], cluster_embeddings)[0]
            top_indices = np.argsort(similarities)[-3:][::-1]
            champion_docs_by_cluster[cid] = [cluster_texts[i] for i in top_indices]

        if not champion_docs_by_cluster: return {"insights": {}}

        prompt_sections = []
        for cid, docs in champion_docs_by_cluster.items():
            doc_list = "\n".join([f"- {doc[:300]}..." for doc in docs])
            prompt_sections.append(f"Grupo {cid}:\n{doc_list}")

        master_prompt = (
            "Analise os grupos de texto abaixo. Para cada grupo, retorne um JSON com 'topic_name' e 'core_insight'.\n"
            "Responda APENAS o JSON válido.\n\n" + "\n\n".join(prompt_sections)
        )

        start_time_llm = time.time()

        chat_completion = llm_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "JSON Output Only."},
                {"role": "user", "content": master_prompt},
            ], model=LLM_MODEL, temperature=0.2,
        )

        duration = time.time() - start_time_llm
        LLM_LATENCY.observe(duration)

        response_content = chat_completion.choices[0].message.content
        insights = json.loads(response_content.strip().replace("```json", "").replace("```", ""))
        return {"insights": insights}

    except Exception as e:
        logging.error(f"ERRO EM /describe_clusters/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/search_web/")
async def search_web_api(
    query: str = Form(...),
    max_results: int = Form(20),
    search_depth: str = Form("basic")
):
    """
    Searches the web via Tavily and processes the results for visualization.
    """
    if not tavily_client:
        raise HTTPException(status_code=503, detail="Tavily não configurado. Defina TAVILY_API_KEY.")

    logging.info(f"Tavily Search: '{query}' (max: {max_results})")

    try:
        search_result = tavily_client.search(
            query=query,
            max_results=max_results,
            search_depth=search_depth,
            include_answer=False
        )

        results = search_result.get("results", [])
        if not results:
            return {"error": "Nenhum resultado encontrado.", "results_count": 0}

        # Turn each result into "title: content" and keep its source metadata
        textos = []
        sources = []
        for r in results:
            title = r.get("title", "")
            content = r.get("content", "")
            url = r.get("url", "")

            full_text = f"{title}: {content}" if title else content
            if len(full_text.strip().split()) > 5:
                textos.append(full_text.strip())
                sources.append({"title": title, "url": url})

        if not textos:
            return {"error": "Resultados sem conteúdo válido.", "results_count": 0}

        df, embeddings = processar_pipeline(textos, small_dataset=True)

        # Small corpora always fit an exact FlatIP index
        embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        faiss_index = faiss.IndexFlatIP(embeddings_normalized.shape[1])
        faiss_index.add(embeddings_normalized.astype('float32'))

        df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
        df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]

        job_id = str(uuid.uuid4())
        cache[job_id] = {
            "embeddings": embeddings,
            "embeddings_normalized": embeddings_normalized,
            "faiss_index": faiss_index,
            "df": df,
            "sources": sources
        }
        logging.info(f"Tavily Job criado: {job_id}")

        metricas_globais = calcular_metricas(df["full_text"].tolist())
        analise_por_cluster_tfidf = analisar_clusters(df)

        n_clusters = len(df["cluster"].unique()) - (1 if "-1" in df["cluster"].unique() else 0)
        n_ruido = int((df["cluster"] == "-1").sum())

        return {
            "job_id": job_id,
            "metadata": {
                "query": query,
                "source": "tavily_web_search",
                "num_documents_processed": len(df),
                "num_clusters_found": n_clusters,
                "num_noise_points": n_ruido
            },
            "metrics": metricas_globais,
            "cluster_analysis": analise_por_cluster_tfidf,
            "plot_data": df[["x", "y", "z", "cluster", "full_text", "source_url", "source_title"]].to_dict("records"),
            "sources": sources
        }

    except Exception as e:
        logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/entity_graph/")
async def entity_graph_api(job_id: str = Form(...)):
    """
    Extracts named entities and builds the graph of connections between documents.
    """
    if job_id not in cache:
        raise HTTPException(status_code=404, detail="Job ID não encontrado.")

    logging.info(f"Construindo Knowledge Graph para Job: {job_id}")

    try:
        cached_data = cache[job_id]
        df = cached_data["df"]
        textos = df["full_text"].tolist()

        logging.info(f"Extraindo entidades de {len(textos)} documentos...")
        entities_by_doc = extract_entities(textos)

        positions = df[["x", "y", "z"]].to_dict("records")

        graph_data = build_entity_graph(entities_by_doc, positions)

        graph_data["positions"] = positions
        graph_data["num_documents"] = len(textos)

        graph_data["entities_by_doc"] = [
            [{"text": e[0], "type": e[1]} for e in ents]
            for ents in entities_by_doc
        ]

        # Entity-to-entity co-occurrence network for the secondary visualization
        entity_network = build_entity_to_entity_graph(entities_by_doc)
        graph_data["entity_network"] = entity_network

        logging.info(f"Grafo construído: {graph_data['edge_count']} arestas doc-doc, {entity_network['node_count']} nós entidade")

        return graph_data

    except Exception as e:
        logging.error(f"ERRO EM /entity_graph/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/analyze_graph/")
async def analyze_graph_api(job_id: str = Form(...)):
    """
    Uses the LLM to analyze the Knowledge Graph and extract semantic insights.
    """
    if not llm_client:
        raise HTTPException(status_code=503, detail="LLM indisponível.")
    if job_id not in cache:
        raise HTTPException(status_code=404, detail="Job ID não encontrado.")

    logging.info(f"Analisando grafo com LLM para Job: {job_id}")

    try:
        cached_data = cache[job_id]
        df = cached_data["df"]
        textos = df["full_text"].tolist()
        clusters = df["cluster"].tolist() if "cluster" in df.columns else ["0"] * len(textos)

        entities_by_doc = extract_entities(textos)
        entity_network = build_entity_to_entity_graph(entities_by_doc)

        # How often each entity appears in each cluster
        entity_clusters = defaultdict(lambda: defaultdict(int))
        for doc_idx, (entities, cluster) in enumerate(zip(entities_by_doc, clusters)):
            for ent_text, ent_type in entities:
                entity_clusters[ent_text][str(cluster)] += 1

        nodes = entity_network.get("nodes", [])[:15]
        edges = entity_network.get("edges", [])[:20]
        hubs = entity_network.get("hubs", [])[:5]
        insights = entity_network.get("insights", {})

        # Per-cluster context: top entities and a sample document
        cluster_context = []
        unique_clusters = sorted(set(str(c) for c in clusters if str(c) != "-1"))
        for cluster_id in unique_clusters[:5]:
            cluster_docs = [textos[i][:200] for i, c in enumerate(clusters) if str(c) == cluster_id][:3]
            cluster_entities = [(ent, entity_clusters[ent].get(cluster_id, 0))
                                for ent in entity_clusters if entity_clusters[ent].get(cluster_id, 0) > 0]
            cluster_entities.sort(key=lambda x: x[1], reverse=True)

            cluster_context.append(f"""
### Cluster {cluster_id} ({len([c for c in clusters if str(c) == cluster_id])} docs)
Entidades principais: {', '.join([f"{e[0]}({e[1]})" for e in cluster_entities[:5]])}
Exemplo de documento: "{cluster_docs[0][:150]}..."
""")

        graph_summary = f"""
ANÁLISE DE KNOWLEDGE GRAPH COM CONTEXTO

## Visão Geral
- {len(nodes)} entidades principais
- {insights.get('total_connections', 0)} conexões totais
- {insights.get('hub_count', 0)} hubs identificados
- {len(unique_clusters)} clusters de documentos

## Entidades Principais (por importância):
{chr(10).join([f"- {n['entity']} ({n['type']}): {n['docs']} docs, centralidade {n.get('centrality', 0)}" for n in nodes])}

## Hubs Centrais (entidades mais conectadas):
{chr(10).join([f"- {h['entity']} ({h['type']}): {h['degree']} conexões" for h in hubs]) if hubs else "Nenhum hub identificado"}

## Conexões Mais Fortes:
{chr(10).join([f"- {e['source_entity']} ↔ {e['target_entity']}: {e['weight']} co-ocorrências" for e in edges[:10]])}

## CONTEXTO POR CLUSTER (IMPORTANTE - USE ESTAS REFERÊNCIAS):
{chr(10).join(cluster_context)}
"""

        prompt = f"""Você é um analista de inteligência especializado em análise de redes e grafos de conhecimento.

Analise o seguinte Knowledge Graph extraído de documentos e forneça insights acionáveis:

{graph_summary}

Por favor forneça:
1. **Narrativa Central**: Qual é a história principal que conecta estas entidades?
2. **Atores Chave**: Quem são os principais players e qual seu papel?
3. **Relações Ocultas**: Que conexões não-óbvias você identifica?
4. **Padrões por Cluster**: Como as entidades se distribuem entre os clusters? Qual cluster tem foco diferente?
5. **Investigação**: O que valeria a pena investigar mais a fundo?

IMPORTANTE: Sempre referencie os clusters específicos (ex: "No Cluster 0...", "Já no Cluster 1...").
Use os dados concretos fornecidos, não generalize. Seja específico citando entidades e clusters."""

        completion = llm_client.chat.completions.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1500
        )

        analysis = completion.choices[0].message.content

        return {
            "analysis": analysis,
            "graph_summary": {
                "entities": len(nodes),
                "connections": insights.get('total_connections', 0),
                "hubs": len(hubs)
            }
        }

    except Exception as e:
        logging.error(f"ERRO EM /analyze_graph/: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))