|
|
""" |
|
|
AetherMap Client |
|
|
Client para integração com AetherMap API - busca semântica, NER e análise de grafos. |
|
|
""" |
|
|
import httpx |
|
|
import json |
|
|
import io |
|
|
from typing import List, Dict, Any, Optional |
|
|
from dataclasses import dataclass, field |
|
|
from datetime import datetime |
|
|
import logging |
|
|
|
|
|
from app.config import settings |
|
|
|
|
|
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)

# Deployment used when the application settings provide no usable URL.
_DEFAULT_AETHERMAP_URL = 'https://madras1-aethermap.hf.space'

# Base URL of the AetherMap API. getattr's default only applies when the
# attribute is missing; the `or` also covers a setting that exists but is
# None or empty, which would otherwise break `.rstrip('/')` in the client.
AETHERMAP_URL = getattr(settings, 'aethermap_url', None) or _DEFAULT_AETHERMAP_URL
|
|
|
|
|
|
|
|
@dataclass
class ProcessResult:
    """Outcome of a document-processing request.

    Attributes:
        job_id: Identifier of the processing job.
        num_documents: Number of documents processed.
        num_clusters: Number of clusters found.
        num_noise: Number of noise points.
        metrics: Free-form metrics mapping; defaults to a new empty dict.
        cluster_analysis: Free-form per-cluster analysis mapping; defaults
            to a new empty dict.
    """

    # Required fields (positional order is part of the constructor contract).
    job_id: str
    num_documents: int
    num_clusters: int
    num_noise: int

    # Optional payloads; each instance gets its own dict.
    metrics: Dict[str, Any] = field(default_factory=dict)
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
|
|
@dataclass
class SearchResult:
    """Outcome of a semantic search.

    Attributes:
        summary: Textual summary of the search.
        results: Individual result records as free-form dicts; defaults to
            a new empty list.
    """

    summary: str

    # Each instance gets its own list.
    results: List[Dict[str, Any]] = field(default_factory=list)
|
|
|
|
|
|
|
|
@dataclass
class EntityNode:
    """A single entity node of the entity graph.

    Attributes:
        entity: Entity name/text.
        entity_type: Entity category label.
        docs: Document count for the entity (presumably the number of
            documents mentioning it -- TODO confirm against the API).
        degree: Node degree in the graph; defaults to 0.
        centrality: Centrality score; defaults to 0.0.
        role: Role label for the node; defaults to "peripheral".
    """

    # Required fields.
    entity: str
    entity_type: str
    docs: int

    # Graph statistics, optional.
    degree: int = 0
    centrality: float = 0.0
    role: str = "peripheral"
|
|
|
|
|
|
|
|
@dataclass
class EntityEdge:
    """A connection between two entities of the entity graph.

    Attributes:
        source_entity: Entity at one end of the edge.
        target_entity: Entity at the other end of the edge.
        weight: Integer weight of the connection.
        reason: Textual explanation attached to the connection.
    """

    # All fields are required; positional order is part of the contract.
    source_entity: str
    target_entity: str
    weight: int
    reason: str
|
|
|
|
|
|
|
|
@dataclass
class EntityGraphResult:
    """Outcome of entity extraction: the graph plus accompanying data.

    Attributes:
        nodes: Entity nodes of the graph.
        edges: Connections between entities.
        hubs: Hub records as free-form dicts.
        insights: Free-form insights mapping.

    Every field is optional and defaults to a new empty container.
    """

    # Graph structure.
    nodes: List[EntityNode] = field(default_factory=list)
    edges: List[EntityEdge] = field(default_factory=list)

    # Extra payloads kept as received.
    hubs: List[Dict[str, Any]] = field(default_factory=list)
    insights: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
|
|
@dataclass
class GraphAnalysis:
    """LLM-produced analysis of the graph.

    Attributes:
        analysis: Analysis text.
        key_entities: Names of key entities; defaults to a new empty list.
        relationships: Relationship descriptions; defaults to a new empty
            list.
    """

    analysis: str

    # Each instance gets its own lists.
    key_entities: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
|
|
|
|
|
|
|
|
class AetherMapClient:
    """Async HTTP client for the AetherMap API.

    Features:
    - Document processing (embeddings + clusters)
    - RAG semantic search (FAISS + BM25 + reranking + LLM)
    - Named-entity (NER) extraction
    - LLM-based graph analysis

    Each call opens a short-lived ``httpx.AsyncClient``. The job id returned
    by the most recent ``process_documents`` call is remembered and used as
    the default ``job_id`` by the other methods.
    """

    def __init__(self, base_url: Optional[str] = None, timeout: float = 1800.0):
        """Create a client.

        Args:
            base_url: API root; falls back to ``AETHERMAP_URL`` when not
                given. Trailing slashes are removed.
            timeout: Timeout in seconds passed to every ``httpx.AsyncClient``.
        """
        self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
        self.timeout = timeout
        self._current_job_id: Optional[str] = None

    @property
    def current_job_id(self) -> Optional[str]:
        """Return the job id stored by the last ``process_documents`` call."""
        return self._current_job_id

    def _resolve_job_id(self, job_id: Optional[str]) -> str:
        """Return ``job_id`` or the remembered one; raise ValueError if neither exists."""
        resolved = job_id or self._current_job_id
        if not resolved:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
        return resolved

    async def _post_form(self, endpoint: str, data: Dict[str, str], log_message: str) -> Any:
        """POST form ``data`` to ``{base_url}/{endpoint}/`` and return the decoded JSON body.

        Raises ``Exception`` carrying the status code and response text when
        the status is not 200. httpx errors propagate unchanged.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            logger.info(log_message)
            response = await client.post(f"{self.base_url}/{endpoint}/", data=data)

            if response.status_code != 200:
                raise Exception(
                    f"AetherMap {endpoint} error: {response.status_code} - {response.text}"
                )

            return response.json()

    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """Process a list of texts, generating embeddings and clusters.

        The texts are joined with newlines and uploaded as a single UTF-8
        ``documents.txt`` file to ``/process/``. The returned job id is
        stored on the client for later calls.

        Args:
            texts: Texts/documents to process; must not be empty.
            fast_mode: If True, uses PCA (fast). If False, uses UMAP (precise).
            min_cluster_size: Minimum cluster size (0=auto).
            min_samples: Minimum number of samples (0=auto).

        Returns:
            ProcessResult with the job id and metrics.

        Raises:
            ValueError: If ``texts`` is empty.
            Exception: On timeout, connection failure or a non-200 response.
        """
        if not texts:
            # Fail before any network call instead of uploading an empty file.
            raise ValueError("Nenhum texto fornecido para processamento.")

        # NOTE(review): a text containing newlines spans several lines of the
        # upload while n_samples counts len(texts) -- confirm how the server
        # splits documents.
        payload = "\n".join(texts).encode('utf-8')
        files = {
            'file': ('documents.txt', io.BytesIO(payload), 'text/plain')
        }
        data = {
            'n_samples': str(len(texts)),
            'fast_mode': 'true' if fast_mode else 'false',
            'min_cluster_size': str(min_cluster_size),
            'min_samples': str(min_samples)
        }
        url = f"{self.base_url}/process/"

        # Only the network call is guarded, so an HTTP status error raised
        # below is not logged a second time as an "unexpected" error.
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                logger.info("AetherMap: Processando %d documentos para %s", len(texts), url)
                response = await client.post(url, files=files, data=data)
        except httpx.TimeoutException as exc:
            logger.error("AetherMap: Timeout ao conectar com %s", self.base_url)
            raise Exception(
                "Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos."
            ) from exc
        except httpx.ConnectError as exc:
            logger.error("AetherMap: Erro de conexão: %s", exc)
            raise Exception(f"Erro de conexão com AetherMap: {exc}") from exc
        except Exception as exc:
            logger.error("AetherMap: Erro inesperado: %s", exc)
            raise

        logger.info("AetherMap: Response status %s", response.status_code)

        if response.status_code != 200:
            # Cap the body so a large error page does not flood logs/messages.
            error_text = response.text[:500] if response.text else "No response body"
            logger.error("AetherMap error: %s - %s", response.status_code, error_text)
            raise Exception(f"AetherMap error: {response.status_code} - {error_text}")

        result = response.json()

        self._current_job_id = result.get('job_id')
        metadata = result.get('metadata', {})

        logger.info("AetherMap: Job criado %s", self._current_job_id)

        return ProcessResult(
            job_id=self._current_job_id or "unknown",
            num_documents=metadata.get('num_documents_processed', len(texts)),
            num_clusters=metadata.get('num_clusters_found', 0),
            num_noise=metadata.get('num_noise_points', 0),
            metrics=result.get('metrics', {}),
            cluster_analysis=result.get('cluster_analysis', {})
        )

    async def semantic_search(
        self,
        query: str,
        job_id: Optional[str] = None,
        turbo_mode: bool = False
    ) -> SearchResult:
        """Run a hybrid RAG semantic search over the processed documents.

        Args:
            query: Search term.
            job_id: Job id (if not provided, the last one is used).
            turbo_mode: If True, faster (less precise) search.

        Returns:
            SearchResult with summary and results.

        Raises:
            ValueError: If no job id is available.
            Exception: If the API answers with a non-200 status.
        """
        data = {
            'query': query,
            'job_id': self._resolve_job_id(job_id),
            'turbo_mode': 'true' if turbo_mode else 'false'
        }

        result = await self._post_form("search", data, f"AetherMap: Buscando '{query}'...")

        return SearchResult(
            summary=result.get('summary', ''),
            results=result.get('results', [])
        )

    async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
        """Extract named entities (NER) and build the connection graph.

        Args:
            job_id: Job id (if not provided, the last one is used).

        Returns:
            EntityGraphResult with nodes, edges, hubs and insights.

        Raises:
            ValueError: If no job id is available.
            Exception: If the API answers with a non-200 status.
        """
        data = {'job_id': self._resolve_job_id(job_id)}

        result = await self._post_form(
            "entity_graph", data, "AetherMap: Extraindo entidades..."
        )

        # The API's 'type' key maps onto EntityNode.entity_type.
        nodes = [
            EntityNode(
                entity=n.get('entity', ''),
                entity_type=n.get('type', ''),
                docs=n.get('docs', 0),
                degree=n.get('degree', 0),
                centrality=n.get('centrality', 0.0),
                role=n.get('role', 'peripheral')
            )
            for n in result.get('nodes', [])
        ]

        edges = [
            EntityEdge(
                source_entity=e.get('source_entity', ''),
                target_entity=e.get('target_entity', ''),
                weight=e.get('weight', 0),
                reason=e.get('reason', '')
            )
            for e in result.get('edges', [])
        ]

        return EntityGraphResult(
            nodes=nodes,
            edges=edges,
            hubs=result.get('hubs', []),
            insights=result.get('insights', {})
        )

    async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
        """Use the LLM to analyze the knowledge graph and extract insights.

        Args:
            job_id: Job id (if not provided, the last one is used).

        Returns:
            GraphAnalysis with the textual analysis.

        Raises:
            ValueError: If no job id is available.
            Exception: If the API answers with a non-200 status.
        """
        data = {'job_id': self._resolve_job_id(job_id)}

        result = await self._post_form(
            "analyze_graph", data, "AetherMap: Analisando grafo com LLM..."
        )

        return GraphAnalysis(
            analysis=result.get('analysis', ''),
            key_entities=result.get('key_entities', []),
            relationships=result.get('relationships', [])
        )

    async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """Use the LLM to describe each cluster found.

        Args:
            job_id: Job id (if not provided, the last one is used).

        Returns:
            The decoded JSON response (insights per cluster).

        Raises:
            ValueError: If no job id is available.
            Exception: If the API answers with a non-200 status.
        """
        data = {'job_id': self._resolve_job_id(job_id)}

        return await self._post_form(
            "describe_clusters", data, "AetherMap: Descrevendo clusters..."
        )
|
|
|
|
|
|
|
|
|
|
|
# Module-level shared client instance, created at import time with the
# default base URL (AETHERMAP_URL) and the default timeout.
aethermap = AetherMapClient()
|
|
|