Madras1 committed on
Commit
c817084
·
verified ·
1 Parent(s): 8721c10

Update app/services/aethermap_client.py

Browse files
Files changed (1) hide show
  1. app/services/aethermap_client.py +343 -343
app/services/aethermap_client.py CHANGED
@@ -1,343 +1,343 @@
1
- """
2
- AetherMap Client
3
- Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
4
- """
5
- import httpx
6
- import json
7
- import io
8
- from typing import List, Dict, Any, Optional
9
- from dataclasses import dataclass, field
10
- from datetime import datetime
11
- import logging
12
-
13
- from app.config import settings
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- # URL base do AetherMap (HuggingFace Space)
19
- AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
20
-
21
-
22
- @dataclass
23
- class ProcessResult:
24
- """Resultado do processamento de documentos"""
25
- job_id: str
26
- num_documents: int
27
- num_clusters: int
28
- num_noise: int
29
- metrics: Dict[str, Any] = field(default_factory=dict)
30
- cluster_analysis: Dict[str, Any] = field(default_factory=dict)
31
-
32
-
33
- @dataclass
34
- class SearchResult:
35
- """Resultado de busca semântica"""
36
- summary: str # Resposta RAG gerada pelo LLM
37
- results: List[Dict[str, Any]] = field(default_factory=list)
38
-
39
-
40
- @dataclass
41
- class EntityNode:
42
- """Nó de entidade no grafo"""
43
- entity: str
44
- entity_type: str
45
- docs: int
46
- degree: int = 0
47
- centrality: float = 0.0
48
- role: str = "peripheral" # hub, connector, peripheral
49
-
50
-
51
- @dataclass
52
- class EntityEdge:
53
- """Aresta do grafo de entidades"""
54
- source_entity: str
55
- target_entity: str
56
- weight: int
57
- reason: str
58
-
59
-
60
- @dataclass
61
- class EntityGraphResult:
62
- """Resultado da extração de entidades"""
63
- nodes: List[EntityNode] = field(default_factory=list)
64
- edges: List[EntityEdge] = field(default_factory=list)
65
- hubs: List[Dict[str, Any]] = field(default_factory=list)
66
- insights: Dict[str, Any] = field(default_factory=dict)
67
-
68
-
69
- @dataclass
70
- class GraphAnalysis:
71
- """Análise do grafo via LLM"""
72
- analysis: str
73
- key_entities: List[str] = field(default_factory=list)
74
- relationships: List[str] = field(default_factory=list)
75
-
76
-
77
- class AetherMapClient:
78
- """
79
- Client para AetherMap API.
80
-
81
- Funcionalidades:
82
- - Processamento de documentos (embeddings + clusters)
83
- - Busca semântica RAG (FAISS + BM25 + reranking + LLM)
84
- - Extração de entidades NER
85
- - Análise de grafo via LLM
86
- """
87
-
88
- def __init__(self, base_url: str = None, timeout: float = 600.0):
89
- self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
90
- self.timeout = timeout
91
- self._current_job_id: Optional[str] = None
92
-
93
- @property
94
- def current_job_id(self) -> Optional[str]:
95
- """Retorna o job_id atual"""
96
- return self._current_job_id
97
-
98
- async def process_documents(
99
- self,
100
- texts: List[str],
101
- fast_mode: bool = True,
102
- min_cluster_size: int = 0,
103
- min_samples: int = 0
104
- ) -> ProcessResult:
105
- """
106
- Processa uma lista de textos gerando embeddings e clusters.
107
-
108
- Args:
109
- texts: Lista de textos/documentos
110
- fast_mode: Se True, usa PCA (rápido). Se False, usa UMAP (preciso)
111
- min_cluster_size: Tamanho mínimo do cluster (0=auto)
112
- min_samples: Mínimo de amostras (0=auto)
113
-
114
- Returns:
115
- ProcessResult com job_id e métricas
116
- """
117
- # Criar arquivo TXT em memória
118
- content = "\n".join(texts)
119
- file_bytes = content.encode('utf-8')
120
-
121
- try:
122
- async with httpx.AsyncClient(timeout=self.timeout) as client:
123
- files = {
124
- 'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
125
- }
126
- data = {
127
- 'n_samples': str(len(texts)),
128
- 'fast_mode': 'true' if fast_mode else 'false',
129
- 'min_cluster_size': str(min_cluster_size),
130
- 'min_samples': str(min_samples)
131
- }
132
-
133
- logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")
134
-
135
- response = await client.post(
136
- f"{self.base_url}/process/",
137
- files=files,
138
- data=data
139
- )
140
-
141
- logger.info(f"AetherMap: Response status {response.status_code}")
142
-
143
- if response.status_code != 200:
144
- error_text = response.text[:500] if response.text else "No response body"
145
- logger.error(f"AetherMap error: {response.status_code} - {error_text}")
146
- raise Exception(f"AetherMap error: {response.status_code} - {error_text}")
147
-
148
- result = response.json()
149
-
150
- self._current_job_id = result.get('job_id')
151
- metadata = result.get('metadata', {})
152
-
153
- logger.info(f"AetherMap: Job criado {self._current_job_id}")
154
-
155
- return ProcessResult(
156
- job_id=self._current_job_id or "unknown",
157
- num_documents=metadata.get('num_documents_processed', len(texts)),
158
- num_clusters=metadata.get('num_clusters_found', 0),
159
- num_noise=metadata.get('num_noise_points', 0),
160
- metrics=result.get('metrics', {}),
161
- cluster_analysis=result.get('cluster_analysis', {})
162
- )
163
- except httpx.TimeoutException:
164
- logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
165
- raise Exception(f"Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
166
- except httpx.ConnectError as e:
167
- logger.error(f"AetherMap: Erro de conexão: {e}")
168
- raise Exception(f"Erro de conexão com AetherMap: {e}")
169
- except Exception as e:
170
- logger.error(f"AetherMap: Erro inesperado: {e}")
171
- raise
172
-
173
- async def semantic_search(
174
- self,
175
- query: str,
176
- job_id: str = None,
177
- turbo_mode: bool = False
178
- ) -> SearchResult:
179
- """
180
- Busca semântica RAG híbrida nos documentos processados.
181
-
182
- Args:
183
- query: Termo de busca
184
- job_id: ID do job (se não fornecido, usa o último)
185
- turbo_mode: Se True, busca mais rápida (menos precisa)
186
-
187
- Returns:
188
- SearchResult com resumo e resultados
189
- """
190
- job_id = job_id or self._current_job_id
191
- if not job_id:
192
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
193
-
194
- async with httpx.AsyncClient(timeout=self.timeout) as client:
195
- data = {
196
- 'query': query,
197
- 'job_id': job_id,
198
- 'turbo_mode': 'true' if turbo_mode else 'false'
199
- }
200
-
201
- logger.info(f"AetherMap: Buscando '{query}'...")
202
-
203
- response = await client.post(
204
- f"{self.base_url}/search/",
205
- data=data
206
- )
207
-
208
- if response.status_code != 200:
209
- raise Exception(f"AetherMap search error: {response.status_code} - {response.text}")
210
-
211
- result = response.json()
212
-
213
- return SearchResult(
214
- summary=result.get('summary', ''),
215
- results=result.get('results', [])
216
- )
217
-
218
- async def extract_entities(self, job_id: str = None) -> EntityGraphResult:
219
- """
220
- Extrai entidades nomeadas (NER) e cria grafo de conexões.
221
-
222
- Args:
223
- job_id: ID do job (se não fornecido, usa o último)
224
-
225
- Returns:
226
- EntityGraphResult com nós, arestas e insights
227
- """
228
- job_id = job_id or self._current_job_id
229
- if not job_id:
230
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
231
-
232
- async with httpx.AsyncClient(timeout=self.timeout) as client:
233
- data = {'job_id': job_id}
234
-
235
- logger.info(f"AetherMap: Extraindo entidades...")
236
-
237
- response = await client.post(
238
- f"{self.base_url}/entity_graph/",
239
- data=data
240
- )
241
-
242
- if response.status_code != 200:
243
- raise Exception(f"AetherMap entity_graph error: {response.status_code} - {response.text}")
244
-
245
- result = response.json()
246
-
247
- # Converter para dataclasses
248
- nodes = [
249
- EntityNode(
250
- entity=n.get('entity', ''),
251
- entity_type=n.get('type', ''),
252
- docs=n.get('docs', 0),
253
- degree=n.get('degree', 0),
254
- centrality=n.get('centrality', 0.0),
255
- role=n.get('role', 'peripheral')
256
- )
257
- for n in result.get('nodes', [])
258
- ]
259
-
260
- edges = [
261
- EntityEdge(
262
- source_entity=e.get('source_entity', ''),
263
- target_entity=e.get('target_entity', ''),
264
- weight=e.get('weight', 0),
265
- reason=e.get('reason', '')
266
- )
267
- for e in result.get('edges', [])
268
- ]
269
-
270
- return EntityGraphResult(
271
- nodes=nodes,
272
- edges=edges,
273
- hubs=result.get('hubs', []),
274
- insights=result.get('insights', {})
275
- )
276
-
277
- async def analyze_graph(self, job_id: str = None) -> GraphAnalysis:
278
- """
279
- Usa LLM para analisar o Knowledge Graph e extrair insights.
280
-
281
- Args:
282
- job_id: ID do job (se não fornecido, usa o último)
283
-
284
- Returns:
285
- GraphAnalysis com análise textual
286
- """
287
- job_id = job_id or self._current_job_id
288
- if not job_id:
289
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
290
-
291
- async with httpx.AsyncClient(timeout=self.timeout) as client:
292
- data = {'job_id': job_id}
293
-
294
- logger.info(f"AetherMap: Analisando grafo com LLM...")
295
-
296
- response = await client.post(
297
- f"{self.base_url}/analyze_graph/",
298
- data=data
299
- )
300
-
301
- if response.status_code != 200:
302
- raise Exception(f"AetherMap analyze_graph error: {response.status_code} - {response.text}")
303
-
304
- result = response.json()
305
-
306
- return GraphAnalysis(
307
- analysis=result.get('analysis', ''),
308
- key_entities=result.get('key_entities', []),
309
- relationships=result.get('relationships', [])
310
- )
311
-
312
- async def describe_clusters(self, job_id: str = None) -> Dict[str, Any]:
313
- """
314
- Usa LLM para descrever cada cluster encontrado.
315
-
316
- Args:
317
- job_id: ID do job (se não fornecido, usa o último)
318
-
319
- Returns:
320
- Dict com insights por cluster
321
- """
322
- job_id = job_id or self._current_job_id
323
- if not job_id:
324
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
325
-
326
- async with httpx.AsyncClient(timeout=self.timeout) as client:
327
- data = {'job_id': job_id}
328
-
329
- logger.info(f"AetherMap: Descrevendo clusters...")
330
-
331
- response = await client.post(
332
- f"{self.base_url}/describe_clusters/",
333
- data=data
334
- )
335
-
336
- if response.status_code != 200:
337
- raise Exception(f"AetherMap describe_clusters error: {response.status_code} - {response.text}")
338
-
339
- return response.json()
340
-
341
-
342
- # Instância global do client
343
- aethermap = AetherMapClient()
 
1
+ """
2
+ AetherMap Client
3
+ Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
4
+ """
5
+ import httpx
6
+ import json
7
+ import io
8
+ from typing import List, Dict, Any, Optional
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ import logging
12
+
13
+ from app.config import settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # URL base do AetherMap (HuggingFace Space)
19
+ AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
20
+
21
+
22
@dataclass
class ProcessResult:
    """Result of a document-processing run (embeddings + clustering)."""
    job_id: str  # server-assigned identifier for the processed batch
    num_documents: int  # number of documents actually processed
    num_clusters: int  # number of clusters found
    num_noise: int  # number of noise points (documents outside any cluster)
    metrics: Dict[str, Any] = field(default_factory=dict)  # raw metrics dict from the API
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)  # raw per-cluster analysis from the API
31
+
32
+
33
@dataclass
class SearchResult:
    """Result of a semantic search query."""
    summary: str  # RAG answer generated by the LLM
    results: List[Dict[str, Any]] = field(default_factory=list)  # raw hit dicts from the API
38
+
39
+
40
@dataclass
class EntityNode:
    """Entity node in the entity graph."""
    entity: str  # entity surface form
    entity_type: str  # NER type label reported by the API
    docs: int  # number of documents the entity appears in
    degree: int = 0  # number of edges touching this node
    centrality: float = 0.0  # centrality score reported by the API
    role: str = "peripheral"  # one of: hub, connector, peripheral
49
+
50
+
51
@dataclass
class EntityEdge:
    """Edge of the entity graph."""
    source_entity: str
    target_entity: str
    weight: int  # edge weight reported by the API
    reason: str  # textual explanation of why the two entities are linked
58
+
59
+
60
@dataclass
class EntityGraphResult:
    """Result of named-entity (NER) extraction."""
    nodes: List[EntityNode] = field(default_factory=list)
    edges: List[EntityEdge] = field(default_factory=list)
    hubs: List[Dict[str, Any]] = field(default_factory=list)  # raw hub entries from the API
    insights: Dict[str, Any] = field(default_factory=dict)  # raw insights dict from the API
67
+
68
+
69
@dataclass
class GraphAnalysis:
    """LLM-generated analysis of the knowledge graph."""
    analysis: str  # free-text analysis produced by the LLM
    key_entities: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
75
+
76
+
77
class AetherMapClient:
    """
    Client for the AetherMap API (HuggingFace Space).

    Features:
    - Document processing (embeddings + clusters)
    - Hybrid RAG semantic search (FAISS + BM25 + reranking + LLM)
    - Named-entity (NER) extraction
    - LLM-based graph analysis
    """

    def __init__(self, base_url: Optional[str] = None, timeout: float = 1800.0):
        """
        Args:
            base_url: API base URL; defaults to AETHERMAP_URL from settings.
            timeout: Per-request timeout in seconds. Generous by default because
                the Space may need to cold-start and processing can be slow.
        """
        # Normalize so endpoint paths can always be appended with a single '/'.
        self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
        self.timeout = timeout
        self._current_job_id: Optional[str] = None

    @property
    def current_job_id(self) -> Optional[str]:
        """job_id of the most recent processing run, or None if none yet."""
        return self._current_job_id

    def _resolve_job_id(self, job_id: Optional[str]) -> str:
        """Return the explicit job_id or fall back to the last stored one.

        Raises:
            ValueError: if neither an explicit nor a stored job_id exists.
        """
        job_id = job_id or self._current_job_id
        if not job_id:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
        return job_id

    async def _post_form(self, endpoint: str, data: Dict[str, str]) -> Dict[str, Any]:
        """POST form data to ``{base_url}/{endpoint}/`` and return the parsed JSON.

        Args:
            endpoint: Endpoint name without slashes (also used in error messages).
            data: Form fields to send.

        Raises:
            Exception: on any non-200 response.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(f"{self.base_url}/{endpoint}/", data=data)
            if response.status_code != 200:
                raise Exception(f"AetherMap {endpoint} error: {response.status_code} - {response.text}")
            return response.json()

    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """
        Process a list of texts, generating embeddings and clusters.

        Args:
            texts: List of texts/documents.
            fast_mode: If True, use PCA (fast). If False, use UMAP (precise).
            min_cluster_size: Minimum cluster size (0 = auto).
            min_samples: Minimum number of samples (0 = auto).

        Returns:
            ProcessResult with job_id and metrics.

        Raises:
            Exception: on HTTP errors, timeouts, or connection failures.
        """
        # Build an in-memory TXT upload: one document per line.
        content = "\n".join(texts)
        file_bytes = content.encode('utf-8')

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                files = {
                    'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
                }
                data = {
                    'n_samples': str(len(texts)),
                    'fast_mode': 'true' if fast_mode else 'false',
                    'min_cluster_size': str(min_cluster_size),
                    'min_samples': str(min_samples)
                }

                logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")

                response = await client.post(
                    f"{self.base_url}/process/",
                    files=files,
                    data=data
                )

                logger.info(f"AetherMap: Response status {response.status_code}")

                if response.status_code != 200:
                    error_text = response.text[:500] if response.text else "No response body"
                    logger.error(f"AetherMap error: {response.status_code} - {error_text}")
                    raise Exception(f"AetherMap error: {response.status_code} - {error_text}")

                result = response.json()

                # Remember the job_id so follow-up calls can omit it.
                self._current_job_id = result.get('job_id')
                metadata = result.get('metadata', {})

                logger.info(f"AetherMap: Job criado {self._current_job_id}")

                return ProcessResult(
                    job_id=self._current_job_id or "unknown",
                    num_documents=metadata.get('num_documents_processed', len(texts)),
                    num_clusters=metadata.get('num_clusters_found', 0),
                    num_noise=metadata.get('num_noise_points', 0),
                    metrics=result.get('metrics', {}),
                    cluster_analysis=result.get('cluster_analysis', {})
                )
        except httpx.TimeoutException:
            logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
            raise Exception("Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
        except httpx.ConnectError as e:
            logger.error(f"AetherMap: Erro de conexão: {e}")
            raise Exception(f"Erro de conexão com AetherMap: {e}")
        except Exception as e:
            logger.error(f"AetherMap: Erro inesperado: {e}")
            raise

    async def semantic_search(
        self,
        query: str,
        job_id: Optional[str] = None,
        turbo_mode: bool = False
    ) -> SearchResult:
        """
        Hybrid RAG semantic search over the processed documents.

        Args:
            query: Search term.
            job_id: Job ID (defaults to the last processed job).
            turbo_mode: If True, faster but less precise search.

        Returns:
            SearchResult with summary and result hits.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info(f"AetherMap: Buscando '{query}'...")

        result = await self._post_form('search', {
            'query': query,
            'job_id': job_id,
            'turbo_mode': 'true' if turbo_mode else 'false'
        })

        return SearchResult(
            summary=result.get('summary', ''),
            results=result.get('results', [])
        )

    async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
        """
        Extract named entities (NER) and build the connection graph.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            EntityGraphResult with nodes, edges and insights.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Extraindo entidades...")

        result = await self._post_form('entity_graph', {'job_id': job_id})

        # Convert raw API dicts into dataclasses.
        nodes = [
            EntityNode(
                entity=n.get('entity', ''),
                entity_type=n.get('type', ''),
                docs=n.get('docs', 0),
                degree=n.get('degree', 0),
                centrality=n.get('centrality', 0.0),
                role=n.get('role', 'peripheral')
            )
            for n in result.get('nodes', [])
        ]

        edges = [
            EntityEdge(
                source_entity=e.get('source_entity', ''),
                target_entity=e.get('target_entity', ''),
                weight=e.get('weight', 0),
                reason=e.get('reason', '')
            )
            for e in result.get('edges', [])
        ]

        return EntityGraphResult(
            nodes=nodes,
            edges=edges,
            hubs=result.get('hubs', []),
            insights=result.get('insights', {})
        )

    async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
        """
        Use the LLM to analyze the Knowledge Graph and extract insights.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            GraphAnalysis with the textual analysis.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Analisando grafo com LLM...")

        result = await self._post_form('analyze_graph', {'job_id': job_id})

        return GraphAnalysis(
            analysis=result.get('analysis', ''),
            key_entities=result.get('key_entities', []),
            relationships=result.get('relationships', [])
        )

    async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Use the LLM to describe each cluster that was found.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            Dict with per-cluster insights (raw API response).

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Descrevendo clusters...")

        return await self._post_form('describe_clusters', {'job_id': job_id})
340
+
341
+
342
# Module-level singleton client shared across the app.
aethermap = AetherMapClient()