Madras1 committed on
Commit
c817084
·
verified ·
1 Parent(s): 8721c10

Update app/services/aethermap_client.py

Browse files
Files changed (1) hide show
  1. app/services/aethermap_client.py +343 -343
app/services/aethermap_client.py CHANGED
@@ -1,343 +1,343 @@
1
- """
2
- AetherMap Client
3
- Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
4
- """
5
- import httpx
6
- import json
7
- import io
8
- from typing import List, Dict, Any, Optional
9
- from dataclasses import dataclass, field
10
- from datetime import datetime
11
- import logging
12
-
13
- from app.config import settings
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- # URL base do AetherMap (HuggingFace Space)
19
- AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
20
-
21
-
22
- @dataclass
23
- class ProcessResult:
24
- """Resultado do processamento de documentos"""
25
- job_id: str
26
- num_documents: int
27
- num_clusters: int
28
- num_noise: int
29
- metrics: Dict[str, Any] = field(default_factory=dict)
30
- cluster_analysis: Dict[str, Any] = field(default_factory=dict)
31
-
32
-
33
- @dataclass
34
- class SearchResult:
35
- """Resultado de busca semântica"""
36
- summary: str # Resposta RAG gerada pelo LLM
37
- results: List[Dict[str, Any]] = field(default_factory=list)
38
-
39
-
40
- @dataclass
41
- class EntityNode:
42
- """Nó de entidade no grafo"""
43
- entity: str
44
- entity_type: str
45
- docs: int
46
- degree: int = 0
47
- centrality: float = 0.0
48
- role: str = "peripheral" # hub, connector, peripheral
49
-
50
-
51
- @dataclass
52
- class EntityEdge:
53
- """Aresta do grafo de entidades"""
54
- source_entity: str
55
- target_entity: str
56
- weight: int
57
- reason: str
58
-
59
-
60
- @dataclass
61
- class EntityGraphResult:
62
- """Resultado da extração de entidades"""
63
- nodes: List[EntityNode] = field(default_factory=list)
64
- edges: List[EntityEdge] = field(default_factory=list)
65
- hubs: List[Dict[str, Any]] = field(default_factory=list)
66
- insights: Dict[str, Any] = field(default_factory=dict)
67
-
68
-
69
- @dataclass
70
- class GraphAnalysis:
71
- """Análise do grafo via LLM"""
72
- analysis: str
73
- key_entities: List[str] = field(default_factory=list)
74
- relationships: List[str] = field(default_factory=list)
75
-
76
-
77
- class AetherMapClient:
78
- """
79
- Client para AetherMap API.
80
-
81
- Funcionalidades:
82
- - Processamento de documentos (embeddings + clusters)
83
- - Busca semântica RAG (FAISS + BM25 + reranking + LLM)
84
- - Extração de entidades NER
85
- - Análise de grafo via LLM
86
- """
87
-
88
- def __init__(self, base_url: str = None, timeout: float = 600.0):
89
- self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
90
- self.timeout = timeout
91
- self._current_job_id: Optional[str] = None
92
-
93
- @property
94
- def current_job_id(self) -> Optional[str]:
95
- """Retorna o job_id atual"""
96
- return self._current_job_id
97
-
98
- async def process_documents(
99
- self,
100
- texts: List[str],
101
- fast_mode: bool = True,
102
- min_cluster_size: int = 0,
103
- min_samples: int = 0
104
- ) -> ProcessResult:
105
- """
106
- Processa uma lista de textos gerando embeddings e clusters.
107
-
108
- Args:
109
- texts: Lista de textos/documentos
110
- fast_mode: Se True, usa PCA (rápido). Se False, usa UMAP (preciso)
111
- min_cluster_size: Tamanho mínimo do cluster (0=auto)
112
- min_samples: Mínimo de amostras (0=auto)
113
-
114
- Returns:
115
- ProcessResult com job_id e métricas
116
- """
117
- # Criar arquivo TXT em memória
118
- content = "\n".join(texts)
119
- file_bytes = content.encode('utf-8')
120
-
121
- try:
122
- async with httpx.AsyncClient(timeout=self.timeout) as client:
123
- files = {
124
- 'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
125
- }
126
- data = {
127
- 'n_samples': str(len(texts)),
128
- 'fast_mode': 'true' if fast_mode else 'false',
129
- 'min_cluster_size': str(min_cluster_size),
130
- 'min_samples': str(min_samples)
131
- }
132
-
133
- logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")
134
-
135
- response = await client.post(
136
- f"{self.base_url}/process/",
137
- files=files,
138
- data=data
139
- )
140
-
141
- logger.info(f"AetherMap: Response status {response.status_code}")
142
-
143
- if response.status_code != 200:
144
- error_text = response.text[:500] if response.text else "No response body"
145
- logger.error(f"AetherMap error: {response.status_code} - {error_text}")
146
- raise Exception(f"AetherMap error: {response.status_code} - {error_text}")
147
-
148
- result = response.json()
149
-
150
- self._current_job_id = result.get('job_id')
151
- metadata = result.get('metadata', {})
152
-
153
- logger.info(f"AetherMap: Job criado {self._current_job_id}")
154
-
155
- return ProcessResult(
156
- job_id=self._current_job_id or "unknown",
157
- num_documents=metadata.get('num_documents_processed', len(texts)),
158
- num_clusters=metadata.get('num_clusters_found', 0),
159
- num_noise=metadata.get('num_noise_points', 0),
160
- metrics=result.get('metrics', {}),
161
- cluster_analysis=result.get('cluster_analysis', {})
162
- )
163
- except httpx.TimeoutException:
164
- logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
165
- raise Exception(f"Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
166
- except httpx.ConnectError as e:
167
- logger.error(f"AetherMap: Erro de conexão: {e}")
168
- raise Exception(f"Erro de conexão com AetherMap: {e}")
169
- except Exception as e:
170
- logger.error(f"AetherMap: Erro inesperado: {e}")
171
- raise
172
-
173
- async def semantic_search(
174
- self,
175
- query: str,
176
- job_id: str = None,
177
- turbo_mode: bool = False
178
- ) -> SearchResult:
179
- """
180
- Busca semântica RAG híbrida nos documentos processados.
181
-
182
- Args:
183
- query: Termo de busca
184
- job_id: ID do job (se não fornecido, usa o último)
185
- turbo_mode: Se True, busca mais rápida (menos precisa)
186
-
187
- Returns:
188
- SearchResult com resumo e resultados
189
- """
190
- job_id = job_id or self._current_job_id
191
- if not job_id:
192
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
193
-
194
- async with httpx.AsyncClient(timeout=self.timeout) as client:
195
- data = {
196
- 'query': query,
197
- 'job_id': job_id,
198
- 'turbo_mode': 'true' if turbo_mode else 'false'
199
- }
200
-
201
- logger.info(f"AetherMap: Buscando '{query}'...")
202
-
203
- response = await client.post(
204
- f"{self.base_url}/search/",
205
- data=data
206
- )
207
-
208
- if response.status_code != 200:
209
- raise Exception(f"AetherMap search error: {response.status_code} - {response.text}")
210
-
211
- result = response.json()
212
-
213
- return SearchResult(
214
- summary=result.get('summary', ''),
215
- results=result.get('results', [])
216
- )
217
-
218
- async def extract_entities(self, job_id: str = None) -> EntityGraphResult:
219
- """
220
- Extrai entidades nomeadas (NER) e cria grafo de conexões.
221
-
222
- Args:
223
- job_id: ID do job (se não fornecido, usa o último)
224
-
225
- Returns:
226
- EntityGraphResult com nós, arestas e insights
227
- """
228
- job_id = job_id or self._current_job_id
229
- if not job_id:
230
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
231
-
232
- async with httpx.AsyncClient(timeout=self.timeout) as client:
233
- data = {'job_id': job_id}
234
-
235
- logger.info(f"AetherMap: Extraindo entidades...")
236
-
237
- response = await client.post(
238
- f"{self.base_url}/entity_graph/",
239
- data=data
240
- )
241
-
242
- if response.status_code != 200:
243
- raise Exception(f"AetherMap entity_graph error: {response.status_code} - {response.text}")
244
-
245
- result = response.json()
246
-
247
- # Converter para dataclasses
248
- nodes = [
249
- EntityNode(
250
- entity=n.get('entity', ''),
251
- entity_type=n.get('type', ''),
252
- docs=n.get('docs', 0),
253
- degree=n.get('degree', 0),
254
- centrality=n.get('centrality', 0.0),
255
- role=n.get('role', 'peripheral')
256
- )
257
- for n in result.get('nodes', [])
258
- ]
259
-
260
- edges = [
261
- EntityEdge(
262
- source_entity=e.get('source_entity', ''),
263
- target_entity=e.get('target_entity', ''),
264
- weight=e.get('weight', 0),
265
- reason=e.get('reason', '')
266
- )
267
- for e in result.get('edges', [])
268
- ]
269
-
270
- return EntityGraphResult(
271
- nodes=nodes,
272
- edges=edges,
273
- hubs=result.get('hubs', []),
274
- insights=result.get('insights', {})
275
- )
276
-
277
- async def analyze_graph(self, job_id: str = None) -> GraphAnalysis:
278
- """
279
- Usa LLM para analisar o Knowledge Graph e extrair insights.
280
-
281
- Args:
282
- job_id: ID do job (se não fornecido, usa o último)
283
-
284
- Returns:
285
- GraphAnalysis com análise textual
286
- """
287
- job_id = job_id or self._current_job_id
288
- if not job_id:
289
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
290
-
291
- async with httpx.AsyncClient(timeout=self.timeout) as client:
292
- data = {'job_id': job_id}
293
-
294
- logger.info(f"AetherMap: Analisando grafo com LLM...")
295
-
296
- response = await client.post(
297
- f"{self.base_url}/analyze_graph/",
298
- data=data
299
- )
300
-
301
- if response.status_code != 200:
302
- raise Exception(f"AetherMap analyze_graph error: {response.status_code} - {response.text}")
303
-
304
- result = response.json()
305
-
306
- return GraphAnalysis(
307
- analysis=result.get('analysis', ''),
308
- key_entities=result.get('key_entities', []),
309
- relationships=result.get('relationships', [])
310
- )
311
-
312
- async def describe_clusters(self, job_id: str = None) -> Dict[str, Any]:
313
- """
314
- Usa LLM para descrever cada cluster encontrado.
315
-
316
- Args:
317
- job_id: ID do job (se não fornecido, usa o último)
318
-
319
- Returns:
320
- Dict com insights por cluster
321
- """
322
- job_id = job_id or self._current_job_id
323
- if not job_id:
324
- raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
325
-
326
- async with httpx.AsyncClient(timeout=self.timeout) as client:
327
- data = {'job_id': job_id}
328
-
329
- logger.info(f"AetherMap: Descrevendo clusters...")
330
-
331
- response = await client.post(
332
- f"{self.base_url}/describe_clusters/",
333
- data=data
334
- )
335
-
336
- if response.status_code != 200:
337
- raise Exception(f"AetherMap describe_clusters error: {response.status_code} - {response.text}")
338
-
339
- return response.json()
340
-
341
-
342
- # Instância global do client
343
- aethermap = AetherMapClient()
 
1
+ """
2
+ AetherMap Client
3
+ Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
4
+ """
5
+ import httpx
6
+ import json
7
+ import io
8
+ from typing import List, Dict, Any, Optional
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ import logging
12
+
13
+ from app.config import settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # URL base do AetherMap (HuggingFace Space)
19
+ AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
20
+
21
+
22
@dataclass
class ProcessResult:
    """Result of a document-processing run (embeddings + clustering)."""
    job_id: str  # server-assigned identifier for the processed batch
    num_documents: int  # number of documents actually processed
    num_clusters: int  # number of clusters found
    num_noise: int  # number of noise points (documents outside any cluster)
    metrics: Dict[str, Any] = field(default_factory=dict)  # raw metrics dict from the API
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)  # raw per-cluster analysis from the API
31
+
32
+
33
@dataclass
class SearchResult:
    """Result of a semantic search query."""
    summary: str  # RAG answer generated by the LLM
    results: List[Dict[str, Any]] = field(default_factory=list)  # raw hit dicts from the API
38
+
39
+
40
@dataclass
class EntityNode:
    """Entity node in the entity graph."""
    entity: str  # entity surface form
    entity_type: str  # NER type label reported by the API
    docs: int  # number of documents the entity appears in
    degree: int = 0  # number of edges touching this node
    centrality: float = 0.0  # centrality score reported by the API
    role: str = "peripheral"  # one of: hub, connector, peripheral
49
+
50
+
51
@dataclass
class EntityEdge:
    """Edge of the entity graph."""
    source_entity: str
    target_entity: str
    weight: int  # edge weight reported by the API
    reason: str  # textual explanation of why the two entities are linked
58
+
59
+
60
@dataclass
class EntityGraphResult:
    """Result of named-entity (NER) extraction."""
    nodes: List[EntityNode] = field(default_factory=list)
    edges: List[EntityEdge] = field(default_factory=list)
    hubs: List[Dict[str, Any]] = field(default_factory=list)  # raw hub entries from the API
    insights: Dict[str, Any] = field(default_factory=dict)  # raw insights dict from the API
67
+
68
+
69
@dataclass
class GraphAnalysis:
    """LLM-generated analysis of the knowledge graph."""
    analysis: str  # free-text analysis produced by the LLM
    key_entities: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
75
+
76
+
77
class AetherMapClient:
    """
    Client for the AetherMap API (HuggingFace Space).

    Features:
    - Document processing (embeddings + clusters)
    - Hybrid RAG semantic search (FAISS + BM25 + reranking + LLM)
    - Named-entity (NER) extraction
    - LLM-based graph analysis
    """

    def __init__(self, base_url: Optional[str] = None, timeout: float = 1800.0):
        """
        Args:
            base_url: API base URL; defaults to AETHERMAP_URL from settings.
            timeout: Per-request timeout in seconds. Generous by default because
                the Space may need to cold-start and processing can be slow.
        """
        # Normalize so endpoint paths can always be appended with a single '/'.
        self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
        self.timeout = timeout
        self._current_job_id: Optional[str] = None

    @property
    def current_job_id(self) -> Optional[str]:
        """job_id of the most recent processing run, or None if none yet."""
        return self._current_job_id

    def _resolve_job_id(self, job_id: Optional[str]) -> str:
        """Return the explicit job_id or fall back to the last stored one.

        Raises:
            ValueError: if neither an explicit nor a stored job_id exists.
        """
        job_id = job_id or self._current_job_id
        if not job_id:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
        return job_id

    async def _post_form(self, endpoint: str, data: Dict[str, str]) -> Dict[str, Any]:
        """POST form data to ``{base_url}/{endpoint}/`` and return the parsed JSON.

        Args:
            endpoint: Endpoint name without slashes (also used in error messages).
            data: Form fields to send.

        Raises:
            Exception: on any non-200 response.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(f"{self.base_url}/{endpoint}/", data=data)
            if response.status_code != 200:
                raise Exception(f"AetherMap {endpoint} error: {response.status_code} - {response.text}")
            return response.json()

    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """
        Process a list of texts, generating embeddings and clusters.

        Args:
            texts: List of texts/documents.
            fast_mode: If True, use PCA (fast). If False, use UMAP (precise).
            min_cluster_size: Minimum cluster size (0 = auto).
            min_samples: Minimum number of samples (0 = auto).

        Returns:
            ProcessResult with job_id and metrics.

        Raises:
            Exception: on HTTP errors, timeouts, or connection failures.
        """
        # Build an in-memory TXT upload: one document per line.
        content = "\n".join(texts)
        file_bytes = content.encode('utf-8')

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                files = {
                    'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
                }
                data = {
                    'n_samples': str(len(texts)),
                    'fast_mode': 'true' if fast_mode else 'false',
                    'min_cluster_size': str(min_cluster_size),
                    'min_samples': str(min_samples)
                }

                logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")

                response = await client.post(
                    f"{self.base_url}/process/",
                    files=files,
                    data=data
                )

                logger.info(f"AetherMap: Response status {response.status_code}")

                if response.status_code != 200:
                    error_text = response.text[:500] if response.text else "No response body"
                    logger.error(f"AetherMap error: {response.status_code} - {error_text}")
                    raise Exception(f"AetherMap error: {response.status_code} - {error_text}")

                result = response.json()

                # Remember the job_id so follow-up calls can omit it.
                self._current_job_id = result.get('job_id')
                metadata = result.get('metadata', {})

                logger.info(f"AetherMap: Job criado {self._current_job_id}")

                return ProcessResult(
                    job_id=self._current_job_id or "unknown",
                    num_documents=metadata.get('num_documents_processed', len(texts)),
                    num_clusters=metadata.get('num_clusters_found', 0),
                    num_noise=metadata.get('num_noise_points', 0),
                    metrics=result.get('metrics', {}),
                    cluster_analysis=result.get('cluster_analysis', {})
                )
        except httpx.TimeoutException:
            logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
            raise Exception("Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
        except httpx.ConnectError as e:
            logger.error(f"AetherMap: Erro de conexão: {e}")
            raise Exception(f"Erro de conexão com AetherMap: {e}")
        except Exception as e:
            logger.error(f"AetherMap: Erro inesperado: {e}")
            raise

    async def semantic_search(
        self,
        query: str,
        job_id: Optional[str] = None,
        turbo_mode: bool = False
    ) -> SearchResult:
        """
        Hybrid RAG semantic search over the processed documents.

        Args:
            query: Search term.
            job_id: Job ID (defaults to the last processed job).
            turbo_mode: If True, faster but less precise search.

        Returns:
            SearchResult with summary and result hits.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info(f"AetherMap: Buscando '{query}'...")

        result = await self._post_form('search', {
            'query': query,
            'job_id': job_id,
            'turbo_mode': 'true' if turbo_mode else 'false'
        })

        return SearchResult(
            summary=result.get('summary', ''),
            results=result.get('results', [])
        )

    async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
        """
        Extract named entities (NER) and build the connection graph.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            EntityGraphResult with nodes, edges and insights.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Extraindo entidades...")

        result = await self._post_form('entity_graph', {'job_id': job_id})

        # Convert raw API dicts into dataclasses.
        nodes = [
            EntityNode(
                entity=n.get('entity', ''),
                entity_type=n.get('type', ''),
                docs=n.get('docs', 0),
                degree=n.get('degree', 0),
                centrality=n.get('centrality', 0.0),
                role=n.get('role', 'peripheral')
            )
            for n in result.get('nodes', [])
        ]

        edges = [
            EntityEdge(
                source_entity=e.get('source_entity', ''),
                target_entity=e.get('target_entity', ''),
                weight=e.get('weight', 0),
                reason=e.get('reason', '')
            )
            for e in result.get('edges', [])
        ]

        return EntityGraphResult(
            nodes=nodes,
            edges=edges,
            hubs=result.get('hubs', []),
            insights=result.get('insights', {})
        )

    async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
        """
        Use the LLM to analyze the Knowledge Graph and extract insights.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            GraphAnalysis with the textual analysis.

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Analisando grafo com LLM...")

        result = await self._post_form('analyze_graph', {'job_id': job_id})

        return GraphAnalysis(
            analysis=result.get('analysis', ''),
            key_entities=result.get('key_entities', []),
            relationships=result.get('relationships', [])
        )

    async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Use the LLM to describe each cluster that was found.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            Dict with per-cluster insights (raw API response).

        Raises:
            ValueError: if no job_id is available.
            Exception: on HTTP errors.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Descrevendo clusters...")

        return await self._post_form('describe_clusters', {'job_id': job_id})
340
+
341
+
342
# Module-level singleton client shared across the app.
aethermap = AetherMapClient()