Madras1 committed on
Commit
d2fb3fb
·
verified ·
1 Parent(s): 8f5a30e

Upload 79 files

Browse files
app/api/routes/aethermap.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AetherMap Routes - Document Mapping & Semantic Search
3
+ Integrates with AetherMap API for document clustering, NER, and semantic search.
4
+ """
5
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
6
+ from pydantic import BaseModel, Field
7
+ from typing import Optional, List, Dict, Any
8
+ from sqlalchemy.orm import Session
9
+ import io
10
+
11
+ from app.api.deps import get_scoped_db
12
+ from app.services.aethermap_client import aethermap, ProcessResult, SearchResult, EntityGraphResult
13
+
14
+
15
+ router = APIRouter()
16
+
17
+
18
+ # ============================================================================
19
+ # Request/Response Models
20
+ # ============================================================================
21
+
22
class IndexDocumentsRequest(BaseModel):
    """Request body for POST /index: raw texts to embed, cluster and index."""
    # One string per document; must be non-empty (handler rejects [] with 400).
    documents: List[str] = Field(..., description="Lista de textos para indexar")
    # True -> PCA projection (fast); False -> UMAP (slower, more precise).
    fast_mode: bool = Field(True, description="Modo rápido (PCA) ou preciso (UMAP)")
26
+
27
+
28
class IndexEntitiesRequest(BaseModel):
    """Request body for POST /index-entities: select NUMIDIUM entities to index."""
    # Optional whitelist of Entity.type values; None means all types.
    entity_types: Optional[List[str]] = Field(None, description="Filtrar por tipos de entidade")
    # Maximum number of entities pulled from the database.
    limit: int = Field(500, description="Limite de entidades")
    # New, backward-compatible field: the /index-entities handler already
    # probed `hasattr(request, 'fast_mode')`, but since the model never
    # declared the field that probe was always False and clients could not
    # choose the mode. Default True preserves the previous behavior.
    fast_mode: bool = Field(True, description="Modo rápido (PCA) ou preciso (UMAP)")
32
+
33
+
34
class SemanticSearchRequest(BaseModel):
    """Request body for POST /search: semantic query over indexed documents."""
    # Free-text query answered by the hybrid RAG pipeline.
    query: str = Field(..., description="Termo de busca")
    # True -> faster, less precise search path.
    turbo_mode: bool = Field(True, description="Modo turbo (mais rápido)")
38
+
39
+
40
class IndexResponse(BaseModel):
    """Summary of an indexing run returned by AetherMap."""
    # Job identifier to reuse in follow-up search/NER calls.
    job_id: str
    num_documents: int
    num_clusters: int
    # Documents HDBSCAN left unclustered (noise points).
    num_noise: int
    # Pydantic deep-copies field defaults per instance, so {} is safe here.
    metrics: Dict[str, Any] = {}
    cluster_analysis: Dict[str, Any] = {}
48
+
49
+
50
class SearchResponse(BaseModel):
    """Response for POST /search: LLM summary plus supporting hits."""
    # RAG answer generated by the LLM.
    summary: str
    # Raw result entries as returned by AetherMap (schema owned upstream).
    results: List[Dict[str, Any]] = []
54
+
55
+
56
class EntityGraphResponse(BaseModel):
    """Response for POST /entities: NER graph summary."""
    # Most-connected entities, as dicts from the upstream API.
    hubs: List[Dict[str, Any]] = []
    insights: Dict[str, Any] = {}
    node_count: int = 0
    edge_count: int = 0
62
+
63
+
64
class StatusResponse(BaseModel):
    """Response for GET /status: AetherMap connection state."""
    connected: bool
    # Last processing job id cached on the client, if any.
    job_id: Optional[str] = None
    # Not tracked yet; always 0 (see TODO in the handler).
    documents_indexed: int = 0
69
+
70
+
71
+ # ============================================================================
72
+ # Endpoints
73
+ # ============================================================================
74
+
75
@router.get("/status", response_model=StatusResponse)
async def get_status():
    """Report the AetherMap connection status and the active job, if any."""
    # documents_indexed is not tracked yet (TODO), so it is reported as 0.
    status = StatusResponse(
        connected=True,
        job_id=aethermap.current_job_id,
        documents_indexed=0,
    )
    return status
85
+
86
+
87
@router.post("/index", response_model=IndexResponse)
async def index_documents(request: IndexDocumentsRequest):
    """
    Index a list of documents for semantic search.

    The documents will be:
    - Embedded using sentence transformers
    - Clustered using HDBSCAN
    - Indexed in FAISS + BM25 for hybrid search

    Raises:
        HTTPException 400: empty document list.
        HTTPException 500: AetherMap processing failure.
    """
    try:
        if not request.documents:
            raise HTTPException(status_code=400, detail="Nenhum documento fornecido")

        result = await aethermap.process_documents(
            texts=request.documents,
            fast_mode=request.fast_mode
        )

        return IndexResponse(
            job_id=result.job_id,
            num_documents=result.num_documents,
            num_clusters=result.num_clusters,
            num_noise=result.num_noise,
            metrics=result.metrics,
            cluster_analysis=result.cluster_analysis
        )

    except HTTPException:
        # Bug fix: without this re-raise the 400 above was swallowed by the
        # generic handler below and returned to clients as a 500. This also
        # matches the pattern used by the sibling endpoints in this file.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
117
+
118
+
119
@router.post("/index-entities", response_model=IndexResponse)
async def index_entities(
    request: IndexEntitiesRequest,
    db: Session = Depends(get_scoped_db)
):
    """
    Index entities from NUMIDIUM database.

    Collects entity names and descriptions (descriptions truncated to 1000
    chars) and sends them to AetherMap for embedding + clustering.

    Raises:
        HTTPException 404: no entities match the filter.
        HTTPException 500: AetherMap processing failure.
    """
    # Local import — presumably to avoid an import cycle at module load; confirm.
    from app.models.entity import Entity

    try:
        query = db.query(Entity)

        if request.entity_types:
            query = query.filter(Entity.type.in_(request.entity_types))

        entities = query.limit(request.limit).all()

        if not entities:
            raise HTTPException(status_code=404, detail="Nenhuma entidade encontrada")

        # Build one text document per entity: "Name (type): description…"
        documents = []
        for e in entities:
            text = f"{e.name} ({e.type})"
            if e.description:
                text += f": {e.description[:1000]}"
            documents.append(text)

        # Fix: the old `request.fast_mode if hasattr(request, 'fast_mode') else True`
        # probe was always False (the model declared no such field), making the
        # branch dead code. getattr keeps the same default (True) and works
        # whether or not the model declares the field.
        result = await aethermap.process_documents(
            texts=documents,
            fast_mode=getattr(request, 'fast_mode', True)
        )

        return IndexResponse(
            job_id=result.job_id,
            num_documents=result.num_documents,
            num_clusters=result.num_clusters,
            num_noise=result.num_noise,
            metrics=result.metrics,
            cluster_analysis=result.cluster_analysis
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
168
+
169
+
170
@router.post("/upload", response_model=IndexResponse)
async def upload_documents(
    file: UploadFile = File(...),
    fast_mode: bool = Form(True)
):
    """Upload a file (TXT or CSV) and index its contents.

    - TXT: one document per line
    - CSV: will use first text column found
    """
    try:
        raw = await file.read()
        decoded = raw.decode('utf-8', errors='ignore')

        # One document per non-blank line.
        docs = [ln.strip() for ln in decoded.splitlines() if ln.strip()]

        if not docs:
            raise HTTPException(status_code=400, detail="Arquivo vazio ou sem texto válido")

        outcome = await aethermap.process_documents(texts=docs, fast_mode=fast_mode)

        return IndexResponse(
            job_id=outcome.job_id,
            num_documents=outcome.num_documents,
            num_clusters=outcome.num_clusters,
            num_noise=outcome.num_noise,
            metrics=outcome.metrics,
            cluster_analysis=outcome.cluster_analysis,
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
209
+
210
+
211
@router.post("/search", response_model=SearchResponse)
async def semantic_search(request: SemanticSearchRequest):
    """Run a hybrid RAG search (FAISS + BM25 + reranking + LLM) over the
    indexed documents and return an LLM-generated summary with citations."""
    try:
        # A prior /index call is required: the client caches its job id.
        if not aethermap.current_job_id:
            raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")

        outcome = await aethermap.semantic_search(
            query=request.query,
            turbo_mode=request.turbo_mode,
        )
        return SearchResponse(summary=outcome.summary, results=outcome.results)

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
237
+
238
+
239
@router.post("/entities", response_model=EntityGraphResponse)
async def extract_entities():
    """Extract named entities (NER) from the indexed documents.

    Returns hub entities (most connected), relationship insights, and
    node/edge counts of the extracted graph.
    """
    try:
        # A prior /index call is required: the client caches its job id.
        if not aethermap.current_job_id:
            raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")

        graph = await aethermap.extract_entities()
        return EntityGraphResponse(
            hubs=graph.hubs,
            insights=graph.insights,
            node_count=len(graph.nodes),
            edge_count=len(graph.edges),
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
266
+
267
+
268
@router.post("/analyze")
async def analyze_graph():
    """Ask the LLM to analyze the entity graph.

    Returns semantic insights about relationships and patterns.
    """
    try:
        # A prior /index call is required: the client caches its job id.
        if not aethermap.current_job_id:
            raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")

        analysis = await aethermap.analyze_graph()
        return {
            "analysis": analysis.analysis,
            "key_entities": analysis.key_entities,
            "relationships": analysis.relationships,
        }

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
291
+
292
+
293
@router.post("/describe-clusters")
async def describe_clusters():
    """
    Get LLM descriptions for each cluster found.

    Returns:
        The raw AetherMap payload (per-cluster insights).

    Raises:
        HTTPException 400: nothing indexed yet.
        HTTPException 500: AetherMap failure.
    """
    try:
        if not aethermap.current_job_id:
            raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")

        result = await aethermap.describe_clusters()

        return result

    except HTTPException:
        # Bug fix: without this re-raise the 400 above was swallowed by the
        # generic handler below and returned as a 500, unlike the sibling
        # endpoints which already re-raise HTTPException.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
app/config.py CHANGED
@@ -23,6 +23,9 @@ class Settings(BaseSettings):
23
  # Cerebras API for LLM-based entity extraction
24
  cerebras_api_key: str = ""
25
 
 
 
 
26
  # CORS
27
  cors_origins: list[str] = ["*"]
28
 
 
23
  # Cerebras API for LLM-based entity extraction
24
  cerebras_api_key: str = ""
25
 
26
+ # AetherMap API for semantic search and NER
27
+ aethermap_url: str = "https://madras1-aethermap.hf.space"
28
+
29
  # CORS
30
  cors_origins: list[str] = ["*"]
31
 
app/main.py CHANGED
@@ -8,7 +8,7 @@ from contextlib import asynccontextmanager
8
 
9
  from app.config import settings
10
  from app.core.database import init_db
11
- from app.api.routes import entities, relationships, events, search, ingest, analyze, graph, research, chat, investigate, dados_publicos, timeline, session
12
 
13
 
14
  @asynccontextmanager
@@ -63,6 +63,7 @@ app.include_router(investigate.router, prefix="/api/v1")
63
  app.include_router(dados_publicos.router, prefix="/api/v1")
64
  app.include_router(timeline.router, prefix="/api/v1")
65
  app.include_router(session.router, prefix="/api/v1")
 
66
 
67
 
68
  @app.get("/")
 
8
 
9
  from app.config import settings
10
  from app.core.database import init_db
11
+ from app.api.routes import entities, relationships, events, search, ingest, analyze, graph, research, chat, investigate, dados_publicos, timeline, session, aethermap
12
 
13
 
14
  @asynccontextmanager
 
63
  app.include_router(dados_publicos.router, prefix="/api/v1")
64
  app.include_router(timeline.router, prefix="/api/v1")
65
  app.include_router(session.router, prefix="/api/v1")
66
+ app.include_router(aethermap.router, prefix="/api/v1/aethermap", tags=["aethermap"])
67
 
68
 
69
  @app.get("/")
app/services/aethermap_client.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AetherMap Client
3
+ Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
4
+ """
5
+ import httpx
6
+ import json
7
+ import io
8
+ from typing import List, Dict, Any, Optional
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ import logging
12
+
13
+ from app.config import settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # URL base do AetherMap (HuggingFace Space)
19
+ AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
20
+
21
+
22
@dataclass
class ProcessResult:
    """Result of a document-processing run (embeddings + clustering)."""
    # AetherMap job identifier, reused by follow-up search/NER calls.
    job_id: str
    num_documents: int
    num_clusters: int
    # Documents the clusterer left unassigned (noise points).
    num_noise: int
    metrics: Dict[str, Any] = field(default_factory=dict)
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)
31
+
32
+
33
@dataclass
class SearchResult:
    """Result of a semantic search."""
    summary: str  # RAG answer generated by the LLM
    results: List[Dict[str, Any]] = field(default_factory=list)
38
+
39
+
40
@dataclass
class EntityNode:
    """Entity node in the NER graph."""
    entity: str
    entity_type: str
    # Number of documents the entity appears in.
    docs: int
    degree: int = 0
    centrality: float = 0.0
    role: str = "peripheral"  # hub, connector, peripheral
49
+
50
+
51
@dataclass
class EntityEdge:
    """Edge of the entity graph."""
    source_entity: str
    target_entity: str
    # Co-occurrence strength as reported by the upstream API.
    weight: int
    # Human-readable justification for the connection.
    reason: str
58
+
59
+
60
@dataclass
class EntityGraphResult:
    """Result of entity (NER) extraction: full graph plus highlights."""
    nodes: List[EntityNode] = field(default_factory=list)
    edges: List[EntityEdge] = field(default_factory=list)
    # Most-connected entities, kept as raw dicts from the API.
    hubs: List[Dict[str, Any]] = field(default_factory=list)
    insights: Dict[str, Any] = field(default_factory=dict)
67
+
68
+
69
@dataclass
class GraphAnalysis:
    """LLM-generated analysis of the entity graph."""
    analysis: str
    key_entities: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
75
+
76
+
77
class AetherMapClient:
    """
    Client for the AetherMap API.

    Capabilities:
    - Document processing (embeddings + clusters)
    - Hybrid RAG semantic search (FAISS + BM25 + reranking + LLM)
    - NER entity extraction
    - LLM-based graph analysis

    The job id of the most recent successful ``process_documents`` call is
    cached on the instance so follow-up calls may omit it.
    """

    def __init__(self, base_url: Optional[str] = None, timeout: float = 120.0):
        """
        Args:
            base_url: AetherMap base URL; defaults to AETHERMAP_URL from settings.
            timeout: per-request timeout in seconds (processing can be slow).
        """
        # Strip any trailing slash so the path concatenation below stays predictable.
        self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
        self.timeout = timeout
        self._current_job_id: Optional[str] = None

    @property
    def current_job_id(self) -> Optional[str]:
        """Job id of the last successful process_documents() call, if any."""
        return self._current_job_id

    def _resolve_job_id(self, job_id: Optional[str]) -> str:
        """Return the explicit job_id or fall back to the cached one.

        Raises:
            ValueError: when neither is available.
        """
        resolved = job_id or self._current_job_id
        if not resolved:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
        return resolved

    async def _post_json(
        self,
        path: str,
        error_label: str,
        data: Dict[str, Any],
        files: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """POST form data to the AetherMap API and return the decoded JSON body.

        Shared plumbing for every endpoint (was duplicated per method).

        Raises:
            RuntimeError: on any non-200 response, carrying status and body text.
                (Was a bare ``Exception``; RuntimeError is narrower and still
                caught by callers' ``except Exception`` handlers.)
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(
                f"{self.base_url}{path}",
                data=data,
                files=files,
            )
        if response.status_code != 200:
            raise RuntimeError(f"{error_label}: {response.status_code} - {response.text}")
        return response.json()

    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """
        Process a list of texts, generating embeddings and clusters.

        Args:
            texts: documents to index.
            fast_mode: True uses PCA (fast); False uses UMAP (more precise).
            min_cluster_size: minimum cluster size (0 = auto).
            min_samples: minimum samples (0 = auto).

        Returns:
            ProcessResult with the new job_id and clustering metrics.
        """
        # The API expects an uploaded TXT file with one document per line.
        # NOTE(review): a document containing newlines would be split into
        # several documents here — callers should pre-sanitize if that matters.
        file_bytes = "\n".join(texts).encode('utf-8')

        files = {
            'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
        }
        data = {
            'n_samples': str(len(texts)),
            'fast_mode': 'true' if fast_mode else 'false',
            'min_cluster_size': str(min_cluster_size),
            'min_samples': str(min_samples)
        }

        # Lazy %-style args instead of f-strings, per logging best practice.
        logger.info("AetherMap: Processando %d documentos...", len(texts))

        result = await self._post_json("/process/", "AetherMap error", data, files=files)

        self._current_job_id = result.get('job_id')
        metadata = result.get('metadata', {})

        logger.info("AetherMap: Job criado %s", self._current_job_id)

        return ProcessResult(
            job_id=self._current_job_id,
            num_documents=metadata.get('num_documents_processed', 0),
            num_clusters=metadata.get('num_clusters_found', 0),
            num_noise=metadata.get('num_noise_points', 0),
            metrics=result.get('metrics', {}),
            cluster_analysis=result.get('cluster_analysis', {})
        )

    async def semantic_search(
        self,
        query: str,
        job_id: Optional[str] = None,
        turbo_mode: bool = False
    ) -> SearchResult:
        """
        Hybrid RAG semantic search over processed documents.

        Args:
            query: search term.
            job_id: job id (defaults to the last processed job).
            turbo_mode: True trades some precision for speed.

        Returns:
            SearchResult with the LLM summary and raw hits.

        Raises:
            ValueError: no job id available.
            RuntimeError: upstream API error.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Buscando '%s'...", query)

        result = await self._post_json(
            "/search/",
            "AetherMap search error",
            {
                'query': query,
                'job_id': job_id,
                'turbo_mode': 'true' if turbo_mode else 'false'
            },
        )

        return SearchResult(
            summary=result.get('summary', ''),
            results=result.get('results', [])
        )

    async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
        """
        Extract named entities (NER) and build their connection graph.

        Args:
            job_id: job id (defaults to the last processed job).

        Returns:
            EntityGraphResult with nodes, edges, hubs and insights.

        Raises:
            ValueError: no job id available.
            RuntimeError: upstream API error.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Extraindo entidades...")

        result = await self._post_json(
            "/entity_graph/", "AetherMap entity_graph error", {'job_id': job_id}
        )

        # Convert the raw payload into typed dataclasses.
        nodes = [
            EntityNode(
                entity=n.get('entity', ''),
                entity_type=n.get('type', ''),
                docs=n.get('docs', 0),
                degree=n.get('degree', 0),
                centrality=n.get('centrality', 0.0),
                role=n.get('role', 'peripheral')
            )
            for n in result.get('nodes', [])
        ]

        edges = [
            EntityEdge(
                source_entity=e.get('source_entity', ''),
                target_entity=e.get('target_entity', ''),
                weight=e.get('weight', 0),
                reason=e.get('reason', '')
            )
            for e in result.get('edges', [])
        ]

        return EntityGraphResult(
            nodes=nodes,
            edges=edges,
            hubs=result.get('hubs', []),
            insights=result.get('insights', {})
        )

    async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
        """
        Use the LLM to analyze the knowledge graph and extract insights.

        Args:
            job_id: job id (defaults to the last processed job).

        Returns:
            GraphAnalysis with the textual analysis.

        Raises:
            ValueError: no job id available.
            RuntimeError: upstream API error.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Analisando grafo com LLM...")

        result = await self._post_json(
            "/analyze_graph/", "AetherMap analyze_graph error", {'job_id': job_id}
        )

        return GraphAnalysis(
            analysis=result.get('analysis', ''),
            key_entities=result.get('key_entities', []),
            relationships=result.get('relationships', [])
        )

    async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Use the LLM to describe each discovered cluster.

        Args:
            job_id: job id (defaults to the last processed job).

        Returns:
            Dict of per-cluster insights (raw upstream payload).

        Raises:
            ValueError: no job id available.
            RuntimeError: upstream API error.
        """
        job_id = self._resolve_job_id(job_id)

        logger.info("AetherMap: Descrevendo clusters...")

        return await self._post_json(
            "/describe_clusters/", "AetherMap describe_clusters error", {'job_id': job_id}
        )
326
+
327
+
328
+ # Instância global do client
329
+ aethermap = AetherMapClient()
app/services/investigator_agent.py CHANGED
@@ -304,8 +304,6 @@ class InvestigatorAgent:
304
  elif tool_name == "lookup_cnpj":
305
  return await self._lookup_cnpj(arguments.get("cnpj", ""))
306
 
307
- elif tool_name == "lookup_phone":
308
- return await self._lookup_phone(arguments.get("phone", ""))
309
 
310
  elif tool_name == "web_search":
311
  return await self._web_search(
@@ -316,6 +314,12 @@ class InvestigatorAgent:
316
  elif tool_name == "deep_research":
317
  return await self._deep_research(arguments.get("topic", ""))
318
 
 
 
 
 
 
 
319
  elif tool_name == "save_finding":
320
  finding = Finding(
321
  title=arguments.get("title", ""),
@@ -474,6 +478,66 @@ class InvestigatorAgent:
474
  except Exception as e:
475
  return f"Erro na pesquisa: {str(e)}"
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  async def investigate(
478
  self,
479
  mission: str,
 
304
  elif tool_name == "lookup_cnpj":
305
  return await self._lookup_cnpj(arguments.get("cnpj", ""))
306
 
 
 
307
 
308
  elif tool_name == "web_search":
309
  return await self._web_search(
 
314
  elif tool_name == "deep_research":
315
  return await self._deep_research(arguments.get("topic", ""))
316
 
317
+ elif tool_name == "aether_search":
318
+ return await self._aether_search(arguments.get("query", ""))
319
+
320
+ elif tool_name == "aether_entities":
321
+ return await self._aether_entities()
322
+
323
  elif tool_name == "save_finding":
324
  finding = Finding(
325
  title=arguments.get("title", ""),
 
478
  except Exception as e:
479
  return f"Erro na pesquisa: {str(e)}"
480
 
481
    async def _aether_search(self, query: str) -> str:
        """Semantic search via AetherMap.

        On first use (no cached job id on the shared client) this lazily
        indexes up to 500 entities from the local DB; process_documents()
        caches the resulting job_id on the client as a side effect, which the
        search below relies on. Errors are returned as strings, not raised.
        """
        try:
            # Check if we have a job_id cached
            if not aethermap.current_job_id:
                # Index entities from database first
                if self.db:
                    entities = self.db.query(Entity).limit(500).all()
                    if entities:
                        texts = []
                        for e in entities:
                            text = f"{e.name} ({e.type})"
                            if e.description:
                                text += f": {e.description[:500]}"
                            texts.append(text)

                        if texts:
                            # Return value unused on purpose: only the job_id
                            # cached on the shared client matters here.
                            result = await aethermap.process_documents(texts, fast_mode=True)
                            # Continue with search

            if aethermap.current_job_id:
                result = await aethermap.semantic_search(query, turbo_mode=True)
                return f"RAG Response:\n{result.summary}"
            else:
                return "Nenhum documento indexado no AetherMap."

        except Exception as e:
            return f"Erro no AetherMap search: {str(e)}"
509
+
510
    async def _aether_entities(self) -> str:
        """Extract NER entities via AetherMap.

        Requires a prior indexing run (job_id cached on the shared client).
        Formats hubs, insights and the top relationships as a Markdown-style
        string. Errors are returned as strings, not raised.
        """
        try:
            if not aethermap.current_job_id:
                return "Nenhum documento indexado. Use aether_search primeiro."

            result = await aethermap.extract_entities()

            # Format response
            output = []

            if result.hubs:
                output.append("**Entidades Centrais (Hubs):**")
                # Top 5 hubs only, to keep the tool output short.
                for hub in result.hubs[:5]:
                    output.append(f"- {hub.get('entity')} ({hub.get('type')}): {hub.get('degree')} conexões")

            if result.insights:
                output.append(f"\n**Insights:**")
                output.append(f"- Total de conexões: {result.insights.get('total_connections', 0)}")
                output.append(f"- Grau médio: {result.insights.get('avg_degree', 0)}")

            if result.edges:
                output.append(f"\n**Top 5 Relacionamentos:**")
                for edge in result.edges[:5]:
                    output.append(f"- {edge.source_entity} <-> {edge.target_entity}: {edge.reason}")

            return "\n".join(output) if output else "Nenhuma entidade significativa encontrada."

        except Exception as e:
            return f"Erro na extração de entidades: {str(e)}"
540
+
541
  async def investigate(
542
  self,
543
  mission: str,