diff --git a/Dockerfile b/Dockerfile
index 05a1891dae3cddb81c245180f5c1c088584295ee..7f23e0d6eb0ca91cf496513fb4d0227ffac10ec9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,33 +1,24 @@
-# ==============================================================================
-# Dockerfile — AetherMap API (versão profissional)
-# ==============================================================================
+FROM python:3.11-slim
-# Imagem Python robusta (não slim → evita erros de build)
-FROM python:3.10
-
-# Define diretório da aplicação
WORKDIR /app
-# --- INSTALAR TORCH CPU ANTES (CRÍTICO!) ---
-# Isso garante que a versão certa (CPU) seja instalada
-RUN pip install --no-cache-dir \
- torch \
- torchvision \
- torchaudio \
- --index-url https://download.pytorch.org/whl/cpu
-
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ && rm -rf /var/lib/apt/lists/*
-# Copiar requirements
+# Copy requirements first for better caching
COPY requirements.txt .
-
-# Instalar dependências restantes
RUN pip install --no-cache-dir -r requirements.txt
-# Copiar código da aplicação
+# Copy application code
COPY . .
-# Expor porta usada pelo Hugging Face Spaces
+# Create data directory for SQLite
+RUN mkdir -p /app/data
+
+# Expose port (HF Spaces uses 7860)
EXPOSE 7860
-# Comando padrão para executar FastAPI
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 1b0b6975ed9d1181da24ce796835935436320b98..b969ddabfb79f685dcf88fbe98f72b5d3bc1bead 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,27 @@
---
-title: AetherMap
-emoji: 🦀
-colorFrom: indigo
-colorTo: pink
+title: Numidium
+emoji: 🔮
+colorFrom: blue
+colorTo: red
sdk: docker
pinned: false
-license: apache-2.0
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Numidium API
+
+Backend do sistema de inteligência Numidium/VANTAGE.
+
+## Endpoints
+
+- `/docs` - Documentação Swagger
+- `/api/v1/entities` - CRUD de entidades
+- `/api/v1/relationships` - Conexões
+- `/api/v1/events` - Eventos
+- `/api/v1/search` - Busca global
+- `/api/v1/ingest` - Ingestão de dados (Wikipedia, News)
+
+## Stack
+
+- FastAPI
+- SQLite
+- BeautifulSoup (scraping)
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ca62e91c6b6d2fd4d3a0d2f3169941e71d37af3
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1 @@
+# Numidium Backend App
diff --git a/app/__pycache__/__init__.cpython-311.pyc b/app/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a44e729bba8a6e6cdf407034b3b1ec551cfb6fe
Binary files /dev/null and b/app/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/__pycache__/config.cpython-311.pyc b/app/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6ba5b98d9de60400fecda19a96033ffd700d3a1
Binary files /dev/null and b/app/__pycache__/config.cpython-311.pyc differ
diff --git a/app/api/__init__.py b/app/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0a2733c6eceaf10144429177e8f20db9604545
--- /dev/null
+++ b/app/api/__init__.py
@@ -0,0 +1 @@
+# API module
diff --git a/app/api/__pycache__/__init__.cpython-311.pyc b/app/api/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e59a223a6007cd27a3443d5ab5a26d31df7fb4ff
Binary files /dev/null and b/app/api/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/api/__pycache__/deps.cpython-311.pyc b/app/api/__pycache__/deps.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07e15cf0e980065fc3e41e4e0eea81575dc514d5
Binary files /dev/null and b/app/api/__pycache__/deps.cpython-311.pyc differ
diff --git a/app/api/deps.py b/app/api/deps.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcea9d8c46a65a9857513605150ce15591631945
--- /dev/null
+++ b/app/api/deps.py
@@ -0,0 +1,35 @@
+"""
+API dependencies.
+"""
+from typing import Generator, Optional
+
+from fastapi import Cookie, Header
+from sqlalchemy.orm import Session
+
+from app.core.database import get_db_for_session, get_default_session
+
+
+def get_session_id(
+ x_session_id: Optional[str] = Header(None),
+ numidium_session: Optional[str] = Cookie(None)
+) -> Optional[str]:
+ """Return the session id from header or cookie."""
+ return x_session_id or numidium_session
+
+
+def get_scoped_db(
+ x_session_id: Optional[str] = Header(None),
+ numidium_session: Optional[str] = Cookie(None)
+) -> Generator[Session, None, None]:
+ """
+ Provide a session-scoped DB if available, otherwise the default DB.
+ """
+ session_id = x_session_id or numidium_session
+ if session_id:
+ db = get_db_for_session(session_id)
+ else:
+ db = get_default_session()
+ try:
+ yield db
+ finally:
+ db.close()
diff --git a/app/api/routes/__init__.py b/app/api/routes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e37c97a33d27ba2e879921f79996d8fdc3edbb73
--- /dev/null
+++ b/app/api/routes/__init__.py
@@ -0,0 +1,2 @@
+# API Routes module
+from app.api.routes import entities, relationships, events, search, ingest
diff --git a/app/api/routes/__pycache__/__init__.cpython-311.pyc b/app/api/routes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e48c20bb1f744a1d1037323ce205527266cb5c7c
Binary files /dev/null and b/app/api/routes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/entities.cpython-311.pyc b/app/api/routes/__pycache__/entities.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee99ab907a18b99a588dfe960d31a7e21c7e53d6
Binary files /dev/null and b/app/api/routes/__pycache__/entities.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/events.cpython-311.pyc b/app/api/routes/__pycache__/events.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52f29ec16d10fc54bd6be7d6e32591d65d3acfcc
Binary files /dev/null and b/app/api/routes/__pycache__/events.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/ingest.cpython-311.pyc b/app/api/routes/__pycache__/ingest.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e524bafc4ce081ccccb32d94f2426c10b1e79b9a
Binary files /dev/null and b/app/api/routes/__pycache__/ingest.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/investigate.cpython-311.pyc b/app/api/routes/__pycache__/investigate.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61c0e309052c422eb7d506d8623cfaed4ff4e01e
Binary files /dev/null and b/app/api/routes/__pycache__/investigate.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/relationships.cpython-311.pyc b/app/api/routes/__pycache__/relationships.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73c88868d8b1ad76745a529fe05928d06408c415
Binary files /dev/null and b/app/api/routes/__pycache__/relationships.cpython-311.pyc differ
diff --git a/app/api/routes/__pycache__/search.cpython-311.pyc b/app/api/routes/__pycache__/search.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83951b1b069fe2d10b140852fbc85e7294cac015
Binary files /dev/null and b/app/api/routes/__pycache__/search.cpython-311.pyc differ
diff --git a/app/api/routes/aethermap.py b/app/api/routes/aethermap.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0535153069d293dcdbe97be9565e0a17728e3e
--- /dev/null
+++ b/app/api/routes/aethermap.py
@@ -0,0 +1,307 @@
+"""
+AetherMap Routes - Document Mapping & Semantic Search
+Integrates with AetherMap API for document clustering, NER, and semantic search.
+"""
+from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
+from pydantic import BaseModel, Field
+from typing import Optional, List, Dict, Any
+from sqlalchemy.orm import Session
+import io
+
+from app.api.deps import get_scoped_db
+from app.services.aethermap_client import aethermap, ProcessResult, SearchResult, EntityGraphResult
+
+
+router = APIRouter()
+
+
+# ============================================================================
+# Request/Response Models
+# ============================================================================
+
+class IndexDocumentsRequest(BaseModel):
+ """Request to index documents from text list"""
+ documents: List[str] = Field(..., description="Lista de textos para indexar")
+ fast_mode: bool = Field(True, description="Modo rápido (PCA) ou preciso (UMAP)")
+
+
+class IndexEntitiesRequest(BaseModel):
+ """Request to index entities from NUMIDIUM database"""
+ entity_types: Optional[List[str]] = Field(None, description="Filtrar por tipos de entidade")
+ limit: int = Field(500, description="Limite de entidades")
+
+
+class SemanticSearchRequest(BaseModel):
+ """Request for semantic search"""
+ query: str = Field(..., description="Termo de busca")
+ turbo_mode: bool = Field(True, description="Modo turbo (mais rápido)")
+
+
+class IndexResponse(BaseModel):
+ """Response from indexing"""
+ job_id: str
+ num_documents: int
+ num_clusters: int
+ num_noise: int
+ metrics: Dict[str, Any] = {}
+ cluster_analysis: Dict[str, Any] = {}
+
+
+class SearchResponse(BaseModel):
+ """Response from search"""
+ summary: str
+ results: List[Dict[str, Any]] = []
+
+
+class EntityGraphResponse(BaseModel):
+ """Response from NER extraction"""
+ hubs: List[Dict[str, Any]] = []
+ insights: Dict[str, Any] = {}
+ node_count: int = 0
+ edge_count: int = 0
+
+
+class StatusResponse(BaseModel):
+ """AetherMap status"""
+ connected: bool
+ job_id: Optional[str] = None
+ documents_indexed: int = 0
+
+
+# ============================================================================
+# Endpoints
+# ============================================================================
+
+@router.get("/status", response_model=StatusResponse)
+async def get_status():
+ """
+ Get AetherMap connection status.
+ """
+ return StatusResponse(
+ connected=True,
+ job_id=aethermap.current_job_id,
+ documents_indexed=0 # TODO: track this
+ )
+
+
+@router.post("/index", response_model=IndexResponse)
+async def index_documents(request: IndexDocumentsRequest):
+ """
+ Index a list of documents for semantic search.
+
+ The documents will be:
+ - Embedded using sentence transformers
+ - Clustered using HDBSCAN
+ - Indexed in FAISS + BM25 for hybrid search
+ """
+ try:
+ if not request.documents:
+ raise HTTPException(status_code=400, detail="Nenhum documento fornecido")
+
+ result = await aethermap.process_documents(
+ texts=request.documents,
+ fast_mode=request.fast_mode
+ )
+
+ return IndexResponse(
+ job_id=result.job_id,
+ num_documents=result.num_documents,
+ num_clusters=result.num_clusters,
+ num_noise=result.num_noise,
+ metrics=result.metrics,
+ cluster_analysis=result.cluster_analysis
+ )
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/index-entities", response_model=IndexResponse)
+async def index_entities(
+ request: IndexEntitiesRequest,
+ db: Session = Depends(get_scoped_db)
+):
+ """
+ Index entities from NUMIDIUM database.
+
+ Collects entity names and descriptions, sends to AetherMap for processing.
+ """
+ from app.models.entity import Entity
+
+ try:
+ query = db.query(Entity)
+
+ if request.entity_types:
+ query = query.filter(Entity.type.in_(request.entity_types))
+
+ entities = query.limit(request.limit).all()
+
+ if not entities:
+ raise HTTPException(status_code=404, detail="Nenhuma entidade encontrada")
+
+ # Build text representations
+ documents = []
+ for e in entities:
+ text = f"{e.name} ({e.type})"
+ if e.description:
+ text += f": {e.description[:1000]}"
+ documents.append(text)
+
+ result = await aethermap.process_documents(
+ texts=documents,
+ fast_mode=getattr(request, 'fast_mode', True)
+ )
+
+ return IndexResponse(
+ job_id=result.job_id,
+ num_documents=result.num_documents,
+ num_clusters=result.num_clusters,
+ num_noise=result.num_noise,
+ metrics=result.metrics,
+ cluster_analysis=result.cluster_analysis
+ )
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/upload", response_model=IndexResponse)
+async def upload_documents(
+ file: UploadFile = File(...),
+ fast_mode: bool = Form(True)
+):
+ """
+ Upload a file (TXT or CSV) for indexing.
+
+ - TXT: One document per line
+ - CSV: Will use first text column found
+ """
+ try:
+ content = await file.read()
+ text = content.decode('utf-8', errors='ignore')
+
+ # Split by lines for TXT
+ documents = [line.strip() for line in text.splitlines() if line.strip()]
+
+ if not documents:
+ raise HTTPException(status_code=400, detail="Arquivo vazio ou sem texto válido")
+
+ result = await aethermap.process_documents(
+ texts=documents,
+ fast_mode=fast_mode
+ )
+
+ return IndexResponse(
+ job_id=result.job_id,
+ num_documents=result.num_documents,
+ num_clusters=result.num_clusters,
+ num_noise=result.num_noise,
+ metrics=result.metrics,
+ cluster_analysis=result.cluster_analysis
+ )
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/search", response_model=SearchResponse)
+async def semantic_search(request: SemanticSearchRequest):
+ """
+ Semantic search in indexed documents.
+
+ Uses hybrid RAG (FAISS + BM25 + reranking + LLM).
+ Returns a summary answering the query with citations.
+ """
+ try:
+ if not aethermap.current_job_id:
+ raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")
+
+ result = await aethermap.semantic_search(
+ query=request.query,
+ turbo_mode=request.turbo_mode
+ )
+
+ return SearchResponse(
+ summary=result.summary,
+ results=result.results
+ )
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/entities", response_model=EntityGraphResponse)
+async def extract_entities():
+ """
+ Extract named entities (NER) from indexed documents.
+
+ Returns:
+ - Hub entities (most connected)
+ - Relationship insights
+ - Graph metrics
+ """
+ try:
+ if not aethermap.current_job_id:
+ raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")
+
+ result = await aethermap.extract_entities()
+
+ return EntityGraphResponse(
+ hubs=result.hubs,
+ insights=result.insights,
+ node_count=len(result.nodes),
+ edge_count=len(result.edges)
+ )
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/analyze")
+async def analyze_graph():
+ """
+ Analyze entity graph using LLM.
+
+ Returns semantic insights about relationships and patterns.
+ """
+ try:
+ if not aethermap.current_job_id:
+ raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")
+
+ result = await aethermap.analyze_graph()
+
+ return {
+ "analysis": result.analysis,
+ "key_entities": result.key_entities,
+ "relationships": result.relationships
+ }
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/describe-clusters")
+async def describe_clusters():
+ """
+ Get LLM descriptions for each cluster found.
+ """
+ try:
+ if not aethermap.current_job_id:
+ raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.")
+
+ result = await aethermap.describe_clusters()
+
+ return result
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/api/routes/analyze.py b/app/api/routes/analyze.py
new file mode 100644
index 0000000000000000000000000000000000000000..37b93947c0e0c9f2a5a626301007c1cf30b212d6
--- /dev/null
+++ b/app/api/routes/analyze.py
@@ -0,0 +1,309 @@
+"""
+Analyze API Routes - LLM-based text analysis
+"""
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, Field
+from typing import Optional, List
+from sqlalchemy.orm import Session
+import traceback
+
+from app.api.deps import get_scoped_db
+from app.services.nlp import entity_extractor
+from app.services.geocoding import geocode
+from app.models.entity import Entity, Relationship, Event
+from app.config import settings
+
+
+router = APIRouter(prefix="/analyze", tags=["Analysis"])
+
+
+class AnalyzeRequest(BaseModel):
+ """Request model for text analysis"""
+ text: str = Field(..., min_length=10, description="Text to analyze")
+ auto_create: bool = Field(default=False, description="Auto-create extracted entities in database")
+
+
+class ExtractedEntityResponse(BaseModel):
+ """Response model for an extracted entity"""
+ name: str
+ type: str
+ role: Optional[str] = None
+ aliases: Optional[List[str]] = None
+ description: Optional[str] = None
+ created: bool = False # Whether it was created in DB
+ entity_id: Optional[str] = None # DB ID if created
+
+
+class ExtractedRelationshipResponse(BaseModel):
+ """Response model for an extracted relationship"""
+ source: str
+ target: str
+ relationship_type: str
+ context: Optional[str] = None
+ created: bool = False
+
+
+class ExtractedEventResponse(BaseModel):
+ """Response model for an extracted event"""
+ description: str
+ event_type: Optional[str] = None
+ date: Optional[str] = None
+ location: Optional[str] = None
+ participants: Optional[List[str]] = None
+ created: bool = False
+ event_id: Optional[str] = None
+
+
+class AnalyzeResponse(BaseModel):
+ """Response model for analysis"""
+ entities: List[ExtractedEntityResponse]
+ relationships: List[ExtractedRelationshipResponse]
+ events: List[ExtractedEventResponse]
+ stats: dict
+
+
+@router.post("", response_model=AnalyzeResponse)
+async def analyze_text(request: AnalyzeRequest, db: Session = Depends(get_scoped_db)):
+ """
+ Analyze text using LLM to extract entities, relationships, and events.
+
+ Uses Cerebras API with Qwen 3 235B for intelligent extraction.
+
+ Args:
+ text: Text to analyze (min 10 characters)
+ auto_create: If true, automatically creates entities in the database
+
+ Returns:
+ Extracted entities, relationships, events, and statistics
+ """
+ try:
+ # Extract using LLM
+ result = await entity_extractor.extract(request.text)
+
+ # Prepare response
+ entities_response = []
+ relationships_response = []
+ events_response = []
+
+ created_entities = 0
+ created_relationships = 0
+ created_events = 0
+
+ # Helper function to parse date strings
+ def parse_date(date_str):
+ if not date_str:
+ return None
+ from datetime import datetime
+ try:
+ # Try YYYY-MM-DD format
+ return datetime.strptime(date_str[:10], "%Y-%m-%d")
+ except (ValueError, TypeError):
+ try:
+ # Try YYYY format
+ return datetime.strptime(date_str[:4], "%Y")
+ except (ValueError, TypeError):
+ return None
+
+ # Process entities
+ for entity in result.entities:
+ entity_data = ExtractedEntityResponse(
+ name=entity.name,
+ type=entity.type,
+ role=entity.role,
+ aliases=entity.aliases,
+ description=entity.description,
+ created=False
+ )
+
+ if request.auto_create and entity.name:
+ # Check if entity already exists
+ existing = db.query(Entity).filter(
+ Entity.name.ilike(f"%{entity.name}%")
+ ).first()
+
+ if not existing:
+ # Get coordinates for location entities
+ lat, lng = None, None
+ if entity.type == "location":
+ coords = await geocode(entity.name)
+ if coords:
+ lat, lng = coords
+
+ # Parse event_date if available
+ event_date = parse_date(getattr(entity, 'event_date', None))
+
+ # Create new entity
+ new_entity = Entity(
+ name=entity.name,
+ type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person",
+ description=entity.description or entity.role or "",
+ source="llm_extraction",
+ latitude=lat,
+ longitude=lng,
+ event_date=event_date,
+ properties={"role": entity.role, "aliases": entity.aliases}
+ )
+ db.add(new_entity)
+ db.commit()
+ db.refresh(new_entity)
+
+ entity_data.created = True
+ entity_data.entity_id = new_entity.id
+ created_entities += 1
+ else:
+ entity_data.entity_id = existing.id
+
+ entities_response.append(entity_data)
+
+ # Process relationships
+ for rel in result.relationships:
+ rel_data = ExtractedRelationshipResponse(
+ source=rel.source,
+ target=rel.target,
+ relationship_type=rel.relationship_type,
+ context=rel.context,
+ created=False
+ )
+
+ if request.auto_create:
+ # Find source and target entities
+ source_entity = db.query(Entity).filter(
+ Entity.name.ilike(f"%{rel.source}%")
+ ).first()
+ target_entity = db.query(Entity).filter(
+ Entity.name.ilike(f"%{rel.target}%")
+ ).first()
+
+ if source_entity and target_entity:
+ # Check if relationship exists
+ existing_rel = db.query(Relationship).filter(
+ Relationship.source_id == source_entity.id,
+ Relationship.target_id == target_entity.id,
+ Relationship.type == rel.relationship_type
+ ).first()
+
+ if not existing_rel:
+ # Parse event_date if available
+ rel_event_date = parse_date(getattr(rel, 'event_date', None))
+
+ new_rel = Relationship(
+ source_id=source_entity.id,
+ target_id=target_entity.id,
+ type=rel.relationship_type,
+ event_date=rel_event_date,
+ properties={"context": rel.context}
+ )
+ db.add(new_rel)
+ db.commit()
+ rel_data.created = True
+ created_relationships += 1
+
+ relationships_response.append(rel_data)
+
+ # Process events
+ for event in result.events:
+ event_data = ExtractedEventResponse(
+ description=event.description,
+ event_type=event.event_type,
+ date=event.date,
+ location=event.location,
+ participants=event.participants,
+ created=False
+ )
+
+ if request.auto_create and event.description:
+ # Create event
+ new_event = Event(
+ title=event.description[:100] if len(event.description) > 100 else event.description,
+ description=event.description,
+ type=event.event_type or "general",
+ source="llm_extraction"
+ )
+ db.add(new_event)
+ db.commit()
+ db.refresh(new_event)
+
+ event_data.created = True
+ event_data.event_id = new_event.id
+ created_events += 1
+
+ events_response.append(event_data)
+
+ return AnalyzeResponse(
+ entities=entities_response,
+ relationships=relationships_response,
+ events=events_response,
+ stats={
+ "total_entities": len(entities_response),
+ "total_relationships": len(relationships_response),
+ "total_events": len(events_response),
+ "created_entities": created_entities,
+ "created_relationships": created_relationships,
+ "created_events": created_events
+ }
+ )
+
+ except Exception as e:
+ # Log the full error with traceback
+ print("=== ANALYZE ERROR ===")
+ print(f"Error type: {type(e).__name__}")
+ print(f"Error message: {str(e)}")
+ print("Traceback:")
+ traceback.print_exc()
+ print("=== END ERROR ===")
+ raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+
+@router.get("/debug")
+async def debug_config():
+ """
+ Debug endpoint to check if API is configured correctly.
+ """
+ api_key = settings.cerebras_api_key
+ return {
+ "cerebras_api_key_configured": bool(api_key),
+ "cerebras_api_key_length": len(api_key) if api_key else 0,
+ "cerebras_api_key_preview": f"{api_key[:8]}...{api_key[-4:]}" if api_key and len(api_key) > 12 else "NOT SET"
+ }
+
+
+@router.post("/quick")
+async def quick_analyze(request: AnalyzeRequest):
+ """
+ Quick analysis without database operations.
+ Returns only extracted data without creating anything.
+ """
+ try:
+ result = await entity_extractor.extract(request.text)
+
+ return {
+ "entities": [
+ {
+ "name": e.name,
+ "type": e.type,
+ "role": e.role,
+ "aliases": e.aliases
+ }
+ for e in result.entities
+ ],
+ "relationships": [
+ {
+ "source": r.source,
+ "target": r.target,
+ "type": r.relationship_type,
+ "context": r.context
+ }
+ for r in result.relationships
+ ],
+ "events": [
+ {
+ "description": ev.description,
+ "type": ev.event_type,
+ "date": ev.date,
+ "participants": ev.participants
+ }
+ for ev in result.events
+ ]
+ }
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
diff --git a/app/api/routes/chat.py b/app/api/routes/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..f75b133bf9956e67eb7b1b86312192d7fa093c46
--- /dev/null
+++ b/app/api/routes/chat.py
@@ -0,0 +1,63 @@
+"""
+Chat API Routes - Intelligent chat with RAG
+"""
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, Field
+from typing import Optional
+from sqlalchemy.orm import Session
+
+from app.api.deps import get_scoped_db, get_session_id
+from app.services.chat import chat_service
+
+
+router = APIRouter(prefix="/chat", tags=["Chat"])
+
+
+class ChatRequest(BaseModel):
+ """Chat request model"""
+ message: str = Field(..., min_length=1, description="User message")
+ use_web: bool = Field(default=True, description="Include web search")
+ use_history: bool = Field(default=True, description="Use conversation history")
+
+
+class ChatResponse(BaseModel):
+ """Chat response model"""
+ answer: str
+ local_context_used: bool
+ web_context_used: bool
+ entities_found: int
+
+
+@router.post("", response_model=ChatResponse)
+async def chat(
+ request: ChatRequest,
+ db: Session = Depends(get_scoped_db),
+ session_id: Optional[str] = Depends(get_session_id)
+):
+ """
+ Send a message and get an intelligent response.
+
+ Uses:
+ - Local NUMIDIUM knowledge (entities/relationships)
+ - Lancer web search (if enabled)
+ - Cerebras LLM for synthesis
+ """
+ try:
+ result = await chat_service.chat(
+ message=request.message,
+ db=db,
+ use_web=request.use_web,
+ use_history=request.use_history,
+ session_id=session_id
+ )
+ return ChatResponse(**result)
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/clear")
+async def clear_history(session_id: Optional[str] = Depends(get_session_id)):
+ """Clear conversation history"""
+ chat_service.clear_history(session_id=session_id)
+ return {"message": "Historico limpo"}
diff --git a/app/api/routes/dados_publicos.py b/app/api/routes/dados_publicos.py
new file mode 100644
index 0000000000000000000000000000000000000000..842e82d2d17f48687b92bb8012105eab495a8051
--- /dev/null
+++ b/app/api/routes/dados_publicos.py
@@ -0,0 +1,155 @@
+"""
+Public Data API Routes - IBGE and TSE data access
+"""
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel, Field
+from typing import Optional, List, Dict, Any
+
+from app.services.ibge_api import (
+ listar_estados,
+ listar_municipios,
+ buscar_municipio,
+ enriquecer_localizacao
+)
+from app.services.tse_api import (
+ listar_eleicoes,
+ buscar_candidatos,
+ obter_candidato_detalhes,
+ buscar_politico
+)
+
+
+router = APIRouter(prefix="/dados", tags=["Public Data"])
+
+
+# ========== IBGE Endpoints ==========
+
+class EstadoResponse(BaseModel):
+ id: int
+ sigla: str
+ nome: str
+ regiao: str
+
+
+class MunicipioResponse(BaseModel):
+ id: int
+ nome: str
+ estado_sigla: str
+ estado_nome: str
+ regiao: str
+
+
+@router.get("/ibge/estados", response_model=List[EstadoResponse])
+async def get_estados():
+ """List all Brazilian states"""
+ estados = await listar_estados()
+ return [EstadoResponse(**e.__dict__) for e in estados]
+
+
+@router.get("/ibge/municipios/{uf}", response_model=List[MunicipioResponse])
+async def get_municipios(uf: str):
+ """List municipalities in a state"""
+ municipios = await listar_municipios(uf)
+ return [MunicipioResponse(**m.__dict__) for m in municipios]
+
+
+@router.get("/ibge/buscar")
+async def buscar_cidade(
+ nome: str = Query(..., min_length=2),
+ uf: Optional[str] = None
+):
+ """Search for a municipality by name"""
+ municipios = await buscar_municipio(nome, uf)
+ return [MunicipioResponse(**m.__dict__) for m in municipios]
+
+
+@router.get("/ibge/enriquecer")
+async def enriquecer_cidade(
+ cidade: str = Query(..., min_length=2),
+ uf: Optional[str] = None
+):
+ """Enrich a location name with IBGE data"""
+ return await enriquecer_localizacao(cidade, uf)
+
+
+# ========== TSE Endpoints ==========
+
+class EleicaoResponse(BaseModel):
+ id: int
+ ano: int
+ descricao: str
+ turno: int
+
+
+class CandidatoResponse(BaseModel):
+ id: int
+ nome: str
+ nome_urna: str
+ numero: str
+ cargo: str
+ partido_sigla: str
+ uf: str
+ municipio: str
+ situacao: str
+ total_bens: float
+
+
+class CandidatoDetalhadoResponse(BaseModel):
+ id: int
+ nome: str
+ nome_urna: str
+ numero: str
+ cargo: str
+ partido_sigla: str
+ partido_nome: str
+ uf: str
+ municipio: str
+ situacao: str
+ data_nascimento: str
+ genero: str
+ grau_instrucao: str
+ ocupacao: str
+ total_bens: float
+ bens: List[Dict[str, Any]]
+
+
+@router.get("/tse/eleicoes", response_model=List[EleicaoResponse])
+async def get_eleicoes():
+ """List available elections"""
+ eleicoes = await listar_eleicoes()
+ return [EleicaoResponse(**e.__dict__) for e in eleicoes]
+
+
+@router.get("/tse/candidatos")
+async def get_candidatos(
+ nome: str = Query(..., min_length=3),
+ ano: int = Query(default=2024),
+ uf: Optional[str] = None,
+ cargo: Optional[str] = None
+):
+ """Search for candidates by name"""
+ candidatos = await buscar_candidatos(nome, ano=ano, uf=uf, cargo=cargo)
+ return [CandidatoResponse(**c.__dict__) for c in candidatos]
+
+
+@router.get("/tse/candidato/{id_candidato}")
+async def get_candidato_detalhes(
+ id_candidato: int,
+ ano: int = Query(default=2024)
+):
+ """Get detailed candidate information including assets"""
+ candidato = await obter_candidato_detalhes(id_candidato, ano=ano)
+
+ if not candidato:
+ raise HTTPException(status_code=404, detail="Candidato não encontrado")
+
+ return CandidatoDetalhadoResponse(**candidato.__dict__)
+
+
+@router.get("/tse/politico")
+async def pesquisar_politico(nome: str = Query(..., min_length=3)):
+ """
+ Search for a politician across multiple elections.
+ Returns consolidated career information.
+ """
+ return await buscar_politico(nome)
diff --git a/app/api/routes/entities.py b/app/api/routes/entities.py
new file mode 100644
index 0000000000000000000000000000000000000000..2727179e0e20a58a8a5893f1821a15c24df3013f
--- /dev/null
+++ b/app/api/routes/entities.py
@@ -0,0 +1,353 @@
+"""
+Entity CRUD Routes
+"""
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.orm import Session
+from sqlalchemy import or_
+from typing import List, Optional
+
+from app.api.deps import get_scoped_db
+from app.models import Entity, Relationship
+from app.schemas import EntityCreate, EntityUpdate, EntityResponse, GraphData, GraphNode, GraphEdge
+
+router = APIRouter(prefix="/entities", tags=["Entities"])
+
+
@router.get("", response_model=List[EntityResponse])
def list_entities(
    type: Optional[str] = None,
    search: Optional[str] = None,
    project_id: Optional[str] = None,
    limit: int = Query(default=50, le=200),
    offset: int = 0,
    db: Session = Depends(get_scoped_db)
):
    """List entities, newest first, with optional project/type/text filters."""
    q = db.query(Entity)

    if project_id:
        q = q.filter(Entity.project_id == project_id)
    if type:
        q = q.filter(Entity.type == type)
    if search:
        pattern = f"%{search}%"
        # Case-insensitive match against either the name or the description.
        q = q.filter(or_(Entity.name.ilike(pattern), Entity.description.ilike(pattern)))

    return (
        q.order_by(Entity.created_at.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )
+
+
@router.get("/types")
def get_entity_types(db: Session = Depends(get_scoped_db)):
    """Return the distinct entity types currently present in the database."""
    return [value for (value,) in db.query(Entity.type).distinct().all()]
+
+
@router.get("/suggest-merge")
async def suggest_merge_candidates(
    limit: int = Query(default=10, le=50),
    db: Session = Depends(get_scoped_db)
):
    """
    Use LLM to find potential duplicate entities that could be merged.
    Returns pairs of entities that might be the same.

    Never raises: every failure path (API error, unparsable response,
    exception) is reported in-band as {"candidates": [], "error"/"message": ...}.
    """
    # Local imports keep the optional HTTP client out of module import time.
    import httpx
    import json
    import re
    from app.config import settings

    # Get all entities (capped at 200 to bound DB load and prompt size)
    entities = db.query(Entity).order_by(Entity.name).limit(200).all()

    if len(entities) < 2:
        return {"candidates": [], "message": "Not enough entities to compare"}

    # Build entity list for LLM — only id/name/type plus up to 5 aliases each.
    entity_list = []
    for e in entities:
        aliases = (e.properties or {}).get("aliases", [])
        entity_list.append({
            "id": e.id,
            "name": e.name,
            "type": e.type,
            "aliases": aliases[:5] if aliases else []
        })

    # Ask LLM to find duplicates
    # NOTE(review): entity_list is interpolated via Python repr (single quotes),
    # not JSON; the model tolerates it, but json.dumps would be cleaner.
    # Only the first 100 entries are sent even though 200 were fetched.
    prompt = f"""Analise esta lista de entidades e encontre possíveis DUPLICATAS (mesma pessoa/organização/local com nomes diferentes).

Entidades:
{entity_list[:100]}

Retorne APENAS um JSON válido com pares de IDs que são provavelmente a mesma entidade:
```json
{{
    "duplicates": [
        {{
            "id1": "uuid1",
            "id2": "uuid2",
            "confidence": 0.95,
            "reason": "Mesmo nome com variação"
        }}
    ]
}}
```

Se não houver duplicatas, retorne: {{"duplicates": []}}
"""

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                "https://api.cerebras.ai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.cerebras_api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "zai-glm-4.7",
                    "messages": [
                        {"role": "system", "content": "Você é um especialista em detecção de entidades duplicadas. Responda apenas em JSON válido."},
                        {"role": "user", "content": prompt}
                    ],
                    # Low temperature: we want deterministic, conservative matching.
                    "temperature": 0.1,
                    "max_tokens": 1024
                }
            )

            if response.status_code != 200:
                return {"candidates": [], "error": "LLM API error"}

            data = response.json()
            content = data["choices"][0]["message"]["content"]

            # Parse JSON from response — greedy match grabs the outermost
            # {...} even when the model wraps it in markdown fences.
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if json_match:
                result = json.loads(json_match.group(0))

                # Enrich with entity names; silently drop pairs whose IDs
                # don't resolve to fetched entities (LLM hallucination guard).
                candidates = []
                for dup in result.get("duplicates", [])[:limit]:
                    e1 = next((e for e in entities if e.id == dup.get("id1")), None)
                    e2 = next((e for e in entities if e.id == dup.get("id2")), None)
                    if e1 and e2:
                        candidates.append({
                            "entity1": {"id": e1.id, "name": e1.name, "type": e1.type},
                            "entity2": {"id": e2.id, "name": e2.name, "type": e2.type},
                            "confidence": dup.get("confidence", 0.5),
                            "reason": dup.get("reason", "Possível duplicata")
                        })

                return {"candidates": candidates}

            return {"candidates": [], "message": "No duplicates found"}

    except Exception as e:
        # Degrade gracefully — network/JSON errors surface as an in-band error.
        return {"candidates": [], "error": str(e)}
+
+
@router.get("/{entity_id}", response_model=EntityResponse)
def get_entity(entity_id: str, db: Session = Depends(get_scoped_db)):
    """Fetch a single entity by ID; 404 when it does not exist."""
    found = db.query(Entity).filter(Entity.id == entity_id).first()
    if not found:
        raise HTTPException(status_code=404, detail="Entity not found")
    return found
+
+
@router.post("", response_model=EntityResponse, status_code=201)
def create_entity(entity: EntityCreate, db: Session = Depends(get_scoped_db)):
    """Persist a new entity built from the request payload and return it."""
    record = Entity(**entity.model_dump())
    db.add(record)
    db.commit()
    # Reload so server-generated fields (id, created_at) are populated.
    db.refresh(record)
    return record
+
+
@router.put("/{entity_id}", response_model=EntityResponse)
def update_entity(entity_id: str, entity: EntityUpdate, db: Session = Depends(get_scoped_db)):
    """Partially update an existing entity; only fields the client sent are applied."""
    record = db.query(Entity).filter(Entity.id == entity_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Entity not found")

    # exclude_unset: untouched fields keep their current values.
    for field, value in entity.model_dump(exclude_unset=True).items():
        setattr(record, field, value)

    db.commit()
    db.refresh(record)
    return record
+
+
@router.delete("/{entity_id}")
def delete_entity(entity_id: str, db: Session = Depends(get_scoped_db)):
    """Delete an entity and every relationship that references it.

    Raises:
        HTTPException 404: when no entity has the given ID.
    """
    db_entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not db_entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    # Delete relationships where this entity is either endpoint.
    # synchronize_session=False: a bulk DELETE with an OR criterion cannot be
    # reliably evaluated against in-session objects, and we commit immediately
    # afterwards, so skipping session synchronization is both safe and cheaper.
    db.query(Relationship).filter(
        or_(
            Relationship.source_id == entity_id,
            Relationship.target_id == entity_id
        )
    ).delete(synchronize_session=False)

    db.delete(db_entity)
    db.commit()
    return {"message": "Entity deleted"}
+
+
@router.get("/{entity_id}/connections", response_model=GraphData)
def get_entity_connections(
    entity_id: str,
    depth: int = Query(default=1, le=3),
    db: Session = Depends(get_scoped_db)
):
    """
    Return the connection graph around an entity, up to `depth` hops away.
    Used for the network visualization in the frontend.

    Fixes over the original implementation:
    - relationships are deduplicated (the old code appended the same edge once
      while exploring its source and again while exploring its target);
    - edges whose far endpoint lies beyond the depth cut (and therefore has no
      node in the result) are dropped, so the frontend never receives a
      dangling edge.

    Raises:
        HTTPException 404: when the root entity does not exist.
    """
    entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    nodes = {}          # entity id -> GraphNode
    edges_by_key = {}   # (source, target, type) -> GraphEdge, dedupes pairs
    visited = set()

    def explore(eid: str, current_depth: int):
        # Depth-first expansion; `visited` guards against cycles.
        if current_depth > depth or eid in visited:
            return
        visited.add(eid)

        e = db.query(Entity).filter(Entity.id == eid).first()
        if not e:
            return

        nodes[e.id] = GraphNode(
            id=e.id,
            type=e.type,
            name=e.name,
            properties=e.properties or {}
        )

        # One query per node covers both directions at once.
        rels = db.query(Relationship).filter(
            or_(
                Relationship.source_id == eid,
                Relationship.target_id == eid
            )
        ).all()

        for rel in rels:
            key = (rel.source_id, rel.target_id, rel.type)
            if key not in edges_by_key:
                edges_by_key[key] = GraphEdge(
                    source=rel.source_id,
                    target=rel.target_id,
                    type=rel.type,
                    confidence=rel.confidence
                )
            # Follow whichever endpoint is not the current node.
            neighbor = rel.target_id if rel.source_id == eid else rel.source_id
            explore(neighbor, current_depth + 1)

    explore(entity_id, 0)

    # Keep only edges whose both endpoints made it into the node set.
    edges = [
        edge for (src, dst, _), edge in edges_by_key.items()
        if src in nodes and dst in nodes
    ]

    return GraphData(
        nodes=list(nodes.values()),
        edges=edges
    )
+
+
@router.post("/merge")
def merge_entities(
    primary_id: str,
    secondary_id: str,
    db: Session = Depends(get_scoped_db)
):
    """
    Merge two entities into one.

    The primary entity is kept, the secondary is deleted. The secondary's name
    and aliases become aliases of the primary, a merge-history entry is
    recorded in the primary's properties, and every relationship pointing at
    the secondary is re-pointed to the primary.

    Raises:
        HTTPException 400: when primary_id == secondary_id.
        HTTPException 404: when either entity does not exist.
    """
    if primary_id == secondary_id:
        raise HTTPException(status_code=400, detail="Cannot merge entity with itself")

    primary = db.query(Entity).filter(Entity.id == primary_id).first()
    secondary = db.query(Entity).filter(Entity.id == secondary_id).first()

    if not primary:
        raise HTTPException(status_code=404, detail="Primary entity not found")
    if not secondary:
        raise HTTPException(status_code=404, detail="Secondary entity not found")

    # Work on *copies* of the JSON properties. Mutating the dict attached to
    # the ORM instance in place is not reliably detected as a change by
    # SQLAlchemy's JSON type; assigning a fresh dict below guarantees the
    # UPDATE is actually emitted.
    primary_props = dict(primary.properties or {})
    secondary_props = secondary.properties or {}

    # Fold the secondary's name and aliases into the primary's alias list.
    aliases = list(primary_props.get("aliases") or [])
    if secondary.name not in aliases:
        aliases.append(secondary.name)
    for alias in (secondary_props.get("aliases") or []):
        if alias not in aliases:
            aliases.append(alias)
    primary_props["aliases"] = aliases

    # Record the merge for auditability.
    merge_history = list(primary_props.get("merged_from") or [])
    merge_history.append({
        "id": secondary.id,
        "name": secondary.name,
        "source": secondary.source
    })
    primary_props["merged_from"] = merge_history

    # Adopt the secondary's description only when the primary has none.
    if not primary.description and secondary.description:
        primary.description = secondary.description

    primary.properties = primary_props

    # Transfer relationships from secondary to primary.
    # synchronize_session=False: bulk UPDATEs, committed right afterwards.
    db.query(Relationship).filter(
        Relationship.source_id == secondary_id
    ).update({"source_id": primary_id}, synchronize_session=False)

    db.query(Relationship).filter(
        Relationship.target_id == secondary_id
    ).update({"target_id": primary_id}, synchronize_session=False)

    # NOTE(review): duplicate relationships (same source, target, type) may now
    # exist; deduplication is intentionally deferred.

    # Snapshot the name BEFORE deleting: after commit the secondary instance is
    # expired-and-deleted, and attribute access on it raises — the original
    # code read secondary.name in the response f-string after the commit.
    secondary_name = secondary.name

    db.delete(secondary)
    db.commit()
    db.refresh(primary)

    return {
        "message": f"Merged '{secondary_name}' into '{primary.name}'",
        "primary": {
            "id": primary.id,
            "name": primary.name,
            "aliases": aliases
        }
    }
+
diff --git a/app/api/routes/events.py b/app/api/routes/events.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a16292e599f2a33bffe593cf788d69be9f28dd
--- /dev/null
+++ b/app/api/routes/events.py
@@ -0,0 +1,113 @@
+"""
+Events CRUD Routes
+"""
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.orm import Session
+from sqlalchemy import or_
+from typing import List, Optional
+from datetime import datetime
+
+from app.api.deps import get_scoped_db
+from app.models import Event
+from app.schemas import EventCreate, EventResponse
+
+router = APIRouter(prefix="/events", tags=["Events"])
+
+
@router.get("/", response_model=List[EventResponse])
def list_events(
    type: Optional[str] = None,
    search: Optional[str] = None,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    limit: int = Query(default=50, le=200),
    offset: int = 0,
    db: Session = Depends(get_scoped_db)
):
    """List events, newest first, with optional type, text and date-range filters."""
    q = db.query(Event)

    if type:
        q = q.filter(Event.type == type)
    if search:
        pattern = f"%{search}%"
        # Case-insensitive match against title or description.
        q = q.filter(or_(Event.title.ilike(pattern), Event.description.ilike(pattern)))
    if start_date:
        q = q.filter(Event.event_date >= start_date)
    if end_date:
        q = q.filter(Event.event_date <= end_date)

    # Undated events sort after dated ones.
    ordered = q.order_by(Event.event_date.desc().nullslast())
    return ordered.offset(offset).limit(limit).all()
+
+
@router.get("/types")
def get_event_types(db: Session = Depends(get_scoped_db)):
    """Return the distinct event types currently present in the database."""
    return [value for (value,) in db.query(Event.type).distinct().all()]
+
+
@router.get("/timeline")
def get_timeline(
    entity_id: Optional[str] = None,
    limit: int = Query(default=50, le=200),
    db: Session = Depends(get_scoped_db)
):
    """
    Return dated events in chronological order as lightweight timeline items,
    optionally restricted to events that mention a given entity.
    """
    q = db.query(Event).filter(Event.event_date.isnot(None))
    if entity_id:
        q = q.filter(Event.entity_ids.contains([entity_id]))

    timeline = []
    for ev in q.order_by(Event.event_date.asc()).limit(limit).all():
        timeline.append({
            "id": ev.id,
            "title": ev.title,
            # Guard kept for safety even though the query excludes NULL dates.
            "date": ev.event_date.isoformat() if ev.event_date else None,
            "type": ev.type,
            "location": ev.location_name
        })
    return timeline
+
+
@router.get("/{event_id}", response_model=EventResponse)
def get_event(event_id: str, db: Session = Depends(get_scoped_db)):
    """Fetch a single event by ID; 404 when it does not exist."""
    found = db.query(Event).filter(Event.id == event_id).first()
    if not found:
        raise HTTPException(status_code=404, detail="Event not found")
    return found
+
+
@router.post("/", response_model=EventResponse, status_code=201)
def create_event(event: EventCreate, db: Session = Depends(get_scoped_db)):
    """Persist a new event built from the request payload and return it."""
    record = Event(**event.model_dump())
    db.add(record)
    db.commit()
    # Reload so server-generated fields (id, timestamps) are populated.
    db.refresh(record)
    return record
+
+
@router.delete("/{event_id}")
def delete_event(event_id: str, db: Session = Depends(get_scoped_db)):
    """Delete an event by ID; 404 when it does not exist."""
    record = db.query(Event).filter(Event.id == event_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Event not found")

    db.delete(record)
    db.commit()
    return {"message": "Event deleted"}
diff --git a/app/api/routes/graph.py b/app/api/routes/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a0886d6fb53b0884ef1e803f2acefd29f92873
--- /dev/null
+++ b/app/api/routes/graph.py
@@ -0,0 +1,173 @@
+"""
+Graph API Routes - Network visualization endpoints
+"""
+from fastapi import APIRouter, Depends, HTTPException, Query
+from typing import Optional, List
+from sqlalchemy.orm import Session
+from sqlalchemy import or_
+
+from app.api.deps import get_scoped_db
+from app.models.entity import Entity, Relationship
+
+
+router = APIRouter(prefix="/graph", tags=["Graph"])
+
+
@router.get("")
async def get_graph(
    entity_type: Optional[str] = Query(None, description="Filter by entity type"),
    limit: int = Query(100, le=500, description="Maximum number of entities"),
    db: Session = Depends(get_scoped_db)
):
    """
    Get graph data for visualization in Cytoscape.js format.

    Returns nodes (entities, optionally filtered by type) and edges
    (relationships whose BOTH endpoints are in the selected node set).

    Improvement over the original: the relationship query now requires both
    endpoints to be in the node set, instead of fetching every relationship
    with at least one endpoint present and discarding the rest in Python.
    """
    try:
        query = db.query(Entity)
        if entity_type:
            query = query.filter(Entity.type == entity_type)

        entities = query.limit(limit).all()
        entity_ids = [e.id for e in entities]

        # Both endpoints must be in the selected set (multiple filter() calls
        # are ANDed together), so no post-filtering is needed.
        relationships = db.query(Relationship).filter(
            Relationship.source_id.in_(entity_ids),
            Relationship.target_id.in_(entity_ids)
        ).all()

        # Format for Cytoscape.js: each element wraps its payload in "data".
        nodes = []
        for e in entities:
            nodes.append({
                "data": {
                    "id": e.id,
                    # Truncated label for display; fullName keeps the original.
                    "label": e.name[:30] + "..." if len(e.name) > 30 else e.name,
                    "fullName": e.name,
                    "type": e.type,
                    "description": e.description[:100] if e.description else "",
                    "source": e.source or "unknown"
                }
            })

        edges = []
        for r in relationships:
            edges.append({
                "data": {
                    "id": r.id,
                    "source": r.source_id,
                    "target": r.target_id,
                    "label": r.type,
                    "type": r.type
                }
            })

        return {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_nodes": len(nodes),
                "total_edges": len(edges)
            }
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get graph: {str(e)}")
+
+
@router.get("/entity/{entity_id}")
async def get_entity_graph(
    entity_id: str,
    depth: int = Query(1, ge=1, le=3, description="How many levels of connections to include"),
    db: Session = Depends(get_scoped_db)
):
    """
    Get graph centered on a specific entity.

    Performs a breadth-first expansion: starting from the central entity,
    each iteration pulls every relationship touching the current frontier and
    adds the newly discovered entity IDs to the result set, up to `depth` hops.
    Output is Cytoscape.js-shaped (nodes/edges wrapped in "data"), with the
    central node flagged via "isCentral".

    Raises:
        HTTPException 404: when the central entity does not exist.
        HTTPException 500: on any unexpected failure.
    """
    try:
        # Get the central entity
        central = db.query(Entity).filter(Entity.id == entity_id).first()
        if not central:
            raise HTTPException(status_code=404, detail="Entity not found")

        # Collect entity IDs at each depth level (BFS frontier expansion).
        collected_ids = {entity_id}
        current_level = {entity_id}

        for _ in range(depth):
            # All relationships touching the current frontier, either direction.
            rels = db.query(Relationship).filter(
                or_(
                    Relationship.source_id.in_(current_level),
                    Relationship.target_id.in_(current_level)
                )
            ).all()

            next_level = set()
            for r in rels:
                next_level.add(r.source_id)
                next_level.add(r.target_id)

            # Frontier for the next hop = newly discovered IDs only.
            current_level = next_level - collected_ids
            collected_ids.update(next_level)

        # Get all entities discovered within `depth` hops.
        entities = db.query(Entity).filter(Entity.id.in_(collected_ids)).all()

        # Get all relationships between collected entities (both endpoints in set).
        relationships = db.query(Relationship).filter(
            Relationship.source_id.in_(collected_ids),
            Relationship.target_id.in_(collected_ids)
        ).all()

        # Format for Cytoscape
        nodes = []
        for e in entities:
            nodes.append({
                "data": {
                    "id": e.id,
                    # Truncated label for display; fullName keeps the original.
                    "label": e.name[:30] + "..." if len(e.name) > 30 else e.name,
                    "fullName": e.name,
                    "type": e.type,
                    "description": e.description[:100] if e.description else "",
                    "source": e.source or "unknown",
                    "isCentral": e.id == entity_id
                }
            })

        edges = []
        for r in relationships:
            edges.append({
                "data": {
                    "id": r.id,
                    "source": r.source_id,
                    "target": r.target_id,
                    "label": r.type,
                    "type": r.type
                }
            })

        return {
            "central": {
                "id": central.id,
                "name": central.name,
                "type": central.type
            },
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_nodes": len(nodes),
                "total_edges": len(edges),
                "depth": depth
            }
        }

    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get entity graph: {str(e)}")
+
diff --git a/app/api/routes/ingest.py b/app/api/routes/ingest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2216481b8ad615180d4b4bfad5d7f24e453774a
--- /dev/null
+++ b/app/api/routes/ingest.py
@@ -0,0 +1,341 @@
+"""
+Data Ingestion Routes
+Endpoints para importar dados de fontes externas
+"""
+from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
+from sqlalchemy.orm import Session
+from typing import Optional, List
+from datetime import datetime
+import asyncio
+
+from app.api.deps import get_scoped_db
+from app.models import Entity, Document, Relationship
+from app.schemas import EntityResponse, DocumentResponse
+from app.services.ingestion import wikipedia_scraper, news_service
+from app.services.nlp import entity_extractor
+from app.services.geocoding import geocode
+
+router = APIRouter(prefix="/ingest", tags=["Data Ingestion"])
+
+
def parse_event_date(date_str):
    """Parse a date string into a datetime, or return None when unparsable.

    Accepts "YYYY-MM-DD..." (extra characters beyond day are ignored, so ISO
    timestamps work) or a bare "YYYY..." year (resolves to January 1st).

    Args:
        date_str: the raw date value; falsy values yield None.

    Returns:
        datetime or None.
    """
    if not date_str:
        return None
    # Catch only the errors strptime/slicing can legitimately raise —
    # the original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        # Try YYYY-MM-DD format (first 10 chars, tolerating trailing time part)
        return datetime.strptime(date_str[:10], "%Y-%m-%d")
    except (TypeError, ValueError):
        pass
    try:
        # Try bare YYYY format
        return datetime.strptime(date_str[:4], "%Y")
    except (TypeError, ValueError):
        return None
+
+
+# ========== Wikipedia ==========
+
@router.get("/wikipedia/search")
def search_wikipedia(q: str, limit: int = 10):
    """Search Wikipedia articles matching the query string."""
    return wikipedia_scraper.search(q, limit)
+
+
@router.post("/wikipedia/entity", response_model=EntityResponse)
async def import_from_wikipedia(
    title: str,
    entity_type: str = "person",
    project_id: Optional[str] = None,
    auto_extract: bool = True,
    db: Session = Depends(get_scoped_db)
):
    """
    Import an entity from Wikipedia.

    Args:
        title: Wikipedia article title to import.
        entity_type: person, organization or location (anything else falls
            back to person scraping).
        project_id: optional project to associate the entity with.
        auto_extract: when True, an LLM extracts related entities and
            relationships from the article text (best-effort — failures are
            logged and swallowed, the main entity is still returned).

    Returns:
        The imported (or pre-existing) main Entity.

    Raises:
        HTTPException 404: when the article is not found on Wikipedia.
    """
    # Idempotency guard: a Wikipedia entity with the same title is reused.
    existing = db.query(Entity).filter(
        Entity.name == title,
        Entity.source == "wikipedia"
    ).first()

    if existing:
        return existing

    # Scrape based on type
    if entity_type == "person":
        data = wikipedia_scraper.scrape_person(title)
    elif entity_type == "organization":
        data = wikipedia_scraper.scrape_organization(title)
    elif entity_type == "location":
        data = wikipedia_scraper.scrape_location(title)
    else:
        data = wikipedia_scraper.scrape_person(title)  # default

    if not data:
        raise HTTPException(status_code=404, detail="Article not found on Wikipedia")

    # Create main entity with project_id
    entity = Entity(**data)
    entity.project_id = project_id
    db.add(entity)
    db.commit()
    db.refresh(entity)

    # Auto-extract entities and relationships using LLM (best-effort).
    if auto_extract and data.get("description"):
        try:
            # Limit text to avoid token limits
            text_to_analyze = data["description"][:3000]
            result = await entity_extractor.extract(text_to_analyze)

            # Create extracted entities, mapping extracted name -> ORM object
            # so relationship creation below can resolve endpoints.
            created_entities = {}
            for ext_entity in result.entities:
                # Skip if same as main entity
                if ext_entity.name.lower() == title.lower():
                    created_entities[ext_entity.name] = entity
                    continue

                # Check if entity exists (by similar name)
                # NOTE(review): substring ilike match can link to the wrong
                # entity on short/common names — confirm acceptable precision.
                existing_ent = db.query(Entity).filter(
                    Entity.name.ilike(f"%{ext_entity.name}%")
                ).first()

                if existing_ent:
                    created_entities[ext_entity.name] = existing_ent
                else:
                    # Get coordinates for location entities
                    lat, lng = None, None
                    if ext_entity.type == "location":
                        coords = await geocode(ext_entity.name)
                        if coords:
                            lat, lng = coords

                    # Parse event_date (getattr: field may be absent on the
                    # extractor's schema — presumably optional; confirm).
                    event_date = parse_event_date(getattr(ext_entity, 'event_date', None))

                    new_ent = Entity(
                        name=ext_entity.name,
                        type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person",
                        description=ext_entity.description or ext_entity.role,
                        source="wikipedia_extraction",
                        latitude=lat,
                        longitude=lng,
                        event_date=event_date,
                        project_id=project_id,
                        properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "extracted_from": title}
                    )
                    db.add(new_ent)
                    # Commit per entity so its generated id is available below.
                    db.commit()
                    db.refresh(new_ent)
                    created_entities[ext_entity.name] = new_ent

            # Create relationships, resolving endpoints first from the batch
            # just created, then by fuzzy name lookup in the database.
            for rel in result.relationships:
                source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first()
                target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first()

                if source_ent and target_ent and source_ent.id != target_ent.id:
                    # Check if relationship exists (dedupe on source/target/type)
                    existing_rel = db.query(Relationship).filter(
                        Relationship.source_id == source_ent.id,
                        Relationship.target_id == target_ent.id,
                        Relationship.type == rel.relationship_type
                    ).first()

                    if not existing_rel:
                        # Parse relationship event_date
                        rel_event_date = parse_event_date(getattr(rel, 'event_date', None))

                        new_rel = Relationship(
                            source_id=source_ent.id,
                            target_id=target_ent.id,
                            type=rel.relationship_type,
                            event_date=rel_event_date,
                            properties={"context": rel.context, "extracted_from": title}
                        )
                        db.add(new_rel)

            db.commit()

        except Exception as e:
            print(f"NER extraction error: {e}")
            # Continue without extraction if it fails

    return entity
+
+
+# ========== News ==========
+
@router.get("/news/feeds")
def list_available_feeds():
    """Return the names of the configured RSS news feeds."""
    return [feed_name for feed_name in news_service.RSS_FEEDS]
+
+
@router.get("/news/fetch")
def fetch_news(feed: Optional[str] = None):
    """Fetch articles from one named RSS feed, or from every feed when none is given."""
    if not feed:
        return news_service.fetch_all_feeds()

    if feed not in news_service.RSS_FEEDS:
        raise HTTPException(status_code=404, detail="Feed not found")

    return news_service.fetch_feed(news_service.RSS_FEEDS[feed])
+
+
@router.get("/news/search")
def search_news(q: str):
    """Keyword search for news articles via Google News."""
    resultados = news_service.search_news(q)
    return resultados
+
+
@router.post("/news/import")
async def import_news(
    query: Optional[str] = None,
    feed: Optional[str] = None,
    auto_extract: bool = True,
    db: Session = Depends(get_scoped_db)
):
    """
    Import news articles as documents.

    Source selection: `query` searches Google News; otherwise `feed` pulls a
    single named RSS feed; otherwise all feeds are pulled. Articles already
    present (matched by URL) are skipped.

    Args:
        auto_extract: when True, an LLM extracts entities/relationships from
            each imported article (best-effort — failures are logged and
            swallowed per article).

    Returns:
        Summary dict: imported count, total articles found, entities extracted.

    Raises:
        HTTPException 404: when a named feed does not exist.
    """
    if query:
        articles = news_service.search_news(query)
    elif feed:
        if feed not in news_service.RSS_FEEDS:
            raise HTTPException(status_code=404, detail="Feed not found")
        articles = news_service.fetch_feed(news_service.RSS_FEEDS[feed])
    else:
        articles = news_service.fetch_all_feeds()

    imported = 0
    extracted_entities = 0

    for article in articles:
        # Check if document already exists (by URL) — idempotent re-import.
        if article.get("url"):
            existing = db.query(Document).filter(
                Document.source_url == article["url"]
            ).first()
            if existing:
                continue

        doc_data = news_service.to_document(article)
        doc = Document(**doc_data)
        db.add(doc)
        # Commit per article so a later failure doesn't roll back prior imports.
        db.commit()
        imported += 1

        # Extract entities from article content (title + description only).
        if auto_extract:
            try:
                text_to_analyze = f"{article.get('title', '')} {article.get('description', '')}".strip()
                # Skip near-empty articles; cap input to bound token usage.
                if len(text_to_analyze) >= 20:
                    result = await entity_extractor.extract(text_to_analyze[:2000])

                    # Map extracted name -> ORM object for relationship linking.
                    created_entities = {}
                    for ext_entity in result.entities:
                        # Check if entity exists
                        # NOTE(review): substring ilike match can link to the
                        # wrong entity on short names — confirm acceptable.
                        existing_ent = db.query(Entity).filter(
                            Entity.name.ilike(f"%{ext_entity.name}%")
                        ).first()

                        if existing_ent:
                            created_entities[ext_entity.name] = existing_ent
                        else:
                            # Get coordinates for location entities
                            lat, lng = None, None
                            if ext_entity.type == "location":
                                coords = await geocode(ext_entity.name)
                                if coords:
                                    lat, lng = coords

                            new_ent = Entity(
                                name=ext_entity.name,
                                type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person",
                                description=ext_entity.description or ext_entity.role,
                                source="news_extraction",
                                latitude=lat,
                                longitude=lng,
                                properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "from_article": article.get('title', '')}
                            )
                            db.add(new_ent)
                            # Commit so the generated id is usable below.
                            db.commit()
                            db.refresh(new_ent)
                            created_entities[ext_entity.name] = new_ent
                            extracted_entities += 1

                    # Create relationships, resolving endpoints from this batch
                    # first, then by fuzzy name lookup in the database.
                    for rel in result.relationships:
                        source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first()
                        target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first()

                        if source_ent and target_ent and source_ent.id != target_ent.id:
                            # Dedupe on (source, target, type).
                            existing_rel = db.query(Relationship).filter(
                                Relationship.source_id == source_ent.id,
                                Relationship.target_id == target_ent.id,
                                Relationship.type == rel.relationship_type
                            ).first()

                            if not existing_rel:
                                new_rel = Relationship(
                                    source_id=source_ent.id,
                                    target_id=target_ent.id,
                                    type=rel.relationship_type,
                                    properties={"context": rel.context}
                                )
                                db.add(new_rel)

                    db.commit()

            except Exception as e:
                print(f"NER extraction error for article: {e}")
                # Continue without extraction

    return {
        "message": f"Imported {imported} articles",
        "total_found": len(articles),
        "extracted_entities": extracted_entities
    }
+
+
+# ========== Manual Import ==========
+
@router.post("/bulk/entities")
def bulk_import_entities(
    entities: List[dict],
    db: Session = Depends(get_scoped_db)
):
    """
    Bulk-create entities from a list of dicts (e.g. rows parsed from CSV/JSON).
    Missing fields fall back to sensible defaults; everything is committed in
    a single transaction.
    """
    records = [
        Entity(
            type=item.get("type", "unknown"),
            name=item.get("name", "Unnamed"),
            description=item.get("description"),
            properties=item.get("properties", {}),
            latitude=item.get("latitude"),
            longitude=item.get("longitude"),
            source=item.get("source", "manual")
        )
        for item in entities
    ]

    db.add_all(records)
    db.commit()

    return {"message": f"Imported {len(records)} entities"}
diff --git a/app/api/routes/investigate.py b/app/api/routes/investigate.py
new file mode 100644
index 0000000000000000000000000000000000000000..646857df8ac0eed0f99ac443367d25c7a6af1512
--- /dev/null
+++ b/app/api/routes/investigate.py
@@ -0,0 +1,207 @@
+"""
+Investigation API Routes - Build dossiers on companies and people
+"""
+from fastapi import APIRouter, HTTPException, Depends
+from pydantic import BaseModel, Field
+from typing import Optional, List, Dict, Any
+from sqlalchemy.orm import Session
+
+from app.services.investigation import (
+ investigar_empresa,
+ investigar_pessoa,
+ dossier_to_dict
+)
+from app.services.brazil_apis import consultar_cnpj
+from app.services.investigator_agent import investigator_agent
+from app.api.deps import get_scoped_db
+
+
+router = APIRouter(prefix="/investigate", tags=["Investigation"])
+
+
class InvestigateCompanyRequest(BaseModel):
    """Request to investigate a company"""
    # min_length=11 only enforces length; format/checksum validation is
    # presumably done downstream by the CNPJ service — confirm.
    cnpj: str = Field(..., min_length=11, description="CNPJ da empresa")
+
+
class InvestigatePersonRequest(BaseModel):
    """Request to investigate a person"""
    nome: str = Field(..., min_length=2, description="Nome da pessoa")
    # Optional: personal-data lookups are limited under LGPD.
    cpf: Optional[str] = Field(None, description="CPF (opcional)")
+
+
class DossierResponse(BaseModel):
    """Dossier response shared by the company and person investigation endpoints."""
    tipo: str                 # dossier kind — presumably "empresa"/"pessoa"; confirm in service
    alvo: str                 # investigation target (name)
    cnpj_cpf: Optional[str]   # required field, may be null (no default)
    red_flags: List[str]      # human-readable warnings found during investigation
    score_risco: int          # aggregated risk score computed by the service
    data_geracao: str         # generation timestamp, string-encoded
    fonte_dados: List[str]    # data sources consulted
    secoes: Dict[str, Any]    # free-form dossier sections keyed by section name
+
+
class CNPJResponse(BaseModel):
    """Quick CNPJ lookup response"""
    cnpj: str
    razao_social: str          # registered legal name
    nome_fantasia: str         # trade name
    situacao: str              # registration status
    data_abertura: str         # opening date, string-encoded
    capital_social: float      # declared share capital
    endereco: str              # single formatted address line (built by the route)
    telefone: str
    email: str
    atividade: str             # "CNAE code - description" (built by the route)
    socios: List[Dict[str, Any]]   # partners/owners, free-form dicts from the API
+
+
@router.post("/company", response_model=DossierResponse)
async def investigate_company(request: InvestigateCompanyRequest):
    """
    Build a comprehensive dossier on a company: cadastral CNPJ data, partners
    and owners, sanctions lists (CEIS, CNEP, CEPIM), news/media mentions and
    related entities — plus a risk score and red flags.
    """
    try:
        dossier = await investigar_empresa(request.cnpj)
        payload = dossier_to_dict(dossier)
        return DossierResponse(**payload)
    except Exception as exc:
        # Surface any service failure as a 500 with the underlying message.
        raise HTTPException(status_code=500, detail=str(exc))
+
+
@router.post("/person", response_model=DossierResponse)
async def investigate_person(request: InvestigatePersonRequest):
    """
    Build a dossier on a person. Due to LGPD, personal data is limited —
    results rely mainly on web search for public information.
    """
    try:
        dossier = await investigar_pessoa(request.nome, request.cpf)
        payload = dossier_to_dict(dossier)
        return DossierResponse(**payload)
    except Exception as exc:
        # Surface any service failure as a 500 with the underlying message.
        raise HTTPException(status_code=500, detail=str(exc))
+
+
@router.get("/cnpj/{cnpj}", response_model=CNPJResponse)
async def lookup_cnpj(cnpj: str):
    """Quick CNPJ lookup — returns basic cadastral data; 404 when not found."""
    try:
        empresa = await consultar_cnpj(cnpj)

        if not empresa:
            raise HTTPException(status_code=404, detail="CNPJ não encontrado")

        # Flatten address and main activity into single display strings.
        endereco = f"{empresa.logradouro}, {empresa.numero} - {empresa.bairro}, {empresa.cidade}/{empresa.uf}"
        atividade = f"{empresa.cnae_principal} - {empresa.cnae_descricao}"

        return CNPJResponse(
            cnpj=empresa.cnpj,
            razao_social=empresa.razao_social,
            nome_fantasia=empresa.nome_fantasia,
            situacao=empresa.situacao,
            data_abertura=empresa.data_abertura,
            capital_social=empresa.capital_social,
            endereco=endereco,
            telefone=empresa.telefone,
            email=empresa.email,
            atividade=atividade,
            socios=empresa.socios
        )

    except HTTPException:
        # Let the 404 pass through untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
+
+
+# ===========================================
+# Autonomous Investigation Agent
+# ===========================================
+
class AgentInvestigateRequest(BaseModel):
    """Request for autonomous investigation"""
    mission: str = Field(..., min_length=5, description="Missão de investigação em linguagem natural")
    # Upper bound of agent reasoning/tool loops; hard-capped at 20.
    max_iterations: int = Field(10, ge=1, le=20, description="Máximo de iterações do agente")
+
+
class FindingResponse(BaseModel):
    """A finding from investigation"""
    title: str
    content: str
    source: str      # where the agent found this (tool/site identifier)
    timestamp: str   # string-encoded; format defined by the agent service
+
+
class AgentInvestigateResponse(BaseModel):
    """Response from autonomous investigation"""
    mission: str                      # echo of the requested mission
    status: str                       # terminal status reported by the agent
    report: str                       # final narrative report
    findings: List[FindingResponse]   # individual evidence items
    entities_discovered: int          # count (not the entities themselves)
    connections_mapped: int           # count of mapped connections
    iterations: int                   # iterations actually executed
    tools_used: List[str]             # names of tools the agent invoked
+
+
@router.post("/agent", response_model=AgentInvestigateResponse)
async def investigate_with_agent(
    request: AgentInvestigateRequest,
    db: Session = Depends(get_scoped_db)
):
    """
    Autonomous investigation with AI agent.

    The agent will:
    1. Search NUMIDIUM for existing entities
    2. Query CNPJ data for Brazilian companies
    3. Search the web for news and public info
    4. Follow leads and connections
    5. Generate a comprehensive report

    Example missions:
    - "Investigue a rede de empresas de João Silva"
    - "Descubra os sócios da empresa CNPJ 11.222.333/0001-44"
    - "Pesquise sobre a empresa XYZ e suas conexões"

    Raises:
        HTTPException 500: on any agent failure (detail carries the message).
    """
    try:
        # Delegate the whole loop to the agent service; the db handle lets it
        # read/write NUMIDIUM entities while investigating.
        result = await investigator_agent.investigate(
            mission=request.mission,
            db=db,
            max_iterations=request.max_iterations
        )

        # Map the agent's internal result object onto the public schema.
        # Discovered entities/connections are returned as counts only.
        return AgentInvestigateResponse(
            mission=result.mission,
            status=result.status,
            report=result.report,
            findings=[
                FindingResponse(
                    title=f.title,
                    content=f.content,
                    source=f.source,
                    timestamp=f.timestamp
                )
                for f in result.findings
            ],
            entities_discovered=len(result.entities_discovered),
            connections_mapped=len(result.connections_mapped),
            iterations=result.iterations,
            tools_used=result.tools_used
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
+
diff --git a/app/api/routes/projects.py b/app/api/routes/projects.py
new file mode 100644
index 0000000000000000000000000000000000000000..d283d8b2d566c49e7a32cba8acc10b39307b4299
--- /dev/null
+++ b/app/api/routes/projects.py
@@ -0,0 +1,135 @@
+"""
+Projects API Routes - Workspace management
+"""
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+from typing import Optional, List
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from app.api.deps import get_scoped_db
+from app.models import Project, Entity, Relationship
+
+
+router = APIRouter(prefix="/projects", tags=["Projects"])
+
+
+class ProjectCreate(BaseModel):
+    """Payload for creating a project; also reused by PUT /projects/{id}."""
+    name: str
+    description: Optional[str] = None
+    # Default accent color (hex) and icon name shown by the frontend
+    color: str = "#00d4ff"
+    icon: str = "folder"
+
+
+class ProjectResponse(BaseModel):
+    """Project as returned by the API, including a derived entity count."""
+    id: str
+    name: str
+    description: Optional[str]
+    color: str
+    icon: str
+    # Computed per request (COUNT over Entity.project_id), not stored on the model
+    entity_count: int = 0
+    created_at: datetime
+
+    class Config:
+        # Allow construction directly from SQLAlchemy ORM objects
+        from_attributes = True
+
+
+@router.get("", response_model=List[ProjectResponse])
+def list_projects(db: Session = Depends(get_scoped_db)):
+    """List all projects, newest first, each with its entity count."""
+    projects = db.query(Project).order_by(Project.created_at.desc()).all()
+
+    result = []
+    for p in projects:
+        # NOTE(review): one COUNT query per project (N+1). Fine for a handful
+        # of workspaces; switch to a single grouped count if projects grow.
+        entity_count = db.query(Entity).filter(Entity.project_id == p.id).count()
+        result.append(ProjectResponse(
+            id=p.id,
+            name=p.name,
+            description=p.description,
+            color=p.color,
+            icon=p.icon,
+            entity_count=entity_count,
+            created_at=p.created_at
+        ))
+
+    return result
+
+
+@router.post("", response_model=ProjectResponse)
+def create_project(project: ProjectCreate, db: Session = Depends(get_scoped_db)):
+ """Create a new project"""
+ new_project = Project(
+ name=project.name,
+ description=project.description,
+ color=project.color,
+ icon=project.icon
+ )
+ db.add(new_project)
+ db.commit()
+ db.refresh(new_project)
+
+ return ProjectResponse(
+ id=new_project.id,
+ name=new_project.name,
+ description=new_project.description,
+ color=new_project.color,
+ icon=new_project.icon,
+ entity_count=0,
+ created_at=new_project.created_at
+ )
+
+
+@router.get("/{project_id}", response_model=ProjectResponse)
+def get_project(project_id: str, db: Session = Depends(get_scoped_db)):
+ """Get project by ID"""
+ project = db.query(Project).filter(Project.id == project_id).first()
+
+ if not project:
+ raise HTTPException(status_code=404, detail="Project not found")
+
+ entity_count = db.query(Entity).filter(Entity.project_id == project_id).count()
+
+ return ProjectResponse(
+ id=project.id,
+ name=project.name,
+ description=project.description,
+ color=project.color,
+ icon=project.icon,
+ entity_count=entity_count,
+ created_at=project.created_at
+ )
+
+
+@router.delete("/{project_id}")
+def delete_project(project_id: str, db: Session = Depends(get_scoped_db)):
+ """Delete project and optionally its entities"""
+ project = db.query(Project).filter(Project.id == project_id).first()
+
+ if not project:
+ raise HTTPException(status_code=404, detail="Project not found")
+
+ # Set entities and relationships to no project (null)
+ db.query(Entity).filter(Entity.project_id == project_id).update({"project_id": None})
+ db.query(Relationship).filter(Relationship.project_id == project_id).update({"project_id": None})
+
+ db.delete(project)
+ db.commit()
+
+ return {"message": f"Project '{project.name}' deleted"}
+
+
+@router.put("/{project_id}")
+def update_project(project_id: str, project: ProjectCreate, db: Session = Depends(get_scoped_db)):
+ """Update project"""
+ existing = db.query(Project).filter(Project.id == project_id).first()
+
+ if not existing:
+ raise HTTPException(status_code=404, detail="Project not found")
+
+ existing.name = project.name
+ existing.description = project.description
+ existing.color = project.color
+ existing.icon = project.icon
+ db.commit()
+
+ return {"message": "Project updated"}
diff --git a/app/api/routes/relationships.py b/app/api/routes/relationships.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5887de9ce8df297614adf63c4db8d365fc33114
--- /dev/null
+++ b/app/api/routes/relationships.py
@@ -0,0 +1,76 @@
+"""
+Relationship CRUD Routes
+"""
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.orm import Session
+from typing import List, Optional
+
+from app.api.deps import get_scoped_db
+from app.models import Relationship, Entity
+from app.schemas import RelationshipCreate, RelationshipResponse
+
+router = APIRouter(prefix="/relationships", tags=["Relationships"])
+
+
+@router.get("/", response_model=List[RelationshipResponse])
+def list_relationships(
+ type: Optional[str] = None,
+ source_id: Optional[str] = None,
+ target_id: Optional[str] = None,
+ limit: int = Query(default=50, le=200),
+ db: Session = Depends(get_scoped_db)
+):
+ """Lista relacionamentos com filtros opcionais"""
+ query = db.query(Relationship)
+
+ if type:
+ query = query.filter(Relationship.type == type)
+ if source_id:
+ query = query.filter(Relationship.source_id == source_id)
+ if target_id:
+ query = query.filter(Relationship.target_id == target_id)
+
+ return query.limit(limit).all()
+
+
+@router.get("/types")
+def get_relationship_types(db: Session = Depends(get_scoped_db)):
+ """Retorna todos os tipos de relacionamento unicos"""
+ types = db.query(Relationship.type).distinct().all()
+ return [t[0] for t in types]
+
+
+@router.post("/", response_model=RelationshipResponse, status_code=201)
+def create_relationship(
+ rel: RelationshipCreate,
+ db: Session = Depends(get_scoped_db)
+):
+ """Cria um novo relacionamento entre entidades"""
+ source = db.query(Entity).filter(Entity.id == rel.source_id).first()
+ target = db.query(Entity).filter(Entity.id == rel.target_id).first()
+
+ if not source:
+ raise HTTPException(status_code=404, detail="Source entity not found")
+ if not target:
+ raise HTTPException(status_code=404, detail="Target entity not found")
+
+ db_rel = Relationship(**rel.model_dump())
+ db.add(db_rel)
+ db.commit()
+ db.refresh(db_rel)
+ return db_rel
+
+
+@router.delete("/{relationship_id}")
+def delete_relationship(
+ relationship_id: str,
+ db: Session = Depends(get_scoped_db)
+):
+ """Deleta um relacionamento"""
+ db_rel = db.query(Relationship).filter(Relationship.id == relationship_id).first()
+ if not db_rel:
+ raise HTTPException(status_code=404, detail="Relationship not found")
+
+ db.delete(db_rel)
+ db.commit()
+ return {"message": "Relationship deleted"}
diff --git a/app/api/routes/research.py b/app/api/routes/research.py
new file mode 100644
index 0000000000000000000000000000000000000000..41eb6efdb31bbc7cb0da78df28ce780a75fc0f9b
--- /dev/null
+++ b/app/api/routes/research.py
@@ -0,0 +1,158 @@
+"""
+Research API Routes - Deep research with automatic entity extraction
+"""
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, Field
+from typing import Optional, List
+import traceback
+from sqlalchemy.orm import Session
+
+from app.api.deps import get_scoped_db
+from app.services import lancer
+from app.services.nlp import entity_extractor
+from app.services.geocoding import geocode
+from app.models.entity import Entity, Relationship
+
+
+router = APIRouter(prefix="/research", tags=["Research"])
+
+
+class ResearchRequest(BaseModel):
+    """Request model for research"""
+    query: str = Field(..., min_length=3, description="Research query")
+    # One of "search" (fast), "deep" (multi-dimensional) or "heavy" (with scraping)
+    mode: str = Field(default="search", description="Research mode: search, deep, heavy")
+    max_results: int = Field(default=10, le=20)
+    # When True, NER runs over the result text and persists entities/relationships
+    auto_extract: bool = Field(default=True, description="Auto-extract entities using NER")
+
+
+class ResearchResponse(BaseModel):
+    """Response model for research"""
+    query: str
+    # AI-synthesized answer; may be absent depending on mode/provider
+    answer: Optional[str]
+    sources: List[dict]
+    citations: List[dict]
+    # Counts of NEW rows persisted during this request (not totals)
+    extracted_entities: int
+    extracted_relationships: int
+    processing_time_ms: float
+
+
+@router.post("", response_model=ResearchResponse)
+async def research(request: ResearchRequest, db: Session = Depends(get_scoped_db)):
+    """
+    Perform AI-powered research using Lancer API and optionally extract entities.
+
+    Modes:
+    - search: Fast search with AI synthesis
+    - deep: Multi-dimensional deep research (slower, more comprehensive)
+    - heavy: Search with full content scraping
+
+    When auto_extract is on, NER runs over the result text and new entities /
+    relationships are persisted; extraction failures are logged but never fail
+    the research request itself.
+    """
+    try:
+        # Call Lancer API based on mode
+        if request.mode == "deep":
+            result = await lancer.deep_research(request.query)
+        elif request.mode == "heavy":
+            result = await lancer.heavy_search(request.query, request.max_results)
+        else:
+            result = await lancer.search(request.query, request.max_results)
+
+        extracted_entities = 0
+        extracted_relationships = 0
+
+        # Extract entities if enabled
+        if request.auto_extract and result.raw_text:
+            try:
+                # Limit text to avoid token limits
+                text_to_analyze = result.raw_text[:5000]
+                ner_result = await entity_extractor.extract(text_to_analyze)
+
+                # Maps extracted name -> persisted Entity (new or pre-existing)
+                created_entities = {}
+
+                # Create entities
+                for entity in ner_result.entities:
+                    # Check if exists — fuzzy ilike match, so near-duplicates
+                    # are deduplicated against existing rows
+                    existing = db.query(Entity).filter(
+                        Entity.name.ilike(f"%{entity.name}%")
+                    ).first()
+
+                    if existing:
+                        created_entities[entity.name] = existing
+                    else:
+                        # Geocode if location
+                        lat, lng = None, None
+                        if entity.type == "location":
+                            coords = await geocode(entity.name)
+                            if coords:
+                                lat, lng = coords
+
+                        # Unknown types are coerced to "person" as a fallback
+                        new_entity = Entity(
+                            name=entity.name,
+                            type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person",
+                            description=entity.description or entity.role or "",
+                            source="lancer_research",
+                            latitude=lat,
+                            longitude=lng,
+                            properties={
+                                "role": entity.role,
+                                "aliases": entity.aliases,
+                                "research_query": request.query
+                            }
+                        )
+                        db.add(new_entity)
+                        # NOTE(review): committing inside the loop persists each
+                        # entity immediately (needed for the id via refresh) but
+                        # means a later failure leaves a partially-saved batch.
+                        db.commit()
+                        db.refresh(new_entity)
+                        created_entities[entity.name] = new_entity
+                        extracted_entities += 1
+
+                # Create relationships — endpoints resolved from this batch
+                # first, then by fuzzy name match against the whole table
+                for rel in ner_result.relationships:
+                    source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first()
+                    target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first()
+
+                    # Skip self-loops and pairs already linked with this type
+                    if source_ent and target_ent and source_ent.id != target_ent.id:
+                        existing_rel = db.query(Relationship).filter(
+                            Relationship.source_id == source_ent.id,
+                            Relationship.target_id == target_ent.id,
+                            Relationship.type == rel.relationship_type
+                        ).first()
+
+                        if not existing_rel:
+                            new_rel = Relationship(
+                                source_id=source_ent.id,
+                                target_id=target_ent.id,
+                                type=rel.relationship_type,
+                                properties={"context": rel.context, "research_query": request.query}
+                            )
+                            db.add(new_rel)
+                            extracted_relationships += 1
+
+                db.commit()
+
+            except Exception as e:
+                # Best-effort: NER failure must not fail the research response
+                print(f"NER extraction error: {e}")
+                traceback.print_exc()
+
+        # Prepare sources for response (top 10, content truncated)
+        sources = [
+            {
+                "title": r.title,
+                "url": r.url,
+                "content": r.content[:300] if r.content else "",
+                "score": r.score
+            }
+            for r in result.results[:10]
+        ]
+
+        return ResearchResponse(
+            query=result.query,
+            answer=result.answer,
+            sources=sources,
+            citations=result.citations,
+            extracted_entities=extracted_entities,
+            extracted_relationships=extracted_relationships,
+            processing_time_ms=result.processing_time_ms
+        )
+
+    except Exception as e:
+        print(f"Research error: {e}")
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/api/routes/search.py b/app/api/routes/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..27ad925fb6abc0eb121ff2660bc06fd55fd322f0
--- /dev/null
+++ b/app/api/routes/search.py
@@ -0,0 +1,126 @@
+"""
+Search and Analytics Routes
+"""
+from fastapi import APIRouter, Depends, Query
+from sqlalchemy.orm import Session
+from sqlalchemy import or_, func
+from typing import Optional
+
+from app.api.deps import get_scoped_db
+from app.models import Entity, Relationship, Event, Document
+from app.schemas import SearchResult, SystemStats
+
+router = APIRouter(prefix="/search", tags=["Search"])
+
+
+@router.get("", response_model=SearchResult)
+def global_search(
+ q: str = Query(..., min_length=2, description="Search query"),
+ types: Optional[str] = Query(None, description="Entity types (comma-separated)"),
+ limit: int = Query(default=20, le=100),
+ db: Session = Depends(get_scoped_db)
+):
+ """
+ Busca global em todas as entidades, eventos e documentos.
+ """
+ search_term = f"%{q}%"
+ type_filter = types.split(",") if types else None
+
+ entity_query = db.query(Entity).filter(
+ or_(
+ Entity.name.ilike(search_term),
+ Entity.description.ilike(search_term)
+ )
+ )
+ if type_filter:
+ entity_query = entity_query.filter(Entity.type.in_(type_filter))
+ entities = entity_query.limit(limit).all()
+
+ events = db.query(Event).filter(
+ or_(
+ Event.title.ilike(search_term),
+ Event.description.ilike(search_term)
+ )
+ ).limit(limit).all()
+
+ documents = db.query(Document).filter(
+ or_(
+ Document.title.ilike(search_term),
+ Document.content.ilike(search_term)
+ )
+ ).limit(limit).all()
+
+ return SearchResult(
+ entities=entities,
+ events=events,
+ documents=documents
+ )
+
+
+@router.get("/stats", response_model=SystemStats)
+def get_system_stats(db: Session = Depends(get_scoped_db)):
+ """
+ Retorna estatisticas gerais do sistema.
+ """
+ total_entities = db.query(Entity).count()
+ total_relationships = db.query(Relationship).count()
+ total_events = db.query(Event).count()
+ total_documents = db.query(Document).count()
+
+ type_counts = db.query(
+ Entity.type,
+ func.count(Entity.id)
+ ).group_by(Entity.type).all()
+
+ entities_by_type = {t: c for t, c in type_counts}
+
+ recent = db.query(Entity).order_by(Entity.created_at.desc()).limit(10).all()
+ recent_activity = [
+ {
+ "id": e.id,
+ "type": e.type,
+ "name": e.name,
+ "created_at": e.created_at.isoformat()
+ }
+ for e in recent
+ ]
+
+ return SystemStats(
+ total_entities=total_entities,
+ total_relationships=total_relationships,
+ total_events=total_events,
+ total_documents=total_documents,
+ entities_by_type=entities_by_type,
+ recent_activity=recent_activity
+ )
+
+
+@router.get("/geo")
+def get_geo_data(
+ entity_type: Optional[str] = None,
+ db: Session = Depends(get_scoped_db)
+):
+ """
+ Retorna entidades com geolocalizacao.
+ """
+ query = db.query(Entity).filter(
+ Entity.latitude.isnot(None),
+ Entity.longitude.isnot(None)
+ )
+
+ if entity_type:
+ query = query.filter(Entity.type == entity_type)
+
+ entities = query.all()
+
+ return [
+ {
+ "id": e.id,
+ "type": e.type,
+ "name": e.name,
+ "lat": e.latitude,
+ "lng": e.longitude,
+ "properties": e.properties
+ }
+ for e in entities
+ ]
diff --git a/app/api/routes/session.py b/app/api/routes/session.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81ea29a5363a585f72aa5f0df7e2798292c189e
--- /dev/null
+++ b/app/api/routes/session.py
@@ -0,0 +1,44 @@
+"""
+Session management routes
+"""
+from fastapi import APIRouter, Header, Cookie, Response, Request
+from typing import Optional
+import uuid
+
+from app.core.database import create_new_session_id
+from app.config import settings
+
+router = APIRouter(prefix="/session", tags=["Session"])
+
+
+@router.post("/create")
+def create_session(response: Response, request: Request):
+ """Create a new session and return session_id"""
+ session_id = create_new_session_id()
+ secure = settings.cookie_secure
+ samesite = settings.cookie_samesite
+ proto = request.headers.get("x-forwarded-proto", request.url.scheme)
+ if proto != "https" and secure:
+ secure = False
+ samesite = "lax"
+ response.set_cookie(
+ key="numidium_session",
+ value=session_id,
+ max_age=60*60*24*365, # 1 year
+ httponly=True,
+ samesite=samesite,
+ secure=secure
+ )
+ return {"session_id": session_id}
+
+
+@router.get("/current")
+def get_current_session(
+ numidium_session: Optional[str] = Cookie(None),
+ x_session_id: Optional[str] = Header(None)
+):
+ """Get current session ID"""
+ session_id = x_session_id or numidium_session
+ if not session_id:
+ return {"session_id": None, "message": "No session. Call POST /session/create"}
+ return {"session_id": session_id}
diff --git a/app/api/routes/timeline.py b/app/api/routes/timeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa45453faf038d34277ffb6a5f1481a2748a8b0e
--- /dev/null
+++ b/app/api/routes/timeline.py
@@ -0,0 +1,165 @@
+"""
+Timeline API Routes - Temporal view of entities and relationships
+"""
+from fastapi import APIRouter, Depends, Query
+from pydantic import BaseModel
+from typing import Optional, List, Dict, Any
+from datetime import datetime, timedelta
+from collections import defaultdict
+from sqlalchemy.orm import Session
+
+from app.api.deps import get_scoped_db
+from app.models.entity import Entity, Relationship
+
+
+router = APIRouter(prefix="/timeline", tags=["Timeline"])
+
+
+class TimelineEvent(BaseModel):
+    """One item on the timeline — either an entity or a relationship."""
+    id: str
+    type: str  # "entity" or "relationship"
+    # Only set when type == "entity"
+    entity_type: Optional[str] = None
+    name: str
+    description: Optional[str] = None
+    # ISO-8601 string, used both for sorting and for day grouping
+    date: str
+    icon: str
+
+
+class TimelineGroup(BaseModel):
+    """All timeline events for one calendar day."""
+    date: str   # YYYY-MM-DD key
+    label: str  # human-readable form, e.g. "05 Jan 2025"
+    events: List[TimelineEvent]
+
+
+class TimelineResponse(BaseModel):
+    """Timeline payload: day groups (newest first) plus total event count."""
+    groups: List[TimelineGroup]
+    total_events: int
+
+
+@router.get("", response_model=TimelineResponse)
+async def get_timeline(
+ days: int = Query(default=30, ge=1, le=365),
+ entity_type: Optional[str] = None,
+ limit: int = Query(default=100, ge=1, le=500),
+ db: Session = Depends(get_scoped_db)
+):
+ """
+ Get timeline of recent entities and relationships.
+ Groups events by date.
+ """
+ # Calculate date range
+ end_date = datetime.now()
+ start_date = end_date - timedelta(days=days)
+
+ events = []
+
+ # Get entities
+ query = db.query(Entity).filter(
+ Entity.created_at >= start_date
+ )
+
+ if entity_type:
+ query = query.filter(Entity.type == entity_type)
+
+ entities = query.order_by(Entity.created_at.desc()).limit(limit).all()
+
+ icon_map = {
+ "person": "👤",
+ "organization": "🏢",
+ "location": "📍",
+ "event": "📅",
+ "concept": "💡",
+ "product": "📦"
+ }
+
+ for e in entities:
+ # Prefer event_date over created_at
+ date = e.event_date if e.event_date else e.created_at
+ events.append(TimelineEvent(
+ id=e.id,
+ type="entity",
+ entity_type=e.type,
+ name=e.name,
+ description=e.description[:100] if e.description else None,
+ date=date.isoformat() if date else datetime.now().isoformat(),
+ icon=icon_map.get(e.type, "📄")
+ ))
+
+ # Get relationships
+ relationships = db.query(Relationship).filter(
+ Relationship.created_at >= start_date
+ ).order_by(Relationship.created_at.desc()).limit(limit // 2).all()
+
+ for r in relationships:
+ source = db.query(Entity).filter(Entity.id == r.source_id).first()
+ target = db.query(Entity).filter(Entity.id == r.target_id).first()
+
+ if source and target:
+ # Prefer event_date over created_at
+ date = r.event_date if r.event_date else r.created_at
+ events.append(TimelineEvent(
+ id=r.id,
+ type="relationship",
+ name=f"{source.name} → {target.name}",
+ description=r.type,
+ date=date.isoformat() if date else datetime.now().isoformat(),
+ icon="🔗"
+ ))
+
+ # Sort by date
+ events.sort(key=lambda x: x.date, reverse=True)
+
+ # Group by date
+ groups_dict = defaultdict(list)
+ for event in events:
+ date_key = event.date[:10] # YYYY-MM-DD
+ groups_dict[date_key].append(event)
+
+ # Format groups
+ groups = []
+ for date_key in sorted(groups_dict.keys(), reverse=True):
+ try:
+ dt = datetime.fromisoformat(date_key)
+ label = dt.strftime("%d %b %Y")
+ except:
+ label = date_key
+
+ groups.append(TimelineGroup(
+ date=date_key,
+ label=label,
+ events=groups_dict[date_key]
+ ))
+
+ return TimelineResponse(
+ groups=groups,
+ total_events=len(events)
+ )
+
+
+@router.get("/stats")
+async def get_timeline_stats(db: Session = Depends(get_scoped_db)):
+ """Get statistics for timeline visualization"""
+
+ # Count entities by type
+ entity_counts = {}
+ for entity_type in ["person", "organization", "location", "event", "concept"]:
+ count = db.query(Entity).filter(Entity.type == entity_type).count()
+ entity_counts[entity_type] = count
+
+ # Count relationships
+ relationship_count = db.query(Relationship).count()
+
+ # Recent activity (last 7 days)
+ week_ago = datetime.now() - timedelta(days=7)
+ recent_entities = db.query(Entity).filter(Entity.created_at >= week_ago).count()
+ recent_relationships = db.query(Relationship).filter(Relationship.created_at >= week_ago).count()
+
+ return {
+ "entity_counts": entity_counts,
+ "relationship_count": relationship_count,
+ "recent_activity": {
+ "entities": recent_entities,
+ "relationships": recent_relationships,
+ "total": recent_entities + recent_relationships
+ }
+ }
diff --git a/app/config.py b/app/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..23f3497193305642c7ae08e7c907009e44c489f0
--- /dev/null
+++ b/app/config.py
@@ -0,0 +1,47 @@
+"""
+Numidium Backend Configuration
+"""
+from pydantic_settings import BaseSettings
+from functools import lru_cache
+import os
+
+
+class Settings(BaseSettings):
+    """Application settings, loaded from environment variables / .env file."""
+
+    # App Info
+    app_name: str = "Numidium"
+    app_version: str = "0.1.0"
+    debug: bool = False
+
+    # Database (default SQLite file; per-session DBs live under data/sessions)
+    database_url: str = "sqlite:///./data/numidium.db"
+
+    # APIs (optional — can be configured later)
+    newsapi_key: str = ""
+
+    # Cerebras API for LLM-based entity extraction
+    cerebras_api_key: str = ""
+
+    # AetherMap API for semantic search and NER
+    aethermap_url: str = "https://madras1-aethermap.hf.space"
+
+    # CORS — NOTE(review): wildcard origin; see middleware setup in app.main
+    cors_origins: list[str] = ["*"]
+
+    # Session cookie — SameSite=None requires Secure=True per browser rules;
+    # session routes downgrade both together on plain-HTTP requests
+    cookie_secure: bool = True
+    cookie_samesite: str = "none"
+
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    """Build Settings once and cache it (env/.env is read a single time)."""
+    return Settings()
+
+
+# Module-level singleton used throughout the app
+settings = get_settings()
diff --git a/app/core/__init__.py b/app/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e8825ce5959f9f016f6f5ed46c2a54fdd15d9e8
--- /dev/null
+++ b/app/core/__init__.py
@@ -0,0 +1,2 @@
+# Core module
+from app.core.database import get_db, init_db, Base
diff --git a/app/core/__pycache__/__init__.cpython-311.pyc b/app/core/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5dc2c47dff4d25a449c31d5b491838968bd8699
Binary files /dev/null and b/app/core/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/core/__pycache__/database.cpython-311.pyc b/app/core/__pycache__/database.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d66b606dc407b3d70e7e6b1d62893eb13ff9d42
Binary files /dev/null and b/app/core/__pycache__/database.cpython-311.pyc differ
diff --git a/app/core/database.py b/app/core/database.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fbd7f1d970d02b46df83e466a09287bfc0090be
--- /dev/null
+++ b/app/core/database.py
@@ -0,0 +1,115 @@
+"""
+Database configuration and session management
+Per-session databases - each user session gets its own SQLite file
+"""
+import os
+import re
+import uuid
+from typing import Optional
+
+from sqlalchemy import create_engine, text
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker, Session
+
+# Ensure data directory exists
+os.makedirs("data/sessions", exist_ok=True)
+
+# Base class for models
+Base = declarative_base()
+
+# Cache for session engines
+_session_engines = {}
+_session_makers = {}
+
+
+def get_session_engine(session_id: str):
+ """Get or create engine for a specific session"""
+ if session_id not in _session_engines:
+ db_path = f"data/sessions/{session_id}.db"
+ engine = create_engine(
+ f"sqlite:///./{db_path}",
+ connect_args={"check_same_thread": False}
+ )
+ _session_engines[session_id] = engine
+ _session_makers[session_id] = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+ # Initialize tables for this session
+ Base.metadata.create_all(bind=engine)
+ _run_migrations(engine)
+
+ return _session_engines[session_id]
+
+
+def get_session_db(session_id: str):
+    """Yield a Session bound to the given user session (generator dependency).
+
+    Ensures the per-session engine and tables exist first; the session is
+    always closed when the consumer finishes.
+    """
+    get_session_engine(session_id)  # Ensure engine exists
+    SessionLocal = _session_makers[session_id]
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+def get_db_for_session(session_id: str) -> Session:
+    """Direct (non-generator) session getter for routes; caller must close it."""
+    get_session_engine(session_id)
+    SessionLocal = _session_makers[session_id]
+    return SessionLocal()
+
+
+# Legacy - default database for backwards compatibility
+# NOTE(review): import placed mid-file, presumably to avoid a circular import
+# with app.config — confirm before moving it to the top of the module.
+from app.config import settings
+engine = create_engine(
+    settings.database_url,
+    connect_args={"check_same_thread": False}
+)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+
+def get_default_session() -> Session:
+    """Create a new session for the default (non-per-user) database."""
+    return SessionLocal()
+
+
+def get_db():
+    """Legacy generator dependency: yield a default-DB session, always closed."""
+    db = get_default_session()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+def _run_migrations(eng):
+ """Run migrations on an engine"""
+ with eng.connect() as conn:
+ try:
+ conn.execute(text("ALTER TABLE entities ADD COLUMN event_date DATETIME"))
+ conn.commit()
+ except Exception:
+ pass
+ try:
+ conn.execute(text("ALTER TABLE relationships ADD COLUMN event_date DATETIME"))
+ conn.commit()
+ except Exception:
+ pass
+ try:
+ conn.execute(text("ALTER TABLE entities ADD COLUMN project_id VARCHAR(36)"))
+ conn.commit()
+ except Exception:
+ pass
+ try:
+ conn.execute(text("ALTER TABLE relationships ADD COLUMN project_id VARCHAR(36)"))
+ conn.commit()
+ except Exception:
+ pass
+
+
+def init_db():
+    """Create default-database tables and apply additive migrations."""
+    Base.metadata.create_all(bind=engine)
+    _run_migrations(engine)
+
+
+def create_new_session_id() -> str:
+    """Generate a new session ID (random UUID4 string)."""
+    return str(uuid.uuid4())
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..7abfa0ca7fb7e31fc2a58e35b5528eb7b135bada
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,99 @@
+"""
+Numidium Backend - Main Application
+Plataforma de Inteligência e Análise de Dados
+"""
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from contextlib import asynccontextmanager
+
+from app.config import settings
+from app.core.database import init_db
+from app.api.routes import entities, relationships, events, search, ingest, analyze, graph, research, chat, investigate, dados_publicos, timeline, session, aethermap
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Startup and shutdown events"""
+    # Startup: create tables / run additive migrations on the default DB
+    init_db()
+    print("🚀 Numidium Backend started!")
+    print(f"📊 Database: {settings.database_url}")
+    yield
+    # Shutdown (no resources to release currently)
+    print("👋 Numidium Backend shutting down...")
+
+
+# Create FastAPI app
+app = FastAPI(
+    title="Numidium API",
+    description="""
+    ## 🔮 Sistema de Inteligência e Análise de Dados
+
+    Backend do VANTAGE - Uma plataforma para:
+    - 📥 Ingestão de dados de múltiplas fontes (Wikipedia, News, Manual)
+    - 🔗 Mapeamento de conexões entre entidades
+    - 🗺️ Visualização geográfica
+    - 📊 Análise de grafos e relacionamentos
+    - 🔍 Busca global
+    """,
+    version=settings.app_version,
+    lifespan=lifespan
+)
+
+# CORS middleware
+# NOTE(review): allow_origins defaults to ["*"] while allow_credentials=True;
+# browsers reject wildcard origins on credentialed requests per the CORS spec.
+# Confirm the deployed frontend origin list before relying on cookies.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.cors_origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Include routers — all mounted under the /api/v1 prefix
+app.include_router(entities.router, prefix="/api/v1")
+app.include_router(relationships.router, prefix="/api/v1")
+app.include_router(events.router, prefix="/api/v1")
+app.include_router(search.router, prefix="/api/v1")
+app.include_router(ingest.router, prefix="/api/v1")
+app.include_router(analyze.router, prefix="/api/v1")
+app.include_router(graph.router, prefix="/api/v1")
+app.include_router(research.router, prefix="/api/v1")
+app.include_router(chat.router, prefix="/api/v1")
+app.include_router(investigate.router, prefix="/api/v1")
+app.include_router(dados_publicos.router, prefix="/api/v1")
+app.include_router(timeline.router, prefix="/api/v1")
+app.include_router(session.router, prefix="/api/v1")
+app.include_router(aethermap.router, prefix="/api/v1/aethermap", tags=["aethermap"])
+
+
+@app.get("/")
+def root():
+ """Root endpoint - API info"""
+ return {
+ "name": "Numidium",
+ "version": settings.app_version,
+ "status": "online",
+ "docs": "/docs",
+ "description": "Sistema de Inteligência e Análise de Dados"
+ }
+
+
+@app.get("/health")
+def health_check():
+    """Health check endpoint for HF Spaces (liveness probe, no DB access)."""
+    return {"status": "healthy"}
+
+
+@app.get("/api/v1")
+def api_info():
+    """API v1 info"""
+    # NOTE(review): hardcoded "1.0.0" differs from settings.app_version
+    # ("0.1.0") — confirm which value this endpoint should report.
+    return {
+        "version": "1.0.0",
+        "endpoints": {
+            "entities": "/api/v1/entities",
+            "relationships": "/api/v1/relationships",
+            "events": "/api/v1/events",
+            "search": "/api/v1/search",
+            "ingest": "/api/v1/ingest"
+        }
+    }
diff --git a/app/models/__init__.py b/app/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dda9ada652332c2b420769a6ace731249e11cfd8
--- /dev/null
+++ b/app/models/__init__.py
@@ -0,0 +1,3 @@
+# Models module
+from app.models.entity import Entity, Relationship, Event, Document
+from app.models.project import Project
diff --git a/app/models/__pycache__/__init__.cpython-311.pyc b/app/models/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bf1933151b2e8a290dc79c4647a626d3d0500ff
Binary files /dev/null and b/app/models/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/models/__pycache__/entity.cpython-311.pyc b/app/models/__pycache__/entity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08def07731fef26be3662b7e40f9afca7961637f
Binary files /dev/null and b/app/models/__pycache__/entity.cpython-311.pyc differ
diff --git a/app/models/__pycache__/project.cpython-311.pyc b/app/models/__pycache__/project.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e11c4a8127b91fa0a97a4fc6607860af7c04d37
Binary files /dev/null and b/app/models/__pycache__/project.cpython-311.pyc differ
diff --git a/app/models/entity.py b/app/models/entity.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f9afbd7c789db76ca4d482de4655cd99eb3bda
--- /dev/null
+++ b/app/models/entity.py
@@ -0,0 +1,143 @@
+"""
+SQLAlchemy Models for Numidium
+"""
+from sqlalchemy import Column, String, Text, DateTime, Float, JSON, ForeignKey, Table
+from sqlalchemy.orm import relationship
+from datetime import datetime
+import uuid
+
+from app.core.database import Base
+
+
+def generate_uuid():
+ return str(uuid.uuid4())
+
+
+class Entity(Base):
+ """
+ Entity - anything trackable in the system.
+ Can be: person, organization, location, vehicle, event, document, etc.
+ """
+ __tablename__ = "entities"
+
+ id = Column(String(36), primary_key=True, default=generate_uuid)
+ project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True)
+ type = Column(String(50), nullable=False, index=True) # person, organization, location, etc
+ name = Column(String(255), nullable=False, index=True)
+ description = Column(Text, nullable=True)
+ properties = Column(JSON, default=dict) # Flexible, schema-less extra data
+
+ # Geolocation (optional)
+ latitude = Column(Float, nullable=True)
+ longitude = Column(Float, nullable=True)
+
+ # Historical date of the event/entity (when it happened, not when it was added)
+ event_date = Column(DateTime, nullable=True)
+
+ # Data provenance
+ source = Column(String(100), nullable=True) # wikipedia, newsapi, manual, etc
+ source_url = Column(Text, nullable=True)
+
+ # Timestamps (UTC; datetime.utcnow is naive — NOTE(review): deprecated from Python 3.12)
+ created_at = Column(DateTime, default=datetime.utcnow)
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+ # Graph edges: this entity as source vs. as target of a Relationship row
+ outgoing_relationships = relationship(
+ "Relationship",
+ foreign_keys="Relationship.source_id",
+ back_populates="source_entity"
+ )
+ incoming_relationships = relationship(
+ "Relationship",
+ foreign_keys="Relationship.target_id",
+ back_populates="target_entity"
+ )
+
+
+class Relationship(Base):
+ """
+ Relationship (directed edge) between two entities.
+ Examples: works_for, knows, owns, located_at, participated_in
+ """
+ __tablename__ = "relationships"
+
+ id = Column(String(36), primary_key=True, default=generate_uuid)
+ project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True)
+ source_id = Column(String(36), ForeignKey("entities.id"), nullable=False)
+ target_id = Column(String(36), ForeignKey("entities.id"), nullable=False)
+ type = Column(String(50), nullable=False, index=True) # works_for, knows, owns, etc
+ properties = Column(JSON, default=dict)
+ confidence = Column(Float, default=1.0) # 0-1, how certain we are about this connection
+
+ # Historical date of the relationship (when it happened)
+ event_date = Column(DateTime, nullable=True)
+
+ # Data provenance
+ source = Column(String(100), nullable=True)
+
+ # Timestamps
+ created_at = Column(DateTime, default=datetime.utcnow)
+
+ # ORM links back to the Entity rows on each end of the edge
+ source_entity = relationship("Entity", foreign_keys=[source_id], back_populates="outgoing_relationships")
+ target_entity = relationship("Entity", foreign_keys=[target_id], back_populates="incoming_relationships")
+
+
+class Event(Base):
+ """
+ Event - something that happened involving one or more entities.
+ """
+ __tablename__ = "events"
+
+ id = Column(String(36), primary_key=True, default=generate_uuid)
+ type = Column(String(50), nullable=False, index=True)
+ title = Column(String(255), nullable=False)
+ description = Column(Text, nullable=True)
+
+ # When it happened
+ event_date = Column(DateTime, nullable=True)
+
+ # Where it happened
+ location_name = Column(String(255), nullable=True)
+ latitude = Column(Float, nullable=True)
+ longitude = Column(Float, nullable=True)
+
+ # Entities involved (stored as a JSON array of entity IDs — no FK integrity)
+ entity_ids = Column(JSON, default=list)
+
+ # Data provenance
+ source = Column(String(100), nullable=True)
+ source_url = Column(Text, nullable=True)
+
+ # Flexible metadata
+ properties = Column(JSON, default=dict)
+
+ # Timestamps
+ created_at = Column(DateTime, default=datetime.utcnow)
+
+
+class Document(Base):
+ """
+ Document - text/file ingested for analysis.
+ """
+ __tablename__ = "documents"
+
+ id = Column(String(36), primary_key=True, default=generate_uuid)
+ title = Column(String(255), nullable=False)
+ content = Column(Text, nullable=True)
+ summary = Column(Text, nullable=True) # AI-generated summary
+
+ # Document type
+ doc_type = Column(String(50), default="text") # text, news, report, etc
+
+ # Mentioned entities (extracted by NLP; JSON list, not FK-linked)
+ mentioned_entities = Column(JSON, default=list)
+
+ # Data provenance
+ source = Column(String(100), nullable=True)
+ source_url = Column(Text, nullable=True)
+
+ # Timestamps: published_at is the document's own date, created_at is ingestion time
+ published_at = Column(DateTime, nullable=True)
+ created_at = Column(DateTime, default=datetime.utcnow)
diff --git a/app/models/project.py b/app/models/project.py
new file mode 100644
index 0000000000000000000000000000000000000000..72f601e1975770622c146cc3b1b9fb6fbd912a3c
--- /dev/null
+++ b/app/models/project.py
@@ -0,0 +1,29 @@
+"""
+Project Model - Workspaces for organizing investigations
+"""
+from sqlalchemy import Column, String, Text, DateTime
+from datetime import datetime
+import uuid
+
+from app.core.database import Base
+
+
+def generate_uuid():
+ # Default PK generator: random uuid4 serialized to its 36-char string form
+ return str(uuid.uuid4())
+
+
+class Project(Base):
+ """
+ Project/Workspace - groups entities and relationships per investigation.
+ """
+ __tablename__ = "projects"
+
+ id = Column(String(36), primary_key=True, default=generate_uuid)
+ name = Column(String(255), nullable=False)
+ description = Column(Text, nullable=True)
+ color = Column(String(7), default="#00d4ff") # Hex color for UI
+ icon = Column(String(50), default="folder") # Icon name
+
+ # Timestamps
+ created_at = Column(DateTime, default=datetime.utcnow)
+ updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afaa5cdacc5762ea19abe607c7dab6309e351a8
--- /dev/null
+++ b/app/schemas/__init__.py
@@ -0,0 +1,10 @@
+# Schemas module
+from app.schemas.schemas import (
+ EntityCreate, EntityUpdate, EntityResponse,
+ RelationshipCreate, RelationshipResponse,
+ EventCreate, EventResponse,
+ DocumentCreate, DocumentResponse,
+ GraphData, GraphNode, GraphEdge,
+ SearchQuery, SearchResult,
+ SystemStats
+)
diff --git a/app/schemas/__pycache__/__init__.cpython-311.pyc b/app/schemas/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a0991c3a362725e79629796654b9dc0ed9c9668
Binary files /dev/null and b/app/schemas/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/schemas/__pycache__/schemas.cpython-311.pyc b/app/schemas/__pycache__/schemas.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8358505555f3036c07fa84d01cd9cd01b7b97b9f
Binary files /dev/null and b/app/schemas/__pycache__/schemas.cpython-311.pyc differ
diff --git a/app/schemas/schemas.py b/app/schemas/schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..afbff0c301ddb1fbe8cae0e4848fcafc48082ff0
--- /dev/null
+++ b/app/schemas/schemas.py
@@ -0,0 +1,163 @@
+"""
+Pydantic Schemas for API validation
+"""
+from pydantic import BaseModel, Field
+from typing import Optional, List, Any
+from datetime import datetime
+
+
+# ========== Entity Schemas ==========
+
+class EntityBase(BaseModel):
+ """Shared fields for entity create/response schemas (mirrors the Entity ORM model)."""
+ type: str = Field(..., description="Tipo da entidade: person, organization, location, etc")
+ name: str = Field(..., description="Nome da entidade")
+ description: Optional[str] = None
+ properties: dict = Field(default_factory=dict)
+ latitude: Optional[float] = None
+ longitude: Optional[float] = None
+ source: Optional[str] = None
+ source_url: Optional[str] = None
+
+
+class EntityCreate(EntityBase):
+ pass
+
+
+class EntityUpdate(BaseModel):
+ type: Optional[str] = None
+ name: Optional[str] = None
+ description: Optional[str] = None
+ properties: Optional[dict] = None
+ latitude: Optional[float] = None
+ longitude: Optional[float] = None
+
+
+class EntityResponse(EntityBase):
+ id: str
+ created_at: datetime
+ updated_at: datetime
+
+ class Config:
+ from_attributes = True
+
+
+# ========== Relationship Schemas ==========
+
+class RelationshipBase(BaseModel):
+ source_id: str
+ target_id: str
+ type: str = Field(..., description="Tipo: works_for, knows, owns, located_at, etc")
+ properties: dict = Field(default_factory=dict)
+ confidence: float = Field(default=1.0, ge=0, le=1)
+ source: Optional[str] = None
+
+
+class RelationshipCreate(RelationshipBase):
+ pass
+
+
+class RelationshipResponse(RelationshipBase):
+ id: str
+ created_at: datetime
+
+ class Config:
+ from_attributes = True
+
+
+# ========== Event Schemas ==========
+
+class EventBase(BaseModel):
+ """Shared fields for event create/response schemas (mirrors the Event ORM model)."""
+ type: str
+ title: str
+ description: Optional[str] = None
+ event_date: Optional[datetime] = None
+ location_name: Optional[str] = None
+ latitude: Optional[float] = None
+ longitude: Optional[float] = None
+ entity_ids: List[str] = Field(default_factory=list)
+ source: Optional[str] = None
+ source_url: Optional[str] = None
+
+
+class EventCreate(EventBase):
+ pass
+
+
+class EventResponse(EventBase):
+ id: str
+ created_at: datetime
+
+ class Config:
+ from_attributes = True
+
+
+# ========== Document Schemas ==========
+
+class DocumentBase(BaseModel):
+ title: str
+ content: Optional[str] = None
+ doc_type: str = "text"
+ source: Optional[str] = None
+ source_url: Optional[str] = None
+ published_at: Optional[datetime] = None
+
+
+class DocumentCreate(DocumentBase):
+ pass
+
+
+class DocumentResponse(DocumentBase):
+ id: str
+ summary: Optional[str] = None
+ mentioned_entities: List[str] = []
+ created_at: datetime
+
+ class Config:
+ from_attributes = True
+
+
+# ========== Graph Schemas ==========
+
+class GraphNode(BaseModel):
+ id: str
+ type: str
+ name: str
+ properties: dict = {}
+
+
+class GraphEdge(BaseModel):
+ source: str
+ target: str
+ type: str
+ confidence: float = 1.0
+
+
+class GraphData(BaseModel):
+ nodes: List[GraphNode]
+ edges: List[GraphEdge]
+
+
+# ========== Search Schemas ==========
+
+class SearchQuery(BaseModel):
+ query: str
+ entity_types: Optional[List[str]] = None
+ limit: int = Field(default=20, le=100)
+
+
+class SearchResult(BaseModel):
+ entities: List[EntityResponse]
+ events: List[EventResponse]
+ documents: List[DocumentResponse]
+
+
+# ========== Stats Schemas ==========
+
+class SystemStats(BaseModel):
+ """Aggregate counters for the dashboard/stats endpoint."""
+ total_entities: int
+ total_relationships: int
+ total_events: int
+ total_documents: int
+ entities_by_type: dict
+ recent_activity: List[dict]
diff --git a/app/services/__init__.py b/app/services/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f87b77ce421c83e59588e4c341ebab500c3c41
--- /dev/null
+++ b/app/services/__init__.py
@@ -0,0 +1 @@
+# Services module
diff --git a/app/services/__pycache__/__init__.cpython-311.pyc b/app/services/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdd1095f0f04ab7b53c9f32b8bbce7e4a48236e3
Binary files /dev/null and b/app/services/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/services/__pycache__/brazil_apis.cpython-311.pyc b/app/services/__pycache__/brazil_apis.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9751973d578e79e328d2912fe354747fb95d79a9
Binary files /dev/null and b/app/services/__pycache__/brazil_apis.cpython-311.pyc differ
diff --git a/app/services/__pycache__/geocoding.cpython-311.pyc b/app/services/__pycache__/geocoding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..664fbab62acd7d6c1db2527f8ab8b4b7a11662e9
Binary files /dev/null and b/app/services/__pycache__/geocoding.cpython-311.pyc differ
diff --git a/app/services/__pycache__/investigation.cpython-311.pyc b/app/services/__pycache__/investigation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..219ab0bfb74e241ee442a062277765b2d3f84c26
Binary files /dev/null and b/app/services/__pycache__/investigation.cpython-311.pyc differ
diff --git a/app/services/__pycache__/investigator_agent.cpython-311.pyc b/app/services/__pycache__/investigator_agent.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17cd96446bf23aa53179b163aba52d3f2b0ece2e
Binary files /dev/null and b/app/services/__pycache__/investigator_agent.cpython-311.pyc differ
diff --git a/app/services/__pycache__/lancer.cpython-311.pyc b/app/services/__pycache__/lancer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0415c81ac5d96a648b50a300ce010fe32a9a53bb
Binary files /dev/null and b/app/services/__pycache__/lancer.cpython-311.pyc differ
diff --git a/app/services/__pycache__/transparencia_api.cpython-311.pyc b/app/services/__pycache__/transparencia_api.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8758135b5b049f71eceec09add4595fb7b11dd12
Binary files /dev/null and b/app/services/__pycache__/transparencia_api.cpython-311.pyc differ
diff --git a/app/services/aethermap_client.py b/app/services/aethermap_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e9a6490f843a94391b376e528db87554c8e31cf
--- /dev/null
+++ b/app/services/aethermap_client.py
@@ -0,0 +1,343 @@
+"""
+AetherMap Client
+Client para integração com AetherMap API - busca semântica, NER e análise de grafos.
+"""
+import httpx
+import json
+import io
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, field
+from datetime import datetime
+import logging
+
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+# Base URL of the AetherMap service (HuggingFace Space); overridable via settings.aethermap_url
+AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space')
+
+
+@dataclass
+class ProcessResult:
+ """Result of a document-processing job (embeddings + clustering)."""
+ job_id: str
+ num_documents: int
+ num_clusters: int
+ num_noise: int
+ metrics: Dict[str, Any] = field(default_factory=dict)
+ cluster_analysis: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class SearchResult:
+ """Semantic search result: RAG summary plus ranked document hits."""
+ summary: str # RAG answer generated by the LLM
+ results: List[Dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class EntityNode:
+ """Entity node in the NER graph."""
+ entity: str
+ entity_type: str
+ docs: int
+ degree: int = 0
+ centrality: float = 0.0
+ role: str = "peripheral" # hub, connector, peripheral
+
+
+@dataclass
+class EntityEdge:
+ """Edge of the entity graph (co-occurrence/connection between two entities)."""
+ source_entity: str
+ target_entity: str
+ weight: int
+ reason: str
+
+
+@dataclass
+class EntityGraphResult:
+ """Result of NER entity extraction: graph plus derived insights."""
+ nodes: List[EntityNode] = field(default_factory=list)
+ edges: List[EntityEdge] = field(default_factory=list)
+ hubs: List[Dict[str, Any]] = field(default_factory=list)
+ insights: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class GraphAnalysis:
+ """LLM-written analysis of the knowledge graph."""
+ analysis: str
+ key_entities: List[str] = field(default_factory=list)
+ relationships: List[str] = field(default_factory=list)
+
+
+class AetherMapClient:
+ """
+ Client for the AetherMap API.
+
+ Features:
+ - Document processing (embeddings + clusters)
+ - Hybrid RAG semantic search (FAISS + BM25 + reranking + LLM)
+ - NER entity extraction
+ - LLM-based graph analysis
+ """
+
+ def __init__(self, base_url: Optional[str] = None, timeout: float = 600.0):
+ self.base_url = (base_url or AETHERMAP_URL).rstrip('/')
+ self.timeout = timeout
+ self._current_job_id: Optional[str] = None
+
+ @property
+ def current_job_id(self) -> Optional[str]:
+ """Return the job_id of the most recent processing job (None before any run)."""
+ return self._current_job_id
+
+ async def process_documents(
+ self,
+ texts: List[str],
+ fast_mode: bool = True,
+ min_cluster_size: int = 0,
+ min_samples: int = 0
+ ) -> ProcessResult:
+ """
+ Process a list of texts, generating embeddings and clusters.
+
+ Args:
+ texts: List of texts/documents
+ fast_mode: If True, uses PCA (fast). If False, uses UMAP (precise)
+ min_cluster_size: Minimum cluster size (0=auto)
+ min_samples: Minimum samples (0=auto)
+
+ Returns:
+ ProcessResult with job_id and metrics
+ """
+ # Build the TXT upload in memory — one document per line
+ content = "\n".join(texts)
+ file_bytes = content.encode('utf-8')
+
+ try:
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
+ files = {
+ 'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
+ }
+ data = {
+ 'n_samples': str(len(texts)),
+ 'fast_mode': 'true' if fast_mode else 'false',
+ 'min_cluster_size': str(min_cluster_size),
+ 'min_samples': str(min_samples)
+ }
+
+ logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")
+
+ response = await client.post(
+ f"{self.base_url}/process/",
+ files=files,
+ data=data
+ )
+
+ logger.info(f"AetherMap: Response status {response.status_code}")
+
+ if response.status_code != 200:
+ # Truncate the body so a huge HTML error page doesn't flood the logs
+ error_text = response.text[:500] if response.text else "No response body"
+ logger.error(f"AetherMap error: {response.status_code} - {error_text}")
+ raise Exception(f"AetherMap error: {response.status_code} - {error_text}")
+
+ result = response.json()
+
+ # Remember the job_id so later calls can default to this run
+ self._current_job_id = result.get('job_id')
+ metadata = result.get('metadata', {})
+
+ logger.info(f"AetherMap: Job criado {self._current_job_id}")
+
+ return ProcessResult(
+ job_id=self._current_job_id or "unknown",
+ num_documents=metadata.get('num_documents_processed', len(texts)),
+ num_clusters=metadata.get('num_clusters_found', 0),
+ num_noise=metadata.get('num_noise_points', 0),
+ metrics=result.get('metrics', {}),
+ cluster_analysis=result.get('cluster_analysis', {})
+ )
+ except httpx.TimeoutException:
+ # HF Spaces sleep when idle; the first request after a cold start often times out
+ logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
+ raise Exception(f"Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
+ except httpx.ConnectError as e:
+ logger.error(f"AetherMap: Erro de conexão: {e}")
+ raise Exception(f"Erro de conexão com AetherMap: {e}")
+ except Exception as e:
+ logger.error(f"AetherMap: Erro inesperado: {e}")
+ raise
+
+ async def semantic_search(
+ self,
+ query: str,
+ job_id: Optional[str] = None,
+ turbo_mode: bool = False
+ ) -> SearchResult:
+ """
+ Hybrid RAG semantic search over the processed documents.
+
+ Args:
+ query: Search term
+ job_id: Job ID (defaults to the most recent job)
+ turbo_mode: If True, faster but less precise search
+
+ Returns:
+ SearchResult with summary and hits
+ """
+ job_id = job_id or self._current_job_id
+ if not job_id:
+ raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
+
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
+ data = {
+ 'query': query,
+ 'job_id': job_id,
+ 'turbo_mode': 'true' if turbo_mode else 'false'
+ }
+
+ logger.info(f"AetherMap: Buscando '{query}'...")
+
+ response = await client.post(
+ f"{self.base_url}/search/",
+ data=data
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"AetherMap search error: {response.status_code} - {response.text}")
+
+ result = response.json()
+
+ return SearchResult(
+ summary=result.get('summary', ''),
+ results=result.get('results', [])
+ )
+
+ async def extract_entities(self, job_id: Optional[str] = None) -> EntityGraphResult:
+ """
+ Extract named entities (NER) and build their connection graph.
+
+ Args:
+ job_id: Job ID (defaults to the most recent job)
+
+ Returns:
+ EntityGraphResult with nodes, edges and insights
+ """
+ job_id = job_id or self._current_job_id
+ if not job_id:
+ raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
+
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
+ data = {'job_id': job_id}
+
+ logger.info(f"AetherMap: Extraindo entidades...")
+
+ response = await client.post(
+ f"{self.base_url}/entity_graph/",
+ data=data
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"AetherMap entity_graph error: {response.status_code} - {response.text}")
+
+ result = response.json()
+
+ # Convert the raw JSON dicts into typed dataclasses
+ nodes = [
+ EntityNode(
+ entity=n.get('entity', ''),
+ entity_type=n.get('type', ''),
+ docs=n.get('docs', 0),
+ degree=n.get('degree', 0),
+ centrality=n.get('centrality', 0.0),
+ role=n.get('role', 'peripheral')
+ )
+ for n in result.get('nodes', [])
+ ]
+
+ edges = [
+ EntityEdge(
+ source_entity=e.get('source_entity', ''),
+ target_entity=e.get('target_entity', ''),
+ weight=e.get('weight', 0),
+ reason=e.get('reason', '')
+ )
+ for e in result.get('edges', [])
+ ]
+
+ return EntityGraphResult(
+ nodes=nodes,
+ edges=edges,
+ hubs=result.get('hubs', []),
+ insights=result.get('insights', {})
+ )
+
+ async def analyze_graph(self, job_id: Optional[str] = None) -> GraphAnalysis:
+ """
+ Use an LLM to analyze the Knowledge Graph and extract insights.
+
+ Args:
+ job_id: Job ID (defaults to the most recent job)
+
+ Returns:
+ GraphAnalysis with the textual analysis
+ """
+ job_id = job_id or self._current_job_id
+ if not job_id:
+ raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
+
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
+ data = {'job_id': job_id}
+
+ logger.info(f"AetherMap: Analisando grafo com LLM...")
+
+ response = await client.post(
+ f"{self.base_url}/analyze_graph/",
+ data=data
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"AetherMap analyze_graph error: {response.status_code} - {response.text}")
+
+ result = response.json()
+
+ return GraphAnalysis(
+ analysis=result.get('analysis', ''),
+ key_entities=result.get('key_entities', []),
+ relationships=result.get('relationships', [])
+ )
+
+ async def describe_clusters(self, job_id: Optional[str] = None) -> Dict[str, Any]:
+ """
+ Use an LLM to describe each discovered cluster.
+
+ Args:
+ job_id: Job ID (defaults to the most recent job)
+
+ Returns:
+ Dict with per-cluster insights
+ """
+ job_id = job_id or self._current_job_id
+ if not job_id:
+ raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")
+
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
+ data = {'job_id': job_id}
+
+ logger.info(f"AetherMap: Descrevendo clusters...")
+
+ response = await client.post(
+ f"{self.base_url}/describe_clusters/",
+ data=data
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"AetherMap describe_clusters error: {response.status_code} - {response.text}")
+
+ return response.json()
+
+
+# Shared module-level client instance (one per process)
+aethermap = AetherMapClient()
diff --git a/app/services/analysis/__init__.py b/app/services/analysis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18e8fe19c13e9ec59fb147e63518a8ddbeef5f25
--- /dev/null
+++ b/app/services/analysis/__init__.py
@@ -0,0 +1 @@
+# Analysis services
diff --git a/app/services/brazil_apis.py b/app/services/brazil_apis.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cf938529a35708355664dd05f60288519c0d7df
--- /dev/null
+++ b/app/services/brazil_apis.py
@@ -0,0 +1,218 @@
+"""
+Brazilian Data APIs Service
+Consolidates access to public Brazilian data APIs for investigation
+"""
+import httpx
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass, field
+import re
+
+
+# API URLs
+CNPJA_URL = "https://api.cnpja.com.br/office"
+OPENCNPJ_URL = "https://api.opencnpj.org/v1/cnpj"
+BRASILAPI_CNPJ = "https://brasilapi.com.br/api/cnpj/v1"
+BRASILAPI_CEP = "https://brasilapi.com.br/api/cep/v2"
+
+
+@dataclass
+class CompanyData:
+ """Normalized company record assembled from whichever CNPJ provider answered (see `fonte`)."""
+ cnpj: str
+ razao_social: str = ""
+ nome_fantasia: str = ""
+ situacao: str = ""
+ data_abertura: str = ""
+ natureza_juridica: str = ""
+ capital_social: float = 0.0
+ porte: str = ""
+
+ # Address
+ logradouro: str = ""
+ numero: str = ""
+ complemento: str = ""
+ bairro: str = ""
+ cidade: str = ""
+ uf: str = ""
+ cep: str = ""
+
+ # Contact
+ telefone: str = ""
+ email: str = ""
+
+ # Activity
+ cnae_principal: str = ""
+ cnae_descricao: str = ""
+ cnaes_secundarios: List[str] = field(default_factory=list)
+
+ # Partners/Owners
+ socios: List[Dict[str, Any]] = field(default_factory=list)
+
+ # Source (which upstream API produced this record, e.g. "BrasilAPI")
+ fonte: str = ""
+
+
+def clean_cnpj(cnpj: str) -> str:
+ """Remove formatting from CNPJ"""
+ return re.sub(r'[^0-9]', '', cnpj)
+
+
+async def consultar_cnpj(cnpj: str) -> Optional[CompanyData]:
+ """
+ Query CNPJ data from available APIs.
+ Tries BrasilAPI first (more reliable), then falls back to others.
+ """
+ cnpj_clean = clean_cnpj(cnpj)
+
+ if len(cnpj_clean) != 14:
+ return None
+
+ # Try BrasilAPI first
+ result = await _query_brasilapi(cnpj_clean)
+ if result:
+ return result
+
+ # Fallback to OpenCNPJ
+ result = await _query_opencnpj(cnpj_clean)
+ if result:
+ return result
+
+ return None
+
+
+async def _query_brasilapi(cnpj: str) -> Optional[CompanyData]:
+ """Query BrasilAPI for CNPJ data"""
+ try:
+ async with httpx.AsyncClient(timeout=30.0) as client:
+ response = await client.get(f"{BRASILAPI_CNPJ}/{cnpj}")
+
+ if response.status_code != 200:
+ return None
+
+ data = response.json()
+
+ # Parse partners
+ socios = []
+ for socio in data.get("qsa", []):
+ socios.append({
+ "nome": socio.get("nome_socio", ""),
+ "qualificacao": socio.get("qualificacao_socio", ""),
+ "cpf_cnpj": socio.get("cnpj_cpf_do_socio", ""),
+ "data_entrada": socio.get("data_entrada_sociedade", "")
+ })
+
+ # Parse CNAEs
+ cnaes_sec = []
+ for cnae in data.get("cnaes_secundarios", []):
+ if isinstance(cnae, dict):
+ cnaes_sec.append(f"{cnae.get('codigo', '')} - {cnae.get('descricao', '')}")
+ else:
+ cnaes_sec.append(str(cnae))
+
+ return CompanyData(
+ cnpj=cnpj,
+ razao_social=data.get("razao_social", ""),
+ nome_fantasia=data.get("nome_fantasia", ""),
+ situacao=data.get("descricao_situacao_cadastral", ""),
+ data_abertura=data.get("data_inicio_atividade", ""),
+ natureza_juridica=data.get("natureza_juridica", ""),
+ capital_social=float(data.get("capital_social", 0)),
+ porte=data.get("porte", ""),
+ logradouro=data.get("logradouro", ""),
+ numero=data.get("numero", ""),
+ complemento=data.get("complemento", ""),
+ bairro=data.get("bairro", ""),
+ cidade=data.get("municipio", ""),
+ uf=data.get("uf", ""),
+ cep=data.get("cep", ""),
+ telefone=data.get("ddd_telefone_1", ""),
+ email=data.get("email", ""),
+ cnae_principal=str(data.get("cnae_fiscal", "")),
+ cnae_descricao=data.get("cnae_fiscal_descricao", ""),
+ cnaes_secundarios=cnaes_sec,
+ socios=socios,
+ fonte="BrasilAPI"
+ )
+
+ except Exception as e:
+ print(f"BrasilAPI error: {e}")
+ return None
+
+
+async def _query_opencnpj(cnpj: str) -> Optional[CompanyData]:
+ """Query OpenCNPJ API"""
+ try:
+ async with httpx.AsyncClient(timeout=30.0) as client:
+ response = await client.get(f"{OPENCNPJ_URL}/{cnpj}")
+
+ if response.status_code != 200:
+ return None
+
+ data = response.json()
+
+ # Parse partners
+ socios = []
+ for socio in data.get("socios", []):
+ socios.append({
+ "nome": socio.get("nome", ""),
+ "qualificacao": socio.get("qualificacao", ""),
+ "cpf_cnpj": "",
+ "data_entrada": socio.get("data_entrada", "")
+ })
+
+ return CompanyData(
+ cnpj=cnpj,
+ razao_social=data.get("razao_social", ""),
+ nome_fantasia=data.get("nome_fantasia", ""),
+ situacao=data.get("situacao_cadastral", ""),
+ data_abertura=data.get("data_inicio_atividade", ""),
+ natureza_juridica=data.get("natureza_juridica", ""),
+ capital_social=float(data.get("capital_social", 0) or 0),
+ porte=data.get("porte", ""),
+ logradouro=data.get("logradouro", ""),
+ numero=data.get("numero", ""),
+ complemento=data.get("complemento", ""),
+ bairro=data.get("bairro", ""),
+ cidade=data.get("municipio", ""),
+ uf=data.get("uf", ""),
+ cep=data.get("cep", ""),
+ telefone=data.get("telefone", ""),
+ email=data.get("email", ""),
+ cnae_principal=data.get("cnae_principal", {}).get("codigo", ""),
+ cnae_descricao=data.get("cnae_principal", {}).get("descricao", ""),
+ cnaes_secundarios=[],
+ socios=socios,
+ fonte="OpenCNPJ"
+ )
+
+ except Exception as e:
+ print(f"OpenCNPJ error: {e}")
+ return None
+
+
+async def consultar_cep(cep: str) -> Optional[Dict[str, Any]]:
+ """Look up a Brazilian postal code (CEP) via BrasilAPI v2.
+
+ Accepts formatted or raw input; returns the provider's JSON payload on
+ success, or None on non-200 responses and network errors (logged to stdout).
+ """
+ cep_clean = re.sub(r'[^0-9]', '', cep)
+
+ try:
+ async with httpx.AsyncClient(timeout=15.0) as client:
+ response = await client.get(f"{BRASILAPI_CEP}/{cep_clean}")
+
+ if response.status_code != 200:
+ return None
+
+ return response.json()
+
+ except Exception as e:
+ print(f"CEP query error: {e}")
+ return None
+
+
+async def buscar_empresas_por_nome(nome: str, uf: Optional[str] = None) -> List[Dict[str, Any]]:
+ """
+ Search companies by name using web search (via Lancer).
+ This is a workaround since direct name search APIs are paid.
+
+ Currently a stub: always returns an empty list; the investigation
+ service is expected to fill results via Lancer web search.
+ """
+ # This would need Lancer integration for web search
+ # For now, return empty - will be filled by investigation service
+ return []
diff --git a/app/services/chat.py b/app/services/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..89595f334653e11a19d2103c28ccfaeb97110844
--- /dev/null
+++ b/app/services/chat.py
@@ -0,0 +1,213 @@
+"""
+Chat Service - Intelligent chat with RAG capabilities
+Uses local database + Lancer for comprehensive responses
+"""
+import httpx
+from typing import Optional, List, Dict, Any
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.models.entity import Entity, Relationship
+
+
+LANCER_URL = "https://madras1-lancer.hf.space/api/v1"
+
+SYSTEM_PROMPT = """Você é um assistente de inteligência do NUMIDIUM.
+Você tem acesso a um grafo de conhecimento com entidades e relacionamentos,
+e pode pesquisar na web para informações atualizadas.
+
+Responda em português brasileiro de forma clara e direta.
+Se não tiver certeza, diga que não sabe em vez de inventar."""
+
+
+class ChatService:
+    """Chat service with RAG: combines the local entity graph (SQLAlchemy)
+    with Lancer web search as context, then answers via the Cerebras LLM API.
+
+    Conversation history is kept in-memory per session_id, so it is lost on
+    process restart and is not shared across workers.
+    """
+
+    def __init__(self):
+        self.api_url = "https://api.cerebras.ai/v1/chat/completions"
+        # session_id -> list of {"role", "content"} message dicts
+        self.conversation_history: Dict[str, List[Dict[str, str]]] = {}
+
+    def _get_history(self, session_id: Optional[str]) -> List[Dict[str, str]]:
+        """Return (creating if needed) the history list for a session."""
+        key = session_id or "default"
+        if key not in self.conversation_history:
+            self.conversation_history[key] = []
+        return self.conversation_history[key]
+
+    def clear_history(self, session_id: Optional[str] = None):
+        """Clear conversation history for one session (default session if None)."""
+        key = session_id or "default"
+        self.conversation_history.pop(key, None)
+
+    def _get_local_context(self, query: str, db: Session, limit: int = 5) -> str:
+        """Build a bullet-list context string from entities in the local DB
+        whose name or description matches the query (case-insensitive LIKE).
+        Returns "" when nothing matches.
+        """
+        # Search entities by name
+        entities = db.query(Entity).filter(
+            Entity.name.ilike(f"%{query}%")
+        ).limit(limit).all()
+
+        # Also search by description
+        if len(entities) < limit:
+            desc_entities = db.query(Entity).filter(
+                Entity.description.ilike(f"%{query}%")
+            ).limit(limit - len(entities)).all()
+            entities.extend(desc_entities)
+
+        if not entities:
+            # Try splitting query into words (only words longer than 3 chars,
+            # to avoid matching stopwords)
+            words = query.split()
+            for word in words:
+                if len(word) > 3:
+                    word_entities = db.query(Entity).filter(
+                        Entity.name.ilike(f"%{word}%")
+                    ).limit(2).all()
+                    entities.extend(word_entities)
+
+        if not entities:
+            return ""
+
+        context_parts = []
+        seen_ids = set()  # de-duplicates entities found by more than one filter
+
+        for entity in entities:
+            if entity.id in seen_ids:
+                continue
+            seen_ids.add(entity.id)
+
+            ctx = f"• {entity.name} ({entity.type})"
+            if entity.description:
+                ctx += f": {entity.description[:200]}"
+
+            # Get relationships in either direction (entity as source or target)
+            relationships = db.query(Relationship).filter(
+                (Relationship.source_id == entity.id) |
+                (Relationship.target_id == entity.id)
+            ).limit(5).all()
+
+            if relationships:
+                related = []
+                for rel in relationships:
+                    if rel.source_id == entity.id:
+                        target = db.query(Entity).filter(Entity.id == rel.target_id).first()
+                        if target:
+                            related.append(f"{rel.type} → {target.name}")
+                    else:
+                        source = db.query(Entity).filter(Entity.id == rel.source_id).first()
+                        if source:
+                            related.append(f"{source.name} → {rel.type}")
+
+                if related:
+                    ctx += f" | Relações: {', '.join(related[:3])}"
+
+            context_parts.append(ctx)
+
+        return "\n".join(context_parts)
+
+    async def _get_web_context(self, query: str) -> str:
+        """Fetch a web-search summary from the Lancer service.
+        Returns "" on any failure or when Lancer gives no answer.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.post(
+                    f"{LANCER_URL}/search",
+                    json={
+                        "query": query,
+                        "max_results": 5,
+                        "include_answer": True
+                    }
+                )
+
+                if response.status_code == 200:
+                    data = response.json()
+                    if data.get("answer"):
+                        # Cap at 1000 chars to keep the prompt small
+                        return f"Informações da web:\n{data['answer'][:1000]}"
+
+                return ""
+        except Exception as e:
+            print(f"Lancer error: {e}")
+            return ""
+
+    async def _call_llm(self, messages: List[Dict[str, str]]) -> str:
+        """Call the Cerebras chat-completions endpoint.
+        Returns the assistant text, or an error string (never raises).
+        """
+        try:
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                response = await client.post(
+                    self.api_url,
+                    headers={
+                        "Authorization": f"Bearer {settings.cerebras_api_key}",
+                        "Content-Type": "application/json"
+                    },
+                    json={
+                        "model": "qwen-3-32b",
+                        "messages": messages,
+                        "temperature": 0.7,
+                        "max_tokens": 2048
+                    }
+                )
+
+                if response.status_code == 200:
+                    data = response.json()
+                    return data["choices"][0]["message"]["content"]
+                else:
+                    return f"Erro na API: {response.status_code}"
+
+        except Exception as e:
+            return f"Erro: {str(e)}"
+
+    async def chat(
+        self,
+        message: str,
+        db: Session,
+        use_web: bool = True,
+        use_history: bool = True,
+        session_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Process one chat message with RAG.
+
+        Gathers local-graph and (optionally) web context, prepends up to the
+        last 6 history messages, calls the LLM, and records the exchange.
+        Returns a dict with the answer plus flags describing which context
+        sources were actually used.
+        """
+        history = self._get_history(session_id)
+
+        # Get local context
+        local_context = self._get_local_context(message, db)
+
+        # Get web context if enabled
+        web_context = ""
+        if use_web:
+            web_context = await self._get_web_context(message)
+
+        # Build context
+        context_parts = []
+        if local_context:
+            context_parts.append(f"📊 Conhecimento local:\n{local_context}")
+        if web_context:
+            context_parts.append(f"🌐 {web_context}")
+
+        context = "\n\n".join(context_parts) if context_parts else "Nenhum contexto disponível."
+
+        # Build messages
+        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+
+        if use_history and history:
+            # Only the most recent 3 exchanges (6 messages) to bound prompt size
+            messages.extend(history[-6:])
+
+        user_message = f"""Contexto:
+{context}
+
+Pergunta: {message}"""
+
+        messages.append({"role": "user", "content": user_message})
+
+        # Call LLM
+        response = await self._call_llm(messages)
+
+        # Store history
+        if use_history:
+            history.append({"role": "user", "content": message})
+            history.append({"role": "assistant", "content": response})
+
+        return {
+            "answer": response,
+            "local_context_used": bool(local_context),
+            "web_context_used": bool(web_context),
+            # Each local-context entity line starts with "•", so counting
+            # bullets gives the number of entities surfaced
+            "entities_found": local_context.count("•") if local_context else 0
+        }
+
+
+# Singleton
+chat_service = ChatService()
diff --git a/app/services/geocoding.py b/app/services/geocoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..06863f2be60350c5cd8251ca8cfa063809135cea
--- /dev/null
+++ b/app/services/geocoding.py
@@ -0,0 +1,63 @@
+"""
+Geocoding Service - Uses Nominatim (OpenStreetMap) for free geocoding
+"""
+import httpx
+from typing import Optional, Tuple
+import asyncio
+
+
+NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
+USER_AGENT = "NUMIDIUM/1.0 (Intelligence System)"
+
+
+async def geocode(location_name: str) -> Optional[Tuple[float, float]]:
+    """
+    Convert a location name to coordinates using Nominatim.
+    Returns (latitude, longitude) or None if not found.
+
+    Note: Nominatim has rate limits (1 request/second), so be careful with
+    batch operations — this function itself does NOT throttle; use
+    geocode_batch() for multiple lookups.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            response = await client.get(
+                NOMINATIM_URL,
+                params={
+                    "q": location_name,
+                    "format": "json",
+                    "limit": 1,  # only the best match is used
+                    "addressdetails": 0
+                },
+                headers={
+                    # Nominatim's usage policy requires an identifying User-Agent
+                    "User-Agent": USER_AGENT
+                }
+            )
+
+            if response.status_code == 200:
+                data = response.json()
+                if data and len(data) > 0:
+                    # Nominatim returns lat/lon as strings
+                    lat = float(data[0]["lat"])
+                    lon = float(data[0]["lon"])
+                    return (lat, lon)
+
+            return None
+
+    except Exception as e:
+        print(f"Geocoding error for '{location_name}': {e}")
+        return None
+
+
+async def geocode_batch(location_names: list[str], delay: float = 1.0) -> dict[str, Tuple[float, float]]:
+    """
+    Geocode multiple locations with proper rate limiting.
+    Returns a dict mapping location names to (lat, lon) tuples; names that
+    could not be resolved are simply absent from the result.
+
+    `delay` is the pause between requests (Nominatim allows ~1 req/s).
+    """
+    results = {}
+
+    for name in location_names:
+        coords = await geocode(name)
+        if coords:
+            results[name] = coords
+        # Respect Nominatim rate limits (sleeps even after failures, since
+        # the request was still made)
+        await asyncio.sleep(delay)
+
+    return results
diff --git a/app/services/ibge_api.py b/app/services/ibge_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d5000ed2798dfe0f7a1ce55603f305dad74783
--- /dev/null
+++ b/app/services/ibge_api.py
@@ -0,0 +1,192 @@
+"""
+IBGE API Service
+Access to Brazilian geographic and demographic data
+"""
+import httpx
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+
+
+IBGE_BASE_URL = "https://servicodados.ibge.gov.br/api/v1"
+
+
+@dataclass
+class Estado:
+    """Brazilian state data"""
+    id: int      # IBGE numeric state code
+    sigla: str   # two-letter abbreviation (e.g. "SP")
+    nome: str    # full state name
+    regiao: str  # macro-region name (e.g. "Sudeste")
+
+
+@dataclass
+class Municipio:
+    """Brazilian municipality data"""
+    id: int            # 7-digit IBGE municipality code
+    nome: str          # municipality name
+    estado_sigla: str  # state abbreviation (e.g. "SP")
+    estado_nome: str   # full state name
+    regiao: str        # macro-region name
+    # Optional enriched data (not filled by the IBGE localidades endpoints)
+    populacao: Optional[int] = None
+    area_km2: Optional[float] = None
+
+
+async def listar_estados() -> List[Estado]:
+    """List all Brazilian states via the IBGE localidades API.
+
+    Returns states sorted by name; returns an empty list on any failure.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=15.0) as client:
+            response = await client.get(f"{IBGE_BASE_URL}/localidades/estados")
+
+            if response.status_code != 200:
+                return []
+
+            data = response.json()
+            estados = []
+
+            for item in data:
+                estados.append(Estado(
+                    id=item["id"],
+                    sigla=item["sigla"],
+                    nome=item["nome"],
+                    regiao=item.get("regiao", {}).get("nome", "")
+                ))
+
+            return sorted(estados, key=lambda x: x.nome)
+
+    except Exception as e:
+        print(f"IBGE estados error: {e}")
+        return []
+
+
+async def listar_municipios(uf: str) -> List[Municipio]:
+    """List all municipalities in a state (by UF abbreviation), sorted by name.
+
+    Returns an empty list on any failure.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=15.0) as client:
+            response = await client.get(
+                f"{IBGE_BASE_URL}/localidades/estados/{uf}/municipios"
+            )
+
+            if response.status_code != 200:
+                return []
+
+            data = response.json()
+            municipios = []
+
+            for item in data:
+                municipios.append(Municipio(
+                    id=item["id"],
+                    nome=item["nome"],
+                    estado_sigla=uf.upper(),
+                    # NOTE(review): relies on the nested
+                    # microrregiao -> mesorregiao -> UF chain being present in
+                    # the response; falls back to "" if any level is missing.
+                    estado_nome=item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}).get("nome", ""),
+                    regiao=item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}).get("regiao", {}).get("nome", "")
+                ))
+
+            return sorted(municipios, key=lambda x: x.nome)
+
+    except Exception as e:
+        print(f"IBGE municipios error: {e}")
+        return []
+
+
+async def buscar_municipio(nome: str, uf: Optional[str] = None) -> List[Municipio]:
+    """Search municipalities by (partial, case-insensitive) name.
+
+    When `uf` is given, only that state is searched; otherwise the full
+    national list is fetched and filtered (slower, capped at 20 results).
+    Returns an empty list on any failure.
+    """
+    try:
+        # If UF provided, search only that state
+        if uf:
+            municipios = await listar_municipios(uf)
+            return [m for m in municipios if nome.lower() in m.nome.lower()]
+
+        # Otherwise search all states (slower)
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.get(f"{IBGE_BASE_URL}/localidades/municipios")
+
+            if response.status_code != 200:
+                return []
+
+            data = response.json()
+            results = []
+
+            for item in data:
+                if nome.lower() in item["nome"].lower():
+                    uf_info = item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {})
+                    results.append(Municipio(
+                        id=item["id"],
+                        nome=item["nome"],
+                        estado_sigla=uf_info.get("sigla", ""),
+                        estado_nome=uf_info.get("nome", ""),
+                        regiao=uf_info.get("regiao", {}).get("nome", "")
+                    ))
+
+            return results[:20]  # Limit results
+
+    except Exception as e:
+        print(f"IBGE search error: {e}")
+        return []
+
+
+async def obter_municipio_por_id(id_municipio: int) -> Optional[Municipio]:
+    """Get a municipality by its IBGE code; None if not found or on error."""
+    try:
+        async with httpx.AsyncClient(timeout=15.0) as client:
+            response = await client.get(
+                f"{IBGE_BASE_URL}/localidades/municipios/{id_municipio}"
+            )
+
+            if response.status_code != 200:
+                return None
+
+            item = response.json()
+            uf_info = item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {})
+
+            return Municipio(
+                id=item["id"],
+                nome=item["nome"],
+                estado_sigla=uf_info.get("sigla", ""),
+                estado_nome=uf_info.get("nome", ""),
+                regiao=uf_info.get("regiao", {}).get("nome", "")
+            )
+
+    except Exception as e:
+        print(f"IBGE municipio error: {e}")
+        return None
+
+
+async def enriquecer_localizacao(cidade: str, uf: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Enrich a location name with IBGE data.
+    Useful for adding context to extracted locations.
+
+    Always returns the same dict shape; `encontrado` is False (and the other
+    fields stay None) when no municipality matched.
+    """
+    resultado = {
+        "cidade_original": cidade,
+        "encontrado": False,
+        "ibge_codigo": None,
+        "cidade": None,
+        "estado": None,
+        "estado_sigla": None,
+        "regiao": None
+    }
+
+    municipios = await buscar_municipio(cidade, uf)
+
+    if municipios:
+        # Take best match: prefer an exact (case-insensitive) name match,
+        # otherwise fall back to the first result
+        melhor = None
+        for m in municipios:
+            if m.nome.lower() == cidade.lower():
+                melhor = m
+                break
+
+        if not melhor:
+            melhor = municipios[0]
+
+        resultado.update({
+            "encontrado": True,
+            "ibge_codigo": melhor.id,
+            "cidade": melhor.nome,
+            "estado": melhor.estado_nome,
+            "estado_sigla": melhor.estado_sigla,
+            "regiao": melhor.regiao
+        })
+
+    return resultado
diff --git a/app/services/ingestion/__init__.py b/app/services/ingestion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..53751fc389795a6893e21379a16b0680f55cda41
--- /dev/null
+++ b/app/services/ingestion/__init__.py
@@ -0,0 +1,3 @@
+# Ingestion services
+from app.services.ingestion.wikipedia import wikipedia_scraper
+from app.services.ingestion.news import news_service
diff --git a/app/services/ingestion/__pycache__/__init__.cpython-311.pyc b/app/services/ingestion/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de09d686a52c85f16de0eac33cbd28ca9065604d
Binary files /dev/null and b/app/services/ingestion/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/services/ingestion/__pycache__/news.cpython-311.pyc b/app/services/ingestion/__pycache__/news.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47a4ad23456ff8907ab2a47285b1b74cd099a8fe
Binary files /dev/null and b/app/services/ingestion/__pycache__/news.cpython-311.pyc differ
diff --git a/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc b/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..215244f9f9e1bdf8dc6071c4e0237f41318f352a
Binary files /dev/null and b/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc differ
diff --git a/app/services/ingestion/news.py b/app/services/ingestion/news.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aba8df40e8cfb6d2cc19900fea89cf6ce04cf14
--- /dev/null
+++ b/app/services/ingestion/news.py
@@ -0,0 +1,86 @@
+"""
+News API Client Service
+Usa RSS feeds públicos para não precisar de API key
+"""
+import feedparser
+import requests
+from typing import List, Dict
+from datetime import datetime
+import re
+
+
+class NewsService:
+ """Serviço para buscar notícias de fontes públicas via RSS"""
+
+ # RSS feeds públicos brasileiros e internacionais
+ RSS_FEEDS = {
+ "g1": "https://g1.globo.com/rss/g1/",
+ "folha": "https://feeds.folha.uol.com.br/folha/rss/rss091.xml",
+ "bbc_brasil": "https://www.bbc.com/portuguese/articles/rss.xml",
+ "reuters": "https://www.reutersagency.com/feed/",
+ "google_news_br": "https://news.google.com/rss?hl=pt-BR&gl=BR&ceid=BR:pt-419"
+ }
+
+ def fetch_feed(self, feed_url: str) -> List[Dict]:
+ """Busca artigos de um feed RSS"""
+ try:
+ feed = feedparser.parse(feed_url)
+ articles = []
+
+ for entry in feed.entries[:20]: # Limitar a 20 artigos
+ published = None
+ if hasattr(entry, 'published_parsed') and entry.published_parsed:
+ published = datetime(*entry.published_parsed[:6])
+
+ articles.append({
+ "title": entry.get("title", ""),
+ "description": self._clean_html(entry.get("summary", "")),
+ "url": entry.get("link", ""),
+ "published_at": published,
+ "source": feed.feed.get("title", "Unknown")
+ })
+
+ return articles
+ except Exception as e:
+ print(f"Error fetching feed {feed_url}: {e}")
+ return []
+
+ def fetch_all_feeds(self) -> List[Dict]:
+ """Busca artigos de todos os feeds configurados"""
+ all_articles = []
+ for name, url in self.RSS_FEEDS.items():
+ articles = self.fetch_feed(url)
+ for article in articles:
+ article["feed_name"] = name
+ all_articles.extend(articles)
+ return all_articles
+
+ def search_news(self, query: str) -> List[Dict]:
+ """
+ Busca notícias pelo Google News RSS
+ """
+ # Google News RSS search
+ search_url = f"https://news.google.com/rss/search?q={query}&hl=pt-BR&gl=BR&ceid=BR:pt-419"
+ return self.fetch_feed(search_url)
+
+ def _clean_html(self, text: str) -> str:
+ """Remove HTML tags do texto"""
+ clean = re.compile('<.*?>')
+ return re.sub(clean, '', text)
+
+ def to_document(self, article: Dict) -> Dict:
+ """
+ Converte um artigo de notícia para o formato Document
+ """
+ return {
+ "title": article["title"],
+ "content": article.get("description", ""),
+ "doc_type": "news",
+ "source": article.get("source", "news"),
+ "source_url": article.get("url"),
+ "published_at": article.get("published_at")
+ }
+
+
+# Singleton instance
+news_service = NewsService()
diff --git a/app/services/ingestion/wikipedia.py b/app/services/ingestion/wikipedia.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c64a6f77d4bcd406506966ad4b1c3a75972a8e3
--- /dev/null
+++ b/app/services/ingestion/wikipedia.py
@@ -0,0 +1,215 @@
+"""
+Wikipedia Scraper Service
+"""
+import requests
+from bs4 import BeautifulSoup
+from typing import Optional, Dict, List
+import re
+
+
+class WikipediaScraper:
+    """Scraper for extracting data from the Portuguese Wikipedia.
+
+    Uses the MediaWiki API for search/articles and plain HTML scraping
+    (BeautifulSoup) for infobox extraction.
+    """
+
+    BASE_URL = "https://pt.wikipedia.org"
+    API_URL = "https://pt.wikipedia.org/w/api.php"
+
+    # Identifying User-Agent is mandatory for the Wikipedia API
+    HEADERS = {
+        "User-Agent": "NumidiumBot/1.0 (https://github.com/numidium; contact@numidium.app) Python/3.11"
+    }
+
+    def search(self, query: str, limit: int = 10) -> List[Dict]:
+        """
+        Search Wikipedia articles. Returns a list of dicts with
+        title / plain-text snippet / pageid; [] on any failure.
+        """
+        try:
+            params = {
+                "action": "query",
+                "list": "search",
+                "srsearch": query,
+                "srlimit": limit,
+                "format": "json"
+            }
+
+            response = requests.get(
+                self.API_URL,
+                params=params,
+                headers=self.HEADERS,
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            results = []
+            for item in data.get("query", {}).get("search", []):
+                results.append({
+                    "title": item["title"],
+                    # snippets come back with HTML highlighting — strip it
+                    "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(),
+                    "pageid": item["pageid"]
+                })
+
+            return results
+        except Exception as e:
+            print(f"Wikipedia search error: {e}")
+            return []
+
+    def get_article(self, title: str) -> Optional[Dict]:
+        """
+        Fetch full article info: intro extract, thumbnail, coordinates and
+        categories. Returns None if the page does not exist or on error.
+        """
+        try:
+            params = {
+                "action": "query",
+                "titles": title,
+                "prop": "extracts|pageimages|coordinates|categories",
+                "exintro": True,      # only the lead section
+                "explaintext": True,  # plain text, no HTML
+                "pithumbsize": 300,
+                "format": "json"
+            }
+
+            response = requests.get(
+                self.API_URL,
+                params=params,
+                headers=self.HEADERS,
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            pages = data.get("query", {}).get("pages", {})
+            for page_id, page in pages.items():
+                # page_id "-1" is the API's marker for a missing page
+                if page_id == "-1":
+                    return None
+
+                result = {
+                    "title": page.get("title"),
+                    "extract": page.get("extract"),
+                    "pageid": page.get("pageid"),
+                    "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}",
+                    "thumbnail": page.get("thumbnail", {}).get("source"),
+                    "categories": [c["title"].replace("Categoria:", "")
+                                   for c in page.get("categories", [])]
+                }
+
+                # Coordinates when available
+                if "coordinates" in page:
+                    coords = page["coordinates"][0]
+                    result["latitude"] = coords.get("lat")
+                    result["longitude"] = coords.get("lon")
+
+                # Only the first page is relevant (a single title was queried)
+                return result
+
+            return None
+        except Exception as e:
+            print(f"Wikipedia article error: {e}")
+            return None
+
+    def get_infobox(self, title: str) -> Dict:
+        """
+        Try to extract structured key/value data from an article's infobox
+        by scraping the rendered HTML. Returns {} if absent or on error.
+        """
+        try:
+            url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
+            response = requests.get(url, headers=self.HEADERS, timeout=10)
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            infobox = soup.find("table", class_="infobox")
+            if not infobox:
+                return {}
+
+            data = {}
+            for row in infobox.find_all("tr"):
+                header = row.find("th")
+                cell = row.find("td")
+                if header and cell:
+                    key = header.get_text(strip=True)
+                    value = cell.get_text(strip=True)
+                    # Clean up the value
+                    value = re.sub(r'\[\d+\]', '', value)  # Remove [n] references
+                    data[key] = value
+
+            return data
+        except Exception as e:
+            print(f"Infobox error: {e}")
+            return {}
+
+    def scrape_person(self, name: str) -> Optional[Dict]:
+        """
+        Scrape a person's data from Wikipedia.
+        Returns data formatted for creating an Entity, or None if not found.
+        """
+        article = self.get_article(name)
+        if not article:
+            return None
+
+        infobox = self.get_infobox(name)
+
+        return {
+            "type": "person",
+            "name": article["title"],
+            "description": article.get("extract"),
+            "source": "wikipedia",
+            "source_url": article["url"],
+            "properties": {
+                "thumbnail": article.get("thumbnail"),
+                "categories": article.get("categories", []),
+                **infobox
+            },
+            "latitude": article.get("latitude"),
+            "longitude": article.get("longitude")
+        }
+
+    def scrape_organization(self, name: str) -> Optional[Dict]:
+        """
+        Scrape an organization's data from Wikipedia (same shape as
+        scrape_person, with type "organization").
+        """
+        article = self.get_article(name)
+        if not article:
+            return None
+
+        infobox = self.get_infobox(name)
+
+        return {
+            "type": "organization",
+            "name": article["title"],
+            "description": article.get("extract"),
+            "source": "wikipedia",
+            "source_url": article["url"],
+            "properties": {
+                "thumbnail": article.get("thumbnail"),
+                "categories": article.get("categories", []),
+                **infobox
+            },
+            "latitude": article.get("latitude"),
+            "longitude": article.get("longitude")
+        }
+
+    def scrape_location(self, name: str) -> Optional[Dict]:
+        """
+        Scrape a location's data from Wikipedia (same shape as
+        scrape_person, with type "location").
+        """
+        article = self.get_article(name)
+        if not article:
+            return None
+
+        infobox = self.get_infobox(name)
+
+        return {
+            "type": "location",
+            "name": article["title"],
+            "description": article.get("extract"),
+            "source": "wikipedia",
+            "source_url": article["url"],
+            "properties": {
+                "thumbnail": article.get("thumbnail"),
+                "categories": article.get("categories", []),
+                **infobox
+            },
+            "latitude": article.get("latitude"),
+            "longitude": article.get("longitude")
+        }
+
+
+# Singleton instance
+wikipedia_scraper = WikipediaScraper()
diff --git a/app/services/investigation.py b/app/services/investigation.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbfc764bbd26579e5e92285959e952e3d8afa7e
--- /dev/null
+++ b/app/services/investigation.py
@@ -0,0 +1,324 @@
+"""
+Investigation Service - Builds comprehensive dossiers
+Combines CNPJ data, transparency/sanctions, Lancer web search, and NER
+"""
+import httpx
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass, field, asdict
+import asyncio
+
+from app.services.brazil_apis import consultar_cnpj, CompanyData
+from app.services.transparencia_api import verificar_sancoes
+# from app.services.tse_api import buscar_politico # TSE API needs fixing
+from app.services import lancer
+from app.services.nlp import entity_extractor
+from app.core.database import get_db
+from app.models.entity import Entity, Relationship
+
+
+LANCER_URL = "https://madras1-lancer.hf.space/api/v1"
+
+
+@dataclass
+class DossierSection:
+    """A section of the dossier"""
+    titulo: str         # section heading shown to the user
+    conteudo: Any       # section payload (dict, list, or string)
+    status: str = "ok"  # one of: ok, warning, danger, info
+    icone: str = "📋"   # emoji icon for UI display
+
+
+@dataclass
+class Dossier:
+    """Complete investigation dossier"""
+    tipo: str  # "organization" or "person"
+    alvo: str  # Target name
+    cnpj_cpf: Optional[str] = None  # digits-only CNPJ/CPF when known
+
+    # Sections (each is None until the corresponding lookup succeeds)
+    dados_cadastrais: Optional[DossierSection] = None
+    socios: Optional[DossierSection] = None
+    sancoes: Optional[DossierSection] = None
+    dados_politicos: Optional[DossierSection] = None  # TSE data
+    noticias: Optional[DossierSection] = None
+    entidades_relacionadas: Optional[DossierSection] = None
+
+    # Metadata
+    red_flags: List[str] = field(default_factory=list)  # human-readable warnings
+    score_risco: int = 0  # 0-100 aggregated risk score
+    data_geracao: str = ""  # ISO timestamp of generation
+    fonte_dados: List[str] = field(default_factory=list)  # data sources consulted
+
+
+async def investigar_empresa(nome_ou_cnpj: str) -> Dossier:
+    """
+    Investigate a company and build a comprehensive dossier.
+
+    Pipeline: (1) CNPJ registry lookup (when the input is a 14-digit CNPJ),
+    (2) sanctions check via Portal da Transparência, (3) Lancer web search
+    for news, (4) NER over the news summary for related entities.
+    Each step degrades gracefully; a partially-filled Dossier is always
+    returned.
+    """
+    import re
+    from datetime import datetime
+
+    dossier = Dossier(
+        tipo="organization",
+        alvo=nome_ou_cnpj,
+        data_geracao=datetime.now().isoformat()
+    )
+
+    # Check if input is a CNPJ (exactly 14 digits after stripping formatting)
+    cnpj_clean = re.sub(r'[^0-9]', '', nome_ou_cnpj)
+    is_cnpj = len(cnpj_clean) == 14
+
+    company_data = None
+
+    # 1. Get company data from CNPJ
+    if is_cnpj:
+        dossier.cnpj_cpf = cnpj_clean
+        company_data = await consultar_cnpj(cnpj_clean)
+
+        if company_data:
+            dossier.alvo = company_data.razao_social or company_data.nome_fantasia or nome_ou_cnpj
+            dossier.fonte_dados.append(company_data.fonte)
+
+            # Build cadastral section
+            dossier.dados_cadastrais = DossierSection(
+                titulo="Dados Cadastrais",
+                icone="🏢",
+                conteudo={
+                    "cnpj": company_data.cnpj,
+                    "razao_social": company_data.razao_social,
+                    "nome_fantasia": company_data.nome_fantasia,
+                    "situacao": company_data.situacao,
+                    "data_abertura": company_data.data_abertura,
+                    "natureza_juridica": company_data.natureza_juridica,
+                    "capital_social": company_data.capital_social,
+                    "porte": company_data.porte,
+                    "endereco": f"{company_data.logradouro}, {company_data.numero} - {company_data.bairro}, {company_data.cidade}/{company_data.uf}",
+                    "cep": company_data.cep,
+                    "telefone": company_data.telefone,
+                    "email": company_data.email,
+                    "atividade_principal": f"{company_data.cnae_principal} - {company_data.cnae_descricao}"
+                }
+            )
+
+            # Anything other than "ATIVA" in the registration status is a red flag
+            if company_data.situacao and "ATIVA" not in company_data.situacao.upper():
+                dossier.red_flags.append(f"⚠️ Situação cadastral: {company_data.situacao}")
+                dossier.dados_cadastrais.status = "warning"
+
+            # Build partners section
+            if company_data.socios:
+                dossier.socios = DossierSection(
+                    titulo=f"Sócios ({len(company_data.socios)})",
+                    icone="👥",
+                    conteudo=company_data.socios
+                )
+
+    # 2. Check sanctions/transparency
+    if dossier.cnpj_cpf:
+        sancoes = await verificar_sancoes(dossier.cnpj_cpf)
+        dossier.fonte_dados.append("Portal da Transparência")
+
+        if sancoes["tem_sancoes"]:
+            dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções")
+            dossier.score_risco += 40  # sanctions weigh heaviest in the score
+
+            dossier.sancoes = DossierSection(
+                titulo=f"Sanções ({sancoes['total_sancoes']})",
+                icone="⚠️",
+                status="danger",
+                conteudo=sancoes
+            )
+        else:
+            dossier.sancoes = DossierSection(
+                titulo="Sanções",
+                icone="✅",
+                status="ok",
+                conteudo={"mensagem": "Nenhuma sanção encontrada nos cadastros públicos"}
+            )
+
+    # 3. Web search for news and context (prefer the trade name when known)
+    search_query = dossier.alvo
+    if company_data and company_data.nome_fantasia:
+        search_query = company_data.nome_fantasia
+
+    try:
+        web_result = await lancer.search(f"{search_query} notícias escândalos processos", max_results=8)
+
+        if web_result.answer or web_result.results:
+            dossier.fonte_dados.append("Lancer Web Search")
+
+            news_content = {
+                "resumo": web_result.answer or "Sem resumo disponível",
+                "fontes": [
+                    {"titulo": r.title, "url": r.url, "snippet": r.content[:200]}
+                    for r in web_result.results[:5]
+                ]
+            }
+
+            dossier.noticias = DossierSection(
+                titulo="Notícias e Mídia",
+                icone="📰",
+                conteudo=news_content
+            )
+
+            # Check for negative keywords in news; only the first hit counts
+            negative_keywords = ["escândalo", "fraude", "corrupção", "prisão", "investigado", "denúncia", "irregularidade"]
+            raw_text = (web_result.answer or "").lower()
+            for kw in negative_keywords:
+                if kw in raw_text:
+                    dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada nas notícias")
+                    dossier.noticias.status = "warning"
+                    dossier.score_risco += 10
+                    break
+    except Exception as e:
+        print(f"Web search error: {e}")
+
+    # 4. Extract related entities using NER on the news summary
+    if dossier.noticias and dossier.noticias.conteudo.get("resumo"):
+        try:
+            text_to_analyze = dossier.noticias.conteudo.get("resumo", "")[:3000]
+            ner_result = await entity_extractor.extract(text_to_analyze)
+
+            if ner_result.entities:
+                entities = [
+                    {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role}
+                    for e in ner_result.entities[:10]
+                ]
+
+                dossier.entidades_relacionadas = DossierSection(
+                    titulo=f"Entidades Relacionadas ({len(entities)})",
+                    icone="🔗",
+                    conteudo=entities
+                )
+        except Exception as e:
+            print(f"NER error: {e}")
+
+    # Calculate final risk score: +5 per red flag, capped at 100
+    dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5)
+
+    return dossier
+
+
+async def investigar_pessoa(nome: str, cpf: Optional[str] = None) -> Dossier:
+    """
+    Investigate a person and build a dossier.
+    Note: CPF data is heavily protected by LGPD, so mainly uses web search.
+
+    Pipeline: (1) sanctions check (when CPF is given), (2) TSE political
+    data (currently disabled), (3) Lancer web search, (4) NER over the web
+    summary for connections. Each step degrades gracefully.
+    """
+    from datetime import datetime
+
+    dossier = Dossier(
+        tipo="person",
+        alvo=nome,
+        cnpj_cpf=cpf,
+        data_geracao=datetime.now().isoformat()
+    )
+
+    # 1. Check sanctions if CPF provided
+    if cpf:
+        sancoes = await verificar_sancoes(cpf)
+        dossier.fonte_dados.append("Portal da Transparência")
+
+        if sancoes["tem_sancoes"]:
+            dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções")
+            dossier.score_risco += 50
+
+            dossier.sancoes = DossierSection(
+                titulo=f"Sanções ({sancoes['total_sancoes']})",
+                icone="⚠️",
+                status="danger",
+                conteudo=sancoes
+            )
+
+    # 2. Check TSE for political data (DISABLED - API needs fixing)
+    # try:
+    #     tse_data = await buscar_politico(nome)
+    #     if tse_data.get("encontrado"):
+    #         dossier.fonte_dados.append("TSE (DivulgaCand)")
+    #         candidaturas = tse_data.get("candidaturas", [])
+    #         patrimonio = tse_data.get("total_patrimonio", 0)
+    #         partidos = tse_data.get("partidos", [])
+    #         dossier.dados_politicos = DossierSection(...)
+    # except Exception as e:
+    #     print(f"TSE search error: {e}")
+
+
+    # 3. Web search for information (exact-name query plus context terms)
+    try:
+        web_result = await lancer.search(f'"{nome}" biografia cargo empresa', max_results=10)
+
+        if web_result.answer or web_result.results:
+            dossier.fonte_dados.append("Lancer Web Search")
+
+            dossier.noticias = DossierSection(
+                titulo="Informações Públicas",
+                icone="🌐",
+                conteudo={
+                    "resumo": web_result.answer or "Informações limitadas",
+                    "fontes": [
+                        {"titulo": r.title, "url": r.url, "snippet": r.content[:200]}
+                        for r in web_result.results[:5]
+                    ]
+                }
+            )
+
+            # Check for negative keywords; only the first hit counts
+            negative_keywords = ["preso", "condenado", "investigado", "acusado", "escândalo", "fraude"]
+            raw_text = (web_result.answer or "").lower()
+            for kw in negative_keywords:
+                if kw in raw_text:
+                    dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada")
+                    dossier.noticias.status = "warning"
+                    dossier.score_risco += 15
+                    break
+    except Exception as e:
+        print(f"Web search error: {e}")
+
+    # 4. Extract related entities from the web summary
+    if dossier.noticias and dossier.noticias.conteudo.get("resumo"):
+        try:
+            ner_result = await entity_extractor.extract(dossier.noticias.conteudo["resumo"][:2000])
+
+            if ner_result.entities:
+                entities = [
+                    {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role}
+                    for e in ner_result.entities[:10]
+                    if e.name.lower() != nome.lower()  # Exclude the target
+                ]
+
+                if entities:
+                    dossier.entidades_relacionadas = DossierSection(
+                        titulo=f"Conexões ({len(entities)})",
+                        icone="🔗",
+                        conteudo=entities
+                    )
+        except Exception as e:
+            print(f"NER error: {e}")
+
+    # Final risk score: +5 per red flag, capped at 100
+    dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5)
+
+    return dossier
+
+
+def dossier_to_dict(dossier: Dossier) -> Dict[str, Any]:
+    """Convert a dossier to a plain dictionary for the JSON response.
+
+    Only sections that were actually populated (non-None) appear under
+    "secoes", keyed by their field name.
+    """
+    result = {
+        "tipo": dossier.tipo,
+        "alvo": dossier.alvo,
+        "cnpj_cpf": dossier.cnpj_cpf,
+        "red_flags": dossier.red_flags,
+        "score_risco": dossier.score_risco,
+        "data_geracao": dossier.data_geracao,
+        "fonte_dados": dossier.fonte_dados,
+        "secoes": {}
+    }
+
+    # Fixed, ordered list of section fields keeps the JSON layout stable
+    for field_name in ["dados_cadastrais", "socios", "sancoes", "dados_politicos", "noticias", "entidades_relacionadas"]:
+        section = getattr(dossier, field_name)
+        if section:
+            result["secoes"][field_name] = {
+                "titulo": section.titulo,
+                "icone": section.icone,
+                "status": section.status,
+                "conteudo": section.conteudo
+            }
+
+    return result
diff --git a/app/services/investigator_agent.py b/app/services/investigator_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..56b74ad4c994947ed35f3185df53fc586a4232cc
--- /dev/null
+++ b/app/services/investigator_agent.py
@@ -0,0 +1,659 @@
+"""
+Investigator Agent - Autonomous Investigation with Tool Calling
+Uses Cerebras native tool calling for multi-source investigations
+"""
+import json
+import re
+import httpx
+from typing import Optional, List, Dict, Any
+from dataclasses import dataclass, field
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.services import lancer
+from app.services.brazil_apis import consultar_cnpj
+from app.models.entity import Entity, Relationship
+
+
def sanitize_text(text: str) -> str:
    """
    Clean up text from a model that may contain thinking artifacts.
    Only removes thinking tags, does NOT remove valid characters.

    Args:
        text: Raw model output (may be empty/None-ish).

    Returns:
        The text with <think>...</think> blocks, <|think|> blocks and other
        <|...|> control tokens removed, excess blank lines collapsed, and
        surrounding whitespace stripped.
    """
    if not text:
        return text

    # Remove <think>...</think> reasoning blocks entirely.
    # BUG FIX: the previous pattern r'.*?' matched only empty strings and
    # therefore removed nothing — the tag literals had been lost.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = re.sub(r'<\|think\|>.*?<\|/think\|>', '', text, flags=re.DOTALL)

    # Remove other common model artifacts like <|...|> tags
    text = re.sub(r'<\|.*?\|>', '', text)

    # Collapse runs of 3+ newlines down to a single blank line
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
+
+
@dataclass
class Finding:
    """A discovery made during investigation (saved via the save_finding tool)."""
    title: str    # short headline for the finding
    content: str  # detailed description of what was discovered
    source: str   # where the information came from (tool name, URL, ...)
    # ISO-8601 creation time, captured when the finding object is built
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+
+
@dataclass
class InvestigationResult:
    """Complete investigation result returned by InvestigatorAgent.investigate()."""
    mission: str                               # the original mission statement
    findings: List[Finding]                    # findings saved during the run
    entities_discovered: List[Dict[str, Any]]  # entities seen in graph lookups
    connections_mapped: List[Dict[str, Any]]   # relationships collected
    report: str                                # final narrative report text
    iterations: int                            # LLM loop iterations consumed
    tools_used: List[str]                      # unique tool names invoked
    status: str = "completed"
+
+
# Tool definitions for Cerebras API.
# OpenAI-compatible function-calling schema: each entry declares one tool the
# model may invoke; execution is dispatched by name in
# InvestigatorAgent._execute_tool.
TOOLS = [
    # Search the NUMIDIUM knowledge graph for known entities.
    {
        "type": "function",
        "function": {
            "name": "search_entity",
            "description": "Buscar entidade no NUMIDIUM (grafo de conhecimento) por nome. Use para encontrar pessoas, empresas ou locais já conhecidos.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Nome ou termo para buscar"
                    },
                    "entity_type": {
                        "type": "string",
                        "enum": ["person", "organization", "location", "any"],
                        "description": "Tipo de entidade (opcional)"
                    }
                },
                "required": ["query"]
            }
        }
    },
    # List the relationship network of one graph entity.
    {
        "type": "function",
        "function": {
            "name": "get_connections",
            "description": "Obter a rede de conexões de uma entidade específica. Retorna entidades relacionadas.",
            "parameters": {
                "type": "object",
                "properties": {
                    "entity_id": {
                        "type": "string",
                        "description": "ID da entidade no NUMIDIUM"
                    }
                },
                "required": ["entity_id"]
            }
        }
    },
    # Official Brazilian company registry lookup (BrasilAPI).
    {
        "type": "function",
        "function": {
            "name": "lookup_cnpj",
            "description": "Consultar dados de uma empresa brasileira pelo CNPJ. Retorna razão social, sócios, endereço, CNAEs, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "cnpj": {
                        "type": "string",
                        "description": "CNPJ da empresa (com ou sem formatação)"
                    }
                },
                "required": ["cnpj"]
            }
        }
    },
    # General web search via the Lancer service.
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Pesquisar informações na web. Use para buscar notícias, artigos e informações públicas.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Termo de busca"
                    },
                    "freshness": {
                        "type": "string",
                        "enum": ["day", "week", "month", "any"],
                        "description": "Frescor dos resultados",
                        "default": "any"
                    }
                },
                "required": ["query"]
            }
        }
    },
    # Multi-dimensional deep research via Lancer.
    {
        "type": "function",
        "function": {
            "name": "deep_research",
            "description": "Pesquisa profunda e multi-dimensional sobre um tema. Use para tópicos complexos.",
            "parameters": {
                "type": "object",
                "properties": {
                    "topic": {
                        "type": "string",
                        "description": "Tópico para pesquisa profunda"
                    }
                },
                "required": ["topic"]
            }
        }
    },
    # Record a finding in the agent's state (InvestigatorAgent.findings).
    {
        "type": "function",
        "function": {
            "name": "save_finding",
            "description": "Salvar uma descoberta importante da investigação.",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Título curto da descoberta"
                    },
                    "content": {
                        "type": "string",
                        "description": "Conteúdo detalhado"
                    },
                    "source": {
                        "type": "string",
                        "description": "Fonte da informação"
                    }
                },
                "required": ["title", "content", "source"]
            }
        }
    },
    # Ends the investigate() loop; its summary becomes the final report.
    {
        "type": "function",
        "function": {
            "name": "finish_investigation",
            "description": "Finalizar a investigação e gerar o relatório final.",
            "parameters": {
                "type": "object",
                "properties": {
                    "summary": {
                        "type": "string",
                        "description": "Resumo das descobertas principais"
                    }
                },
                "required": ["summary"]
            }
        }
    }
]
+
+
# System prompt for the investigator loop.
# FIX: the literal "<think>" tag in the last rule had been stripped
# (matching the tag loss seen in sanitize_text); restored so the model is
# actually told which tag not to emit.
SYSTEM_PROMPT = """Você é um agente investigador autônomo do sistema NUMIDIUM/AVANGARD. /no_think

Sua missão é investigar temas usando múltiplas fontes de dados:
- NUMIDIUM: Grafo de conhecimento com entidades e relacionamentos
- Consulta CNPJ: Dados oficiais de empresas brasileiras (BrasilAPI)
- Web Search: Pesquisa na internet via Lancer

## Estratégia de Investigação:

1. Comece buscando no NUMIDIUM se já temos informações sobre o alvo
2. Para empresas brasileiras, consulte o CNPJ para obter sócios e dados
3. Use web_search para buscar notícias e informações públicas
4. Para cada sócio/conexão descoberta, considere investigar mais a fundo
5. Use save_finding para registrar descobertas importantes
6. Quando tiver informações suficientes, use finish_investigation

## Regras:
- Seja metódico e siga pistas
- Não invente informações - use apenas dados das ferramentas
- Priorize qualidade sobre quantidade
- Cite sempre as fontes
- NÃO use pensamento interno ou tags <think>. Responda diretamente."""
+
+
class InvestigatorAgent:
    """Autonomous investigation agent with tool calling.

    Drives an iterative loop against the Cerebras chat-completions API:
    the model picks tools (graph lookup, CNPJ lookup, web search, ...),
    the agent executes them locally and feeds the results back as `tool`
    messages until the model calls finish_investigation or the iteration
    budget is exhausted.

    Annotations that name symbols defined elsewhere in the package
    (Session, Finding, InvestigationResult) are quoted so the class can be
    defined without those imports being resolved eagerly.
    """

    def __init__(self):
        # Cerebras OpenAI-compatible endpoint and model selection
        self.api_url = "https://api.cerebras.ai/v1/chat/completions"
        self.api_key = settings.cerebras_api_key
        self.model = "zai-glm-4.7"

        # Investigation state — re-initialized per run by _reset_state()
        self.findings: List["Finding"] = []
        self.entities_discovered: List[Dict[str, Any]] = []
        self.connections_mapped: List[Dict[str, Any]] = []
        self.tools_used: List[str] = []
        self.messages: List[Dict[str, Any]] = []
        self.db: Optional["Session"] = None

    def _reset_state(self):
        """Reset per-investigation state (keeps API config and db handle)."""
        self.findings = []
        self.entities_discovered = []
        self.connections_mapped = []
        self.tools_used = []
        self.messages = []

    async def _call_llm(
        self,
        messages: List[Dict[str, Any]],
        tools: Optional[List[Dict]] = None
    ) -> Dict[str, Any]:
        """Call the Cerebras API with optional tool-calling support.

        Args:
            messages: Chat history in OpenAI message format.
            tools: Optional tool schemas; when given, tool choice is "auto"
                and parallel tool calls are enabled.

        Returns:
            The parsed JSON response body.

        Raises:
            Exception: wraps any transport error or non-200 response.
        """
        try:
            payload = {
                "model": self.model,
                "messages": messages,
                "temperature": 0.3,
                "max_tokens": 2048,
            }

            if tools:
                payload["tools"] = tools
                payload["tool_choice"] = "auto"
                payload["parallel_tool_calls"] = True

            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.post(
                    self.api_url,
                    headers={
                        "Authorization": f"Bearer {self.api_key}",
                        "Content-Type": "application/json"
                    },
                    json=payload
                )

                if response.status_code != 200:
                    raise Exception(f"API error: {response.status_code} - {response.text}")

                return response.json()

        except Exception as e:
            raise Exception(f"LLM call failed: {str(e)}")

    async def _execute_tool(self, tool_name: str, arguments: Dict) -> str:
        """Execute a tool by name and return its textual result.

        Unknown tools and tool-level failures are reported as strings so
        the model can react to them instead of the loop crashing.
        """
        self.tools_used.append(tool_name)

        try:
            if tool_name == "search_entity":
                return await self._search_entity(
                    arguments.get("query", ""),
                    arguments.get("entity_type")
                )

            elif tool_name == "get_connections":
                return await self._get_connections(arguments.get("entity_id"))

            elif tool_name == "lookup_cnpj":
                return await self._lookup_cnpj(arguments.get("cnpj", ""))

            elif tool_name == "web_search":
                return await self._web_search(
                    arguments.get("query", ""),
                    arguments.get("freshness", "any")
                )

            elif tool_name == "deep_research":
                return await self._deep_research(arguments.get("topic", ""))

            # NOTE(review): the aether_* tools are dispatched here but are
            # not declared in TOOLS, and `aethermap` is never imported in
            # this module — if invoked they return the NameError as an
            # error string. TODO confirm the intended aethermap import.
            elif tool_name == "aether_search":
                return await self._aether_search(arguments.get("query", ""))

            elif tool_name == "aether_entities":
                return await self._aether_entities()

            elif tool_name == "save_finding":
                finding = Finding(
                    title=arguments.get("title", ""),
                    content=arguments.get("content", ""),
                    source=arguments.get("source", "")
                )
                self.findings.append(finding)
                return f"Descoberta salva: {finding.title}"

            elif tool_name == "finish_investigation":
                # Sentinel string consumed by investigate() to end the loop.
                return f"INVESTIGATION_COMPLETE: {arguments.get('summary', '')}"

            else:
                return f"Ferramenta desconhecida: {tool_name}"

        except Exception as e:
            return f"Erro ao executar {tool_name}: {str(e)}"

    async def _search_entity(self, query: str, entity_type: Optional[str]) -> str:
        """Search entities in the database by (partial, case-insensitive) name."""
        if not self.db:
            return "Erro: Banco de dados não disponível"

        q = self.db.query(Entity).filter(Entity.name.ilike(f"%{query}%"))
        if entity_type and entity_type != "any":
            q = q.filter(Entity.type == entity_type)

        entities = q.limit(10).all()

        if entities:
            result = []
            for e in entities:
                # Track every hit for the final InvestigationResult.
                self.entities_discovered.append({
                    "id": str(e.id),
                    "name": e.name,
                    "type": e.type
                })
                result.append({
                    "id": str(e.id),
                    "name": e.name,
                    "type": e.type,
                    "description": e.description[:200] if e.description else None
                })
            return json.dumps(result, ensure_ascii=False, indent=2)

        return "Nenhuma entidade encontrada no NUMIDIUM."

    async def _get_connections(self, entity_id: str) -> str:
        """Get up to 20 relationships touching the given entity (either end)."""
        if not self.db:
            return "Erro: Banco de dados não disponível"

        relationships = self.db.query(Relationship).filter(
            (Relationship.source_id == entity_id) | (Relationship.target_id == entity_id)
        ).limit(20).all()

        if relationships:
            connections = []
            for rel in relationships:
                source = self.db.query(Entity).filter(Entity.id == rel.source_id).first()
                target = self.db.query(Entity).filter(Entity.id == rel.target_id).first()
                if source and target:
                    connections.append({
                        "source": source.name,
                        "target": target.name,
                        "type": rel.type
                    })
            return json.dumps(connections, ensure_ascii=False, indent=2)

        return "Nenhuma conexão encontrada."

    async def _lookup_cnpj(self, cnpj: str) -> str:
        """Lookup CNPJ via BrasilAPI and return a JSON summary string."""
        # Strip common CNPJ formatting (dots, slash, dash).
        cnpj_clean = cnpj.replace(".", "").replace("/", "").replace("-", "")
        result = await consultar_cnpj(cnpj_clean)

        if result:
            data = {
                "razao_social": result.razao_social,
                "nome_fantasia": result.nome_fantasia,
                "situacao": result.situacao,
                "data_abertura": result.data_abertura,
                "capital_social": result.capital_social,
                "endereco": f"{result.logradouro}, {result.numero} - {result.cidade}/{result.uf}",
                "cnae": f"{result.cnae_principal} - {result.cnae_descricao}",
                "socios": result.socios
            }
            return json.dumps(data, ensure_ascii=False, indent=2)

        return "CNPJ não encontrado."

    async def _lookup_phone(self, phone: str) -> str:
        """Lookup phone number via NumVerify API.

        NOTE(review): not declared in TOOLS and not dispatched by
        _execute_tool — currently dead code; confirm whether a
        lookup_phone tool was intended.
        """
        # Clean phone number - keep only digits
        phone_clean = "".join(c for c in phone if c.isdigit())

        # NumVerify API key (free tier: 100 req/month)
        numverify_key = getattr(settings, 'numverify_api_key', None)

        if not numverify_key:
            # Fallback: just do a web search for the number
            return await self._web_search(f'"{phone_clean}" telefone', "any")

        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(
                    "http://apilayer.net/api/validate",
                    params={
                        "access_key": numverify_key,
                        "number": phone_clean,
                        "country_code": "",  # Auto-detect
                        "format": 1
                    }
                )

                if response.status_code == 200:
                    data = response.json()

                    if data.get("valid"):
                        result = {
                            "numero": data.get("international_format"),
                            "valido": True,
                            "pais": data.get("country_name"),
                            "codigo_pais": data.get("country_code"),
                            "operadora": data.get("carrier"),
                            "tipo_linha": data.get("line_type"),  # mobile, landline, etc
                            "localizacao": data.get("location")
                        }
                        return json.dumps(result, ensure_ascii=False, indent=2)
                    else:
                        return f"Número {phone_clean} não é válido ou não foi encontrado."

                return "Erro ao consultar número."

        except Exception:
            # Network/API failure — degrade gracefully to a plain web search.
            return await self._web_search(f'"{phone_clean}" telefone', "any")

    async def _web_search(self, query: str, freshness: str) -> str:
        """Web search via Lancer; returns the synthesized answer summary."""
        try:
            result = await lancer.search(query, max_results=5, freshness=freshness)
            if result.answer:
                return f"Resumo: {result.answer}\n\nFontes: {len(result.results)} resultados"
            return "Nenhum resultado encontrado."
        except Exception as e:
            return f"Erro na busca web: {str(e)}"

    async def _deep_research(self, topic: str) -> str:
        """Deep multi-dimensional research via Lancer."""
        try:
            result = await lancer.deep_research(topic, max_dimensions=3)
            if result.answer:
                return result.answer
            return "Pesquisa profunda não retornou resultados."
        except Exception as e:
            return f"Erro na pesquisa: {str(e)}"

    async def _aether_search(self, query: str) -> str:
        """Semantic search via AetherMap.

        NOTE(review): `aethermap` is not imported in this module; the
        NameError raised here is caught and returned as an error string.
        """
        try:
            # Check if we have a job_id cached
            if not aethermap.current_job_id:
                # Index entities from database first (capped at 500)
                if self.db:
                    entities = self.db.query(Entity).limit(500).all()
                    if entities:
                        texts = []
                        for e in entities:
                            text = f"{e.name} ({e.type})"
                            if e.description:
                                text += f": {e.description[:500]}"
                            texts.append(text)

                        if texts:
                            result = await aethermap.process_documents(texts, fast_mode=True)
                            # Continue with search

            if aethermap.current_job_id:
                result = await aethermap.semantic_search(query, turbo_mode=True)
                return f"RAG Response:\n{result.summary}"
            else:
                return "Nenhum documento indexado no AetherMap."

        except Exception as e:
            return f"Erro no AetherMap search: {str(e)}"

    async def _aether_entities(self) -> str:
        """Extract NER entities via AetherMap (requires prior indexing).

        NOTE(review): depends on the same missing `aethermap` import as
        _aether_search.
        """
        try:
            if not aethermap.current_job_id:
                return "Nenhum documento indexado. Use aether_search primeiro."

            result = await aethermap.extract_entities()

            # Format response
            output = []

            if result.hubs:
                output.append("**Entidades Centrais (Hubs):**")
                for hub in result.hubs[:5]:
                    output.append(f"- {hub.get('entity')} ({hub.get('type')}): {hub.get('degree')} conexões")

            if result.insights:
                output.append(f"\n**Insights:**")
                output.append(f"- Total de conexões: {result.insights.get('total_connections', 0)}")
                output.append(f"- Grau médio: {result.insights.get('avg_degree', 0)}")

            if result.edges:
                output.append(f"\n**Top 5 Relacionamentos:**")
                for edge in result.edges[:5]:
                    output.append(f"- {edge.source_entity} <-> {edge.target_entity}: {edge.reason}")

            return "\n".join(output) if output else "Nenhuma entidade significativa encontrada."

        except Exception as e:
            return f"Erro na extração de entidades: {str(e)}"

    async def investigate(
        self,
        mission: str,
        db: "Session",
        max_iterations: int = 10
    ) -> "InvestigationResult":
        """Main investigation loop.

        Args:
            mission: Natural-language investigation goal.
            db: SQLAlchemy session for the NUMIDIUM graph lookups.
            max_iterations: Upper bound on LLM round-trips.

        Returns:
            InvestigationResult with sanitized report and findings.
        """
        self._reset_state()
        self.db = db

        self.messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Missão de investigação: {mission}\n\nComece a investigação."}
        ]

        iteration = 0
        final_summary = ""

        while iteration < max_iterations:
            iteration += 1

            response = await self._call_llm(self.messages, TOOLS)

            choice = response["choices"][0]
            message = choice["message"]
            self.messages.append(message)

            tool_calls = message.get("tool_calls", [])

            # No tool calls → the model answered directly; treat as final.
            if not tool_calls:
                if message.get("content"):
                    final_summary = message["content"]
                break

            for tool_call in tool_calls:
                func = tool_call["function"]
                tool_name = func["name"]

                try:
                    arguments = json.loads(func["arguments"])
                except (json.JSONDecodeError, TypeError):
                    # Model emitted malformed arguments — run with none
                    # rather than aborting the whole investigation.
                    arguments = {}

                result = await self._execute_tool(tool_name, arguments)

                # finish_investigation short-circuits; no tool message is
                # appended because no further LLM call will be made.
                if result.startswith("INVESTIGATION_COMPLETE:"):
                    final_summary = result.replace("INVESTIGATION_COMPLETE:", "").strip()
                    break

                self.messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call["id"],
                    "content": result
                })

            if final_summary:
                break

        # Loop budget exhausted without a summary → ask for a report.
        if not final_summary:
            final_summary = await self._generate_report(mission)

        # Sanitize all text outputs to remove thinking artifacts
        final_summary = sanitize_text(final_summary)

        # Sanitize findings content
        sanitized_findings = []
        for f in self.findings:
            sanitized_findings.append(Finding(
                title=sanitize_text(f.title),
                content=sanitize_text(f.content),
                source=f.source,
                timestamp=f.timestamp
            ))

        return InvestigationResult(
            mission=mission,
            findings=sanitized_findings,
            entities_discovered=self.entities_discovered,
            connections_mapped=self.connections_mapped,
            report=final_summary,
            iterations=iteration,
            tools_used=list(set(self.tools_used)),
            status="completed"
        )

    async def _generate_report(self, mission: str) -> str:
        """Generate the final report from accumulated state via one LLM call."""
        findings_text = "\n".join([
            f"- {f.title}: {f.content} (Fonte: {f.source})"
            for f in self.findings
        ]) or "Nenhuma descoberta registrada."

        entities_text = ", ".join([
            e.get("name", "Unknown") for e in self.entities_discovered[:10]
        ]) or "Nenhuma entidade."

        prompt = f"""Gere um relatório de investigação:

Missão: {mission}

Descobertas:
{findings_text}

Entidades: {entities_text}

Ferramentas usadas: {', '.join(set(self.tools_used))}

Gere relatório estruturado com: Resumo Executivo, Descobertas, Entidades, Recomendações."""

        response = await self._call_llm([
            {"role": "system", "content": "Gere relatórios concisos."},
            {"role": "user", "content": prompt}
        ])

        return sanitize_text(response["choices"][0]["message"]["content"])
+
+
# Module-level singleton; per-run state is re-initialized by
# investigate() via _reset_state(), so one instance is shared by importers.
investigator_agent = InvestigatorAgent()
diff --git a/app/services/lancer.py b/app/services/lancer.py
new file mode 100644
index 0000000000000000000000000000000000000000..179868cdd00136f0a9376b6ea6fdff3df5b48abf
--- /dev/null
+++ b/app/services/lancer.py
@@ -0,0 +1,198 @@
+"""
+Lancer Deep Research Service
+Integrates with Lancer Search API for AI-powered research
+"""
+import httpx
+from typing import Optional, List, Dict, Any
+from dataclasses import dataclass
+
+
# Base URL of the Lancer search service (Hugging Face Space deployment).
LANCER_BASE_URL = "https://madras1-lancer.hf.space"
+
+
@dataclass
class SearchResult:
    """Individual search result from Lancer"""
    title: str     # result title/headline
    url: str       # source URL
    content: str   # snippet or scraped content
    score: float   # relevance score assigned by Lancer
    published_date: Optional[str] = None  # publication date, when provided
+
+
@dataclass
class ResearchResponse:
    """Response from Lancer research/search"""
    query: str                        # the query that was executed
    answer: Optional[str]             # AI-synthesized answer/report, if any
    results: List[SearchResult]       # individual source hits
    citations: List[Dict[str, Any]]   # citation metadata passed through from Lancer
    processing_time_ms: float         # server-side processing time
    raw_text: str  # Combined text for NER extraction
+
+
async def search(
    query: str,
    max_results: int = 10,
    freshness: str = "any"
) -> ResearchResponse:
    """
    Run a Lancer search with AI answer synthesis.

    Args:
        query: Search terms.
        max_results: Cap on returned hits.
        freshness: Recency filter ("day", "week", "month", "any").

    Raises:
        Exception: wraps any transport or API failure.
    """
    try:
        payload = {
            "query": query,
            "max_results": max_results,
            "freshness": freshness,
            "include_answer": True
        }

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(f"{LANCER_BASE_URL}/api/v1/search", json=payload)

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            hits = []
            for item in data.get("results", []):
                hits.append(SearchResult(
                    title=item.get("title", ""),
                    url=item.get("url", ""),
                    content=item.get("content", ""),
                    score=item.get("score", 0.0),
                    published_date=item.get("published_date")
                ))

            # Synthesized answer followed by every hit, for downstream NER.
            combined = data.get("answer", "") or ""
            combined += "".join(f"\n{h.title}. {h.content}" for h in hits)

            return ResearchResponse(
                query=data.get("query", query),
                answer=data.get("answer"),
                results=hits,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=combined
            )

    except Exception as e:
        raise Exception(f"Lancer search failed: {str(e)}")
+
+
async def deep_research(
    query: str,
    max_dimensions: int = 5,
    max_sources_per_dim: int = 5
) -> ResearchResponse:
    """
    Perform deep multi-dimensional research using Lancer API.
    This provides richer, more comprehensive analysis.

    Args:
        query: Topic to research.
        max_dimensions: Maximum number of research dimensions explored.
        max_sources_per_dim: Source cap per dimension.

    Returns:
        ResearchResponse whose `answer` is the final report (may be empty
        when the API returned none) and whose `raw_text` is the report
        followed by every per-dimension result, for downstream NER.

    Raises:
        Exception: wraps any transport or API failure.
    """
    try:
        # Deep research can take a while — allow up to two minutes.
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{LANCER_BASE_URL}/api/v1/research/deep",
                json={
                    "query": query,
                    "max_dimensions": max_dimensions,
                    "max_sources_per_dim": max_sources_per_dim,
                    "max_total_searches": 20
                }
            )

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            # Deep research returns a different format - adapt it
            results = []
            raw_text = ""

            # Extract from dimensions if present
            if "dimensions" in data:
                for dim in data["dimensions"]:
                    dim_name = dim.get("dimension", "")
                    raw_text += f"\n## {dim_name}\n"
                    for r in dim.get("results", []):
                        results.append(SearchResult(
                            title=r.get("title", ""),
                            url=r.get("url", ""),
                            content=r.get("content", ""),
                            score=r.get("score", 0.0)
                        ))
                        raw_text += f"{r.get('title', '')}. {r.get('content', '')}\n"

            # Add final report ("final_report" preferred; some responses
            # may use "report" instead) in front of the dimension text.
            final_report = data.get("final_report", data.get("report", ""))
            if final_report:
                raw_text = final_report + "\n" + raw_text

            return ResearchResponse(
                query=query,
                answer=final_report,
                results=results,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=raw_text
            )

    except Exception as e:
        raise Exception(f"Lancer deep research failed: {str(e)}")
+
+
async def heavy_search(
    query: str,
    max_results: int = 5
) -> ResearchResponse:
    """
    Heavy search: Lancer scrapes the full content of each source.
    Slower than `search`, but yields more context per result.
    """
    try:
        payload = {
            "query": query,
            "max_results": max_results,
            "include_answer": True
        }

        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                f"{LANCER_BASE_URL}/api/v1/search/heavy",
                json=payload
            )

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            hits = []
            for item in data.get("results", []):
                hits.append(SearchResult(
                    title=item.get("title", ""),
                    url=item.get("url", ""),
                    content=item.get("content", ""),
                    score=item.get("score", 0.0)
                ))

            # Answer text plus every scraped hit, for downstream NER.
            combined = data.get("answer", "") or ""
            combined += "".join(f"\n{h.title}. {h.content}" for h in hits)

            return ResearchResponse(
                query=query,
                answer=data.get("answer"),
                results=hits,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=combined
            )

    except Exception as e:
        raise Exception(f"Lancer heavy search failed: {str(e)}")
diff --git a/app/services/nlp/__init__.py b/app/services/nlp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9265c7e61b3b29a87dcd75c4455abd114be3e18
--- /dev/null
+++ b/app/services/nlp/__init__.py
@@ -0,0 +1,2 @@
+# NLP Services
+from .entity_extractor import entity_extractor
diff --git a/app/services/nlp/__pycache__/__init__.cpython-311.pyc b/app/services/nlp/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8671a044592ad7e7b9a10ee976be1a78f1f7958d
Binary files /dev/null and b/app/services/nlp/__pycache__/__init__.cpython-311.pyc differ
diff --git a/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc b/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7aac7bb33e176996105a8d539ec88db2b3ceaf5
Binary files /dev/null and b/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc differ
diff --git a/app/services/nlp/entity_extractor.py b/app/services/nlp/entity_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8855cc0c67661840ce99ce63f7aafe9da23e60b2
--- /dev/null
+++ b/app/services/nlp/entity_extractor.py
@@ -0,0 +1,265 @@
+"""
+Entity Extractor Service - LLM-based NER
+Uses Cerebras API with Qwen 3 235B for intelligent entity and relationship extraction
+"""
+import json
+import re
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+import httpx
+
+from app.config import settings
+
+
@dataclass
class ExtractedEntity:
    """Represents an extracted entity"""
    name: str  # canonical name of the entity
    type: str  # person, organization, location, event
    role: Optional[str] = None  # role/position, when stated in the text
    aliases: Optional[List[str]] = None  # nicknames, acronyms, alternate names
    description: Optional[str] = None  # short free-text description
    latitude: Optional[float] = None   # geo coordinates — not filled by the
    longitude: Optional[float] = None  # LLM prompt; presumably set later. TODO confirm
    event_date: Optional[str] = None  # Date in ISO format (YYYY-MM-DD)
+
+
@dataclass
class ExtractedRelationship:
    """Represents a relationship between entities"""
    source: str  # name of the first entity
    target: str  # name of the second entity
    relationship_type: str  # e.g. "trabalha em", "preside", "fundou"
    context: Optional[str] = None  # sentence/context the relation came from
    event_date: Optional[str] = None  # Date in ISO format (YYYY-MM-DD)
+
+
@dataclass
class ExtractedEvent:
    """Represents an extracted event"""
    description: str  # what happened
    event_type: Optional[str] = None  # meeting, announcement, election, ...
    date: Optional[str] = None  # ISO date (YYYY-MM-DD) or bare year
    location: Optional[str] = None  # where it happened, if mentioned
    participants: Optional[List[str]] = None  # entity names involved
+
+
@dataclass
class ExtractionResult:
    """Complete extraction result"""
    entities: List[ExtractedEntity]            # all entities found
    relationships: List[ExtractedRelationship]  # all entity-entity relations
    events: List[ExtractedEvent]               # all events found
    raw_response: Optional[str] = None  # untouched LLM output, kept for debugging
+
+
# Prompt template for LLM-based NER. Filled via str.format(text=...);
# literal JSON braces in the template are escaped as {{ }} so .format()
# leaves them intact.
EXTRACTION_PROMPT = """Você é um especialista em extração de informações estruturadas de textos.

Analise o texto fornecido e extraia TODAS as entidades, relacionamentos e eventos mencionados.

## Regras:
1. Identifique entidades: pessoas, organizações, locais, eventos
2. Para PESSOAS: inclua nome completo (se mencionado ou conhecido), cargo/função
3. Para ORGANIZAÇÕES: inclua nome oficial e siglas
4. Para LOCAIS: seja específico (cidade, país, endereço)
5. Identifique RELACIONAMENTOS entre entidades (quem trabalha onde, quem conhece quem, etc.)
6. Identifique EVENTOS mencionados (reuniões, anúncios, eleições, etc.)
7. EXTRAIA DATAS sempre que mencionadas (formato YYYY-MM-DD ou YYYY se só o ano)

## Formato de resposta (JSON válido):
```json
{{
  "entities": [
    {{
      "name": "Nome Completo",
      "type": "person|organization|location|event",
      "role": "cargo ou função (opcional)",
      "aliases": ["apelidos", "siglas"],
      "description": "breve descrição se relevante",
      "event_date": "YYYY-MM-DD ou YYYY (data relevante como nascimento, fundação, etc)"
    }}
  ],
  "relationships": [
    {{
      "source": "Nome da Entidade 1",
      "target": "Nome da Entidade 2",
      "relationship_type": "tipo de relação (trabalha em, preside, fundou, reuniu-se com, etc.)",
      "context": "contexto da relação",
      "event_date": "YYYY-MM-DD ou YYYY (quando o relacionamento aconteceu/iniciou)"
    }}
  ],
  "events": [
    {{
      "description": "O que aconteceu",
      "event_type": "meeting|announcement|election|crime|etc",
      "date": "YYYY-MM-DD ou YYYY",
      "location": "local se mencionado",
      "participants": ["lista de participantes"]
    }}
  ]
}}
```

Retorne APENAS o JSON, sem texto adicional.

## Texto para análise:
{text}
"""
+
+
class EntityExtractor:
    """
    LLM-based Entity Extractor using the Cerebras API.

    Sends EXTRACTION_PROMPT (with the input text interpolated) to the model
    and parses the JSON it returns into ExtractedEntity /
    ExtractedRelationship / ExtractedEvent objects.

    Return annotations are quoted so the class is definable regardless of
    declaration order at import time.
    """

    def __init__(self):
        self.api_key = settings.cerebras_api_key
        self.base_url = "https://api.cerebras.ai/v1"
        self.model = "qwen-3-235b-a22b-instruct-2507"
        self.timeout = 60.0  # seconds per HTTP request

    async def extract(self, text: str) -> "ExtractionResult":
        """
        Extract entities, relationships, and events from text using the LLM.

        Args:
            text: The text to analyze.

        Returns:
            ExtractionResult with all extracted information (empty for
            inputs shorter than 10 non-whitespace characters).

        Raises:
            ValueError: on missing API key, timeout, connection error,
                non-200 status, or unexpected response shape.
        """
        if not self.api_key:
            raise ValueError("CEREBRAS_API_KEY not configured. Please set the environment variable.")

        # Too little text to extract anything meaningful from.
        if not text or len(text.strip()) < 10:
            return ExtractionResult(entities=[], relationships=[], events=[])

        # Prepare the prompt
        prompt = EXTRACTION_PROMPT.format(text=text)

        try:
            # Call Cerebras API
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.base_url}/chat/completions",
                    headers={
                        "Authorization": f"Bearer {self.api_key}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": self.model,
                        "messages": [
                            {
                                "role": "system",
                                "content": "Você é um assistente especialista em extração de entidades e relacionamentos. Sempre responda em JSON válido."
                            },
                            {
                                "role": "user",
                                "content": prompt
                            }
                        ],
                        "temperature": 0.1,  # Low temperature for consistent extraction
                        "max_tokens": 4096
                    }
                )

                if response.status_code != 200:
                    error_text = response.text
                    print(f"Cerebras API error: {response.status_code} - {error_text}")
                    raise ValueError(f"Cerebras API error: {response.status_code}")

                data = response.json()

                # Parse the response
                raw_content = data["choices"][0]["message"]["content"]
                return self._parse_response(raw_content)

        except httpx.TimeoutException:
            print("Cerebras API timeout")
            raise ValueError("API timeout - please try again with shorter text")
        except httpx.RequestError as e:
            print(f"Cerebras API request error: {e}")
            raise ValueError(f"API connection error: {str(e)}")
        except KeyError as e:
            print(f"Unexpected API response format: {e}")
            raise ValueError("Unexpected API response format")

    def _parse_response(self, content: str) -> "ExtractionResult":
        """Parse the LLM response into structured data.

        Never raises: any malformed response yields an empty result with
        the raw content preserved for debugging.
        """
        try:
            # Try to extract JSON from the response.
            # Sometimes the model wraps it in ```json ... ```
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Try to find raw JSON (greedy: outermost braces)
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    json_str = json_match.group(0)
                else:
                    json_str = content

            data = json.loads(json_str)

            # Parse entities
            entities = []
            for e in data.get("entities", []):
                entities.append(ExtractedEntity(
                    name=e.get("name", ""),
                    type=e.get("type", "unknown"),
                    role=e.get("role"),
                    aliases=e.get("aliases", []),
                    description=e.get("description"),
                    event_date=e.get("event_date")
                ))

            # Parse relationships
            relationships = []
            for r in data.get("relationships", []):
                relationships.append(ExtractedRelationship(
                    source=r.get("source", ""),
                    target=r.get("target", ""),
                    relationship_type=r.get("relationship_type", "related_to"),
                    context=r.get("context"),
                    event_date=r.get("event_date")
                ))

            # Parse events
            events = []
            for ev in data.get("events", []):
                events.append(ExtractedEvent(
                    description=ev.get("description", ""),
                    event_type=ev.get("event_type"),
                    date=ev.get("date"),
                    location=ev.get("location"),
                    participants=ev.get("participants", [])
                ))

            return ExtractionResult(
                entities=entities,
                relationships=relationships,
                events=events,
                raw_response=content
            )

        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            # JSONDecodeError: not valid JSON at all.
            # AttributeError/TypeError: valid JSON but not the expected
            # dict-of-lists shape (e.g. a bare list or string) — previously
            # these escaped uncaught.
            print(f"Failed to parse LLM response: {e}")
            print(f"Raw content: {content}")
            return ExtractionResult(
                entities=[],
                relationships=[],
                events=[],
                raw_response=content
            )

    def extract_sync(self, text: str) -> "ExtractionResult":
        """
        Synchronous version of extract for non-async contexts.

        Note: uses asyncio.run(), so it must NOT be called from inside an
        already-running event loop (it raises RuntimeError there).
        """
        import asyncio
        return asyncio.run(self.extract(text))
+
+
# Singleton instance shared by importers of this module
# (re-exported from app.services.nlp.__init__).
entity_extractor = EntityExtractor()
diff --git a/app/services/transparencia_api.py b/app/services/transparencia_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..13face26b77d772b00023efd68c0a6af8dc03d9e
--- /dev/null
+++ b/app/services/transparencia_api.py
@@ -0,0 +1,146 @@
+"""
+Portal da Transparência APIs
+Access to Brazilian government transparency data
+"""
+import httpx
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+
+
+# Portal da Transparência base URL
+TRANSPARENCIA_URL = "https://api.portaldatransparencia.gov.br/api-de-dados"
+
+
@dataclass
class SanctionRecord:
    """Normalized sanction entry from the Portal da Transparência
    sanction registers (CEIS / CNEP / CEPIM)."""
    tipo: str  # Source register: "CEIS", "CNEP" or "CEPIM"
    cpf_cnpj: str  # Sanctioned party's CPF (person) or CNPJ (company)
    nome: str  # Name / corporate name of the sanctioned party
    tipo_pessoa: str  # 'F' = pessoa física (individual), 'J' = pessoa jurídica (legal entity)

    # Sanction details — empty string when the API omits the field
    tipo_sancao: str = ""  # Human-readable sanction type description
    data_inicio: str = ""  # Sanction start date, as formatted by the API
    data_fim: str = ""  # Sanction end date, as formatted by the API
    orgao_sancionador: str = ""  # Government body that applied the sanction
    uf_orgao: str = ""  # State (UF) of the sanctioning body
    fundamentacao_legal: str = ""  # Legal basis cited for the sanction

    # Source
    fonte_url: str = ""  # Public portal URL for the register this record came from
+
+
async def consultar_ceis(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CEIS - Cadastro de Empresas Inidôneas e Suspensas
    (register of companies declared unfit or suspended from public contracting).

    Args:
        cnpj_cpf: CNPJ or CPF to look up.
        token: Portal da Transparência API key ("chave-api-dados" header);
            optional — see note below.

    Returns:
        Matching SanctionRecord entries; empty list on error or no match.

    Note: Requires authentication token from Portal da Transparência
    """
    # Without token, we can still try - some endpoints work without auth
    return await _query_sanctions("ceis", cnpj_cpf, token)
+
+
async def consultar_cnep(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CNEP - Cadastro Nacional de Empresas Punidas
    (national register of punished companies, Lei Anticorrupção).

    Args:
        cnpj_cpf: CNPJ or CPF to look up.
        token: Optional Portal da Transparência API key.

    Returns:
        Matching SanctionRecord entries; empty list on error or no match.
    """
    return await _query_sanctions("cnep", cnpj_cpf, token)
+
+
async def consultar_cepim(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CEPIM - Cadastro de Entidades Privadas sem Fins Lucrativos Impedidas
    (register of non-profit entities barred from federal agreements).

    Args:
        cnpj_cpf: CNPJ or CPF to look up.
        token: Optional Portal da Transparência API key.

    Returns:
        Matching SanctionRecord entries; empty list on error or no match.
    """
    return await _query_sanctions("cepim", cnpj_cpf, token)
+
+
def _sanction_field(value: Any, key: str = "descricao") -> str:
    """Flatten an API field that may be a nested object, a scalar or null.

    Returns value[key] when value is a dict, '' when value is None, and
    str(value) otherwise. Fixes the original str(None) path that stored the
    literal string 'None' in SanctionRecord fields.
    """
    if isinstance(value, dict):
        return value.get(key, "") or ""
    if value is None:
        return ""
    return str(value)


async def _query_sanctions(
    endpoint: str,
    cnpj_cpf: str,
    token: Optional[str] = None
) -> List[SanctionRecord]:
    """Internal function to query one sanction API endpoint.

    Args:
        endpoint: API path segment ("ceis", "cnep" or "cepim").
        cnpj_cpf: CNPJ or CPF to look up.
        token: Optional API key, sent as the "chave-api-dados" header.

    Returns:
        Parsed SanctionRecord list; [] on auth failure, non-200 status or
        exception — best-effort by design, callers treat "error" as "no data".
    """
    try:
        headers = {}
        if token:
            headers["chave-api-dados"] = token

        params = {"cnpjCpf": cnpj_cpf}

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(
                f"{TRANSPARENCIA_URL}/{endpoint}",
                params=params,
                headers=headers
            )

        if response.status_code == 401:
            # Need authentication - return empty for now
            print(f"Portal da Transparência requires authentication for {endpoint}")
            return []

        if response.status_code != 200:
            return []

        data = response.json()
        # Some endpoints return a single object instead of a list
        if not isinstance(data, list):
            data = [data] if data else []

        return [
            SanctionRecord(
                tipo=endpoint.upper(),
                # `or ""` coerces JSON nulls so str-typed fields never hold None
                cpf_cnpj=item.get("cpfCnpj", "") or "",
                nome=item.get("nomeRazaoSocial", item.get("nome", "")) or "",
                tipo_pessoa=item.get("tipoPessoa", "") or "",
                tipo_sancao=_sanction_field(item.get("tipoSancao")),
                data_inicio=item.get("dataInicioSancao", "") or "",
                data_fim=item.get("dataFimSancao", "") or "",
                orgao_sancionador=_sanction_field(item.get("orgaoSancionador"), "nome"),
                uf_orgao=item.get("ufOrgaoSancionador", "") or "",
                fundamentacao_legal=item.get("fundamentacaoLegal", "") or "",
                fonte_url=f"https://portaldatransparencia.gov.br/{endpoint}"
            )
            for item in data
        ]

    except Exception as e:
        # Best-effort: log and degrade to "no records" rather than propagate
        print(f"Transparência API error ({endpoint}): {e}")
        return []
+
+
async def verificar_sancoes(cnpj_cpf: str, token: Optional[str] = None) -> Dict[str, Any]:
    """
    Check all sanction databases for a CNPJ/CPF
    Returns consolidated result
    """
    import asyncio

    # Fire the three register lookups concurrently
    ceis, cnep, cepim = await asyncio.gather(
        consultar_ceis(cnpj_cpf, token),
        consultar_cnep(cnpj_cpf, token),
        consultar_cepim(cnpj_cpf, token),
    )

    todas = [*ceis, *cnep, *cepim]

    registros = [
        {
            "tipo": s.tipo,
            "tipo_sancao": s.tipo_sancao,
            "orgao": s.orgao_sancionador,
            "inicio": s.data_inicio,
            "fim": s.data_fim,
            "fundamentacao": s.fundamentacao_legal
        }
        for s in todas
    ]

    return {
        "cnpj_cpf": cnpj_cpf,
        "tem_sancoes": bool(todas),
        "total_sancoes": len(todas),
        "ceis": len(ceis),
        "cnep": len(cnep),
        "cepim": len(cepim),
        "registros": registros
    }
diff --git a/app/services/tse_api.py b/app/services/tse_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..e851625961d9a2c673f2eab9d91c44145d05e9cc
--- /dev/null
+++ b/app/services/tse_api.py
@@ -0,0 +1,270 @@
+"""
+TSE (Tribunal Superior Eleitoral) API Service
+Access to Brazilian electoral data - candidates, assets, donations
+"""
+import httpx
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass, field
+
+
+# DivulgaCand API (unofficial but functional)
+TSE_DIVULGACAND_URL = "https://divulgacandcontas.tse.jus.br/divulga/rest/v1"
+
+
@dataclass
class Candidato:
    """Electoral candidate data as exposed by the TSE DivulgaCand API."""
    id: int  # DivulgaCand internal candidate id
    nome: str  # Full legal name
    nome_urna: str  # Ballot name
    cpf_parcial: str = ""  # TSE only shows partial CPF publicly
    numero: str = ""  # Ballot number
    cargo: str = ""  # Position sought (e.g. deputado, prefeito)
    partido_sigla: str = ""  # Party acronym
    partido_nome: str = ""  # Full party name
    coligacao: str = ""  # Coalition name
    situacao: str = ""  # Candidacy status as reported by the API

    # Location
    uf: str = ""  # State (UF) of the candidacy
    municipio: str = ""  # Municipality (relevant for municipal elections)

    # Personal
    data_nascimento: str = ""  # Birth date, API string format
    genero: str = ""  # Gender as reported by the API
    grau_instrucao: str = ""  # Education level
    ocupacao: str = ""  # Declared occupation

    # Assets
    total_bens: float = 0.0  # Declared total assets (BRL)
    bens: List[Dict[str, Any]] = field(default_factory=list)  # Itemized declared assets

    # Campaign
    total_receitas: float = 0.0  # Campaign revenue (BRL)
    total_despesas: float = 0.0  # Campaign expenses (BRL)
+
+
@dataclass
class Eleicao:
    """Election metadata from the DivulgaCand elections listing."""
    id: int  # DivulgaCand election id (used to build candidate lookup URLs)
    ano: int  # Election year
    descricao: str  # Human-readable election description
    turno: int = 1  # Election round (1 or 2)
+
+
async def listar_eleicoes() -> List[Eleicao]:
    """List available elections from DivulgaCand, newest year first.

    Returns an empty list on any HTTP error or exception (best-effort).
    """
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(f"{TSE_DIVULGACAND_URL}/eleicao/ordinarias")

        if response.status_code != 200:
            return []

        eleicoes = [
            Eleicao(
                id=entry.get("id", 0),
                ano=entry.get("ano", 0),
                descricao=entry.get("descricaoEleicao", ""),
                turno=entry.get("turno", 1)
            )
            for entry in response.json()
        ]
        eleicoes.sort(key=lambda e: e.ano, reverse=True)
        return eleicoes

    except Exception as e:
        print(f"TSE eleicoes error: {e}")
        return []
+
+
def _flatten_campo(value: Any, key: str) -> str:
    """Flatten a DivulgaCand field that may be a nested object or null.

    Returns value[key] when value is a dict, '' for None, str(value)
    otherwise. Fixes the original str(None) fallback that stored the literal
    string 'None', and unifies the previously inconsistent fallbacks
    (cargo used str(), partido/municipio used '').
    """
    if isinstance(value, dict):
        return value.get(key, "") or ""
    if value is None:
        return ""
    return str(value)


def _parse_candidato_resumo(item: Dict[str, Any]) -> Candidato:
    """Build a summary Candidato from one DivulgaCand listing entry."""
    cpf = item.get("cpf")
    return Candidato(
        id=item.get("id", 0),
        nome=item.get("nomeCompleto", ""),
        nome_urna=item.get("nomeUrna", ""),
        # TSE publishes the CPF only partially; keep just the first 3 digits
        cpf_parcial=cpf[:3] + ".***.***-**" if cpf else "",
        numero=str(item.get("numero", "")),
        cargo=_flatten_campo(item.get("cargo"), "nome"),
        partido_sigla=_flatten_campo(item.get("partido"), "sigla"),
        partido_nome=_flatten_campo(item.get("partido"), "nome"),
        uf=item.get("ufSigla", "") or item.get("uf", ""),
        municipio=_flatten_campo(item.get("municipio"), "nome"),
        situacao=item.get("situacao", ""),
        total_bens=float(item.get("totalDeBens", 0) or 0)
    )


async def buscar_candidatos(
    nome: str,
    ano: int = 2024,
    uf: Optional[str] = None,
    cargo: Optional[str] = None
) -> List[Candidato]:
    """
    Search for candidates by name.

    Args:
        nome: Candidate name to search
        ano: Election year (default 2024)
        uf: State filter (optional)
        cargo: Position filter (optional)

    Returns:
        Matching Candidato summaries; [] on HTTP error or exception.
    """
    try:
        # Resolve the DivulgaCand election id for the requested year
        eleicoes = await listar_eleicoes()
        eleicao = next((e for e in eleicoes if e.ano == ano), None)

        if not eleicao:
            # Known ids as fallback when the elections listing is unavailable
            eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546)
        else:
            eleicao_id = eleicao.id

        # Build search URL
        base_url = f"{TSE_DIVULGACAND_URL}/candidatura/listar/{ano}/{eleicao_id}"

        params = {"nomeCompleto": nome}
        if uf:
            params["uf"] = uf.upper()
        if cargo:
            params["cargo"] = cargo

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(base_url, params=params)

        if response.status_code != 200:
            return []

        data = response.json()
        return [_parse_candidato_resumo(item) for item in data.get("candidatos", [])]

    except Exception as e:
        # Best-effort: log and return "no results" rather than propagate
        print(f"TSE search error: {e}")
        return []
+
+
async def obter_candidato_detalhes(
    id_candidato: int,
    ano: int = 2024,
    eleicao_id: Optional[int] = None
) -> Optional[Candidato]:
    """Get detailed candidate information including assets.

    Args:
        id_candidato: DivulgaCand candidate id.
        ano: Election year.
        eleicao_id: DivulgaCand election id; when omitted, a known id for
            the year is used (defaults to 2024's id for unknown years).

    Returns:
        Populated Candidato, or None on HTTP error / exception.
    """
    try:
        if not eleicao_id:
            # Known DivulgaCand election ids per year
            eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546)

        async with httpx.AsyncClient(timeout=30.0) as client:
            # Get candidate details
            response = await client.get(
                f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}"
            )

            if response.status_code != 200:
                return None

            item = response.json()

            candidato = Candidato(
                id=item.get("id", 0),
                nome=item.get("nomeCompleto", ""),
                nome_urna=item.get("nomeUrna", ""),
                numero=str(item.get("numero", "")),
                cargo=item.get("cargo", {}).get("nome", "") if isinstance(item.get("cargo"), dict) else "",
                partido_sigla=item.get("partido", {}).get("sigla", "") if isinstance(item.get("partido"), dict) else "",
                partido_nome=item.get("partido", {}).get("nome", "") if isinstance(item.get("partido"), dict) else "",
                uf=item.get("ufSigla", ""),
                municipio=item.get("localCandidatura", ""),
                situacao=item.get("situacao", ""),
                data_nascimento=item.get("dataNascimento", ""),
                genero=item.get("genero", ""),
                grau_instrucao=item.get("grauInstrucao", ""),
                ocupacao=item.get("ocupacao", ""),
                total_bens=float(item.get("totalDeBens", 0) or 0)
            )

            # Assets live on a separate endpoint; best-effort — the main
            # record is still useful without them.
            try:
                bens_response = await client.get(
                    f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}/bens"
                )
                if bens_response.status_code == 200:
                    bens_data = bens_response.json()
                    # Guard: the API should return a list; skip otherwise
                    if isinstance(bens_data, list):
                        candidato.bens = [
                            {
                                "tipo": b.get("tipoBem", ""),
                                "descricao": b.get("descricao", ""),
                                "valor": float(b.get("valor", 0) or 0)
                            }
                            for b in bens_data
                        ]
            except Exception as e:
                # Was a bare `except:`, which in async code also swallows
                # asyncio.CancelledError and breaks task cancellation.
                print(f"TSE bens error: {e}")

        return candidato

    except Exception as e:
        print(f"TSE details error: {e}")
        return None
+
+
async def buscar_politico(nome: str) -> Dict[str, Any]:
    """
    Search for a politician across multiple elections.
    Returns consolidated information.
    """
    candidaturas: List[Dict[str, Any]] = []
    partidos: set = set()
    ufs: set = set()
    ultimo_cargo: Optional[str] = None
    maior_patrimonio = 0.0
    encontrado = False

    alvo = nome.lower()

    # Years are scanned newest-first, so the first cargo seen is the latest
    for ano in [2024, 2022, 2020, 2018]:
        try:
            candidatos = await buscar_candidatos(nome, ano=ano)
            print(f"TSE: Buscando '{nome}' em {ano} - encontrados: {len(candidatos)}")

            for c in candidatos:
                # Substring match against full name or ballot name
                if alvo not in c.nome.lower() and alvo not in c.nome_urna.lower():
                    continue

                encontrado = True
                candidaturas.append({
                    "ano": ano,
                    "cargo": c.cargo,
                    "partido": c.partido_sigla,
                    "uf": c.uf,
                    "situacao": c.situacao,
                    "patrimonio": c.total_bens
                })

                if c.partido_sigla:
                    partidos.add(c.partido_sigla)
                if c.uf:
                    ufs.add(c.uf)

                maior_patrimonio = max(maior_patrimonio, c.total_bens)

                if ultimo_cargo is None:
                    ultimo_cargo = f"{c.cargo} ({ano})"
        except Exception as e:
            print(f"TSE search {ano} error: {e}")
            continue

    # Sets are not JSON-serializable; expose plain lists
    resultado = {
        "nome": nome,
        "encontrado": encontrado,
        "candidaturas": candidaturas,
        "ultimo_cargo": ultimo_cargo,
        "total_patrimonio": maior_patrimonio,
        "partidos": list(partidos),
        "ufs": list(ufs)
    }

    print(f"TSE resultado para '{nome}': encontrado={resultado['encontrado']}, candidaturas={len(resultado['candidaturas'])}")

    return resultado
+
diff --git a/data/numidium.db b/data/numidium.db
new file mode 100644
index 0000000000000000000000000000000000000000..5293aaa509c3eea7032c820aaf1ed43707885ac6
Binary files /dev/null and b/data/numidium.db differ
diff --git a/requirements.txt b/requirements.txt
index d3d8138add29dc954de2005bd86bdec53dd629e8..8d6b074afd2336d205fa0443bc46feac92470d10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,28 +1,12 @@
-# --- SERVIDOR E API ---
-fastapi
-uvicorn[standard]
-python-multipart
-openai
-prometheus-fastapi-instrumentator
-prometheus-client
-tavily-python
-
-# --- MACHINE LEARNING E NLP ---
-sentence-transformers
-numpy
-pandas
-scikit-learn
-scipy
-umap-learn
-hdbscan
-faiss-cpu
-nltk
-spacy
-langdetect
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
-https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl
-
-# --- TORCH CPU (>=2.6 required for CVE-2025-32434) ---
-torch>=2.6.0
-torchvision
-torchaudio
\ No newline at end of file
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+sqlalchemy==2.0.23
+pydantic==2.5.2
+pydantic-settings==2.1.0
+requests==2.31.0
+beautifulsoup4==4.12.2
+httpx==0.25.2
+python-multipart==0.0.6
+aiohttp==3.9.1
+feedparser==6.0.10
+# httpx already included - used for Cerebras API calls