diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index 5ca62e91c6b6d2fd4d3a0d2f3169941e71d37af3..0000000000000000000000000000000000000000 --- a/app/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Numidium Backend App diff --git a/app/__pycache__/__init__.cpython-311.pyc b/app/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 5a44e729bba8a6e6cdf407034b3b1ec551cfb6fe..0000000000000000000000000000000000000000 Binary files a/app/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/__pycache__/config.cpython-311.pyc b/app/__pycache__/config.cpython-311.pyc deleted file mode 100644 index f6ba5b98d9de60400fecda19a96033ffd700d3a1..0000000000000000000000000000000000000000 Binary files a/app/__pycache__/config.cpython-311.pyc and /dev/null differ diff --git a/app/api/__init__.py b/app/api/__init__.py deleted file mode 100644 index ce0a2733c6eceaf10144429177e8f20db9604545..0000000000000000000000000000000000000000 --- a/app/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# API module diff --git a/app/api/__pycache__/__init__.cpython-311.pyc b/app/api/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index e59a223a6007cd27a3443d5ab5a26d31df7fb4ff..0000000000000000000000000000000000000000 Binary files a/app/api/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/api/__pycache__/deps.cpython-311.pyc b/app/api/__pycache__/deps.cpython-311.pyc deleted file mode 100644 index 07e15cf0e980065fc3e41e4e0eea81575dc514d5..0000000000000000000000000000000000000000 Binary files a/app/api/__pycache__/deps.cpython-311.pyc and /dev/null differ diff --git a/app/api/deps.py b/app/api/deps.py deleted file mode 100644 index bcea9d8c46a65a9857513605150ce15591631945..0000000000000000000000000000000000000000 --- a/app/api/deps.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -API dependencies. -""" -from typing import Generator, Optional - -from fastapi import Cookie, Header -from sqlalchemy.orm import Session - -from app.core.database import get_db_for_session, get_default_session - - -def get_session_id( - x_session_id: Optional[str] = Header(None), - numidium_session: Optional[str] = Cookie(None) -) -> Optional[str]: - """Return the session id from header or cookie.""" - return x_session_id or numidium_session - - -def get_scoped_db( - x_session_id: Optional[str] = Header(None), - numidium_session: Optional[str] = Cookie(None) -) -> Generator[Session, None, None]: - """ - Provide a session-scoped DB if available, otherwise the default DB. - """ - session_id = x_session_id or numidium_session - if session_id: - db = get_db_for_session(session_id) - else: - db = get_default_session() - try: - yield db - finally: - db.close() diff --git a/app/api/routes/__init__.py b/app/api/routes/__init__.py deleted file mode 100644 index e37c97a33d27ba2e879921f79996d8fdc3edbb73..0000000000000000000000000000000000000000 --- a/app/api/routes/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# API Routes module -from app.api.routes import entities, relationships, events, search, ingest diff --git a/app/api/routes/__pycache__/__init__.cpython-311.pyc b/app/api/routes/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index e48c20bb1f744a1d1037323ce205527266cb5c7c..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/entities.cpython-311.pyc b/app/api/routes/__pycache__/entities.cpython-311.pyc deleted file mode 100644 index ee99ab907a18b99a588dfe960d31a7e21c7e53d6..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/entities.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/events.cpython-311.pyc b/app/api/routes/__pycache__/events.cpython-311.pyc deleted file mode 100644 index 52f29ec16d10fc54bd6be7d6e32591d65d3acfcc..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/events.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/ingest.cpython-311.pyc b/app/api/routes/__pycache__/ingest.cpython-311.pyc deleted file mode 100644 index e524bafc4ce081ccccb32d94f2426c10b1e79b9a..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/ingest.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/investigate.cpython-311.pyc b/app/api/routes/__pycache__/investigate.cpython-311.pyc deleted file mode 100644 index 61c0e309052c422eb7d506d8623cfaed4ff4e01e..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/investigate.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/relationships.cpython-311.pyc b/app/api/routes/__pycache__/relationships.cpython-311.pyc deleted file mode 100644 index 73c88868d8b1ad76745a529fe05928d06408c415..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/relationships.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/__pycache__/search.cpython-311.pyc b/app/api/routes/__pycache__/search.cpython-311.pyc deleted file mode 100644 index 83951b1b069fe2d10b140852fbc85e7294cac015..0000000000000000000000000000000000000000 Binary files a/app/api/routes/__pycache__/search.cpython-311.pyc and /dev/null differ diff --git a/app/api/routes/aethermap.py b/app/api/routes/aethermap.py deleted file mode 100644 index bc0535153069d293dcdbe97be9565e0a17728e3e..0000000000000000000000000000000000000000 --- a/app/api/routes/aethermap.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -AetherMap Routes - Document Mapping & Semantic Search -Integrates with AetherMap API for document clustering, NER, and semantic search. -""" -from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends -from pydantic import BaseModel, Field -from typing import Optional, List, Dict, Any -from sqlalchemy.orm import Session -import io - -from app.api.deps import get_scoped_db -from app.services.aethermap_client import aethermap, ProcessResult, SearchResult, EntityGraphResult - - -router = APIRouter() - - -# ============================================================================ -# Request/Response Models -# ============================================================================ - -class IndexDocumentsRequest(BaseModel): - """Request to index documents from text list""" - documents: List[str] = Field(..., description="Lista de textos para indexar") - fast_mode: bool = Field(True, description="Modo rápido (PCA) ou preciso (UMAP)") - - -class IndexEntitiesRequest(BaseModel): - """Request to index entities from NUMIDIUM database""" - entity_types: Optional[List[str]] = Field(None, description="Filtrar por tipos de entidade") - limit: int = Field(500, description="Limite de entidades") - - -class SemanticSearchRequest(BaseModel): - """Request for semantic search""" - query: str = Field(..., description="Termo de busca") - turbo_mode: bool = Field(True, description="Modo turbo (mais rápido)") - - -class IndexResponse(BaseModel): - """Response from indexing""" - job_id: str - num_documents: int - num_clusters: int - num_noise: int - metrics: Dict[str, Any] = {} - cluster_analysis: Dict[str, Any] = {} - - -class SearchResponse(BaseModel): - """Response from search""" - summary: str - results: List[Dict[str, Any]] = [] - - -class EntityGraphResponse(BaseModel): - """Response from NER extraction""" - hubs: List[Dict[str, Any]] = [] - insights: Dict[str, Any] = {} - node_count: int = 0 - edge_count: int = 0 - - -class StatusResponse(BaseModel): - """AetherMap status""" - connected: bool - job_id: Optional[str] = None - documents_indexed: int = 0 - - -# ============================================================================ -# Endpoints -# ============================================================================ - -@router.get("/status", response_model=StatusResponse) -async def get_status(): - """ - Get AetherMap connection status. - """ - return StatusResponse( - connected=True, - job_id=aethermap.current_job_id, - documents_indexed=0 # TODO: track this - ) - - -@router.post("/index", response_model=IndexResponse) -async def index_documents(request: IndexDocumentsRequest): - """ - Index a list of documents for semantic search. - - The documents will be: - - Embedded using sentence transformers - - Clustered using HDBSCAN - - Indexed in FAISS + BM25 for hybrid search - """ - try: - if not request.documents: - raise HTTPException(status_code=400, detail="Nenhum documento fornecido") - - result = await aethermap.process_documents( - texts=request.documents, - fast_mode=request.fast_mode - ) - - return IndexResponse( - job_id=result.job_id, - num_documents=result.num_documents, - num_clusters=result.num_clusters, - num_noise=result.num_noise, - metrics=result.metrics, - cluster_analysis=result.cluster_analysis - ) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/index-entities", response_model=IndexResponse) -async def index_entities( - request: IndexEntitiesRequest, - db: Session = Depends(get_scoped_db) -): - """ - Index entities from NUMIDIUM database. - - Collects entity names and descriptions, sends to AetherMap for processing. - """ - from app.models.entity import Entity - - try: - query = db.query(Entity) - - if request.entity_types: - query = query.filter(Entity.type.in_(request.entity_types)) - - entities = query.limit(request.limit).all() - - if not entities: - raise HTTPException(status_code=404, detail="Nenhuma entidade encontrada") - - # Build text representations - documents = [] - for e in entities: - text = f"{e.name} ({e.type})" - if e.description: - text += f": {e.description[:1000]}" - documents.append(text) - - result = await aethermap.process_documents( - texts=documents, - fast_mode=request.fast_mode if hasattr(request, 'fast_mode') else True - ) - - return IndexResponse( - job_id=result.job_id, - num_documents=result.num_documents, - num_clusters=result.num_clusters, - num_noise=result.num_noise, - metrics=result.metrics, - cluster_analysis=result.cluster_analysis - ) - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/upload", response_model=IndexResponse) -async def upload_documents( - file: UploadFile = File(...), - fast_mode: bool = Form(True) -): - """ - Upload a file (TXT or CSV) for indexing. - - - TXT: One document per line - - CSV: Will use first text column found - """ - try: - content = await file.read() - text = content.decode('utf-8', errors='ignore') - - # Split by lines for TXT - documents = [line.strip() for line in text.splitlines() if line.strip()] - - if not documents: - raise HTTPException(status_code=400, detail="Arquivo vazio ou sem texto válido") - - result = await aethermap.process_documents( - texts=documents, - fast_mode=fast_mode - ) - - return IndexResponse( - job_id=result.job_id, - num_documents=result.num_documents, - num_clusters=result.num_clusters, - num_noise=result.num_noise, - metrics=result.metrics, - cluster_analysis=result.cluster_analysis - ) - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/search", response_model=SearchResponse) -async def semantic_search(request: SemanticSearchRequest): - """ - Semantic search in indexed documents. - - Uses hybrid RAG (FAISS + BM25 + reranking + LLM). - Returns a summary answering the query with citations. - """ - try: - if not aethermap.current_job_id: - raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") - - result = await aethermap.semantic_search( - query=request.query, - turbo_mode=request.turbo_mode - ) - - return SearchResponse( - summary=result.summary, - results=result.results - ) - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/entities", response_model=EntityGraphResponse) -async def extract_entities(): - """ - Extract named entities (NER) from indexed documents. - - Returns: - - Hub entities (most connected) - - Relationship insights - - Graph metrics - """ - try: - if not aethermap.current_job_id: - raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") - - result = await aethermap.extract_entities() - - return EntityGraphResponse( - hubs=result.hubs, - insights=result.insights, - node_count=len(result.nodes), - edge_count=len(result.edges) - ) - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/analyze") -async def analyze_graph(): - """ - Analyze entity graph using LLM. - - Returns semantic insights about relationships and patterns. - """ - try: - if not aethermap.current_job_id: - raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") - - result = await aethermap.analyze_graph() - - return { - "analysis": result.analysis, - "key_entities": result.key_entities, - "relationships": result.relationships - } - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/describe-clusters") -async def describe_clusters(): - """ - Get LLM descriptions for each cluster found. - """ - try: - if not aethermap.current_job_id: - raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") - - result = await aethermap.describe_clusters() - - return result - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/api/routes/analyze.py b/app/api/routes/analyze.py deleted file mode 100644 index 37b93947c0e0c9f2a5a626301007c1cf30b212d6..0000000000000000000000000000000000000000 --- a/app/api/routes/analyze.py +++ /dev/null @@ -1,309 +0,0 @@ -""" -Analyze API Routes - LLM-based text analysis -""" -from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel, Field -from typing import Optional, List -from sqlalchemy.orm import Session -import traceback - -from app.api.deps import get_scoped_db -from app.services.nlp import entity_extractor -from app.services.geocoding import geocode -from app.models.entity import Entity, Relationship, Event -from app.config import settings - - -router = APIRouter(prefix="/analyze", tags=["Analysis"]) - - -class AnalyzeRequest(BaseModel): - """Request model for text analysis""" - text: str = Field(..., min_length=10, description="Text to analyze") - auto_create: bool = Field(default=False, description="Auto-create extracted entities in database") - - -class ExtractedEntityResponse(BaseModel): - """Response model for an extracted entity""" - name: str - type: str - role: Optional[str] = None - aliases: Optional[List[str]] = None - description: Optional[str] = None - created: bool = False # Whether it was created in DB - entity_id: Optional[str] = None # DB ID if created - - -class ExtractedRelationshipResponse(BaseModel): - """Response model for an extracted relationship""" - source: str - target: str - relationship_type: str - context: Optional[str] = None - created: bool = False - - -class ExtractedEventResponse(BaseModel): - """Response model for an extracted event""" - description: str - event_type: Optional[str] = None - date: Optional[str] = None - location: Optional[str] = None - participants: Optional[List[str]] = None - created: bool = False - event_id: Optional[str] = None - - -class AnalyzeResponse(BaseModel): - """Response model for analysis""" - entities: List[ExtractedEntityResponse] - relationships: List[ExtractedRelationshipResponse] - events: List[ExtractedEventResponse] - stats: dict - - -@router.post("", response_model=AnalyzeResponse) -async def analyze_text(request: AnalyzeRequest, db: Session = Depends(get_scoped_db)): - """ - Analyze text using LLM to extract entities, relationships, and events. - - Uses Cerebras API with Qwen 3 235B for intelligent extraction. - - Args: - text: Text to analyze (min 10 characters) - auto_create: If true, automatically creates entities in the database - - Returns: - Extracted entities, relationships, events, and statistics - """ - try: - # Extract using LLM - result = await entity_extractor.extract(request.text) - - # Prepare response - entities_response = [] - relationships_response = [] - events_response = [] - - created_entities = 0 - created_relationships = 0 - created_events = 0 - - # Helper function to parse date strings - def parse_date(date_str): - if not date_str: - return None - from datetime import datetime - try: - # Try YYYY-MM-DD format - return datetime.strptime(date_str[:10], "%Y-%m-%d") - except: - try: - # Try YYYY format - return datetime.strptime(date_str[:4], "%Y") - except: - return None - - # Process entities - for entity in result.entities: - entity_data = ExtractedEntityResponse( - name=entity.name, - type=entity.type, - role=entity.role, - aliases=entity.aliases, - description=entity.description, - created=False - ) - - if request.auto_create and entity.name: - # Check if entity already exists - existing = db.query(Entity).filter( - Entity.name.ilike(f"%{entity.name}%") - ).first() - - if not existing: - # Get coordinates for location entities - lat, lng = None, None - if entity.type == "location": - coords = await geocode(entity.name) - if coords: - lat, lng = coords - - # Parse event_date if available - event_date = parse_date(getattr(entity, 'event_date', None)) - - # Create new entity - new_entity = Entity( - name=entity.name, - type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person", - description=entity.description or entity.role or "", - source="llm_extraction", - latitude=lat, - longitude=lng, - event_date=event_date, - properties={"role": entity.role, "aliases": entity.aliases} - ) - db.add(new_entity) - db.commit() - db.refresh(new_entity) - - entity_data.created = True - entity_data.entity_id = new_entity.id - created_entities += 1 - else: - entity_data.entity_id = existing.id - - entities_response.append(entity_data) - - # Process relationships - for rel in result.relationships: - rel_data = ExtractedRelationshipResponse( - source=rel.source, - target=rel.target, - relationship_type=rel.relationship_type, - context=rel.context, - created=False - ) - - if request.auto_create: - # Find source and target entities - source_entity = db.query(Entity).filter( - Entity.name.ilike(f"%{rel.source}%") - ).first() - target_entity = db.query(Entity).filter( - Entity.name.ilike(f"%{rel.target}%") - ).first() - - if source_entity and target_entity: - # Check if relationship exists - existing_rel = db.query(Relationship).filter( - Relationship.source_id == source_entity.id, - Relationship.target_id == target_entity.id, - Relationship.type == rel.relationship_type - ).first() - - if not existing_rel: - # Parse event_date if available - rel_event_date = parse_date(getattr(rel, 'event_date', None)) - - new_rel = Relationship( - source_id=source_entity.id, - target_id=target_entity.id, - type=rel.relationship_type, - event_date=rel_event_date, - properties={"context": rel.context} - ) - db.add(new_rel) - db.commit() - rel_data.created = True - created_relationships += 1 - - relationships_response.append(rel_data) - - # Process events - for event in result.events: - event_data = ExtractedEventResponse( - description=event.description, - event_type=event.event_type, - date=event.date, - location=event.location, - participants=event.participants, - created=False - ) - - if request.auto_create and event.description: - # Create event - new_event = Event( - title=event.description[:100] if len(event.description) > 100 else event.description, - description=event.description, - type=event.event_type or "general", - source="llm_extraction" - ) - db.add(new_event) - db.commit() - db.refresh(new_event) - - event_data.created = True - event_data.event_id = new_event.id - created_events += 1 - - events_response.append(event_data) - - return AnalyzeResponse( - entities=entities_response, - relationships=relationships_response, - events=events_response, - stats={ - "total_entities": len(entities_response), - "total_relationships": len(relationships_response), - "total_events": len(events_response), - "created_entities": created_entities, - "created_relationships": created_relationships, - "created_events": created_events - } - ) - - except Exception as e: - # Log the full error with traceback - print(f"=== ANALYZE ERROR ===") - print(f"Error type: {type(e).__name__}") - print(f"Error message: {str(e)}") - print(f"Traceback:") - traceback.print_exc() - print(f"=== END ERROR ===") - raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") - - -@router.get("/debug") -async def debug_config(): - """ - Debug endpoint to check if API is configured correctly. - """ - api_key = settings.cerebras_api_key - return { - "cerebras_api_key_configured": bool(api_key), - "cerebras_api_key_length": len(api_key) if api_key else 0, - "cerebras_api_key_preview": f"{api_key[:8]}...{api_key[-4:]}" if api_key and len(api_key) > 12 else "NOT SET" - } - - -@router.post("/quick") -async def quick_analyze(request: AnalyzeRequest): - """ - Quick analysis without database operations. - Returns only extracted data without creating anything. - """ - try: - result = await entity_extractor.extract(request.text) - - return { - "entities": [ - { - "name": e.name, - "type": e.type, - "role": e.role, - "aliases": e.aliases - } - for e in result.entities - ], - "relationships": [ - { - "source": r.source, - "target": r.target, - "type": r.relationship_type, - "context": r.context - } - for r in result.relationships - ], - "events": [ - { - "description": ev.description, - "type": ev.event_type, - "date": ev.date, - "participants": ev.participants - } - for ev in result.events - ] - } - except Exception as e: - raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") diff --git a/app/api/routes/chat.py b/app/api/routes/chat.py deleted file mode 100644 index f75b133bf9956e67eb7b1b86312192d7fa093c46..0000000000000000000000000000000000000000 --- a/app/api/routes/chat.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Chat API Routes - Intelligent chat with RAG -""" -from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel, Field -from typing import Optional -from sqlalchemy.orm import Session - -from app.api.deps import get_scoped_db, get_session_id -from app.services.chat import chat_service - - -router = APIRouter(prefix="/chat", tags=["Chat"]) - - -class ChatRequest(BaseModel): - """Chat request model""" - message: str = Field(..., min_length=1, description="User message") - use_web: bool = Field(default=True, description="Include web search") - use_history: bool = Field(default=True, description="Use conversation history") - - -class ChatResponse(BaseModel): - """Chat response model""" - answer: str - local_context_used: bool - web_context_used: bool - entities_found: int - - -@router.post("", response_model=ChatResponse) -async def chat( - request: ChatRequest, - db: Session = Depends(get_scoped_db), - session_id: Optional[str] = Depends(get_session_id) -): - """ - Send a message and get an intelligent response. - - Uses: - - Local NUMIDIUM knowledge (entities/relationships) - - Lancer web search (if enabled) - - Cerebras LLM for synthesis - """ - try: - result = await chat_service.chat( - message=request.message, - db=db, - use_web=request.use_web, - use_history=request.use_history, - session_id=session_id - ) - return ChatResponse(**result) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/clear") -async def clear_history(session_id: Optional[str] = Depends(get_session_id)): - """Clear conversation history""" - chat_service.clear_history(session_id=session_id) - return {"message": "Historico limpo"} diff --git a/app/api/routes/dados_publicos.py b/app/api/routes/dados_publicos.py deleted file mode 100644 index 842e82d2d17f48687b92bb8012105eab495a8051..0000000000000000000000000000000000000000 --- a/app/api/routes/dados_publicos.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Public Data API Routes - IBGE and TSE data access -""" -from fastapi import APIRouter, HTTPException, Query -from pydantic import BaseModel, Field -from typing import Optional, List, Dict, Any - -from app.services.ibge_api import ( - listar_estados, - listar_municipios, - buscar_municipio, - enriquecer_localizacao -) -from app.services.tse_api import ( - listar_eleicoes, - buscar_candidatos, - obter_candidato_detalhes, - buscar_politico -) - - -router = APIRouter(prefix="/dados", tags=["Public Data"]) - - -# ========== IBGE Endpoints ========== - -class EstadoResponse(BaseModel): - id: int - sigla: str - nome: str - regiao: str - - -class MunicipioResponse(BaseModel): - id: int - nome: str - estado_sigla: str - estado_nome: str - regiao: str - - -@router.get("/ibge/estados", response_model=List[EstadoResponse]) -async def get_estados(): - """List all Brazilian states""" - estados = await listar_estados() - return [EstadoResponse(**e.__dict__) for e in estados] - - -@router.get("/ibge/municipios/{uf}", response_model=List[MunicipioResponse]) -async def get_municipios(uf: str): - """List municipalities in a state""" - municipios = await listar_municipios(uf) - return [MunicipioResponse(**m.__dict__) for m in municipios] - - -@router.get("/ibge/buscar") -async def buscar_cidade( - nome: str = Query(..., min_length=2), - uf: Optional[str] = None -): - """Search for a municipality by name""" - municipios = await buscar_municipio(nome, uf) - return [MunicipioResponse(**m.__dict__) for m in municipios] - - -@router.get("/ibge/enriquecer") -async def enriquecer_cidade( - cidade: str = Query(..., min_length=2), - uf: Optional[str] = None -): - """Enrich a location name with IBGE data""" - return await enriquecer_localizacao(cidade, uf) - - -# ========== TSE Endpoints ========== - -class EleicaoResponse(BaseModel): - id: int - ano: int - descricao: str - turno: int - - -class CandidatoResponse(BaseModel): - id: int - nome: str - nome_urna: str - numero: str - cargo: str - partido_sigla: str - uf: str - municipio: str - situacao: str - total_bens: float - - -class CandidatoDetalhadoResponse(BaseModel): - id: int - nome: str - nome_urna: str - numero: str - cargo: str - partido_sigla: str - partido_nome: str - uf: str - municipio: str - situacao: str - data_nascimento: str - genero: str - grau_instrucao: str - ocupacao: str - total_bens: float - bens: List[Dict[str, Any]] - - -@router.get("/tse/eleicoes", response_model=List[EleicaoResponse]) -async def get_eleicoes(): - """List available elections""" - eleicoes = await listar_eleicoes() - return [EleicaoResponse(**e.__dict__) for e in eleicoes] - - -@router.get("/tse/candidatos") -async def get_candidatos( - nome: str = Query(..., min_length=3), - ano: int = Query(default=2024), - uf: Optional[str] = None, - cargo: Optional[str] = None -): - """Search for candidates by name""" - candidatos = await buscar_candidatos(nome, ano=ano, uf=uf, cargo=cargo) - return [CandidatoResponse(**c.__dict__) for c in candidatos] - - -@router.get("/tse/candidato/{id_candidato}") -async def get_candidato_detalhes( - id_candidato: int, - ano: int = Query(default=2024) -): - """Get detailed candidate information including assets""" - candidato = await obter_candidato_detalhes(id_candidato, ano=ano) - - if not candidato: - raise HTTPException(status_code=404, detail="Candidato não encontrado") - - return CandidatoDetalhadoResponse(**candidato.__dict__) - - -@router.get("/tse/politico") -async def pesquisar_politico(nome: str = Query(..., min_length=3)): - """ - Search for a politician across multiple elections. - Returns consolidated career information. - """ - return await buscar_politico(nome) diff --git a/app/api/routes/entities.py b/app/api/routes/entities.py deleted file mode 100644 index 2727179e0e20a58a8a5893f1821a15c24df3013f..0000000000000000000000000000000000000000 --- a/app/api/routes/entities.py +++ /dev/null @@ -1,353 +0,0 @@ -""" -Entity CRUD Routes -""" -from fastapi import APIRouter, Depends, HTTPException, Query -from sqlalchemy.orm import Session -from sqlalchemy import or_ -from typing import List, Optional - -from app.api.deps import get_scoped_db -from app.models import Entity, Relationship -from app.schemas import EntityCreate, EntityUpdate, EntityResponse, GraphData, GraphNode, GraphEdge - -router = APIRouter(prefix="/entities", tags=["Entities"]) - - -@router.get("", response_model=List[EntityResponse]) -def list_entities( - type: Optional[str] = None, - search: Optional[str] = None, - project_id: Optional[str] = None, - limit: int = Query(default=50, le=200), - offset: int = 0, - db: Session = Depends(get_scoped_db) -): - """Lista todas as entidades com filtros opcionais""" - query = db.query(Entity) - - if project_id: - query = query.filter(Entity.project_id == project_id) - - if type: - query = query.filter(Entity.type == type) - - if search: - query = query.filter( - or_( - Entity.name.ilike(f"%{search}%"), - Entity.description.ilike(f"%{search}%") - ) - ) - - query = query.order_by(Entity.created_at.desc()) - return query.offset(offset).limit(limit).all() - - -@router.get("/types") -def get_entity_types(db: Session = Depends(get_scoped_db)): - """Retorna todos os tipos de entidade únicos""" - types = db.query(Entity.type).distinct().all() - return [t[0] for t in types] - - -@router.get("/suggest-merge") -async def suggest_merge_candidates( - limit: int = Query(default=10, le=50), - db: Session = Depends(get_scoped_db) -): - """ - Use LLM to find potential duplicate entities that could be merged. - Returns pairs of entities that might be the same. - """ - import httpx - import json - import re - from app.config import settings - - # Get all entities - entities = db.query(Entity).order_by(Entity.name).limit(200).all() - - if len(entities) < 2: - return {"candidates": [], "message": "Not enough entities to compare"} - - # Build entity list for LLM - entity_list = [] - for e in entities: - aliases = (e.properties or {}).get("aliases", []) - entity_list.append({ - "id": e.id, - "name": e.name, - "type": e.type, - "aliases": aliases[:5] if aliases else [] - }) - - # Ask LLM to find duplicates - prompt = f"""Analise esta lista de entidades e encontre possíveis DUPLICATAS (mesma pessoa/organização/local com nomes diferentes). - -Entidades: -{entity_list[:100]} - -Retorne APENAS um JSON válido com pares de IDs que são provavelmente a mesma entidade: -```json -{{ - "duplicates": [ - {{ - "id1": "uuid1", - "id2": "uuid2", - "confidence": 0.95, - "reason": "Mesmo nome com variação" - }} - ] -}} -``` - -Se não houver duplicatas, retorne: {{"duplicates": []}} -""" - - try: - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.post( - "https://api.cerebras.ai/v1/chat/completions", - headers={ - "Authorization": f"Bearer {settings.cerebras_api_key}", - "Content-Type": "application/json" - }, - json={ - "model": "zai-glm-4.7", - "messages": [ - {"role": "system", "content": "Você é um especialista em detecção de entidades duplicadas. Responda apenas em JSON válido."}, - {"role": "user", "content": prompt} - ], - "temperature": 0.1, - "max_tokens": 1024 - } - ) - - if response.status_code != 200: - return {"candidates": [], "error": "LLM API error"} - - data = response.json() - content = data["choices"][0]["message"]["content"] - - # Parse JSON from response - json_match = re.search(r'\{.*\}', content, re.DOTALL) - if json_match: - result = json.loads(json_match.group(0)) - - # Enrich with entity names - candidates = [] - for dup in result.get("duplicates", [])[:limit]: - e1 = next((e for e in entities if e.id == dup.get("id1")), None) - e2 = next((e for e in entities if e.id == dup.get("id2")), None) - if e1 and e2: - candidates.append({ - "entity1": {"id": e1.id, "name": e1.name, "type": e1.type}, - "entity2": {"id": e2.id, "name": e2.name, "type": e2.type}, - "confidence": dup.get("confidence", 0.5), - "reason": dup.get("reason", "Possível duplicata") - }) - - return {"candidates": candidates} - - return {"candidates": [], "message": "No duplicates found"} - - except Exception as e: - return {"candidates": [], "error": str(e)} - - -@router.get("/{entity_id}", response_model=EntityResponse) -def get_entity(entity_id: str, db: Session = Depends(get_scoped_db)): - """Busca uma entidade por ID""" - entity = db.query(Entity).filter(Entity.id == entity_id).first() - if not entity: - raise HTTPException(status_code=404, detail="Entity not found") - return entity - - -@router.post("", response_model=EntityResponse, status_code=201) -def create_entity(entity: EntityCreate, db: Session = Depends(get_scoped_db)): - """Cria uma nova entidade""" - db_entity = Entity(**entity.model_dump()) - db.add(db_entity) - db.commit() - db.refresh(db_entity) - return db_entity - - -@router.put("/{entity_id}", response_model=EntityResponse) -def update_entity(entity_id: str, entity: EntityUpdate, db: Session = Depends(get_scoped_db)): - """Atualiza uma entidade existente""" - db_entity = db.query(Entity).filter(Entity.id == entity_id).first() - if not db_entity: - raise HTTPException(status_code=404, detail="Entity not found") - - update_data = entity.model_dump(exclude_unset=True) - for field, value in update_data.items(): - setattr(db_entity, field, value) - - db.commit() - db.refresh(db_entity) - return db_entity - - -@router.delete("/{entity_id}") -def delete_entity(entity_id: str, db: Session = Depends(get_scoped_db)): - """Deleta uma entidade""" - db_entity = db.query(Entity).filter(Entity.id == entity_id).first() - if not db_entity: - raise HTTPException(status_code=404, detail="Entity not found") - - # Delete related relationships - db.query(Relationship).filter( - or_( - Relationship.source_id == entity_id, - Relationship.target_id == entity_id - ) - ).delete() - - db.delete(db_entity) - db.commit() - return {"message": "Entity deleted"} - - -@router.get("/{entity_id}/connections", response_model=GraphData) -def get_entity_connections( - entity_id: str, - depth: int = Query(default=1, le=3), - db: Session = Depends(get_scoped_db) -): - """ - Retorna o grafo de conexões de uma entidade - Usado para visualização de rede no frontend - """ - entity = db.query(Entity).filter(Entity.id == entity_id).first() - if not entity: - raise HTTPException(status_code=404, detail="Entity not found") - - nodes = {} - edges = [] - visited = set() - - def explore(eid: str, current_depth: int): - if current_depth > depth or eid in visited: - return - visited.add(eid) - - e = db.query(Entity).filter(Entity.id == eid).first() - if not e: - return - - nodes[e.id] = GraphNode( - id=e.id, - type=e.type, - name=e.name, - properties=e.properties or {} - ) - - # Outgoing relationships - for rel in db.query(Relationship).filter(Relationship.source_id == eid).all(): - edges.append(GraphEdge( - source=rel.source_id, - target=rel.target_id, - type=rel.type, - confidence=rel.confidence - )) - explore(rel.target_id, current_depth + 1) - - # Incoming relationships - for rel in db.query(Relationship).filter(Relationship.target_id == eid).all(): - edges.append(GraphEdge( - source=rel.source_id, - target=rel.target_id, - type=rel.type, - confidence=rel.confidence - )) - explore(rel.source_id, current_depth + 1) - - explore(entity_id, 0) - - return GraphData( - nodes=list(nodes.values()), - edges=edges - ) - - -@router.post("/merge") -def merge_entities( - primary_id: str, - secondary_id: str, - db: Session = Depends(get_scoped_db) -): - """ - Merge two entities into one. - The primary entity is kept, the secondary is deleted. - All relationships from secondary are transferred to primary. - """ - if primary_id == secondary_id: - raise HTTPException(status_code=400, detail="Cannot merge entity with itself") - - primary = db.query(Entity).filter(Entity.id == primary_id).first() - secondary = db.query(Entity).filter(Entity.id == secondary_id).first() - - if not primary: - raise HTTPException(status_code=404, detail="Primary entity not found") - if not secondary: - raise HTTPException(status_code=404, detail="Secondary entity not found") - - # Merge properties - primary_props = primary.properties or {} - secondary_props = secondary.properties or {} - - # Add aliases from secondary - aliases = primary_props.get("aliases", []) or [] - if secondary.name not in aliases: - aliases.append(secondary.name) - secondary_aliases = secondary_props.get("aliases", []) or [] - for alias in secondary_aliases: - if alias not in aliases: - aliases.append(alias) - primary_props["aliases"] = aliases - - # Add merge history - merge_history = primary_props.get("merged_from", []) or [] - merge_history.append({ - "id": secondary.id, - "name": secondary.name, - "source": secondary.source - }) - primary_props["merged_from"] = merge_history - - # Combine descriptions if primary has none - if not primary.description and secondary.description: - primary.description = secondary.description - - primary.properties = primary_props - - # Transfer relationships from secondary to primary - # Update source_id - db.query(Relationship).filter( - Relationship.source_id == secondary_id - ).update({"source_id": primary_id}) - - # Update target_id - db.query(Relationship).filter( - Relationship.target_id == secondary_id - ).update({"target_id": primary_id}) - - # Delete duplicate relationships (same source, target, type) - # This is a simple approach - in production you'd want more sophisticated deduplication - - # Delete the secondary entity - db.delete(secondary) - db.commit() - db.refresh(primary) - - return { - "message": f"Merged '{secondary.name}' into '{primary.name}'", - "primary": { - "id": primary.id, - "name": primary.name, - "aliases": aliases - } - } - diff --git a/app/api/routes/events.py b/app/api/routes/events.py deleted file mode 100644 index 19a16292e599f2a33bffe593cf788d69be9f28dd..0000000000000000000000000000000000000000 --- a/app/api/routes/events.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Events CRUD Routes -""" -from fastapi import APIRouter, Depends, HTTPException, Query -from sqlalchemy.orm import Session -from sqlalchemy import or_ -from typing import List, Optional -from datetime import datetime - -from app.api.deps import get_scoped_db -from app.models import Event -from app.schemas import EventCreate, EventResponse - -router = APIRouter(prefix="/events", tags=["Events"]) - - -@router.get("/", response_model=List[EventResponse]) -def list_events( - type: Optional[str] = None, - search: Optional[str] = None, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, - limit: int = Query(default=50, le=200), - offset: int = 0, - db: Session = Depends(get_scoped_db) -): - """Lista eventos com filtros opcionais""" - query = db.query(Event) - - if type: - query = query.filter(Event.type == type) - - if search: - query = query.filter( - or_( - Event.title.ilike(f"%{search}%"), - Event.description.ilike(f"%{search}%") - ) - ) - - if start_date: - query = query.filter(Event.event_date >= start_date) - if end_date: - query = query.filter(Event.event_date <= end_date) - - query = query.order_by(Event.event_date.desc().nullslast()) - return query.offset(offset).limit(limit).all() - - -@router.get("/types") -def get_event_types(db: Session = Depends(get_scoped_db)): - """Retorna todos os tipos de evento unicos""" - types = db.query(Event.type).distinct().all() - return [t[0] for t in types] - - -@router.get("/timeline") -def get_timeline( - entity_id: Optional[str] = None, - limit: int = Query(default=50, le=200), - db: Session = Depends(get_scoped_db) -): - """ - Retorna eventos em formato timeline. - """ - query = db.query(Event).filter(Event.event_date.isnot(None)) - - if entity_id: - query = query.filter(Event.entity_ids.contains([entity_id])) - - events = query.order_by(Event.event_date.asc()).limit(limit).all() - - return [ - { - "id": e.id, - "title": e.title, - "date": e.event_date.isoformat() if e.event_date else None, - "type": e.type, - "location": e.location_name - } - for e in events - ] - - -@router.get("/{event_id}", response_model=EventResponse) -def get_event(event_id: str, db: Session = Depends(get_scoped_db)): - """Busca um evento por ID""" - event = db.query(Event).filter(Event.id == event_id).first() - if not event: - raise HTTPException(status_code=404, detail="Event not found") - return event - - -@router.post("/", response_model=EventResponse, status_code=201) -def create_event(event: EventCreate, db: Session = Depends(get_scoped_db)): - """Cria um novo evento""" - db_event = Event(**event.model_dump()) - db.add(db_event) - db.commit() - db.refresh(db_event) - return db_event - - -@router.delete("/{event_id}") -def delete_event(event_id: str, db: Session = Depends(get_scoped_db)): - """Deleta um evento""" - db_event = db.query(Event).filter(Event.id == event_id).first() - if not db_event: - raise HTTPException(status_code=404, detail="Event not found") - - db.delete(db_event) - db.commit() - return {"message": "Event deleted"} diff --git a/app/api/routes/graph.py b/app/api/routes/graph.py deleted file mode 100644 index 66a0886d6fb53b0884ef1e803f2acefd29f92873..0000000000000000000000000000000000000000 --- a/app/api/routes/graph.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Graph API Routes - Network visualization endpoints -""" -from fastapi import APIRouter, Depends, HTTPException, Query -from typing import Optional, List -from sqlalchemy.orm import Session -from sqlalchemy import or_ - -from app.api.deps import get_scoped_db -from app.models.entity import Entity, Relationship - - -router = APIRouter(prefix="/graph", tags=["Graph"]) - - -@router.get("") -async def get_graph( - entity_type: Optional[str] = Query(None, description="Filter by entity type"), - limit: int = Query(100, le=500, description="Maximum number of entities"), - db: Session = Depends(get_scoped_db) -): - """ - Get graph data for visualization. - Returns nodes (entities) and edges (relationships). - """ - try: - # Get entities - query = db.query(Entity) - if entity_type: - query = query.filter(Entity.type == entity_type) - - entities = query.limit(limit).all() - entity_ids = [e.id for e in entities] - - # Get relationships between these entities - relationships = db.query(Relationship).filter( - or_( - Relationship.source_id.in_(entity_ids), - Relationship.target_id.in_(entity_ids) - ) - ).all() - - # Format for Cytoscape.js - nodes = [] - for e in entities: - nodes.append({ - "data": { - "id": e.id, - "label": e.name[:30] + "..." if len(e.name) > 30 else e.name, - "fullName": e.name, - "type": e.type, - "description": e.description[:100] if e.description else "", - "source": e.source or "unknown" - } - }) - - edges = [] - for r in relationships: - if r.source_id in entity_ids and r.target_id in entity_ids: - edges.append({ - "data": { - "id": r.id, - "source": r.source_id, - "target": r.target_id, - "label": r.type, - "type": r.type - } - }) - - return { - "nodes": nodes, - "edges": edges, - "stats": { - "total_nodes": len(nodes), - "total_edges": len(edges) - } - } - - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to get graph: {str(e)}") - - -@router.get("/entity/{entity_id}") -async def get_entity_graph( - entity_id: str, - depth: int = Query(1, ge=1, le=3, description="How many levels of connections to include"), - db: Session = Depends(get_scoped_db) -): - """ - Get graph centered on a specific entity. - """ - try: - # Get the central entity - central = db.query(Entity).filter(Entity.id == entity_id).first() - if not central: - raise HTTPException(status_code=404, detail="Entity not found") - - # Collect entity IDs at each depth level - collected_ids = {entity_id} - current_level = {entity_id} - - for _ in range(depth): - rels = db.query(Relationship).filter( - or_( - Relationship.source_id.in_(current_level), - Relationship.target_id.in_(current_level) - ) - ).all() - - next_level = set() - for r in rels: - next_level.add(r.source_id) - next_level.add(r.target_id) - - current_level = next_level - collected_ids - collected_ids.update(next_level) - - # Get all entities - entities = db.query(Entity).filter(Entity.id.in_(collected_ids)).all() - - # Get all relationships between collected entities - relationships = db.query(Relationship).filter( - Relationship.source_id.in_(collected_ids), - Relationship.target_id.in_(collected_ids) - ).all() - - # Format for Cytoscape - nodes = [] - for e in entities: - nodes.append({ - "data": { - "id": e.id, - "label": e.name[:30] + "..." if len(e.name) > 30 else e.name, - "fullName": e.name, - "type": e.type, - "description": e.description[:100] if e.description else "", - "source": e.source or "unknown", - "isCentral": e.id == entity_id - } - }) - - edges = [] - for r in relationships: - edges.append({ - "data": { - "id": r.id, - "source": r.source_id, - "target": r.target_id, - "label": r.type, - "type": r.type - } - }) - - return { - "central": { - "id": central.id, - "name": central.name, - "type": central.type - }, - "nodes": nodes, - "edges": edges, - "stats": { - "total_nodes": len(nodes), - "total_edges": len(edges), - "depth": depth - } - } - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to get entity graph: {str(e)}") - diff --git a/app/api/routes/ingest.py b/app/api/routes/ingest.py deleted file mode 100644 index d2216481b8ad615180d4b4bfad5d7f24e453774a..0000000000000000000000000000000000000000 --- a/app/api/routes/ingest.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -Data Ingestion Routes -Endpoints para importar dados de fontes externas -""" -from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks -from sqlalchemy.orm import Session -from typing import Optional, List -from datetime import datetime -import asyncio - -from app.api.deps import get_scoped_db -from app.models import Entity, Document, Relationship -from app.schemas import EntityResponse, DocumentResponse -from app.services.ingestion import wikipedia_scraper, news_service -from app.services.nlp import entity_extractor -from app.services.geocoding import geocode - -router = APIRouter(prefix="/ingest", tags=["Data Ingestion"]) - - -def parse_event_date(date_str): - """Parse date string to datetime object""" - if not date_str: - return None - try: - # Try YYYY-MM-DD format - return datetime.strptime(date_str[:10], "%Y-%m-%d") - except: - try: - # Try YYYY format - return datetime.strptime(date_str[:4], "%Y") - except: - return None - - -# ========== Wikipedia ========== - -@router.get("/wikipedia/search") -def search_wikipedia(q: str, limit: int = 10): - """Busca artigos na Wikipedia""" - results = wikipedia_scraper.search(q, limit) - return results - - -@router.post("/wikipedia/entity", response_model=EntityResponse) -async def import_from_wikipedia( - title: str, - entity_type: str = "person", - project_id: Optional[str] = None, - auto_extract: bool = True, - db: Session = Depends(get_scoped_db) -): - """ - Importa uma entidade da Wikipedia - entity_type: person, organization, location - project_id: ID do projeto para associar a entidade - auto_extract: Se True, usa LLM para extrair entidades relacionadas - """ - # Check if entity already exists - existing = db.query(Entity).filter( - Entity.name == title, - Entity.source == "wikipedia" - ).first() - - if existing: - return existing - - # Scrape based on type - if entity_type == "person": - data = wikipedia_scraper.scrape_person(title) - elif entity_type == "organization": - data = wikipedia_scraper.scrape_organization(title) - elif entity_type == "location": - data = wikipedia_scraper.scrape_location(title) - else: - data = wikipedia_scraper.scrape_person(title) # default - - if not data: - raise HTTPException(status_code=404, detail="Article not found on Wikipedia") - - # Create main entity with project_id - entity = Entity(**data) - entity.project_id = project_id - db.add(entity) - db.commit() - db.refresh(entity) - - # Auto-extract entities and relationships using LLM - if auto_extract and data.get("description"): - try: - # Limit text to avoid token limits - text_to_analyze = data["description"][:3000] - result = await entity_extractor.extract(text_to_analyze) - - # Create extracted entities - created_entities = {} - for ext_entity in result.entities: - # Skip if same as main entity - if ext_entity.name.lower() == title.lower(): - created_entities[ext_entity.name] = entity - continue - - # Check if entity exists (by similar name) - existing_ent = db.query(Entity).filter( - Entity.name.ilike(f"%{ext_entity.name}%") - ).first() - - if existing_ent: - created_entities[ext_entity.name] = existing_ent - else: - # Get coordinates for location entities - lat, lng = None, None - if ext_entity.type == "location": - coords = await geocode(ext_entity.name) - if coords: - lat, lng = coords - - # Parse event_date - event_date = parse_event_date(getattr(ext_entity, 'event_date', None)) - - new_ent = Entity( - name=ext_entity.name, - type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person", - description=ext_entity.description or ext_entity.role, - source="wikipedia_extraction", - latitude=lat, - longitude=lng, - event_date=event_date, - project_id=project_id, - properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "extracted_from": title} - ) - db.add(new_ent) - db.commit() - db.refresh(new_ent) - created_entities[ext_entity.name] = new_ent - - # Create relationships - for rel in result.relationships: - source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first() - target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first() - - if source_ent and target_ent and source_ent.id != target_ent.id: - # Check if relationship exists - existing_rel = db.query(Relationship).filter( - Relationship.source_id == source_ent.id, - Relationship.target_id == target_ent.id, - Relationship.type == rel.relationship_type - ).first() - - if not existing_rel: - # Parse relationship event_date - rel_event_date = parse_event_date(getattr(rel, 'event_date', None)) - - new_rel = Relationship( - source_id=source_ent.id, - target_id=target_ent.id, - type=rel.relationship_type, - event_date=rel_event_date, - properties={"context": rel.context, "extracted_from": title} - ) - db.add(new_rel) - - db.commit() - - except Exception as e: - print(f"NER extraction error: {e}") - # Continue without extraction if it fails - - return entity - - -# ========== News ========== - -@router.get("/news/feeds") -def list_available_feeds(): - """Lista os feeds de notícias disponíveis""" - return list(news_service.RSS_FEEDS.keys()) - - -@router.get("/news/fetch") -def fetch_news(feed: Optional[str] = None): - """ - Busca notícias dos feeds RSS - Se feed não for especificado, busca de todos - """ - if feed: - if feed not in news_service.RSS_FEEDS: - raise HTTPException(status_code=404, detail="Feed not found") - url = news_service.RSS_FEEDS[feed] - articles = news_service.fetch_feed(url) - else: - articles = news_service.fetch_all_feeds() - - return articles - - -@router.get("/news/search") -def search_news(q: str): - """Busca notícias por palavra-chave via Google News""" - return news_service.search_news(q) - - -@router.post("/news/import") -async def import_news( - query: Optional[str] = None, - feed: Optional[str] = None, - auto_extract: bool = True, - db: Session = Depends(get_scoped_db) -): - """ - Importa notícias como documentos no sistema - auto_extract: Se True, usa LLM para extrair entidades de cada notícia - """ - if query: - articles = news_service.search_news(query) - elif feed: - if feed not in news_service.RSS_FEEDS: - raise HTTPException(status_code=404, detail="Feed not found") - articles = news_service.fetch_feed(news_service.RSS_FEEDS[feed]) - else: - articles = news_service.fetch_all_feeds() - - imported = 0 - extracted_entities = 0 - - for article in articles: - # Check if document already exists (by URL) - if article.get("url"): - existing = db.query(Document).filter( - Document.source_url == article["url"] - ).first() - if existing: - continue - - doc_data = news_service.to_document(article) - doc = Document(**doc_data) - db.add(doc) - db.commit() - imported += 1 - - # Extract entities from article content - if auto_extract: - try: - text_to_analyze = f"{article.get('title', '')} {article.get('description', '')}".strip() - if len(text_to_analyze) >= 20: - result = await entity_extractor.extract(text_to_analyze[:2000]) - - created_entities = {} - for ext_entity in result.entities: - # Check if entity exists - existing_ent = db.query(Entity).filter( - Entity.name.ilike(f"%{ext_entity.name}%") - ).first() - - if existing_ent: - created_entities[ext_entity.name] = existing_ent - else: - # Get coordinates for location entities - lat, lng = None, None - if ext_entity.type == "location": - coords = await geocode(ext_entity.name) - if coords: - lat, lng = coords - - new_ent = Entity( - name=ext_entity.name, - type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person", - description=ext_entity.description or ext_entity.role, - source="news_extraction", - latitude=lat, - longitude=lng, - properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "from_article": article.get('title', '')} - ) - db.add(new_ent) - db.commit() - db.refresh(new_ent) - created_entities[ext_entity.name] = new_ent - extracted_entities += 1 - - # Create relationships - for rel in result.relationships: - source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first() - target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first() - - if source_ent and target_ent and source_ent.id != target_ent.id: - existing_rel = db.query(Relationship).filter( - Relationship.source_id == source_ent.id, - Relationship.target_id == target_ent.id, - Relationship.type == rel.relationship_type - ).first() - - if not existing_rel: - new_rel = Relationship( - source_id=source_ent.id, - target_id=target_ent.id, - type=rel.relationship_type, - properties={"context": rel.context} - ) - db.add(new_rel) - - db.commit() - - except Exception as e: - print(f"NER extraction error for article: {e}") - # Continue without extraction - - return { - "message": f"Imported {imported} articles", - "total_found": len(articles), - "extracted_entities": extracted_entities - } - - -# ========== Manual Import ========== - -@router.post("/bulk/entities") -def bulk_import_entities( - entities: List[dict], - db: Session = Depends(get_scoped_db) -): - """ - Importa múltiplas entidades de uma vez - Útil para importar de CSV/JSON - """ - imported = 0 - for entity_data in entities: - entity = Entity( - type=entity_data.get("type", "unknown"), - name=entity_data.get("name", "Unnamed"), - description=entity_data.get("description"), - properties=entity_data.get("properties", {}), - latitude=entity_data.get("latitude"), - longitude=entity_data.get("longitude"), - source=entity_data.get("source", "manual") - ) - db.add(entity) - imported += 1 - - db.commit() - - return {"message": f"Imported {imported} entities"} diff --git a/app/api/routes/investigate.py b/app/api/routes/investigate.py deleted file mode 100644 index 646857df8ac0eed0f99ac443367d25c7a6af1512..0000000000000000000000000000000000000000 --- a/app/api/routes/investigate.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Investigation API Routes - Build dossiers on companies and people -""" -from fastapi import APIRouter, HTTPException, Depends -from pydantic import BaseModel, Field -from typing import Optional, List, Dict, Any -from sqlalchemy.orm import Session - -from app.services.investigation import ( - investigar_empresa, - investigar_pessoa, - dossier_to_dict -) -from app.services.brazil_apis import consultar_cnpj -from app.services.investigator_agent import investigator_agent -from app.api.deps import get_scoped_db - - -router = APIRouter(prefix="/investigate", tags=["Investigation"]) - - -class InvestigateCompanyRequest(BaseModel): - """Request to investigate a company""" - cnpj: str = Field(..., min_length=11, description="CNPJ da empresa") - - -class InvestigatePersonRequest(BaseModel): - """Request to investigate a person""" - nome: str = Field(..., min_length=2, description="Nome da pessoa") - cpf: Optional[str] = Field(None, description="CPF (opcional)") - - -class DossierResponse(BaseModel): - """Dossier response""" - tipo: str - alvo: str - cnpj_cpf: Optional[str] - red_flags: List[str] - score_risco: int - data_geracao: str - fonte_dados: List[str] - secoes: Dict[str, Any] - - -class CNPJResponse(BaseModel): - """Quick CNPJ lookup response""" - cnpj: str - razao_social: str - nome_fantasia: str - situacao: str - data_abertura: str - capital_social: float - endereco: str - telefone: str - email: str - atividade: str - socios: List[Dict[str, Any]] - - -@router.post("/company", response_model=DossierResponse) -async def investigate_company(request: InvestigateCompanyRequest): - """ - Build a comprehensive dossier on a company. - - Collects: - - Cadastral data from CNPJ - - Partners/owners - - Sanctions (CEIS, CNEP, CEPIM) - - News and media mentions - - Related entities - - Returns risk score and red flags. - """ - try: - dossier = await investigar_empresa(request.cnpj) - return DossierResponse(**dossier_to_dict(dossier)) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/person", response_model=DossierResponse) -async def investigate_person(request: InvestigatePersonRequest): - """ - Build a dossier on a person. - - Note: Due to LGPD, personal data is limited. - Mainly uses web search for public information. - """ - try: - dossier = await investigar_pessoa(request.nome, request.cpf) - return DossierResponse(**dossier_to_dict(dossier)) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/cnpj/{cnpj}", response_model=CNPJResponse) -async def lookup_cnpj(cnpj: str): - """ - Quick CNPJ lookup - returns basic company data. - """ - try: - data = await consultar_cnpj(cnpj) - - if not data: - raise HTTPException(status_code=404, detail="CNPJ não encontrado") - - return CNPJResponse( - cnpj=data.cnpj, - razao_social=data.razao_social, - nome_fantasia=data.nome_fantasia, - situacao=data.situacao, - data_abertura=data.data_abertura, - capital_social=data.capital_social, - endereco=f"{data.logradouro}, {data.numero} - {data.bairro}, {data.cidade}/{data.uf}", - telefone=data.telefone, - email=data.email, - atividade=f"{data.cnae_principal} - {data.cnae_descricao}", - socios=data.socios - ) - - except HTTPException: - raise - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -# =========================================== -# Autonomous Investigation Agent -# =========================================== - -class AgentInvestigateRequest(BaseModel): - """Request for autonomous investigation""" - mission: str = Field(..., min_length=5, description="Missão de investigação em linguagem natural") - max_iterations: int = Field(10, ge=1, le=20, description="Máximo de iterações do agente") - - -class FindingResponse(BaseModel): - """A finding from investigation""" - title: str - content: str - source: str - timestamp: str - - -class AgentInvestigateResponse(BaseModel): - """Response from autonomous investigation""" - mission: str - status: str - report: str - findings: List[FindingResponse] - entities_discovered: int - connections_mapped: int - iterations: int - tools_used: List[str] - - -@router.post("/agent", response_model=AgentInvestigateResponse) -async def investigate_with_agent( - request: AgentInvestigateRequest, - db: Session = Depends(get_scoped_db) -): - """ - Autonomous investigation with AI agent. - - The agent will: - 1. Search NUMIDIUM for existing entities - 2. Query CNPJ data for Brazilian companies - 3. Search the web for news and public info - 4. Follow leads and connections - 5. Generate a comprehensive report - - Example missions: - - "Investigue a rede de empresas de João Silva" - - "Descubra os sócios da empresa CNPJ 11.222.333/0001-44" - - "Pesquise sobre a empresa XYZ e suas conexões" - """ - try: - result = await investigator_agent.investigate( - mission=request.mission, - db=db, - max_iterations=request.max_iterations - ) - - return AgentInvestigateResponse( - mission=result.mission, - status=result.status, - report=result.report, - findings=[ - FindingResponse( - title=f.title, - content=f.content, - source=f.source, - timestamp=f.timestamp - ) - for f in result.findings - ], - entities_discovered=len(result.entities_discovered), - connections_mapped=len(result.connections_mapped), - iterations=result.iterations, - tools_used=result.tools_used - ) - - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - diff --git a/app/api/routes/projects.py b/app/api/routes/projects.py deleted file mode 100644 index d283d8b2d566c49e7a32cba8acc10b39307b4299..0000000000000000000000000000000000000000 --- a/app/api/routes/projects.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Projects API Routes - Workspace management -""" -from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel -from typing import Optional, List -from datetime import datetime -from sqlalchemy.orm import Session - -from app.api.deps import get_scoped_db -from app.models import Project, Entity, Relationship - - -router = APIRouter(prefix="/projects", tags=["Projects"]) - - -class ProjectCreate(BaseModel): - name: str - description: Optional[str] = None - color: str = "#00d4ff" - icon: str = "folder" - - -class ProjectResponse(BaseModel): - id: str - name: str - description: Optional[str] - color: str - icon: str - entity_count: int = 0 - created_at: datetime - - class Config: - from_attributes = True - - -@router.get("", response_model=List[ProjectResponse]) -def list_projects(db: Session = Depends(get_scoped_db)): - """List all projects""" - projects = db.query(Project).order_by(Project.created_at.desc()).all() - - result = [] - for p in projects: - entity_count = db.query(Entity).filter(Entity.project_id == p.id).count() - result.append(ProjectResponse( - id=p.id, - name=p.name, - description=p.description, - color=p.color, - icon=p.icon, - entity_count=entity_count, - created_at=p.created_at - )) - - return result - - -@router.post("", response_model=ProjectResponse) -def create_project(project: ProjectCreate, db: Session = Depends(get_scoped_db)): - """Create a new project""" - new_project = Project( - name=project.name, - description=project.description, - color=project.color, - icon=project.icon - ) - db.add(new_project) - db.commit() - db.refresh(new_project) - - return ProjectResponse( - id=new_project.id, - name=new_project.name, - description=new_project.description, - color=new_project.color, - icon=new_project.icon, - entity_count=0, - created_at=new_project.created_at - ) - - -@router.get("/{project_id}", response_model=ProjectResponse) -def get_project(project_id: str, db: Session = Depends(get_scoped_db)): - """Get project by ID""" - project = db.query(Project).filter(Project.id == project_id).first() - - if not project: - raise HTTPException(status_code=404, detail="Project not found") - - entity_count = db.query(Entity).filter(Entity.project_id == project_id).count() - - return ProjectResponse( - id=project.id, - name=project.name, - description=project.description, - color=project.color, - icon=project.icon, - entity_count=entity_count, - created_at=project.created_at - ) - - -@router.delete("/{project_id}") -def delete_project(project_id: str, db: Session = Depends(get_scoped_db)): - """Delete project and optionally its entities""" - project = db.query(Project).filter(Project.id == project_id).first() - - if not project: - raise HTTPException(status_code=404, detail="Project not found") - - # Set entities and relationships to no project (null) - db.query(Entity).filter(Entity.project_id == project_id).update({"project_id": None}) - db.query(Relationship).filter(Relationship.project_id == project_id).update({"project_id": None}) - - db.delete(project) - db.commit() - - return {"message": f"Project '{project.name}' deleted"} - - -@router.put("/{project_id}") -def update_project(project_id: str, project: ProjectCreate, db: Session = Depends(get_scoped_db)): - """Update project""" - existing = db.query(Project).filter(Project.id == project_id).first() - - if not existing: - raise HTTPException(status_code=404, detail="Project not found") - - existing.name = project.name - existing.description = project.description - existing.color = project.color - existing.icon = project.icon - db.commit() - - return {"message": "Project updated"} diff --git a/app/api/routes/relationships.py b/app/api/routes/relationships.py deleted file mode 100644 index e5887de9ce8df297614adf63c4db8d365fc33114..0000000000000000000000000000000000000000 --- a/app/api/routes/relationships.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Relationship CRUD Routes -""" -from fastapi import APIRouter, Depends, HTTPException, Query -from sqlalchemy.orm import Session -from typing import List, Optional - -from app.api.deps import get_scoped_db -from app.models import Relationship, Entity -from app.schemas import RelationshipCreate, RelationshipResponse - -router = APIRouter(prefix="/relationships", tags=["Relationships"]) - - -@router.get("/", response_model=List[RelationshipResponse]) -def list_relationships( - type: Optional[str] = None, - source_id: Optional[str] = None, - target_id: Optional[str] = None, - limit: int = Query(default=50, le=200), - db: Session = Depends(get_scoped_db) -): - """Lista relacionamentos com filtros opcionais""" - query = db.query(Relationship) - - if type: - query = query.filter(Relationship.type == type) - if source_id: - query = query.filter(Relationship.source_id == source_id) - if target_id: - query = query.filter(Relationship.target_id == target_id) - - return query.limit(limit).all() - - -@router.get("/types") -def get_relationship_types(db: Session = Depends(get_scoped_db)): - """Retorna todos os tipos de relacionamento unicos""" - types = db.query(Relationship.type).distinct().all() - return [t[0] for t in types] - - -@router.post("/", response_model=RelationshipResponse, status_code=201) -def create_relationship( - rel: RelationshipCreate, - db: Session = Depends(get_scoped_db) -): - """Cria um novo relacionamento entre entidades""" - source = db.query(Entity).filter(Entity.id == rel.source_id).first() - target = db.query(Entity).filter(Entity.id == rel.target_id).first() - - if not source: - raise HTTPException(status_code=404, detail="Source entity not found") - if not target: - raise HTTPException(status_code=404, detail="Target entity not found") - - db_rel = Relationship(**rel.model_dump()) - db.add(db_rel) - db.commit() - db.refresh(db_rel) - return db_rel - - -@router.delete("/{relationship_id}") -def delete_relationship( - relationship_id: str, - db: Session = Depends(get_scoped_db) -): - """Deleta um relacionamento""" - db_rel = db.query(Relationship).filter(Relationship.id == relationship_id).first() - if not db_rel: - raise HTTPException(status_code=404, detail="Relationship not found") - - db.delete(db_rel) - db.commit() - return {"message": "Relationship deleted"} diff --git a/app/api/routes/research.py b/app/api/routes/research.py deleted file mode 100644 index 41eb6efdb31bbc7cb0da78df28ce780a75fc0f9b..0000000000000000000000000000000000000000 --- a/app/api/routes/research.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Research API Routes - Deep research with automatic entity extraction -""" -from fastapi import APIRouter, Depends, HTTPException -from pydantic import BaseModel, Field -from typing import Optional, List -import traceback -from sqlalchemy.orm import Session - -from app.api.deps import get_scoped_db -from app.services import lancer -from app.services.nlp import entity_extractor -from app.services.geocoding import geocode -from app.models.entity import Entity, Relationship - - -router = APIRouter(prefix="/research", tags=["Research"]) - - -class ResearchRequest(BaseModel): - """Request model for research""" - query: str = Field(..., min_length=3, description="Research query") - mode: str = Field(default="search", description="Research mode: search, deep, heavy") - max_results: int = Field(default=10, le=20) - auto_extract: bool = Field(default=True, description="Auto-extract entities using NER") - - -class ResearchResponse(BaseModel): - """Response model for research""" - query: str - answer: Optional[str] - sources: List[dict] - citations: List[dict] - extracted_entities: int - extracted_relationships: int - processing_time_ms: float - - -@router.post("", response_model=ResearchResponse) -async def research(request: ResearchRequest, db: Session = Depends(get_scoped_db)): - """ - Perform AI-powered research using Lancer API and optionally extract entities. - - Modes: - - search: Fast search with AI synthesis - - deep: Multi-dimensional deep research (slower, more comprehensive) - - heavy: Search with full content scraping - """ - try: - # Call Lancer API based on mode - if request.mode == "deep": - result = await lancer.deep_research(request.query) - elif request.mode == "heavy": - result = await lancer.heavy_search(request.query, request.max_results) - else: - result = await lancer.search(request.query, request.max_results) - - extracted_entities = 0 - extracted_relationships = 0 - - # Extract entities if enabled - if request.auto_extract and result.raw_text: - try: - # Limit text to avoid token limits - text_to_analyze = result.raw_text[:5000] - ner_result = await entity_extractor.extract(text_to_analyze) - - created_entities = {} - - # Create entities - for entity in ner_result.entities: - # Check if exists - existing = db.query(Entity).filter( - Entity.name.ilike(f"%{entity.name}%") - ).first() - - if existing: - created_entities[entity.name] = existing - else: - # Geocode if location - lat, lng = None, None - if entity.type == "location": - coords = await geocode(entity.name) - if coords: - lat, lng = coords - - new_entity = Entity( - name=entity.name, - type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person", - description=entity.description or entity.role or "", - source="lancer_research", - latitude=lat, - longitude=lng, - properties={ - "role": entity.role, - "aliases": entity.aliases, - "research_query": request.query - } - ) - db.add(new_entity) - db.commit() - db.refresh(new_entity) - created_entities[entity.name] = new_entity - extracted_entities += 1 - - # Create relationships - for rel in ner_result.relationships: - source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first() - target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first() - - if source_ent and target_ent and source_ent.id != target_ent.id: - existing_rel = db.query(Relationship).filter( - Relationship.source_id == source_ent.id, - Relationship.target_id == target_ent.id, - Relationship.type == rel.relationship_type - ).first() - - if not existing_rel: - new_rel = Relationship( - source_id=source_ent.id, - target_id=target_ent.id, - type=rel.relationship_type, - properties={"context": rel.context, "research_query": request.query} - ) - db.add(new_rel) - extracted_relationships += 1 - - db.commit() - - except Exception as e: - print(f"NER extraction error: {e}") - traceback.print_exc() - - # Prepare sources for response - sources = [ - { - "title": r.title, - "url": r.url, - "content": r.content[:300] if r.content else "", - "score": r.score - } - for r in result.results[:10] - ] - - return ResearchResponse( - query=result.query, - answer=result.answer, - sources=sources, - citations=result.citations, - extracted_entities=extracted_entities, - extracted_relationships=extracted_relationships, - processing_time_ms=result.processing_time_ms - ) - - except Exception as e: - print(f"Research error: {e}") - traceback.print_exc() - raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/api/routes/search.py b/app/api/routes/search.py deleted file mode 100644 index 27ad925fb6abc0eb121ff2660bc06fd55fd322f0..0000000000000000000000000000000000000000 --- a/app/api/routes/search.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Search and Analytics Routes -""" -from fastapi import APIRouter, Depends, Query -from sqlalchemy.orm import Session -from sqlalchemy import or_, func -from typing import Optional - -from app.api.deps import get_scoped_db -from app.models import Entity, Relationship, Event, Document -from app.schemas import SearchResult, SystemStats - -router = APIRouter(prefix="/search", tags=["Search"]) - - -@router.get("", response_model=SearchResult) -def global_search( - q: str = Query(..., min_length=2, description="Search query"), - types: Optional[str] = Query(None, description="Entity types (comma-separated)"), - limit: int = Query(default=20, le=100), - db: Session = Depends(get_scoped_db) -): - """ - Busca global em todas as entidades, eventos e documentos. - """ - search_term = f"%{q}%" - type_filter = types.split(",") if types else None - - entity_query = db.query(Entity).filter( - or_( - Entity.name.ilike(search_term), - Entity.description.ilike(search_term) - ) - ) - if type_filter: - entity_query = entity_query.filter(Entity.type.in_(type_filter)) - entities = entity_query.limit(limit).all() - - events = db.query(Event).filter( - or_( - Event.title.ilike(search_term), - Event.description.ilike(search_term) - ) - ).limit(limit).all() - - documents = db.query(Document).filter( - or_( - Document.title.ilike(search_term), - Document.content.ilike(search_term) - ) - ).limit(limit).all() - - return SearchResult( - entities=entities, - events=events, - documents=documents - ) - - -@router.get("/stats", response_model=SystemStats) -def get_system_stats(db: Session = Depends(get_scoped_db)): - """ - Retorna estatisticas gerais do sistema. - """ - total_entities = db.query(Entity).count() - total_relationships = db.query(Relationship).count() - total_events = db.query(Event).count() - total_documents = db.query(Document).count() - - type_counts = db.query( - Entity.type, - func.count(Entity.id) - ).group_by(Entity.type).all() - - entities_by_type = {t: c for t, c in type_counts} - - recent = db.query(Entity).order_by(Entity.created_at.desc()).limit(10).all() - recent_activity = [ - { - "id": e.id, - "type": e.type, - "name": e.name, - "created_at": e.created_at.isoformat() - } - for e in recent - ] - - return SystemStats( - total_entities=total_entities, - total_relationships=total_relationships, - total_events=total_events, - total_documents=total_documents, - entities_by_type=entities_by_type, - recent_activity=recent_activity - ) - - -@router.get("/geo") -def get_geo_data( - entity_type: Optional[str] = None, - db: Session = Depends(get_scoped_db) -): - """ - Retorna entidades com geolocalizacao. - """ - query = db.query(Entity).filter( - Entity.latitude.isnot(None), - Entity.longitude.isnot(None) - ) - - if entity_type: - query = query.filter(Entity.type == entity_type) - - entities = query.all() - - return [ - { - "id": e.id, - "type": e.type, - "name": e.name, - "lat": e.latitude, - "lng": e.longitude, - "properties": e.properties - } - for e in entities - ] diff --git a/app/api/routes/session.py b/app/api/routes/session.py deleted file mode 100644 index c81ea29a5363a585f72aa5f0df7e2798292c189e..0000000000000000000000000000000000000000 --- a/app/api/routes/session.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Session management routes -""" -from fastapi import APIRouter, Header, Cookie, Response, Request -from typing import Optional -import uuid - -from app.core.database import create_new_session_id -from app.config import settings - -router = APIRouter(prefix="/session", tags=["Session"]) - - -@router.post("/create") -def create_session(response: Response, request: Request): - """Create a new session and return session_id""" - session_id = create_new_session_id() - secure = settings.cookie_secure - samesite = settings.cookie_samesite - proto = request.headers.get("x-forwarded-proto", request.url.scheme) - if proto != "https" and secure: - secure = False - samesite = "lax" - response.set_cookie( - key="numidium_session", - value=session_id, - max_age=60*60*24*365, # 1 year - httponly=True, - samesite=samesite, - secure=secure - ) - return {"session_id": session_id} - - -@router.get("/current") -def get_current_session( - numidium_session: Optional[str] = Cookie(None), - x_session_id: Optional[str] = Header(None) -): - """Get current session ID""" - session_id = x_session_id or numidium_session - if not session_id: - return {"session_id": None, "message": "No session. Call POST /session/create"} - return {"session_id": session_id} diff --git a/app/api/routes/timeline.py b/app/api/routes/timeline.py deleted file mode 100644 index fa45453faf038d34277ffb6a5f1481a2748a8b0e..0000000000000000000000000000000000000000 --- a/app/api/routes/timeline.py +++ /dev/null @@ -1,165 +0,0 @@ -""" -Timeline API Routes - Temporal view of entities and relationships -""" -from fastapi import APIRouter, Depends, Query -from pydantic import BaseModel -from typing import Optional, List, Dict, Any -from datetime import datetime, timedelta -from collections import defaultdict -from sqlalchemy.orm import Session - -from app.api.deps import get_scoped_db -from app.models.entity import Entity, Relationship - - -router = APIRouter(prefix="/timeline", tags=["Timeline"]) - - -class TimelineEvent(BaseModel): - id: str - type: str # "entity" or "relationship" - entity_type: Optional[str] = None - name: str - description: Optional[str] = None - date: str - icon: str - - -class TimelineGroup(BaseModel): - date: str - label: str - events: List[TimelineEvent] - - -class TimelineResponse(BaseModel): - groups: List[TimelineGroup] - total_events: int - - -@router.get("", response_model=TimelineResponse) -async def get_timeline( - days: int = Query(default=30, ge=1, le=365), - entity_type: Optional[str] = None, - limit: int = Query(default=100, ge=1, le=500), - db: Session = Depends(get_scoped_db) -): - """ - Get timeline of recent entities and relationships. - Groups events by date. - """ - # Calculate date range - end_date = datetime.now() - start_date = end_date - timedelta(days=days) - - events = [] - - # Get entities - query = db.query(Entity).filter( - Entity.created_at >= start_date - ) - - if entity_type: - query = query.filter(Entity.type == entity_type) - - entities = query.order_by(Entity.created_at.desc()).limit(limit).all() - - icon_map = { - "person": "👤", - "organization": "🏢", - "location": "📍", - "event": "📅", - "concept": "💡", - "product": "📦" - } - - for e in entities: - # Prefer event_date over created_at - date = e.event_date if e.event_date else e.created_at - events.append(TimelineEvent( - id=e.id, - type="entity", - entity_type=e.type, - name=e.name, - description=e.description[:100] if e.description else None, - date=date.isoformat() if date else datetime.now().isoformat(), - icon=icon_map.get(e.type, "📄") - )) - - # Get relationships - relationships = db.query(Relationship).filter( - Relationship.created_at >= start_date - ).order_by(Relationship.created_at.desc()).limit(limit // 2).all() - - for r in relationships: - source = db.query(Entity).filter(Entity.id == r.source_id).first() - target = db.query(Entity).filter(Entity.id == r.target_id).first() - - if source and target: - # Prefer event_date over created_at - date = r.event_date if r.event_date else r.created_at - events.append(TimelineEvent( - id=r.id, - type="relationship", - name=f"{source.name} → {target.name}", - description=r.type, - date=date.isoformat() if date else datetime.now().isoformat(), - icon="🔗" - )) - - # Sort by date - events.sort(key=lambda x: x.date, reverse=True) - - # Group by date - groups_dict = defaultdict(list) - for event in events: - date_key = event.date[:10] # YYYY-MM-DD - groups_dict[date_key].append(event) - - # Format groups - groups = [] - for date_key in sorted(groups_dict.keys(), reverse=True): - try: - dt = datetime.fromisoformat(date_key) - label = dt.strftime("%d %b %Y") - except: - label = date_key - - groups.append(TimelineGroup( - date=date_key, - label=label, - events=groups_dict[date_key] - )) - - return TimelineResponse( - groups=groups, - total_events=len(events) - ) - - -@router.get("/stats") -async def get_timeline_stats(db: Session = Depends(get_scoped_db)): - """Get statistics for timeline visualization""" - - # Count entities by type - entity_counts = {} - for entity_type in ["person", "organization", "location", "event", "concept"]: - count = db.query(Entity).filter(Entity.type == entity_type).count() - entity_counts[entity_type] = count - - # Count relationships - relationship_count = db.query(Relationship).count() - - # Recent activity (last 7 days) - week_ago = datetime.now() - timedelta(days=7) - recent_entities = db.query(Entity).filter(Entity.created_at >= week_ago).count() - recent_relationships = db.query(Relationship).filter(Relationship.created_at >= week_ago).count() - - return { - "entity_counts": entity_counts, - "relationship_count": relationship_count, - "recent_activity": { - "entities": recent_entities, - "relationships": recent_relationships, - "total": recent_entities + recent_relationships - } - } diff --git a/app/config.py b/app/config.py deleted file mode 100644 index 23f3497193305642c7ae08e7c907009e44c489f0..0000000000000000000000000000000000000000 --- a/app/config.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Numidium Backend Configuration -""" -from pydantic_settings import BaseSettings -from functools import lru_cache -import os - - -class Settings(BaseSettings): - """Application settings""" - - # App Info - app_name: str = "Numidium" - app_version: str = "0.1.0" - debug: bool = False - - # Database - database_url: str = "sqlite:///./data/numidium.db" - - # APIs (opcional - pode configurar depois) - newsapi_key: str = "" - - # Cerebras API for LLM-based entity extraction - cerebras_api_key: str = "" - - # AetherMap API for semantic search and NER - aethermap_url: str = "https://madras1-aethermap.hf.space" - - # CORS - cors_origins: list[str] = ["*"] - - # Session cookie - cookie_secure: bool = True - cookie_samesite: str = "none" - - class Config: - env_file = ".env" - env_file_encoding = "utf-8" - - -@lru_cache() -def get_settings() -> Settings: - """Get cached settings""" - return Settings() - - -settings = get_settings() diff --git a/app/core/__init__.py b/app/core/__init__.py deleted file mode 100644 index 0e8825ce5959f9f016f6f5ed46c2a54fdd15d9e8..0000000000000000000000000000000000000000 --- a/app/core/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Core module -from app.core.database import get_db, init_db, Base diff --git a/app/core/__pycache__/__init__.cpython-311.pyc b/app/core/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index c5dc2c47dff4d25a449c31d5b491838968bd8699..0000000000000000000000000000000000000000 Binary files a/app/core/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/core/__pycache__/database.cpython-311.pyc b/app/core/__pycache__/database.cpython-311.pyc deleted file mode 100644 index 5d66b606dc407b3d70e7e6b1d62893eb13ff9d42..0000000000000000000000000000000000000000 Binary files a/app/core/__pycache__/database.cpython-311.pyc and /dev/null differ diff --git a/app/core/database.py b/app/core/database.py deleted file mode 100644 index 6fbd7f1d970d02b46df83e466a09287bfc0090be..0000000000000000000000000000000000000000 --- a/app/core/database.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Database configuration and session management -Per-session databases - each user session gets its own SQLite file -""" -from sqlalchemy import create_engine, text -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker, Session -from typing import Optional -import os -import uuid - -# Ensure data directory exists -os.makedirs("data/sessions", exist_ok=True) - -# Base class for models -Base = declarative_base() - -# Cache for session engines -_session_engines = {} -_session_makers = {} - - -def get_session_engine(session_id: str): - """Get or create engine for a specific session""" - if session_id not in _session_engines: - db_path = f"data/sessions/{session_id}.db" - engine = create_engine( - f"sqlite:///./{db_path}", - connect_args={"check_same_thread": False} - ) - _session_engines[session_id] = engine - _session_makers[session_id] = sessionmaker(autocommit=False, autoflush=False, bind=engine) - - # Initialize tables for this session - Base.metadata.create_all(bind=engine) - _run_migrations(engine) - - return _session_engines[session_id] - - -def get_session_db(session_id: str): - """Get database session for a specific user session""" - get_session_engine(session_id) # Ensure engine exists - SessionLocal = _session_makers[session_id] - db = SessionLocal() - try: - yield db - finally: - db.close() - - -def get_db_for_session(session_id: str) -> Session: - """Direct session getter (non-generator) for routes""" - get_session_engine(session_id) - SessionLocal = _session_makers[session_id] - return SessionLocal() - - -# Legacy - default database for backwards compatibility -from app.config import settings -engine = create_engine( - settings.database_url, - connect_args={"check_same_thread": False} -) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - - -def get_default_session() -> Session: - """Create a new session for the default database.""" - return SessionLocal() - - -def get_db(): - """Legacy: Default database session""" - db = get_default_session() - try: - yield db - finally: - db.close() - - -def _run_migrations(eng): - """Run migrations on an engine""" - with eng.connect() as conn: - try: - conn.execute(text("ALTER TABLE entities ADD COLUMN event_date DATETIME")) - conn.commit() - except Exception: - pass - try: - conn.execute(text("ALTER TABLE relationships ADD COLUMN event_date DATETIME")) - conn.commit() - except Exception: - pass - try: - conn.execute(text("ALTER TABLE entities ADD COLUMN project_id VARCHAR(36)")) - conn.commit() - except Exception: - pass - try: - conn.execute(text("ALTER TABLE relationships ADD COLUMN project_id VARCHAR(36)")) - conn.commit() - except Exception: - pass - - -def init_db(): - """Initialize default database tables""" - Base.metadata.create_all(bind=engine) - _run_migrations(engine) - - -def create_new_session_id() -> str: - """Generate a new session ID""" - return str(uuid.uuid4()) diff --git a/app/main.py b/app/main.py deleted file mode 100644 index 7abfa0ca7fb7e31fc2a58e35b5528eb7b135bada..0000000000000000000000000000000000000000 --- a/app/main.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -Numidium Backend - Main Application -Plataforma de Inteligência e Análise de Dados -""" -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from contextlib import asynccontextmanager - -from app.config import settings -from app.core.database import init_db -from app.api.routes import entities, relationships, events, search, ingest, analyze, graph, research, chat, investigate, dados_publicos, timeline, session, aethermap - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Startup and shutdown events""" - # Startup: Initialize database - init_db() - print("🚀 Numidium Backend started!") - print(f"📊 Database: {settings.database_url}") - yield - # Shutdown - print("👋 Numidium Backend shutting down...") - - -# Create FastAPI app -app = FastAPI( - title="Numidium API", - description=""" - ## 🔮 Sistema de Inteligência e Análise de Dados - - Backend do VANTAGE - Uma plataforma para: - - 📥 Ingestão de dados de múltiplas fontes (Wikipedia, News, Manual) - - 🔗 Mapeamento de conexões entre entidades - - 🗺️ Visualização geográfica - - 📊 Análise de grafos e relacionamentos - - 🔍 Busca global - """, - version=settings.app_version, - lifespan=lifespan -) - -# CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=settings.cors_origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Include routers -app.include_router(entities.router, prefix="/api/v1") -app.include_router(relationships.router, prefix="/api/v1") -app.include_router(events.router, prefix="/api/v1") -app.include_router(search.router, prefix="/api/v1") -app.include_router(ingest.router, prefix="/api/v1") -app.include_router(analyze.router, prefix="/api/v1") -app.include_router(graph.router, prefix="/api/v1") -app.include_router(research.router, prefix="/api/v1") -app.include_router(chat.router, prefix="/api/v1") -app.include_router(investigate.router, prefix="/api/v1") -app.include_router(dados_publicos.router, prefix="/api/v1") -app.include_router(timeline.router, prefix="/api/v1") -app.include_router(session.router, prefix="/api/v1") -app.include_router(aethermap.router, prefix="/api/v1/aethermap", tags=["aethermap"]) - - -@app.get("/") -def root(): - """Root endpoint - API info""" - return { - "name": "Numidium", - "version": settings.app_version, - "status": "online", - "docs": "/docs", - "description": "Sistema de Inteligência e Análise de Dados" - } - - -@app.get("/health") -def health_check(): - """Health check endpoint for HF Spaces""" - return {"status": "healthy"} - - -@app.get("/api/v1") -def api_info(): - """API v1 info""" - return { - "version": "1.0.0", - "endpoints": { - "entities": "/api/v1/entities", - "relationships": "/api/v1/relationships", - "events": "/api/v1/events", - "search": "/api/v1/search", - "ingest": "/api/v1/ingest" - } - } diff --git a/app/models/__init__.py b/app/models/__init__.py deleted file mode 100644 index dda9ada652332c2b420769a6ace731249e11cfd8..0000000000000000000000000000000000000000 --- a/app/models/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Models module -from app.models.entity import Entity, Relationship, Event, Document -from app.models.project import Project diff --git a/app/models/__pycache__/__init__.cpython-311.pyc b/app/models/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 5bf1933151b2e8a290dc79c4647a626d3d0500ff..0000000000000000000000000000000000000000 Binary files a/app/models/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/models/__pycache__/entity.cpython-311.pyc b/app/models/__pycache__/entity.cpython-311.pyc deleted file mode 100644 index 08def07731fef26be3662b7e40f9afca7961637f..0000000000000000000000000000000000000000 Binary files a/app/models/__pycache__/entity.cpython-311.pyc and /dev/null differ diff --git a/app/models/__pycache__/project.cpython-311.pyc b/app/models/__pycache__/project.cpython-311.pyc deleted file mode 100644 index 5e11c4a8127b91fa0a97a4fc6607860af7c04d37..0000000000000000000000000000000000000000 Binary files a/app/models/__pycache__/project.cpython-311.pyc and /dev/null differ diff --git a/app/models/entity.py b/app/models/entity.py deleted file mode 100644 index 07f9afbd7c789db76ca4d482de4655cd99eb3bda..0000000000000000000000000000000000000000 --- a/app/models/entity.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -SQLAlchemy Models for Numidium -""" -from sqlalchemy import Column, String, Text, DateTime, Float, JSON, ForeignKey, Table -from sqlalchemy.orm import relationship -from datetime import datetime -import uuid - -from app.core.database import Base - - -def generate_uuid(): - return str(uuid.uuid4()) - - -class Entity(Base): - """ - Entidade - qualquer coisa rastreável no sistema - Pode ser: pessoa, organização, local, veículo, evento, documento, etc. - """ - __tablename__ = "entities" - - id = Column(String(36), primary_key=True, default=generate_uuid) - project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True) - type = Column(String(50), nullable=False, index=True) # person, organization, location, etc - name = Column(String(255), nullable=False, index=True) - description = Column(Text, nullable=True) - properties = Column(JSON, default=dict) # Dados flexíveis - - # Geolocalização (opcional) - latitude = Column(Float, nullable=True) - longitude = Column(Float, nullable=True) - - # Data histórica do evento/entidade (quando aconteceu, não quando foi adicionado) - event_date = Column(DateTime, nullable=True) - - # Fonte do dado - source = Column(String(100), nullable=True) # wikipedia, newsapi, manual, etc - source_url = Column(Text, nullable=True) - - # Timestamps - created_at = Column(DateTime, default=datetime.utcnow) - updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) - - # Relacionamentos - outgoing_relationships = relationship( - "Relationship", - foreign_keys="Relationship.source_id", - back_populates="source_entity" - ) - incoming_relationships = relationship( - "Relationship", - foreign_keys="Relationship.target_id", - back_populates="target_entity" - ) - - -class Relationship(Base): - """ - Relacionamento entre duas entidades - Exemplos: works_for, knows, owns, located_at, participated_in - """ - __tablename__ = "relationships" - - id = Column(String(36), primary_key=True, default=generate_uuid) - project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True) - source_id = Column(String(36), ForeignKey("entities.id"), nullable=False) - target_id = Column(String(36), ForeignKey("entities.id"), nullable=False) - type = Column(String(50), nullable=False, index=True) # works_for, knows, owns, etc - properties = Column(JSON, default=dict) - confidence = Column(Float, default=1.0) # 0-1, quão certo estamos dessa conexão - - # Data histórica do relacionamento (quando aconteceu) - event_date = Column(DateTime, nullable=True) - - # Fonte - source = Column(String(100), nullable=True) - - # Timestamps - created_at = Column(DateTime, default=datetime.utcnow) - - # Relacionamentos - source_entity = relationship("Entity", foreign_keys=[source_id], back_populates="outgoing_relationships") - target_entity = relationship("Entity", foreign_keys=[target_id], back_populates="incoming_relationships") - - -class Event(Base): - """ - Evento - algo que aconteceu envolvendo entidades - """ - __tablename__ = "events" - - id = Column(String(36), primary_key=True, default=generate_uuid) - type = Column(String(50), nullable=False, index=True) - title = Column(String(255), nullable=False) - description = Column(Text, nullable=True) - - # Quando aconteceu - event_date = Column(DateTime, nullable=True) - - # Onde aconteceu - location_name = Column(String(255), nullable=True) - latitude = Column(Float, nullable=True) - longitude = Column(Float, nullable=True) - - # Entidades envolvidas (armazenado como JSON array de IDs) - entity_ids = Column(JSON, default=list) - - # Fonte - source = Column(String(100), nullable=True) - source_url = Column(Text, nullable=True) - - # Metadados - properties = Column(JSON, default=dict) - - # Timestamps - created_at = Column(DateTime, default=datetime.utcnow) - - -class Document(Base): - """ - Documento - texto/arquivo para análise - """ - __tablename__ = "documents" - - id = Column(String(36), primary_key=True, default=generate_uuid) - title = Column(String(255), nullable=False) - content = Column(Text, nullable=True) - summary = Column(Text, nullable=True) # Resumo gerado por IA - - # Tipo de documento - doc_type = Column(String(50), default="text") # text, news, report, etc - - # Entidades mencionadas (extraídas por NLP) - mentioned_entities = Column(JSON, default=list) - - # Fonte - source = Column(String(100), nullable=True) - source_url = Column(Text, nullable=True) - - # Timestamps - published_at = Column(DateTime, nullable=True) - created_at = Column(DateTime, default=datetime.utcnow) diff --git a/app/models/project.py b/app/models/project.py deleted file mode 100644 index 72f601e1975770622c146cc3b1b9fb6fbd912a3c..0000000000000000000000000000000000000000 --- a/app/models/project.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Project Model - Workspaces for organizing investigations -""" -from sqlalchemy import Column, String, Text, DateTime -from datetime import datetime -import uuid - -from app.core.database import Base - - -def generate_uuid(): - return str(uuid.uuid4()) - - -class Project(Base): - """ - Projeto/Workspace - agrupa entidades e relacionamentos por investigação - """ - __tablename__ = "projects" - - id = Column(String(36), primary_key=True, default=generate_uuid) - name = Column(String(255), nullable=False) - description = Column(Text, nullable=True) - color = Column(String(7), default="#00d4ff") # Hex color for UI - icon = Column(String(50), default="folder") # Icon name - - # Timestamps - created_at = Column(DateTime, default=datetime.utcnow) - updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py deleted file mode 100644 index 5afaa5cdacc5762ea19abe607c7dab6309e351a8..0000000000000000000000000000000000000000 --- a/app/schemas/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# Schemas module -from app.schemas.schemas import ( - EntityCreate, EntityUpdate, EntityResponse, - RelationshipCreate, RelationshipResponse, - EventCreate, EventResponse, - DocumentCreate, DocumentResponse, - GraphData, GraphNode, GraphEdge, - SearchQuery, SearchResult, - SystemStats -) diff --git a/app/schemas/__pycache__/__init__.cpython-311.pyc b/app/schemas/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 5a0991c3a362725e79629796654b9dc0ed9c9668..0000000000000000000000000000000000000000 Binary files a/app/schemas/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/schemas/__pycache__/schemas.cpython-311.pyc b/app/schemas/__pycache__/schemas.cpython-311.pyc deleted file mode 100644 index 8358505555f3036c07fa84d01cd9cd01b7b97b9f..0000000000000000000000000000000000000000 Binary files a/app/schemas/__pycache__/schemas.cpython-311.pyc and /dev/null differ diff --git a/app/schemas/schemas.py b/app/schemas/schemas.py deleted file mode 100644 index afbff0c301ddb1fbe8cae0e4848fcafc48082ff0..0000000000000000000000000000000000000000 --- a/app/schemas/schemas.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Pydantic Schemas for API validation -""" -from pydantic import BaseModel, Field -from typing import Optional, List, Any -from datetime import datetime - - -# ========== Entity Schemas ========== - -class EntityBase(BaseModel): - type: str = Field(..., description="Tipo da entidade: person, organization, location, etc") - name: str = Field(..., description="Nome da entidade") - description: Optional[str] = None - properties: dict = Field(default_factory=dict) - latitude: Optional[float] = None - longitude: Optional[float] = None - source: Optional[str] = None - source_url: Optional[str] = None - - -class EntityCreate(EntityBase): - pass - - -class EntityUpdate(BaseModel): - type: Optional[str] = None - name: Optional[str] = None - description: Optional[str] = None - properties: Optional[dict] = None - latitude: Optional[float] = None - longitude: Optional[float] = None - - -class EntityResponse(EntityBase): - id: str - created_at: datetime - updated_at: datetime - - class Config: - from_attributes = True - - -# ========== Relationship Schemas ========== - -class RelationshipBase(BaseModel): - source_id: str - target_id: str - type: str = Field(..., description="Tipo: works_for, knows, owns, located_at, etc") - properties: dict = Field(default_factory=dict) - confidence: float = Field(default=1.0, ge=0, le=1) - source: Optional[str] = None - - -class RelationshipCreate(RelationshipBase): - pass - - -class RelationshipResponse(RelationshipBase): - id: str - created_at: datetime - - class Config: - from_attributes = True - - -# ========== Event Schemas ========== - -class EventBase(BaseModel): - type: str - title: str - description: Optional[str] = None - event_date: Optional[datetime] = None - location_name: Optional[str] = None - latitude: Optional[float] = None - longitude: Optional[float] = None - entity_ids: List[str] = Field(default_factory=list) - source: Optional[str] = None - source_url: Optional[str] = None - properties: dict = Field(default_factory=dict) - - -class EventCreate(EventBase): - pass - - -class EventResponse(EventBase): - id: str - created_at: datetime - - class Config: - from_attributes = True - - -# ========== Document Schemas ========== - -class DocumentBase(BaseModel): - title: str - content: Optional[str] = None - doc_type: str = "text" - source: Optional[str] = None - source_url: Optional[str] = None - published_at: Optional[datetime] = None - - -class DocumentCreate(DocumentBase): - pass - - -class DocumentResponse(DocumentBase): - id: str - summary: Optional[str] = None - mentioned_entities: List[str] = [] - created_at: datetime - - class Config: - from_attributes = True - - -# ========== Graph Schemas ========== - -class GraphNode(BaseModel): - id: str - type: str - name: str - properties: dict = {} - - -class GraphEdge(BaseModel): - source: str - target: str - type: str - confidence: float = 1.0 - - -class GraphData(BaseModel): - nodes: List[GraphNode] - edges: List[GraphEdge] - - -# ========== Search Schemas ========== - -class SearchQuery(BaseModel): - query: str - entity_types: Optional[List[str]] = None - limit: int = Field(default=20, le=100) - - -class SearchResult(BaseModel): - entities: List[EntityResponse] - events: List[EventResponse] - documents: List[DocumentResponse] - - -# ========== Stats Schemas ========== - -class SystemStats(BaseModel): - total_entities: int - total_relationships: int - total_events: int - total_documents: int - entities_by_type: dict - recent_activity: List[dict] diff --git a/app/services/__init__.py b/app/services/__init__.py deleted file mode 100644 index c7f87b77ce421c83e59588e4c341ebab500c3c41..0000000000000000000000000000000000000000 --- a/app/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Services module diff --git a/app/services/__pycache__/__init__.cpython-311.pyc b/app/services/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index bdd1095f0f04ab7b53c9f32b8bbce7e4a48236e3..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/brazil_apis.cpython-311.pyc b/app/services/__pycache__/brazil_apis.cpython-311.pyc deleted file mode 100644 index 9751973d578e79e328d2912fe354747fb95d79a9..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/brazil_apis.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/geocoding.cpython-311.pyc b/app/services/__pycache__/geocoding.cpython-311.pyc deleted file mode 100644 index 664fbab62acd7d6c1db2527f8ab8b4b7a11662e9..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/geocoding.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/investigation.cpython-311.pyc b/app/services/__pycache__/investigation.cpython-311.pyc deleted file mode 100644 index 219ab0bfb74e241ee442a062277765b2d3f84c26..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/investigation.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/investigator_agent.cpython-311.pyc b/app/services/__pycache__/investigator_agent.cpython-311.pyc deleted file mode 100644 index 17cd96446bf23aa53179b163aba52d3f2b0ece2e..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/investigator_agent.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/lancer.cpython-311.pyc b/app/services/__pycache__/lancer.cpython-311.pyc deleted file mode 100644 index 0415c81ac5d96a648b50a300ce010fe32a9a53bb..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/lancer.cpython-311.pyc and /dev/null differ diff --git a/app/services/__pycache__/transparencia_api.cpython-311.pyc b/app/services/__pycache__/transparencia_api.cpython-311.pyc deleted file mode 100644 index 8758135b5b049f71eceec09add4595fb7b11dd12..0000000000000000000000000000000000000000 Binary files a/app/services/__pycache__/transparencia_api.cpython-311.pyc and /dev/null differ diff --git a/app/services/aethermap_client.py b/app/services/aethermap_client.py deleted file mode 100644 index 2e9a6490f843a94391b376e528db87554c8e31cf..0000000000000000000000000000000000000000 --- a/app/services/aethermap_client.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -AetherMap Client -Client para integração com AetherMap API - busca semântica, NER e análise de grafos. -""" -import httpx -import json -import io -from typing import List, Dict, Any, Optional -from dataclasses import dataclass, field -from datetime import datetime -import logging - -from app.config import settings - -logger = logging.getLogger(__name__) - - -# URL base do AetherMap (HuggingFace Space) -AETHERMAP_URL = getattr(settings, 'aethermap_url', 'https://madras1-aethermap.hf.space') - - -@dataclass -class ProcessResult: - """Resultado do processamento de documentos""" - job_id: str - num_documents: int - num_clusters: int - num_noise: int - metrics: Dict[str, Any] = field(default_factory=dict) - cluster_analysis: Dict[str, Any] = field(default_factory=dict) - - -@dataclass -class SearchResult: - """Resultado de busca semântica""" - summary: str # Resposta RAG gerada pelo LLM - results: List[Dict[str, Any]] = field(default_factory=list) - - -@dataclass -class EntityNode: - """Nó de entidade no grafo""" - entity: str - entity_type: str - docs: int - degree: int = 0 - centrality: float = 0.0 - role: str = "peripheral" # hub, connector, peripheral - - -@dataclass -class EntityEdge: - """Aresta do grafo de entidades""" - source_entity: str - target_entity: str - weight: int - reason: str - - -@dataclass -class EntityGraphResult: - """Resultado da extração de entidades""" - nodes: List[EntityNode] = field(default_factory=list) - edges: List[EntityEdge] = field(default_factory=list) - hubs: List[Dict[str, Any]] = field(default_factory=list) - insights: Dict[str, Any] = field(default_factory=dict) - - -@dataclass -class GraphAnalysis: - """Análise do grafo via LLM""" - analysis: str - key_entities: List[str] = field(default_factory=list) - relationships: List[str] = field(default_factory=list) - - -class AetherMapClient: - """ - Client para AetherMap API. - - Funcionalidades: - - Processamento de documentos (embeddings + clusters) - - Busca semântica RAG (FAISS + BM25 + reranking + LLM) - - Extração de entidades NER - - Análise de grafo via LLM - """ - - def __init__(self, base_url: str = None, timeout: float = 600.0): - self.base_url = (base_url or AETHERMAP_URL).rstrip('/') - self.timeout = timeout - self._current_job_id: Optional[str] = None - - @property - def current_job_id(self) -> Optional[str]: - """Retorna o job_id atual""" - return self._current_job_id - - async def process_documents( - self, - texts: List[str], - fast_mode: bool = True, - min_cluster_size: int = 0, - min_samples: int = 0 - ) -> ProcessResult: - """ - Processa uma lista de textos gerando embeddings e clusters. - - Args: - texts: Lista de textos/documentos - fast_mode: Se True, usa PCA (rápido). Se False, usa UMAP (preciso) - min_cluster_size: Tamanho mínimo do cluster (0=auto) - min_samples: Mínimo de amostras (0=auto) - - Returns: - ProcessResult com job_id e métricas - """ - # Criar arquivo TXT em memória - content = "\n".join(texts) - file_bytes = content.encode('utf-8') - - try: - async with httpx.AsyncClient(timeout=self.timeout) as client: - files = { - 'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain') - } - data = { - 'n_samples': str(len(texts)), - 'fast_mode': 'true' if fast_mode else 'false', - 'min_cluster_size': str(min_cluster_size), - 'min_samples': str(min_samples) - } - - logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/") - - response = await client.post( - f"{self.base_url}/process/", - files=files, - data=data - ) - - logger.info(f"AetherMap: Response status {response.status_code}") - - if response.status_code != 200: - error_text = response.text[:500] if response.text else "No response body" - logger.error(f"AetherMap error: {response.status_code} - {error_text}") - raise Exception(f"AetherMap error: {response.status_code} - {error_text}") - - result = response.json() - - self._current_job_id = result.get('job_id') - metadata = result.get('metadata', {}) - - logger.info(f"AetherMap: Job criado {self._current_job_id}") - - return ProcessResult( - job_id=self._current_job_id or "unknown", - num_documents=metadata.get('num_documents_processed', len(texts)), - num_clusters=metadata.get('num_clusters_found', 0), - num_noise=metadata.get('num_noise_points', 0), - metrics=result.get('metrics', {}), - cluster_analysis=result.get('cluster_analysis', {}) - ) - except httpx.TimeoutException: - logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}") - raise Exception(f"Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.") - except httpx.ConnectError as e: - logger.error(f"AetherMap: Erro de conexão: {e}") - raise Exception(f"Erro de conexão com AetherMap: {e}") - except Exception as e: - logger.error(f"AetherMap: Erro inesperado: {e}") - raise - - async def semantic_search( - self, - query: str, - job_id: str = None, - turbo_mode: bool = False - ) -> SearchResult: - """ - Busca semântica RAG híbrida nos documentos processados. - - Args: - query: Termo de busca - job_id: ID do job (se não fornecido, usa o último) - turbo_mode: Se True, busca mais rápida (menos precisa) - - Returns: - SearchResult com resumo e resultados - """ - job_id = job_id or self._current_job_id - if not job_id: - raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.") - - async with httpx.AsyncClient(timeout=self.timeout) as client: - data = { - 'query': query, - 'job_id': job_id, - 'turbo_mode': 'true' if turbo_mode else 'false' - } - - logger.info(f"AetherMap: Buscando '{query}'...") - - response = await client.post( - f"{self.base_url}/search/", - data=data - ) - - if response.status_code != 200: - raise Exception(f"AetherMap search error: {response.status_code} - {response.text}") - - result = response.json() - - return SearchResult( - summary=result.get('summary', ''), - results=result.get('results', []) - ) - - async def extract_entities(self, job_id: str = None) -> EntityGraphResult: - """ - Extrai entidades nomeadas (NER) e cria grafo de conexões. - - Args: - job_id: ID do job (se não fornecido, usa o último) - - Returns: - EntityGraphResult com nós, arestas e insights - """ - job_id = job_id or self._current_job_id - if not job_id: - raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.") - - async with httpx.AsyncClient(timeout=self.timeout) as client: - data = {'job_id': job_id} - - logger.info(f"AetherMap: Extraindo entidades...") - - response = await client.post( - f"{self.base_url}/entity_graph/", - data=data - ) - - if response.status_code != 200: - raise Exception(f"AetherMap entity_graph error: {response.status_code} - {response.text}") - - result = response.json() - - # Converter para dataclasses - nodes = [ - EntityNode( - entity=n.get('entity', ''), - entity_type=n.get('type', ''), - docs=n.get('docs', 0), - degree=n.get('degree', 0), - centrality=n.get('centrality', 0.0), - role=n.get('role', 'peripheral') - ) - for n in result.get('nodes', []) - ] - - edges = [ - EntityEdge( - source_entity=e.get('source_entity', ''), - target_entity=e.get('target_entity', ''), - weight=e.get('weight', 0), - reason=e.get('reason', '') - ) - for e in result.get('edges', []) - ] - - return EntityGraphResult( - nodes=nodes, - edges=edges, - hubs=result.get('hubs', []), - insights=result.get('insights', {}) - ) - - async def analyze_graph(self, job_id: str = None) -> GraphAnalysis: - """ - Usa LLM para analisar o Knowledge Graph e extrair insights. - - Args: - job_id: ID do job (se não fornecido, usa o último) - - Returns: - GraphAnalysis com análise textual - """ - job_id = job_id or self._current_job_id - if not job_id: - raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.") - - async with httpx.AsyncClient(timeout=self.timeout) as client: - data = {'job_id': job_id} - - logger.info(f"AetherMap: Analisando grafo com LLM...") - - response = await client.post( - f"{self.base_url}/analyze_graph/", - data=data - ) - - if response.status_code != 200: - raise Exception(f"AetherMap analyze_graph error: {response.status_code} - {response.text}") - - result = response.json() - - return GraphAnalysis( - analysis=result.get('analysis', ''), - key_entities=result.get('key_entities', []), - relationships=result.get('relationships', []) - ) - - async def describe_clusters(self, job_id: str = None) -> Dict[str, Any]: - """ - Usa LLM para descrever cada cluster encontrado. - - Args: - job_id: ID do job (se não fornecido, usa o último) - - Returns: - Dict com insights por cluster - """ - job_id = job_id or self._current_job_id - if not job_id: - raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.") - - async with httpx.AsyncClient(timeout=self.timeout) as client: - data = {'job_id': job_id} - - logger.info(f"AetherMap: Descrevendo clusters...") - - response = await client.post( - f"{self.base_url}/describe_clusters/", - data=data - ) - - if response.status_code != 200: - raise Exception(f"AetherMap describe_clusters error: {response.status_code} - {response.text}") - - return response.json() - - -# Instância global do client -aethermap = AetherMapClient() diff --git a/app/services/analysis/__init__.py b/app/services/analysis/__init__.py deleted file mode 100644 index 18e8fe19c13e9ec59fb147e63518a8ddbeef5f25..0000000000000000000000000000000000000000 --- a/app/services/analysis/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Analysis services diff --git a/app/services/brazil_apis.py b/app/services/brazil_apis.py deleted file mode 100644 index 3cf938529a35708355664dd05f60288519c0d7df..0000000000000000000000000000000000000000 --- a/app/services/brazil_apis.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -Brazilian Data APIs Service -Consolidates access to public Brazilian data APIs for investigation -""" -import httpx -from typing import Optional, Dict, Any, List -from dataclasses import dataclass, field -import re - - -# API URLs -CNPJA_URL = "https://api.cnpja.com.br/office" -OPENCNPJ_URL = "https://api.opencnpj.org/v1/cnpj" -BRASILAPI_CNPJ = "https://brasilapi.com.br/api/cnpj/v1" -BRASILAPI_CEP = "https://brasilapi.com.br/api/cep/v2" - - -@dataclass -class CompanyData: - """Data structure for company information""" - cnpj: str - razao_social: str = "" - nome_fantasia: str = "" - situacao: str = "" - data_abertura: str = "" - natureza_juridica: str = "" - capital_social: float = 0.0 - porte: str = "" - - # Address - logradouro: str = "" - numero: str = "" - complemento: str = "" - bairro: str = "" - cidade: str = "" - uf: str = "" - cep: str = "" - - # Contact - telefone: str = "" - email: str = "" - - # Activity - cnae_principal: str = "" - cnae_descricao: str = "" - cnaes_secundarios: List[str] = field(default_factory=list) - - # Partners/Owners - socios: List[Dict[str, Any]] = field(default_factory=list) - - # Source - fonte: str = "" - - -def clean_cnpj(cnpj: str) -> str: - """Remove formatting from CNPJ""" - return re.sub(r'[^0-9]', '', cnpj) - - -async def consultar_cnpj(cnpj: str) -> Optional[CompanyData]: - """ - Query CNPJ data from available APIs. - Tries BrasilAPI first (more reliable), then falls back to others. - """ - cnpj_clean = clean_cnpj(cnpj) - - if len(cnpj_clean) != 14: - return None - - # Try BrasilAPI first - result = await _query_brasilapi(cnpj_clean) - if result: - return result - - # Fallback to OpenCNPJ - result = await _query_opencnpj(cnpj_clean) - if result: - return result - - return None - - -async def _query_brasilapi(cnpj: str) -> Optional[CompanyData]: - """Query BrasilAPI for CNPJ data""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get(f"{BRASILAPI_CNPJ}/{cnpj}") - - if response.status_code != 200: - return None - - data = response.json() - - # Parse partners - socios = [] - for socio in data.get("qsa", []): - socios.append({ - "nome": socio.get("nome_socio", ""), - "qualificacao": socio.get("qualificacao_socio", ""), - "cpf_cnpj": socio.get("cnpj_cpf_do_socio", ""), - "data_entrada": socio.get("data_entrada_sociedade", "") - }) - - # Parse CNAEs - cnaes_sec = [] - for cnae in data.get("cnaes_secundarios", []): - if isinstance(cnae, dict): - cnaes_sec.append(f"{cnae.get('codigo', '')} - {cnae.get('descricao', '')}") - else: - cnaes_sec.append(str(cnae)) - - return CompanyData( - cnpj=cnpj, - razao_social=data.get("razao_social", ""), - nome_fantasia=data.get("nome_fantasia", ""), - situacao=data.get("descricao_situacao_cadastral", ""), - data_abertura=data.get("data_inicio_atividade", ""), - natureza_juridica=data.get("natureza_juridica", ""), - capital_social=float(data.get("capital_social", 0)), - porte=data.get("porte", ""), - logradouro=data.get("logradouro", ""), - numero=data.get("numero", ""), - complemento=data.get("complemento", ""), - bairro=data.get("bairro", ""), - cidade=data.get("municipio", ""), - uf=data.get("uf", ""), - cep=data.get("cep", ""), - telefone=data.get("ddd_telefone_1", ""), - email=data.get("email", ""), - cnae_principal=str(data.get("cnae_fiscal", "")), - cnae_descricao=data.get("cnae_fiscal_descricao", ""), - cnaes_secundarios=cnaes_sec, - socios=socios, - fonte="BrasilAPI" - ) - - except Exception as e: - print(f"BrasilAPI error: {e}") - return None - - -async def _query_opencnpj(cnpj: str) -> Optional[CompanyData]: - """Query OpenCNPJ API""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get(f"{OPENCNPJ_URL}/{cnpj}") - - if response.status_code != 200: - return None - - data = response.json() - - # Parse partners - socios = [] - for socio in data.get("socios", []): - socios.append({ - "nome": socio.get("nome", ""), - "qualificacao": socio.get("qualificacao", ""), - "cpf_cnpj": "", - "data_entrada": socio.get("data_entrada", "") - }) - - return CompanyData( - cnpj=cnpj, - razao_social=data.get("razao_social", ""), - nome_fantasia=data.get("nome_fantasia", ""), - situacao=data.get("situacao_cadastral", ""), - data_abertura=data.get("data_inicio_atividade", ""), - natureza_juridica=data.get("natureza_juridica", ""), - capital_social=float(data.get("capital_social", 0) or 0), - porte=data.get("porte", ""), - logradouro=data.get("logradouro", ""), - numero=data.get("numero", ""), - complemento=data.get("complemento", ""), - bairro=data.get("bairro", ""), - cidade=data.get("municipio", ""), - uf=data.get("uf", ""), - cep=data.get("cep", ""), - telefone=data.get("telefone", ""), - email=data.get("email", ""), - cnae_principal=data.get("cnae_principal", {}).get("codigo", ""), - cnae_descricao=data.get("cnae_principal", {}).get("descricao", ""), - cnaes_secundarios=[], - socios=socios, - fonte="OpenCNPJ" - ) - - except Exception as e: - print(f"OpenCNPJ error: {e}") - return None - - -async def consultar_cep(cep: str) -> Optional[Dict[str, Any]]: - """Query address by CEP""" - cep_clean = re.sub(r'[^0-9]', '', cep) - - try: - async with httpx.AsyncClient(timeout=15.0) as client: - response = await client.get(f"{BRASILAPI_CEP}/{cep_clean}") - - if response.status_code != 200: - return None - - return response.json() - - except Exception as e: - print(f"CEP query error: {e}") - return None - - -async def buscar_empresas_por_nome(nome: str, uf: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Search companies by name using web search (via Lancer). - This is a workaround since direct name search APIs are paid. - """ - # This would need Lancer integration for web search - # For now, return empty - will be filled by investigation service - return [] diff --git a/app/services/chat.py b/app/services/chat.py deleted file mode 100644 index 89595f334653e11a19d2103c28ccfaeb97110844..0000000000000000000000000000000000000000 --- a/app/services/chat.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -Chat Service - Intelligent chat with RAG capabilities -Uses local database + Lancer for comprehensive responses -""" -import httpx -from typing import Optional, List, Dict, Any -from sqlalchemy.orm import Session - -from app.config import settings -from app.models.entity import Entity, Relationship - - -LANCER_URL = "https://madras1-lancer.hf.space/api/v1" - -SYSTEM_PROMPT = """Você é um assistente de inteligência do NUMIDIUM. -Você tem acesso a um grafo de conhecimento com entidades e relacionamentos, -e pode pesquisar na web para informações atualizadas. - -Responda em português brasileiro de forma clara e direta. -Se não tiver certeza, diga que não sabe em vez de inventar.""" - - -class ChatService: - """Chat service with RAG using local database and Lancer""" - - def __init__(self): - self.api_url = "https://api.cerebras.ai/v1/chat/completions" - self.conversation_history: Dict[str, List[Dict[str, str]]] = {} - - def _get_history(self, session_id: Optional[str]) -> List[Dict[str, str]]: - key = session_id or "default" - if key not in self.conversation_history: - self.conversation_history[key] = [] - return self.conversation_history[key] - - def clear_history(self, session_id: Optional[str] = None): - """Clear conversation history""" - key = session_id or "default" - self.conversation_history.pop(key, None) - - def _get_local_context(self, query: str, db: Session, limit: int = 5) -> str: - """Get relevant entities from local database""" - # Search entities by name - entities = db.query(Entity).filter( - Entity.name.ilike(f"%{query}%") - ).limit(limit).all() - - # Also search by description - if len(entities) < limit: - desc_entities = db.query(Entity).filter( - Entity.description.ilike(f"%{query}%") - ).limit(limit - len(entities)).all() - entities.extend(desc_entities) - - if not entities: - # Try splitting query into words - words = query.split() - for word in words: - if len(word) > 3: - word_entities = db.query(Entity).filter( - Entity.name.ilike(f"%{word}%") - ).limit(2).all() - entities.extend(word_entities) - - if not entities: - return "" - - context_parts = [] - seen_ids = set() - - for entity in entities: - if entity.id in seen_ids: - continue - seen_ids.add(entity.id) - - ctx = f"• {entity.name} ({entity.type})" - if entity.description: - ctx += f": {entity.description[:200]}" - - # Get relationships - relationships = db.query(Relationship).filter( - (Relationship.source_id == entity.id) | - (Relationship.target_id == entity.id) - ).limit(5).all() - - if relationships: - related = [] - for rel in relationships: - if rel.source_id == entity.id: - target = db.query(Entity).filter(Entity.id == rel.target_id).first() - if target: - related.append(f"{rel.type} → {target.name}") - else: - source = db.query(Entity).filter(Entity.id == rel.source_id).first() - if source: - related.append(f"{source.name} → {rel.type}") - - if related: - ctx += f" | Relações: {', '.join(related[:3])}" - - context_parts.append(ctx) - - return "\n".join(context_parts) - - async def _get_web_context(self, query: str) -> str: - """Get context from Lancer web search""" - try: - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.post( - f"{LANCER_URL}/search", - json={ - "query": query, - "max_results": 5, - "include_answer": True - } - ) - - if response.status_code == 200: - data = response.json() - if data.get("answer"): - return f"Informações da web:\n{data['answer'][:1000]}" - - return "" - except Exception as e: - print(f"Lancer error: {e}") - return "" - - async def _call_llm(self, messages: List[Dict[str, str]]) -> str: - """Call Cerebras LLM""" - try: - async with httpx.AsyncClient(timeout=60.0) as client: - response = await client.post( - self.api_url, - headers={ - "Authorization": f"Bearer {settings.cerebras_api_key}", - "Content-Type": "application/json" - }, - json={ - "model": "qwen-3-32b", - "messages": messages, - "temperature": 0.7, - "max_tokens": 2048 - } - ) - - if response.status_code == 200: - data = response.json() - return data["choices"][0]["message"]["content"] - else: - return f"Erro na API: {response.status_code}" - - except Exception as e: - return f"Erro: {str(e)}" - - async def chat( - self, - message: str, - db: Session, - use_web: bool = True, - use_history: bool = True, - session_id: Optional[str] = None - ) -> Dict[str, Any]: - """Process chat message with RAG""" - history = self._get_history(session_id) - - # Get local context - local_context = self._get_local_context(message, db) - - # Get web context if enabled - web_context = "" - if use_web: - web_context = await self._get_web_context(message) - - # Build context - context_parts = [] - if local_context: - context_parts.append(f"📊 Conhecimento local:\n{local_context}") - if web_context: - context_parts.append(f"🌐 {web_context}") - - context = "\n\n".join(context_parts) if context_parts else "Nenhum contexto disponível." - - # Build messages - messages = [{"role": "system", "content": SYSTEM_PROMPT}] - - if use_history and history: - messages.extend(history[-6:]) - - user_message = f"""Contexto: -{context} - -Pergunta: {message}""" - - messages.append({"role": "user", "content": user_message}) - - # Call LLM - response = await self._call_llm(messages) - - # Store history - if use_history: - history.append({"role": "user", "content": message}) - history.append({"role": "assistant", "content": response}) - - return { - "answer": response, - "local_context_used": bool(local_context), - "web_context_used": bool(web_context), - "entities_found": local_context.count("•") if local_context else 0 - } - - -# Singleton -chat_service = ChatService() diff --git a/app/services/geocoding.py b/app/services/geocoding.py deleted file mode 100644 index 06863f2be60350c5cd8251ca8cfa063809135cea..0000000000000000000000000000000000000000 --- a/app/services/geocoding.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Geocoding Service - Uses Nominatim (OpenStreetMap) for free geocoding -""" -import httpx -from typing import Optional, Tuple -import asyncio - - -NOMINATIM_URL = "https://nominatim.openstreetmap.org/search" -USER_AGENT = "NUMIDIUM/1.0 (Intelligence System)" - - -async def geocode(location_name: str) -> Optional[Tuple[float, float]]: - """ - Convert a location name to coordinates using Nominatim. - Returns (latitude, longitude) or None if not found. - - Note: Nominatim has rate limits (1 request/second), so be careful with batch operations. - """ - try: - async with httpx.AsyncClient(timeout=10.0) as client: - response = await client.get( - NOMINATIM_URL, - params={ - "q": location_name, - "format": "json", - "limit": 1, - "addressdetails": 0 - }, - headers={ - "User-Agent": USER_AGENT - } - ) - - if response.status_code == 200: - data = response.json() - if data and len(data) > 0: - lat = float(data[0]["lat"]) - lon = float(data[0]["lon"]) - return (lat, lon) - - return None - - except Exception as e: - print(f"Geocoding error for '{location_name}': {e}") - return None - - -async def geocode_batch(location_names: list[str], delay: float = 1.0) -> dict[str, Tuple[float, float]]: - """ - Geocode multiple locations with proper rate limiting. - Returns a dict mapping location names to (lat, lon) tuples. - """ - results = {} - - for name in location_names: - coords = await geocode(name) - if coords: - results[name] = coords - # Respect Nominatim rate limits - await asyncio.sleep(delay) - - return results diff --git a/app/services/ibge_api.py b/app/services/ibge_api.py deleted file mode 100644 index 26d5000ed2798dfe0f7a1ce55603f305dad74783..0000000000000000000000000000000000000000 --- a/app/services/ibge_api.py +++ /dev/null @@ -1,192 +0,0 @@ -""" -IBGE API Service -Access to Brazilian geographic and demographic data -""" -import httpx -from typing import Optional, Dict, Any, List -from dataclasses import dataclass - - -IBGE_BASE_URL = "https://servicodados.ibge.gov.br/api/v1" - - -@dataclass -class Estado: - """Brazilian state data""" - id: int - sigla: str - nome: str - regiao: str - - -@dataclass -class Municipio: - """Brazilian municipality data""" - id: int - nome: str - estado_sigla: str - estado_nome: str - regiao: str - # Optional enriched data - populacao: Optional[int] = None - area_km2: Optional[float] = None - - -async def listar_estados() -> List[Estado]: - """List all Brazilian states""" - try: - async with httpx.AsyncClient(timeout=15.0) as client: - response = await client.get(f"{IBGE_BASE_URL}/localidades/estados") - - if response.status_code != 200: - return [] - - data = response.json() - estados = [] - - for item in data: - estados.append(Estado( - id=item["id"], - sigla=item["sigla"], - nome=item["nome"], - regiao=item.get("regiao", {}).get("nome", "") - )) - - return sorted(estados, key=lambda x: x.nome) - - except Exception as e: - print(f"IBGE estados error: {e}") - return [] - - -async def listar_municipios(uf: str) -> List[Municipio]: - """List all municipalities in a state""" - try: - async with httpx.AsyncClient(timeout=15.0) as client: - response = await client.get( - f"{IBGE_BASE_URL}/localidades/estados/{uf}/municipios" - ) - - if response.status_code != 200: - return [] - - data = response.json() - municipios = [] - - for item in data: - municipios.append(Municipio( - id=item["id"], - nome=item["nome"], - estado_sigla=uf.upper(), - estado_nome=item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}).get("nome", ""), - regiao=item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}).get("regiao", {}).get("nome", "") - )) - - return sorted(municipios, key=lambda x: x.nome) - - except Exception as e: - print(f"IBGE municipios error: {e}") - return [] - - -async def buscar_municipio(nome: str, uf: Optional[str] = None) -> List[Municipio]: - """Search for municipalities by name""" - try: - # If UF provided, search only that state - if uf: - municipios = await listar_municipios(uf) - return [m for m in municipios if nome.lower() in m.nome.lower()] - - # Otherwise search all states (slower) - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get(f"{IBGE_BASE_URL}/localidades/municipios") - - if response.status_code != 200: - return [] - - data = response.json() - results = [] - - for item in data: - if nome.lower() in item["nome"].lower(): - uf_info = item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}) - results.append(Municipio( - id=item["id"], - nome=item["nome"], - estado_sigla=uf_info.get("sigla", ""), - estado_nome=uf_info.get("nome", ""), - regiao=uf_info.get("regiao", {}).get("nome", "") - )) - - return results[:20] # Limit results - - except Exception as e: - print(f"IBGE search error: {e}") - return [] - - -async def obter_municipio_por_id(id_municipio: int) -> Optional[Municipio]: - """Get municipality by IBGE code""" - try: - async with httpx.AsyncClient(timeout=15.0) as client: - response = await client.get( - f"{IBGE_BASE_URL}/localidades/municipios/{id_municipio}" - ) - - if response.status_code != 200: - return None - - item = response.json() - uf_info = item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}) - - return Municipio( - id=item["id"], - nome=item["nome"], - estado_sigla=uf_info.get("sigla", ""), - estado_nome=uf_info.get("nome", ""), - regiao=uf_info.get("regiao", {}).get("nome", "") - ) - - except Exception as e: - print(f"IBGE municipio error: {e}") - return None - - -async def enriquecer_localizacao(cidade: str, uf: Optional[str] = None) -> Dict[str, Any]: - """ - Enrich a location name with IBGE data. - Useful for adding context to extracted locations. - """ - resultado = { - "cidade_original": cidade, - "encontrado": False, - "ibge_codigo": None, - "cidade": None, - "estado": None, - "estado_sigla": None, - "regiao": None - } - - municipios = await buscar_municipio(cidade, uf) - - if municipios: - # Take best match (exact or first) - melhor = None - for m in municipios: - if m.nome.lower() == cidade.lower(): - melhor = m - break - - if not melhor: - melhor = municipios[0] - - resultado.update({ - "encontrado": True, - "ibge_codigo": melhor.id, - "cidade": melhor.nome, - "estado": melhor.estado_nome, - "estado_sigla": melhor.estado_sigla, - "regiao": melhor.regiao - }) - - return resultado diff --git a/app/services/ingestion/__init__.py b/app/services/ingestion/__init__.py deleted file mode 100644 index 53751fc389795a6893e21379a16b0680f55cda41..0000000000000000000000000000000000000000 --- a/app/services/ingestion/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Ingestion services -from app.services.ingestion.wikipedia import wikipedia_scraper -from app.services.ingestion.news import news_service diff --git a/app/services/ingestion/__pycache__/__init__.cpython-311.pyc b/app/services/ingestion/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index de09d686a52c85f16de0eac33cbd28ca9065604d..0000000000000000000000000000000000000000 Binary files a/app/services/ingestion/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/services/ingestion/__pycache__/news.cpython-311.pyc b/app/services/ingestion/__pycache__/news.cpython-311.pyc deleted file mode 100644 index 47a4ad23456ff8907ab2a47285b1b74cd099a8fe..0000000000000000000000000000000000000000 Binary files a/app/services/ingestion/__pycache__/news.cpython-311.pyc and /dev/null differ diff --git a/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc b/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc deleted file mode 100644 index 215244f9f9e1bdf8dc6071c4e0237f41318f352a..0000000000000000000000000000000000000000 Binary files a/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc and /dev/null differ diff --git a/app/services/ingestion/news.py b/app/services/ingestion/news.py deleted file mode 100644 index 1aba8df40e8cfb6d2cc19900fea89cf6ce04cf14..0000000000000000000000000000000000000000 --- a/app/services/ingestion/news.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -News API Client Service -Usa RSS feeds públicos para não precisar de API key -""" -import feedparser -import requests -from typing import List, Dict -from datetime import datetime -import re - - -class NewsService: - """Serviço para buscar notícias de fontes públicas via RSS""" - - # RSS feeds públicos brasileiros e internacionais - RSS_FEEDS = { - "g1": "https://g1.globo.com/rss/g1/", - "folha": "https://feeds.folha.uol.com.br/folha/rss/rss091.xml", - "bbc_brasil": "https://www.bbc.com/portuguese/articles/rss.xml", - "reuters": "https://www.reutersagency.com/feed/", - "google_news_br": "https://news.google.com/rss?hl=pt-BR&gl=BR&ceid=BR:pt-419" - } - - def fetch_feed(self, feed_url: str) -> List[Dict]: - """Busca artigos de um feed RSS""" - try: - feed = feedparser.parse(feed_url) - articles = [] - - for entry in feed.entries[:20]: # Limitar a 20 artigos - published = None - if hasattr(entry, 'published_parsed') and entry.published_parsed: - published = datetime(*entry.published_parsed[:6]) - - articles.append({ - "title": entry.get("title", ""), - "description": self._clean_html(entry.get("summary", "")), - "url": entry.get("link", ""), - "published_at": published, - "source": feed.feed.get("title", "Unknown") - }) - - return articles - except Exception as e: - print(f"Error fetching feed {feed_url}: {e}") - return [] - - def fetch_all_feeds(self) -> List[Dict]: - """Busca artigos de todos os feeds configurados""" - all_articles = [] - for name, url in self.RSS_FEEDS.items(): - articles = self.fetch_feed(url) - for article in articles: - article["feed_name"] = name - all_articles.extend(articles) - return all_articles - - def search_news(self, query: str) -> List[Dict]: - """ - Busca notícias pelo Google News RSS - """ - # Google News RSS search - search_url = f"https://news.google.com/rss/search?q={query}&hl=pt-BR&gl=BR&ceid=BR:pt-419" - return self.fetch_feed(search_url) - - def _clean_html(self, text: str) -> str: - """Remove HTML tags do texto""" - clean = re.compile('<.*?>') - return re.sub(clean, '', text) - - def to_document(self, article: Dict) -> Dict: - """ - Converte um artigo de notícia para o formato Document - """ - return { - "title": article["title"], - "content": article.get("description", ""), - "doc_type": "news", - "source": article.get("source", "news"), - "source_url": article.get("url"), - "published_at": article.get("published_at") - } - - -# Singleton instance -news_service = NewsService() diff --git a/app/services/ingestion/wikipedia.py b/app/services/ingestion/wikipedia.py deleted file mode 100644 index 2c64a6f77d4bcd406506966ad4b1c3a75972a8e3..0000000000000000000000000000000000000000 --- a/app/services/ingestion/wikipedia.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Wikipedia Scraper Service -""" -import requests -from bs4 import BeautifulSoup -from typing import Optional, Dict, List -import re - - -class WikipediaScraper: - """Scraper para extrair dados da Wikipedia""" - - BASE_URL = "https://pt.wikipedia.org" - API_URL = "https://pt.wikipedia.org/w/api.php" - - # User-Agent obrigatório para API da Wikipedia - HEADERS = { - "User-Agent": "NumidiumBot/1.0 (https://github.com/numidium; contact@numidium.app) Python/3.11" - } - - def search(self, query: str, limit: int = 10) -> List[Dict]: - """ - Busca artigos na Wikipedia - """ - try: - params = { - "action": "query", - "list": "search", - "srsearch": query, - "srlimit": limit, - "format": "json" - } - - response = requests.get( - self.API_URL, - params=params, - headers=self.HEADERS, - timeout=10 - ) - response.raise_for_status() - data = response.json() - - results = [] - for item in data.get("query", {}).get("search", []): - results.append({ - "title": item["title"], - "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(), - "pageid": item["pageid"] - }) - - return results - except Exception as e: - print(f"Wikipedia search error: {e}") - return [] - - def get_article(self, title: str) -> Optional[Dict]: - """ - Busca informações completas de um artigo - """ - try: - params = { - "action": "query", - "titles": title, - "prop": "extracts|pageimages|coordinates|categories", - "exintro": True, - "explaintext": True, - "pithumbsize": 300, - "format": "json" - } - - response = requests.get( - self.API_URL, - params=params, - headers=self.HEADERS, - timeout=10 - ) - response.raise_for_status() - data = response.json() - - pages = data.get("query", {}).get("pages", {}) - for page_id, page in pages.items(): - if page_id == "-1": - return None - - result = { - "title": page.get("title"), - "extract": page.get("extract"), - "pageid": page.get("pageid"), - "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}", - "thumbnail": page.get("thumbnail", {}).get("source"), - "categories": [c["title"].replace("Categoria:", "") - for c in page.get("categories", [])] - } - - # Coordenadas se disponíveis - if "coordinates" in page: - coords = page["coordinates"][0] - result["latitude"] = coords.get("lat") - result["longitude"] = coords.get("lon") - - return result - - return None - except Exception as e: - print(f"Wikipedia article error: {e}") - return None - - def get_infobox(self, title: str) -> Dict: - """ - Tenta extrair dados estruturados do infobox de um artigo - """ - try: - url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}" - response = requests.get(url, headers=self.HEADERS, timeout=10) - soup = BeautifulSoup(response.text, "html.parser") - - infobox = soup.find("table", class_="infobox") - if not infobox: - return {} - - data = {} - for row in infobox.find_all("tr"): - header = row.find("th") - cell = row.find("td") - if header and cell: - key = header.get_text(strip=True) - value = cell.get_text(strip=True) - # Clean up the value - value = re.sub(r'\[\d+\]', '', value) # Remove references - data[key] = value - - return data - except Exception as e: - print(f"Infobox error: {e}") - return {} - - def scrape_person(self, name: str) -> Optional[Dict]: - """ - Scrape dados de uma pessoa da Wikipedia - Retorna dados formatados para criar uma Entity - """ - article = self.get_article(name) - if not article: - return None - - infobox = self.get_infobox(name) - - return { - "type": "person", - "name": article["title"], - "description": article.get("extract"), - "source": "wikipedia", - "source_url": article["url"], - "properties": { - "thumbnail": article.get("thumbnail"), - "categories": article.get("categories", []), - **infobox - }, - "latitude": article.get("latitude"), - "longitude": article.get("longitude") - } - - def scrape_organization(self, name: str) -> Optional[Dict]: - """ - Scrape dados de uma organização da Wikipedia - """ - article = self.get_article(name) - if not article: - return None - - infobox = self.get_infobox(name) - - return { - "type": "organization", - "name": article["title"], - "description": article.get("extract"), - "source": "wikipedia", - "source_url": article["url"], - "properties": { - "thumbnail": article.get("thumbnail"), - "categories": article.get("categories", []), - **infobox - }, - "latitude": article.get("latitude"), - "longitude": article.get("longitude") - } - - def scrape_location(self, name: str) -> Optional[Dict]: - """ - Scrape dados de um local da Wikipedia - """ - article = self.get_article(name) - if not article: - return None - - infobox = self.get_infobox(name) - - return { - "type": "location", - "name": article["title"], - "description": article.get("extract"), - "source": "wikipedia", - "source_url": article["url"], - "properties": { - "thumbnail": article.get("thumbnail"), - "categories": article.get("categories", []), - **infobox - }, - "latitude": article.get("latitude"), - "longitude": article.get("longitude") - } - - -# Singleton instance -wikipedia_scraper = WikipediaScraper() diff --git a/app/services/investigation.py b/app/services/investigation.py deleted file mode 100644 index cfbfc764bbd26579e5e92285959e952e3d8afa7e..0000000000000000000000000000000000000000 --- a/app/services/investigation.py +++ /dev/null @@ -1,324 +0,0 @@ -""" -Investigation Service - Builds comprehensive dossiers -Combines CNPJ data, transparency/sanctions, Lancer web search, and NER -""" -import httpx -from typing import Optional, Dict, Any, List -from dataclasses import dataclass, field, asdict -import asyncio - -from app.services.brazil_apis import consultar_cnpj, CompanyData -from app.services.transparencia_api import verificar_sancoes -# from app.services.tse_api import buscar_politico # TSE API needs fixing -from app.services import lancer -from app.services.nlp import entity_extractor -from app.core.database import get_db -from app.models.entity import Entity, Relationship - - -LANCER_URL = "https://madras1-lancer.hf.space/api/v1" - - -@dataclass -class DossierSection: - """A section of the dossier""" - titulo: str - conteudo: Any - status: str = "ok" # ok, warning, danger, info - icone: str = "📋" - - -@dataclass -class Dossier: - """Complete investigation dossier""" - tipo: str # "organization" or "person" - alvo: str # Target name - cnpj_cpf: Optional[str] = None - - # Sections - dados_cadastrais: Optional[DossierSection] = None - socios: Optional[DossierSection] = None - sancoes: Optional[DossierSection] = None - dados_politicos: Optional[DossierSection] = None # TSE data - noticias: Optional[DossierSection] = None - entidades_relacionadas: Optional[DossierSection] = None - - # Metadata - red_flags: List[str] = field(default_factory=list) - score_risco: int = 0 # 0-100 - data_geracao: str = "" - fonte_dados: List[str] = field(default_factory=list) - - -async def investigar_empresa(nome_ou_cnpj: str) -> Dossier: - """ - Investigate a company and build a comprehensive dossier. - """ - import re - from datetime import datetime - - dossier = Dossier( - tipo="organization", - alvo=nome_ou_cnpj, - data_geracao=datetime.now().isoformat() - ) - - # Check if input is CNPJ - cnpj_clean = re.sub(r'[^0-9]', '', nome_ou_cnpj) - is_cnpj = len(cnpj_clean) == 14 - - company_data = None - - # 1. Get company data from CNPJ - if is_cnpj: - dossier.cnpj_cpf = cnpj_clean - company_data = await consultar_cnpj(cnpj_clean) - - if company_data: - dossier.alvo = company_data.razao_social or company_data.nome_fantasia or nome_ou_cnpj - dossier.fonte_dados.append(company_data.fonte) - - # Build cadastral section - dossier.dados_cadastrais = DossierSection( - titulo="Dados Cadastrais", - icone="🏢", - conteudo={ - "cnpj": company_data.cnpj, - "razao_social": company_data.razao_social, - "nome_fantasia": company_data.nome_fantasia, - "situacao": company_data.situacao, - "data_abertura": company_data.data_abertura, - "natureza_juridica": company_data.natureza_juridica, - "capital_social": company_data.capital_social, - "porte": company_data.porte, - "endereco": f"{company_data.logradouro}, {company_data.numero} - {company_data.bairro}, {company_data.cidade}/{company_data.uf}", - "cep": company_data.cep, - "telefone": company_data.telefone, - "email": company_data.email, - "atividade_principal": f"{company_data.cnae_principal} - {company_data.cnae_descricao}" - } - ) - - # Check situação for red flags - if company_data.situacao and "ATIVA" not in company_data.situacao.upper(): - dossier.red_flags.append(f"⚠️ Situação cadastral: {company_data.situacao}") - dossier.dados_cadastrais.status = "warning" - - # Build partners section - if company_data.socios: - dossier.socios = DossierSection( - titulo=f"Sócios ({len(company_data.socios)})", - icone="👥", - conteudo=company_data.socios - ) - - # 2. Check sanctions/transparency - if dossier.cnpj_cpf: - sancoes = await verificar_sancoes(dossier.cnpj_cpf) - dossier.fonte_dados.append("Portal da Transparência") - - if sancoes["tem_sancoes"]: - dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções") - dossier.score_risco += 40 - - dossier.sancoes = DossierSection( - titulo=f"Sanções ({sancoes['total_sancoes']})", - icone="⚠️", - status="danger", - conteudo=sancoes - ) - else: - dossier.sancoes = DossierSection( - titulo="Sanções", - icone="✅", - status="ok", - conteudo={"mensagem": "Nenhuma sanção encontrada nos cadastros públicos"} - ) - - # 3. Web search for news and context - search_query = dossier.alvo - if company_data and company_data.nome_fantasia: - search_query = company_data.nome_fantasia - - try: - web_result = await lancer.search(f"{search_query} notícias escândalos processos", max_results=8) - - if web_result.answer or web_result.results: - dossier.fonte_dados.append("Lancer Web Search") - - news_content = { - "resumo": web_result.answer or "Sem resumo disponível", - "fontes": [ - {"titulo": r.title, "url": r.url, "snippet": r.content[:200]} - for r in web_result.results[:5] - ] - } - - dossier.noticias = DossierSection( - titulo="Notícias e Mídia", - icone="📰", - conteudo=news_content - ) - - # Check for negative keywords in news - negative_keywords = ["escândalo", "fraude", "corrupção", "prisão", "investigado", "denúncia", "irregularidade"] - raw_text = (web_result.answer or "").lower() - for kw in negative_keywords: - if kw in raw_text: - dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada nas notícias") - dossier.noticias.status = "warning" - dossier.score_risco += 10 - break - except Exception as e: - print(f"Web search error: {e}") - - # 4. Extract related entities using NER - if dossier.noticias and dossier.noticias.conteudo.get("resumo"): - try: - text_to_analyze = dossier.noticias.conteudo.get("resumo", "")[:3000] - ner_result = await entity_extractor.extract(text_to_analyze) - - if ner_result.entities: - entities = [ - {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role} - for e in ner_result.entities[:10] - ] - - dossier.entidades_relacionadas = DossierSection( - titulo=f"Entidades Relacionadas ({len(entities)})", - icone="🔗", - conteudo=entities - ) - except Exception as e: - print(f"NER error: {e}") - - # Calculate final risk score - dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5) - - return dossier - - -async def investigar_pessoa(nome: str, cpf: Optional[str] = None) -> Dossier: - """ - Investigate a person and build a dossier. - Note: CPF data is heavily protected by LGPD, so mainly uses web search. - """ - from datetime import datetime - - dossier = Dossier( - tipo="person", - alvo=nome, - cnpj_cpf=cpf, - data_geracao=datetime.now().isoformat() - ) - - # 1. Check sanctions if CPF provided - if cpf: - sancoes = await verificar_sancoes(cpf) - dossier.fonte_dados.append("Portal da Transparência") - - if sancoes["tem_sancoes"]: - dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções") - dossier.score_risco += 50 - - dossier.sancoes = DossierSection( - titulo=f"Sanções ({sancoes['total_sancoes']})", - icone="⚠️", - status="danger", - conteudo=sancoes - ) - - # 2. Check TSE for political data (DISABLED - API needs fixing) - # try: - # tse_data = await buscar_politico(nome) - # if tse_data.get("encontrado"): - # dossier.fonte_dados.append("TSE (DivulgaCand)") - # candidaturas = tse_data.get("candidaturas", []) - # patrimonio = tse_data.get("total_patrimonio", 0) - # partidos = tse_data.get("partidos", []) - # dossier.dados_politicos = DossierSection(...) - # except Exception as e: - # print(f"TSE search error: {e}") - - - # 3. Web search for information - try: - web_result = await lancer.search(f'"{nome}" biografia cargo empresa', max_results=10) - - if web_result.answer or web_result.results: - dossier.fonte_dados.append("Lancer Web Search") - - dossier.noticias = DossierSection( - titulo="Informações Públicas", - icone="🌐", - conteudo={ - "resumo": web_result.answer or "Informações limitadas", - "fontes": [ - {"titulo": r.title, "url": r.url, "snippet": r.content[:200]} - for r in web_result.results[:5] - ] - } - ) - - # Check for negative keywords - negative_keywords = ["preso", "condenado", "investigado", "acusado", "escândalo", "fraude"] - raw_text = (web_result.answer or "").lower() - for kw in negative_keywords: - if kw in raw_text: - dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada") - dossier.noticias.status = "warning" - dossier.score_risco += 15 - break - except Exception as e: - print(f"Web search error: {e}") - - # 3. Extract related entities - if dossier.noticias and dossier.noticias.conteudo.get("resumo"): - try: - ner_result = await entity_extractor.extract(dossier.noticias.conteudo["resumo"][:2000]) - - if ner_result.entities: - entities = [ - {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role} - for e in ner_result.entities[:10] - if e.name.lower() != nome.lower() # Exclude the target - ] - - if entities: - dossier.entidades_relacionadas = DossierSection( - titulo=f"Conexões ({len(entities)})", - icone="🔗", - conteudo=entities - ) - except Exception as e: - print(f"NER error: {e}") - - dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5) - - return dossier - - -def dossier_to_dict(dossier: Dossier) -> Dict[str, Any]: - """Convert dossier to dictionary for JSON response""" - result = { - "tipo": dossier.tipo, - "alvo": dossier.alvo, - "cnpj_cpf": dossier.cnpj_cpf, - "red_flags": dossier.red_flags, - "score_risco": dossier.score_risco, - "data_geracao": dossier.data_geracao, - "fonte_dados": dossier.fonte_dados, - "secoes": {} - } - - for field_name in ["dados_cadastrais", "socios", "sancoes", "dados_politicos", "noticias", "entidades_relacionadas"]: - section = getattr(dossier, field_name) - if section: - result["secoes"][field_name] = { - "titulo": section.titulo, - "icone": section.icone, - "status": section.status, - "conteudo": section.conteudo - } - - return result diff --git a/app/services/investigator_agent.py b/app/services/investigator_agent.py deleted file mode 100644 index 56b74ad4c994947ed35f3185df53fc586a4232cc..0000000000000000000000000000000000000000 --- a/app/services/investigator_agent.py +++ /dev/null @@ -1,659 +0,0 @@ -""" -Investigator Agent - Autonomous Investigation with Tool Calling -Uses Cerebras native tool calling for multi-source investigations -""" -import json -import re -import httpx -from typing import Optional, List, Dict, Any -from dataclasses import dataclass, field -from datetime import datetime -from sqlalchemy.orm import Session - -from app.config import settings -from app.services import lancer -from app.services.brazil_apis import consultar_cnpj -from app.models.entity import Entity, Relationship - - -def sanitize_text(text: str) -> str: - """ - Clean up text from model that may contain thinking artifacts. - Only removes thinking tags, does NOT remove valid characters. - """ - if not text: - return text - - # Remove thinking tags and content between them - text = re.sub(r'.*?', '', text, flags=re.DOTALL) - text = re.sub(r'<\|think\|>.*?<\|/think\|>', '', text, flags=re.DOTALL) - - # Remove other common model artifacts like <|...|> tags - text = re.sub(r'<\|.*?\|>', '', text) - - # Clean up excessive newlines only - text = re.sub(r'\n{3,}', '\n\n', text) - - return text.strip() - - -@dataclass -class Finding: - """A discovery made during investigation""" - title: str - content: str - source: str - timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - - -@dataclass -class InvestigationResult: - """Complete investigation result""" - mission: str - findings: List[Finding] - entities_discovered: List[Dict[str, Any]] - connections_mapped: List[Dict[str, Any]] - report: str - iterations: int - tools_used: List[str] - status: str = "completed" - - -# Tool definitions for Cerebras API -TOOLS = [ - { - "type": "function", - "function": { - "name": "search_entity", - "description": "Buscar entidade no NUMIDIUM (grafo de conhecimento) por nome. Use para encontrar pessoas, empresas ou locais já conhecidos.", - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "Nome ou termo para buscar" - }, - "entity_type": { - "type": "string", - "enum": ["person", "organization", "location", "any"], - "description": "Tipo de entidade (opcional)" - } - }, - "required": ["query"] - } - } - }, - { - "type": "function", - "function": { - "name": "get_connections", - "description": "Obter a rede de conexões de uma entidade específica. Retorna entidades relacionadas.", - "parameters": { - "type": "object", - "properties": { - "entity_id": { - "type": "string", - "description": "ID da entidade no NUMIDIUM" - } - }, - "required": ["entity_id"] - } - } - }, - { - "type": "function", - "function": { - "name": "lookup_cnpj", - "description": "Consultar dados de uma empresa brasileira pelo CNPJ. Retorna razão social, sócios, endereço, CNAEs, etc.", - "parameters": { - "type": "object", - "properties": { - "cnpj": { - "type": "string", - "description": "CNPJ da empresa (com ou sem formatação)" - } - }, - "required": ["cnpj"] - } - } - }, - { - "type": "function", - "function": { - "name": "web_search", - "description": "Pesquisar informações na web. Use para buscar notícias, artigos e informações públicas.", - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "Termo de busca" - }, - "freshness": { - "type": "string", - "enum": ["day", "week", "month", "any"], - "description": "Frescor dos resultados", - "default": "any" - } - }, - "required": ["query"] - } - } - }, - { - "type": "function", - "function": { - "name": "deep_research", - "description": "Pesquisa profunda e multi-dimensional sobre um tema. Use para tópicos complexos.", - "parameters": { - "type": "object", - "properties": { - "topic": { - "type": "string", - "description": "Tópico para pesquisa profunda" - } - }, - "required": ["topic"] - } - } - }, - { - "type": "function", - "function": { - "name": "save_finding", - "description": "Salvar uma descoberta importante da investigação.", - "parameters": { - "type": "object", - "properties": { - "title": { - "type": "string", - "description": "Título curto da descoberta" - }, - "content": { - "type": "string", - "description": "Conteúdo detalhado" - }, - "source": { - "type": "string", - "description": "Fonte da informação" - } - }, - "required": ["title", "content", "source"] - } - } - }, - { - "type": "function", - "function": { - "name": "finish_investigation", - "description": "Finalizar a investigação e gerar o relatório final.", - "parameters": { - "type": "object", - "properties": { - "summary": { - "type": "string", - "description": "Resumo das descobertas principais" - } - }, - "required": ["summary"] - } - } - } -] - - -SYSTEM_PROMPT = """Você é um agente investigador autônomo do sistema NUMIDIUM/AVANGARD. /no_think - -Sua missão é investigar temas usando múltiplas fontes de dados: -- NUMIDIUM: Grafo de conhecimento com entidades e relacionamentos -- Consulta CNPJ: Dados oficiais de empresas brasileiras (BrasilAPI) -- Web Search: Pesquisa na internet via Lancer - -## Estratégia de Investigação: - -1. Comece buscando no NUMIDIUM se já temos informações sobre o alvo -2. Para empresas brasileiras, consulte o CNPJ para obter sócios e dados -3. Use web_search para buscar notícias e informações públicas -4. Para cada sócio/conexão descoberta, considere investigar mais a fundo -5. Use save_finding para registrar descobertas importantes -6. Quando tiver informações suficientes, use finish_investigation - -## Regras: -- Seja metódico e siga pistas -- Não invente informações - use apenas dados das ferramentas -- Priorize qualidade sobre quantidade -- Cite sempre as fontes -- NÃO use pensamento interno ou tags . Responda diretamente.""" - - -class InvestigatorAgent: - """Autonomous investigation agent with tool calling""" - - def __init__(self): - self.api_url = "https://api.cerebras.ai/v1/chat/completions" - self.api_key = settings.cerebras_api_key - self.model = "zai-glm-4.7" - - # Investigation state - self.findings: List[Finding] = [] - self.entities_discovered: List[Dict[str, Any]] = [] - self.connections_mapped: List[Dict[str, Any]] = [] - self.tools_used: List[str] = [] - self.messages: List[Dict[str, Any]] = [] - self.db: Optional[Session] = None - - def _reset_state(self): - """Reset investigation state""" - self.findings = [] - self.entities_discovered = [] - self.connections_mapped = [] - self.tools_used = [] - self.messages = [] - - async def _call_llm( - self, - messages: List[Dict[str, Any]], - tools: List[Dict] = None - ) -> Dict[str, Any]: - """Call Cerebras API with tool calling support""" - try: - payload = { - "model": self.model, - "messages": messages, - "temperature": 0.3, - "max_tokens": 2048, - } - - if tools: - payload["tools"] = tools - payload["tool_choice"] = "auto" - payload["parallel_tool_calls"] = True - - async with httpx.AsyncClient(timeout=60.0) as client: - response = await client.post( - self.api_url, - headers={ - "Authorization": f"Bearer {self.api_key}", - "Content-Type": "application/json" - }, - json=payload - ) - - if response.status_code != 200: - raise Exception(f"API error: {response.status_code} - {response.text}") - - return response.json() - - except Exception as e: - raise Exception(f"LLM call failed: {str(e)}") - - async def _execute_tool(self, tool_name: str, arguments: Dict) -> str: - """Execute a tool and return the result""" - self.tools_used.append(tool_name) - - try: - if tool_name == "search_entity": - return await self._search_entity( - arguments.get("query", ""), - arguments.get("entity_type") - ) - - elif tool_name == "get_connections": - return await self._get_connections(arguments.get("entity_id")) - - elif tool_name == "lookup_cnpj": - return await self._lookup_cnpj(arguments.get("cnpj", "")) - - - elif tool_name == "web_search": - return await self._web_search( - arguments.get("query", ""), - arguments.get("freshness", "any") - ) - - elif tool_name == "deep_research": - return await self._deep_research(arguments.get("topic", "")) - - elif tool_name == "aether_search": - return await self._aether_search(arguments.get("query", "")) - - elif tool_name == "aether_entities": - return await self._aether_entities() - - elif tool_name == "save_finding": - finding = Finding( - title=arguments.get("title", ""), - content=arguments.get("content", ""), - source=arguments.get("source", "") - ) - self.findings.append(finding) - return f"Descoberta salva: {finding.title}" - - elif tool_name == "finish_investigation": - return f"INVESTIGATION_COMPLETE: {arguments.get('summary', '')}" - - else: - return f"Ferramenta desconhecida: {tool_name}" - - except Exception as e: - return f"Erro ao executar {tool_name}: {str(e)}" - - async def _search_entity(self, query: str, entity_type: Optional[str]) -> str: - """Search entities in database""" - if not self.db: - return "Erro: Banco de dados não disponível" - - q = self.db.query(Entity).filter(Entity.name.ilike(f"%{query}%")) - if entity_type and entity_type != "any": - q = q.filter(Entity.type == entity_type) - - entities = q.limit(10).all() - - if entities: - result = [] - for e in entities: - self.entities_discovered.append({ - "id": str(e.id), - "name": e.name, - "type": e.type - }) - result.append({ - "id": str(e.id), - "name": e.name, - "type": e.type, - "description": e.description[:200] if e.description else None - }) - return json.dumps(result, ensure_ascii=False, indent=2) - - return "Nenhuma entidade encontrada no NUMIDIUM." - - async def _get_connections(self, entity_id: str) -> str: - """Get entity connections""" - if not self.db: - return "Erro: Banco de dados não disponível" - - relationships = self.db.query(Relationship).filter( - (Relationship.source_id == entity_id) | (Relationship.target_id == entity_id) - ).limit(20).all() - - if relationships: - connections = [] - for rel in relationships: - source = self.db.query(Entity).filter(Entity.id == rel.source_id).first() - target = self.db.query(Entity).filter(Entity.id == rel.target_id).first() - if source and target: - connections.append({ - "source": source.name, - "target": target.name, - "type": rel.type - }) - return json.dumps(connections, ensure_ascii=False, indent=2) - - return "Nenhuma conexão encontrada." - - async def _lookup_cnpj(self, cnpj: str) -> str: - """Lookup CNPJ via BrasilAPI""" - cnpj_clean = cnpj.replace(".", "").replace("/", "").replace("-", "") - result = await consultar_cnpj(cnpj_clean) - - if result: - data = { - "razao_social": result.razao_social, - "nome_fantasia": result.nome_fantasia, - "situacao": result.situacao, - "data_abertura": result.data_abertura, - "capital_social": result.capital_social, - "endereco": f"{result.logradouro}, {result.numero} - {result.cidade}/{result.uf}", - "cnae": f"{result.cnae_principal} - {result.cnae_descricao}", - "socios": result.socios - } - return json.dumps(data, ensure_ascii=False, indent=2) - - return "CNPJ não encontrado." - - async def _lookup_phone(self, phone: str) -> str: - """Lookup phone number via NumVerify API""" - # Clean phone number - keep only digits - phone_clean = "".join(c for c in phone if c.isdigit()) - - # NumVerify API key (free tier: 100 req/month) - numverify_key = getattr(settings, 'numverify_api_key', None) - - if not numverify_key: - # Fallback: just do a web search for the number - return await self._web_search(f'"{phone_clean}" telefone', "any") - - try: - async with httpx.AsyncClient(timeout=10.0) as client: - response = await client.get( - "http://apilayer.net/api/validate", - params={ - "access_key": numverify_key, - "number": phone_clean, - "country_code": "", # Auto-detect - "format": 1 - } - ) - - if response.status_code == 200: - data = response.json() - - if data.get("valid"): - result = { - "numero": data.get("international_format"), - "valido": True, - "pais": data.get("country_name"), - "codigo_pais": data.get("country_code"), - "operadora": data.get("carrier"), - "tipo_linha": data.get("line_type"), # mobile, landline, etc - "localizacao": data.get("location") - } - return json.dumps(result, ensure_ascii=False, indent=2) - else: - return f"Número {phone_clean} não é válido ou não foi encontrado." - - return "Erro ao consultar número." - - except Exception as e: - # Fallback to web search - return await self._web_search(f'"{phone_clean}" telefone', "any") - - async def _web_search(self, query: str, freshness: str) -> str: - """Web search via Lancer""" - try: - result = await lancer.search(query, max_results=5, freshness=freshness) - if result.answer: - return f"Resumo: {result.answer}\n\nFontes: {len(result.results)} resultados" - return "Nenhum resultado encontrado." - except Exception as e: - return f"Erro na busca web: {str(e)}" - - async def _deep_research(self, topic: str) -> str: - """Deep research via Lancer""" - try: - result = await lancer.deep_research(topic, max_dimensions=3) - if result.answer: - return result.answer - return "Pesquisa profunda não retornou resultados." - except Exception as e: - return f"Erro na pesquisa: {str(e)}" - - async def _aether_search(self, query: str) -> str: - """Semantic search via AetherMap""" - try: - # Check if we have a job_id cached - if not aethermap.current_job_id: - # Index entities from database first - if self.db: - entities = self.db.query(Entity).limit(500).all() - if entities: - texts = [] - for e in entities: - text = f"{e.name} ({e.type})" - if e.description: - text += f": {e.description[:500]}" - texts.append(text) - - if texts: - result = await aethermap.process_documents(texts, fast_mode=True) - # Continue with search - - if aethermap.current_job_id: - result = await aethermap.semantic_search(query, turbo_mode=True) - return f"RAG Response:\n{result.summary}" - else: - return "Nenhum documento indexado no AetherMap." - - except Exception as e: - return f"Erro no AetherMap search: {str(e)}" - - async def _aether_entities(self) -> str: - """Extract NER entities via AetherMap""" - try: - if not aethermap.current_job_id: - return "Nenhum documento indexado. Use aether_search primeiro." - - result = await aethermap.extract_entities() - - # Format response - output = [] - - if result.hubs: - output.append("**Entidades Centrais (Hubs):**") - for hub in result.hubs[:5]: - output.append(f"- {hub.get('entity')} ({hub.get('type')}): {hub.get('degree')} conexões") - - if result.insights: - output.append(f"\n**Insights:**") - output.append(f"- Total de conexões: {result.insights.get('total_connections', 0)}") - output.append(f"- Grau médio: {result.insights.get('avg_degree', 0)}") - - if result.edges: - output.append(f"\n**Top 5 Relacionamentos:**") - for edge in result.edges[:5]: - output.append(f"- {edge.source_entity} <-> {edge.target_entity}: {edge.reason}") - - return "\n".join(output) if output else "Nenhuma entidade significativa encontrada." - - except Exception as e: - return f"Erro na extração de entidades: {str(e)}" - - async def investigate( - self, - mission: str, - db: Session, - max_iterations: int = 10 - ) -> InvestigationResult: - """Main investigation loop""" - self._reset_state() - self.db = db - - self.messages = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": f"Missão de investigação: {mission}\n\nComece a investigação."} - ] - - iteration = 0 - final_summary = "" - - while iteration < max_iterations: - iteration += 1 - - response = await self._call_llm(self.messages, TOOLS) - - choice = response["choices"][0] - message = choice["message"] - self.messages.append(message) - - tool_calls = message.get("tool_calls", []) - - if not tool_calls: - if message.get("content"): - final_summary = message["content"] - break - - for tool_call in tool_calls: - func = tool_call["function"] - tool_name = func["name"] - - try: - arguments = json.loads(func["arguments"]) - except: - arguments = {} - - result = await self._execute_tool(tool_name, arguments) - - if result.startswith("INVESTIGATION_COMPLETE:"): - final_summary = result.replace("INVESTIGATION_COMPLETE:", "").strip() - break - - self.messages.append({ - "role": "tool", - "tool_call_id": tool_call["id"], - "content": result - }) - - if final_summary: - break - - if not final_summary: - final_summary = await self._generate_report(mission) - - # Sanitize all text outputs to remove thinking artifacts - final_summary = sanitize_text(final_summary) - - # Sanitize findings content - sanitized_findings = [] - for f in self.findings: - sanitized_findings.append(Finding( - title=sanitize_text(f.title), - content=sanitize_text(f.content), - source=f.source, - timestamp=f.timestamp - )) - - return InvestigationResult( - mission=mission, - findings=sanitized_findings, - entities_discovered=self.entities_discovered, - connections_mapped=self.connections_mapped, - report=final_summary, - iterations=iteration, - tools_used=list(set(self.tools_used)), - status="completed" - ) - - async def _generate_report(self, mission: str) -> str: - """Generate final report""" - findings_text = "\n".join([ - f"- {f.title}: {f.content} (Fonte: {f.source})" - for f in self.findings - ]) or "Nenhuma descoberta registrada." - - entities_text = ", ".join([ - e.get("name", "Unknown") for e in self.entities_discovered[:10] - ]) or "Nenhuma entidade." - - prompt = f"""Gere um relatório de investigação: - -Missão: {mission} - -Descobertas: -{findings_text} - -Entidades: {entities_text} - -Ferramentas usadas: {', '.join(set(self.tools_used))} - -Gere relatório estruturado com: Resumo Executivo, Descobertas, Entidades, Recomendações.""" - - response = await self._call_llm([ - {"role": "system", "content": "Gere relatórios concisos."}, - {"role": "user", "content": prompt} - ]) - - return sanitize_text(response["choices"][0]["message"]["content"]) - - -# Singleton -investigator_agent = InvestigatorAgent() diff --git a/app/services/lancer.py b/app/services/lancer.py deleted file mode 100644 index 179868cdd00136f0a9376b6ea6fdff3df5b48abf..0000000000000000000000000000000000000000 --- a/app/services/lancer.py +++ /dev/null @@ -1,198 +0,0 @@ -""" -Lancer Deep Research Service -Integrates with Lancer Search API for AI-powered research -""" -import httpx -from typing import Optional, List, Dict, Any -from dataclasses import dataclass - - -LANCER_BASE_URL = "https://madras1-lancer.hf.space" - - -@dataclass -class SearchResult: - """Individual search result from Lancer""" - title: str - url: str - content: str - score: float - published_date: Optional[str] = None - - -@dataclass -class ResearchResponse: - """Response from Lancer research/search""" - query: str - answer: Optional[str] - results: List[SearchResult] - citations: List[Dict[str, Any]] - processing_time_ms: float - raw_text: str # Combined text for NER extraction - - -async def search( - query: str, - max_results: int = 10, - freshness: str = "any" -) -> ResearchResponse: - """ - Perform a search with AI synthesis using Lancer API. - """ - try: - async with httpx.AsyncClient(timeout=60.0) as client: - response = await client.post( - f"{LANCER_BASE_URL}/api/v1/search", - json={ - "query": query, - "max_results": max_results, - "freshness": freshness, - "include_answer": True - } - ) - - if response.status_code != 200: - raise Exception(f"Lancer API error: {response.status_code}") - - data = response.json() - - results = [ - SearchResult( - title=r.get("title", ""), - url=r.get("url", ""), - content=r.get("content", ""), - score=r.get("score", 0.0), - published_date=r.get("published_date") - ) - for r in data.get("results", []) - ] - - # Combine all text for NER - raw_text = data.get("answer", "") or "" - for r in results: - raw_text += f"\n{r.title}. {r.content}" - - return ResearchResponse( - query=data.get("query", query), - answer=data.get("answer"), - results=results, - citations=data.get("citations", []), - processing_time_ms=data.get("processing_time_ms", 0), - raw_text=raw_text - ) - - except Exception as e: - raise Exception(f"Lancer search failed: {str(e)}") - - -async def deep_research( - query: str, - max_dimensions: int = 5, - max_sources_per_dim: int = 5 -) -> ResearchResponse: - """ - Perform deep multi-dimensional research using Lancer API. - This provides richer, more comprehensive analysis. - """ - try: - async with httpx.AsyncClient(timeout=120.0) as client: - response = await client.post( - f"{LANCER_BASE_URL}/api/v1/research/deep", - json={ - "query": query, - "max_dimensions": max_dimensions, - "max_sources_per_dim": max_sources_per_dim, - "max_total_searches": 20 - } - ) - - if response.status_code != 200: - raise Exception(f"Lancer API error: {response.status_code}") - - data = response.json() - - # Deep research returns a different format - adapt it - results = [] - raw_text = "" - - # Extract from dimensions if present - if "dimensions" in data: - for dim in data["dimensions"]: - dim_name = dim.get("dimension", "") - raw_text += f"\n## {dim_name}\n" - for r in dim.get("results", []): - results.append(SearchResult( - title=r.get("title", ""), - url=r.get("url", ""), - content=r.get("content", ""), - score=r.get("score", 0.0) - )) - raw_text += f"{r.get('title', '')}. {r.get('content', '')}\n" - - # Add final report - final_report = data.get("final_report", data.get("report", "")) - if final_report: - raw_text = final_report + "\n" + raw_text - - return ResearchResponse( - query=query, - answer=final_report, - results=results, - citations=data.get("citations", []), - processing_time_ms=data.get("processing_time_ms", 0), - raw_text=raw_text - ) - - except Exception as e: - raise Exception(f"Lancer deep research failed: {str(e)}") - - -async def heavy_search( - query: str, - max_results: int = 5 -) -> ResearchResponse: - """ - Heavy search with full content scraping from sources. - Slower but provides more context. - """ - try: - async with httpx.AsyncClient(timeout=90.0) as client: - response = await client.post( - f"{LANCER_BASE_URL}/api/v1/search/heavy", - json={ - "query": query, - "max_results": max_results, - "include_answer": True - } - ) - - if response.status_code != 200: - raise Exception(f"Lancer API error: {response.status_code}") - - data = response.json() - - results = [ - SearchResult( - title=r.get("title", ""), - url=r.get("url", ""), - content=r.get("content", ""), - score=r.get("score", 0.0) - ) - for r in data.get("results", []) - ] - - raw_text = data.get("answer", "") or "" - for r in results: - raw_text += f"\n{r.title}. {r.content}" - - return ResearchResponse( - query=query, - answer=data.get("answer"), - results=results, - citations=data.get("citations", []), - processing_time_ms=data.get("processing_time_ms", 0), - raw_text=raw_text - ) - - except Exception as e: - raise Exception(f"Lancer heavy search failed: {str(e)}") diff --git a/app/services/nlp/__init__.py b/app/services/nlp/__init__.py deleted file mode 100644 index e9265c7e61b3b29a87dcd75c4455abd114be3e18..0000000000000000000000000000000000000000 --- a/app/services/nlp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# NLP Services -from .entity_extractor import entity_extractor diff --git a/app/services/nlp/__pycache__/__init__.cpython-311.pyc b/app/services/nlp/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 8671a044592ad7e7b9a10ee976be1a78f1f7958d..0000000000000000000000000000000000000000 Binary files a/app/services/nlp/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc b/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc deleted file mode 100644 index a7aac7bb33e176996105a8d539ec88db2b3ceaf5..0000000000000000000000000000000000000000 Binary files a/app/services/nlp/__pycache__/entity_extractor.cpython-311.pyc and /dev/null differ diff --git a/app/services/nlp/entity_extractor.py b/app/services/nlp/entity_extractor.py deleted file mode 100644 index 8855cc0c67661840ce99ce63f7aafe9da23e60b2..0000000000000000000000000000000000000000 --- a/app/services/nlp/entity_extractor.py +++ /dev/null @@ -1,265 +0,0 @@ -""" -Entity Extractor Service - LLM-based NER -Uses Cerebras API with Qwen 3 235B for intelligent entity and relationship extraction -""" -import json -import re -from typing import Dict, List, Optional, Any -from dataclasses import dataclass -import httpx - -from app.config import settings - - -@dataclass -class ExtractedEntity: - """Represents an extracted entity""" - name: str - type: str # person, organization, location, event - role: Optional[str] = None - aliases: Optional[List[str]] = None - description: Optional[str] = None - latitude: Optional[float] = None - longitude: Optional[float] = None - event_date: Optional[str] = None # Date in ISO format (YYYY-MM-DD) - - -@dataclass -class ExtractedRelationship: - """Represents a relationship between entities""" - source: str - target: str - relationship_type: str - context: Optional[str] = None - event_date: Optional[str] = None # Date in ISO format (YYYY-MM-DD) - - -@dataclass -class ExtractedEvent: - """Represents an extracted event""" - description: str - event_type: Optional[str] = None - date: Optional[str] = None - location: Optional[str] = None - participants: Optional[List[str]] = None - - -@dataclass -class ExtractionResult: - """Complete extraction result""" - entities: List[ExtractedEntity] - relationships: List[ExtractedRelationship] - events: List[ExtractedEvent] - raw_response: Optional[str] = None - - -EXTRACTION_PROMPT = """Você é um especialista em extração de informações estruturadas de textos. - -Analise o texto fornecido e extraia TODAS as entidades, relacionamentos e eventos mencionados. - -## Regras: -1. Identifique entidades: pessoas, organizações, locais, eventos -2. Para PESSOAS: inclua nome completo (se mencionado ou conhecido), cargo/função -3. Para ORGANIZAÇÕES: inclua nome oficial e siglas -4. Para LOCAIS: seja específico (cidade, país, endereço) -5. Identifique RELACIONAMENTOS entre entidades (quem trabalha onde, quem conhece quem, etc.) -6. Identifique EVENTOS mencionados (reuniões, anúncios, eleições, etc.) -7. EXTRAIA DATAS sempre que mencionadas (formato YYYY-MM-DD ou YYYY se só o ano) - -## Formato de resposta (JSON válido): -```json -{{ - "entities": [ - {{ - "name": "Nome Completo", - "type": "person|organization|location|event", - "role": "cargo ou função (opcional)", - "aliases": ["apelidos", "siglas"], - "description": "breve descrição se relevante", - "event_date": "YYYY-MM-DD ou YYYY (data relevante como nascimento, fundação, etc)" - }} - ], - "relationships": [ - {{ - "source": "Nome da Entidade 1", - "target": "Nome da Entidade 2", - "relationship_type": "tipo de relação (trabalha em, preside, fundou, reuniu-se com, etc.)", - "context": "contexto da relação", - "event_date": "YYYY-MM-DD ou YYYY (quando o relacionamento aconteceu/iniciou)" - }} - ], - "events": [ - {{ - "description": "O que aconteceu", - "event_type": "meeting|announcement|election|crime|etc", - "date": "YYYY-MM-DD ou YYYY", - "location": "local se mencionado", - "participants": ["lista de participantes"] - }} - ] -}} -``` - -Retorne APENAS o JSON, sem texto adicional. - -## Texto para análise: -{text} -""" - - -class EntityExtractor: - """ - LLM-based Entity Extractor using Cerebras API - """ - - def __init__(self): - self.api_key = settings.cerebras_api_key - self.base_url = "https://api.cerebras.ai/v1" - self.model = "qwen-3-235b-a22b-instruct-2507" - self.timeout = 60.0 - - async def extract(self, text: str) -> ExtractionResult: - """ - Extract entities, relationships, and events from text using LLM - - Args: - text: The text to analyze - - Returns: - ExtractionResult with all extracted information - """ - if not self.api_key: - raise ValueError("CEREBRAS_API_KEY not configured. Please set the environment variable.") - - if not text or len(text.strip()) < 10: - return ExtractionResult(entities=[], relationships=[], events=[]) - - # Prepare the prompt - prompt = EXTRACTION_PROMPT.format(text=text) - - try: - # Call Cerebras API - async with httpx.AsyncClient(timeout=self.timeout) as client: - response = await client.post( - f"{self.base_url}/chat/completions", - headers={ - "Authorization": f"Bearer {self.api_key}", - "Content-Type": "application/json" - }, - json={ - "model": self.model, - "messages": [ - { - "role": "system", - "content": "Você é um assistente especialista em extração de entidades e relacionamentos. Sempre responda em JSON válido." - }, - { - "role": "user", - "content": prompt - } - ], - "temperature": 0.1, # Low temperature for consistent extraction - "max_tokens": 4096 - } - ) - - if response.status_code != 200: - error_text = response.text - print(f"Cerebras API error: {response.status_code} - {error_text}") - raise ValueError(f"Cerebras API error: {response.status_code}") - - data = response.json() - - # Parse the response - raw_content = data["choices"][0]["message"]["content"] - return self._parse_response(raw_content) - - except httpx.TimeoutException: - print("Cerebras API timeout") - raise ValueError("API timeout - please try again with shorter text") - except httpx.RequestError as e: - print(f"Cerebras API request error: {e}") - raise ValueError(f"API connection error: {str(e)}") - except KeyError as e: - print(f"Unexpected API response format: {e}") - raise ValueError("Unexpected API response format") - - def _parse_response(self, content: str) -> ExtractionResult: - """Parse the LLM response into structured data""" - try: - # Try to extract JSON from the response - # Sometimes the model wraps it in ```json ... ``` - json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL) - if json_match: - json_str = json_match.group(1) - else: - # Try to find raw JSON - json_match = re.search(r'\{.*\}', content, re.DOTALL) - if json_match: - json_str = json_match.group(0) - else: - json_str = content - - data = json.loads(json_str) - - # Parse entities - entities = [] - for e in data.get("entities", []): - entities.append(ExtractedEntity( - name=e.get("name", ""), - type=e.get("type", "unknown"), - role=e.get("role"), - aliases=e.get("aliases", []), - description=e.get("description"), - event_date=e.get("event_date") - )) - - # Parse relationships - relationships = [] - for r in data.get("relationships", []): - relationships.append(ExtractedRelationship( - source=r.get("source", ""), - target=r.get("target", ""), - relationship_type=r.get("relationship_type", "related_to"), - context=r.get("context"), - event_date=r.get("event_date") - )) - - # Parse events - events = [] - for ev in data.get("events", []): - events.append(ExtractedEvent( - description=ev.get("description", ""), - event_type=ev.get("event_type"), - date=ev.get("date"), - location=ev.get("location"), - participants=ev.get("participants", []) - )) - - return ExtractionResult( - entities=entities, - relationships=relationships, - events=events, - raw_response=content - ) - - except json.JSONDecodeError as e: - print(f"Failed to parse LLM response: {e}") - print(f"Raw content: {content}") - return ExtractionResult( - entities=[], - relationships=[], - events=[], - raw_response=content - ) - - def extract_sync(self, text: str) -> ExtractionResult: - """ - Synchronous version of extract for non-async contexts - """ - import asyncio - return asyncio.run(self.extract(text)) - - -# Singleton instance -entity_extractor = EntityExtractor() diff --git a/app/services/transparencia_api.py b/app/services/transparencia_api.py deleted file mode 100644 index 13face26b77d772b00023efd68c0a6af8dc03d9e..0000000000000000000000000000000000000000 --- a/app/services/transparencia_api.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Portal da Transparência APIs -Access to Brazilian government transparency data -""" -import httpx -from typing import Optional, Dict, Any, List -from dataclasses import dataclass - - -# Portal da Transparência base URL -TRANSPARENCIA_URL = "https://api.portaldatransparencia.gov.br/api-de-dados" - - -@dataclass -class SanctionRecord: - """Data structure for sanction/punishment records""" - tipo: str # CEIS, CNEP, CEPIM - cpf_cnpj: str - nome: str - tipo_pessoa: str # 'F' or 'J' - - # Sanction details - tipo_sancao: str = "" - data_inicio: str = "" - data_fim: str = "" - orgao_sancionador: str = "" - uf_orgao: str = "" - fundamentacao_legal: str = "" - - # Source - fonte_url: str = "" - - -async def consultar_ceis(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]: - """ - Query CEIS - Cadastro de Empresas Inidôneas e Suspensas - Note: Requires authentication token from Portal da Transparência - """ - # Without token, we can still try - some endpoints work without auth - return await _query_sanctions("ceis", cnpj_cpf, token) - - -async def consultar_cnep(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]: - """ - Query CNEP - Cadastro Nacional de Empresas Punidas - """ - return await _query_sanctions("cnep", cnpj_cpf, token) - - -async def consultar_cepim(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]: - """ - Query CEPIM - Cadastro de Entidades Privadas sem Fins Lucrativos Impedidas - """ - return await _query_sanctions("cepim", cnpj_cpf, token) - - -async def _query_sanctions( - endpoint: str, - cnpj_cpf: str, - token: Optional[str] = None -) -> List[SanctionRecord]: - """Internal function to query sanction APIs""" - try: - headers = {} - if token: - headers["chave-api-dados"] = token - - params = {"cnpjCpf": cnpj_cpf} - - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get( - f"{TRANSPARENCIA_URL}/{endpoint}", - params=params, - headers=headers - ) - - if response.status_code == 401: - # Need authentication - return empty for now - print(f"Portal da Transparência requires authentication for {endpoint}") - return [] - - if response.status_code != 200: - return [] - - data = response.json() - if not isinstance(data, list): - data = [data] if data else [] - - records = [] - for item in data: - records.append(SanctionRecord( - tipo=endpoint.upper(), - cpf_cnpj=item.get("cpfCnpj", ""), - nome=item.get("nomeRazaoSocial", item.get("nome", "")), - tipo_pessoa=item.get("tipoPessoa", ""), - tipo_sancao=item.get("tipoSancao", {}).get("descricao", "") if isinstance(item.get("tipoSancao"), dict) else str(item.get("tipoSancao", "")), - data_inicio=item.get("dataInicioSancao", ""), - data_fim=item.get("dataFimSancao", ""), - orgao_sancionador=item.get("orgaoSancionador", {}).get("nome", "") if isinstance(item.get("orgaoSancionador"), dict) else str(item.get("orgaoSancionador", "")), - uf_orgao=item.get("ufOrgaoSancionador", ""), - fundamentacao_legal=item.get("fundamentacaoLegal", ""), - fonte_url=f"https://portaldatransparencia.gov.br/{endpoint}" - )) - - return records - - except Exception as e: - print(f"Transparência API error ({endpoint}): {e}") - return [] - - -async def verificar_sancoes(cnpj_cpf: str, token: Optional[str] = None) -> Dict[str, Any]: - """ - Check all sanction databases for a CNPJ/CPF - Returns consolidated result - """ - import asyncio - - # Query all databases in parallel - ceis_task = consultar_ceis(cnpj_cpf, token) - cnep_task = consultar_cnep(cnpj_cpf, token) - cepim_task = consultar_cepim(cnpj_cpf, token) - - ceis, cnep, cepim = await asyncio.gather(ceis_task, cnep_task, cepim_task) - - all_sanctions = ceis + cnep + cepim - - return { - "cnpj_cpf": cnpj_cpf, - "tem_sancoes": len(all_sanctions) > 0, - "total_sancoes": len(all_sanctions), - "ceis": len(ceis), - "cnep": len(cnep), - "cepim": len(cepim), - "registros": [ - { - "tipo": s.tipo, - "tipo_sancao": s.tipo_sancao, - "orgao": s.orgao_sancionador, - "inicio": s.data_inicio, - "fim": s.data_fim, - "fundamentacao": s.fundamentacao_legal - } - for s in all_sanctions - ] - } diff --git a/app/services/tse_api.py b/app/services/tse_api.py deleted file mode 100644 index e851625961d9a2c673f2eab9d91c44145d05e9cc..0000000000000000000000000000000000000000 --- a/app/services/tse_api.py +++ /dev/null @@ -1,270 +0,0 @@ -""" -TSE (Tribunal Superior Eleitoral) API Service -Access to Brazilian electoral data - candidates, assets, donations -""" -import httpx -from typing import Optional, Dict, Any, List -from dataclasses import dataclass, field - - -# DivulgaCand API (unofficial but functional) -TSE_DIVULGACAND_URL = "https://divulgacandcontas.tse.jus.br/divulga/rest/v1" - - -@dataclass -class Candidato: - """Electoral candidate data""" - id: int - nome: str - nome_urna: str - cpf_parcial: str = "" # TSE only shows partial - numero: str = "" - cargo: str = "" - partido_sigla: str = "" - partido_nome: str = "" - coligacao: str = "" - situacao: str = "" - - # Location - uf: str = "" - municipio: str = "" - - # Personal - data_nascimento: str = "" - genero: str = "" - grau_instrucao: str = "" - ocupacao: str = "" - - # Assets - total_bens: float = 0.0 - bens: List[Dict[str, Any]] = field(default_factory=list) - - # Campaign - total_receitas: float = 0.0 - total_despesas: float = 0.0 - - -@dataclass -class Eleicao: - """Election metadata""" - id: int - ano: int - descricao: str - turno: int = 1 - - -async def listar_eleicoes() -> List[Eleicao]: - """List available elections""" - try: - async with httpx.AsyncClient(timeout=15.0) as client: - response = await client.get(f"{TSE_DIVULGACAND_URL}/eleicao/ordinarias") - - if response.status_code != 200: - return [] - - data = response.json() - eleicoes = [] - - for item in data: - eleicoes.append(Eleicao( - id=item.get("id", 0), - ano=item.get("ano", 0), - descricao=item.get("descricaoEleicao", ""), - turno=item.get("turno", 1) - )) - - return sorted(eleicoes, key=lambda x: x.ano, reverse=True) - - except Exception as e: - print(f"TSE eleicoes error: {e}") - return [] - - -async def buscar_candidatos( - nome: str, - ano: int = 2024, - uf: Optional[str] = None, - cargo: Optional[str] = None -) -> List[Candidato]: - """ - Search for candidates by name. - - Args: - nome: Candidate name to search - ano: Election year (default 2024) - uf: State filter (optional) - cargo: Position filter (optional) - """ - try: - # First get the election ID for the year - eleicoes = await listar_eleicoes() - eleicao = next((e for e in eleicoes if e.ano == ano), None) - - if not eleicao: - # Try common election IDs - eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546) - else: - eleicao_id = eleicao.id - - # Build search URL - base_url = f"{TSE_DIVULGACAND_URL}/candidatura/listar/{ano}/{eleicao_id}" - - params = {"nomeCompleto": nome} - if uf: - params["uf"] = uf.upper() - if cargo: - params["cargo"] = cargo - - async with httpx.AsyncClient(timeout=30.0) as client: - response = await client.get(base_url, params=params) - - if response.status_code != 200: - return [] - - data = response.json() - candidatos_data = data.get("candidatos", []) - - candidatos = [] - for item in candidatos_data: - candidatos.append(Candidato( - id=item.get("id", 0), - nome=item.get("nomeCompleto", ""), - nome_urna=item.get("nomeUrna", ""), - cpf_parcial=item.get("cpf", "")[:3] + ".***.***-**" if item.get("cpf") else "", - numero=str(item.get("numero", "")), - cargo=item.get("cargo", {}).get("nome", "") if isinstance(item.get("cargo"), dict) else str(item.get("cargo", "")), - partido_sigla=item.get("partido", {}).get("sigla", "") if isinstance(item.get("partido"), dict) else "", - partido_nome=item.get("partido", {}).get("nome", "") if isinstance(item.get("partido"), dict) else "", - uf=item.get("ufSigla", "") or item.get("uf", ""), - municipio=item.get("municipio", {}).get("nome", "") if isinstance(item.get("municipio"), dict) else "", - situacao=item.get("situacao", ""), - total_bens=float(item.get("totalDeBens", 0) or 0) - )) - - return candidatos - - except Exception as e: - print(f"TSE search error: {e}") - return [] - - -async def obter_candidato_detalhes( - id_candidato: int, - ano: int = 2024, - eleicao_id: Optional[int] = None -) -> Optional[Candidato]: - """Get detailed candidate information including assets""" - try: - if not eleicao_id: - eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546) - - async with httpx.AsyncClient(timeout=30.0) as client: - # Get candidate details - response = await client.get( - f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}" - ) - - if response.status_code != 200: - return None - - item = response.json() - - candidato = Candidato( - id=item.get("id", 0), - nome=item.get("nomeCompleto", ""), - nome_urna=item.get("nomeUrna", ""), - numero=str(item.get("numero", "")), - cargo=item.get("cargo", {}).get("nome", "") if isinstance(item.get("cargo"), dict) else "", - partido_sigla=item.get("partido", {}).get("sigla", "") if isinstance(item.get("partido"), dict) else "", - partido_nome=item.get("partido", {}).get("nome", "") if isinstance(item.get("partido"), dict) else "", - uf=item.get("ufSigla", ""), - municipio=item.get("localCandidatura", ""), - situacao=item.get("situacao", ""), - data_nascimento=item.get("dataNascimento", ""), - genero=item.get("genero", ""), - grau_instrucao=item.get("grauInstrucao", ""), - ocupacao=item.get("ocupacao", ""), - total_bens=float(item.get("totalDeBens", 0) or 0) - ) - - # Try to get assets (bens) - try: - bens_response = await client.get( - f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}/bens" - ) - if bens_response.status_code == 200: - bens_data = bens_response.json() - candidato.bens = [ - { - "tipo": b.get("tipoBem", ""), - "descricao": b.get("descricao", ""), - "valor": float(b.get("valor", 0) or 0) - } - for b in bens_data - ] - except: - pass - - return candidato - - except Exception as e: - print(f"TSE details error: {e}") - return None - - -async def buscar_politico(nome: str) -> Dict[str, Any]: - """ - Search for a politician across multiple elections. - Returns consolidated information. - """ - resultado = { - "nome": nome, - "encontrado": False, - "candidaturas": [], - "ultimo_cargo": None, - "total_patrimonio": 0.0, - "partidos": set(), - "ufs": set() - } - - # Search in recent elections - continue through ALL years - for ano in [2024, 2022, 2020, 2018]: - try: - candidatos = await buscar_candidatos(nome, ano=ano) - print(f"TSE: Buscando '{nome}' em {ano} - encontrados: {len(candidatos)}") - - for c in candidatos: - # Match if nome is in the candidate's full name - if nome.lower() in c.nome.lower() or nome.lower() in c.nome_urna.lower(): - resultado["encontrado"] = True - resultado["candidaturas"].append({ - "ano": ano, - "cargo": c.cargo, - "partido": c.partido_sigla, - "uf": c.uf, - "situacao": c.situacao, - "patrimonio": c.total_bens - }) - - if c.partido_sigla: - resultado["partidos"].add(c.partido_sigla) - if c.uf: - resultado["ufs"].add(c.uf) - - if c.total_bens > resultado["total_patrimonio"]: - resultado["total_patrimonio"] = c.total_bens - - if not resultado["ultimo_cargo"]: - resultado["ultimo_cargo"] = f"{c.cargo} ({ano})" - except Exception as e: - print(f"TSE search {ano} error: {e}") - continue - - # Convert sets to lists for JSON - resultado["partidos"] = list(resultado["partidos"]) - resultado["ufs"] = list(resultado["ufs"]) - - print(f"TSE resultado para '{nome}': encontrado={resultado['encontrado']}, candidaturas={len(resultado['candidaturas'])}") - - return resultado -