diff --git a/Dockerfile b/Dockerfile index 05a1891dae3cddb81c245180f5c1c088584295ee..7f23e0d6eb0ca91cf496513fb4d0227ffac10ec9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,33 +1,24 @@ -# ============================================================================== -# Dockerfile — AetherMap API (versão profissional) -# ============================================================================== +FROM python:3.11-slim -# Imagem Python robusta (não slim → evita erros de build) -FROM python:3.10 - -# Define diretório da aplicação WORKDIR /app -# --- INSTALAR TORCH CPU ANTES (CRÍTICO!) --- -# Isso garante que a versão certa (CPU) seja instalada -RUN pip install --no-cache-dir \ - torch \ - torchvision \ - torchaudio \ - --index-url https://download.pytorch.org/whl/cpu - +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + && rm -rf /var/lib/apt/lists/* -# Copiar requirements +# Copy requirements first for better caching COPY requirements.txt . - -# Instalar dependências restantes RUN pip install --no-cache-dir -r requirements.txt -# Copiar código da aplicação +# Copy application code COPY . . 
-# Expor porta usada pelo Hugging Face Spaces +# Create data directory for SQLite +RUN mkdir -p /app/data + +# Expose port (HF Spaces uses 7860) EXPOSE 7860 -# Comando padrão para executar FastAPI -CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"] +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/README.md b/README.md index 1b0b6975ed9d1181da24ce796835935436320b98..b969ddabfb79f685dcf88fbe98f72b5d3bc1bead 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,27 @@ --- -title: AetherMap -emoji: 🦀 -colorFrom: indigo -colorTo: pink +title: Numidium +emoji: 🔮 +colorFrom: blue +colorTo: red sdk: docker pinned: false -license: apache-2.0 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# Numidium API + +Backend do sistema de inteligência Numidium/VANTAGE. + +## Endpoints + +- `/docs` - Documentação Swagger +- `/api/v1/entities` - CRUD de entidades +- `/api/v1/relationships` - Conexões +- `/api/v1/events` - Eventos +- `/api/v1/search` - Busca global +- `/api/v1/ingest` - Ingestão de dados (Wikipedia, News) + +## Stack + +- FastAPI +- SQLite +- BeautifulSoup (scraping) diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca62e91c6b6d2fd4d3a0d2f3169941e71d37af3 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1 @@ +# Numidium Backend App diff --git a/app/__pycache__/__init__.cpython-311.pyc b/app/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a44e729bba8a6e6cdf407034b3b1ec551cfb6fe Binary files /dev/null and b/app/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/__pycache__/config.cpython-311.pyc b/app/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6ba5b98d9de60400fecda19a96033ffd700d3a1 Binary files /dev/null and b/app/__pycache__/config.cpython-311.pyc differ 
diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce0a2733c6eceaf10144429177e8f20db9604545 --- /dev/null +++ b/app/api/__init__.py @@ -0,0 +1 @@ +# API module diff --git a/app/api/__pycache__/__init__.cpython-311.pyc b/app/api/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e59a223a6007cd27a3443d5ab5a26d31df7fb4ff Binary files /dev/null and b/app/api/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/api/__pycache__/deps.cpython-311.pyc b/app/api/__pycache__/deps.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07e15cf0e980065fc3e41e4e0eea81575dc514d5 Binary files /dev/null and b/app/api/__pycache__/deps.cpython-311.pyc differ diff --git a/app/api/deps.py b/app/api/deps.py new file mode 100644 index 0000000000000000000000000000000000000000..bcea9d8c46a65a9857513605150ce15591631945 --- /dev/null +++ b/app/api/deps.py @@ -0,0 +1,35 @@ +""" +API dependencies. +""" +from typing import Generator, Optional + +from fastapi import Cookie, Header +from sqlalchemy.orm import Session + +from app.core.database import get_db_for_session, get_default_session + + +def get_session_id( + x_session_id: Optional[str] = Header(None), + numidium_session: Optional[str] = Cookie(None) +) -> Optional[str]: + """Return the session id from header or cookie.""" + return x_session_id or numidium_session + + +def get_scoped_db( + x_session_id: Optional[str] = Header(None), + numidium_session: Optional[str] = Cookie(None) +) -> Generator[Session, None, None]: + """ + Provide a session-scoped DB if available, otherwise the default DB. 
+ """ + session_id = x_session_id or numidium_session + if session_id: + db = get_db_for_session(session_id) + else: + db = get_default_session() + try: + yield db + finally: + db.close() diff --git a/app/api/routes/__init__.py b/app/api/routes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e37c97a33d27ba2e879921f79996d8fdc3edbb73 --- /dev/null +++ b/app/api/routes/__init__.py @@ -0,0 +1,2 @@ +# API Routes module +from app.api.routes import entities, relationships, events, search, ingest diff --git a/app/api/routes/__pycache__/__init__.cpython-311.pyc b/app/api/routes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e48c20bb1f744a1d1037323ce205527266cb5c7c Binary files /dev/null and b/app/api/routes/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/entities.cpython-311.pyc b/app/api/routes/__pycache__/entities.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee99ab907a18b99a588dfe960d31a7e21c7e53d6 Binary files /dev/null and b/app/api/routes/__pycache__/entities.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/events.cpython-311.pyc b/app/api/routes/__pycache__/events.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52f29ec16d10fc54bd6be7d6e32591d65d3acfcc Binary files /dev/null and b/app/api/routes/__pycache__/events.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/ingest.cpython-311.pyc b/app/api/routes/__pycache__/ingest.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e524bafc4ce081ccccb32d94f2426c10b1e79b9a Binary files /dev/null and b/app/api/routes/__pycache__/ingest.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/investigate.cpython-311.pyc b/app/api/routes/__pycache__/investigate.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..61c0e309052c422eb7d506d8623cfaed4ff4e01e Binary files /dev/null and b/app/api/routes/__pycache__/investigate.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/relationships.cpython-311.pyc b/app/api/routes/__pycache__/relationships.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73c88868d8b1ad76745a529fe05928d06408c415 Binary files /dev/null and b/app/api/routes/__pycache__/relationships.cpython-311.pyc differ diff --git a/app/api/routes/__pycache__/search.cpython-311.pyc b/app/api/routes/__pycache__/search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83951b1b069fe2d10b140852fbc85e7294cac015 Binary files /dev/null and b/app/api/routes/__pycache__/search.cpython-311.pyc differ diff --git a/app/api/routes/aethermap.py b/app/api/routes/aethermap.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0535153069d293dcdbe97be9565e0a17728e3e --- /dev/null +++ b/app/api/routes/aethermap.py @@ -0,0 +1,307 @@ +""" +AetherMap Routes - Document Mapping & Semantic Search +Integrates with AetherMap API for document clustering, NER, and semantic search. 
+""" +from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends +from pydantic import BaseModel, Field +from typing import Optional, List, Dict, Any +from sqlalchemy.orm import Session +import io + +from app.api.deps import get_scoped_db +from app.services.aethermap_client import aethermap, ProcessResult, SearchResult, EntityGraphResult + + +router = APIRouter() + + +# ============================================================================ +# Request/Response Models +# ============================================================================ + +class IndexDocumentsRequest(BaseModel): + """Request to index documents from text list""" + documents: List[str] = Field(..., description="Lista de textos para indexar") + fast_mode: bool = Field(True, description="Modo rápido (PCA) ou preciso (UMAP)") + + +class IndexEntitiesRequest(BaseModel): + """Request to index entities from NUMIDIUM database""" + entity_types: Optional[List[str]] = Field(None, description="Filtrar por tipos de entidade") + limit: int = Field(500, description="Limite de entidades") + + +class SemanticSearchRequest(BaseModel): + """Request for semantic search""" + query: str = Field(..., description="Termo de busca") + turbo_mode: bool = Field(True, description="Modo turbo (mais rápido)") + + +class IndexResponse(BaseModel): + """Response from indexing""" + job_id: str + num_documents: int + num_clusters: int + num_noise: int + metrics: Dict[str, Any] = {} + cluster_analysis: Dict[str, Any] = {} + + +class SearchResponse(BaseModel): + """Response from search""" + summary: str + results: List[Dict[str, Any]] = [] + + +class EntityGraphResponse(BaseModel): + """Response from NER extraction""" + hubs: List[Dict[str, Any]] = [] + insights: Dict[str, Any] = {} + node_count: int = 0 + edge_count: int = 0 + + +class StatusResponse(BaseModel): + """AetherMap status""" + connected: bool + job_id: Optional[str] = None + documents_indexed: int = 0 + + +# 
============================================================================ +# Endpoints +# ============================================================================ + +@router.get("/status", response_model=StatusResponse) +async def get_status(): + """ + Get AetherMap connection status. + """ + return StatusResponse( + connected=True, + job_id=aethermap.current_job_id, + documents_indexed=0 # TODO: track this + ) + + +@router.post("/index", response_model=IndexResponse) +async def index_documents(request: IndexDocumentsRequest): + """ + Index a list of documents for semantic search. + + The documents will be: + - Embedded using sentence transformers + - Clustered using HDBSCAN + - Indexed in FAISS + BM25 for hybrid search + """ + try: + if not request.documents: + raise HTTPException(status_code=400, detail="Nenhum documento fornecido") + + result = await aethermap.process_documents( + texts=request.documents, + fast_mode=request.fast_mode + ) + + return IndexResponse( + job_id=result.job_id, + num_documents=result.num_documents, + num_clusters=result.num_clusters, + num_noise=result.num_noise, + metrics=result.metrics, + cluster_analysis=result.cluster_analysis + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/index-entities", response_model=IndexResponse) +async def index_entities( + request: IndexEntitiesRequest, + db: Session = Depends(get_scoped_db) +): + """ + Index entities from NUMIDIUM database. + + Collects entity names and descriptions, sends to AetherMap for processing. 
+ """ + from app.models.entity import Entity + + try: + query = db.query(Entity) + + if request.entity_types: + query = query.filter(Entity.type.in_(request.entity_types)) + + entities = query.limit(request.limit).all() + + if not entities: + raise HTTPException(status_code=404, detail="Nenhuma entidade encontrada") + + # Build text representations + documents = [] + for e in entities: + text = f"{e.name} ({e.type})" + if e.description: + text += f": {e.description[:1000]}" + documents.append(text) + + result = await aethermap.process_documents( + texts=documents, + fast_mode=request.fast_mode if hasattr(request, 'fast_mode') else True + ) + + return IndexResponse( + job_id=result.job_id, + num_documents=result.num_documents, + num_clusters=result.num_clusters, + num_noise=result.num_noise, + metrics=result.metrics, + cluster_analysis=result.cluster_analysis + ) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/upload", response_model=IndexResponse) +async def upload_documents( + file: UploadFile = File(...), + fast_mode: bool = Form(True) +): + """ + Upload a file (TXT or CSV) for indexing. 
+ + - TXT: One document per line + - CSV: Will use first text column found + """ + try: + content = await file.read() + text = content.decode('utf-8', errors='ignore') + + # Split by lines for TXT + documents = [line.strip() for line in text.splitlines() if line.strip()] + + if not documents: + raise HTTPException(status_code=400, detail="Arquivo vazio ou sem texto válido") + + result = await aethermap.process_documents( + texts=documents, + fast_mode=fast_mode + ) + + return IndexResponse( + job_id=result.job_id, + num_documents=result.num_documents, + num_clusters=result.num_clusters, + num_noise=result.num_noise, + metrics=result.metrics, + cluster_analysis=result.cluster_analysis + ) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/search", response_model=SearchResponse) +async def semantic_search(request: SemanticSearchRequest): + """ + Semantic search in indexed documents. + + Uses hybrid RAG (FAISS + BM25 + reranking + LLM). + Returns a summary answering the query with citations. + """ + try: + if not aethermap.current_job_id: + raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") + + result = await aethermap.semantic_search( + query=request.query, + turbo_mode=request.turbo_mode + ) + + return SearchResponse( + summary=result.summary, + results=result.results + ) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/entities", response_model=EntityGraphResponse) +async def extract_entities(): + """ + Extract named entities (NER) from indexed documents. + + Returns: + - Hub entities (most connected) + - Relationship insights + - Graph metrics + """ + try: + if not aethermap.current_job_id: + raise HTTPException(status_code=400, detail="Nenhum documento indexado. 
Use /index primeiro.") + + result = await aethermap.extract_entities() + + return EntityGraphResponse( + hubs=result.hubs, + insights=result.insights, + node_count=len(result.nodes), + edge_count=len(result.edges) + ) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/analyze") +async def analyze_graph(): + """ + Analyze entity graph using LLM. + + Returns semantic insights about relationships and patterns. + """ + try: + if not aethermap.current_job_id: + raise HTTPException(status_code=400, detail="Nenhum documento indexado. Use /index primeiro.") + + result = await aethermap.analyze_graph() + + return { + "analysis": result.analysis, + "key_entities": result.key_entities, + "relationships": result.relationships + } + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/describe-clusters") +async def describe_clusters(): + """ + Get LLM descriptions for each cluster found. + """ + try: + if not aethermap.current_job_id: + raise HTTPException(status_code=400, detail="Nenhum documento indexado. 
Use /index primeiro.") + + result = await aethermap.describe_clusters() + + return result + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/api/routes/analyze.py b/app/api/routes/analyze.py new file mode 100644 index 0000000000000000000000000000000000000000..37b93947c0e0c9f2a5a626301007c1cf30b212d6 --- /dev/null +++ b/app/api/routes/analyze.py @@ -0,0 +1,309 @@ +""" +Analyze API Routes - LLM-based text analysis +""" +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel, Field +from typing import Optional, List +from sqlalchemy.orm import Session +import traceback + +from app.api.deps import get_scoped_db +from app.services.nlp import entity_extractor +from app.services.geocoding import geocode +from app.models.entity import Entity, Relationship, Event +from app.config import settings + + +router = APIRouter(prefix="/analyze", tags=["Analysis"]) + + +class AnalyzeRequest(BaseModel): + """Request model for text analysis""" + text: str = Field(..., min_length=10, description="Text to analyze") + auto_create: bool = Field(default=False, description="Auto-create extracted entities in database") + + +class ExtractedEntityResponse(BaseModel): + """Response model for an extracted entity""" + name: str + type: str + role: Optional[str] = None + aliases: Optional[List[str]] = None + description: Optional[str] = None + created: bool = False # Whether it was created in DB + entity_id: Optional[str] = None # DB ID if created + + +class ExtractedRelationshipResponse(BaseModel): + """Response model for an extracted relationship""" + source: str + target: str + relationship_type: str + context: Optional[str] = None + created: bool = False + + +class ExtractedEventResponse(BaseModel): + """Response model for an extracted event""" + description: str + event_type: Optional[str] = None + date: Optional[str] = None + location: Optional[str] = None + participants: Optional[List[str]] = None + created: bool 
= False + event_id: Optional[str] = None + + +class AnalyzeResponse(BaseModel): + """Response model for analysis""" + entities: List[ExtractedEntityResponse] + relationships: List[ExtractedRelationshipResponse] + events: List[ExtractedEventResponse] + stats: dict + + +@router.post("", response_model=AnalyzeResponse) +async def analyze_text(request: AnalyzeRequest, db: Session = Depends(get_scoped_db)): + """ + Analyze text using LLM to extract entities, relationships, and events. + + Uses Cerebras API with Qwen 3 235B for intelligent extraction. + + Args: + text: Text to analyze (min 10 characters) + auto_create: If true, automatically creates entities in the database + + Returns: + Extracted entities, relationships, events, and statistics + """ + try: + # Extract using LLM + result = await entity_extractor.extract(request.text) + + # Prepare response + entities_response = [] + relationships_response = [] + events_response = [] + + created_entities = 0 + created_relationships = 0 + created_events = 0 + + # Helper function to parse date strings + def parse_date(date_str): + if not date_str: + return None + from datetime import datetime + try: + # Try YYYY-MM-DD format + return datetime.strptime(date_str[:10], "%Y-%m-%d") + except: + try: + # Try YYYY format + return datetime.strptime(date_str[:4], "%Y") + except: + return None + + # Process entities + for entity in result.entities: + entity_data = ExtractedEntityResponse( + name=entity.name, + type=entity.type, + role=entity.role, + aliases=entity.aliases, + description=entity.description, + created=False + ) + + if request.auto_create and entity.name: + # Check if entity already exists + existing = db.query(Entity).filter( + Entity.name.ilike(f"%{entity.name}%") + ).first() + + if not existing: + # Get coordinates for location entities + lat, lng = None, None + if entity.type == "location": + coords = await geocode(entity.name) + if coords: + lat, lng = coords + + # Parse event_date if available + event_date = 
parse_date(getattr(entity, 'event_date', None)) + + # Create new entity + new_entity = Entity( + name=entity.name, + type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person", + description=entity.description or entity.role or "", + source="llm_extraction", + latitude=lat, + longitude=lng, + event_date=event_date, + properties={"role": entity.role, "aliases": entity.aliases} + ) + db.add(new_entity) + db.commit() + db.refresh(new_entity) + + entity_data.created = True + entity_data.entity_id = new_entity.id + created_entities += 1 + else: + entity_data.entity_id = existing.id + + entities_response.append(entity_data) + + # Process relationships + for rel in result.relationships: + rel_data = ExtractedRelationshipResponse( + source=rel.source, + target=rel.target, + relationship_type=rel.relationship_type, + context=rel.context, + created=False + ) + + if request.auto_create: + # Find source and target entities + source_entity = db.query(Entity).filter( + Entity.name.ilike(f"%{rel.source}%") + ).first() + target_entity = db.query(Entity).filter( + Entity.name.ilike(f"%{rel.target}%") + ).first() + + if source_entity and target_entity: + # Check if relationship exists + existing_rel = db.query(Relationship).filter( + Relationship.source_id == source_entity.id, + Relationship.target_id == target_entity.id, + Relationship.type == rel.relationship_type + ).first() + + if not existing_rel: + # Parse event_date if available + rel_event_date = parse_date(getattr(rel, 'event_date', None)) + + new_rel = Relationship( + source_id=source_entity.id, + target_id=target_entity.id, + type=rel.relationship_type, + event_date=rel_event_date, + properties={"context": rel.context} + ) + db.add(new_rel) + db.commit() + rel_data.created = True + created_relationships += 1 + + relationships_response.append(rel_data) + + # Process events + for event in result.events: + event_data = ExtractedEventResponse( + description=event.description, + 
event_type=event.event_type, + date=event.date, + location=event.location, + participants=event.participants, + created=False + ) + + if request.auto_create and event.description: + # Create event + new_event = Event( + title=event.description[:100] if len(event.description) > 100 else event.description, + description=event.description, + type=event.event_type or "general", + source="llm_extraction" + ) + db.add(new_event) + db.commit() + db.refresh(new_event) + + event_data.created = True + event_data.event_id = new_event.id + created_events += 1 + + events_response.append(event_data) + + return AnalyzeResponse( + entities=entities_response, + relationships=relationships_response, + events=events_response, + stats={ + "total_entities": len(entities_response), + "total_relationships": len(relationships_response), + "total_events": len(events_response), + "created_entities": created_entities, + "created_relationships": created_relationships, + "created_events": created_events + } + ) + + except Exception as e: + # Log the full error with traceback + print(f"=== ANALYZE ERROR ===") + print(f"Error type: {type(e).__name__}") + print(f"Error message: {str(e)}") + print(f"Traceback:") + traceback.print_exc() + print(f"=== END ERROR ===") + raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") + + +@router.get("/debug") +async def debug_config(): + """ + Debug endpoint to check if API is configured correctly. + """ + api_key = settings.cerebras_api_key + return { + "cerebras_api_key_configured": bool(api_key), + "cerebras_api_key_length": len(api_key) if api_key else 0, + "cerebras_api_key_preview": f"{api_key[:8]}...{api_key[-4:]}" if api_key and len(api_key) > 12 else "NOT SET" + } + + +@router.post("/quick") +async def quick_analyze(request: AnalyzeRequest): + """ + Quick analysis without database operations. + Returns only extracted data without creating anything. 
+ """ + try: + result = await entity_extractor.extract(request.text) + + return { + "entities": [ + { + "name": e.name, + "type": e.type, + "role": e.role, + "aliases": e.aliases + } + for e in result.entities + ], + "relationships": [ + { + "source": r.source, + "target": r.target, + "type": r.relationship_type, + "context": r.context + } + for r in result.relationships + ], + "events": [ + { + "description": ev.description, + "type": ev.event_type, + "date": ev.date, + "participants": ev.participants + } + for ev in result.events + ] + } + except Exception as e: + raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") diff --git a/app/api/routes/chat.py b/app/api/routes/chat.py new file mode 100644 index 0000000000000000000000000000000000000000..f75b133bf9956e67eb7b1b86312192d7fa093c46 --- /dev/null +++ b/app/api/routes/chat.py @@ -0,0 +1,63 @@ +""" +Chat API Routes - Intelligent chat with RAG +""" +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel, Field +from typing import Optional +from sqlalchemy.orm import Session + +from app.api.deps import get_scoped_db, get_session_id +from app.services.chat import chat_service + + +router = APIRouter(prefix="/chat", tags=["Chat"]) + + +class ChatRequest(BaseModel): + """Chat request model""" + message: str = Field(..., min_length=1, description="User message") + use_web: bool = Field(default=True, description="Include web search") + use_history: bool = Field(default=True, description="Use conversation history") + + +class ChatResponse(BaseModel): + """Chat response model""" + answer: str + local_context_used: bool + web_context_used: bool + entities_found: int + + +@router.post("", response_model=ChatResponse) +async def chat( + request: ChatRequest, + db: Session = Depends(get_scoped_db), + session_id: Optional[str] = Depends(get_session_id) +): + """ + Send a message and get an intelligent response. 
+ + Uses: + - Local NUMIDIUM knowledge (entities/relationships) + - Lancer web search (if enabled) + - Cerebras LLM for synthesis + """ + try: + result = await chat_service.chat( + message=request.message, + db=db, + use_web=request.use_web, + use_history=request.use_history, + session_id=session_id + ) + return ChatResponse(**result) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/clear") +async def clear_history(session_id: Optional[str] = Depends(get_session_id)): + """Clear conversation history""" + chat_service.clear_history(session_id=session_id) + return {"message": "Historico limpo"} diff --git a/app/api/routes/dados_publicos.py b/app/api/routes/dados_publicos.py new file mode 100644 index 0000000000000000000000000000000000000000..842e82d2d17f48687b92bb8012105eab495a8051 --- /dev/null +++ b/app/api/routes/dados_publicos.py @@ -0,0 +1,155 @@ +""" +Public Data API Routes - IBGE and TSE data access +""" +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel, Field +from typing import Optional, List, Dict, Any + +from app.services.ibge_api import ( + listar_estados, + listar_municipios, + buscar_municipio, + enriquecer_localizacao +) +from app.services.tse_api import ( + listar_eleicoes, + buscar_candidatos, + obter_candidato_detalhes, + buscar_politico +) + + +router = APIRouter(prefix="/dados", tags=["Public Data"]) + + +# ========== IBGE Endpoints ========== + +class EstadoResponse(BaseModel): + id: int + sigla: str + nome: str + regiao: str + + +class MunicipioResponse(BaseModel): + id: int + nome: str + estado_sigla: str + estado_nome: str + regiao: str + + +@router.get("/ibge/estados", response_model=List[EstadoResponse]) +async def get_estados(): + """List all Brazilian states""" + estados = await listar_estados() + return [EstadoResponse(**e.__dict__) for e in estados] + + +@router.get("/ibge/municipios/{uf}", response_model=List[MunicipioResponse]) +async def 
get_municipios(uf: str): + """List municipalities in a state""" + municipios = await listar_municipios(uf) + return [MunicipioResponse(**m.__dict__) for m in municipios] + + +@router.get("/ibge/buscar") +async def buscar_cidade( + nome: str = Query(..., min_length=2), + uf: Optional[str] = None +): + """Search for a municipality by name""" + municipios = await buscar_municipio(nome, uf) + return [MunicipioResponse(**m.__dict__) for m in municipios] + + +@router.get("/ibge/enriquecer") +async def enriquecer_cidade( + cidade: str = Query(..., min_length=2), + uf: Optional[str] = None +): + """Enrich a location name with IBGE data""" + return await enriquecer_localizacao(cidade, uf) + + +# ========== TSE Endpoints ========== + +class EleicaoResponse(BaseModel): + id: int + ano: int + descricao: str + turno: int + + +class CandidatoResponse(BaseModel): + id: int + nome: str + nome_urna: str + numero: str + cargo: str + partido_sigla: str + uf: str + municipio: str + situacao: str + total_bens: float + + +class CandidatoDetalhadoResponse(BaseModel): + id: int + nome: str + nome_urna: str + numero: str + cargo: str + partido_sigla: str + partido_nome: str + uf: str + municipio: str + situacao: str + data_nascimento: str + genero: str + grau_instrucao: str + ocupacao: str + total_bens: float + bens: List[Dict[str, Any]] + + +@router.get("/tse/eleicoes", response_model=List[EleicaoResponse]) +async def get_eleicoes(): + """List available elections""" + eleicoes = await listar_eleicoes() + return [EleicaoResponse(**e.__dict__) for e in eleicoes] + + +@router.get("/tse/candidatos") +async def get_candidatos( + nome: str = Query(..., min_length=3), + ano: int = Query(default=2024), + uf: Optional[str] = None, + cargo: Optional[str] = None +): + """Search for candidates by name""" + candidatos = await buscar_candidatos(nome, ano=ano, uf=uf, cargo=cargo) + return [CandidatoResponse(**c.__dict__) for c in candidatos] + + +@router.get("/tse/candidato/{id_candidato}") +async def 
get_candidato_detalhes( + id_candidato: int, + ano: int = Query(default=2024) +): + """Get detailed candidate information including assets""" + candidato = await obter_candidato_detalhes(id_candidato, ano=ano) + + if not candidato: + raise HTTPException(status_code=404, detail="Candidato não encontrado") + + return CandidatoDetalhadoResponse(**candidato.__dict__) + + +@router.get("/tse/politico") +async def pesquisar_politico(nome: str = Query(..., min_length=3)): + """ + Search for a politician across multiple elections. + Returns consolidated career information. + """ + return await buscar_politico(nome) diff --git a/app/api/routes/entities.py b/app/api/routes/entities.py new file mode 100644 index 0000000000000000000000000000000000000000..2727179e0e20a58a8a5893f1821a15c24df3013f --- /dev/null +++ b/app/api/routes/entities.py @@ -0,0 +1,353 @@ +""" +Entity CRUD Routes +""" +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy.orm import Session +from sqlalchemy import or_ +from typing import List, Optional + +from app.api.deps import get_scoped_db +from app.models import Entity, Relationship +from app.schemas import EntityCreate, EntityUpdate, EntityResponse, GraphData, GraphNode, GraphEdge + +router = APIRouter(prefix="/entities", tags=["Entities"]) + + +@router.get("", response_model=List[EntityResponse]) +def list_entities( + type: Optional[str] = None, + search: Optional[str] = None, + project_id: Optional[str] = None, + limit: int = Query(default=50, le=200), + offset: int = 0, + db: Session = Depends(get_scoped_db) +): + """Lista todas as entidades com filtros opcionais""" + query = db.query(Entity) + + if project_id: + query = query.filter(Entity.project_id == project_id) + + if type: + query = query.filter(Entity.type == type) + + if search: + query = query.filter( + or_( + Entity.name.ilike(f"%{search}%"), + Entity.description.ilike(f"%{search}%") + ) + ) + + query = query.order_by(Entity.created_at.desc()) + return 
query.offset(offset).limit(limit).all() + + +@router.get("/types") +def get_entity_types(db: Session = Depends(get_scoped_db)): + """Retorna todos os tipos de entidade únicos""" + types = db.query(Entity.type).distinct().all() + return [t[0] for t in types] + + +@router.get("/suggest-merge") +async def suggest_merge_candidates( + limit: int = Query(default=10, le=50), + db: Session = Depends(get_scoped_db) +): + """ + Use LLM to find potential duplicate entities that could be merged. + Returns pairs of entities that might be the same. + """ + import httpx + import json + import re + from app.config import settings + + # Get all entities + entities = db.query(Entity).order_by(Entity.name).limit(200).all() + + if len(entities) < 2: + return {"candidates": [], "message": "Not enough entities to compare"} + + # Build entity list for LLM + entity_list = [] + for e in entities: + aliases = (e.properties or {}).get("aliases", []) + entity_list.append({ + "id": e.id, + "name": e.name, + "type": e.type, + "aliases": aliases[:5] if aliases else [] + }) + + # Ask LLM to find duplicates + prompt = f"""Analise esta lista de entidades e encontre possíveis DUPLICATAS (mesma pessoa/organização/local com nomes diferentes). + +Entidades: +{entity_list[:100]} + +Retorne APENAS um JSON válido com pares de IDs que são provavelmente a mesma entidade: +```json +{{ + "duplicates": [ + {{ + "id1": "uuid1", + "id2": "uuid2", + "confidence": 0.95, + "reason": "Mesmo nome com variação" + }} + ] +}} +``` + +Se não houver duplicatas, retorne: {{"duplicates": []}} +""" + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + "https://api.cerebras.ai/v1/chat/completions", + headers={ + "Authorization": f"Bearer {settings.cerebras_api_key}", + "Content-Type": "application/json" + }, + json={ + "model": "zai-glm-4.7", + "messages": [ + {"role": "system", "content": "Você é um especialista em detecção de entidades duplicadas. 
@router.get("/suggest-merge")
async def suggest_merge_candidates(
    limit: int = Query(default=10, le=50),
    db: Session = Depends(get_scoped_db)
):
    """
    Use LLM to find potential duplicate entities that could be merged.
    Returns pairs of entities that might be the same.

    Best-effort endpoint: on any LLM/network failure it returns an empty
    candidate list with an "error" field rather than raising.
    """
    # Imported locally so the module loads even if httpx/config are unavailable
    # in contexts that never call this endpoint.
    import httpx
    import json
    import re
    from app.config import settings

    # Cap the candidate pool at 200 entities, ordered by name so runs are
    # deterministic for the same data.
    entities = db.query(Entity).order_by(Entity.name).limit(200).all()

    if len(entities) < 2:
        return {"candidates": [], "message": "Not enough entities to compare"}

    # Build a compact JSON-ish listing for the prompt; aliases trimmed to 5
    # per entity to keep the prompt small.
    entity_list = []
    for e in entities:
        aliases = (e.properties or {}).get("aliases", [])
        entity_list.append({
            "id": e.id,
            "name": e.name,
            "type": e.type,
            "aliases": aliases[:5] if aliases else []
        })

    # Only the first 100 entries go into the prompt (token-budget guard).
    prompt = f"""Analise esta lista de entidades e encontre possíveis DUPLICATAS (mesma pessoa/organização/local com nomes diferentes).

Entidades:
{entity_list[:100]}

Retorne APENAS um JSON válido com pares de IDs que são provavelmente a mesma entidade:
```json
{{
  "duplicates": [
    {{
      "id1": "uuid1",
      "id2": "uuid2",
      "confidence": 0.95,
      "reason": "Mesmo nome com variação"
    }}
  ]
}}
```

Se não houver duplicatas, retorne: {{"duplicates": []}}
"""

    try:
        # Low temperature: we want deterministic classification, not creativity.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                "https://api.cerebras.ai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.cerebras_api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "zai-glm-4.7",
                    "messages": [
                        {"role": "system", "content": "Você é um especialista em detecção de entidades duplicadas. Responda apenas em JSON válido."},
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": 0.1,
                    "max_tokens": 1024
                }
            )

        if response.status_code != 200:
            return {"candidates": [], "error": "LLM API error"}

        data = response.json()
        content = data["choices"][0]["message"]["content"]

        # Greedy brace match extracts the JSON object even when the model
        # wraps it in a markdown fence or surrounding prose.
        json_match = re.search(r'\{.*\}', content, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group(0))

            # Resolve the model's id pairs back to real entities; pairs whose
            # ids don't resolve are silently dropped.
            candidates = []
            for dup in result.get("duplicates", [])[:limit]:
                e1 = next((e for e in entities if e.id == dup.get("id1")), None)
                e2 = next((e for e in entities if e.id == dup.get("id2")), None)
                if e1 and e2:
                    candidates.append({
                        "entity1": {"id": e1.id, "name": e1.name, "type": e1.type},
                        "entity2": {"id": e2.id, "name": e2.name, "type": e2.type},
                        "confidence": dup.get("confidence", 0.5),
                        "reason": dup.get("reason", "Possível duplicata")
                    })

            return {"candidates": candidates}

        return {"candidates": [], "message": "No duplicates found"}

    except Exception as e:
        # Best-effort: never fail the request because the LLM call failed.
        return {"candidates": [], "error": str(e)}


@router.get("/{entity_id}", response_model=EntityResponse)
def get_entity(entity_id: str, db: Session = Depends(get_scoped_db)):
    """Fetch a single entity by id; 404 if absent."""
    entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not entity:
        raise HTTPException(status_code=404, detail="Entity not found")
    return entity


@router.post("", response_model=EntityResponse, status_code=201)
def create_entity(entity: EntityCreate, db: Session = Depends(get_scoped_db)):
    """Create a new entity from the validated request payload."""
    db_entity = Entity(**entity.model_dump())
    db.add(db_entity)
    db.commit()
    db.refresh(db_entity)
    return db_entity
@router.put("/{entity_id}", response_model=EntityResponse)
def update_entity(entity_id: str, entity: EntityUpdate, db: Session = Depends(get_scoped_db)):
    """Update an existing entity.

    Only fields actually present in the payload are applied
    (exclude_unset makes this PATCH-like despite the PUT verb).
    Raises 404 if the entity does not exist.
    """
    db_entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not db_entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    update_data = entity.model_dump(exclude_unset=True)
    for field, value in update_data.items():
        setattr(db_entity, field, value)

    db.commit()
    db.refresh(db_entity)
    return db_entity


@router.delete("/{entity_id}")
def delete_entity(entity_id: str, db: Session = Depends(get_scoped_db)):
    """Delete an entity and every relationship that references it.

    Raises 404 if the entity does not exist.
    """
    db_entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not db_entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    # Remove dangling relationships first. synchronize_session=False skips
    # the in-Python 'evaluate' session-sync step, which can raise on
    # compound (or_) criteria; no in-session Relationship objects need
    # syncing here since the session is committed right after.
    db.query(Relationship).filter(
        or_(
            Relationship.source_id == entity_id,
            Relationship.target_id == entity_id
        )
    ).delete(synchronize_session=False)

    db.delete(db_entity)
    db.commit()
    return {"message": "Entity deleted"}
@router.get("/{entity_id}/connections", response_model=GraphData)
def get_entity_connections(
    entity_id: str,
    depth: int = Query(default=1, le=3),
    db: Session = Depends(get_scoped_db)
):
    """
    Return the connection graph around an entity, up to `depth` hops.
    Used by the frontend network visualization. Raises 404 for unknown ids.
    """
    entity = db.query(Entity).filter(Entity.id == entity_id).first()
    if not entity:
        raise HTTPException(status_code=404, detail="Entity not found")

    nodes = {}
    edges = []
    visited = set()

    def explore(eid: str, current_depth: int):
        # Depth-first walk; `visited` guards against cycles.
        if current_depth > depth or eid in visited:
            return
        visited.add(eid)

        e = db.query(Entity).filter(Entity.id == eid).first()
        if not e:
            return

        nodes[e.id] = GraphNode(
            id=e.id,
            type=e.type,
            name=e.name,
            properties=e.properties or {}
        )

        # Outgoing relationships
        for rel in db.query(Relationship).filter(Relationship.source_id == eid).all():
            edges.append(GraphEdge(
                source=rel.source_id,
                target=rel.target_id,
                type=rel.type,
                confidence=rel.confidence
            ))
            explore(rel.target_id, current_depth + 1)

        # Incoming relationships
        for rel in db.query(Relationship).filter(Relationship.target_id == eid).all():
            edges.append(GraphEdge(
                source=rel.source_id,
                target=rel.target_id,
                type=rel.type,
                confidence=rel.confidence
            ))
            explore(rel.source_id, current_depth + 1)

    explore(entity_id, 0)

    return GraphData(
        nodes=list(nodes.values()),
        edges=edges
    )


@router.post("/merge")
def merge_entities(
    primary_id: str,
    secondary_id: str,
    db: Session = Depends(get_scoped_db)
):
    """
    Merge two entities into one.

    The primary entity is kept, the secondary is deleted. All relationships
    from the secondary are transferred to the primary; self-loops and exact
    duplicate relationships produced by the transfer are removed.
    """
    if primary_id == secondary_id:
        raise HTTPException(status_code=400, detail="Cannot merge entity with itself")

    primary = db.query(Entity).filter(Entity.id == primary_id).first()
    secondary = db.query(Entity).filter(Entity.id == secondary_id).first()

    if not primary:
        raise HTTPException(status_code=404, detail="Primary entity not found")
    if not secondary:
        raise HTTPException(status_code=404, detail="Secondary entity not found")

    # Snapshot the secondary's fields BEFORE deleting it: once deleted and
    # committed, the ORM instance is detached and attribute access can raise.
    secondary_name = secondary.name
    secondary_source = secondary.source
    secondary_props = secondary.properties or {}

    primary_props = primary.properties or {}

    # Fold the secondary's name and aliases into the primary's alias list,
    # preserving order and skipping duplicates.
    aliases = primary_props.get("aliases", []) or []
    if secondary_name not in aliases:
        aliases.append(secondary_name)
    for alias in secondary_props.get("aliases", []) or []:
        if alias not in aliases:
            aliases.append(alias)
    primary_props["aliases"] = aliases

    # Keep an audit trail of merged-in entities.
    merge_history = primary_props.get("merged_from", []) or []
    merge_history.append({
        "id": secondary_id,
        "name": secondary_name,
        "source": secondary_source
    })
    primary_props["merged_from"] = merge_history

    # Adopt the secondary's description only if the primary has none.
    if not primary.description and secondary.description:
        primary.description = secondary.description

    # Reassign (not mutate in place) so JSON-column change detection fires.
    primary.properties = primary_props

    # Re-point all relationships from the secondary to the primary.
    db.query(Relationship).filter(
        Relationship.source_id == secondary_id
    ).update({"source_id": primary_id}, synchronize_session=False)
    db.query(Relationship).filter(
        Relationship.target_id == secondary_id
    ).update({"target_id": primary_id}, synchronize_session=False)

    # Clean up transfer artifacts: primary<->secondary links collapse into
    # self-loops, and parallel links collapse into exact duplicates
    # (same source, target, type) — keep the first of each.
    seen = set()
    for rel in db.query(Relationship).filter(
        or_(
            Relationship.source_id == primary_id,
            Relationship.target_id == primary_id
        )
    ).all():
        if rel.source_id == rel.target_id:
            db.delete(rel)
            continue
        key = (rel.source_id, rel.target_id, rel.type)
        if key in seen:
            db.delete(rel)
        else:
            seen.add(key)

    db.delete(secondary)
    db.commit()
    db.refresh(primary)

    return {
        "message": f"Merged '{secondary_name}' into '{primary.name}'",
        "primary": {
            "id": primary.id,
            "name": primary.name,
            "aliases": aliases
        }
    }
@router.get("/", response_model=List[EventResponse])
def list_events(
    type: Optional[str] = None,
    search: Optional[str] = None,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    limit: int = Query(default=50, le=200),
    offset: int = 0,
    db: Session = Depends(get_scoped_db)
):
    """List events, optionally filtered by type, free text and date window."""
    q = db.query(Event)

    if type:
        q = q.filter(Event.type == type)
    if search:
        pattern = f"%{search}%"
        q = q.filter(
            or_(
                Event.title.ilike(pattern),
                Event.description.ilike(pattern)
            )
        )
    if start_date:
        q = q.filter(Event.event_date >= start_date)
    if end_date:
        q = q.filter(Event.event_date <= end_date)

    # Newest first; undated events sort last.
    q = q.order_by(Event.event_date.desc().nullslast())
    return q.offset(offset).limit(limit).all()


@router.get("/types")
def get_event_types(db: Session = Depends(get_scoped_db)):
    """Return the distinct event types in the store."""
    return [row[0] for row in db.query(Event.type).distinct()]


@router.get("/timeline")
def get_timeline(
    entity_id: Optional[str] = None,
    limit: int = Query(default=50, le=200),
    db: Session = Depends(get_scoped_db)
):
    """
    Oldest-first list of dated events in a compact timeline shape,
    optionally scoped to events referencing one entity.
    """
    q = db.query(Event).filter(Event.event_date.isnot(None))
    if entity_id:
        q = q.filter(Event.entity_ids.contains([entity_id]))

    rows = q.order_by(Event.event_date.asc()).limit(limit).all()
    return [
        {
            "id": ev.id,
            "title": ev.title,
            "date": ev.event_date.isoformat() if ev.event_date else None,
            "type": ev.type,
            "location": ev.location_name
        }
        for ev in rows
    ]


@router.get("/{event_id}", response_model=EventResponse)
def get_event(event_id: str, db: Session = Depends(get_scoped_db)):
    """Fetch a single event by id; 404 if absent."""
    found = db.query(Event).filter(Event.id == event_id).first()
    if found is None:
        raise HTTPException(status_code=404, detail="Event not found")
    return found


@router.post("/", response_model=EventResponse, status_code=201)
def create_event(event: EventCreate, db: Session = Depends(get_scoped_db)):
    """Persist a new event from the request payload."""
    record = Event(**event.model_dump())
    db.add(record)
    db.commit()
    db.refresh(record)
    return record


@router.delete("/{event_id}")
def delete_event(event_id: str, db: Session = Depends(get_scoped_db)):
    """Delete an event by id; 404 if absent."""
    record = db.query(Event).filter(Event.id == event_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Event not found")

    db.delete(record)
    db.commit()
    return {"message": "Event deleted"}
@router.get("")
async def get_graph(
    entity_type: Optional[str] = Query(None, description="Filter by entity type"),
    limit: int = Query(100, le=500, description="Maximum number of entities"),
    db: Session = Depends(get_scoped_db)
):
    """
    Whole-graph snapshot for visualization: nodes (entities) and edges
    (relationships), shaped for Cytoscape.js.
    """
    try:
        entity_query = db.query(Entity)
        if entity_type:
            entity_query = entity_query.filter(Entity.type == entity_type)

        selected = entity_query.limit(limit).all()
        id_set = {ent.id for ent in selected}

        # Fetch any relationship touching a selected entity...
        rels = db.query(Relationship).filter(
            or_(
                Relationship.source_id.in_(id_set),
                Relationship.target_id.in_(id_set)
            )
        ).all()

        nodes = [
            {
                "data": {
                    "id": ent.id,
                    # Truncate long names for display; fullName keeps the original.
                    "label": ent.name if len(ent.name) <= 30 else ent.name[:30] + "...",
                    "fullName": ent.name,
                    "type": ent.type,
                    "description": ent.description[:100] if ent.description else "",
                    "source": ent.source or "unknown"
                }
            }
            for ent in selected
        ]

        # ...but only keep edges with BOTH endpoints in the node set,
        # so the frontend never sees a dangling edge.
        edges = [
            {
                "data": {
                    "id": rel.id,
                    "source": rel.source_id,
                    "target": rel.target_id,
                    "label": rel.type,
                    "type": rel.type
                }
            }
            for rel in rels
            if rel.source_id in id_set and rel.target_id in id_set
        ]

        return {
            "nodes": nodes,
            "edges": edges,
            "stats": {
                "total_nodes": len(nodes),
                "total_edges": len(edges)
            }
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get graph: {str(e)}")
e.name[:30] + "..." if len(e.name) > 30 else e.name, + "fullName": e.name, + "type": e.type, + "description": e.description[:100] if e.description else "", + "source": e.source or "unknown", + "isCentral": e.id == entity_id + } + }) + + edges = [] + for r in relationships: + edges.append({ + "data": { + "id": r.id, + "source": r.source_id, + "target": r.target_id, + "label": r.type, + "type": r.type + } + }) + + return { + "central": { + "id": central.id, + "name": central.name, + "type": central.type + }, + "nodes": nodes, + "edges": edges, + "stats": { + "total_nodes": len(nodes), + "total_edges": len(edges), + "depth": depth + } + } + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to get entity graph: {str(e)}") + diff --git a/app/api/routes/ingest.py b/app/api/routes/ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..d2216481b8ad615180d4b4bfad5d7f24e453774a --- /dev/null +++ b/app/api/routes/ingest.py @@ -0,0 +1,341 @@ +""" +Data Ingestion Routes +Endpoints para importar dados de fontes externas +""" +from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks +from sqlalchemy.orm import Session +from typing import Optional, List +from datetime import datetime +import asyncio + +from app.api.deps import get_scoped_db +from app.models import Entity, Document, Relationship +from app.schemas import EntityResponse, DocumentResponse +from app.services.ingestion import wikipedia_scraper, news_service +from app.services.nlp import entity_extractor +from app.services.geocoding import geocode + +router = APIRouter(prefix="/ingest", tags=["Data Ingestion"]) + + +def parse_event_date(date_str): + """Parse date string to datetime object""" + if not date_str: + return None + try: + # Try YYYY-MM-DD format + return datetime.strptime(date_str[:10], "%Y-%m-%d") + except: + try: + # Try YYYY format + return datetime.strptime(date_str[:4], "%Y") + except: + return None + + +# 
# ========== Wikipedia ==========

@router.get("/wikipedia/search")
def search_wikipedia(q: str, limit: int = 10):
    """Search Wikipedia articles matching `q`; returns raw scraper results."""
    results = wikipedia_scraper.search(q, limit)
    return results


@router.post("/wikipedia/entity", response_model=EntityResponse)
async def import_from_wikipedia(
    title: str,
    entity_type: str = "person",
    project_id: Optional[str] = None,
    auto_extract: bool = True,
    db: Session = Depends(get_scoped_db)
):
    """
    Import an entity from Wikipedia.

    entity_type: person, organization, location
    project_id: project to associate the entity with
    auto_extract: if True, run LLM NER on the article text to create
    related entities and relationships as a side effect.

    Idempotent on (title, source="wikipedia"): returns the existing
    entity if one was already imported.
    """
    # Check if entity already exists
    existing = db.query(Entity).filter(
        Entity.name == title,
        Entity.source == "wikipedia"
    ).first()

    if existing:
        return existing

    # Scrape based on type; unknown types fall back to the person scraper.
    if entity_type == "person":
        data = wikipedia_scraper.scrape_person(title)
    elif entity_type == "organization":
        data = wikipedia_scraper.scrape_organization(title)
    elif entity_type == "location":
        data = wikipedia_scraper.scrape_location(title)
    else:
        data = wikipedia_scraper.scrape_person(title)  # default

    if not data:
        raise HTTPException(status_code=404, detail="Article not found on Wikipedia")

    # Create and persist the main entity first, so it exists even if the
    # optional extraction step below fails.
    entity = Entity(**data)
    entity.project_id = project_id
    db.add(entity)
    db.commit()
    db.refresh(entity)

    # Auto-extract entities and relationships using LLM (best-effort).
    if auto_extract and data.get("description"):
        try:
            # Limit text to avoid token limits
            text_to_analyze = data["description"][:3000]
            result = await entity_extractor.extract(text_to_analyze)

            # Create extracted entities, reusing existing ones when a
            # loose name match is found.
            created_entities = {}
            for ext_entity in result.entities:
                # Skip if same as main entity
                if ext_entity.name.lower() == title.lower():
                    created_entities[ext_entity.name] = entity
                    continue

                # Check if entity exists (by similar name).
                # NOTE(review): ilike "%name%" is a substring match — may
                # attach to a different entity with a containing name.
                existing_ent = db.query(Entity).filter(
                    Entity.name.ilike(f"%{ext_entity.name}%")
                ).first()

                if existing_ent:
                    created_entities[ext_entity.name] = existing_ent
                else:
                    # Get coordinates for location entities
                    lat, lng = None, None
                    if ext_entity.type == "location":
                        coords = await geocode(ext_entity.name)
                        if coords:
                            lat, lng = coords

                    # event_date is optional on extracted entities.
                    event_date = parse_event_date(getattr(ext_entity, 'event_date', None))

                    new_ent = Entity(
                        name=ext_entity.name,
                        # Constrain to known types; anything else becomes "person".
                        type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person",
                        description=ext_entity.description or ext_entity.role,
                        source="wikipedia_extraction",
                        latitude=lat,
                        longitude=lng,
                        event_date=event_date,
                        project_id=project_id,
                        properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "extracted_from": title}
                    )
                    db.add(new_ent)
                    # Commit per entity so ids are available for relationships.
                    db.commit()
                    db.refresh(new_ent)
                    created_entities[ext_entity.name] = new_ent

            # Create relationships, resolving endpoints first against the
            # entities created above, then by loose name match in the DB.
            for rel in result.relationships:
                source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first()
                target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first()

                if source_ent and target_ent and source_ent.id != target_ent.id:
                    # Check if relationship exists (dedup on source/target/type).
                    existing_rel = db.query(Relationship).filter(
                        Relationship.source_id == source_ent.id,
                        Relationship.target_id == target_ent.id,
                        Relationship.type == rel.relationship_type
                    ).first()

                    if not existing_rel:
                        # Parse relationship event_date
                        rel_event_date = parse_event_date(getattr(rel, 'event_date', None))

                        new_rel = Relationship(
                            source_id=source_ent.id,
                            target_id=target_ent.id,
                            type=rel.relationship_type,
                            event_date=rel_event_date,
                            properties={"context": rel.context, "extracted_from": title}
                        )
                        db.add(new_rel)

            db.commit()

        except Exception as e:
            print(f"NER extraction error: {e}")
            # Continue without extraction if it fails

    return entity
error: {e}") + # Continue without extraction if it fails + + return entity + + +# ========== News ========== + +@router.get("/news/feeds") +def list_available_feeds(): + """Lista os feeds de notícias disponíveis""" + return list(news_service.RSS_FEEDS.keys()) + + +@router.get("/news/fetch") +def fetch_news(feed: Optional[str] = None): + """ + Busca notícias dos feeds RSS + Se feed não for especificado, busca de todos + """ + if feed: + if feed not in news_service.RSS_FEEDS: + raise HTTPException(status_code=404, detail="Feed not found") + url = news_service.RSS_FEEDS[feed] + articles = news_service.fetch_feed(url) + else: + articles = news_service.fetch_all_feeds() + + return articles + + +@router.get("/news/search") +def search_news(q: str): + """Busca notícias por palavra-chave via Google News""" + return news_service.search_news(q) + + +@router.post("/news/import") +async def import_news( + query: Optional[str] = None, + feed: Optional[str] = None, + auto_extract: bool = True, + db: Session = Depends(get_scoped_db) +): + """ + Importa notícias como documentos no sistema + auto_extract: Se True, usa LLM para extrair entidades de cada notícia + """ + if query: + articles = news_service.search_news(query) + elif feed: + if feed not in news_service.RSS_FEEDS: + raise HTTPException(status_code=404, detail="Feed not found") + articles = news_service.fetch_feed(news_service.RSS_FEEDS[feed]) + else: + articles = news_service.fetch_all_feeds() + + imported = 0 + extracted_entities = 0 + + for article in articles: + # Check if document already exists (by URL) + if article.get("url"): + existing = db.query(Document).filter( + Document.source_url == article["url"] + ).first() + if existing: + continue + + doc_data = news_service.to_document(article) + doc = Document(**doc_data) + db.add(doc) + db.commit() + imported += 1 + + # Extract entities from article content + if auto_extract: + try: + text_to_analyze = f"{article.get('title', '')} {article.get('description', 
'')}".strip() + if len(text_to_analyze) >= 20: + result = await entity_extractor.extract(text_to_analyze[:2000]) + + created_entities = {} + for ext_entity in result.entities: + # Check if entity exists + existing_ent = db.query(Entity).filter( + Entity.name.ilike(f"%{ext_entity.name}%") + ).first() + + if existing_ent: + created_entities[ext_entity.name] = existing_ent + else: + # Get coordinates for location entities + lat, lng = None, None + if ext_entity.type == "location": + coords = await geocode(ext_entity.name) + if coords: + lat, lng = coords + + new_ent = Entity( + name=ext_entity.name, + type=ext_entity.type if ext_entity.type in ["person", "organization", "location", "event"] else "person", + description=ext_entity.description or ext_entity.role, + source="news_extraction", + latitude=lat, + longitude=lng, + properties={"role": ext_entity.role, "aliases": ext_entity.aliases, "from_article": article.get('title', '')} + ) + db.add(new_ent) + db.commit() + db.refresh(new_ent) + created_entities[ext_entity.name] = new_ent + extracted_entities += 1 + + # Create relationships + for rel in result.relationships: + source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first() + target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first() + + if source_ent and target_ent and source_ent.id != target_ent.id: + existing_rel = db.query(Relationship).filter( + Relationship.source_id == source_ent.id, + Relationship.target_id == target_ent.id, + Relationship.type == rel.relationship_type + ).first() + + if not existing_rel: + new_rel = Relationship( + source_id=source_ent.id, + target_id=target_ent.id, + type=rel.relationship_type, + properties={"context": rel.context} + ) + db.add(new_rel) + + db.commit() + + except Exception as e: + print(f"NER extraction error for article: {e}") + # Continue without extraction + + return { + "message": f"Imported 
{imported} articles", + "total_found": len(articles), + "extracted_entities": extracted_entities + } + + +# ========== Manual Import ========== + +@router.post("/bulk/entities") +def bulk_import_entities( + entities: List[dict], + db: Session = Depends(get_scoped_db) +): + """ + Importa múltiplas entidades de uma vez + Útil para importar de CSV/JSON + """ + imported = 0 + for entity_data in entities: + entity = Entity( + type=entity_data.get("type", "unknown"), + name=entity_data.get("name", "Unnamed"), + description=entity_data.get("description"), + properties=entity_data.get("properties", {}), + latitude=entity_data.get("latitude"), + longitude=entity_data.get("longitude"), + source=entity_data.get("source", "manual") + ) + db.add(entity) + imported += 1 + + db.commit() + + return {"message": f"Imported {imported} entities"} diff --git a/app/api/routes/investigate.py b/app/api/routes/investigate.py new file mode 100644 index 0000000000000000000000000000000000000000..646857df8ac0eed0f99ac443367d25c7a6af1512 --- /dev/null +++ b/app/api/routes/investigate.py @@ -0,0 +1,207 @@ +""" +Investigation API Routes - Build dossiers on companies and people +""" +from fastapi import APIRouter, HTTPException, Depends +from pydantic import BaseModel, Field +from typing import Optional, List, Dict, Any +from sqlalchemy.orm import Session + +from app.services.investigation import ( + investigar_empresa, + investigar_pessoa, + dossier_to_dict +) +from app.services.brazil_apis import consultar_cnpj +from app.services.investigator_agent import investigator_agent +from app.api.deps import get_scoped_db + + +router = APIRouter(prefix="/investigate", tags=["Investigation"]) + + +class InvestigateCompanyRequest(BaseModel): + """Request to investigate a company""" + cnpj: str = Field(..., min_length=11, description="CNPJ da empresa") + + +class InvestigatePersonRequest(BaseModel): + """Request to investigate a person""" + nome: str = Field(..., min_length=2, description="Nome da pessoa") 
class DossierResponse(BaseModel):
    """Dossier response"""
    tipo: str                       # dossier kind: company or person
    alvo: str                       # investigation target (name)
    cnpj_cpf: Optional[str]         # tax id when known
    red_flags: List[str]            # human-readable warnings found
    score_risco: int                # aggregated risk score
    data_geracao: str               # generation timestamp (string)
    fonte_dados: List[str]          # data sources consulted
    secoes: Dict[str, Any]          # free-form dossier sections


class CNPJResponse(BaseModel):
    """Quick CNPJ lookup response"""
    cnpj: str
    razao_social: str
    nome_fantasia: str
    situacao: str
    data_abertura: str
    capital_social: float
    endereco: str                   # pre-formatted single-line address
    telefone: str
    email: str
    atividade: str                  # main CNAE code + description
    socios: List[Dict[str, Any]]    # partners/owners


@router.post("/company", response_model=DossierResponse)
async def investigate_company(request: InvestigateCompanyRequest):
    """
    Build a comprehensive dossier on a company.

    Collects:
    - Cadastral data from CNPJ
    - Partners/owners
    - Sanctions (CEIS, CNEP, CEPIM)
    - News and media mentions
    - Related entities

    Returns risk score and red flags.
    """
    try:
        dossier = await investigar_empresa(request.cnpj)
        return DossierResponse(**dossier_to_dict(dossier))

    except Exception as e:
        # Surface any service failure as a 500 with the original message.
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/person", response_model=DossierResponse)
async def investigate_person(request: InvestigatePersonRequest):
    """
    Build a dossier on a person.

    Note: Due to LGPD, personal data is limited.
    Mainly uses web search for public information.
    """
    try:
        dossier = await investigar_pessoa(request.nome, request.cpf)
        return DossierResponse(**dossier_to_dict(dossier))

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/cnpj/{cnpj}", response_model=CNPJResponse)
async def lookup_cnpj(cnpj: str):
    """
    Quick CNPJ lookup - returns basic company data.

    404 when the CNPJ is unknown; other service failures map to 500.
    """
    try:
        data = await consultar_cnpj(cnpj)

        if not data:
            raise HTTPException(status_code=404, detail="CNPJ não encontrado")

        return CNPJResponse(
            cnpj=data.cnpj,
            razao_social=data.razao_social,
            nome_fantasia=data.nome_fantasia,
            situacao=data.situacao,
            data_abertura=data.data_abertura,
            capital_social=data.capital_social,
            # Flatten the address components into one display string.
            endereco=f"{data.logradouro}, {data.numero} - {data.bairro}, {data.cidade}/{data.uf}",
            telefone=data.telefone,
            email=data.email,
            atividade=f"{data.cnae_principal} - {data.cnae_descricao}",
            socios=data.socios
        )

    except HTTPException:
        # Let the deliberate 404 above pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
class ProjectCreate(BaseModel):
    """Payload for creating or updating a project workspace."""
    name: str
    description: Optional[str] = None
    color: str = "#00d4ff"  # accent color shown in the UI
    icon: str = "folder"


class ProjectResponse(BaseModel):
    """Project plus a derived count of entities assigned to it."""
    id: str
    name: str
    description: Optional[str]
    color: str
    icon: str
    entity_count: int = 0
    created_at: datetime

    class Config:
        from_attributes = True


@router.get("", response_model=List[ProjectResponse])
def list_projects(db: Session = Depends(get_scoped_db)):
    """List all projects (newest first) with their entity counts."""
    projects = db.query(Project).order_by(Project.created_at.desc()).all()

    result = []
    for p in projects:
        # One count query per project; fine at workspace scale.
        entity_count = db.query(Entity).filter(Entity.project_id == p.id).count()
        result.append(ProjectResponse(
            id=p.id,
            name=p.name,
            description=p.description,
            color=p.color,
            icon=p.icon,
            entity_count=entity_count,
            created_at=p.created_at
        ))

    return result


@router.post("", response_model=ProjectResponse)
def create_project(project: ProjectCreate, db: Session = Depends(get_scoped_db)):
    """Create a new project workspace."""
    new_project = Project(
        name=project.name,
        description=project.description,
        color=project.color,
        icon=project.icon
    )
    db.add(new_project)
    db.commit()
    db.refresh(new_project)

    return ProjectResponse(
        id=new_project.id,
        name=new_project.name,
        description=new_project.description,
        color=new_project.color,
        icon=new_project.icon,
        entity_count=0,
        created_at=new_project.created_at
    )


@router.get("/{project_id}", response_model=ProjectResponse)
def get_project(project_id: str, db: Session = Depends(get_scoped_db)):
    """Fetch a project by id with its entity count; 404 if absent."""
    project = db.query(Project).filter(Project.id == project_id).first()

    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    entity_count = db.query(Entity).filter(Entity.project_id == project_id).count()

    return ProjectResponse(
        id=project.id,
        name=project.name,
        description=project.description,
        color=project.color,
        icon=project.icon,
        entity_count=entity_count,
        created_at=project.created_at
    )


@router.delete("/{project_id}")
def delete_project(project_id: str, db: Session = Depends(get_scoped_db)):
    """Delete a project; its entities/relationships are detached, not deleted."""
    project = db.query(Project).filter(Project.id == project_id).first()

    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    # Capture the name BEFORE deleting: after db.delete() + commit the ORM
    # instance is detached and attribute access can raise.
    project_name = project.name

    # Orphan (don't delete) the project's entities and relationships.
    db.query(Entity).filter(Entity.project_id == project_id).update({"project_id": None})
    db.query(Relationship).filter(Relationship.project_id == project_id).update({"project_id": None})

    db.delete(project)
    db.commit()

    return {"message": f"Project '{project_name}' deleted"}


@router.put("/{project_id}")
def update_project(project_id: str, project: ProjectCreate, db: Session = Depends(get_scoped_db)):
    """Replace a project's editable fields; 404 if absent."""
    existing = db.query(Project).filter(Project.id == project_id).first()

    if not existing:
        raise HTTPException(status_code=404, detail="Project not found")

    existing.name = project.name
    existing.description = project.description
    existing.color = project.color
    existing.icon = project.icon
    db.commit()

    return {"message": "Project updated"}
query.filter(Relationship.target_id == target_id) + + return query.limit(limit).all() + + +@router.get("/types") +def get_relationship_types(db: Session = Depends(get_scoped_db)): + """Retorna todos os tipos de relacionamento unicos""" + types = db.query(Relationship.type).distinct().all() + return [t[0] for t in types] + + +@router.post("/", response_model=RelationshipResponse, status_code=201) +def create_relationship( + rel: RelationshipCreate, + db: Session = Depends(get_scoped_db) +): + """Cria um novo relacionamento entre entidades""" + source = db.query(Entity).filter(Entity.id == rel.source_id).first() + target = db.query(Entity).filter(Entity.id == rel.target_id).first() + + if not source: + raise HTTPException(status_code=404, detail="Source entity not found") + if not target: + raise HTTPException(status_code=404, detail="Target entity not found") + + db_rel = Relationship(**rel.model_dump()) + db.add(db_rel) + db.commit() + db.refresh(db_rel) + return db_rel + + +@router.delete("/{relationship_id}") +def delete_relationship( + relationship_id: str, + db: Session = Depends(get_scoped_db) +): + """Deleta um relacionamento""" + db_rel = db.query(Relationship).filter(Relationship.id == relationship_id).first() + if not db_rel: + raise HTTPException(status_code=404, detail="Relationship not found") + + db.delete(db_rel) + db.commit() + return {"message": "Relationship deleted"} diff --git a/app/api/routes/research.py b/app/api/routes/research.py new file mode 100644 index 0000000000000000000000000000000000000000..41eb6efdb31bbc7cb0da78df28ce780a75fc0f9b --- /dev/null +++ b/app/api/routes/research.py @@ -0,0 +1,158 @@ +""" +Research API Routes - Deep research with automatic entity extraction +""" +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel, Field +from typing import Optional, List +import traceback +from sqlalchemy.orm import Session + +from app.api.deps import get_scoped_db +from app.services import lancer +from 
app.services.nlp import entity_extractor +from app.services.geocoding import geocode +from app.models.entity import Entity, Relationship + + +router = APIRouter(prefix="/research", tags=["Research"]) + + +class ResearchRequest(BaseModel): + """Request model for research""" + query: str = Field(..., min_length=3, description="Research query") + mode: str = Field(default="search", description="Research mode: search, deep, heavy") + max_results: int = Field(default=10, le=20) + auto_extract: bool = Field(default=True, description="Auto-extract entities using NER") + + +class ResearchResponse(BaseModel): + """Response model for research""" + query: str + answer: Optional[str] + sources: List[dict] + citations: List[dict] + extracted_entities: int + extracted_relationships: int + processing_time_ms: float + + +@router.post("", response_model=ResearchResponse) +async def research(request: ResearchRequest, db: Session = Depends(get_scoped_db)): + """ + Perform AI-powered research using Lancer API and optionally extract entities. 
+ + Modes: + - search: Fast search with AI synthesis + - deep: Multi-dimensional deep research (slower, more comprehensive) + - heavy: Search with full content scraping + """ + try: + # Call Lancer API based on mode + if request.mode == "deep": + result = await lancer.deep_research(request.query) + elif request.mode == "heavy": + result = await lancer.heavy_search(request.query, request.max_results) + else: + result = await lancer.search(request.query, request.max_results) + + extracted_entities = 0 + extracted_relationships = 0 + + # Extract entities if enabled + if request.auto_extract and result.raw_text: + try: + # Limit text to avoid token limits + text_to_analyze = result.raw_text[:5000] + ner_result = await entity_extractor.extract(text_to_analyze) + + created_entities = {} + + # Create entities + for entity in ner_result.entities: + # Check if exists + existing = db.query(Entity).filter( + Entity.name.ilike(f"%{entity.name}%") + ).first() + + if existing: + created_entities[entity.name] = existing + else: + # Geocode if location + lat, lng = None, None + if entity.type == "location": + coords = await geocode(entity.name) + if coords: + lat, lng = coords + + new_entity = Entity( + name=entity.name, + type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person", + description=entity.description or entity.role or "", + source="lancer_research", + latitude=lat, + longitude=lng, + properties={ + "role": entity.role, + "aliases": entity.aliases, + "research_query": request.query + } + ) + db.add(new_entity) + db.commit() + db.refresh(new_entity) + created_entities[entity.name] = new_entity + extracted_entities += 1 + + # Create relationships + for rel in ner_result.relationships: + source_ent = created_entities.get(rel.source) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.source}%")).first() + target_ent = created_entities.get(rel.target) or db.query(Entity).filter(Entity.name.ilike(f"%{rel.target}%")).first() + + if 
source_ent and target_ent and source_ent.id != target_ent.id: + existing_rel = db.query(Relationship).filter( + Relationship.source_id == source_ent.id, + Relationship.target_id == target_ent.id, + Relationship.type == rel.relationship_type + ).first() + + if not existing_rel: + new_rel = Relationship( + source_id=source_ent.id, + target_id=target_ent.id, + type=rel.relationship_type, + properties={"context": rel.context, "research_query": request.query} + ) + db.add(new_rel) + extracted_relationships += 1 + + db.commit() + + except Exception as e: + print(f"NER extraction error: {e}") + traceback.print_exc() + + # Prepare sources for response + sources = [ + { + "title": r.title, + "url": r.url, + "content": r.content[:300] if r.content else "", + "score": r.score + } + for r in result.results[:10] + ] + + return ResearchResponse( + query=result.query, + answer=result.answer, + sources=sources, + citations=result.citations, + extracted_entities=extracted_entities, + extracted_relationships=extracted_relationships, + processing_time_ms=result.processing_time_ms + ) + + except Exception as e: + print(f"Research error: {e}") + traceback.print_exc() + raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/api/routes/search.py b/app/api/routes/search.py new file mode 100644 index 0000000000000000000000000000000000000000..27ad925fb6abc0eb121ff2660bc06fd55fd322f0 --- /dev/null +++ b/app/api/routes/search.py @@ -0,0 +1,126 @@ +""" +Search and Analytics Routes +""" +from fastapi import APIRouter, Depends, Query +from sqlalchemy.orm import Session +from sqlalchemy import or_, func +from typing import Optional + +from app.api.deps import get_scoped_db +from app.models import Entity, Relationship, Event, Document +from app.schemas import SearchResult, SystemStats + +router = APIRouter(prefix="/search", tags=["Search"]) + + +@router.get("", response_model=SearchResult) +def global_search( + q: str = Query(..., min_length=2, description="Search query"), + types: 
Optional[str] = Query(None, description="Entity types (comma-separated)"), + limit: int = Query(default=20, le=100), + db: Session = Depends(get_scoped_db) +): + """ + Busca global em todas as entidades, eventos e documentos. + """ + search_term = f"%{q}%" + type_filter = types.split(",") if types else None + + entity_query = db.query(Entity).filter( + or_( + Entity.name.ilike(search_term), + Entity.description.ilike(search_term) + ) + ) + if type_filter: + entity_query = entity_query.filter(Entity.type.in_(type_filter)) + entities = entity_query.limit(limit).all() + + events = db.query(Event).filter( + or_( + Event.title.ilike(search_term), + Event.description.ilike(search_term) + ) + ).limit(limit).all() + + documents = db.query(Document).filter( + or_( + Document.title.ilike(search_term), + Document.content.ilike(search_term) + ) + ).limit(limit).all() + + return SearchResult( + entities=entities, + events=events, + documents=documents + ) + + +@router.get("/stats", response_model=SystemStats) +def get_system_stats(db: Session = Depends(get_scoped_db)): + """ + Retorna estatisticas gerais do sistema. 
+ """ + total_entities = db.query(Entity).count() + total_relationships = db.query(Relationship).count() + total_events = db.query(Event).count() + total_documents = db.query(Document).count() + + type_counts = db.query( + Entity.type, + func.count(Entity.id) + ).group_by(Entity.type).all() + + entities_by_type = {t: c for t, c in type_counts} + + recent = db.query(Entity).order_by(Entity.created_at.desc()).limit(10).all() + recent_activity = [ + { + "id": e.id, + "type": e.type, + "name": e.name, + "created_at": e.created_at.isoformat() + } + for e in recent + ] + + return SystemStats( + total_entities=total_entities, + total_relationships=total_relationships, + total_events=total_events, + total_documents=total_documents, + entities_by_type=entities_by_type, + recent_activity=recent_activity + ) + + +@router.get("/geo") +def get_geo_data( + entity_type: Optional[str] = None, + db: Session = Depends(get_scoped_db) +): + """ + Retorna entidades com geolocalizacao. + """ + query = db.query(Entity).filter( + Entity.latitude.isnot(None), + Entity.longitude.isnot(None) + ) + + if entity_type: + query = query.filter(Entity.type == entity_type) + + entities = query.all() + + return [ + { + "id": e.id, + "type": e.type, + "name": e.name, + "lat": e.latitude, + "lng": e.longitude, + "properties": e.properties + } + for e in entities + ] diff --git a/app/api/routes/session.py b/app/api/routes/session.py new file mode 100644 index 0000000000000000000000000000000000000000..c81ea29a5363a585f72aa5f0df7e2798292c189e --- /dev/null +++ b/app/api/routes/session.py @@ -0,0 +1,44 @@ +""" +Session management routes +""" +from fastapi import APIRouter, Header, Cookie, Response, Request +from typing import Optional +import uuid + +from app.core.database import create_new_session_id +from app.config import settings + +router = APIRouter(prefix="/session", tags=["Session"]) + + +@router.post("/create") +def create_session(response: Response, request: Request): + """Create a new session 
and return session_id""" + session_id = create_new_session_id() + secure = settings.cookie_secure + samesite = settings.cookie_samesite + proto = request.headers.get("x-forwarded-proto", request.url.scheme) + if proto != "https" and secure: + secure = False + samesite = "lax" + response.set_cookie( + key="numidium_session", + value=session_id, + max_age=60*60*24*365, # 1 year + httponly=True, + samesite=samesite, + secure=secure + ) + return {"session_id": session_id} + + +@router.get("/current") +def get_current_session( + numidium_session: Optional[str] = Cookie(None), + x_session_id: Optional[str] = Header(None) +): + """Get current session ID""" + session_id = x_session_id or numidium_session + if not session_id: + return {"session_id": None, "message": "No session. Call POST /session/create"} + return {"session_id": session_id} diff --git a/app/api/routes/timeline.py b/app/api/routes/timeline.py new file mode 100644 index 0000000000000000000000000000000000000000..fa45453faf038d34277ffb6a5f1481a2748a8b0e --- /dev/null +++ b/app/api/routes/timeline.py @@ -0,0 +1,165 @@ +""" +Timeline API Routes - Temporal view of entities and relationships +""" +from fastapi import APIRouter, Depends, Query +from pydantic import BaseModel +from typing import Optional, List, Dict, Any +from datetime import datetime, timedelta +from collections import defaultdict +from sqlalchemy.orm import Session + +from app.api.deps import get_scoped_db +from app.models.entity import Entity, Relationship + + +router = APIRouter(prefix="/timeline", tags=["Timeline"]) + + +class TimelineEvent(BaseModel): + id: str + type: str # "entity" or "relationship" + entity_type: Optional[str] = None + name: str + description: Optional[str] = None + date: str + icon: str + + +class TimelineGroup(BaseModel): + date: str + label: str + events: List[TimelineEvent] + + +class TimelineResponse(BaseModel): + groups: List[TimelineGroup] + total_events: int + + +@router.get("", response_model=TimelineResponse) 
+async def get_timeline( + days: int = Query(default=30, ge=1, le=365), + entity_type: Optional[str] = None, + limit: int = Query(default=100, ge=1, le=500), + db: Session = Depends(get_scoped_db) +): + """ + Get timeline of recent entities and relationships. + Groups events by date. + """ + # Calculate date range + end_date = datetime.now() + start_date = end_date - timedelta(days=days) + + events = [] + + # Get entities + query = db.query(Entity).filter( + Entity.created_at >= start_date + ) + + if entity_type: + query = query.filter(Entity.type == entity_type) + + entities = query.order_by(Entity.created_at.desc()).limit(limit).all() + + icon_map = { + "person": "👤", + "organization": "🏢", + "location": "📍", + "event": "📅", + "concept": "💡", + "product": "📦" + } + + for e in entities: + # Prefer event_date over created_at + date = e.event_date if e.event_date else e.created_at + events.append(TimelineEvent( + id=e.id, + type="entity", + entity_type=e.type, + name=e.name, + description=e.description[:100] if e.description else None, + date=date.isoformat() if date else datetime.now().isoformat(), + icon=icon_map.get(e.type, "📄") + )) + + # Get relationships + relationships = db.query(Relationship).filter( + Relationship.created_at >= start_date + ).order_by(Relationship.created_at.desc()).limit(limit // 2).all() + + for r in relationships: + source = db.query(Entity).filter(Entity.id == r.source_id).first() + target = db.query(Entity).filter(Entity.id == r.target_id).first() + + if source and target: + # Prefer event_date over created_at + date = r.event_date if r.event_date else r.created_at + events.append(TimelineEvent( + id=r.id, + type="relationship", + name=f"{source.name} → {target.name}", + description=r.type, + date=date.isoformat() if date else datetime.now().isoformat(), + icon="🔗" + )) + + # Sort by date + events.sort(key=lambda x: x.date, reverse=True) + + # Group by date + groups_dict = defaultdict(list) + for event in events: + date_key = 
event.date[:10] # YYYY-MM-DD + groups_dict[date_key].append(event) + + # Format groups + groups = [] + for date_key in sorted(groups_dict.keys(), reverse=True): + try: + dt = datetime.fromisoformat(date_key) + label = dt.strftime("%d %b %Y") + except: + label = date_key + + groups.append(TimelineGroup( + date=date_key, + label=label, + events=groups_dict[date_key] + )) + + return TimelineResponse( + groups=groups, + total_events=len(events) + ) + + +@router.get("/stats") +async def get_timeline_stats(db: Session = Depends(get_scoped_db)): + """Get statistics for timeline visualization""" + + # Count entities by type + entity_counts = {} + for entity_type in ["person", "organization", "location", "event", "concept"]: + count = db.query(Entity).filter(Entity.type == entity_type).count() + entity_counts[entity_type] = count + + # Count relationships + relationship_count = db.query(Relationship).count() + + # Recent activity (last 7 days) + week_ago = datetime.now() - timedelta(days=7) + recent_entities = db.query(Entity).filter(Entity.created_at >= week_ago).count() + recent_relationships = db.query(Relationship).filter(Relationship.created_at >= week_ago).count() + + return { + "entity_counts": entity_counts, + "relationship_count": relationship_count, + "recent_activity": { + "entities": recent_entities, + "relationships": recent_relationships, + "total": recent_entities + recent_relationships + } + } diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000000000000000000000000000000000000..23f3497193305642c7ae08e7c907009e44c489f0 --- /dev/null +++ b/app/config.py @@ -0,0 +1,47 @@ +""" +Numidium Backend Configuration +""" +from pydantic_settings import BaseSettings +from functools import lru_cache +import os + + +class Settings(BaseSettings): + """Application settings""" + + # App Info + app_name: str = "Numidium" + app_version: str = "0.1.0" + debug: bool = False + + # Database + database_url: str = "sqlite:///./data/numidium.db" + + # APIs 
(opcional - pode configurar depois) + newsapi_key: str = "" + + # Cerebras API for LLM-based entity extraction + cerebras_api_key: str = "" + + # AetherMap API for semantic search and NER + aethermap_url: str = "https://madras1-aethermap.hf.space" + + # CORS + cors_origins: list[str] = ["*"] + + # Session cookie + cookie_secure: bool = True + cookie_samesite: str = "none" + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + + +@lru_cache() +def get_settings() -> Settings: + """Get cached settings""" + return Settings() + + +settings = get_settings() diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0e8825ce5959f9f016f6f5ed46c2a54fdd15d9e8 --- /dev/null +++ b/app/core/__init__.py @@ -0,0 +1,2 @@ +# Core module +from app.core.database import get_db, init_db, Base diff --git a/app/core/__pycache__/__init__.cpython-311.pyc b/app/core/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5dc2c47dff4d25a449c31d5b491838968bd8699 Binary files /dev/null and b/app/core/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/core/__pycache__/database.cpython-311.pyc b/app/core/__pycache__/database.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d66b606dc407b3d70e7e6b1d62893eb13ff9d42 Binary files /dev/null and b/app/core/__pycache__/database.cpython-311.pyc differ diff --git a/app/core/database.py b/app/core/database.py new file mode 100644 index 0000000000000000000000000000000000000000..6fbd7f1d970d02b46df83e466a09287bfc0090be --- /dev/null +++ b/app/core/database.py @@ -0,0 +1,115 @@ +""" +Database configuration and session management +Per-session databases - each user session gets its own SQLite file +""" +from sqlalchemy import create_engine, text +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker, Session +from typing import Optional 
+import os +import uuid + +# Ensure data directory exists +os.makedirs("data/sessions", exist_ok=True) + +# Base class for models +Base = declarative_base() + +# Cache for session engines +_session_engines = {} +_session_makers = {} + + +def get_session_engine(session_id: str): + """Get or create engine for a specific session""" + if session_id not in _session_engines: + db_path = f"data/sessions/{session_id}.db" + engine = create_engine( + f"sqlite:///./{db_path}", + connect_args={"check_same_thread": False} + ) + _session_engines[session_id] = engine + _session_makers[session_id] = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + # Initialize tables for this session + Base.metadata.create_all(bind=engine) + _run_migrations(engine) + + return _session_engines[session_id] + + +def get_session_db(session_id: str): + """Get database session for a specific user session""" + get_session_engine(session_id) # Ensure engine exists + SessionLocal = _session_makers[session_id] + db = SessionLocal() + try: + yield db + finally: + db.close() + + +def get_db_for_session(session_id: str) -> Session: + """Direct session getter (non-generator) for routes""" + get_session_engine(session_id) + SessionLocal = _session_makers[session_id] + return SessionLocal() + + +# Legacy - default database for backwards compatibility +from app.config import settings +engine = create_engine( + settings.database_url, + connect_args={"check_same_thread": False} +) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + +def get_default_session() -> Session: + """Create a new session for the default database.""" + return SessionLocal() + + +def get_db(): + """Legacy: Default database session""" + db = get_default_session() + try: + yield db + finally: + db.close() + + +def _run_migrations(eng): + """Run migrations on an engine""" + with eng.connect() as conn: + try: + conn.execute(text("ALTER TABLE entities ADD COLUMN event_date DATETIME")) + conn.commit() + 
except Exception: + pass + try: + conn.execute(text("ALTER TABLE relationships ADD COLUMN event_date DATETIME")) + conn.commit() + except Exception: + pass + try: + conn.execute(text("ALTER TABLE entities ADD COLUMN project_id VARCHAR(36)")) + conn.commit() + except Exception: + pass + try: + conn.execute(text("ALTER TABLE relationships ADD COLUMN project_id VARCHAR(36)")) + conn.commit() + except Exception: + pass + + +def init_db(): + """Initialize default database tables""" + Base.metadata.create_all(bind=engine) + _run_migrations(engine) + + +def create_new_session_id() -> str: + """Generate a new session ID""" + return str(uuid.uuid4()) diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000000000000000000000000000000000000..7abfa0ca7fb7e31fc2a58e35b5528eb7b135bada --- /dev/null +++ b/app/main.py @@ -0,0 +1,99 @@ +""" +Numidium Backend - Main Application +Plataforma de Inteligência e Análise de Dados +""" +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from contextlib import asynccontextmanager + +from app.config import settings +from app.core.database import init_db +from app.api.routes import entities, relationships, events, search, ingest, analyze, graph, research, chat, investigate, dados_publicos, timeline, session, aethermap + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Startup and shutdown events""" + # Startup: Initialize database + init_db() + print("🚀 Numidium Backend started!") + print(f"📊 Database: {settings.database_url}") + yield + # Shutdown + print("👋 Numidium Backend shutting down...") + + +# Create FastAPI app +app = FastAPI( + title="Numidium API", + description=""" + ## 🔮 Sistema de Inteligência e Análise de Dados + + Backend do VANTAGE - Uma plataforma para: + - 📥 Ingestão de dados de múltiplas fontes (Wikipedia, News, Manual) + - 🔗 Mapeamento de conexões entre entidades + - 🗺️ Visualização geográfica + - 📊 Análise de grafos e relacionamentos + - 🔍 Busca global + """, 
+ version=settings.app_version, + lifespan=lifespan +) + +# CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=settings.cors_origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Include routers +app.include_router(entities.router, prefix="/api/v1") +app.include_router(relationships.router, prefix="/api/v1") +app.include_router(events.router, prefix="/api/v1") +app.include_router(search.router, prefix="/api/v1") +app.include_router(ingest.router, prefix="/api/v1") +app.include_router(analyze.router, prefix="/api/v1") +app.include_router(graph.router, prefix="/api/v1") +app.include_router(research.router, prefix="/api/v1") +app.include_router(chat.router, prefix="/api/v1") +app.include_router(investigate.router, prefix="/api/v1") +app.include_router(dados_publicos.router, prefix="/api/v1") +app.include_router(timeline.router, prefix="/api/v1") +app.include_router(session.router, prefix="/api/v1") +app.include_router(aethermap.router, prefix="/api/v1/aethermap", tags=["aethermap"]) + + +@app.get("/") +def root(): + """Root endpoint - API info""" + return { + "name": "Numidium", + "version": settings.app_version, + "status": "online", + "docs": "/docs", + "description": "Sistema de Inteligência e Análise de Dados" + } + + +@app.get("/health") +def health_check(): + """Health check endpoint for HF Spaces""" + return {"status": "healthy"} + + +@app.get("/api/v1") +def api_info(): + """API v1 info""" + return { + "version": "1.0.0", + "endpoints": { + "entities": "/api/v1/entities", + "relationships": "/api/v1/relationships", + "events": "/api/v1/events", + "search": "/api/v1/search", + "ingest": "/api/v1/ingest" + } + } diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dda9ada652332c2b420769a6ace731249e11cfd8 --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1,3 @@ +# Models module +from app.models.entity import Entity, Relationship, 
Event, Document +from app.models.project import Project diff --git a/app/models/__pycache__/__init__.cpython-311.pyc b/app/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bf1933151b2e8a290dc79c4647a626d3d0500ff Binary files /dev/null and b/app/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/models/__pycache__/entity.cpython-311.pyc b/app/models/__pycache__/entity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08def07731fef26be3662b7e40f9afca7961637f Binary files /dev/null and b/app/models/__pycache__/entity.cpython-311.pyc differ diff --git a/app/models/__pycache__/project.cpython-311.pyc b/app/models/__pycache__/project.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e11c4a8127b91fa0a97a4fc6607860af7c04d37 Binary files /dev/null and b/app/models/__pycache__/project.cpython-311.pyc differ diff --git a/app/models/entity.py b/app/models/entity.py new file mode 100644 index 0000000000000000000000000000000000000000..07f9afbd7c789db76ca4d482de4655cd99eb3bda --- /dev/null +++ b/app/models/entity.py @@ -0,0 +1,143 @@ +""" +SQLAlchemy Models for Numidium +""" +from sqlalchemy import Column, String, Text, DateTime, Float, JSON, ForeignKey, Table +from sqlalchemy.orm import relationship +from datetime import datetime +import uuid + +from app.core.database import Base + + +def generate_uuid(): + return str(uuid.uuid4()) + + +class Entity(Base): + """ + Entidade - qualquer coisa rastreável no sistema + Pode ser: pessoa, organização, local, veículo, evento, documento, etc. 
+ """ + __tablename__ = "entities" + + id = Column(String(36), primary_key=True, default=generate_uuid) + project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True) + type = Column(String(50), nullable=False, index=True) # person, organization, location, etc + name = Column(String(255), nullable=False, index=True) + description = Column(Text, nullable=True) + properties = Column(JSON, default=dict) # Dados flexíveis + + # Geolocalização (opcional) + latitude = Column(Float, nullable=True) + longitude = Column(Float, nullable=True) + + # Data histórica do evento/entidade (quando aconteceu, não quando foi adicionado) + event_date = Column(DateTime, nullable=True) + + # Fonte do dado + source = Column(String(100), nullable=True) # wikipedia, newsapi, manual, etc + source_url = Column(Text, nullable=True) + + # Timestamps + created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + + # Relacionamentos + outgoing_relationships = relationship( + "Relationship", + foreign_keys="Relationship.source_id", + back_populates="source_entity" + ) + incoming_relationships = relationship( + "Relationship", + foreign_keys="Relationship.target_id", + back_populates="target_entity" + ) + + +class Relationship(Base): + """ + Relacionamento entre duas entidades + Exemplos: works_for, knows, owns, located_at, participated_in + """ + __tablename__ = "relationships" + + id = Column(String(36), primary_key=True, default=generate_uuid) + project_id = Column(String(36), ForeignKey("projects.id"), nullable=True, index=True) + source_id = Column(String(36), ForeignKey("entities.id"), nullable=False) + target_id = Column(String(36), ForeignKey("entities.id"), nullable=False) + type = Column(String(50), nullable=False, index=True) # works_for, knows, owns, etc + properties = Column(JSON, default=dict) + confidence = Column(Float, default=1.0) # 0-1, quão certo estamos dessa conexão + + # 
Data histórica do relacionamento (quando aconteceu) + event_date = Column(DateTime, nullable=True) + + # Fonte + source = Column(String(100), nullable=True) + + # Timestamps + created_at = Column(DateTime, default=datetime.utcnow) + + # Relacionamentos + source_entity = relationship("Entity", foreign_keys=[source_id], back_populates="outgoing_relationships") + target_entity = relationship("Entity", foreign_keys=[target_id], back_populates="incoming_relationships") + + +class Event(Base): + """ + Evento - algo que aconteceu envolvendo entidades + """ + __tablename__ = "events" + + id = Column(String(36), primary_key=True, default=generate_uuid) + type = Column(String(50), nullable=False, index=True) + title = Column(String(255), nullable=False) + description = Column(Text, nullable=True) + + # Quando aconteceu + event_date = Column(DateTime, nullable=True) + + # Onde aconteceu + location_name = Column(String(255), nullable=True) + latitude = Column(Float, nullable=True) + longitude = Column(Float, nullable=True) + + # Entidades envolvidas (armazenado como JSON array de IDs) + entity_ids = Column(JSON, default=list) + + # Fonte + source = Column(String(100), nullable=True) + source_url = Column(Text, nullable=True) + + # Metadados + properties = Column(JSON, default=dict) + + # Timestamps + created_at = Column(DateTime, default=datetime.utcnow) + + +class Document(Base): + """ + Documento - texto/arquivo para análise + """ + __tablename__ = "documents" + + id = Column(String(36), primary_key=True, default=generate_uuid) + title = Column(String(255), nullable=False) + content = Column(Text, nullable=True) + summary = Column(Text, nullable=True) # Resumo gerado por IA + + # Tipo de documento + doc_type = Column(String(50), default="text") # text, news, report, etc + + # Entidades mencionadas (extraídas por NLP) + mentioned_entities = Column(JSON, default=list) + + # Fonte + source = Column(String(100), nullable=True) + source_url = Column(Text, nullable=True) + + # 
Timestamps + published_at = Column(DateTime, nullable=True) + created_at = Column(DateTime, default=datetime.utcnow) diff --git a/app/models/project.py b/app/models/project.py new file mode 100644 index 0000000000000000000000000000000000000000..72f601e1975770622c146cc3b1b9fb6fbd912a3c --- /dev/null +++ b/app/models/project.py @@ -0,0 +1,29 @@ +""" +Project Model - Workspaces for organizing investigations +""" +from sqlalchemy import Column, String, Text, DateTime +from datetime import datetime +import uuid + +from app.core.database import Base + + +def generate_uuid(): + return str(uuid.uuid4()) + + +class Project(Base): + """ + Projeto/Workspace - agrupa entidades e relacionamentos por investigação + """ + __tablename__ = "projects" + + id = Column(String(36), primary_key=True, default=generate_uuid) + name = Column(String(255), nullable=False) + description = Column(Text, nullable=True) + color = Column(String(7), default="#00d4ff") # Hex color for UI + icon = Column(String(50), default="folder") # Icon name + + # Timestamps + created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5afaa5cdacc5762ea19abe607c7dab6309e351a8 --- /dev/null +++ b/app/schemas/__init__.py @@ -0,0 +1,10 @@ +# Schemas module +from app.schemas.schemas import ( + EntityCreate, EntityUpdate, EntityResponse, + RelationshipCreate, RelationshipResponse, + EventCreate, EventResponse, + DocumentCreate, DocumentResponse, + GraphData, GraphNode, GraphEdge, + SearchQuery, SearchResult, + SystemStats +) diff --git a/app/schemas/__pycache__/__init__.cpython-311.pyc b/app/schemas/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a0991c3a362725e79629796654b9dc0ed9c9668 Binary files /dev/null and 
# ========== Entity Schemas ==========

class EntityBase(BaseModel):
    """Fields shared by entity create/read schemas."""
    type: str = Field(..., description="Tipo da entidade: person, organization, location, etc")
    name: str = Field(..., description="Nome da entidade")
    description: Optional[str] = None
    # Free-form extra attributes (JSON object)
    properties: dict = Field(default_factory=dict)
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # Provenance: where this entity came from
    source: Optional[str] = None
    source_url: Optional[str] = None


class EntityCreate(EntityBase):
    """Payload for creating an entity (same fields as EntityBase)."""
    pass


class EntityUpdate(BaseModel):
    """Partial update: every field optional; only provided fields change."""
    type: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    properties: Optional[dict] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None


class EntityResponse(EntityBase):
    """Entity as returned by the API (adds server-generated fields)."""
    id: str
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow building from ORM objects (pydantic v2 attribute access)
        from_attributes = True


# ========== Relationship Schemas ==========

class RelationshipBase(BaseModel):
    """Directed edge between two entities (by entity id)."""
    source_id: str
    target_id: str
    type: str = Field(..., description="Tipo: works_for, knows, owns, located_at, etc")
    properties: dict = Field(default_factory=dict)
    # Confidence score constrained to [0, 1]
    confidence: float = Field(default=1.0, ge=0, le=1)
    source: Optional[str] = None


class RelationshipCreate(RelationshipBase):
    """Payload for creating a relationship."""
    pass


class RelationshipResponse(RelationshipBase):
    """Relationship as returned by the API."""
    id: str
    created_at: datetime

    class Config:
        from_attributes = True


# ========== Event Schemas ==========

class EventBase(BaseModel):
    """Fields shared by event create/read schemas."""
    type: str
    title: str
    description: Optional[str] = None
    # When it happened (may be unknown)
    event_date: Optional[datetime] = None
    # Where it happened
    location_name: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    # IDs of the entities involved in this event
    entity_ids: List[str] = Field(default_factory=list)
    source: Optional[str] = None
    source_url: Optional[str] = None
    properties: dict = Field(default_factory=dict)


class EventCreate(EventBase):
    """Payload for creating an event."""
    pass


class EventResponse(EventBase):
    """Event as returned by the API."""
    id: str
    created_at: datetime

    class Config:
        from_attributes = True


# ========== Document Schemas ==========

class DocumentBase(BaseModel):
    """Fields shared by document create/read schemas."""
    title: str
    content: Optional[str] = None
    doc_type: str = "text"
    source: Optional[str] = None
    source_url: Optional[str] = None
    published_at: Optional[datetime] = None


class DocumentCreate(DocumentBase):
    """Payload for creating a document."""
    pass


class DocumentResponse(DocumentBase):
    """Document as returned by the API (adds derived fields)."""
    id: str
    # AI-generated summary, when available
    summary: Optional[str] = None
    # Entities extracted from the text by NLP
    mentioned_entities: List[str] = []
    created_at: datetime

    class Config:
        from_attributes = True


# ========== Graph Schemas ==========

class GraphNode(BaseModel):
    """Node of the knowledge graph payload served to the frontend."""
    id: str
    type: str
    name: str
    properties: dict = {}


class GraphEdge(BaseModel):
    """Edge of the knowledge graph (references node ids)."""
    source: str
    target: str
    type: str
    confidence: float = 1.0


class GraphData(BaseModel):
    """Full graph payload: node list plus edge list."""
    nodes: List[GraphNode]
    edges: List[GraphEdge]


# ========== Search Schemas ==========

class SearchQuery(BaseModel):
    """Global search request."""
    query: str
    # Restrict results to these entity types when provided
    entity_types: Optional[List[str]] = None
    limit: int = Field(default=20, le=100)


class SearchResult(BaseModel):
    """Global search response grouped by record kind."""
    entities: List[EntityResponse]
    events: List[EventResponse]
    documents: List[DocumentResponse]


# ========== Stats Schemas ==========

class SystemStats(BaseModel):
    """Dashboard counters plus a recent-activity feed."""
    total_entities: int
    total_relationships: int
    total_events: int
    total_documents: int
    entities_by_type: dict
    recent_activity: List[dict]
a/app/services/__pycache__/lancer.cpython-311.pyc b/app/services/__pycache__/lancer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0415c81ac5d96a648b50a300ce010fe32a9a53bb Binary files /dev/null and b/app/services/__pycache__/lancer.cpython-311.pyc differ diff --git a/app/services/__pycache__/transparencia_api.cpython-311.pyc b/app/services/__pycache__/transparencia_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8758135b5b049f71eceec09add4595fb7b11dd12 Binary files /dev/null and b/app/services/__pycache__/transparencia_api.cpython-311.pyc differ diff --git a/app/services/aethermap_client.py b/app/services/aethermap_client.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9a6490f843a94391b376e528db87554c8e31cf --- /dev/null +++ b/app/services/aethermap_client.py @@ -0,0 +1,343 @@ +""" +AetherMap Client +Client para integração com AetherMap API - busca semântica, NER e análise de grafos. 
@dataclass
class ProcessResult:
    """Result of document processing (embeddings + clustering)."""
    job_id: str
    num_documents: int
    num_clusters: int
    num_noise: int  # number of "noise" points not assigned to any cluster
    metrics: Dict[str, Any] = field(default_factory=dict)
    cluster_analysis: Dict[str, Any] = field(default_factory=dict)


@dataclass
class SearchResult:
    """Result of a semantic search."""
    summary: str  # RAG answer generated by the LLM
    results: List[Dict[str, Any]] = field(default_factory=list)


@dataclass
class EntityNode:
    """Node in the entity graph."""
    entity: str
    entity_type: str
    docs: int  # number of documents the entity appears in
    degree: int = 0
    centrality: float = 0.0
    role: str = "peripheral"  # one of: hub, connector, peripheral


@dataclass
class EntityEdge:
    """Edge of the entity graph."""
    source_entity: str
    target_entity: str
    weight: int
    reason: str


@dataclass
class EntityGraphResult:
    """Result of entity extraction (NER + connection graph)."""
    nodes: List[EntityNode] = field(default_factory=list)
    edges: List[EntityEdge] = field(default_factory=list)
    hubs: List[Dict[str, Any]] = field(default_factory=list)
    insights: Dict[str, Any] = field(default_factory=dict)


@dataclass
class GraphAnalysis:
    """LLM-generated analysis of the graph."""
    analysis: str
    key_entities: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    async def process_documents(
        self,
        texts: List[str],
        fast_mode: bool = True,
        min_cluster_size: int = 0,
        min_samples: int = 0
    ) -> ProcessResult:
        """
        Process a list of texts, generating embeddings and clusters.

        Uploads the texts as a single in-memory TXT file to the AetherMap
        ``/process/`` endpoint and records the returned job_id as the
        client's current job.

        Args:
            texts: List of texts/documents (one per line in the upload).
            fast_mode: If True uses PCA (fast); if False uses UMAP (precise).
            min_cluster_size: Minimum cluster size (0 = auto).
            min_samples: Minimum number of samples (0 = auto).

        Returns:
            ProcessResult with the job_id and processing metrics.

        Raises:
            Exception: on non-200 responses, timeouts (the HF Space may be
                asleep) or connection errors; unexpected errors re-raise.
        """
        # Build the TXT upload in memory
        content = "\n".join(texts)
        file_bytes = content.encode('utf-8')

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                files = {
                    'file': ('documents.txt', io.BytesIO(file_bytes), 'text/plain')
                }
                data = {
                    'n_samples': str(len(texts)),
                    'fast_mode': 'true' if fast_mode else 'false',
                    'min_cluster_size': str(min_cluster_size),
                    'min_samples': str(min_samples)
                }

                logger.info(f"AetherMap: Processando {len(texts)} documentos para {self.base_url}/process/")

                response = await client.post(
                    f"{self.base_url}/process/",
                    files=files,
                    data=data
                )

                logger.info(f"AetherMap: Response status {response.status_code}")

                if response.status_code != 200:
                    error_text = response.text[:500] if response.text else "No response body"
                    logger.error(f"AetherMap error: {response.status_code} - {error_text}")
                    raise Exception(f"AetherMap error: {response.status_code} - {error_text}")

                result = response.json()

                # Remember the job id so follow-up calls can default to it
                self._current_job_id = result.get('job_id')
                metadata = result.get('metadata', {})

                logger.info(f"AetherMap: Job criado {self._current_job_id}")

                return ProcessResult(
                    job_id=self._current_job_id or "unknown",
                    num_documents=metadata.get('num_documents_processed', len(texts)),
                    num_clusters=metadata.get('num_clusters_found', 0),
                    num_noise=metadata.get('num_noise_points', 0),
                    metrics=result.get('metrics', {}),
                    cluster_analysis=result.get('cluster_analysis', {})
                )
        except httpx.TimeoutException:
            logger.error(f"AetherMap: Timeout ao conectar com {self.base_url}")
            raise Exception(f"Timeout: AetherMap Space pode estar dormindo. Tente novamente em alguns segundos.")
        except httpx.ConnectError as e:
            logger.error(f"AetherMap: Erro de conexão: {e}")
            raise Exception(f"Erro de conexão com AetherMap: {e}")
        except Exception as e:
            logger.error(f"AetherMap: Erro inesperado: {e}")
            raise
    async def extract_entities(self, job_id: str = None) -> EntityGraphResult:
        """
        Extract named entities (NER) and build the connection graph.

        Args:
            job_id: Job ID (defaults to the last processed job).

        Returns:
            EntityGraphResult with typed nodes, edges, hubs and insights.

        Raises:
            ValueError: if no job_id is available yet.
            Exception: on non-200 responses from the AetherMap API.
        """
        job_id = job_id or self._current_job_id
        if not job_id:
            raise ValueError("Nenhum job_id disponível. Processe documentos primeiro.")

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            data = {'job_id': job_id}

            logger.info(f"AetherMap: Extraindo entidades...")

            response = await client.post(
                f"{self.base_url}/entity_graph/",
                data=data
            )

            if response.status_code != 200:
                raise Exception(f"AetherMap entity_graph error: {response.status_code} - {response.text}")

            result = response.json()

            # Convert raw JSON dicts into dataclasses
            nodes = [
                EntityNode(
                    entity=n.get('entity', ''),
                    entity_type=n.get('type', ''),
                    docs=n.get('docs', 0),
                    degree=n.get('degree', 0),
                    centrality=n.get('centrality', 0.0),
                    role=n.get('role', 'peripheral')
                )
                for n in result.get('nodes', [])
            ]

            edges = [
                EntityEdge(
                    source_entity=e.get('source_entity', ''),
                    target_entity=e.get('target_entity', ''),
                    weight=e.get('weight', 0),
                    reason=e.get('reason', '')
                )
                for e in result.get('edges', [])
            ]

            return EntityGraphResult(
                nodes=nodes,
                edges=edges,
                hubs=result.get('hubs', []),
                insights=result.get('insights', {})
            )
Processe documentos primeiro.") + + async with httpx.AsyncClient(timeout=self.timeout) as client: + data = {'job_id': job_id} + + logger.info(f"AetherMap: Descrevendo clusters...") + + response = await client.post( + f"{self.base_url}/describe_clusters/", + data=data + ) + + if response.status_code != 200: + raise Exception(f"AetherMap describe_clusters error: {response.status_code} - {response.text}") + + return response.json() + + +# Instância global do client +aethermap = AetherMapClient() diff --git a/app/services/analysis/__init__.py b/app/services/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18e8fe19c13e9ec59fb147e63518a8ddbeef5f25 --- /dev/null +++ b/app/services/analysis/__init__.py @@ -0,0 +1 @@ +# Analysis services diff --git a/app/services/brazil_apis.py b/app/services/brazil_apis.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf938529a35708355664dd05f60288519c0d7df --- /dev/null +++ b/app/services/brazil_apis.py @@ -0,0 +1,218 @@ +""" +Brazilian Data APIs Service +Consolidates access to public Brazilian data APIs for investigation +""" +import httpx +from typing import Optional, Dict, Any, List +from dataclasses import dataclass, field +import re + + +# API URLs +CNPJA_URL = "https://api.cnpja.com.br/office" +OPENCNPJ_URL = "https://api.opencnpj.org/v1/cnpj" +BRASILAPI_CNPJ = "https://brasilapi.com.br/api/cnpj/v1" +BRASILAPI_CEP = "https://brasilapi.com.br/api/cep/v2" + + +@dataclass +class CompanyData: + """Data structure for company information""" + cnpj: str + razao_social: str = "" + nome_fantasia: str = "" + situacao: str = "" + data_abertura: str = "" + natureza_juridica: str = "" + capital_social: float = 0.0 + porte: str = "" + + # Address + logradouro: str = "" + numero: str = "" + complemento: str = "" + bairro: str = "" + cidade: str = "" + uf: str = "" + cep: str = "" + + # Contact + telefone: str = "" + email: str = "" + + # Activity + cnae_principal: str = "" + 
def clean_cnpj(cnpj: str) -> str:
    """Strip formatting from a CNPJ, keeping only its ASCII digits."""
    # Equivalent to re.sub(r'[^0-9]', '', cnpj): drop every non-[0-9] char.
    return "".join(ch for ch in cnpj if "0" <= ch <= "9")
async def _query_opencnpj(cnpj: str) -> Optional[CompanyData]:
    """Query the OpenCNPJ API for company data (fallback source).

    Args:
        cnpj: CNPJ with digits only (14 characters).

    Returns:
        CompanyData on success, or None on any HTTP/parsing failure —
        errors are logged and swallowed so callers can try other sources.
    """
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(f"{OPENCNPJ_URL}/{cnpj}")

            if response.status_code != 200:
                return None

            data = response.json()

            # Parse partners (OpenCNPJ does not expose the partner CPF/CNPJ)
            socios = []
            for socio in data.get("socios", []):
                socios.append({
                    "nome": socio.get("nome", ""),
                    "qualificacao": socio.get("qualificacao", ""),
                    "cpf_cnpj": "",
                    "data_entrada": socio.get("data_entrada", "")
                })

            # Fix: "cnae_principal" may be present but null in the payload;
            # chaining .get() on None raised AttributeError and dropped the
            # whole record. The `or {}` guard mirrors the `or 0` guard used
            # for capital_social below.
            cnae_principal = data.get("cnae_principal") or {}

            return CompanyData(
                cnpj=cnpj,
                razao_social=data.get("razao_social", ""),
                nome_fantasia=data.get("nome_fantasia", ""),
                situacao=data.get("situacao_cadastral", ""),
                data_abertura=data.get("data_inicio_atividade", ""),
                natureza_juridica=data.get("natureza_juridica", ""),
                capital_social=float(data.get("capital_social", 0) or 0),
                porte=data.get("porte", ""),
                logradouro=data.get("logradouro", ""),
                numero=data.get("numero", ""),
                complemento=data.get("complemento", ""),
                bairro=data.get("bairro", ""),
                cidade=data.get("municipio", ""),
                uf=data.get("uf", ""),
                cep=data.get("cep", ""),
                telefone=data.get("telefone", ""),
                email=data.get("email", ""),
                cnae_principal=cnae_principal.get("codigo", ""),
                cnae_descricao=cnae_principal.get("descricao", ""),
                cnaes_secundarios=[],
                socios=socios,
                fonte="OpenCNPJ"
            )

    except Exception as e:
        print(f"OpenCNPJ error: {e}")
        return None
    def _get_local_context(self, query: str, db: Session, limit: int = 5) -> str:
        """
        Build a text context block from entities in the local database.

        Lookup strategy (in order): entity name ILIKE match, then
        description ILIKE match to fill up to `limit`, then per-word name
        matches as a last resort. Each matched entity contributes one
        bullet line with up to 3 of its relationships.

        Returns:
            Newline-joined bullet lines, or "" when nothing matched.
        """
        # 1) Search entities by name
        entities = db.query(Entity).filter(
            Entity.name.ilike(f"%{query}%")
        ).limit(limit).all()

        # 2) Top up with description matches
        if len(entities) < limit:
            desc_entities = db.query(Entity).filter(
                Entity.description.ilike(f"%{query}%")
            ).limit(limit - len(entities)).all()
            entities.extend(desc_entities)

        # 3) Fallback: match individual words longer than 3 chars
        if not entities:
            words = query.split()
            for word in words:
                if len(word) > 3:
                    word_entities = db.query(Entity).filter(
                        Entity.name.ilike(f"%{word}%")
                    ).limit(2).all()
                    entities.extend(word_entities)

        if not entities:
            return ""

        context_parts = []
        # Dedupe: steps 2-3 can return entities already found earlier
        seen_ids = set()

        for entity in entities:
            if entity.id in seen_ids:
                continue
            seen_ids.add(entity.id)

            ctx = f"• {entity.name} ({entity.type})"
            if entity.description:
                ctx += f": {entity.description[:200]}"

            # Attach up to 5 relationships touching this entity
            relationships = db.query(Relationship).filter(
                (Relationship.source_id == entity.id) |
                (Relationship.target_id == entity.id)
            ).limit(5).all()

            if relationships:
                related = []
                for rel in relationships:
                    if rel.source_id == entity.id:
                        # Outgoing edge: entity --type--> target
                        target = db.query(Entity).filter(Entity.id == rel.target_id).first()
                        if target:
                            related.append(f"{rel.type} → {target.name}")
                    else:
                        # Incoming edge: source --type--> entity
                        source = db.query(Entity).filter(Entity.id == rel.source_id).first()
                        if source:
                            related.append(f"{source.name} → {rel.type}")

                if related:
                    ctx += f" | Relações: {', '.join(related[:3])}"

            context_parts.append(ctx)

        return "\n".join(context_parts)
self._get_web_context(message) + + # Build context + context_parts = [] + if local_context: + context_parts.append(f"📊 Conhecimento local:\n{local_context}") + if web_context: + context_parts.append(f"🌐 {web_context}") + + context = "\n\n".join(context_parts) if context_parts else "Nenhum contexto disponível." + + # Build messages + messages = [{"role": "system", "content": SYSTEM_PROMPT}] + + if use_history and history: + messages.extend(history[-6:]) + + user_message = f"""Contexto: +{context} + +Pergunta: {message}""" + + messages.append({"role": "user", "content": user_message}) + + # Call LLM + response = await self._call_llm(messages) + + # Store history + if use_history: + history.append({"role": "user", "content": message}) + history.append({"role": "assistant", "content": response}) + + return { + "answer": response, + "local_context_used": bool(local_context), + "web_context_used": bool(web_context), + "entities_found": local_context.count("•") if local_context else 0 + } + + +# Singleton +chat_service = ChatService() diff --git a/app/services/geocoding.py b/app/services/geocoding.py new file mode 100644 index 0000000000000000000000000000000000000000..06863f2be60350c5cd8251ca8cfa063809135cea --- /dev/null +++ b/app/services/geocoding.py @@ -0,0 +1,63 @@ +""" +Geocoding Service - Uses Nominatim (OpenStreetMap) for free geocoding +""" +import httpx +from typing import Optional, Tuple +import asyncio + + +NOMINATIM_URL = "https://nominatim.openstreetmap.org/search" +USER_AGENT = "NUMIDIUM/1.0 (Intelligence System)" + + +async def geocode(location_name: str) -> Optional[Tuple[float, float]]: + """ + Convert a location name to coordinates using Nominatim. + Returns (latitude, longitude) or None if not found. + + Note: Nominatim has rate limits (1 request/second), so be careful with batch operations. 
async def geocode_batch(location_names: list[str], delay: float = 1.0) -> dict[str, Tuple[float, float]]:
    """
    Resolve several location names to coordinates, one request at a time.

    A pause of ``delay`` seconds follows every lookup so the Nominatim
    usage policy (max 1 request/second) is respected. Names that cannot
    be resolved are omitted from the result.

    Returns:
        Dict mapping each resolved name to its (latitude, longitude) tuple.
    """
    resolved: dict[str, Tuple[float, float]] = {}

    for place in location_names:
        position = await geocode(place)
        if position is not None:
            resolved[place] = position
        # Throttle between requests to honor Nominatim's rate limit
        await asyncio.sleep(delay)

    return resolved
async def listar_municipios(uf: str) -> List[Municipio]:
    """Return every municipality of the given state (UF), sorted by name.

    Returns an empty list on any HTTP or network failure (best-effort).
    """
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(
                f"{IBGE_BASE_URL}/localidades/estados/{uf}/municipios"
            )

            if response.status_code != 200:
                return []

            municipios: List[Municipio] = []
            for registro in response.json():
                # State info is nested: microrregiao -> mesorregiao -> UF
                uf_info = registro.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {})
                municipios.append(Municipio(
                    id=registro["id"],
                    nome=registro["nome"],
                    estado_sigla=uf.upper(),
                    estado_nome=uf_info.get("nome", ""),
                    regiao=uf_info.get("regiao", {}).get("nome", "")
                ))

            municipios.sort(key=lambda m: m.nome)
            return municipios

    except Exception as e:
        print(f"IBGE municipios error: {e}")
        return []
nome=item["nome"], + estado_sigla=uf_info.get("sigla", ""), + estado_nome=uf_info.get("nome", ""), + regiao=uf_info.get("regiao", {}).get("nome", "") + )) + + return results[:20] # Limit results + + except Exception as e: + print(f"IBGE search error: {e}") + return [] + + +async def obter_municipio_por_id(id_municipio: int) -> Optional[Municipio]: + """Get municipality by IBGE code""" + try: + async with httpx.AsyncClient(timeout=15.0) as client: + response = await client.get( + f"{IBGE_BASE_URL}/localidades/municipios/{id_municipio}" + ) + + if response.status_code != 200: + return None + + item = response.json() + uf_info = item.get("microrregiao", {}).get("mesorregiao", {}).get("UF", {}) + + return Municipio( + id=item["id"], + nome=item["nome"], + estado_sigla=uf_info.get("sigla", ""), + estado_nome=uf_info.get("nome", ""), + regiao=uf_info.get("regiao", {}).get("nome", "") + ) + + except Exception as e: + print(f"IBGE municipio error: {e}") + return None + + +async def enriquecer_localizacao(cidade: str, uf: Optional[str] = None) -> Dict[str, Any]: + """ + Enrich a location name with IBGE data. + Useful for adding context to extracted locations. 
+ """ + resultado = { + "cidade_original": cidade, + "encontrado": False, + "ibge_codigo": None, + "cidade": None, + "estado": None, + "estado_sigla": None, + "regiao": None + } + + municipios = await buscar_municipio(cidade, uf) + + if municipios: + # Take best match (exact or first) + melhor = None + for m in municipios: + if m.nome.lower() == cidade.lower(): + melhor = m + break + + if not melhor: + melhor = municipios[0] + + resultado.update({ + "encontrado": True, + "ibge_codigo": melhor.id, + "cidade": melhor.nome, + "estado": melhor.estado_nome, + "estado_sigla": melhor.estado_sigla, + "regiao": melhor.regiao + }) + + return resultado diff --git a/app/services/ingestion/__init__.py b/app/services/ingestion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53751fc389795a6893e21379a16b0680f55cda41 --- /dev/null +++ b/app/services/ingestion/__init__.py @@ -0,0 +1,3 @@ +# Ingestion services +from app.services.ingestion.wikipedia import wikipedia_scraper +from app.services.ingestion.news import news_service diff --git a/app/services/ingestion/__pycache__/__init__.cpython-311.pyc b/app/services/ingestion/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de09d686a52c85f16de0eac33cbd28ca9065604d Binary files /dev/null and b/app/services/ingestion/__pycache__/__init__.cpython-311.pyc differ diff --git a/app/services/ingestion/__pycache__/news.cpython-311.pyc b/app/services/ingestion/__pycache__/news.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a4ad23456ff8907ab2a47285b1b74cd099a8fe Binary files /dev/null and b/app/services/ingestion/__pycache__/news.cpython-311.pyc differ diff --git a/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc b/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..215244f9f9e1bdf8dc6071c4e0237f41318f352a Binary files /dev/null and 
b/app/services/ingestion/__pycache__/wikipedia.cpython-311.pyc differ diff --git a/app/services/ingestion/news.py b/app/services/ingestion/news.py new file mode 100644 index 0000000000000000000000000000000000000000..1aba8df40e8cfb6d2cc19900fea89cf6ce04cf14 --- /dev/null +++ b/app/services/ingestion/news.py @@ -0,0 +1,86 @@ +""" +News API Client Service +Usa RSS feeds públicos para não precisar de API key +""" +import feedparser +import requests +from typing import List, Dict +from datetime import datetime +import re + + +class NewsService: + """Serviço para buscar notícias de fontes públicas via RSS""" + + # RSS feeds públicos brasileiros e internacionais + RSS_FEEDS = { + "g1": "https://g1.globo.com/rss/g1/", + "folha": "https://feeds.folha.uol.com.br/folha/rss/rss091.xml", + "bbc_brasil": "https://www.bbc.com/portuguese/articles/rss.xml", + "reuters": "https://www.reutersagency.com/feed/", + "google_news_br": "https://news.google.com/rss?hl=pt-BR&gl=BR&ceid=BR:pt-419" + } + + def fetch_feed(self, feed_url: str) -> List[Dict]: + """Busca artigos de um feed RSS""" + try: + feed = feedparser.parse(feed_url) + articles = [] + + for entry in feed.entries[:20]: # Limitar a 20 artigos + published = None + if hasattr(entry, 'published_parsed') and entry.published_parsed: + published = datetime(*entry.published_parsed[:6]) + + articles.append({ + "title": entry.get("title", ""), + "description": self._clean_html(entry.get("summary", "")), + "url": entry.get("link", ""), + "published_at": published, + "source": feed.feed.get("title", "Unknown") + }) + + return articles + except Exception as e: + print(f"Error fetching feed {feed_url}: {e}") + return [] + + def fetch_all_feeds(self) -> List[Dict]: + """Busca artigos de todos os feeds configurados""" + all_articles = [] + for name, url in self.RSS_FEEDS.items(): + articles = self.fetch_feed(url) + for article in articles: + article["feed_name"] = name + all_articles.extend(articles) + return all_articles + + def 
search_news(self, query: str) -> List[Dict]: + """ + Busca notícias pelo Google News RSS + """ + # Google News RSS search + search_url = f"https://news.google.com/rss/search?q={query}&hl=pt-BR&gl=BR&ceid=BR:pt-419" + return self.fetch_feed(search_url) + + def _clean_html(self, text: str) -> str: + """Remove HTML tags do texto""" + clean = re.compile('<.*?>') + return re.sub(clean, '', text) + + def to_document(self, article: Dict) -> Dict: + """ + Converte um artigo de notícia para o formato Document + """ + return { + "title": article["title"], + "content": article.get("description", ""), + "doc_type": "news", + "source": article.get("source", "news"), + "source_url": article.get("url"), + "published_at": article.get("published_at") + } + + +# Singleton instance +news_service = NewsService() diff --git a/app/services/ingestion/wikipedia.py b/app/services/ingestion/wikipedia.py new file mode 100644 index 0000000000000000000000000000000000000000..2c64a6f77d4bcd406506966ad4b1c3a75972a8e3 --- /dev/null +++ b/app/services/ingestion/wikipedia.py @@ -0,0 +1,215 @@ +""" +Wikipedia Scraper Service +""" +import requests +from bs4 import BeautifulSoup +from typing import Optional, Dict, List +import re + + +class WikipediaScraper: + """Scraper para extrair dados da Wikipedia""" + + BASE_URL = "https://pt.wikipedia.org" + API_URL = "https://pt.wikipedia.org/w/api.php" + + # User-Agent obrigatório para API da Wikipedia + HEADERS = { + "User-Agent": "NumidiumBot/1.0 (https://github.com/numidium; contact@numidium.app) Python/3.11" + } + + def search(self, query: str, limit: int = 10) -> List[Dict]: + """ + Busca artigos na Wikipedia + """ + try: + params = { + "action": "query", + "list": "search", + "srsearch": query, + "srlimit": limit, + "format": "json" + } + + response = requests.get( + self.API_URL, + params=params, + headers=self.HEADERS, + timeout=10 + ) + response.raise_for_status() + data = response.json() + + results = [] + for item in data.get("query", 
{}).get("search", []): + results.append({ + "title": item["title"], + "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(), + "pageid": item["pageid"] + }) + + return results + except Exception as e: + print(f"Wikipedia search error: {e}") + return [] + + def get_article(self, title: str) -> Optional[Dict]: + """ + Busca informações completas de um artigo + """ + try: + params = { + "action": "query", + "titles": title, + "prop": "extracts|pageimages|coordinates|categories", + "exintro": True, + "explaintext": True, + "pithumbsize": 300, + "format": "json" + } + + response = requests.get( + self.API_URL, + params=params, + headers=self.HEADERS, + timeout=10 + ) + response.raise_for_status() + data = response.json() + + pages = data.get("query", {}).get("pages", {}) + for page_id, page in pages.items(): + if page_id == "-1": + return None + + result = { + "title": page.get("title"), + "extract": page.get("extract"), + "pageid": page.get("pageid"), + "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}", + "thumbnail": page.get("thumbnail", {}).get("source"), + "categories": [c["title"].replace("Categoria:", "") + for c in page.get("categories", [])] + } + + # Coordenadas se disponíveis + if "coordinates" in page: + coords = page["coordinates"][0] + result["latitude"] = coords.get("lat") + result["longitude"] = coords.get("lon") + + return result + + return None + except Exception as e: + print(f"Wikipedia article error: {e}") + return None + + def get_infobox(self, title: str) -> Dict: + """ + Tenta extrair dados estruturados do infobox de um artigo + """ + try: + url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}" + response = requests.get(url, headers=self.HEADERS, timeout=10) + soup = BeautifulSoup(response.text, "html.parser") + + infobox = soup.find("table", class_="infobox") + if not infobox: + return {} + + data = {} + for row in infobox.find_all("tr"): + header = row.find("th") + cell = row.find("td") + if header and 
cell: + key = header.get_text(strip=True) + value = cell.get_text(strip=True) + # Clean up the value + value = re.sub(r'\[\d+\]', '', value) # Remove references + data[key] = value + + return data + except Exception as e: + print(f"Infobox error: {e}") + return {} + + def scrape_person(self, name: str) -> Optional[Dict]: + """ + Scrape dados de uma pessoa da Wikipedia + Retorna dados formatados para criar uma Entity + """ + article = self.get_article(name) + if not article: + return None + + infobox = self.get_infobox(name) + + return { + "type": "person", + "name": article["title"], + "description": article.get("extract"), + "source": "wikipedia", + "source_url": article["url"], + "properties": { + "thumbnail": article.get("thumbnail"), + "categories": article.get("categories", []), + **infobox + }, + "latitude": article.get("latitude"), + "longitude": article.get("longitude") + } + + def scrape_organization(self, name: str) -> Optional[Dict]: + """ + Scrape dados de uma organização da Wikipedia + """ + article = self.get_article(name) + if not article: + return None + + infobox = self.get_infobox(name) + + return { + "type": "organization", + "name": article["title"], + "description": article.get("extract"), + "source": "wikipedia", + "source_url": article["url"], + "properties": { + "thumbnail": article.get("thumbnail"), + "categories": article.get("categories", []), + **infobox + }, + "latitude": article.get("latitude"), + "longitude": article.get("longitude") + } + + def scrape_location(self, name: str) -> Optional[Dict]: + """ + Scrape dados de um local da Wikipedia + """ + article = self.get_article(name) + if not article: + return None + + infobox = self.get_infobox(name) + + return { + "type": "location", + "name": article["title"], + "description": article.get("extract"), + "source": "wikipedia", + "source_url": article["url"], + "properties": { + "thumbnail": article.get("thumbnail"), + "categories": article.get("categories", []), + **infobox + }, + 
"latitude": article.get("latitude"), + "longitude": article.get("longitude") + } + + +# Singleton instance +wikipedia_scraper = WikipediaScraper() diff --git a/app/services/investigation.py b/app/services/investigation.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbfc764bbd26579e5e92285959e952e3d8afa7e --- /dev/null +++ b/app/services/investigation.py @@ -0,0 +1,324 @@ +""" +Investigation Service - Builds comprehensive dossiers +Combines CNPJ data, transparency/sanctions, Lancer web search, and NER +""" +import httpx +from typing import Optional, Dict, Any, List +from dataclasses import dataclass, field, asdict +import asyncio + +from app.services.brazil_apis import consultar_cnpj, CompanyData +from app.services.transparencia_api import verificar_sancoes +# from app.services.tse_api import buscar_politico # TSE API needs fixing +from app.services import lancer +from app.services.nlp import entity_extractor +from app.core.database import get_db +from app.models.entity import Entity, Relationship + + +LANCER_URL = "https://madras1-lancer.hf.space/api/v1" + + +@dataclass +class DossierSection: + """A section of the dossier""" + titulo: str + conteudo: Any + status: str = "ok" # ok, warning, danger, info + icone: str = "📋" + + +@dataclass +class Dossier: + """Complete investigation dossier""" + tipo: str # "organization" or "person" + alvo: str # Target name + cnpj_cpf: Optional[str] = None + + # Sections + dados_cadastrais: Optional[DossierSection] = None + socios: Optional[DossierSection] = None + sancoes: Optional[DossierSection] = None + dados_politicos: Optional[DossierSection] = None # TSE data + noticias: Optional[DossierSection] = None + entidades_relacionadas: Optional[DossierSection] = None + + # Metadata + red_flags: List[str] = field(default_factory=list) + score_risco: int = 0 # 0-100 + data_geracao: str = "" + fonte_dados: List[str] = field(default_factory=list) + + +async def investigar_empresa(nome_ou_cnpj: str) -> Dossier: + 
""" + Investigate a company and build a comprehensive dossier. + """ + import re + from datetime import datetime + + dossier = Dossier( + tipo="organization", + alvo=nome_ou_cnpj, + data_geracao=datetime.now().isoformat() + ) + + # Check if input is CNPJ + cnpj_clean = re.sub(r'[^0-9]', '', nome_ou_cnpj) + is_cnpj = len(cnpj_clean) == 14 + + company_data = None + + # 1. Get company data from CNPJ + if is_cnpj: + dossier.cnpj_cpf = cnpj_clean + company_data = await consultar_cnpj(cnpj_clean) + + if company_data: + dossier.alvo = company_data.razao_social or company_data.nome_fantasia or nome_ou_cnpj + dossier.fonte_dados.append(company_data.fonte) + + # Build cadastral section + dossier.dados_cadastrais = DossierSection( + titulo="Dados Cadastrais", + icone="🏢", + conteudo={ + "cnpj": company_data.cnpj, + "razao_social": company_data.razao_social, + "nome_fantasia": company_data.nome_fantasia, + "situacao": company_data.situacao, + "data_abertura": company_data.data_abertura, + "natureza_juridica": company_data.natureza_juridica, + "capital_social": company_data.capital_social, + "porte": company_data.porte, + "endereco": f"{company_data.logradouro}, {company_data.numero} - {company_data.bairro}, {company_data.cidade}/{company_data.uf}", + "cep": company_data.cep, + "telefone": company_data.telefone, + "email": company_data.email, + "atividade_principal": f"{company_data.cnae_principal} - {company_data.cnae_descricao}" + } + ) + + # Check situação for red flags + if company_data.situacao and "ATIVA" not in company_data.situacao.upper(): + dossier.red_flags.append(f"⚠️ Situação cadastral: {company_data.situacao}") + dossier.dados_cadastrais.status = "warning" + + # Build partners section + if company_data.socios: + dossier.socios = DossierSection( + titulo=f"Sócios ({len(company_data.socios)})", + icone="👥", + conteudo=company_data.socios + ) + + # 2. 
Check sanctions/transparency + if dossier.cnpj_cpf: + sancoes = await verificar_sancoes(dossier.cnpj_cpf) + dossier.fonte_dados.append("Portal da Transparência") + + if sancoes["tem_sancoes"]: + dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções") + dossier.score_risco += 40 + + dossier.sancoes = DossierSection( + titulo=f"Sanções ({sancoes['total_sancoes']})", + icone="⚠️", + status="danger", + conteudo=sancoes + ) + else: + dossier.sancoes = DossierSection( + titulo="Sanções", + icone="✅", + status="ok", + conteudo={"mensagem": "Nenhuma sanção encontrada nos cadastros públicos"} + ) + + # 3. Web search for news and context + search_query = dossier.alvo + if company_data and company_data.nome_fantasia: + search_query = company_data.nome_fantasia + + try: + web_result = await lancer.search(f"{search_query} notícias escândalos processos", max_results=8) + + if web_result.answer or web_result.results: + dossier.fonte_dados.append("Lancer Web Search") + + news_content = { + "resumo": web_result.answer or "Sem resumo disponível", + "fontes": [ + {"titulo": r.title, "url": r.url, "snippet": r.content[:200]} + for r in web_result.results[:5] + ] + } + + dossier.noticias = DossierSection( + titulo="Notícias e Mídia", + icone="📰", + conteudo=news_content + ) + + # Check for negative keywords in news + negative_keywords = ["escândalo", "fraude", "corrupção", "prisão", "investigado", "denúncia", "irregularidade"] + raw_text = (web_result.answer or "").lower() + for kw in negative_keywords: + if kw in raw_text: + dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada nas notícias") + dossier.noticias.status = "warning" + dossier.score_risco += 10 + break + except Exception as e: + print(f"Web search error: {e}") + + # 4. 
Extract related entities using NER + if dossier.noticias and dossier.noticias.conteudo.get("resumo"): + try: + text_to_analyze = dossier.noticias.conteudo.get("resumo", "")[:3000] + ner_result = await entity_extractor.extract(text_to_analyze) + + if ner_result.entities: + entities = [ + {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role} + for e in ner_result.entities[:10] + ] + + dossier.entidades_relacionadas = DossierSection( + titulo=f"Entidades Relacionadas ({len(entities)})", + icone="🔗", + conteudo=entities + ) + except Exception as e: + print(f"NER error: {e}") + + # Calculate final risk score + dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5) + + return dossier + + +async def investigar_pessoa(nome: str, cpf: Optional[str] = None) -> Dossier: + """ + Investigate a person and build a dossier. + Note: CPF data is heavily protected by LGPD, so mainly uses web search. + """ + from datetime import datetime + + dossier = Dossier( + tipo="person", + alvo=nome, + cnpj_cpf=cpf, + data_geracao=datetime.now().isoformat() + ) + + # 1. Check sanctions if CPF provided + if cpf: + sancoes = await verificar_sancoes(cpf) + dossier.fonte_dados.append("Portal da Transparência") + + if sancoes["tem_sancoes"]: + dossier.red_flags.append(f"🚨 Encontrado em {sancoes['total_sancoes']} lista(s) de sanções") + dossier.score_risco += 50 + + dossier.sancoes = DossierSection( + titulo=f"Sanções ({sancoes['total_sancoes']})", + icone="⚠️", + status="danger", + conteudo=sancoes + ) + + # 2. Check TSE for political data (DISABLED - API needs fixing) + # try: + # tse_data = await buscar_politico(nome) + # if tse_data.get("encontrado"): + # dossier.fonte_dados.append("TSE (DivulgaCand)") + # candidaturas = tse_data.get("candidaturas", []) + # patrimonio = tse_data.get("total_patrimonio", 0) + # partidos = tse_data.get("partidos", []) + # dossier.dados_politicos = DossierSection(...) 
+ # except Exception as e: + # print(f"TSE search error: {e}") + + + # 3. Web search for information + try: + web_result = await lancer.search(f'"{nome}" biografia cargo empresa', max_results=10) + + if web_result.answer or web_result.results: + dossier.fonte_dados.append("Lancer Web Search") + + dossier.noticias = DossierSection( + titulo="Informações Públicas", + icone="🌐", + conteudo={ + "resumo": web_result.answer or "Informações limitadas", + "fontes": [ + {"titulo": r.title, "url": r.url, "snippet": r.content[:200]} + for r in web_result.results[:5] + ] + } + ) + + # Check for negative keywords + negative_keywords = ["preso", "condenado", "investigado", "acusado", "escândalo", "fraude"] + raw_text = (web_result.answer or "").lower() + for kw in negative_keywords: + if kw in raw_text: + dossier.red_flags.append(f"📰 Menção a '{kw}' encontrada") + dossier.noticias.status = "warning" + dossier.score_risco += 15 + break + except Exception as e: + print(f"Web search error: {e}") + + # 3. 
Extract related entities + if dossier.noticias and dossier.noticias.conteudo.get("resumo"): + try: + ner_result = await entity_extractor.extract(dossier.noticias.conteudo["resumo"][:2000]) + + if ner_result.entities: + entities = [ + {"nome": e.name, "tipo": e.type, "descricao": e.description or e.role} + for e in ner_result.entities[:10] + if e.name.lower() != nome.lower() # Exclude the target + ] + + if entities: + dossier.entidades_relacionadas = DossierSection( + titulo=f"Conexões ({len(entities)})", + icone="🔗", + conteudo=entities + ) + except Exception as e: + print(f"NER error: {e}") + + dossier.score_risco = min(100, dossier.score_risco + len(dossier.red_flags) * 5) + + return dossier + + +def dossier_to_dict(dossier: Dossier) -> Dict[str, Any]: + """Convert dossier to dictionary for JSON response""" + result = { + "tipo": dossier.tipo, + "alvo": dossier.alvo, + "cnpj_cpf": dossier.cnpj_cpf, + "red_flags": dossier.red_flags, + "score_risco": dossier.score_risco, + "data_geracao": dossier.data_geracao, + "fonte_dados": dossier.fonte_dados, + "secoes": {} + } + + for field_name in ["dados_cadastrais", "socios", "sancoes", "dados_politicos", "noticias", "entidades_relacionadas"]: + section = getattr(dossier, field_name) + if section: + result["secoes"][field_name] = { + "titulo": section.titulo, + "icone": section.icone, + "status": section.status, + "conteudo": section.conteudo + } + + return result diff --git a/app/services/investigator_agent.py b/app/services/investigator_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..56b74ad4c994947ed35f3185df53fc586a4232cc --- /dev/null +++ b/app/services/investigator_agent.py @@ -0,0 +1,659 @@ +""" +Investigator Agent - Autonomous Investigation with Tool Calling +Uses Cerebras native tool calling for multi-source investigations +""" +import json +import re +import httpx +from typing import Optional, List, Dict, Any +from dataclasses import dataclass, field +from datetime import datetime 
+from sqlalchemy.orm import Session + +from app.config import settings +from app.services import lancer +from app.services.brazil_apis import consultar_cnpj +from app.models.entity import Entity, Relationship + + +def sanitize_text(text: str) -> str: + """ + Clean up text from model that may contain thinking artifacts. + Only removes thinking tags, does NOT remove valid characters. + """ + if not text: + return text + + # Remove thinking tags and content between them + text = re.sub(r'.*?', '', text, flags=re.DOTALL) + text = re.sub(r'<\|think\|>.*?<\|/think\|>', '', text, flags=re.DOTALL) + + # Remove other common model artifacts like <|...|> tags + text = re.sub(r'<\|.*?\|>', '', text) + + # Clean up excessive newlines only + text = re.sub(r'\n{3,}', '\n\n', text) + + return text.strip() + + +@dataclass +class Finding: + """A discovery made during investigation""" + title: str + content: str + source: str + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + + +@dataclass +class InvestigationResult: + """Complete investigation result""" + mission: str + findings: List[Finding] + entities_discovered: List[Dict[str, Any]] + connections_mapped: List[Dict[str, Any]] + report: str + iterations: int + tools_used: List[str] + status: str = "completed" + + +# Tool definitions for Cerebras API +TOOLS = [ + { + "type": "function", + "function": { + "name": "search_entity", + "description": "Buscar entidade no NUMIDIUM (grafo de conhecimento) por nome. 
Use para encontrar pessoas, empresas ou locais já conhecidos.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Nome ou termo para buscar" + }, + "entity_type": { + "type": "string", + "enum": ["person", "organization", "location", "any"], + "description": "Tipo de entidade (opcional)" + } + }, + "required": ["query"] + } + } + }, + { + "type": "function", + "function": { + "name": "get_connections", + "description": "Obter a rede de conexões de uma entidade específica. Retorna entidades relacionadas.", + "parameters": { + "type": "object", + "properties": { + "entity_id": { + "type": "string", + "description": "ID da entidade no NUMIDIUM" + } + }, + "required": ["entity_id"] + } + } + }, + { + "type": "function", + "function": { + "name": "lookup_cnpj", + "description": "Consultar dados de uma empresa brasileira pelo CNPJ. Retorna razão social, sócios, endereço, CNAEs, etc.", + "parameters": { + "type": "object", + "properties": { + "cnpj": { + "type": "string", + "description": "CNPJ da empresa (com ou sem formatação)" + } + }, + "required": ["cnpj"] + } + } + }, + { + "type": "function", + "function": { + "name": "web_search", + "description": "Pesquisar informações na web. Use para buscar notícias, artigos e informações públicas.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Termo de busca" + }, + "freshness": { + "type": "string", + "enum": ["day", "week", "month", "any"], + "description": "Frescor dos resultados", + "default": "any" + } + }, + "required": ["query"] + } + } + }, + { + "type": "function", + "function": { + "name": "deep_research", + "description": "Pesquisa profunda e multi-dimensional sobre um tema. 
Use para tópicos complexos.", + "parameters": { + "type": "object", + "properties": { + "topic": { + "type": "string", + "description": "Tópico para pesquisa profunda" + } + }, + "required": ["topic"] + } + } + }, + { + "type": "function", + "function": { + "name": "save_finding", + "description": "Salvar uma descoberta importante da investigação.", + "parameters": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Título curto da descoberta" + }, + "content": { + "type": "string", + "description": "Conteúdo detalhado" + }, + "source": { + "type": "string", + "description": "Fonte da informação" + } + }, + "required": ["title", "content", "source"] + } + } + }, + { + "type": "function", + "function": { + "name": "finish_investigation", + "description": "Finalizar a investigação e gerar o relatório final.", + "parameters": { + "type": "object", + "properties": { + "summary": { + "type": "string", + "description": "Resumo das descobertas principais" + } + }, + "required": ["summary"] + } + } + } +] + + +SYSTEM_PROMPT = """Você é um agente investigador autônomo do sistema NUMIDIUM/AVANGARD. /no_think + +Sua missão é investigar temas usando múltiplas fontes de dados: +- NUMIDIUM: Grafo de conhecimento com entidades e relacionamentos +- Consulta CNPJ: Dados oficiais de empresas brasileiras (BrasilAPI) +- Web Search: Pesquisa na internet via Lancer + +## Estratégia de Investigação: + +1. Comece buscando no NUMIDIUM se já temos informações sobre o alvo +2. Para empresas brasileiras, consulte o CNPJ para obter sócios e dados +3. Use web_search para buscar notícias e informações públicas +4. Para cada sócio/conexão descoberta, considere investigar mais a fundo +5. Use save_finding para registrar descobertas importantes +6. 
Quando tiver informações suficientes, use finish_investigation + +## Regras: +- Seja metódico e siga pistas +- Não invente informações - use apenas dados das ferramentas +- Priorize qualidade sobre quantidade +- Cite sempre as fontes +- NÃO use pensamento interno ou tags . Responda diretamente.""" + + +class InvestigatorAgent: + """Autonomous investigation agent with tool calling""" + + def __init__(self): + self.api_url = "https://api.cerebras.ai/v1/chat/completions" + self.api_key = settings.cerebras_api_key + self.model = "zai-glm-4.7" + + # Investigation state + self.findings: List[Finding] = [] + self.entities_discovered: List[Dict[str, Any]] = [] + self.connections_mapped: List[Dict[str, Any]] = [] + self.tools_used: List[str] = [] + self.messages: List[Dict[str, Any]] = [] + self.db: Optional[Session] = None + + def _reset_state(self): + """Reset investigation state""" + self.findings = [] + self.entities_discovered = [] + self.connections_mapped = [] + self.tools_used = [] + self.messages = [] + + async def _call_llm( + self, + messages: List[Dict[str, Any]], + tools: List[Dict] = None + ) -> Dict[str, Any]: + """Call Cerebras API with tool calling support""" + try: + payload = { + "model": self.model, + "messages": messages, + "temperature": 0.3, + "max_tokens": 2048, + } + + if tools: + payload["tools"] = tools + payload["tool_choice"] = "auto" + payload["parallel_tool_calls"] = True + + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + self.api_url, + headers={ + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + }, + json=payload + ) + + if response.status_code != 200: + raise Exception(f"API error: {response.status_code} - {response.text}") + + return response.json() + + except Exception as e: + raise Exception(f"LLM call failed: {str(e)}") + + async def _execute_tool(self, tool_name: str, arguments: Dict) -> str: + """Execute a tool and return the result""" + 
self.tools_used.append(tool_name) + + try: + if tool_name == "search_entity": + return await self._search_entity( + arguments.get("query", ""), + arguments.get("entity_type") + ) + + elif tool_name == "get_connections": + return await self._get_connections(arguments.get("entity_id")) + + elif tool_name == "lookup_cnpj": + return await self._lookup_cnpj(arguments.get("cnpj", "")) + + + elif tool_name == "web_search": + return await self._web_search( + arguments.get("query", ""), + arguments.get("freshness", "any") + ) + + elif tool_name == "deep_research": + return await self._deep_research(arguments.get("topic", "")) + + elif tool_name == "aether_search": + return await self._aether_search(arguments.get("query", "")) + + elif tool_name == "aether_entities": + return await self._aether_entities() + + elif tool_name == "save_finding": + finding = Finding( + title=arguments.get("title", ""), + content=arguments.get("content", ""), + source=arguments.get("source", "") + ) + self.findings.append(finding) + return f"Descoberta salva: {finding.title}" + + elif tool_name == "finish_investigation": + return f"INVESTIGATION_COMPLETE: {arguments.get('summary', '')}" + + else: + return f"Ferramenta desconhecida: {tool_name}" + + except Exception as e: + return f"Erro ao executar {tool_name}: {str(e)}" + + async def _search_entity(self, query: str, entity_type: Optional[str]) -> str: + """Search entities in database""" + if not self.db: + return "Erro: Banco de dados não disponível" + + q = self.db.query(Entity).filter(Entity.name.ilike(f"%{query}%")) + if entity_type and entity_type != "any": + q = q.filter(Entity.type == entity_type) + + entities = q.limit(10).all() + + if entities: + result = [] + for e in entities: + self.entities_discovered.append({ + "id": str(e.id), + "name": e.name, + "type": e.type + }) + result.append({ + "id": str(e.id), + "name": e.name, + "type": e.type, + "description": e.description[:200] if e.description else None + }) + return 
json.dumps(result, ensure_ascii=False, indent=2) + + return "Nenhuma entidade encontrada no NUMIDIUM." + + async def _get_connections(self, entity_id: str) -> str: + """Get entity connections""" + if not self.db: + return "Erro: Banco de dados não disponível" + + relationships = self.db.query(Relationship).filter( + (Relationship.source_id == entity_id) | (Relationship.target_id == entity_id) + ).limit(20).all() + + if relationships: + connections = [] + for rel in relationships: + source = self.db.query(Entity).filter(Entity.id == rel.source_id).first() + target = self.db.query(Entity).filter(Entity.id == rel.target_id).first() + if source and target: + connections.append({ + "source": source.name, + "target": target.name, + "type": rel.type + }) + return json.dumps(connections, ensure_ascii=False, indent=2) + + return "Nenhuma conexão encontrada." + + async def _lookup_cnpj(self, cnpj: str) -> str: + """Lookup CNPJ via BrasilAPI""" + cnpj_clean = cnpj.replace(".", "").replace("/", "").replace("-", "") + result = await consultar_cnpj(cnpj_clean) + + if result: + data = { + "razao_social": result.razao_social, + "nome_fantasia": result.nome_fantasia, + "situacao": result.situacao, + "data_abertura": result.data_abertura, + "capital_social": result.capital_social, + "endereco": f"{result.logradouro}, {result.numero} - {result.cidade}/{result.uf}", + "cnae": f"{result.cnae_principal} - {result.cnae_descricao}", + "socios": result.socios + } + return json.dumps(data, ensure_ascii=False, indent=2) + + return "CNPJ não encontrado." 
+ + async def _lookup_phone(self, phone: str) -> str: + """Lookup phone number via NumVerify API""" + # Clean phone number - keep only digits + phone_clean = "".join(c for c in phone if c.isdigit()) + + # NumVerify API key (free tier: 100 req/month) + numverify_key = getattr(settings, 'numverify_api_key', None) + + if not numverify_key: + # Fallback: just do a web search for the number + return await self._web_search(f'"{phone_clean}" telefone', "any") + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.get( + "http://apilayer.net/api/validate", + params={ + "access_key": numverify_key, + "number": phone_clean, + "country_code": "", # Auto-detect + "format": 1 + } + ) + + if response.status_code == 200: + data = response.json() + + if data.get("valid"): + result = { + "numero": data.get("international_format"), + "valido": True, + "pais": data.get("country_name"), + "codigo_pais": data.get("country_code"), + "operadora": data.get("carrier"), + "tipo_linha": data.get("line_type"), # mobile, landline, etc + "localizacao": data.get("location") + } + return json.dumps(result, ensure_ascii=False, indent=2) + else: + return f"Número {phone_clean} não é válido ou não foi encontrado." + + return "Erro ao consultar número." + + except Exception as e: + # Fallback to web search + return await self._web_search(f'"{phone_clean}" telefone', "any") + + async def _web_search(self, query: str, freshness: str) -> str: + """Web search via Lancer""" + try: + result = await lancer.search(query, max_results=5, freshness=freshness) + if result.answer: + return f"Resumo: {result.answer}\n\nFontes: {len(result.results)} resultados" + return "Nenhum resultado encontrado." 
+ except Exception as e: + return f"Erro na busca web: {str(e)}" + + async def _deep_research(self, topic: str) -> str: + """Deep research via Lancer""" + try: + result = await lancer.deep_research(topic, max_dimensions=3) + if result.answer: + return result.answer + return "Pesquisa profunda não retornou resultados." + except Exception as e: + return f"Erro na pesquisa: {str(e)}" + + async def _aether_search(self, query: str) -> str: + """Semantic search via AetherMap""" + try: + # Check if we have a job_id cached + if not aethermap.current_job_id: + # Index entities from database first + if self.db: + entities = self.db.query(Entity).limit(500).all() + if entities: + texts = [] + for e in entities: + text = f"{e.name} ({e.type})" + if e.description: + text += f": {e.description[:500]}" + texts.append(text) + + if texts: + result = await aethermap.process_documents(texts, fast_mode=True) + # Continue with search + + if aethermap.current_job_id: + result = await aethermap.semantic_search(query, turbo_mode=True) + return f"RAG Response:\n{result.summary}" + else: + return "Nenhum documento indexado no AetherMap." + + except Exception as e: + return f"Erro no AetherMap search: {str(e)}" + + async def _aether_entities(self) -> str: + """Extract NER entities via AetherMap""" + try: + if not aethermap.current_job_id: + return "Nenhum documento indexado. Use aether_search primeiro." 
+ + result = await aethermap.extract_entities() + + # Format response + output = [] + + if result.hubs: + output.append("**Entidades Centrais (Hubs):**") + for hub in result.hubs[:5]: + output.append(f"- {hub.get('entity')} ({hub.get('type')}): {hub.get('degree')} conexões") + + if result.insights: + output.append(f"\n**Insights:**") + output.append(f"- Total de conexões: {result.insights.get('total_connections', 0)}") + output.append(f"- Grau médio: {result.insights.get('avg_degree', 0)}") + + if result.edges: + output.append(f"\n**Top 5 Relacionamentos:**") + for edge in result.edges[:5]: + output.append(f"- {edge.source_entity} <-> {edge.target_entity}: {edge.reason}") + + return "\n".join(output) if output else "Nenhuma entidade significativa encontrada." + + except Exception as e: + return f"Erro na extração de entidades: {str(e)}" + + async def investigate( + self, + mission: str, + db: Session, + max_iterations: int = 10 + ) -> InvestigationResult: + """Main investigation loop""" + self._reset_state() + self.db = db + + self.messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": f"Missão de investigação: {mission}\n\nComece a investigação."} + ] + + iteration = 0 + final_summary = "" + + while iteration < max_iterations: + iteration += 1 + + response = await self._call_llm(self.messages, TOOLS) + + choice = response["choices"][0] + message = choice["message"] + self.messages.append(message) + + tool_calls = message.get("tool_calls", []) + + if not tool_calls: + if message.get("content"): + final_summary = message["content"] + break + + for tool_call in tool_calls: + func = tool_call["function"] + tool_name = func["name"] + + try: + arguments = json.loads(func["arguments"]) + except: + arguments = {} + + result = await self._execute_tool(tool_name, arguments) + + if result.startswith("INVESTIGATION_COMPLETE:"): + final_summary = result.replace("INVESTIGATION_COMPLETE:", "").strip() + break + + self.messages.append({ + 
"role": "tool", + "tool_call_id": tool_call["id"], + "content": result + }) + + if final_summary: + break + + if not final_summary: + final_summary = await self._generate_report(mission) + + # Sanitize all text outputs to remove thinking artifacts + final_summary = sanitize_text(final_summary) + + # Sanitize findings content + sanitized_findings = [] + for f in self.findings: + sanitized_findings.append(Finding( + title=sanitize_text(f.title), + content=sanitize_text(f.content), + source=f.source, + timestamp=f.timestamp + )) + + return InvestigationResult( + mission=mission, + findings=sanitized_findings, + entities_discovered=self.entities_discovered, + connections_mapped=self.connections_mapped, + report=final_summary, + iterations=iteration, + tools_used=list(set(self.tools_used)), + status="completed" + ) + + async def _generate_report(self, mission: str) -> str: + """Generate final report""" + findings_text = "\n".join([ + f"- {f.title}: {f.content} (Fonte: {f.source})" + for f in self.findings + ]) or "Nenhuma descoberta registrada." + + entities_text = ", ".join([ + e.get("name", "Unknown") for e in self.entities_discovered[:10] + ]) or "Nenhuma entidade." 
+ + prompt = f"""Gere um relatório de investigação: + +Missão: {mission} + +Descobertas: +{findings_text} + +Entidades: {entities_text} + +Ferramentas usadas: {', '.join(set(self.tools_used))} + +Gere relatório estruturado com: Resumo Executivo, Descobertas, Entidades, Recomendações.""" + + response = await self._call_llm([ + {"role": "system", "content": "Gere relatórios concisos."}, + {"role": "user", "content": prompt} + ]) + + return sanitize_text(response["choices"][0]["message"]["content"]) + + +# Singleton +investigator_agent = InvestigatorAgent() diff --git a/app/services/lancer.py b/app/services/lancer.py new file mode 100644 index 0000000000000000000000000000000000000000..179868cdd00136f0a9376b6ea6fdff3df5b48abf --- /dev/null +++ b/app/services/lancer.py @@ -0,0 +1,198 @@ +""" +Lancer Deep Research Service +Integrates with Lancer Search API for AI-powered research +""" +import httpx +from typing import Optional, List, Dict, Any +from dataclasses import dataclass + + +LANCER_BASE_URL = "https://madras1-lancer.hf.space" + + +@dataclass +class SearchResult: + """Individual search result from Lancer""" + title: str + url: str + content: str + score: float + published_date: Optional[str] = None + + +@dataclass +class ResearchResponse: + """Response from Lancer research/search""" + query: str + answer: Optional[str] + results: List[SearchResult] + citations: List[Dict[str, Any]] + processing_time_ms: float + raw_text: str # Combined text for NER extraction + + +async def search( + query: str, + max_results: int = 10, + freshness: str = "any" +) -> ResearchResponse: + """ + Perform a search with AI synthesis using Lancer API. 
async def search(
    query: str,
    max_results: int = 10,
    freshness: str = "any"
) -> ResearchResponse:
    """
    Perform a search with AI synthesis using Lancer API.

    Raises:
        Exception: wrapped transport/API errors ("Lancer search failed: ..."),
            chained to the original cause for debuggability.
    """
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                f"{LANCER_BASE_URL}/api/v1/search",
                json={
                    "query": query,
                    "max_results": max_results,
                    "freshness": freshness,
                    "include_answer": True
                }
            )

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            results = [
                SearchResult(
                    title=r.get("title", ""),
                    url=r.get("url", ""),
                    content=r.get("content", ""),
                    score=r.get("score", 0.0),
                    published_date=r.get("published_date")
                )
                for r in data.get("results", [])
            ]

            # Combine all text for downstream NER.
            raw_text = data.get("answer", "") or ""
            for r in results:
                raw_text += f"\n{r.title}. {r.content}"

            return ResearchResponse(
                query=data.get("query", query),
                answer=data.get("answer"),
                results=results,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=raw_text
            )

    except Exception as e:
        # FIX: chain the original exception so the real cause (timeout,
        # DNS, JSON decode, ...) survives in the traceback.
        raise Exception(f"Lancer search failed: {str(e)}") from e


async def deep_research(
    query: str,
    max_dimensions: int = 5,
    max_sources_per_dim: int = 5
) -> ResearchResponse:
    """
    Perform deep multi-dimensional research using Lancer API.
    This provides richer, more comprehensive analysis.

    Raises:
        Exception: wrapped errors ("Lancer deep research failed: ..."),
            chained to the original cause.
    """
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{LANCER_BASE_URL}/api/v1/research/deep",
                json={
                    "query": query,
                    "max_dimensions": max_dimensions,
                    "max_sources_per_dim": max_sources_per_dim,
                    "max_total_searches": 20
                }
            )

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            # Deep research returns a different format - adapt it.
            results = []
            raw_text = ""

            # Extract from dimensions if present.
            if "dimensions" in data:
                for dim in data["dimensions"]:
                    dim_name = dim.get("dimension", "")
                    raw_text += f"\n## {dim_name}\n"
                    for r in dim.get("results", []):
                        results.append(SearchResult(
                            title=r.get("title", ""),
                            url=r.get("url", ""),
                            content=r.get("content", ""),
                            score=r.get("score", 0.0)
                        ))
                        raw_text += f"{r.get('title', '')}. {r.get('content', '')}\n"

            # Prepend the final report when the API provides one.
            final_report = data.get("final_report", data.get("report", ""))
            if final_report:
                raw_text = final_report + "\n" + raw_text

            return ResearchResponse(
                query=query,
                answer=final_report,
                results=results,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=raw_text
            )

    except Exception as e:
        # FIX: preserve the original traceback via exception chaining.
        raise Exception(f"Lancer deep research failed: {str(e)}") from e


async def heavy_search(
    query: str,
    max_results: int = 5
) -> ResearchResponse:
    """
    Heavy search with full content scraping from sources.
    Slower but provides more context.

    Raises:
        Exception: wrapped errors ("Lancer heavy search failed: ..."),
            chained to the original cause.
    """
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                f"{LANCER_BASE_URL}/api/v1/search/heavy",
                json={
                    "query": query,
                    "max_results": max_results,
                    "include_answer": True
                }
            )

            if response.status_code != 200:
                raise Exception(f"Lancer API error: {response.status_code}")

            data = response.json()

            results = [
                SearchResult(
                    title=r.get("title", ""),
                    url=r.get("url", ""),
                    content=r.get("content", ""),
                    score=r.get("score", 0.0)
                )
                for r in data.get("results", [])
            ]

            raw_text = data.get("answer", "") or ""
            for r in results:
                raw_text += f"\n{r.title}. {r.content}"

            return ResearchResponse(
                query=query,
                answer=data.get("answer"),
                results=results,
                citations=data.get("citations", []),
                processing_time_ms=data.get("processing_time_ms", 0),
                raw_text=raw_text
            )

    except Exception as e:
        # FIX: preserve the original traceback via exception chaining.
        raise Exception(f"Lancer heavy search failed: {str(e)}") from e
"""
Entity Extractor Service - LLM-based NER
Uses Cerebras API with Qwen 3 235B for intelligent entity and relationship extraction
"""
import json
import re
from typing import Dict, List, Optional, Any
from dataclasses import dataclass


@dataclass
class ExtractedEntity:
    """Represents an extracted entity"""
    name: str
    type: str  # person, organization, location, event
    role: Optional[str] = None
    aliases: Optional[List[str]] = None
    description: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    event_date: Optional[str] = None  # Date in ISO format (YYYY-MM-DD)


@dataclass
class ExtractedRelationship:
    """Represents a relationship between entities"""
    source: str
    target: str
    relationship_type: str
    context: Optional[str] = None
    event_date: Optional[str] = None  # Date in ISO format (YYYY-MM-DD)


@dataclass
class ExtractedEvent:
    """Represents an extracted event"""
    description: str
    event_type: Optional[str] = None
    date: Optional[str] = None
    location: Optional[str] = None
    participants: Optional[List[str]] = None


@dataclass
class ExtractionResult:
    """Complete extraction result"""
    entities: List[ExtractedEntity]
    relationships: List[ExtractedRelationship]
    events: List[ExtractedEvent]
    raw_response: Optional[str] = None


EXTRACTION_PROMPT = """Você é um especialista em extração de informações estruturadas de textos.

Analise o texto fornecido e extraia TODAS as entidades, relacionamentos e eventos mencionados.

## Regras:
1. Identifique entidades: pessoas, organizações, locais, eventos
2. Para PESSOAS: inclua nome completo (se mencionado ou conhecido), cargo/função
3. Para ORGANIZAÇÕES: inclua nome oficial e siglas
4. Para LOCAIS: seja específico (cidade, país, endereço)
5. Identifique RELACIONAMENTOS entre entidades (quem trabalha onde, quem conhece quem, etc.)
6. Identifique EVENTOS mencionados (reuniões, anúncios, eleições, etc.)
7. EXTRAIA DATAS sempre que mencionadas (formato YYYY-MM-DD ou YYYY se só o ano)

## Formato de resposta (JSON válido):
```json
{{
  "entities": [
    {{
      "name": "Nome Completo",
      "type": "person|organization|location|event",
      "role": "cargo ou função (opcional)",
      "aliases": ["apelidos", "siglas"],
      "description": "breve descrição se relevante",
      "event_date": "YYYY-MM-DD ou YYYY (data relevante como nascimento, fundação, etc)"
    }}
  ],
  "relationships": [
    {{
      "source": "Nome da Entidade 1",
      "target": "Nome da Entidade 2",
      "relationship_type": "tipo de relação (trabalha em, preside, fundou, reuniu-se com, etc.)",
      "context": "contexto da relação",
      "event_date": "YYYY-MM-DD ou YYYY (quando o relacionamento aconteceu/iniciou)"
    }}
  ],
  "events": [
    {{
      "description": "O que aconteceu",
      "event_type": "meeting|announcement|election|crime|etc",
      "date": "YYYY-MM-DD ou YYYY",
      "location": "local se mencionado",
      "participants": ["lista de participantes"]
    }}
  ]
}}
```

Retorne APENAS o JSON, sem texto adicional.

## Texto para análise:
{text}
"""


class EntityExtractor:
    """
    LLM-based Entity Extractor using Cerebras API
    """

    def __init__(self):
        # Lazy project import keeps this module importable (e.g. for unit
        # tests) even when app.config is not on the path; in the running
        # application behavior is unchanged.
        try:
            from app.config import settings
            self.api_key = settings.cerebras_api_key
        except ImportError:
            self.api_key = None
        self.base_url = "https://api.cerebras.ai/v1"
        self.model = "qwen-3-235b-a22b-instruct-2507"
        self.timeout = 60.0

    async def extract(self, text: str) -> ExtractionResult:
        """
        Extract entities, relationships, and events from text using LLM

        Args:
            text: The text to analyze

        Returns:
            ExtractionResult with all extracted information

        Raises:
            ValueError: when the API key is missing, or on API/transport errors.
        """
        # Lazy import: httpx is only needed when actually calling the API.
        import httpx

        if not self.api_key:
            raise ValueError("CEREBRAS_API_KEY not configured. Please set the environment variable.")

        if not text or len(text.strip()) < 10:
            return ExtractionResult(entities=[], relationships=[], events=[])

        # Prepare the prompt
        prompt = EXTRACTION_PROMPT.format(text=text)

        try:
            # Call Cerebras API
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.base_url}/chat/completions",
                    headers={
                        "Authorization": f"Bearer {self.api_key}",
                        "Content-Type": "application/json"
                    },
                    json={
                        "model": self.model,
                        "messages": [
                            {
                                "role": "system",
                                "content": "Você é um assistente especialista em extração de entidades e relacionamentos. Sempre responda em JSON válido."
                            },
                            {
                                "role": "user",
                                "content": prompt
                            }
                        ],
                        "temperature": 0.1,  # Low temperature for consistent extraction
                        "max_tokens": 4096
                    }
                )

                if response.status_code != 200:
                    error_text = response.text
                    print(f"Cerebras API error: {response.status_code} - {error_text}")
                    raise ValueError(f"Cerebras API error: {response.status_code}")

                data = response.json()

                # Parse the response
                raw_content = data["choices"][0]["message"]["content"]
                return self._parse_response(raw_content)

        except httpx.TimeoutException:
            print("Cerebras API timeout")
            raise ValueError("API timeout - please try again with shorter text")
        except httpx.RequestError as e:
            print(f"Cerebras API request error: {e}")
            raise ValueError(f"API connection error: {str(e)}")
        except KeyError as e:
            print(f"Unexpected API response format: {e}")
            raise ValueError("Unexpected API response format")

    def _parse_response(self, content: str) -> ExtractionResult:
        """Parse the LLM response into structured data.

        Accepts raw JSON, JSON wrapped in ```json fences, or JSON embedded
        in surrounding prose; returns an empty result (with raw_response
        preserved) when nothing parseable is found.
        """
        try:
            # Try to extract JSON from the response.
            # Sometimes the model wraps it in ```json ... ```
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Try to find raw JSON
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    json_str = json_match.group(0)
                else:
                    json_str = content

            data = json.loads(json_str)

            # FIX: the model can return a non-object top level (e.g. a bare
            # list); previously that crashed with an uncaught AttributeError
            # on data.get(). Treat it as "nothing extracted" instead.
            if not isinstance(data, dict):
                return ExtractionResult(
                    entities=[],
                    relationships=[],
                    events=[],
                    raw_response=content
                )

            # Parse entities
            entities = []
            for e in data.get("entities", []):
                entities.append(ExtractedEntity(
                    name=e.get("name", ""),
                    type=e.get("type", "unknown"),
                    role=e.get("role"),
                    aliases=e.get("aliases", []),
                    description=e.get("description"),
                    event_date=e.get("event_date")
                ))

            # Parse relationships
            relationships = []
            for r in data.get("relationships", []):
                relationships.append(ExtractedRelationship(
                    source=r.get("source", ""),
                    target=r.get("target", ""),
                    relationship_type=r.get("relationship_type", "related_to"),
                    context=r.get("context"),
                    event_date=r.get("event_date")
                ))

            # Parse events
            events = []
            for ev in data.get("events", []):
                events.append(ExtractedEvent(
                    description=ev.get("description", ""),
                    event_type=ev.get("event_type"),
                    date=ev.get("date"),
                    location=ev.get("location"),
                    participants=ev.get("participants", [])
                ))

            return ExtractionResult(
                entities=entities,
                relationships=relationships,
                events=events,
                raw_response=content
            )

        except json.JSONDecodeError as e:
            print(f"Failed to parse LLM response: {e}")
            print(f"Raw content: {content}")
            return ExtractionResult(
                entities=[],
                relationships=[],
                events=[],
                raw_response=content
            )

    def extract_sync(self, text: str) -> ExtractionResult:
        """
        Synchronous version of extract for non-async contexts.
        Note: must not be called from within a running event loop.
        """
        import asyncio
        return asyncio.run(self.extract(text))


# Singleton instance
entity_extractor = EntityExtractor()
"""
Portal da Transparência APIs
Access to Brazilian government transparency data
"""
import httpx
from typing import Optional, Dict, Any, List
from dataclasses import dataclass


# Base URL for the Portal da Transparência REST API.
TRANSPARENCIA_URL = "https://api.portaldatransparencia.gov.br/api-de-dados"


@dataclass
class SanctionRecord:
    """One sanction/punishment record from CEIS, CNEP or CEPIM."""
    tipo: str  # CEIS, CNEP, CEPIM
    cpf_cnpj: str
    nome: str
    tipo_pessoa: str  # 'F' or 'J'

    # Sanction details
    tipo_sancao: str = ""
    data_inicio: str = ""
    data_fim: str = ""
    orgao_sancionador: str = ""
    uf_orgao: str = ""
    fundamentacao_legal: str = ""

    # Source
    fonte_url: str = ""


async def consultar_ceis(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CEIS - Cadastro de Empresas Inidôneas e Suspensas
    Note: Requires authentication token from Portal da Transparência
    """
    # Without token, we can still try - some endpoints work without auth.
    return await _query_sanctions("ceis", cnpj_cpf, token)


async def consultar_cnep(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CNEP - Cadastro Nacional de Empresas Punidas
    """
    return await _query_sanctions("cnep", cnpj_cpf, token)


async def consultar_cepim(cnpj_cpf: str, token: Optional[str] = None) -> List[SanctionRecord]:
    """
    Query CEPIM - Cadastro de Entidades Privadas sem Fins Lucrativos Impedidas
    """
    return await _query_sanctions("cepim", cnpj_cpf, token)


async def _query_sanctions(
    endpoint: str,
    cnpj_cpf: str,
    token: Optional[str] = None
) -> List[SanctionRecord]:
    """Shared worker behind the three public consultar_* wrappers."""

    def _unwrap(item: Dict[str, Any], key: str, subkey: str) -> str:
        # The API sometimes nests these fields as objects, sometimes flat.
        raw = item.get(key)
        if isinstance(raw, dict):
            return raw.get(subkey, "")
        return str(item.get(key, ""))

    try:
        headers = {"chave-api-dados": token} if token else {}

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(
                f"{TRANSPARENCIA_URL}/{endpoint}",
                params={"cnpjCpf": cnpj_cpf},
                headers=headers
            )

            if response.status_code == 401:
                # Need authentication - return empty for now.
                print(f"Portal da Transparência requires authentication for {endpoint}")
                return []

            if response.status_code != 200:
                return []

            payload = response.json()
            if not isinstance(payload, list):
                payload = [payload] if payload else []

            return [
                SanctionRecord(
                    tipo=endpoint.upper(),
                    cpf_cnpj=item.get("cpfCnpj", ""),
                    nome=item.get("nomeRazaoSocial", item.get("nome", "")),
                    tipo_pessoa=item.get("tipoPessoa", ""),
                    tipo_sancao=_unwrap(item, "tipoSancao", "descricao"),
                    data_inicio=item.get("dataInicioSancao", ""),
                    data_fim=item.get("dataFimSancao", ""),
                    orgao_sancionador=_unwrap(item, "orgaoSancionador", "nome"),
                    uf_orgao=item.get("ufOrgaoSancionador", ""),
                    fundamentacao_legal=item.get("fundamentacaoLegal", ""),
                    fonte_url=f"https://portaldatransparencia.gov.br/{endpoint}"
                )
                for item in payload
            ]

    except Exception as e:
        print(f"Transparência API error ({endpoint}): {e}")
        return []


async def verificar_sancoes(cnpj_cpf: str, token: Optional[str] = None) -> Dict[str, Any]:
    """
    Check all sanction databases for a CNPJ/CPF
    Returns consolidated result
    """
    import asyncio

    # Hit the three databases concurrently.
    ceis, cnep, cepim = await asyncio.gather(
        consultar_ceis(cnpj_cpf, token),
        consultar_cnep(cnpj_cpf, token),
        consultar_cepim(cnpj_cpf, token),
    )

    todas = ceis + cnep + cepim

    return {
        "cnpj_cpf": cnpj_cpf,
        "tem_sancoes": len(todas) > 0,
        "total_sancoes": len(todas),
        "ceis": len(ceis),
        "cnep": len(cnep),
        "cepim": len(cepim),
        "registros": [
            {
                "tipo": s.tipo,
                "tipo_sancao": s.tipo_sancao,
                "orgao": s.orgao_sancionador,
                "inicio": s.data_inicio,
                "fim": s.data_fim,
                "fundamentacao": s.fundamentacao_legal
            }
            for s in todas
        ]
    }


# ---------------------------------------------------------------------------
# TSE (Tribunal Superior Eleitoral) API Service
# Access to Brazilian electoral data - candidates, assets, donations
# ---------------------------------------------------------------------------
from dataclasses import field

# DivulgaCand API (unofficial but functional)
TSE_DIVULGACAND_URL = "https://divulgacandcontas.tse.jus.br/divulga/rest/v1"


@dataclass
class Candidato:
    """Electoral candidate data"""
    id: int
    nome: str
    nome_urna: str
    cpf_parcial: str = ""  # TSE only shows partial
    numero: str = ""
    cargo: str = ""
    partido_sigla: str = ""
    partido_nome: str = ""
    coligacao: str = ""
    situacao: str = ""

    # Location
    uf: str = ""
    municipio: str = ""

    # Personal
    data_nascimento: str = ""
    genero: str = ""
    grau_instrucao: str = ""
    ocupacao: str = ""

    # Assets
    total_bens: float = 0.0
    bens: List[Dict[str, Any]] = field(default_factory=list)

    # Campaign
    total_receitas: float = 0.0
    total_despesas: float = 0.0


@dataclass
class Eleicao:
    """Election metadata"""
    id: int
    ano: int
    descricao: str
    turno: int = 1


async def listar_eleicoes() -> List[Eleicao]:
    """List available elections, newest first."""
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(f"{TSE_DIVULGACAND_URL}/eleicao/ordinarias")

            if response.status_code != 200:
                return []

            eleicoes = [
                Eleicao(
                    id=item.get("id", 0),
                    ano=item.get("ano", 0),
                    descricao=item.get("descricaoEleicao", ""),
                    turno=item.get("turno", 1)
                )
                for item in response.json()
            ]

            return sorted(eleicoes, key=lambda e: e.ano, reverse=True)

    except Exception as e:
        print(f"TSE eleicoes error: {e}")
        return []
async def buscar_candidatos(
    nome: str,
    ano: int = 2024,
    uf: Optional[str] = None,
    cargo: Optional[str] = None
) -> List[Candidato]:
    """
    Search for candidates by name.

    Args:
        nome: Candidate name to search
        ano: Election year (default 2024)
        uf: State filter (optional)
        cargo: Position filter (optional)

    Returns:
        A (possibly empty) list of Candidato; errors are swallowed and
        logged, returning [].
    """
    try:
        # First get the election ID for the year.
        eleicoes = await listar_eleicoes()
        eleicao = next((e for e in eleicoes if e.ano == ano), None)

        if not eleicao:
            # Fall back to well-known election IDs when the listing fails.
            eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546)
        else:
            eleicao_id = eleicao.id

        base_url = f"{TSE_DIVULGACAND_URL}/candidatura/listar/{ano}/{eleicao_id}"

        params = {"nomeCompleto": nome}
        if uf:
            params["uf"] = uf.upper()
        if cargo:
            params["cargo"] = cargo

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(base_url, params=params)

            if response.status_code != 200:
                return []

            data = response.json()
            candidatos_data = data.get("candidatos", [])

            candidatos = []
            for item in candidatos_data:
                candidatos.append(Candidato(
                    id=item.get("id", 0),
                    nome=item.get("nomeCompleto", ""),
                    nome_urna=item.get("nomeUrna", ""),
                    # TSE only exposes a partial CPF; mask the rest.
                    cpf_parcial=item.get("cpf", "")[:3] + ".***.***-**" if item.get("cpf") else "",
                    numero=str(item.get("numero", "")),
                    # Several fields arrive either as nested objects or flat strings.
                    cargo=item.get("cargo", {}).get("nome", "") if isinstance(item.get("cargo"), dict) else str(item.get("cargo", "")),
                    partido_sigla=item.get("partido", {}).get("sigla", "") if isinstance(item.get("partido"), dict) else "",
                    partido_nome=item.get("partido", {}).get("nome", "") if isinstance(item.get("partido"), dict) else "",
                    uf=item.get("ufSigla", "") or item.get("uf", ""),
                    municipio=item.get("municipio", {}).get("nome", "") if isinstance(item.get("municipio"), dict) else "",
                    situacao=item.get("situacao", ""),
                    total_bens=float(item.get("totalDeBens", 0) or 0)
                ))

            return candidatos

    except Exception as e:
        print(f"TSE search error: {e}")
        return []


async def obter_candidato_detalhes(
    id_candidato: int,
    ano: int = 2024,
    eleicao_id: Optional[int] = None
) -> Optional[Candidato]:
    """Get detailed candidate information including assets.

    Returns None when the candidate cannot be fetched; asset (bens) lookup
    is best-effort and never fails the whole call.
    """
    try:
        if not eleicao_id:
            eleicao_id = {2024: 546, 2022: 544, 2020: 426, 2018: 295}.get(ano, 546)

        async with httpx.AsyncClient(timeout=30.0) as client:
            # Get candidate details
            response = await client.get(
                f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}"
            )

            if response.status_code != 200:
                return None

            item = response.json()

            candidato = Candidato(
                id=item.get("id", 0),
                nome=item.get("nomeCompleto", ""),
                nome_urna=item.get("nomeUrna", ""),
                numero=str(item.get("numero", "")),
                cargo=item.get("cargo", {}).get("nome", "") if isinstance(item.get("cargo"), dict) else "",
                partido_sigla=item.get("partido", {}).get("sigla", "") if isinstance(item.get("partido"), dict) else "",
                partido_nome=item.get("partido", {}).get("nome", "") if isinstance(item.get("partido"), dict) else "",
                uf=item.get("ufSigla", ""),
                municipio=item.get("localCandidatura", ""),
                situacao=item.get("situacao", ""),
                data_nascimento=item.get("dataNascimento", ""),
                genero=item.get("genero", ""),
                grau_instrucao=item.get("grauInstrucao", ""),
                ocupacao=item.get("ocupacao", ""),
                total_bens=float(item.get("totalDeBens", 0) or 0)
            )

            # Try to get assets (bens) - best-effort.
            try:
                bens_response = await client.get(
                    f"{TSE_DIVULGACAND_URL}/candidatura/buscar/{ano}/{eleicao_id}/candidato/{id_candidato}/bens"
                )
                if bens_response.status_code == 200:
                    bens_data = bens_response.json()
                    candidato.bens = [
                        {
                            "tipo": b.get("tipoBem", ""),
                            "descricao": b.get("descricao", ""),
                            "valor": float(b.get("valor", 0) or 0)
                        }
                        for b in bens_data
                    ]
            except Exception:
                # FIX: was a bare `except:` that also swallowed
                # asyncio.CancelledError, breaking task cancellation.
                pass

            return candidato

    except Exception as e:
        print(f"TSE details error: {e}")
        return None


async def buscar_politico(nome: str) -> Dict[str, Any]:
    """
    Search for a politician across multiple elections.
    Returns consolidated information.
    """
    resultado = {
        "nome": nome,
        "encontrado": False,
        "candidaturas": [],
        "ultimo_cargo": None,
        "total_patrimonio": 0.0,
        "partidos": set(),
        "ufs": set()
    }

    # Search in recent elections - continue through ALL years.
    for ano in [2024, 2022, 2020, 2018]:
        try:
            candidatos = await buscar_candidatos(nome, ano=ano)
            print(f"TSE: Buscando '{nome}' em {ano} - encontrados: {len(candidatos)}")

            for c in candidatos:
                # Match if nome is in the candidate's full name.
                if nome.lower() in c.nome.lower() or nome.lower() in c.nome_urna.lower():
                    resultado["encontrado"] = True
                    resultado["candidaturas"].append({
                        "ano": ano,
                        "cargo": c.cargo,
                        "partido": c.partido_sigla,
                        "uf": c.uf,
                        "situacao": c.situacao,
                        "patrimonio": c.total_bens
                    })

                    if c.partido_sigla:
                        resultado["partidos"].add(c.partido_sigla)
                    if c.uf:
                        resultado["ufs"].add(c.uf)

                    if c.total_bens > resultado["total_patrimonio"]:
                        resultado["total_patrimonio"] = c.total_bens

                    if not resultado["ultimo_cargo"]:
                        resultado["ultimo_cargo"] = f"{c.cargo} ({ano})"
        except Exception as e:
            print(f"TSE search {ano} error: {e}")
            continue

    # Convert sets to lists for JSON; FIX: sorted for deterministic output
    # (plain list(set) ordering varied between runs).
    resultado["partidos"] = sorted(resultado["partidos"])
    resultado["ufs"] = sorted(resultado["ufs"])

    print(f"TSE resultado para '{nome}': encontrado={resultado['encontrado']}, candidaturas={len(resultado['candidaturas'])}")

    return resultado
files /dev/null and b/data/numidium.db differ diff --git a/requirements.txt b/requirements.txt index d3d8138add29dc954de2005bd86bdec53dd629e8..8d6b074afd2336d205fa0443bc46feac92470d10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,12 @@ -# --- SERVIDOR E API --- -fastapi -uvicorn[standard] -python-multipart -openai -prometheus-fastapi-instrumentator -prometheus-client -tavily-python - -# --- MACHINE LEARNING E NLP --- -sentence-transformers -numpy -pandas -scikit-learn -scipy -umap-learn -hdbscan -faiss-cpu -nltk -spacy -langdetect -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl -https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl - -# --- TORCH CPU (>=2.6 required for CVE-2025-32434) --- -torch>=2.6.0 -torchvision -torchaudio \ No newline at end of file +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +sqlalchemy==2.0.23 +pydantic==2.5.2 +pydantic-settings==2.1.0 +requests==2.31.0 +beautifulsoup4==4.12.2 +httpx==0.25.2 +python-multipart==0.0.6 +aiohttp==3.9.1 +feedparser==6.0.10 +# httpx already included - used for Cerebras API calls