neural-thinker's picture
feat: initial cidadao.ai-models deployment
b95e73a
raw
history blame
20.3 kB
"""
Cidadão.AI - Hugging Face Transformers Integration
Modelo especializado em transparência pública brasileira
compatível com a biblioteca transformers do Hugging Face.
"""
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, List, Union, Tuple

import torch
import torch.nn as nn
from transformers import (
    PreTrainedModel, PretrainedConfig,
    AutoModel, AutoTokenizer,
    pipeline, Pipeline
)
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput
from transformers.utils import ModelOutput
logger = logging.getLogger(__name__)
class CidadaoAIConfig(PretrainedConfig):
    """Hugging Face configuration for the Cidadão.AI transparency model.

    Extends ``PretrainedConfig`` with the backbone dimensions plus the knobs
    for the three specialized heads (anomaly detection, financial analysis
    and legal compliance) used on Brazilian public-transparency data.
    """

    model_type = "cidadao-gpt"

    def __init__(
        self,
        vocab_size: int = 50257,
        hidden_size: int = 1024,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        intermediate_size: int = 4096,
        max_position_embeddings: int = 8192,
        # Transparency-specific settings
        transparency_vocab_size: int = 2048,
        corruption_detection_layers: int = 4,
        financial_analysis_dim: int = 512,
        legal_understanding_dim: int = 256,
        # Dropout settings
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        # Activation function
        hidden_act: str = "gelu",
        # Weight initialization
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        # Specialized-task toggles
        enable_anomaly_detection: bool = True,
        enable_financial_analysis: bool = True,
        enable_legal_reasoning: bool = True,
        # Label counts for the classification heads
        num_anomaly_labels: int = 3,  # Normal, Suspicious, Anomalous
        num_financial_labels: int = 5,  # Very Low, Low, Medium, High, Very High
        num_legal_labels: int = 2,  # Non-compliant, Compliant
        **kwargs
    ):
        super().__init__(**kwargs)

        # Backbone dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings

        # Transparency-domain extensions.
        self.transparency_vocab_size = transparency_vocab_size
        self.corruption_detection_layers = corruption_detection_layers
        self.financial_analysis_dim = financial_analysis_dim
        self.legal_understanding_dim = legal_understanding_dim

        # Regularization / activation / initialization.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        # Which specialized heads are instantiated by the model.
        self.enable_anomaly_detection = enable_anomaly_detection
        self.enable_financial_analysis = enable_financial_analysis
        self.enable_legal_reasoning = enable_legal_reasoning

        # Per-head label counts.
        self.num_anomaly_labels = num_anomaly_labels
        self.num_financial_labels = num_financial_labels
        self.num_legal_labels = num_legal_labels
@dataclass
class CidadaoAIModelOutput(ModelOutput):
    """Structured output of :class:`CidadaoAIModel`.

    ``BaseModelOutput`` only declares ``last_hidden_state``, ``hidden_states``
    and ``attentions``; the previous code passed the specialized-head tensors
    to it as extra keyword arguments, which raises ``TypeError``. This
    dedicated ``ModelOutput`` subclass declares every field the model can
    emit, and — being a ``ModelOutput`` — still supports dict-style access
    (``"anomaly_logits" in outputs``, ``outputs.get(...)``).
    """

    loss: Optional[torch.Tensor] = None
    last_hidden_state: Optional[torch.Tensor] = None
    pooler_output: Optional[torch.Tensor] = None
    hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
    attentions: Optional[Tuple[torch.Tensor, ...]] = None
    anomaly_logits: Optional[torch.Tensor] = None
    anomaly_confidence: Optional[torch.Tensor] = None
    anomaly_loss: Optional[torch.Tensor] = None
    financial_logits: Optional[torch.Tensor] = None
    financial_value: Optional[torch.Tensor] = None
    financial_loss: Optional[torch.Tensor] = None
    legal_logits: Optional[torch.Tensor] = None
    legal_loss: Optional[torch.Tensor] = None


class CidadaoAIModel(PreTrainedModel):
    """Base Cidadão.AI model: GPT-2 backbone plus transparency heads.

    Optional heads (enabled via the config): anomaly classification with a
    confidence estimate, financial risk classification plus a value
    regressor, and legal-compliance classification. When labels are passed
    to ``forward``, per-task cross-entropy losses are computed and averaged
    into ``loss``.
    """

    config_class = CidadaoAIConfig
    base_model_prefix = "cidadao_gpt"
    supports_gradient_checkpointing = True

    def __init__(self, config: CidadaoAIConfig):
        super().__init__(config)
        self.config = config

        # GPT-2 backbone. GPT2Model reads GPT-2 attribute names (n_embd,
        # n_layer, n_head, ...), which CidadaoAIConfig does not define, so
        # passing our config straight through raised AttributeError. Build an
        # equivalent GPT2Config explicitly instead.
        from transformers import GPT2Config, GPT2Model
        backbone_config = GPT2Config(
            vocab_size=config.vocab_size,
            n_positions=config.max_position_embeddings,
            n_embd=config.hidden_size,
            n_layer=config.num_hidden_layers,
            n_head=config.num_attention_heads,
            n_inner=config.intermediate_size,
            activation_function=config.hidden_act,
            resid_pdrop=config.hidden_dropout_prob,
            embd_pdrop=config.hidden_dropout_prob,
            attn_pdrop=config.attention_probs_dropout_prob,
            layer_norm_epsilon=config.layer_norm_eps,
            initializer_range=config.initializer_range,
        )
        self.backbone = GPT2Model(backbone_config)

        # Specialized transparency embeddings. Sized hidden_size (previously
        # hidden_size // 4, which could not be added elementwise to the
        # hidden_size pooled vector in forward()).
        self.transparency_embeddings = nn.ModuleDict({
            'entity_types': nn.Embedding(100, config.hidden_size),
            'financial_types': nn.Embedding(50, config.hidden_size),
            'legal_types': nn.Embedding(200, config.hidden_size),
            'corruption_indicators': nn.Embedding(20, config.hidden_size)
        })

        # Specialized classification heads, created only when enabled.
        if config.enable_anomaly_detection:
            self.anomaly_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size // 2),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.hidden_size // 2, config.num_anomaly_labels)
            )
            # Scalar confidence in [0, 1] alongside the class logits.
            self.anomaly_confidence = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size // 4),
                nn.ReLU(),
                nn.Linear(config.hidden_size // 4, 1),
                nn.Sigmoid()
            )

        if config.enable_financial_analysis:
            self.financial_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.financial_analysis_dim),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.financial_analysis_dim, config.num_financial_labels)
            )
            # Unbounded scalar regression head (e.g. an estimated value).
            self.financial_regressor = nn.Sequential(
                nn.Linear(config.hidden_size, config.financial_analysis_dim),
                nn.ReLU(),
                nn.Linear(config.financial_analysis_dim, 1)
            )

        if config.enable_legal_reasoning:
            self.legal_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.legal_understanding_dim),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.legal_understanding_dim, config.num_legal_labels)
            )

        # Initialize weights per HF convention.
        self.init_weights()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # Specialized inputs (id tensors for the transparency embeddings)
        entity_types: Optional[torch.Tensor] = None,
        financial_types: Optional[torch.Tensor] = None,
        legal_types: Optional[torch.Tensor] = None,
        corruption_indicators: Optional[torch.Tensor] = None,
        # Labels for training (enable the per-task losses)
        anomaly_labels: Optional[torch.Tensor] = None,
        financial_labels: Optional[torch.Tensor] = None,
        legal_labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, "CidadaoAIModelOutput"]:
        """Run the backbone and every enabled specialized head.

        Returns a :class:`CidadaoAIModelOutput` (or a tuple of its non-None
        values when ``return_dict=False``). ``loss`` is the mean of the
        per-task losses when any labels were supplied.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Force return_dict=True on the backbone so the named fields below
        # are always available; the caller's `return_dict` choice only
        # affects *our* return type.
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = outputs.last_hidden_state  # [batch, seq_len, hidden]

        # Mean-pool over the sequence for the classification heads.
        pooled_output = sequence_output.mean(dim=1)  # [batch, hidden]

        # Mix specialized embeddings into the pooled representation when the
        # corresponding id tensors are provided.
        # NOTE(review): financial_types / legal_types are accepted but not
        # folded in (matching the original behavior) — confirm if intended.
        if entity_types is not None:
            entity_embeds = self.transparency_embeddings['entity_types'](entity_types)
            pooled_output = pooled_output + entity_embeds.mean(dim=1)
        if corruption_indicators is not None:
            corruption_embeds = self.transparency_embeddings['corruption_indicators'](corruption_indicators)
            pooled_output = pooled_output + corruption_embeds.mean(dim=1)

        result = {
            "last_hidden_state": sequence_output,
            "pooler_output": pooled_output,
            "hidden_states": outputs.hidden_states if output_hidden_states else None,
            "attentions": outputs.attentions if output_attentions else None,
        }

        # Anomaly head: class logits + scalar confidence (+ loss if labels).
        if hasattr(self, 'anomaly_classifier'):
            anomaly_logits = self.anomaly_classifier(pooled_output)
            result["anomaly_logits"] = anomaly_logits
            result["anomaly_confidence"] = self.anomaly_confidence(pooled_output)
            if anomaly_labels is not None:
                result["anomaly_loss"] = nn.CrossEntropyLoss()(anomaly_logits, anomaly_labels)

        # Financial head: class logits + value regression (+ loss if labels).
        if hasattr(self, 'financial_classifier'):
            financial_logits = self.financial_classifier(pooled_output)
            result["financial_logits"] = financial_logits
            result["financial_value"] = self.financial_regressor(pooled_output)
            if financial_labels is not None:
                result["financial_loss"] = nn.CrossEntropyLoss()(financial_logits, financial_labels)

        # Legal head: class logits (+ loss if labels).
        if hasattr(self, 'legal_classifier'):
            legal_logits = self.legal_classifier(pooled_output)
            result["legal_logits"] = legal_logits
            if legal_labels is not None:
                result["legal_loss"] = nn.CrossEntropyLoss()(legal_logits, legal_labels)

        # Average the per-task losses into a single training loss.
        task_losses = [v for k, v in result.items() if k.endswith('_loss')]
        if task_losses:
            result["loss"] = sum(task_losses) / len(task_losses)

        if not return_dict:
            return tuple(v for v in result.values() if v is not None)

        return CidadaoAIModelOutput(**result)
class CidadaoAIForAnomalyDetection(PreTrainedModel):
    """Cidadão.AI head specialized in anomaly detection.

    Thin wrapper around :class:`CidadaoAIModel`: routes ``labels`` to the
    base model's anomaly head and repackages the result as a standard
    ``SequenceClassifierOutput``.
    """

    config_class = CidadaoAIConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_anomaly_labels
        self.cidadao_gpt = CidadaoAIModel(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        base_outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            anomaly_labels=labels,
            **kwargs
        )
        # The base model also emits a confidence score; fetched for parity
        # with the original code, though not part of the returned output.
        confidence = base_outputs.get("anomaly_confidence")
        return SequenceClassifierOutput(
            loss=base_outputs.get("anomaly_loss"),
            logits=base_outputs.get("anomaly_logits"),
            hidden_states=base_outputs.get("hidden_states"),
            attentions=base_outputs.get("attentions"),
        )
class CidadaoAIForFinancialAnalysis(PreTrainedModel):
    """Cidadão.AI head specialized in financial analysis.

    Delegates to :class:`CidadaoAIModel`, feeding ``labels`` to the
    financial head and returning its logits/loss as a standard
    ``SequenceClassifierOutput``.
    """

    config_class = CidadaoAIConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_financial_labels
        self.cidadao_gpt = CidadaoAIModel(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        base_outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            financial_labels=labels,
            **kwargs
        )
        # The regression value is fetched for parity with the original code;
        # it is not part of SequenceClassifierOutput.
        value = base_outputs.get("financial_value")
        return SequenceClassifierOutput(
            loss=base_outputs.get("financial_loss"),
            logits=base_outputs.get("financial_logits"),
            hidden_states=base_outputs.get("hidden_states"),
            attentions=base_outputs.get("attentions"),
        )
class CidadaoAIForLegalCompliance(PreTrainedModel):
    """Cidadão.AI head specialized in legal-compliance classification.

    Delegates to :class:`CidadaoAIModel`, feeding ``labels`` to the legal
    head and returning its logits/loss as a ``SequenceClassifierOutput``.
    """

    config_class = CidadaoAIConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_legal_labels
        self.cidadao_gpt = CidadaoAIModel(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        base_outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            legal_labels=labels,
            **kwargs
        )
        return SequenceClassifierOutput(
            loss=base_outputs.get("legal_loss"),
            logits=base_outputs.get("legal_logits"),
            hidden_states=base_outputs.get("hidden_states"),
            attentions=base_outputs.get("attentions"),
        )
# Custom pipeline covering all transparency tasks at once
class TransparencyAnalysisPipeline(Pipeline):
    """Custom pipeline for transparency analysis.

    ``postprocess`` returns a dict with up to three entries ("anomaly",
    "financial", "legal") — one per head present in the model output — each
    holding the predicted label, its score and, when ``return_all_scores``
    is set, the full label/score distribution.
    """

    def __init__(self, model, tokenizer, task="transparency-analysis", **kwargs):
        super().__init__(model=model, tokenizer=tokenizer, task=task, **kwargs)
        # Human-readable label sets (Portuguese, matching the training data).
        self.anomaly_labels = ["Normal", "Suspeito", "Anômalo"]
        self.financial_labels = ["Muito Baixo", "Baixo", "Médio", "Alto", "Muito Alto"]
        self.legal_labels = ["Não Conforme", "Conforme"]

    def _sanitize_parameters(self, **kwargs):
        """Split user kwargs into preprocess/forward/postprocess kwargs."""
        preprocess_kwargs = {}
        forward_kwargs = {}
        postprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]
        if "return_all_scores" in kwargs:
            postprocess_kwargs["return_all_scores"] = kwargs["return_all_scores"]
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs, max_length=512):
        """Tokenize raw text into model-ready tensors."""
        return self.tokenizer(
            inputs,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt"
        )

    def _forward(self, model_inputs):
        return self.model(**model_inputs)

    @staticmethod
    def _get_output(model_outputs, key):
        """Safely fetch ``key`` from a ModelOutput, dict or plain object.

        The previous implementation used ``outputs.get(key, outputs.<key>)``,
        whose attribute default is evaluated eagerly and raises
        ``AttributeError`` for plain-dict outputs. Try item access first,
        then attribute access, else None.
        """
        try:
            return model_outputs[key]
        except (KeyError, TypeError, IndexError):
            return getattr(model_outputs, key, None)

    def _score_head(self, logits, labels, return_all_scores):
        """Convert one head's logits into {label, score, all_scores}.

        NOTE(review): uses ``.item()`` on the argmax, so this assumes batch
        size 1 (same as the original) — confirm callers never batch.
        """
        probs = torch.softmax(logits, dim=-1)
        pred = torch.argmax(probs, dim=-1)
        return {
            "label": labels[pred.item()],
            "score": probs.max().item(),
            "all_scores": [
                {"label": label, "score": score.item()}
                for label, score in zip(labels, probs[0])
            ] if return_all_scores else None
        }

    def postprocess(self, model_outputs, return_all_scores=False):
        """Collect predictions from every head present in the model output."""
        results = {}
        heads = (
            ("anomaly", "anomaly_logits", self.anomaly_labels),
            ("financial", "financial_logits", self.financial_labels),
            ("legal", "legal_logits", self.legal_labels),
        )
        for name, key, labels in heads:
            logits = self._get_output(model_outputs, key)
            if logits is not None:
                results[name] = self._score_head(logits, labels, return_all_scores)
        return results
# Register the custom config/model with the Auto* factories so that
# AutoConfig/AutoModel.from_pretrained can resolve model_type "cidadao-gpt".
from transformers import AutoConfig, AutoModel

AutoConfig.register("cidadao-gpt", CidadaoAIConfig)
AutoModel.register(CidadaoAIConfig, CidadaoAIModel)
def create_cidadao_pipeline(
    model_name_or_path: str = "neural-thinker/cidadao-gpt",
    task: str = "transparency-analysis",
    **kwargs
) -> TransparencyAnalysisPipeline:
    """Build a ready-to-use Cidadão.AI transparency pipeline.

    Args:
        model_name_or_path: HF Hub model id or local path.
        task: Pipeline task name.
        **kwargs: Extra arguments forwarded to both ``from_pretrained`` calls.

    Returns:
        A configured :class:`TransparencyAnalysisPipeline`.
    """
    return TransparencyAnalysisPipeline(
        model=AutoModel.from_pretrained(model_name_or_path, **kwargs),
        tokenizer=AutoTokenizer.from_pretrained(model_name_or_path, **kwargs),
        task=task
    )
# Convenience helper for one-off analyses
def analyze_transparency(
    text: str,
    model_name: str = "neural-thinker/cidadao-gpt"
) -> Dict:
    """Run a quick transparency analysis over ``text``.

    Args:
        text: Text to analyze.
        model_name: Model id or path used to build the pipeline.

    Returns:
        Analysis results keyed by task, including per-label scores.
    """
    return create_cidadao_pipeline(model_name)(text, return_all_scores=True)
if __name__ == "__main__":
    # Smoke test: build a small model and run one forward pass on random ids.

    # Configuration with a GPT-2 "base"-sized backbone (768 hidden, 12 layers).
    config = CidadaoAIConfig(
        vocab_size=50257,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        enable_anomaly_detection=True,
        enable_financial_analysis=True,
        enable_legal_reasoning=True
    )

    # Instantiate the model and report its parameter count.
    model = CidadaoAIModel(config)
    print(f"✅ Modelo Cidadão.AI criado com {sum(p.numel() for p in model.parameters()):,} parâmetros")
    print(f"🎯 Tarefas habilitadas: Anomalias, Financeiro, Legal")

    # Forward pass with random token ids and a full attention mask.
    batch_size, seq_len = 2, 128
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    attention_mask = torch.ones(batch_size, seq_len)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Report output shapes; 'x in outputs' relies on ModelOutput's dict-style
    # membership (only fields that were actually set are present).
    print(f"📊 Output shape: {outputs.last_hidden_state.shape}")
    print(f"🔍 Anomaly logits: {outputs.anomaly_logits.shape if 'anomaly_logits' in outputs else 'N/A'}")
    print(f"💰 Financial logits: {outputs.financial_logits.shape if 'financial_logits' in outputs else 'N/A'}")
    print(f"⚖️ Legal logits: {outputs.legal_logits.shape if 'legal_logits' in outputs else 'N/A'}")