Spaces:
Sleeping
Sleeping
| """ | |
| Cidadão.AI - Hugging Face Transformers Integration | |
| Modelo especializado em transparência pública brasileira | |
| compatível com a biblioteca transformers do Hugging Face. | |
| """ | |
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, List, Union, Tuple

import torch
import torch.nn as nn
from transformers import (
    PreTrainedModel, PretrainedConfig,
    AutoModel, AutoTokenizer,
    pipeline, Pipeline
)
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput
from transformers.utils import ModelOutput
| logger = logging.getLogger(__name__) | |
class CidadaoAIConfig(PretrainedConfig):
    """Hugging Face configuration for the Cidadão.AI model.

    Every constructor argument is stored unchanged as an attribute of the
    same name; remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        vocab_size: Size of the token vocabulary.
        hidden_size: Width of the hidden representations.
        num_hidden_layers: Number of transformer layers.
        num_attention_heads: Number of attention heads per layer.
        intermediate_size: Width of the feed-forward layers.
        max_position_embeddings: Maximum supported sequence length.
        transparency_vocab_size: Transparency-specific vocabulary size
            (stored only; not read by the model code in this file).
        corruption_detection_layers: Number of corruption-detection layers
            (stored only; not read by the model code in this file).
        financial_analysis_dim: Hidden width of the financial heads.
        legal_understanding_dim: Hidden width of the legal head.
        hidden_dropout_prob: Dropout probability used in the task heads.
        attention_probs_dropout_prob: Dropout probability for attention.
        hidden_act: Name of the activation function.
        initializer_range: Standard deviation for weight initialisation.
        layer_norm_eps: Epsilon for layer normalisation.
        enable_anomaly_detection: Build the anomaly-detection heads.
        enable_financial_analysis: Build the financial-analysis heads.
        enable_legal_reasoning: Build the legal-compliance head.
        num_anomaly_labels: Anomaly classes (Normal, Suspeito, Anômalo).
        num_financial_labels: Financial classes
            (Muito Baixo, Baixo, Médio, Alto, Muito Alto).
        num_legal_labels: Legal classes (Não Conforme, Conforme).
        **kwargs: Extra options handled by ``PretrainedConfig``.
    """

    model_type = "cidadao-gpt"

    def __init__(
        self,
        vocab_size: int = 50257,
        hidden_size: int = 1024,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        intermediate_size: int = 4096,
        max_position_embeddings: int = 8192,
        # Transparency-specific settings
        transparency_vocab_size: int = 2048,
        corruption_detection_layers: int = 4,
        financial_analysis_dim: int = 512,
        legal_understanding_dim: int = 256,
        # Dropout settings
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        # Activation settings
        hidden_act: str = "gelu",
        # Initialisation settings
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        # Specialised tasks
        enable_anomaly_detection: bool = True,
        enable_financial_analysis: bool = True,
        enable_legal_reasoning: bool = True,
        # Label counts for classification
        num_anomaly_labels: int = 3,  # Normal, Suspeito, Anômalo
        num_financial_labels: int = 5,  # Muito Baixo, Baixo, Médio, Alto, Muito Alto
        num_legal_labels: int = 2,  # Não Conforme, Conforme
        **kwargs
    ):
        super().__init__(**kwargs)

        # Ordered (attribute, value) table; each entry is stored verbatim.
        settings = (
            # Backbone dimensions
            ("vocab_size", vocab_size),
            ("hidden_size", hidden_size),
            ("num_hidden_layers", num_hidden_layers),
            ("num_attention_heads", num_attention_heads),
            ("intermediate_size", intermediate_size),
            ("max_position_embeddings", max_position_embeddings),
            # Transparency-specific settings
            ("transparency_vocab_size", transparency_vocab_size),
            ("corruption_detection_layers", corruption_detection_layers),
            ("financial_analysis_dim", financial_analysis_dim),
            ("legal_understanding_dim", legal_understanding_dim),
            # Dropout
            ("hidden_dropout_prob", hidden_dropout_prob),
            ("attention_probs_dropout_prob", attention_probs_dropout_prob),
            # Activation
            ("hidden_act", hidden_act),
            # Initialisation
            ("initializer_range", initializer_range),
            ("layer_norm_eps", layer_norm_eps),
            # Task switches
            ("enable_anomaly_detection", enable_anomaly_detection),
            ("enable_financial_analysis", enable_financial_analysis),
            ("enable_legal_reasoning", enable_legal_reasoning),
            # Label counts
            ("num_anomaly_labels", num_anomaly_labels),
            ("num_financial_labels", num_financial_labels),
            ("num_legal_labels", num_legal_labels),
        )
        for attribute, setting in settings:
            setattr(self, attribute, setting)
@dataclass
class CidadaoAIModelOutput(ModelOutput):
    """Output container returned by ``CidadaoAIModel.forward``.

    Fields that were not computed stay ``None`` and are therefore absent
    from the mapping view (``key in output`` is ``False``) and from
    ``to_tuple()``. Field order mirrors the order in which the forward pass
    produces the values.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    anomaly_logits: Optional[torch.FloatTensor] = None
    anomaly_confidence: Optional[torch.FloatTensor] = None
    anomaly_loss: Optional[torch.FloatTensor] = None
    financial_logits: Optional[torch.FloatTensor] = None
    financial_value: Optional[torch.FloatTensor] = None
    financial_loss: Optional[torch.FloatTensor] = None
    legal_logits: Optional[torch.FloatTensor] = None
    legal_loss: Optional[torch.FloatTensor] = None
    loss: Optional[torch.FloatTensor] = None


class CidadaoAIModel(PreTrainedModel):
    """Base Cidadão.AI model: a GPT-2 backbone plus optional task heads.

    The backbone output is pooled over the sequence dimension, optionally
    enriched with the specialised transparency embeddings, and fed to the
    anomaly, financial and legal heads enabled in the configuration.
    """

    config_class = CidadaoAIConfig
    base_model_prefix = "cidadao_gpt"
    supports_gradient_checkpointing = True

    # Order of the specialised inputs; it also fixes which quarter of the
    # pooled vector each embedding occupies (see ``_transparency_features``).
    _TRANSPARENCY_INPUTS = (
        "entity_types",
        "financial_types",
        "legal_types",
        "corruption_indicators",
    )

    def __init__(self, config: CidadaoAIConfig):
        super().__init__(config)

        # GPT2Model reads GPT-2 specific option names (embd_pdrop,
        # layer_norm_epsilon, ...) that CidadaoAIConfig does not define, so
        # the backbone gets a GPT2Config translated from our settings
        # instead of the raw CidadaoAIConfig.
        from transformers import GPT2Config, GPT2Model

        backbone_config = GPT2Config(
            vocab_size=config.vocab_size,
            n_positions=config.max_position_embeddings,
            n_embd=config.hidden_size,
            n_layer=config.num_hidden_layers,
            n_head=config.num_attention_heads,
            n_inner=config.intermediate_size,
            activation_function=config.hidden_act,
            resid_pdrop=config.hidden_dropout_prob,
            embd_pdrop=config.hidden_dropout_prob,
            attn_pdrop=config.attention_probs_dropout_prob,
            layer_norm_epsilon=config.layer_norm_eps,
            initializer_range=config.initializer_range,
        )
        self.backbone = GPT2Model(backbone_config)

        # Specialised transparency embeddings, each a quarter of hidden_size.
        quarter = config.hidden_size // 4
        self.transparency_embeddings = nn.ModuleDict({
            'entity_types': nn.Embedding(100, quarter),
            'financial_types': nn.Embedding(50, quarter),
            'legal_types': nn.Embedding(200, quarter),
            'corruption_indicators': nn.Embedding(20, quarter)
        })

        # Specialised classification heads
        if config.enable_anomaly_detection:
            self.anomaly_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size // 2),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.hidden_size // 2, config.num_anomaly_labels)
            )
            self.anomaly_confidence = nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size // 4),
                nn.ReLU(),
                nn.Linear(config.hidden_size // 4, 1),
                nn.Sigmoid()
            )

        if config.enable_financial_analysis:
            self.financial_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.financial_analysis_dim),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.financial_analysis_dim, config.num_financial_labels)
            )
            self.financial_regressor = nn.Sequential(
                nn.Linear(config.hidden_size, config.financial_analysis_dim),
                nn.ReLU(),
                nn.Linear(config.financial_analysis_dim, 1)
            )

        if config.enable_legal_reasoning:
            self.legal_classifier = nn.Sequential(
                nn.Linear(config.hidden_size, config.legal_understanding_dim),
                nn.ReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.legal_understanding_dim, config.num_legal_labels)
            )

        # Initialise weights and run the library's final set-up hooks.
        self.post_init()

    def _init_weights(self, module):
        """Initialise ``module`` using ``config.initializer_range``."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _masked_mean(sequence_output, attention_mask):
        """Average token states over the sequence, skipping padded positions."""
        if attention_mask is None or attention_mask.shape != sequence_output.shape[:2]:
            # No usable mask: plain mean over every position.
            return sequence_output.mean(dim=1)
        weights = attention_mask.to(sequence_output.dtype).unsqueeze(-1)
        # clamp avoids a division by zero for rows that are fully masked
        return (sequence_output * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

    def _transparency_features(self, reference, specialised_inputs):
        """Combine the specialised embeddings into one hidden_size vector.

        Each embedding is ``hidden_size // 4`` wide, so it cannot be added
        to the pooled vector directly. Every input type therefore owns one
        quarter of the vector (in ``_TRANSPARENCY_INPUTS`` order); quarters
        of inputs that were not supplied are zero.

        Returns ``None`` when no specialised input was supplied.
        """
        if all(ids is None for ids in specialised_inputs.values()):
            return None

        batch_size, hidden_size = reference.shape
        slot_width = hidden_size // 4
        slots = []
        for name in self._TRANSPARENCY_INPUTS:
            ids = specialised_inputs[name]
            if ids is None:
                slots.append(reference.new_zeros(batch_size, slot_width))
                continue
            embedded = self.transparency_embeddings[name](ids)
            # [batch, n_ids, width] -> [batch, width]; 1-D id tensors already
            # yield [batch, width].
            slots.append(embedded.mean(dim=1) if embedded.dim() == 3 else embedded)

        leftover = hidden_size - 4 * slot_width
        if leftover:
            # hidden_size not divisible by four: zero-fill the tail
            slots.append(reference.new_zeros(batch_size, leftover))
        return torch.cat(slots, dim=-1).to(reference.dtype)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # Specialised inputs
        entity_types: Optional[torch.Tensor] = None,
        financial_types: Optional[torch.Tensor] = None,
        legal_types: Optional[torch.Tensor] = None,
        corruption_indicators: Optional[torch.Tensor] = None,
        # Training labels
        anomaly_labels: Optional[torch.Tensor] = None,
        financial_labels: Optional[torch.Tensor] = None,
        legal_labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CidadaoAIModelOutput]:
        """Run the backbone and every enabled task head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            head_mask, inputs_embeds, use_cache: Forwarded to the GPT-2
                backbone unchanged.
            output_attentions, output_hidden_states, return_dict: Standard
                Transformers switches; ``None`` falls back to the config.
            entity_types, financial_types, legal_types,
            corruption_indicators: Optional integer id tensors for the
                specialised embeddings, shaped ``[batch, n_ids]`` (averaged
                over ``n_ids``) or ``[batch]``.
            anomaly_labels, financial_labels, legal_labels: Optional class
                indices; when given, a cross-entropy loss is computed for
                the matching head.

        Returns:
            A ``CidadaoAIModelOutput`` or, when ``return_dict`` is false, a
            tuple of the non-``None`` values in field order. ``loss`` is the
            arithmetic mean of the per-task losses that were computed.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        # Always request a ModelOutput from the backbone so attribute access
        # below works regardless of the caller's return_dict choice.
        backbone_outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = backbone_outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]

        # Mean pooling for classification, ignoring padded positions.
        pooled_output = self._masked_mean(sequence_output, attention_mask)  # [batch_size, hidden_size]

        # Add the specialised embeddings when any were provided.
        extra_features = self._transparency_features(
            pooled_output,
            {
                "entity_types": entity_types,
                "financial_types": financial_types,
                "legal_types": legal_types,
                "corruption_indicators": corruption_indicators,
            },
        )
        if extra_features is not None:
            pooled_output = pooled_output + extra_features

        result = {
            "last_hidden_state": sequence_output,
            "pooler_output": pooled_output,
            "hidden_states": backbone_outputs.hidden_states if output_hidden_states else None,
            "attentions": backbone_outputs.attentions if output_attentions else None,
        }
        task_losses = []
        loss_fct = nn.CrossEntropyLoss()

        # Predictions of the specialised heads
        if hasattr(self, 'anomaly_classifier'):
            anomaly_logits = self.anomaly_classifier(pooled_output)
            result["anomaly_logits"] = anomaly_logits
            result["anomaly_confidence"] = self.anomaly_confidence(pooled_output)
            if anomaly_labels is not None:
                result["anomaly_loss"] = loss_fct(anomaly_logits, anomaly_labels)
                task_losses.append(result["anomaly_loss"])

        if hasattr(self, 'financial_classifier'):
            financial_logits = self.financial_classifier(pooled_output)
            result["financial_logits"] = financial_logits
            result["financial_value"] = self.financial_regressor(pooled_output)
            if financial_labels is not None:
                result["financial_loss"] = loss_fct(financial_logits, financial_labels)
                task_losses.append(result["financial_loss"])

        if hasattr(self, 'legal_classifier'):
            legal_logits = self.legal_classifier(pooled_output)
            result["legal_logits"] = legal_logits
            if legal_labels is not None:
                result["legal_loss"] = loss_fct(legal_logits, legal_labels)
                task_losses.append(result["legal_loss"])

        # Total loss: mean of whichever task losses were computed.
        if task_losses:
            result["loss"] = sum(task_losses) / len(task_losses)

        if not return_dict:
            return tuple(value for value in result.values() if value is not None)
        return CidadaoAIModelOutput(**result)
class CidadaoAIForAnomalyDetection(PreTrainedModel):
    """Cidadão.AI model specialised for anomaly detection.

    Wraps ``CidadaoAIModel`` and exposes only its anomaly head through the
    standard sequence-classification output.
    """

    config_class = CidadaoAIConfig
    # Name of the attribute holding the base model, so base checkpoints and
    # head checkpoints can be loaded into each other.
    base_model_prefix = "cidadao_gpt"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_anomaly_labels
        self.cidadao_gpt = CidadaoAIModel(config)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        """Classify the input with the anomaly head.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.
            labels: Optional anomaly class indices; passed to the base model
                as ``anomaly_labels`` so it computes the loss.
            **kwargs: Further base-model arguments. ``return_dict`` is
                consumed here and decides the return type of this method.

        Returns:
            A ``SequenceClassifierOutput`` or, when ``return_dict`` resolves
            to false, the tuple of its non-``None`` fields.
        """
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The base model must return a mapping so the ``.get`` lookups below
        # work whatever the caller asked for.
        outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            anomaly_labels=labels,
            return_dict=True,
            **kwargs
        )
        result = SequenceClassifierOutput(
            loss=outputs.get("anomaly_loss"),
            logits=outputs.get("anomaly_logits"),
            hidden_states=outputs.get("hidden_states"),
            attentions=outputs.get("attentions"),
        )
        return result if return_dict else result.to_tuple()
class CidadaoAIForFinancialAnalysis(PreTrainedModel):
    """Cidadão.AI model specialised for financial analysis.

    Wraps ``CidadaoAIModel`` and exposes only its financial classification
    head through the standard sequence-classification output.
    """

    config_class = CidadaoAIConfig
    # Name of the attribute holding the base model, so base checkpoints and
    # head checkpoints can be loaded into each other.
    base_model_prefix = "cidadao_gpt"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_financial_labels
        self.cidadao_gpt = CidadaoAIModel(config)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        """Classify the input with the financial head.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.
            labels: Optional financial class indices; passed to the base
                model as ``financial_labels`` so it computes the loss.
            **kwargs: Further base-model arguments. ``return_dict`` is
                consumed here and decides the return type of this method.

        Returns:
            A ``SequenceClassifierOutput`` or, when ``return_dict`` resolves
            to false, the tuple of its non-``None`` fields.
        """
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The base model must return a mapping so the ``.get`` lookups below
        # work whatever the caller asked for.
        outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            financial_labels=labels,
            return_dict=True,
            **kwargs
        )
        result = SequenceClassifierOutput(
            loss=outputs.get("financial_loss"),
            logits=outputs.get("financial_logits"),
            hidden_states=outputs.get("hidden_states"),
            attentions=outputs.get("attentions"),
        )
        return result if return_dict else result.to_tuple()
class CidadaoAIForLegalCompliance(PreTrainedModel):
    """Cidadão.AI model specialised for legal compliance.

    Wraps ``CidadaoAIModel`` and exposes only its legal head through the
    standard sequence-classification output.
    """

    config_class = CidadaoAIConfig
    # Name of the attribute holding the base model, so base checkpoints and
    # head checkpoints can be loaded into each other.
    base_model_prefix = "cidadao_gpt"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_legal_labels
        self.cidadao_gpt = CidadaoAIModel(config)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        """Classify the input with the legal-compliance head.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.
            labels: Optional legal class indices; passed to the base model
                as ``legal_labels`` so it computes the loss.
            **kwargs: Further base-model arguments. ``return_dict`` is
                consumed here and decides the return type of this method.

        Returns:
            A ``SequenceClassifierOutput`` or, when ``return_dict`` resolves
            to false, the tuple of its non-``None`` fields.
        """
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The base model must return a mapping so the ``.get`` lookups below
        # work whatever the caller asked for.
        outputs = self.cidadao_gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            legal_labels=labels,
            return_dict=True,
            **kwargs
        )
        result = SequenceClassifierOutput(
            loss=outputs.get("legal_loss"),
            logits=outputs.get("legal_logits"),
            hidden_states=outputs.get("hidden_states"),
            attentions=outputs.get("attentions"),
        )
        return result if return_dict else result.to_tuple()
# Custom pipeline covering every transparency task
class TransparencyAnalysisPipeline(Pipeline):
    """Custom pipeline that reports anomaly, financial and legal predictions.

    For every head whose logits are present in the model output, the result
    contains the winning label, its probability and, on request, the full
    label/probability list.
    """

    def __init__(self, model, tokenizer, task="transparency-analysis", **kwargs):
        super().__init__(model=model, tokenizer=tokenizer, task=task, **kwargs)
        self.anomaly_labels = ["Normal", "Suspeito", "Anômalo"]
        self.financial_labels = ["Muito Baixo", "Baixo", "Médio", "Alto", "Muito Alto"]
        self.legal_labels = ["Não Conforme", "Conforme"]

        # ``preprocess`` pads its input, which tokenizers without a pad token
        # (GPT-2 style) reject. Fall back to the EOS token in that case.
        # NOTE: this mutates the tokenizer object that was passed in.
        active_tokenizer = self.tokenizer
        if (
            active_tokenizer is not None
            and getattr(active_tokenizer, "pad_token", None) is None
            and getattr(active_tokenizer, "eos_token", None) is not None
        ):
            active_tokenizer.pad_token = active_tokenizer.eos_token

    def _sanitize_parameters(self, **kwargs):
        """Route call-time options to the preprocess/forward/postprocess steps."""
        preprocess_kwargs = {}
        forward_kwargs = {}
        postprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]
        if "return_all_scores" in kwargs:
            postprocess_kwargs["return_all_scores"] = kwargs["return_all_scores"]
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs, max_length=512):
        """Tokenize ``inputs`` into PyTorch tensors, truncated to ``max_length``."""
        return self.tokenizer(
            inputs,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt"
        )

    def _forward(self, model_inputs):
        """Run the model on the tokenized inputs."""
        return self.model(**model_inputs)

    @staticmethod
    def _head_logits(model_outputs, key):
        """Return ``model_outputs[key]`` from an attribute or a mapping, else ``None``."""
        logits = getattr(model_outputs, key, None)
        if logits is None and isinstance(model_outputs, dict):
            logits = model_outputs.get(key)
        return logits

    @staticmethod
    def _summarize_head(logits, labels, return_all_scores):
        """Turn one head's logits into a label/score summary.

        Only the first row of ``logits`` is summarised.
        """
        probabilities = torch.softmax(logits, dim=-1)
        row = probabilities.reshape(-1, probabilities.shape[-1])[0]
        best = int(torch.argmax(row))
        return {
            "label": labels[best],
            "score": row[best].item(),
            "all_scores": [
                {"label": label, "score": score.item()}
                for label, score in zip(labels, row)
            ] if return_all_scores else None
        }

    def postprocess(self, model_outputs, return_all_scores=False):
        """Convert the raw model output into per-task predictions.

        Args:
            model_outputs: Output of ``_forward``; heads are looked up under
                ``anomaly_logits``, ``financial_logits`` and ``legal_logits``.
            return_all_scores: Also include every label's probability.

        Returns:
            Dict with a subset of the keys ``"anomaly"``, ``"financial"``
            and ``"legal"``; heads that are absent or ``None`` are skipped.
        """
        heads = (
            ("anomaly", "anomaly_logits", self.anomaly_labels),
            ("financial", "financial_logits", self.financial_labels),
            ("legal", "legal_logits", self.legal_labels),
        )
        results = {}
        for task_name, output_key, labels in heads:
            logits = self._head_logits(model_outputs, output_key)
            if logits is None:
                continue
            results[task_name] = self._summarize_head(logits, labels, return_all_scores)
        return results
# Register the custom classes with the Auto* factories: the "cidadao-gpt"
# model type maps to CidadaoAIConfig, and CidadaoAIConfig maps to
# CidadaoAIModel for AutoModel.
from transformers import AutoConfig, AutoModel
AutoConfig.register("cidadao-gpt", CidadaoAIConfig)
AutoModel.register(CidadaoAIConfig, CidadaoAIModel)
def create_cidadao_pipeline(
    model_name_or_path: str = "neural-thinker/cidadao-gpt",
    task: str = "transparency-analysis",
    **kwargs
) -> TransparencyAnalysisPipeline:
    """Build a Cidadão.AI pipeline from a checkpoint.

    Args:
        model_name_or_path: Model name on the HF Hub or a local path.
        task: Task name given to the pipeline.
        **kwargs: Extra arguments, passed to both the model and the
            tokenizer ``from_pretrained`` calls.

    Returns:
        The configured pipeline.
    """
    # Model is loaded first, then the tokenizer, both from the same location.
    components = {
        "model": AutoModel.from_pretrained(model_name_or_path, **kwargs),
        "tokenizer": AutoTokenizer.from_pretrained(model_name_or_path, **kwargs),
    }
    return TransparencyAnalysisPipeline(task=task, **components)
# Convenience function for quick use
def analyze_transparency(
    text: str,
    model_name: str = "neural-thinker/cidadao-gpt"
) -> Dict:
    """Run a one-off transparency analysis on ``text``.

    A fresh pipeline is created for ``model_name`` on every call.

    Args:
        text: Text to analyse.
        model_name: Name of the model to load.

    Returns:
        The analysis results, including the scores of every label.
    """
    analyzer = create_cidadao_pipeline(model_name)
    report = analyzer(text, return_all_scores=True)
    return report
if __name__ == "__main__":
    # Usage example: build a configuration with all three task heads enabled.
    demo_settings = dict(
        vocab_size=50257,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        enable_anomaly_detection=True,
        enable_financial_analysis=True,
        enable_legal_reasoning=True,
    )
    config = CidadaoAIConfig(**demo_settings)

    # Instantiate the model and report its parameter count.
    model = CidadaoAIModel(config)
    parameter_total = sum(weight.numel() for weight in model.parameters())
    print(f"✅ Modelo Cidadão.AI criado com {parameter_total:,} parâmetros")
    print("🎯 Tarefas habilitadas: Anomalias, Financeiro, Legal")

    # Smoke test: one forward pass on random token ids.
    batch_size, seq_len = 2, 128
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    attention_mask = torch.ones(batch_size, seq_len)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    anomaly_shape = outputs.anomaly_logits.shape if 'anomaly_logits' in outputs else 'N/A'
    financial_shape = outputs.financial_logits.shape if 'financial_logits' in outputs else 'N/A'
    legal_shape = outputs.legal_logits.shape if 'legal_logits' in outputs else 'N/A'

    print(f"📊 Output shape: {outputs.last_hidden_state.shape}")
    print(f"🔍 Anomaly logits: {anomaly_shape}")
    print(f"💰 Financial logits: {financial_shape}")
    print(f"⚖️ Legal logits: {legal_shape}")