"""Fine-tune CodeBERT as a two-class (safe/unsafe) code classifier from a CSV file."""

import inspect

from datasets import load_dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)


# 1. Prepare the dataset
def load_data_from_csv(csv_file):
    """Load *csv_file* with the ``datasets`` CSV loader and return its ``train`` split.

    The rest of this script reads a ``content`` column from the result.
    NOTE(review): Trainer presumably also needs a label column (e.g. ``label``)
    in the CSV -- confirm against the actual data file.
    """
    dataset = load_dataset("csv", data_files=csv_file)
    return dataset["train"]


# 2. Configure the tokenizer and model
def get_model_and_tokenizer():
    """Return ``(model, tokenizer)`` for ``microsoft/codebert-base``.

    The model is a sequence classifier with a two-label head.
    """
    model_name = "microsoft/codebert-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    # 2 classes: safe/unsafe
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
    return model, tokenizer


# 3. Tokenize the data
def tokenize_function(example, tokenizer):
    """Tokenize ``example['content']``, truncating and padding to 512 tokens.

    ``train_model`` calls this through ``Dataset.map(..., batched=True)``, so
    *example* is a batch and ``example['content']`` holds several texts at once.
    """
    return tokenizer(
        example["content"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


def _pick_supported_kwarg(target, *candidates):
    """Return the first name in *candidates* that *target* accepts as a parameter.

    Falls back to the last candidate when none is found in the signature.
    """
    accepted = inspect.signature(target).parameters
    for name in candidates:
        if name in accepted:
            return name
    return candidates[-1]


# 4. Train the model
def train_model(dataset, tokenizer, model, *, eval_fraction=0.1, seed=42):
    """Tokenize *dataset*, fine-tune *model* on it and return the ``Trainer``.

    Args:
        dataset: ``datasets.Dataset`` with a ``content`` column.
        tokenizer: Tokenizer used for encoding; also handed to the ``Trainer``.
        model: Sequence-classification model to fine-tune.
        eval_fraction: Share of rows held out for the per-epoch evaluation.
            A falsy value (``0``/``None``) evaluates on the training rows.
        seed: Seed for the train/eval shuffle, so the split is reproducible.

    Returns:
        The ``Trainer`` after ``train()`` has completed.
    """
    tokenized_data = dataset.map(
        lambda batch: tokenize_function(batch, tokenizer),
        batched=True,
    )

    # Hold out rows for evaluation so per-epoch metrics are not computed on
    # the very rows the model was fitted to. With fewer than two rows a split
    # is impossible, so both roles share the same data in that case.
    train_split = eval_split = tokenized_data
    if eval_fraction and len(tokenized_data) >= 2:
        splits = tokenized_data.train_test_split(test_size=eval_fraction, seed=seed)
        train_split, eval_split = splits["train"], splits["test"]

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever the installed version accepts.
    eval_strategy_key = _pick_supported_kwarg(
        TrainingArguments, "eval_strategy", "evaluation_strategy"
    )
    training_args = TrainingArguments(
        output_dir="./results",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        **{eval_strategy_key: "epoch"},
    )

    # Likewise `tokenizer=` was superseded by `processing_class=` on Trainer.
    tokenizer_key = _pick_supported_kwarg(Trainer.__init__, "processing_class", "tokenizer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    return trainer


if __name__ == "__main__":
    # Load data and model
    dataset = load_data_from_csv("code_analysis_dataset.csv")
    model, tokenizer = get_model_and_tokenizer()

    # Train the model
    train_model(dataset, tokenizer, model)
    print("[SUCCESS] Model trained!")