from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
# 1. Prepare the dataset
def load_data_from_csv(csv_file):
    # Expects a CSV with a 'content' column (the code text) and an integer 'label' column.
    dataset = load_dataset("csv", data_files=csv_file)
    return dataset['train']
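
# Note: the script below evaluates on the training data itself. For a held-out
# evaluation set, the datasets library offers train_test_split (a sketch under
# that assumption, not part of the original flow):
#
#   splits = load_dataset("csv", data_files=csv_file)['train'].train_test_split(test_size=0.2)
#   train_ds, eval_ds = splits['train'], splits['test']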
# 2. Set up the tokenizer and model
def get_model_and_tokenizer():
    model_name = "microsoft/codebert-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 2 classes: safe/unsafe
    return model, tokenizer
# 3. Tokenize the data
def tokenize_function(example, tokenizer):
    return tokenizer(example['content'], truncation=True, padding="max_length", max_length=512)
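
# Padding every example to the full 512 tokens is simple but wasteful on short
# snippets. Dynamic padding with DataCollatorWithPadding is a common alternative
# (a sketch, not used below):
#
#   from transformers import DataCollatorWithPadding
#   collator = DataCollatorWithPadding(tokenizer=tokenizer)
#   # drop padding="max_length" above and pass data_collator=collator to Trainer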
# 4. Train the model
def train_model(dataset, tokenizer, model):
    tokenized_data = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data,
        # Note: evaluating on the training set only measures fit, not generalization.
        eval_dataset=tokenized_data,
        tokenizer=tokenizer,
    )
    trainer.train()
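    # Persist the final fine-tuned weights so they can be reloaded outside the
    # epoch checkpoint directories (the path is illustrative, not part of the
    # original script).
    trainer.save_model("./results/final_model")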
if __name__ == "__main__":
    # Load data and model
    dataset = load_data_from_csv("code_analysis_dataset.csv")
    model, tokenizer = get_model_and_tokenizer()
    # Train the model
    train_model(dataset, tokenizer, model)
    print("[SUCCESS] Model trained!")