import torch
from transformers import (
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from split_data import make_train_data

# Use the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Label mappings for the two sentiment classes
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Load the pre-trained model and move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    # Raise dropout (default 0.1) on hidden and attention layers to curb overfitting
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)

# Load the matching tokeniser
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenise a batch of examples. Truncation caps inputs at the model's
# maximum length (512 tokens for BERT); padding is deferred to the collator.
def tokenize_func(data):
    return tokenizer(data["text"], truncation=True)

# Load the splits and tokenise them in batches
train_data, validation_data = make_train_data()
tokenized_train_data = train_data.map(tokenize_func, batched=True)
tokenized_validation_data = validation_data.map(tokenize_func, batched=True)
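
# After mapping, each split carries `input_ids` and `attention_mask`
# columns (plus `token_type_ids` for BERT) alongside the original fields.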

# Data collator: pad dynamically to the longest sequence in each batch
data_collator = DataCollatorWithPadding(tokenizer)

# Log, evaluate, and save roughly 25 times per epoch; the batch size must
# match per_device_train_batch_size below, and the interval is guarded
# against reaching zero on small datasets
batch_size = 32
steps_per_epoch = len(tokenized_train_data) // batch_size
logging_steps = max(1, steps_per_epoch // 25)
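
# Optional (a sketch, not in the original script): a minimal accuracy metric.
# Pass it to the Trainer below via `compute_metrics=compute_metrics` if
# per-step accuracy is wanted; otherwise evaluation reports the loss only.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}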

# Training arguments
training_args = TrainingArguments(
    output_dir="./finetuned",
    learning_rate=1.0e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=2,
    save_total_limit=2,
    # L2 weight decay for regularisation
    weight_decay=0.01,
    # Mixed precision only when a GPU is present
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=logging_steps,
    eval_strategy="steps",
    eval_steps=logging_steps,
    save_strategy="steps",
    save_steps=logging_steps,
)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
)

# Train, then save the final model and tokeniser next to the checkpoints
trainer.train()
trainer.save_model()
tokenizer.save_pretrained("./finetuned")
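
# Optional sanity check (a sketch, not part of the original run): reload the
# saved model through a text-classification pipeline and score one sentence.
# The "./finetuned" path matches output_dir above.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./finetuned",
    device=0 if torch.cuda.is_available() else -1,
)
print(classifier("This film was an absolute delight!"))
# Expected output shape: [{'label': 'POSITIVE', 'score': ...}]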