import torch
from transformers import (
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from split_data import make_train_data

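# Use the GPU when one is available; otherwise fall back to the CPU.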
device = "cuda" if torch.cuda.is_available() else "cpu"

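# Binary sentiment labels plus a BERT classifier head; dropout is raised above
# the default to add regularization during fine-tuning.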
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
).to(device)

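# Matching tokenizer for the bert-base-uncased checkpoint.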
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

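# Tokenize the raw text, truncating to the model's maximum length; padding is
# deferred to the data collator so each batch is padded dynamically.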
def tokenize_func(data):
    return tokenizer(data["text"], truncation=True)

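# Load the train/validation splits from the project-local split_data helper.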
train_data, validation_data = make_train_data()

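# Apply the tokenizer to both splits in batches.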
tokenized_train_data = train_data.map(tokenize_func, batched=True)
tokenized_validation_data = validation_data.map(tokenize_func, batched=True)

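# Pad each batch to its longest sequence at collation time.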
data_collator = DataCollatorWithPadding(tokenizer)

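# Derive the logging/eval/checkpoint cadence from the dataset size so the run
# logs and evaluates roughly 25 times per epoch.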
batch_size = 32
steps_per_epoch = len(tokenized_train_data) // batch_size
logging_steps = max(1, steps_per_epoch // 25)  # never 0, even for small datasets

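# Fine-tuning hyperparameters: mixed precision when a GPU is present, and
# evaluation/checkpointing on the same step schedule as logging, keeping only
# the two most recent checkpoints.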
training_args = TrainingArguments(
    output_dir='./finetuned',
    learning_rate=1.0e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=2,
    save_total_limit=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=logging_steps,
    eval_strategy="steps",
    eval_steps=logging_steps,
    save_strategy="steps",
    save_steps=logging_steps,
)

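# Wire the model, datasets, and collator into the Trainer.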
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
)

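# Train, then save the fine-tuned model and tokenizer to the output directory.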
trainer.train()
trainer.save_model()
tokenizer.save_pretrained('./finetuned')