Updated Model Tutorial Code
README.md CHANGED

````diff
@@ -74,56 +74,41 @@ Use the code below to get started with the model for general finetuning tasks. P
 ```
 import torch
 from datasets import load_dataset, load_metric
-from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
 import evaluate
-
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("Koodsml/KooBERT")
+model = AutoModelForSequenceClassification.from_pretrained("Koodsml/KooBERT", num_labels=2)
+
 def compute_metrics(eval_pred):
     logits, labels = eval_pred
     predictions = np.argmax(logits, axis=-1)
     return metric.compute(predictions=predictions, references=labels)
 
-# Load the CoLA dataset
-cola_dataset = load_dataset("glue", "cola")
-
-cola_dataset = cola_dataset.rename_column('label', 'labels')
-cola_dataset = cola_dataset.rename_column('sentence', 'text')
-
-# Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("Koodsml/KooBERT")
-model = AutoModel.from_pretrained("Koodsml/KooBERT", num_labels=2)
-
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding=
+    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=128)
 
+# Load the CoLA dataset
+dataset = load_dataset("glue","cola")
+dataset = dataset.rename_column('sentence', 'text')
 
-
+datset_tok = dataset.map(tokenize_function, batched=True)
 
 # Set the device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
 # Define the training arguments
-training_args = TrainingArguments(
-    output_dir='./results',
-    evaluation_strategy='epoch',
-    # eval_steps=100,
-    save_total_limit=1,
-    learning_rate=2e-5,
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
-    num_train_epochs=3,
-    weight_decay=0.01,
-    push_to_hub=False,
-)
+training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
 
 # Define the trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=
-    eval_dataset=
-
-    compute_metrics=compute_metrics
+    train_dataset=datset_tok['train'],
+    eval_dataset=datset_tok['validation'],
+    compute_metrics=compute_metrics,
 )
 
 # Fine-tune on the CoLA dataset
````
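
Even after this update, the snippet is not runnable as written: `np` and `metric` are used but never defined, `load_metric` is imported but unused (and deprecated in favor of `evaluate`), the tokenized dataset keeps the `datset_tok` spelling, and the hunk ends before any call to `trainer.train()`. Below is a minimal self-contained sketch of the updated tutorial, assuming CoLA's standard Matthews-correlation metric from `evaluate` is the intended metric (the original never says which) and that a training call follows the closing comment; `dataset_tok` is simply the typo-corrected name.

```python
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Load the tokenizer and a binary-classification head, as in the updated README
tokenizer = AutoTokenizer.from_pretrained("Koodsml/KooBERT")
model = AutoModelForSequenceClassification.from_pretrained("Koodsml/KooBERT", num_labels=2)

# Assumption: CoLA's official metric (Matthews correlation), loaded via the
# maintained `evaluate` package; the original code never defines `metric`
metric = evaluate.load("glue", "cola")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Load the CoLA dataset and rename the sentence column so tokenize_function finds it
dataset = load_dataset("glue", "cola")
dataset = dataset.rename_column("sentence", "text")

dataset_tok = dataset.map(tokenize_function, batched=True)

# Set the device (Trainer moves batches itself; kept to mirror the README)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the training arguments
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tok["train"],
    eval_dataset=dataset_tok["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune on the CoLA dataset (assumed next step; the diff hunk ends at the comment)
trainer.train()
```

With the `TrainingArguments` defaults this trains for three epochs at batch size 8 and reports Matthews correlation on the validation split after each epoch.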