| | from datasets import load_dataset |
| | import numpy as np |
| |
|
# Load the train/validation splits from local JSON files.
dataset = load_dataset(
    "json",
    data_files={
        "train": "tense_train.json",
        "validation": "tense_validation.json",
    },
)
| |
|
# Class names; a label's position in this list is its integer id.
labels = ["first", "second", "third"]
# Forward (id -> name) and reverse (name -> id) lookups handed to the model config.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
| |
|
| | from transformers import AutoModelForSequenceClassification |
# BERT encoder with a sequence-classification head sized to `labels`.
# The head is configured with the "multi_label_classification" problem type,
# and the label mappings are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
| |
|
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Key in the compute_metrics result used to pick the best checkpoint.
metric_name = "f1"

from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch; with load_best_model_at_end the
# checkpoint scoring best on `metric_name` is restored when training finishes.
args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)
| |
|
| | from sklearn.metrics import f1_score, roc_auc_score, accuracy_score |
| | from transformers import EvalPrediction, AutoTokenizer |
| | import torch |
| |
|
# Tokenizer for the same "bert-base-uncased" checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
| |
|
| | |
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC AUC and accuracy from raw logits.

    Args:
        predictions: Raw model logits (array-like). Presumably shaped
            (n_samples, n_labels) -- confirm against the caller.
        labels: Ground-truth multi-hot label matrix with the same layout
            as ``predictions``.
        threshold: Probability at or above which a label counts as
            predicted. Defaults to 0.5.

    Returns:
        dict with keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # Sigmoid turns each logit into an independent per-label probability.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).numpy()

    # Hard 0/1 decisions, used by the threshold-dependent metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the continuous probabilities.
    # Feeding it the thresholded 0/1 decisions collapses the ROC curve to a
    # single operating point and misreports the area.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    # For multi-label input, sklearn's accuracy_score is subset accuracy:
    # a sample counts only when every label matches.
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}
| |
|
def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise it is used as-is. Returns the metrics dict unchanged.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)
| |
|
def preprocess_data(ex):
    """Tokenize one example and attach its one-hot point-of-view label.

    ``ex["text"]`` is tokenized, padded/truncated to 128 tokens, and
    ``ex["pov"]`` is encoded as three floats in the order
    first / second / third. A ``pov`` outside those three values yields
    an all-zero label vector.
    """
    encoding = tokenizer(
        ex["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    pov = ex['pov']
    encoding['labels'] = [float(pov == name) for name in ("first", "second", "third")]
    return encoding
| |
|
# Drop rows whose point of view is "unknown" before encoding.
dataset = dataset.filter(
    lambda example: example['pov'] != "unknown",
    num_proc=8,
)
# Tokenize every split; the original columns are removed so only the
# fields produced by preprocess_data remain.
encoded_dataset = dataset.map(
    preprocess_data,
    remove_columns=dataset['train'].column_names,
    num_proc=8,
)
| |
|
# Wire the model, training arguments, encoded splits and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
| |
|
# Fine-tune, then write the trainer's current model to disk.
trainer.train()
trainer.save_model("bert-base-uncased-tense")

# Final pass over the validation split; prints the resulting metrics dict.
print(trainer.evaluate())
| |
|