```python
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import torch
import matplotlib.pyplot as plt
import numpy as np
```

```python
print('gpu available:', torch.cuda.is_available())
```

```python
# Load the Japanese subset of the multilingual sentiment dataset
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")
```

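Before converting anything, it is worth confirming what the loader returned; a quick inspection, assuming the usual `DatasetDict` layout with `train`/`validation`/`test` splits:

```python
# Show the splits, their sizes, and one raw example
print(dataset)
print(dataset["train"][0])
```
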
```python
# View the training split as a pandas DataFrame for quick inspection
dataset.set_format(type='pandas')
train_df = dataset['train'][:]
```

```python
# Map integer labels to their human-readable names
def label_int2str(x):
    return dataset["train"].features["label"].int2str(x)

train_df["label_name"] = train_df["label"].apply(label_int2str)
```

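With the label names in place, a quick look at the class balance can catch skew before training; a minimal sketch using the matplotlib import above (the plot styling is my own choice):

```python
# Plot how many examples each sentiment class has
train_df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of classes")
plt.show()
```
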
```python
# Switch back from the pandas view before tokenizing
dataset.reset_format()
```

```python
from transformers import AutoTokenizer

# Japanese BERT checkpoint pretrained with whole-word masking
model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
```

```python
# Tokenize a batch of texts, padding and truncating to a shared length
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)
```

```python
# batch_size=None processes each split as a single batch,
# so every example is padded to the same length
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
```

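To confirm the `map` call added the encoder inputs, one can check the column names; a small sanity check (the exact column set depends on the tokenizer):

```python
# Expect input_ids and attention_mask (plus token_type_ids for BERT)
print(dataset_encoded["train"].column_names)
```
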
```python
import torch
from transformers import AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
num_labels = 3  # the dataset has three sentiment classes

# Load the pretrained encoder with a fresh 3-class classification head
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))
```

```python
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    # pred carries the true label ids and the raw logits
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
```

```python
from transformers import TrainingArguments

batch_size = 16
# With this value the Trainer logs once per epoch
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = "sample-text-classification-bert"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)
```

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)
print('start training..')
trainer.train()
```

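The confusion-matrix imports at the top of the script are never used in the training loop itself; a hedged sketch of how they could evaluate the fine-tuned model on the validation split (the normalization and colormap are my own choices):

```python
# Predict on the validation split and plot a row-normalized confusion matrix
preds_output = trainer.predict(dataset_encoded["validation"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = preds_output.label_ids
labels = dataset["train"].features["label"].names

cm = confusion_matrix(y_valid, y_preds, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues", values_format=".2f")
plt.title("Normalized confusion matrix")
plt.show()
```
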
```python
# Store human-readable label mappings on the model config so that
# downstream pipelines report class names instead of LABEL_0/1/2
id2label = {}
for i in range(dataset["train"].features["label"].num_classes):
    id2label[i] = dataset["train"].features["label"].int2str(i)

label2id = {}
for i in range(dataset["train"].features["label"].num_classes):
    label2id[dataset["train"].features["label"].int2str(i)] = i

trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id
```

```python
# Write the fine-tuned weights, config, and tokenizer to a local directory
print('save model.')
trainer.save_model('sample-text-classification-bert')
```

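As a final sanity check, the saved directory can be reloaded through the pipeline API; a minimal sketch, assuming the output path above (the example sentence, meaning "This was a very good product.", is made up):

```python
from transformers import pipeline

# Reload the saved model and tokenizer for inference
classifier = pipeline(
    "text-classification",
    model="sample-text-classification-bert",
    tokenizer="sample-text-classification-bert"
)
print(classifier("とても良い商品でした。"))  # hypothetical example input
```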