# %% import torch # GPUが使用可能か判断 if torch.cuda.is_available(): print('gpu is available') else: raise Exception('gpu is NOT available') device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device # %% from datasets import load_dataset, DatasetDict from transformers import AutoTokenizer from transformers import AutoModelForSequenceClassification from transformers import TrainingArguments from transformers import Trainer from sklearn.metrics import accuracy_score, f1_score import numpy as np import pandas as pd import torch import random # %% from transformers.trainer_utils import set_seed # 乱数シードを42に固定 set_seed(42) # %% from pprint import pprint from datasets import load_dataset # Hugging Face Hub上のllm-book/wrime-sentimentのリポジトリから # データを読み込む train_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="train") valid_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="validation") # pprintで見やすく表示する pprint(train_dataset) pprint(valid_dataset) # %% train_dataset[0] # %% # トークナイザのロード model_name = "cl-tohoku/bert-base-japanese-whole-word-masking" tokenizer = AutoTokenizer.from_pretrained(model_name) # %% # トークナイズ関数 # text列をトークナイズ # label列をlabels列にセット def preprocess_text(batch): encoded_batch = tokenizer(batch['text'], max_length=512) encoded_batch['labels'] = batch['label'] return encoded_batch # %% # トークナイズ処理(既存の列は削除) encoded_train_dataset = train_dataset.map( preprocess_text, remove_columns=train_dataset.column_names, ) encoded_valid_dataset = valid_dataset.map( preprocess_text, remove_columns=valid_dataset.column_names, ) # %% # ミニバッチ構築 from transformers import DataCollatorWithPadding data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # %% # モデルの準備 from transformers import AutoModelForSequenceClassification class_label = train_dataset.features["label"] label2id = {label: id for id, label in enumerate(class_label.names)} id2label = {id: label for id, label in enumerate(class_label.names)} model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=class_label.num_classes, label2id=label2id, # ラベル名からIDへの対応を指定 id2label=id2label, # IDからラベル名への対応を指定 ) print(type(model).__name__) # %% # メトリクスの定義 def compute_metrics(pred): labels = pred.label_ids preds = pred.predictions.argmax(-1) f1 = f1_score(labels, preds, average="weighted") acc = accuracy_score(labels, preds) return {"accuracy": acc, "f1": f1} # %% # epoch: 100, early stopping # 訓練の実行 from transformers import TrainingArguments # 保存ディレクトリ save_dir = f'bert-finetuned-multilingual-sentiments-base' training_args = TrainingArguments( output_dir=save_dir, # 結果の保存フォルダ per_device_train_batch_size=32, # 訓練時のバッチサイズ per_device_eval_batch_size=32, # 評価時のバッチサイズ learning_rate=2e-5, # 学習率 lr_scheduler_type="constant", # 学習率スケジューラの種類 warmup_ratio=0.1, # 学習率のウォームアップの長さを指定 num_train_epochs=100, # エポック数 save_strategy="epoch", # チェックポイントの保存タイミング logging_strategy="epoch", # ロギングのタイミング evaluation_strategy="epoch", # 検証セットによる評価のタイミング load_best_model_at_end=True, # 訓練後に開発セットで最良のモデルをロード metric_for_best_model="eval_loss", # 最良のモデルを決定する評価指標 greater_is_better=False, # eval_lossは小さいほどよい fp16=True, # 自動混合精度演算の有効化 ) # %% from transformers import Trainer from transformers import EarlyStoppingCallback trainer = Trainer( model=model, train_dataset=encoded_train_dataset, eval_dataset=encoded_valid_dataset, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], ) trainer.train() # %% # モデルの保存 trainer.save_model(save_dir) tokenizer.save_pretrained(save_dir) # 履歴の保存 history_df = pd.DataFrame(trainer.state.log_history) history_df.to_csv('base_line/mullingual_baseline_history.csv') # %% import matplotlib.pyplot as plt def show_graph(df, suptitle, output='output.png'): suptitle_size = 23 graph_title_size = 20 legend_size = 18 ticks_size = 13 # 学習曲線 plt.figure(figsize=(20, 5)) plt.suptitle(suptitle, fontsize=suptitle_size) # Train Loss plt.subplot(131) plt.title('Train Loss', fontsize=graph_title_size) plt.plot(df['loss'].dropna(), label='train') plt.legend(fontsize=legend_size) plt.yticks(fontsize=ticks_size) # Validation Loss plt.subplot(132) plt.title(f'Val Loss', fontsize=graph_title_size) y = df['eval_loss'].dropna().values x = np.arange(len(y)).reshape(-1, 1) plt.plot(y, color='tab:orange', label='val') plt.legend(fontsize=legend_size) plt.yticks(fontsize=ticks_size) # Accuracy/F1 plt.subplot(133) plt.title('eval Accuracy/F1', fontsize=graph_title_size) plt.plot(df['eval_accuracy'].dropna(), label='accuracy') plt.plot(df['eval_f1'].dropna(), label='F1') plt.legend(fontsize=legend_size) plt.yticks(fontsize=ticks_size) plt.tight_layout() # plt.show() plt.savefig(output) # %% # 結果を表示 suptitle = 'batch:32, lr:2e-5, type:constant' show_graph(history_df, suptitle, 'base_line/mullingual_baseline_output.png')