Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +35 -0
- bert_finetue_task2.py +223 -0
- bert_finetue_task2_froze.py +247 -0
- bert_finetune_task1.py +126 -0
- bert_finetune_task1_froze.py +148 -0
- continue_pretrain.py +132 -0
- dapt_data_process.py +69 -0
- data_process_task1.py +89 -0
- data_process_task2.py +95 -0
- data_source.xlsx +3 -0
- dataset_pretrain/Experiment_sentences_training_filtered_part1.csv +3 -0
- dataset_pretrain/domain_corpus.txt +3 -0
- dataset_pretrain/预训练数据第二部分_年报.zip +3 -0
- model_inference_task1.py +162 -0
- model_inference_task2.py +153 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/config.json +43 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/model.safetensors +3 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/special_tokens_map.json +7 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer.json +0 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer_config.json +56 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/training_args.bin +3 -0
- outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/vocab.txt +0 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/config.json +30 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/model.safetensors +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/optimizer.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/rng_state.pth +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/scheduler.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/trainer_state.json +191 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-1094/training_args.bin +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/config.json +30 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/model.safetensors +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/optimizer.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/rng_state.pth +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/scheduler.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/trainer_state.json +355 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-2188/training_args.bin +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/config.json +30 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/model.safetensors +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/optimizer.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/rng_state.pth +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/scheduler.pt +3 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/trainer_state.json +519 -0
- outputs/bert_bilabel_finetuned_model/checkpoint-3282/training_args.bin +3 -0
- outputs/bert_bilabel_finetuned_model/final/config.json +30 -0
- outputs/bert_bilabel_finetuned_model/final/model.safetensors +3 -0
- outputs/bert_bilabel_finetuned_model/final/training_args.bin +3 -0
- outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/config.json +30 -0
- outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/model.safetensors +3 -0
- outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/optimizer.pt +3 -0
- outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/rng_state.pth +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,38 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data_source.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
dataset_pretrain/Experiment_sentences_training_filtered_part1.csv filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
dataset_pretrain/domain_corpus.txt filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
wandb/run-20251113_020110-hwo46nr1/run-hwo46nr1.wandb filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
wandb/run-20251113_024451-lmhafthr/run-lmhafthr.wandb filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
wandb/run-20251113_055509-63h4lqr8/run-63h4lqr8.wandb filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
wandb/run-20251113_055942-lq4qoqk3/run-lq4qoqk3.wandb filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
wandb/run-20251113_074324-2o6q17un/run-2o6q17un.wandb filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
wandb/run-20251113_075410-cilrwgz8/run-cilrwgz8.wandb filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
wandb/run-20251113_080542-irf9fgra/run-irf9fgra.wandb filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
wandb/run-20251113_170012-vj6z0qct/run-vj6z0qct.wandb filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
wandb/run-20251114_144619-0hviozok/run-0hviozok.wandb filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
wandb/run-20251114_145658-0lepm1if/run-0lepm1if.wandb filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
wandb/run-20251114_150434-0nq8ji5k/run-0nq8ji5k.wandb filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
wandb/run-20251114_152637-xe0tjdf6/run-xe0tjdf6.wandb filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
wandb/run-20251114_153529-7r2aeenh/run-7r2aeenh.wandb filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
wandb/run-20251114_154223-xmrnfh0j/run-xmrnfh0j.wandb filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
wandb/run-20251114_154302-7tit87eb/run-7tit87eb.wandb filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
wandb/run-20251114_161829-g1azoa0i/run-g1azoa0i.wandb filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
wandb/run-20251114_171922-j1hfy78o/run-j1hfy78o.wandb filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
wandb/run-20251115_023230-j4s1o16p/run-j4s1o16p.wandb filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
wandb/run-20251115_024020-whj9y4hx/run-whj9y4hx.wandb filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
wandb/run-20251115_031217-29o94la6/run-29o94la6.wandb filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
wandb/run-20251115_032957-oljr07ni/run-oljr07ni.wandb filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
wandb/run-20251115_033525-i1hsksbs/run-i1hsksbs.wandb filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
wandb/run-20251115_033750-ybm95q2x/run-ybm95q2x.wandb filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
wandb/run-20251115_034104-e4a2rovd/run-e4a2rovd.wandb filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
wandb/run-20251115_034702-q5cv2xfu/run-q5cv2xfu.wandb filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
wandb/run-20251115_034922-e287xu9n/run-e287xu9n.wandb filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
wandb/run-20251115_034939-zlf3muf5/run-zlf3muf5.wandb filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
wandb/run-20251115_035223-nehpw594/run-nehpw594.wandb filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
wandb/run-20251115_035728-inhxwz05/run-inhxwz05.wandb filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
wandb/run-20251115_035746-cmttchar/run-cmttchar.wandb filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
wandb/run-20251115_050557-37a3t1f4/run-37a3t1f4.wandb filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
标注数据_更正后.xlsx filter=lfs diff=lfs merge=lfs -text
|
bert_finetue_task2.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
import json
|
| 5 |
+
import evaluate
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
|
| 8 |
+
from transformers import TrainingArguments, Trainer
|
| 9 |
+
from torch import nn
|
| 10 |
+
from sklearn.metrics import f1_score, accuracy_score, jaccard_score
|
| 11 |
+
|
| 12 |
+
# --- 配置参数 ---
|
| 13 |
+
DATA_DIR = "./processed_data_task2_fixed"
|
| 14 |
+
# MODEL_NAME = "bert-base-chinese"
|
| 15 |
+
TOKENIZER_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 16 |
+
MODEL_NAME = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model"
|
| 17 |
+
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 18 |
+
# 标签总数:Data, Action, Gain, Regu, Vague
|
| 19 |
+
NUM_LABELS = 5
|
| 20 |
+
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_multilabel_model_finetuned_from_dapt"
|
| 21 |
+
EPOCHS = 5
|
| 22 |
+
BATCH_SIZE = 16
|
| 23 |
+
LEARNING_RATE = 2e-5
|
| 24 |
+
SEED = 42
|
| 25 |
+
|
| 26 |
+
# ----------------------------------------------------
|
| 27 |
+
# A. 定义支持多标签分类的 BERT 模型
|
| 28 |
+
# ----------------------------------------------------
|
| 29 |
+
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder with a linear head for multi-label classification.

    Each of the ``config.num_labels`` outputs is an independent binary
    decision; training uses ``BCEWithLogitsLoss`` on the raw logits
    (no sigmoid is applied inside the model).
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        # Reuse the encoder's hidden dropout rate for the classifier input.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One logit per label (5 labels in this task).
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.post_init()
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        """Return ``(loss, logits)`` when labels are given, else ``(logits,)``."""
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Classify from the pooled [CLS] representation.
        cls_repr = self.dropout(encoder_out.pooler_output)
        logits = self.classifier(cls_repr)

        if labels is None:
            return (logits,)
        # BCEWithLogitsLoss requires float targets.
        return (self.loss_fct(logits, labels.float()), logits)
|
| 73 |
+
|
| 74 |
+
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
|
| 75 |
+
|
| 76 |
+
# 标签名称,用于报告输出
|
| 77 |
+
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
|
| 78 |
+
PREDICTION_THRESHOLD = 0.5
|
| 79 |
+
|
| 80 |
+
def compute_metrics(p):
    """Compute multi-label evaluation metrics.

    Produces global micro/macro F1 and sample-wise Jaccard, plus per-label
    F1 / precision / recall / accuracy for every tag in TAG_COLS.
    (The eval loss itself is tracked and logged by the Trainer.)
    """
    raw = p.predictions
    # The Trainer may hand us a tuple (loss, logits); keep only the logits.
    logits = raw[0] if isinstance(raw, tuple) else raw
    y_true = p.label_ids

    # Sigmoid probabilities, then threshold to a binary label matrix.
    probs = 1.0 / (1.0 + np.exp(-logits))
    y_pred = (probs > PREDICTION_THRESHOLD).astype(int)

    # --- global summary metrics (f1_micro drives best-model selection) ---
    metrics = {
        "f1_micro": f1_score(y_true, y_pred, average='micro'),
        "f1_macro": f1_score(y_true, y_pred, average='macro'),
        "jaccard_samples": jaccard_score(y_true, y_pred, average='samples'),
    }

    # --- per-label metrics: each column is scored as its own binary task ---
    for col, tag in enumerate(TAG_COLS):
        truth = y_true[:, col]
        guess = y_pred[:, col]
        metrics[f"{tag}_f1"] = f1_score(truth, guess, average='binary', zero_division=0)
        metrics[f"{tag}_precision"] = precision_score(truth, guess, average='binary', zero_division=0)
        metrics[f"{tag}_recall"] = recall_score(truth, guess, average='binary', zero_division=0)
        # Accuracy of this single tag across all samples (not exact-match accuracy).
        metrics[f"{tag}_accuracy"] = accuracy_score(truth, guess)

    return metrics
|
| 128 |
+
# ----------------------------------------------------
|
| 129 |
+
# C. 主微调函数
|
| 130 |
+
# ----------------------------------------------------
|
| 131 |
+
def finetune_multilabel_bert():
    """Fine-tune the multi-label BERT classifier end to end.

    Loads the CSV splits from DATA_DIR, tokenizes them, trains with the HF
    Trainer, evaluates on the test split, and saves model + tokenizer under
    OUTPUT_DIR/final.
    """
    # 1. Load dataset splits.
    print("--- 1. 加载数据集 ---")
    data_files = {
        "train": os.path.join(DATA_DIR, "train.csv"),
        "validation": os.path.join(DATA_DIR, "validation.csv"),
        "test": os.path.join(DATA_DIR, "test.csv"),
    }
    raw_datasets = load_dataset("csv", data_files=data_files)

    # 2. Load tokenizer and custom model.
    print("--- 2. 加载分词器和自定义模型 ---")
    tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_NAME)
    # ignore_mismatched_sizes: the DAPT checkpoint has no 5-way classifier head.
    model = BertForMultiLabelClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True,
    )

    # 3. Tokenization; labels are stored in the CSV as a stringified list
    #    (e.g. '[1, 0, 1, 0, 0]').
    def tokenize_function(examples):
        tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # json.loads is safer than eval(); normalize single quotes first.
        tokenized["labels"] = json.loads(examples["labels"].replace("'", "\""))
        return tokenized

    # batched=False so each row's label string is parsed individually.
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments.
    print("--- 3. 设置训练参数和 Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=200,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        seed=SEED,
        # BUGFIX: was hard-coded to 3e-5, silently ignoring the LEARNING_RATE
        # constant declared in the config section at the top of this file.
        learning_rate=LEARNING_RATE,
        report_to="wandb",
    )

    # 5. Build the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train.
    print("--- 4. 开始训练 ---")
    trainer.train()

    # 7. Evaluate on the held-out test split.
    print("--- 5. 评估测试集 ---")
    results = trainer.evaluate(test_dataset)
    print(f"测试集评估结果: {results}")

    # 8. Persist the final (best) model and its tokenizer.
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final"))
    print(f"模型和分词器已保存至: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_multilabel_bert()
|
bert_finetue_task2_froze.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
import json
|
| 5 |
+
import evaluate
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
|
| 8 |
+
from transformers import TrainingArguments, Trainer
|
| 9 |
+
from torch import nn
|
| 10 |
+
from sklearn.metrics import f1_score, accuracy_score, jaccard_score
|
| 11 |
+
|
| 12 |
+
# --- 配置参数 ---
|
| 13 |
+
DATA_DIR = "./processed_data_task2_fixed"
|
| 14 |
+
MODEL_NAME = "bert-base-chinese"
|
| 15 |
+
# MODEL_NAME = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model"
|
| 16 |
+
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 17 |
+
# 标签总数:Data, Action, Gain, Regu, Vague
|
| 18 |
+
NUM_LABELS = 5
|
| 19 |
+
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/bert_multilabel_frozen_classifier_finetuned_model"
|
| 20 |
+
EPOCHS = 5
|
| 21 |
+
BATCH_SIZE = 16
|
| 22 |
+
LEARNING_RATE = 1e-4
|
| 23 |
+
SEED = 42
|
| 24 |
+
|
| 25 |
+
# ----------------------------------------------------
|
| 26 |
+
# A. 定义支持多标签分类的 BERT 模型
|
| 27 |
+
# ----------------------------------------------------
|
| 28 |
+
class BertForMultiLabelClassification(BertPreTrainedModel):
    """Multi-label sentence classifier: BERT backbone + linear head.

    Emits one independent logit per label; the loss is BCEWithLogitsLoss,
    so logits stay raw and the sigmoid is applied by the caller at
    prediction time.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        # Dropout rate follows the backbone's hidden_dropout_prob.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Linear projection: hidden_size -> number of labels (5).
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.post_init()
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        """Forward pass; returns (loss, logits), or (logits,) without labels."""
        bert_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Pooled [CLS] vector -> dropout -> per-label logits.
        logits = self.classifier(self.dropout(bert_out.pooler_output))

        if labels is not None:
            # Targets must be float for BCEWithLogitsLoss.
            return (self.loss_fct(logits, labels.float()), logits)
        return (logits,)
|
| 72 |
+
|
| 73 |
+
# ----------------------------------------------------
|
| 74 |
+
# B. 评估指标函数 (Multi-Label)
|
| 75 |
+
# ----------------------------------------------------
|
| 76 |
+
|
| 77 |
+
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
|
| 78 |
+
|
| 79 |
+
# 标签名称,用于报告输出
|
| 80 |
+
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
|
| 81 |
+
PREDICTION_THRESHOLD = 0.5
|
| 82 |
+
|
| 83 |
+
def compute_metrics(p):
    """Multi-label evaluation: global micro/macro F1 and sample Jaccard,
    plus per-tag F1 / precision / recall / accuracy.

    The eval loss itself is computed and logged by the Trainer, not here.
    """
    predictions = p.predictions
    # The model returns a tuple (loss, logits); unwrap to the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    gold = p.label_ids

    # Convert logits to probabilities and binarize at the threshold.
    scores = 1 / (1 + np.exp(-predictions))
    binarized = (scores > PREDICTION_THRESHOLD).astype(int)

    metrics = {}

    # Global metrics; f1_micro is the model-selection criterion.
    metrics["f1_micro"] = f1_score(gold, binarized, average='micro')
    metrics["f1_macro"] = f1_score(gold, binarized, average='macro')
    metrics["jaccard_samples"] = jaccard_score(gold, binarized, average='samples')

    # Per-tag metrics: every label column is treated as its own binary task.
    for idx, tag in enumerate(TAG_COLS):
        truth = gold[:, idx]
        guess = binarized[:, idx]
        metrics[f"{tag}_f1"] = f1_score(truth, guess, average='binary', zero_division=0)
        metrics[f"{tag}_precision"] = precision_score(truth, guess, average='binary', zero_division=0)
        metrics[f"{tag}_recall"] = recall_score(truth, guess, average='binary', zero_division=0)
        # Accuracy of this single tag across all samples.
        metrics[f"{tag}_accuracy"] = accuracy_score(truth, guess)

    return metrics
|
| 132 |
+
|
| 133 |
+
# ----------------------------------------------------
|
| 134 |
+
# C. 主微调函数
|
| 135 |
+
# ----------------------------------------------------
|
| 136 |
+
def finetune_multilabel_bert():
    """Fine-tune only the classification head: the BERT backbone is frozen.

    Loads the CSV splits, freezes every ``bert.*`` parameter, trains the
    5-label classifier head, evaluates on the test split, and saves the
    result under OUTPUT_DIR/final.
    """
    # 1. Load dataset splits.
    print("--- 1. 加载数据集 ---")
    data_files = {
        "train": os.path.join(DATA_DIR, "train.csv"),
        "validation": os.path.join(DATA_DIR, "validation.csv"),
        "test": os.path.join(DATA_DIR, "test.csv"),
    }
    raw_datasets = load_dataset("csv", data_files=data_files)

    # 2. Load tokenizer and custom multi-label model.
    print("--- 2. 加载分词器和自定义模型 ---")
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForMultiLabelClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )

    print("--- 2.1. 冻结 BERT 主体权重 (修正版) ---")
    # Freeze the encoder; only parameters outside the `bert.` prefix
    # (i.e. the classifier head) remain trainable.
    trainable_params_count = 0
    for name, param in model.named_parameters():
        if name.startswith('bert.'):
            param.requires_grad = False
        else:
            param.requires_grad = True
            trainable_params_count += param.numel()

    # Sanity check that freezing took effect: trainable_params should equal
    # trainable_params_count computed above.
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"总参数量: {total_params / 1e6:.2f} M")
    print(f"可训练参数量 (仅分类头): {trainable_params / 1e6:.6f} M")

    # 3. Tokenization; labels are stored in the CSV as a stringified list
    #    (e.g. '[1, 0, 1, 0, 0]').
    def tokenize_function(examples):
        tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # json.loads is safer than eval(); normalize single quotes first.
        tokenized["labels"] = json.loads(examples["labels"].replace("'", "\""))
        return tokenized

    # batched=False so each row's label string is parsed individually.
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments.
    print("--- 3. 设置训练参数和 Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=200,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        seed=SEED,
        # BUGFIX: was hard-coded to 3e-5, ignoring LEARNING_RATE (1e-4).
        # Head-only training deliberately uses the larger configured rate.
        learning_rate=LEARNING_RATE,
        report_to="wandb",
    )

    # 5. Build the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train.
    print("--- 4. 开始训练 ---")
    trainer.train()

    # 7. Evaluate on the held-out test split.
    print("--- 5. 评估测试集 ---")
    results = trainer.evaluate(test_dataset)
    print(f"测试集评估结果: {results}")

    # 8. Persist the final (best) model and its tokenizer.
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final"))
    print(f"模型和分词器已保存至: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_multilabel_bert()
|
bert_finetune_task1.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
import evaluate
|
| 6 |
+
from transformers import BertForSequenceClassification, BertTokenizerFast
|
| 7 |
+
from transformers import TrainingArguments, Trainer
|
| 8 |
+
|
| 9 |
+
# --- 配置参数 ---
|
| 10 |
+
# 上一步处理好的数据目录
|
| 11 |
+
DATA_DIR = "./processed_data_task1"
|
| 12 |
+
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 13 |
+
MODEL_NAME = '/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model'
|
| 14 |
+
# 预留给模型的标签数量 (0 和 1)
|
| 15 |
+
NUM_LABELS = 2
|
| 16 |
+
# 微调结果保存目录
|
| 17 |
+
OUTPUT_DIR = "./finbert2_bilabel_finetuned_model_from_dapt"
|
| 18 |
+
# 训练参数
|
| 19 |
+
EPOCHS = 3
|
| 20 |
+
BATCH_SIZE = 16
|
| 21 |
+
LEARNING_RATE = 2e-5
|
| 22 |
+
SEED = 42
|
| 23 |
+
|
| 24 |
+
def compute_metrics(p):
    """Compute accuracy and binary F1 for a two-class evaluation.

    Args:
        p: an ``EvalPrediction``-like object with ``predictions`` (logits,
           shape [n, 2]) and ``label_ids`` (gold labels, shape [n]).

    Returns:
        dict with keys 'accuracy' and 'f1' (positive class = 1).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = np.asarray(p.label_ids)

    # PERF FIX: previously re-loaded `evaluate` metric objects from disk on
    # every evaluation call. Binary accuracy/F1 are computed directly with
    # numpy instead — identical results, no repeated metric loading.
    accuracy = float(np.mean(preds == labels))

    tp = int(np.sum((preds == 1) & (labels == 1)))
    fp = int(np.sum((preds == 1) & (labels == 0)))
    fn = int(np.sum((preds == 0) & (labels == 1)))
    denom = 2 * tp + fp + fn
    # Match sklearn/evaluate zero-division behavior: F1 is 0 when undefined.
    f1 = (2 * tp / denom) if denom else 0.0

    return {
        'accuracy': accuracy,
        'f1': f1,
    }
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def finetune_bert():
    """Fine-tune a BERT binary sequence classifier on the task-1 dataset.

    Loads the train/validation/test CSV splits from DATA_DIR, tokenizes them,
    trains with the HF Trainer (best model selected by F1), evaluates on the
    test split, and saves the final model plus tokenizer to OUTPUT_DIR/final.
    """
    # 1. Load the dataset splits.
    print("--- 1. 加载数据集 ---")
    try:
        data_files = {
            "train": os.path.join(DATA_DIR, "train.csv"),
            "validation": os.path.join(DATA_DIR, "validation.csv"),
            "test": os.path.join(DATA_DIR, "test.csv"),
        }
        raw_datasets = load_dataset("csv", data_files=data_files)
        print(raw_datasets)
    except Exception as e:
        print(f"加载数据集时发生错误,请检查 {DATA_DIR} 目录下的CSV文件: {e}")
        return

    # 2. Load tokenizer and model.
    print("--- 2. 加载分词器和模型 ---")
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS
    )

    # 3. Tokenization; assumes the CSVs carry 'text' and 'label' columns.
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments.
    print("--- 3. 设置训练参数和 Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="epoch",          # evaluate at the end of each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,    # reload the best checkpoint (by F1) when done
        metric_for_best_model="f1",
        seed=SEED,
        learning_rate=LEARNING_RATE,
        report_to="wandb"               # stream training logs to Weights & Biases
    )

    # 5. Build the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train.
    print("--- 4. 开始训练 ---")
    trainer.train()

    # 7. Evaluate on the held-out test split.
    print("--- 5. 评估测试集 ---")
    results = trainer.evaluate(test_dataset)
    print(f"测试集评估结果: {results}")

    # 8. Save the final model AND the tokenizer.
    # BUG FIX: the original only called trainer.save_model(); since the
    # tokenizer was never handed to the Trainer, it was NOT persisted, so the
    # log message claiming "模型和分词器已保存" was wrong. Save it explicitly.
    final_dir = os.path.join(OUTPUT_DIR, "final")
    trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"模型和分词器已保存至: {final_dir}")

if __name__ == "__main__":
    finetune_bert()
|
bert_finetune_task1_froze.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
import evaluate
|
| 6 |
+
from transformers import BertForSequenceClassification, BertTokenizerFast
|
| 7 |
+
from transformers import TrainingArguments, Trainer
|
| 8 |
+
|
| 9 |
+
# --- 配置参数 ---
|
| 10 |
+
|
| 11 |
+
DATA_DIR = "./processed_data_task1"
|
| 12 |
+
# 使用的BERT模型,中文任务推荐使用BERT-base-chinese
|
| 13 |
+
|
| 14 |
+
MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 15 |
+
# MODEL_NAME = "bert-base-chinese"
|
| 16 |
+
# 预留给模型的标签数量 (0 和 1)
|
| 17 |
+
NUM_LABELS = 2
|
| 18 |
+
# 微调结果保存目录
|
| 19 |
+
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_bilabel_frozen_classifier_finetuned_model"
|
| 20 |
+
# 训练参数
|
| 21 |
+
EPOCHS = 3
|
| 22 |
+
BATCH_SIZE = 16
|
| 23 |
+
LEARNING_RATE = 1e-4
|
| 24 |
+
SEED = 42
|
| 25 |
+
|
| 26 |
+
def compute_metrics(p):
    """Compute evaluation metrics (accuracy and binary F1) for a Trainer eval step.

    Args:
        p: An ``EvalPrediction``-like object with ``predictions`` (logits of
           shape (n, num_labels)) and ``label_ids`` (gold labels, shape (n,)).

    Returns:
        dict with keys 'accuracy' and 'f1' (binary F1, positive class = 1).
    """
    # Hard predictions from the logits.
    preds = np.argmax(p.predictions, axis=1)
    labels = np.asarray(p.label_ids)

    # Compute directly with numpy instead of calling evaluate.load() on every
    # evaluation pass — the original re-loaded the metric objects each call,
    # which is slow and requires network/cache access.
    accuracy = float(np.mean(preds == labels))

    # Binary F1 with positive class 1; guard the zero-division case
    # (no positives predicted and none present -> F1 = 0.0).
    tp = int(np.sum((preds == 1) & (labels == 1)))
    fp = int(np.sum((preds == 1) & (labels == 0)))
    fn = int(np.sum((preds == 0) & (labels == 1)))
    denom = 2 * tp + fp + fn
    f1 = (2 * tp / denom) if denom else 0.0

    return {
        'accuracy': accuracy,
        'f1': f1,
        # precision / recall can be added here the same way if needed
    }
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def finetune_bert():
    """Fine-tune ONLY the classification head of a BERT binary classifier.

    The BERT encoder ('bert.*' parameters) is frozen; only the classifier
    head trains (hence the higher learning rate LEARNING_RATE = 1e-4).
    Saves the final model plus tokenizer to OUTPUT_DIR/final.
    """
    # 1. Load the dataset splits.
    print("--- 1. 加载数据集 ---")
    try:
        data_files = {
            "train": os.path.join(DATA_DIR, "train.csv"),
            "validation": os.path.join(DATA_DIR, "validation.csv"),
            "test": os.path.join(DATA_DIR, "test.csv"),
        }
        raw_datasets = load_dataset("csv", data_files=data_files)
        print(raw_datasets)
    except Exception as e:
        print(f"加载数据集时发生错误,请检查 {DATA_DIR} 目录下的CSV文件: {e}")
        return

    print("--- 2. 加载分词器和模型 ---")
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS
    )

    print("--- 2.1. 冻结 BERT 主体权重 (修正版) ---")

    # Freeze every encoder parameter; leave only the classifier trainable.
    # Parameter names of the encoder are prefixed 'bert.' in
    # BertForSequenceClassification; everything else is the head.
    trainable_params_count = 0
    for name, param in model.named_parameters():
        if name.startswith('bert.'):
            param.requires_grad = False
        else:
            param.requires_grad = True
            trainable_params_count += param.numel()  # running count of head params

    # Sanity check that the freeze took effect.
    total_params = sum(p.numel() for p in model.parameters())
    # trainable_params should match trainable_params_count above.
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"总参数量: {total_params / 1e6:.2f} M")
    print(f"可训练参数量 (仅分类头): {trainable_params / 1e6:.6f} M")

    # 3. Tokenization; assumes the CSVs carry 'text' and 'label' columns.
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments.
    print("--- 3. 设置训练参数和 Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="epoch",          # evaluate at the end of each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,    # reload the best checkpoint (by F1) when done
        metric_for_best_model="f1",
        seed=SEED,
        learning_rate=LEARNING_RATE,
        report_to="wandb"               # stream training logs to Weights & Biases
    )

    # 5. Build the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train (classifier head only).
    print("--- 4. 开始训练 ---")
    trainer.train()

    # 7. Evaluate on the held-out test split.
    print("--- 5. 评估测试集 ---")
    results = trainer.evaluate(test_dataset)
    print(f"测试集评估结果: {results}")

    # 8. Save the final model AND the tokenizer.
    # BUG FIX: the original only called trainer.save_model(); since the
    # tokenizer was never handed to the Trainer, it was NOT persisted, so the
    # log message claiming "模型和分词器已保存" was wrong. Save it explicitly.
    final_dir = os.path.join(OUTPUT_DIR, "final")
    trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"模型和分词器已保存至: {final_dir}")

if __name__ == "__main__":
    finetune_bert()
|
continue_pretrain.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
from transformers import (
|
| 5 |
+
AutoModelForMaskedLM,
|
| 6 |
+
AutoTokenizer,
|
| 7 |
+
TrainingArguments,
|
| 8 |
+
Trainer,
|
| 9 |
+
DataCollatorForLanguageModeling,
|
| 10 |
+
set_seed
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# --- 配置参数 ---
|
| 14 |
+
DOMAIN_TEXT_FILE = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/domain_corpus.txt"
|
| 15 |
+
MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
|
| 16 |
+
OUTPUT_DIR = "./bert_dapt_model"
|
| 17 |
+
|
| 18 |
+
# 预训练超参数
|
| 19 |
+
DAPT_LR = 1e-5 # 较低的学习率,防止破坏原有知识
|
| 20 |
+
DAPT_EPOCHS = 3 # 适中的训练轮数
|
| 21 |
+
BATCH_SIZE = 16 # 批次大小 (请根据您的 GPU 显存调整)
|
| 22 |
+
MLM_PROBABILITY = 0.15 # 掩码比例
|
| 23 |
+
SEED = 42
|
| 24 |
+
NUM_PROC = 64 # 并行处理的进程数
|
| 25 |
+
|
| 26 |
+
# 设置随机种子以保证结果可复现
|
| 27 |
+
set_seed(SEED)
|
| 28 |
+
|
| 29 |
+
def domain_adaptive_pretrain():
    """Continue masked-language-model pretraining (DAPT) on the domain corpus.

    Reads DOMAIN_TEXT_FILE, tokenizes and re-chunks it into fixed 512-token
    blocks, then runs MLM training with dynamic masking and saves the adapted
    model plus tokenizer to OUTPUT_DIR.
    """
    # Fail fast if the preprocessed corpus is missing.
    if not os.path.exists(DOMAIN_TEXT_FILE):
        print(f"致命错误:领域语料库文件未找到在 {DOMAIN_TEXT_FILE}。请先运行数据预处理脚本。")
        return

    # 1. Load model and tokenizer.
    print("--- 1. 加载模型和分词器 ---")
    # AutoTokenizer resolves the tokenizer matching the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # AutoModelForMaskedLM carries the MLM head needed for continued pretraining.
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

    # 2. Load and process the raw text dataset.
    print("--- 2. 加载和处理文本数据集 ---")

    # Load the plain-text file; Trainer expects the data under the 'train' key.
    raw_datasets = load_dataset("text", data_files={"train": DOMAIN_TEXT_FILE})

    def tokenize_function(examples):
        # Truncate but do NOT pad here — DataCollatorForLanguageModeling
        # pads dynamically per batch.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            return_special_tokens_mask=True
        )

    tokenized_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=["text"], num_proc=NUM_PROC
    )

    # Concatenate the token stream and re-chunk it into fixed-length blocks.
    def group_texts(examples):
        # Flatten each column across the batch into one long list.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])

        # Block size for the chunks.
        chunk_size = 512

        # Drop the trailing partial block so every chunk is exactly chunk_size.
        total_length = (total_length // chunk_size) * chunk_size

        # Slice every column into chunk_size-long pieces.
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }
        # Labels start as a copy of input_ids; the data collator sets
        # non-masked positions to -100 at batch time.
        result["labels"] = result["input_ids"].copy()
        return result


    # Final DAPT training dataset.
    lm_datasets = tokenized_datasets.map(
        group_texts, batched=True, num_proc=NUM_PROC
    )

    # 3. Data collator with dynamic masking: applies MLM_PROBABILITY (15%)
    # random masking freshly in every batch.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=MLM_PROBABILITY
    )

    # 4. Training arguments.
    print("--- 3. 设置训练参数 ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=DAPT_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=DAPT_LR,
        weight_decay=0.01,
        logging_steps=50,
        save_strategy="epoch",
        report_to="wandb",
    )

    # 5. Build the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        data_collator=data_collator,
    )

    # 6. Run continued pretraining.
    print("--- 4. 开始继续预训练 ---")
    trainer.train()

    # 7. Save the DAPT model together with its tokenizer.
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"DAPT 模型已保存至: {OUTPUT_DIR}")

if __name__ == "__main__":
    domain_adaptive_pretrain()
|
dapt_data_process.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
# --- 配置参数 ---
|
| 6 |
+
INPUT_CSV_PATH = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/Experiment_sentences_training_filtered_part1.csv"
|
| 7 |
+
# 输出的纯文本文件路径 (用于 DAPT 脚本中的 DOMAIN_TEXT_FILE)
|
| 8 |
+
OUTPUT_TXT_PATH = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/domain_corpus.txt"
|
| 9 |
+
# 纯文本文件的编码
|
| 10 |
+
ENCODING = 'utf-8'
|
| 11 |
+
|
| 12 |
+
def prepare_dapt_data(input_csv_path: str, output_txt_path: str, encoding: str):
    """Extract the 'sentence' column from a CSV into a plain-text corpus file.

    Writes one sentence per line — the format expected by the DAPT
    pretraining script's DOMAIN_TEXT_FILE input.

    Args:
        input_csv_path: Path of the source CSV file.
        output_txt_path: Path of the target plain-text file.
        encoding: Encoding used when writing the text file.
    """
    print(f"--- 1. 读取数据: {input_csv_path} ---")

    try:
        # Plain read; the file is assumed to be well-formed CSV.
        df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"错误:输入文件未找到在路径: {input_csv_path}")
        return
    except Exception as e:
        print(f"读取 CSV 文件时发生错误: {e}")
        return

    # --- 2. Cleaning ---
    if 'sentence' not in df.columns:
        print("错误:CSV 文件中未找到 'sentence' 列。请检查列名是否正确。")
        return

    # Extract the column, dropping missing values.
    sentences: List[str] = df['sentence'].dropna().astype(str).tolist()

    # Strip surrounding whitespace and drop entries that become empty.
    # BUG FIX: the original kept whitespace-only entries, which produced
    # blank lines in the MLM corpus.
    sentences = [t for t in (s.strip() for s in sentences) if t]

    if not sentences:
        print("警告:'sentence' 列中没有有效数据,无法生成语料库。")
        return

    print(f"提取到 {len(sentences)} 条有效句子。")

    # --- 3. Write the corpus, one sentence per line ---
    print(f"--- 3. 保存至纯文本文件: {output_txt_path} ---")

    try:
        with open(output_txt_path, 'w', encoding=encoding) as f:
            f.write('\n'.join(sentences))

        print(f"数据成功保存!")

    except Exception as e:
        print(f"写入文件时发生错误: {e}")

# --- Script entry point ---
if __name__ == "__main__":
    # Requires pandas: pip install pandas
    prepare_dapt_data(INPUT_CSV_PATH, OUTPUT_TXT_PATH, ENCODING)
|
data_process_task1.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# --- 配置参数 ---
|
| 6 |
+
EXCEL_FILE_PATH = "/home/hsichen/part_time/BERT_finetune/data_source.xlsx"
|
| 7 |
+
OUTPUT_DIR = "./processed_data"
|
| 8 |
+
# 划分数据集的比例 (训练集:测试集)
|
| 9 |
+
TEST_SIZE = 0.2
|
| 10 |
+
# 随机种子,用于确保每次划分结果一致
|
| 11 |
+
RANDOM_SEED = 42
|
| 12 |
+
|
| 13 |
+
def preprocess_data(excel_path: str, output_dir: str, test_size: float, random_seed: int):
    """Read labeled Excel data, clean it, and write stratified train/val/test CSVs.

    Renames 'sentence'/'Envir' to the conventional 'text'/'label', drops rows
    with missing values, and produces three stratified splits (validation is
    10% of the full data, carved out of the training portion).

    Args:
        excel_path: Path of the source Excel file.
        output_dir: Directory for the processed CSV files.
        test_size: Fraction of the data reserved for the test split.
        random_seed: Seed for reproducible splits.
    """
    print(f"--- 1. 读取数据: {excel_path} ---")
    try:
        # Read the first worksheet.
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"错误:文件未找到在路径: {excel_path}")
        return
    except Exception as e:
        print(f"读取Excel文件时发生错误: {e}")
        return

    # --- 2. Cleaning and format conversion ---

    required_cols = ['sentence', 'Envir']
    if not all(col in df.columns for col in required_cols):
        print(f"错误:Excel中缺少必需的列。找到的列有: {df.columns.tolist()}")
        print(f"必需的列是: {required_cols}")
        return

    # Rename to the conventional NLP column names (text / label).
    df = df.rename(columns={'sentence': 'text', 'Envir': 'label'})

    # BUG FIX: drop missing values BEFORE casting to int. The original called
    # astype(int) first, which raises if the 'label' column contains NaN.
    df = df[['text', 'label']].dropna()
    df['label'] = df['label'].astype(int)
    print(f"原始数据条数: {len(df)}")

    # --- 3. Split the dataset ---
    print(f"--- 划分数据集 (训练集:{1-test_size}, 测试集:{test_size}) ---")

    # Stratified split keeps the label ratio consistent across splits.
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        stratify=df['label']
    )

    # Validation = 10% of the full data, expressed as a fraction of the
    # remaining (1 - test_size) training portion.
    val_size_from_train = 0.1 / (1 - test_size)
    train_df, val_df = train_test_split(
        train_df,
        test_size=val_size_from_train,
        random_state=random_seed,
        stratify=train_df['label']
    )

    # --- 4. Save as CSV ---
    os.makedirs(output_dir, exist_ok=True)

    train_output_path = os.path.join(output_dir, 'train.csv')
    val_output_path = os.path.join(output_dir, 'validation.csv')
    test_output_path = os.path.join(output_dir, 'test.csv')

    train_df.to_csv(train_output_path, index=False)
    val_df.to_csv(val_output_path, index=False)
    test_df.to_csv(test_output_path, index=False)

    print("--- 结果保存成功 ---")
    print(f"训练集条数: {len(train_df)}. 保存至: {train_output_path}")
    print(f"验证集条数: {len(val_df)}. 保存至: {val_output_path}")
    print(f"测试集条数: {len(test_df)}. 保存至: {test_output_path}")

if __name__ == "__main__":
    preprocess_data(EXCEL_FILE_PATH, OUTPUT_DIR, TEST_SIZE, RANDOM_SEED)
|
data_process_task2.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# --- 配置参数 ---
|
| 6 |
+
EXCEL_FILE_PATH = "/home/hsichen/part_time/BERT_finetune/标注数据_更正后.xlsx"
|
| 7 |
+
OUTPUT_DIR = "./processed_data_task2_fixed"
|
| 8 |
+
# 划分数据集的比例 (训练集:测试集)
|
| 9 |
+
TEST_SIZE = 0.2
|
| 10 |
+
# 随机种子,用于确保每次划分结果一致
|
| 11 |
+
RANDOM_SEED = 42
|
| 12 |
+
|
| 13 |
+
def preprocess_data(excel_path: str, output_dir: str, test_size: float, random_seed: int):
    """Read labeled Excel data and produce multi-label train/val/test CSV splits.

    Keeps only rows with Envir == 1, collapses the five tag columns into a
    single 'labels' list column, and splits with stratification on the exact
    label combination so rare tag patterns are distributed proportionally.

    Args:
        excel_path: Path of the source Excel file.
        output_dir: Directory for the processed CSV files.
        test_size: Fraction of the data reserved for the test split.
        random_seed: Seed for reproducible splits.
    """
    print(f"--- 1. 读取数据: {excel_path} ---")
    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"错误:文件未找到在路径: {excel_path}")
        return
    except Exception as e:
        print(f"读取Excel文件时发生错误: {e}")
        return

    # --- 2. Cleaning and format conversion ---

    # Keep only environment-related rows (Envir == 1).
    df = df[df['Envir'] == 1].copy()
    print(f"筛选 Envir=1 后数据条数: {len(df)}")

    # The five multi-label tag columns of task 2.
    TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']

    # Collapse the tag columns into one list-valued 'labels' column.
    df['labels'] = df[TAG_COLS].values.tolist()

    df = df.rename(columns={'sentence': 'text'})

    # 3. Label-combination statistics (kept for analysis).
    print("--- 3. 标签组合类型统计 ---")

    # Tuples are hashable, so combinations can be counted / compared.
    df['label_tuple'] = df['labels'].apply(tuple)

    # String form of the tuple serves as the stratify key for
    # train_test_split (it stratifies on the full label combination).
    df['stratify_col'] = df['label_tuple'].astype(str)

    print("-" * 30)

    # Keep only the columns needed for splitting.
    df = df[['text', 'labels', 'stratify_col']].copy()

    # --- 4. Split (stratified on the label combination) ---
    print(f"--- 划分数据集 (训练集:{1-test_size}, 测试集:{test_size}) ---")

    # First split: (train + validation) vs test.
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        # stratify on the exact label combination
        stratify=df['stratify_col']
    )

    # Second split: train vs validation (validation = 10% of the full data,
    # expressed as a fraction of the remaining portion).
    val_size_from_train = 0.1 / (1 - test_size)

    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_from_train,
        random_state=random_seed,
        # stratify on the exact label combination
        stratify=train_val_df['stratify_col']
    )

    # --- 5. Save as CSV ---
    os.makedirs(output_dir, exist_ok=True)

    # Only 'text' and 'labels' are needed downstream by the BERT pipeline.
    train_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    val_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'validation.csv'), index=False)
    test_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    print("--- 结果保存成功 ---")
    print(f"训练集条数: {len(train_df)}. 保存至: {os.path.join(output_dir, 'train.csv')}")
    print(f"验证集条数: {len(val_df)}. 保存至: {os.path.join(output_dir, 'validation.csv')}")
    print(f"测试集条数: {len(test_df)}. 保存至: {os.path.join(output_dir, 'test.csv')}")

if __name__ == "__main__":
    # Requires: pip install pandas openpyxl scikit-learn
    preprocess_data(EXCEL_FILE_PATH, OUTPUT_DIR, TEST_SIZE, RANDOM_SEED)
|
data_source.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d691e8657afbf64b7d7e51fb69293651106ee4e890f8046bdbb588593936b45
|
| 3 |
+
size 4571190
|
dataset_pretrain/Experiment_sentences_training_filtered_part1.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:165ed0ca475a1c026c609cb441ce9969d1ccf33cbec744cfe4277deffd60228e
|
| 3 |
+
size 1365723082
|
dataset_pretrain/domain_corpus.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1337725c4e8ea9ca886466a88d9bb9185bdbafe1100465d368919a918519db4f
|
| 3 |
+
size 787886543
|
dataset_pretrain/预训练数据第二部分_年报.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff51de07b828c9d4ea132ae5e91f66dac802187bc2598bcccb3ee58a4693b3c1
|
| 3 |
+
size 698809156
|
model_inference_task1.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from torch import nn
|
| 5 |
+
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast, AutoConfig
|
| 6 |
+
|
| 7 |
+
# 定义标签名称,与任务一致
|
| 8 |
+
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
|
| 9 |
+
PREDICTION_THRESHOLD = 0.5 # 预测阈值
|
| 10 |
+
|
| 11 |
+
# ----------------------------------------------------
|
| 12 |
+
# A. 定义支持多标签分类的 BERT 模型(必须与训练时一致)
|
| 13 |
+
# ----------------------------------------------------
|
| 14 |
+
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT-based multi-label classifier (trained with BCEWithLogitsLoss).

    NOTE(review): this definition must match the training-time architecture
    exactly so the checkpoint weights load without key mismatches — do not
    alter layer names or shapes here.
    """
    def __init__(self, config):
        super().__init__(config)
        # Number of output labels; expected to equal len(TAG_COLS).
        self.num_labels = config.num_labels

        # BERT encoder backbone.
        self.bert = BertModel(config)

        # Dropout with the same rate used during training.
        classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)

        # Linear classification head over the pooled [CLS] representation.
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        # Unused at inference time; kept for structural parity with training.
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        """Return raw (pre-sigmoid) logits of shape (batch, num_labels).

        `labels` is accepted for interface compatibility but ignored here.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Pooled [CLS] hidden state (pooler output).
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        # Classification head produces logits (no sigmoid applied here).
        logits = self.classifier(pooled_output)

        # labels is None at inference time; return the logits directly.
        return logits
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ----------------------------------------------------
|
| 60 |
+
# B. 模型推理函数
|
| 61 |
+
# ----------------------------------------------------
|
| 62 |
+
def predict_multilabel(checkpoint_path: str, tokenizer_path: str, text_to_predict: str):
    """Load a multi-label checkpoint and predict tags for a single text.

    Args:
        checkpoint_path: Checkpoint directory (config.json, model.safetensors).
        tokenizer_path: Tokenizer path or hub name (must match the model).
        text_to_predict: Input text to classify.

    Returns:
        Dict mapping each tag in TAG_COLS to
        {"predicted": bool, "probability": float}, or None if loading fails.
    """
    print(f"--- 1. 正在加载模型和分词器: {checkpoint_path} ---")

    try:
        config = AutoConfig.from_pretrained(checkpoint_path)
        # Make sure num_labels matches the task's tag set.
        if config.num_labels != len(TAG_COLS):
            # BUG FIX: capture the original value BEFORE overwriting it —
            # the original code printed the already-corrected value as the
            # "from" number, making the warning useless.
            original_num_labels = config.num_labels
            config.num_labels = len(TAG_COLS)
            print(f"警告: 检查点配置的 num_labels 已从 {original_num_labels} 修正为 {len(TAG_COLS)}")

        # Load the tokenizer (files assumed present at tokenizer_path).
        tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

        # Load weights into the custom model class with the corrected config.
        model = BertForMultiLabelClassification.from_pretrained(
            checkpoint_path,
            config=config
        )
    except Exception as e:
        print(f"加载模型或分词器失败,请检查路径中是否包含所有必需文件(如 model.safetensors, config.json, vocab.txt): {e}")
        return None

    model.eval()  # evaluation mode (disables dropout etc.)

    # 2. Encode the input text.
    inputs = tokenizer(
        text_to_predict,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # 3. Run inference.
    with torch.no_grad():
        # The model returns raw logits.
        outputs = model(**inputs)
        logits = outputs.cpu().numpy()

    # 4. Post-process: sigmoid to probabilities, then threshold.
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs > PREDICTION_THRESHOLD).astype(int)

    # 5. Format per-tag results (batch size is 1).
    result = {}
    for i, tag in enumerate(TAG_COLS):
        is_predicted = preds[0][i] == 1
        probability = probs[0][i]
        result[tag] = {
            "predicted": is_predicted,
            "probability": float(f"{probability:.4f}")  # keep 4 decimals
        }

    print("--- 5. 预测结果 ---")

    # Tags whose probability exceeds the threshold.
    predicted_tags = [tag for tag, info in result.items() if info["predicted"]]

    if predicted_tags:
        # BUG FIX: repaired a mojibake character in this message
        # ("预测标签���别" -> "预测标签类别").
        print(f"预测标签类别: {predicted_tags}")
        print(f"对应概率:")
        for tag in predicted_tags:
            print(f"  - {tag}: {result[tag]['probability']}")
    else:
        print("未预测任何标签(所有标签概率均低于 0.5)。")
        print(f"所有标签的最高概率: {max(p['probability'] for p in result.values()):.4f}")

    # BUG FIX: the docstring promised a result dict but the original never
    # returned it; return it so callers can use the predictions.
    return result


# ----------------------------------------------------
# C. Example run
# ----------------------------------------------------
if __name__ == "__main__":
    # Replace these three values as needed; TOKENIZER must match MODEL.
    MODEL_CHECKPOINT = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_multilabel_model_finetuned_from_dapt/final"
    TOKENIZER = 'valuesimplex-ai-lab/FinBERT2-base'
    SAMPLE_TEXT = "密切关注安全环保对原料市场的影响,提前落实应对预案;"

    # Make sure the checkpoint directory exists before loading.
    if not os.path.exists(MODEL_CHECKPOINT):
        print(f"错误:模型检查点目录不存在: {MODEL_CHECKPOINT}")
    else:
        predict_multilabel(MODEL_CHECKPOINT, TOKENIZER, SAMPLE_TEXT)
|
model_inference_task2.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from torch import nn
|
| 5 |
+
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, AutoConfig, pipeline, BertPreTrainedModel, BertModel
|
| 6 |
+
|
| 7 |
+
# 定义标签名称,与任务一致
|
| 8 |
+
BINARY_LABELS = ['Non-Envir', 'Envir']
|
| 9 |
+
NUM_LABELS = 2
|
| 10 |
+
|
| 11 |
+
# ----------------------------------------------------
|
| 12 |
+
# A. 定义支持多标签分类的 BERT 模型(必须与训练时一致)
|
| 13 |
+
# ----------------------------------------------------
|
| 14 |
+
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder with a multi-label classification head.

    The layer layout must mirror the architecture used at training time so
    that the saved checkpoint weights map onto identical module names.
    """

    def __init__(self, config):
        super().__init__(config)
        # Number of independent binary outputs produced by the head.
        self.num_labels = config.num_labels

        # BERT encoder backbone.
        self.bert = BertModel(config)

        # Same dropout rate the model was trained with.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Linear head mapping the pooled [CLS] representation to one
        # logit per label.
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        # Kept only so the module structure matches training; the loss is
        # never computed during inference.
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        """Return raw (pre-sigmoid) logits of shape (batch, num_labels).

        NOTE(review): ``labels`` is accepted for signature compatibility
        with the training setup but is ignored here — no loss is returned.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Pooled [CLS] representation, regularized by dropout.
        pooled = self.dropout(encoder_out.pooler_output)

        # Un-normalized per-label scores; callers apply sigmoid + threshold.
        return self.classifier(pooled)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ----------------------------------------------------
|
| 60 |
+
# B. 模型推理函数
|
| 61 |
+
# ----------------------------------------------------
|
| 62 |
+
def predict_binary_classification(checkpoint_path: str, tokenizer_path: str, text_to_predict: str):
    """Run binary (Envir / Non-Envir) classification on a single text.

    Args:
        checkpoint_path: Directory holding the fine-tuned model
            (config.json, model.safetensors, ...).
        tokenizer_path: Tokenizer name or path; must match the model.
        text_to_predict: The sentence to classify.

    Returns:
        A dict with the predicted label, its probability, and the full
        per-class probability distribution — or ``None`` if loading fails.
    """
    print(f"--- 1. 正在加载二分类模型和分词器: {checkpoint_path} ---")

    try:
        # Load the config (forcing the expected label count) and tokenizer.
        config = AutoConfig.from_pretrained(checkpoint_path, num_labels=NUM_LABELS)
        tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

        # Standard sequence-classification loader; tolerates a checkpoint
        # whose classification head has a different size.
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path,
            config=config,
            ignore_mismatched_sizes=True
        )
    except Exception as e:
        print(f"加载模型或分词器失败,请检查路径中是否包含所有必需文件: {e}")
        return None

    # Evaluation mode: disables dropout for deterministic inference.
    model.eval()

    # Tokenize the single input text into model-ready tensors.
    encoded = tokenizer(
        text_to_predict,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits  # shape: [1, NUM_LABELS]

    # Softmax over the two classes -> probability distribution (batch of 1).
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]

    # Index (0 or 1) of the most likely class.
    predicted_index = np.argmax(probabilities)

    predicted_label = BINARY_LABELS[predicted_index]
    predicted_prob = probabilities[predicted_index]

    print("--- 5. 预测结果 ---")
    print(f"输入文本: {text_to_predict}")
    print(f"预测类别: {predicted_label}")
    print(f"对应概率: {predicted_prob:.4f}")

    # Report every class probability, rounded to 4 decimal places.
    return {
        'prediction': predicted_label,
        'probability': float(f"{predicted_prob:.4f}"),
        'all_probabilities': {
            BINARY_LABELS[i]: float(f"{probabilities[i]:.4f}") for i in range(NUM_LABELS)
        }
    }
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ----------------------------------------------------
|
| 140 |
+
# C. 示例运行
|
| 141 |
+
# ----------------------------------------------------
|
| 142 |
+
if __name__ == "__main__":
    # The three values below are deployment-specific and must be replaced;
    # the tokenizer must match the model checkpoint.
    MODEL_CHECKPOINT = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_bilabel_finetuned_model_from_dapt/final"
    TOKENIZER = 'valuesimplex-ai-lab/FinBERT2-base'
    # TOKENIZER = 'bert-base-chinese'
    SAMPLE_TEXT = "密切关注安全环保对原料市场的影响,提前落实应对预案;"

    # Only run when the checkpoint directory is actually present.
    if os.path.exists(MODEL_CHECKPOINT):
        predict_binary_classification(MODEL_CHECKPOINT, TOKENIZER, SAMPLE_TEXT)
    else:
        print(f"错误:模型检查点目录不存在: {MODEL_CHECKPOINT}")
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForMultiLabelClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"id2label": {
|
| 13 |
+
"0": "LABEL_0",
|
| 14 |
+
"1": "LABEL_1",
|
| 15 |
+
"2": "LABEL_2",
|
| 16 |
+
"3": "LABEL_3",
|
| 17 |
+
"4": "LABEL_4"
|
| 18 |
+
},
|
| 19 |
+
"initializer_range": 0.02,
|
| 20 |
+
"intermediate_size": 3072,
|
| 21 |
+
"label2id": {
|
| 22 |
+
"LABEL_0": 0,
|
| 23 |
+
"LABEL_1": 1,
|
| 24 |
+
"LABEL_2": 2,
|
| 25 |
+
"LABEL_3": 3,
|
| 26 |
+
"LABEL_4": 4
|
| 27 |
+
},
|
| 28 |
+
"layer_norm_eps": 1e-12,
|
| 29 |
+
"max_position_embeddings": 512,
|
| 30 |
+
"model_type": "bert",
|
| 31 |
+
"num_attention_heads": 12,
|
| 32 |
+
"num_hidden_layers": 12,
|
| 33 |
+
"pad_token_id": 0,
|
| 34 |
+
"pooler_fc_size": 768,
|
| 35 |
+
"pooler_num_attention_heads": 12,
|
| 36 |
+
"pooler_num_fc_layers": 3,
|
| 37 |
+
"pooler_size_per_head": 128,
|
| 38 |
+
"pooler_type": "first_token_transform",
|
| 39 |
+
"transformers_version": "5.0.0.dev0",
|
| 40 |
+
"type_vocab_size": 2,
|
| 41 |
+
"use_cache": false,
|
| 42 |
+
"vocab_size": 21128
|
| 43 |
+
}
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db1b29e958367916a505a9c6b0c691768326cd696d2a1f18b4977621aff808d4
|
| 3 |
+
size 409109468
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": false,
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "[MASK]",
|
| 49 |
+
"model_max_length": 512,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "[SEP]",
|
| 52 |
+
"strip_accents": null,
|
| 53 |
+
"tokenize_chinese_chars": true,
|
| 54 |
+
"tokenizer_class": "BertTokenizer",
|
| 55 |
+
"unk_token": "[UNK]"
|
| 56 |
+
}
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6021ebdaf4d143aee6780f3f7323087af8fe80c7cadc2add939b077d330f0cc
|
| 3 |
+
size 5201
|
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"transformers_version": "5.0.0.dev0",
|
| 27 |
+
"type_vocab_size": 2,
|
| 28 |
+
"use_cache": false,
|
| 29 |
+
"vocab_size": 21128
|
| 30 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:008f6adad10108d3cc7a5c01474525cd308971bbfaeab910af694124fbb12750
|
| 3 |
+
size 409100240
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84929c0ce67f5a4a63810f68dd4367f3a37b4648b4c6197ee21b5810ab0529b0
|
| 3 |
+
size 818324875
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7df46a9f83e371cdeb326e2171479963c0b2372be2b82e7056ff56b48e5999c
|
| 3 |
+
size 14645
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:819f3a38dcbba4f9f621d51359778f1704914a94d1d1ba3a7961e9fbf54ac1bb
|
| 3 |
+
size 1465
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/trainer_state.json
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 1094,
|
| 3 |
+
"best_metric": 0.9395770392749244,
|
| 4 |
+
"best_model_checkpoint": "./bert_finetuned_model/checkpoint-1094",
|
| 5 |
+
"epoch": 1.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1094,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.04570383912248629,
|
| 14 |
+
"grad_norm": 6.597176551818848,
|
| 15 |
+
"learning_rate": 1.9600000000000003e-06,
|
| 16 |
+
"loss": 0.8315,
|
| 17 |
+
"step": 50
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.09140767824497258,
|
| 21 |
+
"grad_norm": 4.10335636138916,
|
| 22 |
+
"learning_rate": 3.96e-06,
|
| 23 |
+
"loss": 0.403,
|
| 24 |
+
"step": 100
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.13711151736745886,
|
| 28 |
+
"grad_norm": 5.460880756378174,
|
| 29 |
+
"learning_rate": 5.9600000000000005e-06,
|
| 30 |
+
"loss": 0.2138,
|
| 31 |
+
"step": 150
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.18281535648994515,
|
| 35 |
+
"grad_norm": 1.7257156372070312,
|
| 36 |
+
"learning_rate": 7.960000000000002e-06,
|
| 37 |
+
"loss": 0.0675,
|
| 38 |
+
"step": 200
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.22851919561243145,
|
| 42 |
+
"grad_norm": 0.3548933267593384,
|
| 43 |
+
"learning_rate": 9.960000000000001e-06,
|
| 44 |
+
"loss": 0.0887,
|
| 45 |
+
"step": 250
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.2742230347349177,
|
| 49 |
+
"grad_norm": 0.07574323564767838,
|
| 50 |
+
"learning_rate": 1.196e-05,
|
| 51 |
+
"loss": 0.0625,
|
| 52 |
+
"step": 300
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.31992687385740404,
|
| 56 |
+
"grad_norm": 0.27218344807624817,
|
| 57 |
+
"learning_rate": 1.396e-05,
|
| 58 |
+
"loss": 0.0909,
|
| 59 |
+
"step": 350
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.3656307129798903,
|
| 63 |
+
"grad_norm": 0.07252885401248932,
|
| 64 |
+
"learning_rate": 1.5960000000000003e-05,
|
| 65 |
+
"loss": 0.0388,
|
| 66 |
+
"step": 400
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.4113345521023766,
|
| 70 |
+
"grad_norm": 1.1499181985855103,
|
| 71 |
+
"learning_rate": 1.796e-05,
|
| 72 |
+
"loss": 0.0955,
|
| 73 |
+
"step": 450
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.4570383912248629,
|
| 77 |
+
"grad_norm": 13.650275230407715,
|
| 78 |
+
"learning_rate": 1.9960000000000002e-05,
|
| 79 |
+
"loss": 0.0869,
|
| 80 |
+
"step": 500
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.5027422303473492,
|
| 84 |
+
"grad_norm": 11.625408172607422,
|
| 85 |
+
"learning_rate": 1.9647735442127967e-05,
|
| 86 |
+
"loss": 0.0851,
|
| 87 |
+
"step": 550
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.5484460694698354,
|
| 91 |
+
"grad_norm": 0.3337002992630005,
|
| 92 |
+
"learning_rate": 1.92882818116463e-05,
|
| 93 |
+
"loss": 0.103,
|
| 94 |
+
"step": 600
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.5941499085923218,
|
| 98 |
+
"grad_norm": 7.300892353057861,
|
| 99 |
+
"learning_rate": 1.892882818116463e-05,
|
| 100 |
+
"loss": 0.082,
|
| 101 |
+
"step": 650
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.6398537477148081,
|
| 105 |
+
"grad_norm": 0.24430198967456818,
|
| 106 |
+
"learning_rate": 1.8569374550682964e-05,
|
| 107 |
+
"loss": 0.0711,
|
| 108 |
+
"step": 700
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.6855575868372943,
|
| 112 |
+
"grad_norm": 15.26744270324707,
|
| 113 |
+
"learning_rate": 1.8209920920201294e-05,
|
| 114 |
+
"loss": 0.0737,
|
| 115 |
+
"step": 750
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.7312614259597806,
|
| 119 |
+
"grad_norm": 0.24188373982906342,
|
| 120 |
+
"learning_rate": 1.7850467289719628e-05,
|
| 121 |
+
"loss": 0.0668,
|
| 122 |
+
"step": 800
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.7769652650822669,
|
| 126 |
+
"grad_norm": 0.1296696811914444,
|
| 127 |
+
"learning_rate": 1.7491013659237958e-05,
|
| 128 |
+
"loss": 0.0537,
|
| 129 |
+
"step": 850
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.8226691042047533,
|
| 133 |
+
"grad_norm": 0.13343055546283722,
|
| 134 |
+
"learning_rate": 1.7131560028756292e-05,
|
| 135 |
+
"loss": 0.0785,
|
| 136 |
+
"step": 900
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.8683729433272395,
|
| 140 |
+
"grad_norm": 4.3099517822265625,
|
| 141 |
+
"learning_rate": 1.6772106398274622e-05,
|
| 142 |
+
"loss": 0.1045,
|
| 143 |
+
"step": 950
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.9140767824497258,
|
| 147 |
+
"grad_norm": 0.024240005761384964,
|
| 148 |
+
"learning_rate": 1.6412652767792956e-05,
|
| 149 |
+
"loss": 0.023,
|
| 150 |
+
"step": 1000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.9597806215722121,
|
| 154 |
+
"grad_norm": 1.5524265766143799,
|
| 155 |
+
"learning_rate": 1.605319913731129e-05,
|
| 156 |
+
"loss": 0.0541,
|
| 157 |
+
"step": 1050
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.0,
|
| 161 |
+
"eval_accuracy": 0.984,
|
| 162 |
+
"eval_f1": 0.9395770392749244,
|
| 163 |
+
"eval_loss": 0.06908556073904037,
|
| 164 |
+
"eval_runtime": 28.0922,
|
| 165 |
+
"eval_samples_per_second": 88.993,
|
| 166 |
+
"eval_steps_per_second": 5.589,
|
| 167 |
+
"step": 1094
|
| 168 |
+
}
|
| 169 |
+
],
|
| 170 |
+
"logging_steps": 50,
|
| 171 |
+
"max_steps": 3282,
|
| 172 |
+
"num_input_tokens_seen": 0,
|
| 173 |
+
"num_train_epochs": 3,
|
| 174 |
+
"save_steps": 500,
|
| 175 |
+
"stateful_callbacks": {
|
| 176 |
+
"TrainerControl": {
|
| 177 |
+
"args": {
|
| 178 |
+
"should_epoch_stop": false,
|
| 179 |
+
"should_evaluate": false,
|
| 180 |
+
"should_log": false,
|
| 181 |
+
"should_save": true,
|
| 182 |
+
"should_training_stop": false
|
| 183 |
+
},
|
| 184 |
+
"attributes": {}
|
| 185 |
+
}
|
| 186 |
+
},
|
| 187 |
+
"total_flos": 4604443468800000.0,
|
| 188 |
+
"train_batch_size": 16,
|
| 189 |
+
"trial_name": null,
|
| 190 |
+
"trial_params": null
|
| 191 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-1094/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
|
| 3 |
+
size 5137
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"transformers_version": "5.0.0.dev0",
|
| 27 |
+
"type_vocab_size": 2,
|
| 28 |
+
"use_cache": false,
|
| 29 |
+
"vocab_size": 21128
|
| 30 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de2ee416e9b57c2f5950073423afa4ce4969acef04ab4fac2e67b511ad0d7828
|
| 3 |
+
size 409100240
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3782131594e410bca55a994fe35dea25cabcc7266f22a9ffe9530377ab90826
|
| 3 |
+
size 818324875
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1c014c8f02969df2fb6b09bfa058898bb6a730c9745ecc985b52eb65b54fddb
|
| 3 |
+
size 14645
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e292baa34fe268bf54671510dc8dca778a92537e26e986b9a3f9c6b5645bd29d
|
| 3 |
+
size 1465
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/trainer_state.json
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 2188,
|
| 3 |
+
"best_metric": 0.9396170839469808,
|
| 4 |
+
"best_model_checkpoint": "./bert_finetuned_model/checkpoint-2188",
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2188,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.04570383912248629,
|
| 14 |
+
"grad_norm": 6.597176551818848,
|
| 15 |
+
"learning_rate": 1.9600000000000003e-06,
|
| 16 |
+
"loss": 0.8315,
|
| 17 |
+
"step": 50
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.09140767824497258,
|
| 21 |
+
"grad_norm": 4.10335636138916,
|
| 22 |
+
"learning_rate": 3.96e-06,
|
| 23 |
+
"loss": 0.403,
|
| 24 |
+
"step": 100
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.13711151736745886,
|
| 28 |
+
"grad_norm": 5.460880756378174,
|
| 29 |
+
"learning_rate": 5.9600000000000005e-06,
|
| 30 |
+
"loss": 0.2138,
|
| 31 |
+
"step": 150
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.18281535648994515,
|
| 35 |
+
"grad_norm": 1.7257156372070312,
|
| 36 |
+
"learning_rate": 7.960000000000002e-06,
|
| 37 |
+
"loss": 0.0675,
|
| 38 |
+
"step": 200
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.22851919561243145,
|
| 42 |
+
"grad_norm": 0.3548933267593384,
|
| 43 |
+
"learning_rate": 9.960000000000001e-06,
|
| 44 |
+
"loss": 0.0887,
|
| 45 |
+
"step": 250
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.2742230347349177,
|
| 49 |
+
"grad_norm": 0.07574323564767838,
|
| 50 |
+
"learning_rate": 1.196e-05,
|
| 51 |
+
"loss": 0.0625,
|
| 52 |
+
"step": 300
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.31992687385740404,
|
| 56 |
+
"grad_norm": 0.27218344807624817,
|
| 57 |
+
"learning_rate": 1.396e-05,
|
| 58 |
+
"loss": 0.0909,
|
| 59 |
+
"step": 350
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.3656307129798903,
|
| 63 |
+
"grad_norm": 0.07252885401248932,
|
| 64 |
+
"learning_rate": 1.5960000000000003e-05,
|
| 65 |
+
"loss": 0.0388,
|
| 66 |
+
"step": 400
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.4113345521023766,
|
| 70 |
+
"grad_norm": 1.1499181985855103,
|
| 71 |
+
"learning_rate": 1.796e-05,
|
| 72 |
+
"loss": 0.0955,
|
| 73 |
+
"step": 450
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.4570383912248629,
|
| 77 |
+
"grad_norm": 13.650275230407715,
|
| 78 |
+
"learning_rate": 1.9960000000000002e-05,
|
| 79 |
+
"loss": 0.0869,
|
| 80 |
+
"step": 500
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.5027422303473492,
|
| 84 |
+
"grad_norm": 11.625408172607422,
|
| 85 |
+
"learning_rate": 1.9647735442127967e-05,
|
| 86 |
+
"loss": 0.0851,
|
| 87 |
+
"step": 550
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.5484460694698354,
|
| 91 |
+
"grad_norm": 0.3337002992630005,
|
| 92 |
+
"learning_rate": 1.92882818116463e-05,
|
| 93 |
+
"loss": 0.103,
|
| 94 |
+
"step": 600
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.5941499085923218,
|
| 98 |
+
"grad_norm": 7.300892353057861,
|
| 99 |
+
"learning_rate": 1.892882818116463e-05,
|
| 100 |
+
"loss": 0.082,
|
| 101 |
+
"step": 650
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.6398537477148081,
|
| 105 |
+
"grad_norm": 0.24430198967456818,
|
| 106 |
+
"learning_rate": 1.8569374550682964e-05,
|
| 107 |
+
"loss": 0.0711,
|
| 108 |
+
"step": 700
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.6855575868372943,
|
| 112 |
+
"grad_norm": 15.26744270324707,
|
| 113 |
+
"learning_rate": 1.8209920920201294e-05,
|
| 114 |
+
"loss": 0.0737,
|
| 115 |
+
"step": 750
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.7312614259597806,
|
| 119 |
+
"grad_norm": 0.24188373982906342,
|
| 120 |
+
"learning_rate": 1.7850467289719628e-05,
|
| 121 |
+
"loss": 0.0668,
|
| 122 |
+
"step": 800
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.7769652650822669,
|
| 126 |
+
"grad_norm": 0.1296696811914444,
|
| 127 |
+
"learning_rate": 1.7491013659237958e-05,
|
| 128 |
+
"loss": 0.0537,
|
| 129 |
+
"step": 850
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.8226691042047533,
|
| 133 |
+
"grad_norm": 0.13343055546283722,
|
| 134 |
+
"learning_rate": 1.7131560028756292e-05,
|
| 135 |
+
"loss": 0.0785,
|
| 136 |
+
"step": 900
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.8683729433272395,
|
| 140 |
+
"grad_norm": 4.3099517822265625,
|
| 141 |
+
"learning_rate": 1.6772106398274622e-05,
|
| 142 |
+
"loss": 0.1045,
|
| 143 |
+
"step": 950
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.9140767824497258,
|
| 147 |
+
"grad_norm": 0.024240005761384964,
|
| 148 |
+
"learning_rate": 1.6412652767792956e-05,
|
| 149 |
+
"loss": 0.023,
|
| 150 |
+
"step": 1000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.9597806215722121,
|
| 154 |
+
"grad_norm": 1.5524265766143799,
|
| 155 |
+
"learning_rate": 1.605319913731129e-05,
|
| 156 |
+
"loss": 0.0541,
|
| 157 |
+
"step": 1050
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.0,
|
| 161 |
+
"eval_accuracy": 0.984,
|
| 162 |
+
"eval_f1": 0.9395770392749244,
|
| 163 |
+
"eval_loss": 0.06908556073904037,
|
| 164 |
+
"eval_runtime": 28.0922,
|
| 165 |
+
"eval_samples_per_second": 88.993,
|
| 166 |
+
"eval_steps_per_second": 5.589,
|
| 167 |
+
"step": 1094
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"epoch": 1.0054844606946984,
|
| 171 |
+
"grad_norm": 0.1564575880765915,
|
| 172 |
+
"learning_rate": 1.569374550682962e-05,
|
| 173 |
+
"loss": 0.066,
|
| 174 |
+
"step": 1100
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"epoch": 1.0511882998171846,
|
| 178 |
+
"grad_norm": 0.014012756757438183,
|
| 179 |
+
"learning_rate": 1.5334291876347953e-05,
|
| 180 |
+
"loss": 0.0309,
|
| 181 |
+
"step": 1150
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"epoch": 1.0968921389396709,
|
| 185 |
+
"grad_norm": 0.023974154144525528,
|
| 186 |
+
"learning_rate": 1.4974838245866285e-05,
|
| 187 |
+
"loss": 0.0341,
|
| 188 |
+
"step": 1200
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"epoch": 1.1425959780621573,
|
| 192 |
+
"grad_norm": 0.013898388482630253,
|
| 193 |
+
"learning_rate": 1.4615384615384615e-05,
|
| 194 |
+
"loss": 0.0335,
|
| 195 |
+
"step": 1250
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"epoch": 1.1882998171846435,
|
| 199 |
+
"grad_norm": 0.07936646789312363,
|
| 200 |
+
"learning_rate": 1.4255930984902949e-05,
|
| 201 |
+
"loss": 0.0479,
|
| 202 |
+
"step": 1300
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 1.2340036563071297,
|
| 206 |
+
"grad_norm": 0.10548417270183563,
|
| 207 |
+
"learning_rate": 1.389647735442128e-05,
|
| 208 |
+
"loss": 0.0481,
|
| 209 |
+
"step": 1350
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"epoch": 1.2797074954296161,
|
| 213 |
+
"grad_norm": 0.015461038798093796,
|
| 214 |
+
"learning_rate": 1.3537023723939613e-05,
|
| 215 |
+
"loss": 0.0302,
|
| 216 |
+
"step": 1400
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"epoch": 1.3254113345521024,
|
| 220 |
+
"grad_norm": 0.03913908079266548,
|
| 221 |
+
"learning_rate": 1.3177570093457945e-05,
|
| 222 |
+
"loss": 0.0196,
|
| 223 |
+
"step": 1450
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"epoch": 1.3711151736745886,
|
| 227 |
+
"grad_norm": 0.0657438263297081,
|
| 228 |
+
"learning_rate": 1.2818116462976278e-05,
|
| 229 |
+
"loss": 0.07,
|
| 230 |
+
"step": 1500
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"epoch": 1.416819012797075,
|
| 234 |
+
"grad_norm": 0.08092936873435974,
|
| 235 |
+
"learning_rate": 1.245866283249461e-05,
|
| 236 |
+
"loss": 0.0372,
|
| 237 |
+
"step": 1550
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"epoch": 1.4625228519195612,
|
| 241 |
+
"grad_norm": 0.019851330667734146,
|
| 242 |
+
"learning_rate": 1.209920920201294e-05,
|
| 243 |
+
"loss": 0.0337,
|
| 244 |
+
"step": 1600
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"epoch": 1.5082266910420477,
|
| 248 |
+
"grad_norm": 0.013996358960866928,
|
| 249 |
+
"learning_rate": 1.1739755571531272e-05,
|
| 250 |
+
"loss": 0.038,
|
| 251 |
+
"step": 1650
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"epoch": 1.5539305301645339,
|
| 255 |
+
"grad_norm": 0.011369767598807812,
|
| 256 |
+
"learning_rate": 1.1380301941049606e-05,
|
| 257 |
+
"loss": 0.0281,
|
| 258 |
+
"step": 1700
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 1.59963436928702,
|
| 262 |
+
"grad_norm": 0.07967428863048553,
|
| 263 |
+
"learning_rate": 1.1020848310567938e-05,
|
| 264 |
+
"loss": 0.0426,
|
| 265 |
+
"step": 1750
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"epoch": 1.6453382084095063,
|
| 269 |
+
"grad_norm": 0.005350353196263313,
|
| 270 |
+
"learning_rate": 1.066139468008627e-05,
|
| 271 |
+
"loss": 0.0334,
|
| 272 |
+
"step": 1800
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"epoch": 1.6910420475319927,
|
| 276 |
+
"grad_norm": 0.007268950808793306,
|
| 277 |
+
"learning_rate": 1.0301941049604602e-05,
|
| 278 |
+
"loss": 0.0341,
|
| 279 |
+
"step": 1850
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"epoch": 1.736745886654479,
|
| 283 |
+
"grad_norm": 0.007129556033760309,
|
| 284 |
+
"learning_rate": 9.942487419122934e-06,
|
| 285 |
+
"loss": 0.0139,
|
| 286 |
+
"step": 1900
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"epoch": 1.7824497257769654,
|
| 290 |
+
"grad_norm": 1.3157267570495605,
|
| 291 |
+
"learning_rate": 9.583033788641266e-06,
|
| 292 |
+
"loss": 0.0412,
|
| 293 |
+
"step": 1950
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"epoch": 1.8281535648994516,
|
| 297 |
+
"grad_norm": 6.9985222816467285,
|
| 298 |
+
"learning_rate": 9.223580158159599e-06,
|
| 299 |
+
"loss": 0.0383,
|
| 300 |
+
"step": 2000
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"epoch": 1.8738574040219378,
|
| 304 |
+
"grad_norm": 0.008648707531392574,
|
| 305 |
+
"learning_rate": 8.86412652767793e-06,
|
| 306 |
+
"loss": 0.0308,
|
| 307 |
+
"step": 2050
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"epoch": 1.919561243144424,
|
| 311 |
+
"grad_norm": 11.036811828613281,
|
| 312 |
+
"learning_rate": 8.504672897196263e-06,
|
| 313 |
+
"loss": 0.0444,
|
| 314 |
+
"step": 2100
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 1.9652650822669104,
|
| 318 |
+
"grad_norm": 0.005460981745272875,
|
| 319 |
+
"learning_rate": 8.145219266714595e-06,
|
| 320 |
+
"loss": 0.0288,
|
| 321 |
+
"step": 2150
|
| 322 |
+
},
|
| 323 |
+
{
|
| 324 |
+
"epoch": 2.0,
|
| 325 |
+
"eval_accuracy": 0.9836,
|
| 326 |
+
"eval_f1": 0.9396170839469808,
|
| 327 |
+
"eval_loss": 0.08339423686265945,
|
| 328 |
+
"eval_runtime": 28.9448,
|
| 329 |
+
"eval_samples_per_second": 86.371,
|
| 330 |
+
"eval_steps_per_second": 5.424,
|
| 331 |
+
"step": 2188
|
| 332 |
+
}
|
| 333 |
+
],
|
| 334 |
+
"logging_steps": 50,
|
| 335 |
+
"max_steps": 3282,
|
| 336 |
+
"num_input_tokens_seen": 0,
|
| 337 |
+
"num_train_epochs": 3,
|
| 338 |
+
"save_steps": 500,
|
| 339 |
+
"stateful_callbacks": {
|
| 340 |
+
"TrainerControl": {
|
| 341 |
+
"args": {
|
| 342 |
+
"should_epoch_stop": false,
|
| 343 |
+
"should_evaluate": false,
|
| 344 |
+
"should_log": false,
|
| 345 |
+
"should_save": true,
|
| 346 |
+
"should_training_stop": false
|
| 347 |
+
},
|
| 348 |
+
"attributes": {}
|
| 349 |
+
}
|
| 350 |
+
},
|
| 351 |
+
"total_flos": 9208886937600000.0,
|
| 352 |
+
"train_batch_size": 16,
|
| 353 |
+
"trial_name": null,
|
| 354 |
+
"trial_params": null
|
| 355 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-2188/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
|
| 3 |
+
size 5137
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"transformers_version": "5.0.0.dev0",
|
| 27 |
+
"type_vocab_size": 2,
|
| 28 |
+
"use_cache": false,
|
| 29 |
+
"vocab_size": 21128
|
| 30 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b58a46568ff450837fbf3ee0f51fa89fd82a450959464b503f893036b86b5a01
|
| 3 |
+
size 409100240
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb1ea6fc0ae5e09b8a3be646658110ca41c0e6fc08b68cab2ddeb74c0ae82d38
|
| 3 |
+
size 818324875
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5412faccf347e6ccc0399ab61829229374cd91c9d7662d44fcb0bb456d151a0d
|
| 3 |
+
size 14645
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73208a74cd8690df2f7dd0a81633cdfde37ecda9e183b1a86782b8075ba454d0
|
| 3 |
+
size 1465
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/trainer_state.json
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 3282,
|
| 3 |
+
"best_metric": 0.9413489736070382,
|
| 4 |
+
"best_model_checkpoint": "./bert_finetuned_model/checkpoint-3282",
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 3282,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.04570383912248629,
|
| 14 |
+
"grad_norm": 6.597176551818848,
|
| 15 |
+
"learning_rate": 1.9600000000000003e-06,
|
| 16 |
+
"loss": 0.8315,
|
| 17 |
+
"step": 50
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.09140767824497258,
|
| 21 |
+
"grad_norm": 4.10335636138916,
|
| 22 |
+
"learning_rate": 3.96e-06,
|
| 23 |
+
"loss": 0.403,
|
| 24 |
+
"step": 100
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.13711151736745886,
|
| 28 |
+
"grad_norm": 5.460880756378174,
|
| 29 |
+
"learning_rate": 5.9600000000000005e-06,
|
| 30 |
+
"loss": 0.2138,
|
| 31 |
+
"step": 150
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.18281535648994515,
|
| 35 |
+
"grad_norm": 1.7257156372070312,
|
| 36 |
+
"learning_rate": 7.960000000000002e-06,
|
| 37 |
+
"loss": 0.0675,
|
| 38 |
+
"step": 200
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.22851919561243145,
|
| 42 |
+
"grad_norm": 0.3548933267593384,
|
| 43 |
+
"learning_rate": 9.960000000000001e-06,
|
| 44 |
+
"loss": 0.0887,
|
| 45 |
+
"step": 250
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.2742230347349177,
|
| 49 |
+
"grad_norm": 0.07574323564767838,
|
| 50 |
+
"learning_rate": 1.196e-05,
|
| 51 |
+
"loss": 0.0625,
|
| 52 |
+
"step": 300
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.31992687385740404,
|
| 56 |
+
"grad_norm": 0.27218344807624817,
|
| 57 |
+
"learning_rate": 1.396e-05,
|
| 58 |
+
"loss": 0.0909,
|
| 59 |
+
"step": 350
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.3656307129798903,
|
| 63 |
+
"grad_norm": 0.07252885401248932,
|
| 64 |
+
"learning_rate": 1.5960000000000003e-05,
|
| 65 |
+
"loss": 0.0388,
|
| 66 |
+
"step": 400
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.4113345521023766,
|
| 70 |
+
"grad_norm": 1.1499181985855103,
|
| 71 |
+
"learning_rate": 1.796e-05,
|
| 72 |
+
"loss": 0.0955,
|
| 73 |
+
"step": 450
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.4570383912248629,
|
| 77 |
+
"grad_norm": 13.650275230407715,
|
| 78 |
+
"learning_rate": 1.9960000000000002e-05,
|
| 79 |
+
"loss": 0.0869,
|
| 80 |
+
"step": 500
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.5027422303473492,
|
| 84 |
+
"grad_norm": 11.625408172607422,
|
| 85 |
+
"learning_rate": 1.9647735442127967e-05,
|
| 86 |
+
"loss": 0.0851,
|
| 87 |
+
"step": 550
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.5484460694698354,
|
| 91 |
+
"grad_norm": 0.3337002992630005,
|
| 92 |
+
"learning_rate": 1.92882818116463e-05,
|
| 93 |
+
"loss": 0.103,
|
| 94 |
+
"step": 600
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.5941499085923218,
|
| 98 |
+
"grad_norm": 7.300892353057861,
|
| 99 |
+
"learning_rate": 1.892882818116463e-05,
|
| 100 |
+
"loss": 0.082,
|
| 101 |
+
"step": 650
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.6398537477148081,
|
| 105 |
+
"grad_norm": 0.24430198967456818,
|
| 106 |
+
"learning_rate": 1.8569374550682964e-05,
|
| 107 |
+
"loss": 0.0711,
|
| 108 |
+
"step": 700
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.6855575868372943,
|
| 112 |
+
"grad_norm": 15.26744270324707,
|
| 113 |
+
"learning_rate": 1.8209920920201294e-05,
|
| 114 |
+
"loss": 0.0737,
|
| 115 |
+
"step": 750
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.7312614259597806,
|
| 119 |
+
"grad_norm": 0.24188373982906342,
|
| 120 |
+
"learning_rate": 1.7850467289719628e-05,
|
| 121 |
+
"loss": 0.0668,
|
| 122 |
+
"step": 800
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.7769652650822669,
|
| 126 |
+
"grad_norm": 0.1296696811914444,
|
| 127 |
+
"learning_rate": 1.7491013659237958e-05,
|
| 128 |
+
"loss": 0.0537,
|
| 129 |
+
"step": 850
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.8226691042047533,
|
| 133 |
+
"grad_norm": 0.13343055546283722,
|
| 134 |
+
"learning_rate": 1.7131560028756292e-05,
|
| 135 |
+
"loss": 0.0785,
|
| 136 |
+
"step": 900
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.8683729433272395,
|
| 140 |
+
"grad_norm": 4.3099517822265625,
|
| 141 |
+
"learning_rate": 1.6772106398274622e-05,
|
| 142 |
+
"loss": 0.1045,
|
| 143 |
+
"step": 950
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.9140767824497258,
|
| 147 |
+
"grad_norm": 0.024240005761384964,
|
| 148 |
+
"learning_rate": 1.6412652767792956e-05,
|
| 149 |
+
"loss": 0.023,
|
| 150 |
+
"step": 1000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.9597806215722121,
|
| 154 |
+
"grad_norm": 1.5524265766143799,
|
| 155 |
+
"learning_rate": 1.605319913731129e-05,
|
| 156 |
+
"loss": 0.0541,
|
| 157 |
+
"step": 1050
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 1.0,
|
| 161 |
+
"eval_accuracy": 0.984,
|
| 162 |
+
"eval_f1": 0.9395770392749244,
|
| 163 |
+
"eval_loss": 0.06908556073904037,
|
| 164 |
+
"eval_runtime": 28.0922,
|
| 165 |
+
"eval_samples_per_second": 88.993,
|
| 166 |
+
"eval_steps_per_second": 5.589,
|
| 167 |
+
"step": 1094
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"epoch": 1.0054844606946984,
|
| 171 |
+
"grad_norm": 0.1564575880765915,
|
| 172 |
+
"learning_rate": 1.569374550682962e-05,
|
| 173 |
+
"loss": 0.066,
|
| 174 |
+
"step": 1100
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"epoch": 1.0511882998171846,
|
| 178 |
+
"grad_norm": 0.014012756757438183,
|
| 179 |
+
"learning_rate": 1.5334291876347953e-05,
|
| 180 |
+
"loss": 0.0309,
|
| 181 |
+
"step": 1150
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"epoch": 1.0968921389396709,
|
| 185 |
+
"grad_norm": 0.023974154144525528,
|
| 186 |
+
"learning_rate": 1.4974838245866285e-05,
|
| 187 |
+
"loss": 0.0341,
|
| 188 |
+
"step": 1200
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"epoch": 1.1425959780621573,
|
| 192 |
+
"grad_norm": 0.013898388482630253,
|
| 193 |
+
"learning_rate": 1.4615384615384615e-05,
|
| 194 |
+
"loss": 0.0335,
|
| 195 |
+
"step": 1250
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"epoch": 1.1882998171846435,
|
| 199 |
+
"grad_norm": 0.07936646789312363,
|
| 200 |
+
"learning_rate": 1.4255930984902949e-05,
|
| 201 |
+
"loss": 0.0479,
|
| 202 |
+
"step": 1300
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 1.2340036563071297,
|
| 206 |
+
"grad_norm": 0.10548417270183563,
|
| 207 |
+
"learning_rate": 1.389647735442128e-05,
|
| 208 |
+
"loss": 0.0481,
|
| 209 |
+
"step": 1350
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"epoch": 1.2797074954296161,
|
| 213 |
+
"grad_norm": 0.015461038798093796,
|
| 214 |
+
"learning_rate": 1.3537023723939613e-05,
|
| 215 |
+
"loss": 0.0302,
|
| 216 |
+
"step": 1400
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"epoch": 1.3254113345521024,
|
| 220 |
+
"grad_norm": 0.03913908079266548,
|
| 221 |
+
"learning_rate": 1.3177570093457945e-05,
|
| 222 |
+
"loss": 0.0196,
|
| 223 |
+
"step": 1450
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"epoch": 1.3711151736745886,
|
| 227 |
+
"grad_norm": 0.0657438263297081,
|
| 228 |
+
"learning_rate": 1.2818116462976278e-05,
|
| 229 |
+
"loss": 0.07,
|
| 230 |
+
"step": 1500
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"epoch": 1.416819012797075,
|
| 234 |
+
"grad_norm": 0.08092936873435974,
|
| 235 |
+
"learning_rate": 1.245866283249461e-05,
|
| 236 |
+
"loss": 0.0372,
|
| 237 |
+
"step": 1550
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"epoch": 1.4625228519195612,
|
| 241 |
+
"grad_norm": 0.019851330667734146,
|
| 242 |
+
"learning_rate": 1.209920920201294e-05,
|
| 243 |
+
"loss": 0.0337,
|
| 244 |
+
"step": 1600
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"epoch": 1.5082266910420477,
|
| 248 |
+
"grad_norm": 0.013996358960866928,
|
| 249 |
+
"learning_rate": 1.1739755571531272e-05,
|
| 250 |
+
"loss": 0.038,
|
| 251 |
+
"step": 1650
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"epoch": 1.5539305301645339,
|
| 255 |
+
"grad_norm": 0.011369767598807812,
|
| 256 |
+
"learning_rate": 1.1380301941049606e-05,
|
| 257 |
+
"loss": 0.0281,
|
| 258 |
+
"step": 1700
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 1.59963436928702,
|
| 262 |
+
"grad_norm": 0.07967428863048553,
|
| 263 |
+
"learning_rate": 1.1020848310567938e-05,
|
| 264 |
+
"loss": 0.0426,
|
| 265 |
+
"step": 1750
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"epoch": 1.6453382084095063,
|
| 269 |
+
"grad_norm": 0.005350353196263313,
|
| 270 |
+
"learning_rate": 1.066139468008627e-05,
|
| 271 |
+
"loss": 0.0334,
|
| 272 |
+
"step": 1800
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"epoch": 1.6910420475319927,
|
| 276 |
+
"grad_norm": 0.007268950808793306,
|
| 277 |
+
"learning_rate": 1.0301941049604602e-05,
|
| 278 |
+
"loss": 0.0341,
|
| 279 |
+
"step": 1850
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"epoch": 1.736745886654479,
|
| 283 |
+
"grad_norm": 0.007129556033760309,
|
| 284 |
+
"learning_rate": 9.942487419122934e-06,
|
| 285 |
+
"loss": 0.0139,
|
| 286 |
+
"step": 1900
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"epoch": 1.7824497257769654,
|
| 290 |
+
"grad_norm": 1.3157267570495605,
|
| 291 |
+
"learning_rate": 9.583033788641266e-06,
|
| 292 |
+
"loss": 0.0412,
|
| 293 |
+
"step": 1950
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"epoch": 1.8281535648994516,
|
| 297 |
+
"grad_norm": 6.9985222816467285,
|
| 298 |
+
"learning_rate": 9.223580158159599e-06,
|
| 299 |
+
"loss": 0.0383,
|
| 300 |
+
"step": 2000
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"epoch": 1.8738574040219378,
|
| 304 |
+
"grad_norm": 0.008648707531392574,
|
| 305 |
+
"learning_rate": 8.86412652767793e-06,
|
| 306 |
+
"loss": 0.0308,
|
| 307 |
+
"step": 2050
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"epoch": 1.919561243144424,
|
| 311 |
+
"grad_norm": 11.036811828613281,
|
| 312 |
+
"learning_rate": 8.504672897196263e-06,
|
| 313 |
+
"loss": 0.0444,
|
| 314 |
+
"step": 2100
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 1.9652650822669104,
|
| 318 |
+
"grad_norm": 0.005460981745272875,
|
| 319 |
+
"learning_rate": 8.145219266714595e-06,
|
| 320 |
+
"loss": 0.0288,
|
| 321 |
+
"step": 2150
|
| 322 |
+
},
|
| 323 |
+
{
|
| 324 |
+
"epoch": 2.0,
|
| 325 |
+
"eval_accuracy": 0.9836,
|
| 326 |
+
"eval_f1": 0.9396170839469808,
|
| 327 |
+
"eval_loss": 0.08339423686265945,
|
| 328 |
+
"eval_runtime": 28.9448,
|
| 329 |
+
"eval_samples_per_second": 86.371,
|
| 330 |
+
"eval_steps_per_second": 5.424,
|
| 331 |
+
"step": 2188
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"epoch": 2.010968921389397,
|
| 335 |
+
"grad_norm": 0.8983257412910461,
|
| 336 |
+
"learning_rate": 7.785765636232927e-06,
|
| 337 |
+
"loss": 0.0354,
|
| 338 |
+
"step": 2200
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 2.056672760511883,
|
| 342 |
+
"grad_norm": 0.10194671899080276,
|
| 343 |
+
"learning_rate": 7.426312005751259e-06,
|
| 344 |
+
"loss": 0.0195,
|
| 345 |
+
"step": 2250
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"epoch": 2.1023765996343693,
|
| 349 |
+
"grad_norm": 3.3761022090911865,
|
| 350 |
+
"learning_rate": 7.066858375269591e-06,
|
| 351 |
+
"loss": 0.0074,
|
| 352 |
+
"step": 2300
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 2.1480804387568555,
|
| 356 |
+
"grad_norm": 0.0022166408598423004,
|
| 357 |
+
"learning_rate": 6.707404744787923e-06,
|
| 358 |
+
"loss": 0.0116,
|
| 359 |
+
"step": 2350
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"epoch": 2.1937842778793417,
|
| 363 |
+
"grad_norm": 0.007358817849308252,
|
| 364 |
+
"learning_rate": 6.347951114306255e-06,
|
| 365 |
+
"loss": 0.0038,
|
| 366 |
+
"step": 2400
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 2.2394881170018284,
|
| 370 |
+
"grad_norm": 0.004738911986351013,
|
| 371 |
+
"learning_rate": 5.988497483824587e-06,
|
| 372 |
+
"loss": 0.0224,
|
| 373 |
+
"step": 2450
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"epoch": 2.2851919561243146,
|
| 377 |
+
"grad_norm": 0.003663586685433984,
|
| 378 |
+
"learning_rate": 5.629043853342919e-06,
|
| 379 |
+
"loss": 0.0122,
|
| 380 |
+
"step": 2500
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"epoch": 2.330895795246801,
|
| 384 |
+
"grad_norm": 0.010519472882151604,
|
| 385 |
+
"learning_rate": 5.269590222861252e-06,
|
| 386 |
+
"loss": 0.0081,
|
| 387 |
+
"step": 2550
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"epoch": 2.376599634369287,
|
| 391 |
+
"grad_norm": 0.007029661443084478,
|
| 392 |
+
"learning_rate": 4.910136592379584e-06,
|
| 393 |
+
"loss": 0.0302,
|
| 394 |
+
"step": 2600
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 2.422303473491773,
|
| 398 |
+
"grad_norm": 0.011014117859303951,
|
| 399 |
+
"learning_rate": 4.550682961897916e-06,
|
| 400 |
+
"loss": 0.0195,
|
| 401 |
+
"step": 2650
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"epoch": 2.4680073126142594,
|
| 405 |
+
"grad_norm": 0.006674727890640497,
|
| 406 |
+
"learning_rate": 4.191229331416248e-06,
|
| 407 |
+
"loss": 0.0094,
|
| 408 |
+
"step": 2700
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"epoch": 2.5137111517367456,
|
| 412 |
+
"grad_norm": 0.011101804673671722,
|
| 413 |
+
"learning_rate": 3.8317757009345796e-06,
|
| 414 |
+
"loss": 0.0292,
|
| 415 |
+
"step": 2750
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 2.5594149908592323,
|
| 419 |
+
"grad_norm": 0.032363053411245346,
|
| 420 |
+
"learning_rate": 3.472322070452912e-06,
|
| 421 |
+
"loss": 0.0074,
|
| 422 |
+
"step": 2800
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 2.6051188299817185,
|
| 426 |
+
"grad_norm": 0.0015758282970637083,
|
| 427 |
+
"learning_rate": 3.112868439971244e-06,
|
| 428 |
+
"loss": 0.0076,
|
| 429 |
+
"step": 2850
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"epoch": 2.6508226691042047,
|
| 433 |
+
"grad_norm": 0.0019405486527830362,
|
| 434 |
+
"learning_rate": 2.753414809489576e-06,
|
| 435 |
+
"loss": 0.0178,
|
| 436 |
+
"step": 2900
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"epoch": 2.696526508226691,
|
| 440 |
+
"grad_norm": 0.002395535819232464,
|
| 441 |
+
"learning_rate": 2.393961179007908e-06,
|
| 442 |
+
"loss": 0.0163,
|
| 443 |
+
"step": 2950
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"epoch": 2.742230347349177,
|
| 447 |
+
"grad_norm": 0.051430843770504,
|
| 448 |
+
"learning_rate": 2.0345075485262404e-06,
|
| 449 |
+
"loss": 0.0281,
|
| 450 |
+
"step": 3000
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 2.787934186471664,
|
| 454 |
+
"grad_norm": 0.002579999854788184,
|
| 455 |
+
"learning_rate": 1.6750539180445723e-06,
|
| 456 |
+
"loss": 0.0241,
|
| 457 |
+
"step": 3050
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"epoch": 2.83363802559415,
|
| 461 |
+
"grad_norm": 0.00829145684838295,
|
| 462 |
+
"learning_rate": 1.3156002875629045e-06,
|
| 463 |
+
"loss": 0.0229,
|
| 464 |
+
"step": 3100
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"epoch": 2.8793418647166362,
|
| 468 |
+
"grad_norm": 0.003575286827981472,
|
| 469 |
+
"learning_rate": 9.561466570812366e-07,
|
| 470 |
+
"loss": 0.0016,
|
| 471 |
+
"step": 3150
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 2.9250457038391224,
|
| 475 |
+
"grad_norm": 0.00501601118594408,
|
| 476 |
+
"learning_rate": 5.966930265995687e-07,
|
| 477 |
+
"loss": 0.0069,
|
| 478 |
+
"step": 3200
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 2.9707495429616086,
|
| 482 |
+
"grad_norm": 0.01910424418747425,
|
| 483 |
+
"learning_rate": 2.3723939611790082e-07,
|
| 484 |
+
"loss": 0.016,
|
| 485 |
+
"step": 3250
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"epoch": 3.0,
|
| 489 |
+
"eval_accuracy": 0.984,
|
| 490 |
+
"eval_f1": 0.9413489736070382,
|
| 491 |
+
"eval_loss": 0.08269735425710678,
|
| 492 |
+
"eval_runtime": 27.2195,
|
| 493 |
+
"eval_samples_per_second": 91.846,
|
| 494 |
+
"eval_steps_per_second": 5.768,
|
| 495 |
+
"step": 3282
|
| 496 |
+
}
|
| 497 |
+
],
|
| 498 |
+
"logging_steps": 50,
|
| 499 |
+
"max_steps": 3282,
|
| 500 |
+
"num_input_tokens_seen": 0,
|
| 501 |
+
"num_train_epochs": 3,
|
| 502 |
+
"save_steps": 500,
|
| 503 |
+
"stateful_callbacks": {
|
| 504 |
+
"TrainerControl": {
|
| 505 |
+
"args": {
|
| 506 |
+
"should_epoch_stop": false,
|
| 507 |
+
"should_evaluate": false,
|
| 508 |
+
"should_log": false,
|
| 509 |
+
"should_save": true,
|
| 510 |
+
"should_training_stop": true
|
| 511 |
+
},
|
| 512 |
+
"attributes": {}
|
| 513 |
+
}
|
| 514 |
+
},
|
| 515 |
+
"total_flos": 1.38133304064e+16,
|
| 516 |
+
"train_batch_size": 16,
|
| 517 |
+
"trial_name": null,
|
| 518 |
+
"trial_params": null
|
| 519 |
+
}
|
outputs/bert_bilabel_finetuned_model/checkpoint-3282/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
|
| 3 |
+
size 5137
|
outputs/bert_bilabel_finetuned_model/final/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"transformers_version": "5.0.0.dev0",
|
| 27 |
+
"type_vocab_size": 2,
|
| 28 |
+
"use_cache": false,
|
| 29 |
+
"vocab_size": 21128
|
| 30 |
+
}
|
outputs/bert_bilabel_finetuned_model/final/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b58a46568ff450837fbf3ee0f51fa89fd82a450959464b503f893036b86b5a01
|
| 3 |
+
size 409100240
|
outputs/bert_bilabel_finetuned_model/final/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
|
| 3 |
+
size 5137
|
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"directionality": "bidi",
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"pooler_fc_size": 768,
|
| 21 |
+
"pooler_num_attention_heads": 12,
|
| 22 |
+
"pooler_num_fc_layers": 3,
|
| 23 |
+
"pooler_size_per_head": 128,
|
| 24 |
+
"pooler_type": "first_token_transform",
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"transformers_version": "5.0.0.dev0",
|
| 27 |
+
"type_vocab_size": 2,
|
| 28 |
+
"use_cache": false,
|
| 29 |
+
"vocab_size": 21128
|
| 30 |
+
}
|
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9097c32c49180f4aa454c161c6d4f7836309cae1da6f9b0999742b6126e974a1
|
| 3 |
+
size 409100240
|
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:364f58fb6b7311d72cc67732e12056c8397d13aab84128e44fccc9a4f96440a9
|
| 3 |
+
size 15597
|
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7df46a9f83e371cdeb326e2171479963c0b2372be2b82e7056ff56b48e5999c
|
| 3 |
+
size 14645
|