Riverise committed
Commit fc9ae4e · verified · 1 Parent(s): aa54241

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +35 -0
  2. bert_finetue_task2.py +223 -0
  3. bert_finetue_task2_froze.py +247 -0
  4. bert_finetune_task1.py +126 -0
  5. bert_finetune_task1_froze.py +148 -0
  6. continue_pretrain.py +132 -0
  7. dapt_data_process.py +69 -0
  8. data_process_task1.py +89 -0
  9. data_process_task2.py +95 -0
  10. data_source.xlsx +3 -0
  11. dataset_pretrain/Experiment_sentences_training_filtered_part1.csv +3 -0
  12. dataset_pretrain/domain_corpus.txt +3 -0
  13. dataset_pretrain/预训练数据第二部分_年报.zip +3 -0
  14. model_inference_task1.py +162 -0
  15. model_inference_task2.py +153 -0
  16. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/config.json +43 -0
  17. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/model.safetensors +3 -0
  18. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/special_tokens_map.json +7 -0
  19. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer.json +0 -0
  20. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer_config.json +56 -0
  21. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/training_args.bin +3 -0
  22. outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/vocab.txt +0 -0
  23. outputs/bert_bilabel_finetuned_model/checkpoint-1094/config.json +30 -0
  24. outputs/bert_bilabel_finetuned_model/checkpoint-1094/model.safetensors +3 -0
  25. outputs/bert_bilabel_finetuned_model/checkpoint-1094/optimizer.pt +3 -0
  26. outputs/bert_bilabel_finetuned_model/checkpoint-1094/rng_state.pth +3 -0
  27. outputs/bert_bilabel_finetuned_model/checkpoint-1094/scheduler.pt +3 -0
  28. outputs/bert_bilabel_finetuned_model/checkpoint-1094/trainer_state.json +191 -0
  29. outputs/bert_bilabel_finetuned_model/checkpoint-1094/training_args.bin +3 -0
  30. outputs/bert_bilabel_finetuned_model/checkpoint-2188/config.json +30 -0
  31. outputs/bert_bilabel_finetuned_model/checkpoint-2188/model.safetensors +3 -0
  32. outputs/bert_bilabel_finetuned_model/checkpoint-2188/optimizer.pt +3 -0
  33. outputs/bert_bilabel_finetuned_model/checkpoint-2188/rng_state.pth +3 -0
  34. outputs/bert_bilabel_finetuned_model/checkpoint-2188/scheduler.pt +3 -0
  35. outputs/bert_bilabel_finetuned_model/checkpoint-2188/trainer_state.json +355 -0
  36. outputs/bert_bilabel_finetuned_model/checkpoint-2188/training_args.bin +3 -0
  37. outputs/bert_bilabel_finetuned_model/checkpoint-3282/config.json +30 -0
  38. outputs/bert_bilabel_finetuned_model/checkpoint-3282/model.safetensors +3 -0
  39. outputs/bert_bilabel_finetuned_model/checkpoint-3282/optimizer.pt +3 -0
  40. outputs/bert_bilabel_finetuned_model/checkpoint-3282/rng_state.pth +3 -0
  41. outputs/bert_bilabel_finetuned_model/checkpoint-3282/scheduler.pt +3 -0
  42. outputs/bert_bilabel_finetuned_model/checkpoint-3282/trainer_state.json +519 -0
  43. outputs/bert_bilabel_finetuned_model/checkpoint-3282/training_args.bin +3 -0
  44. outputs/bert_bilabel_finetuned_model/final/config.json +30 -0
  45. outputs/bert_bilabel_finetuned_model/final/model.safetensors +3 -0
  46. outputs/bert_bilabel_finetuned_model/final/training_args.bin +3 -0
  47. outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/config.json +30 -0
  48. outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/model.safetensors +3 -0
  49. outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/optimizer.pt +3 -0
  50. outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/rng_state.pth +3 -0
.gitattributes CHANGED
@@ -33,3 +33,38 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data_source.xlsx filter=lfs diff=lfs merge=lfs -text
+ dataset_pretrain/Experiment_sentences_training_filtered_part1.csv filter=lfs diff=lfs merge=lfs -text
+ dataset_pretrain/domain_corpus.txt filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_020110-hwo46nr1/run-hwo46nr1.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_024451-lmhafthr/run-lmhafthr.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_055509-63h4lqr8/run-63h4lqr8.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_055942-lq4qoqk3/run-lq4qoqk3.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_074324-2o6q17un/run-2o6q17un.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_075410-cilrwgz8/run-cilrwgz8.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_080542-irf9fgra/run-irf9fgra.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251113_170012-vj6z0qct/run-vj6z0qct.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_144619-0hviozok/run-0hviozok.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_145658-0lepm1if/run-0lepm1if.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_150434-0nq8ji5k/run-0nq8ji5k.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_152637-xe0tjdf6/run-xe0tjdf6.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_153529-7r2aeenh/run-7r2aeenh.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_154223-xmrnfh0j/run-xmrnfh0j.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_154302-7tit87eb/run-7tit87eb.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_161829-g1azoa0i/run-g1azoa0i.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251114_171922-j1hfy78o/run-j1hfy78o.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_023230-j4s1o16p/run-j4s1o16p.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_024020-whj9y4hx/run-whj9y4hx.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_031217-29o94la6/run-29o94la6.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_032957-oljr07ni/run-oljr07ni.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_033525-i1hsksbs/run-i1hsksbs.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_033750-ybm95q2x/run-ybm95q2x.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_034104-e4a2rovd/run-e4a2rovd.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_034702-q5cv2xfu/run-q5cv2xfu.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_034922-e287xu9n/run-e287xu9n.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_034939-zlf3muf5/run-zlf3muf5.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_035223-nehpw594/run-nehpw594.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_035728-inhxwz05/run-inhxwz05.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_035746-cmttchar/run-cmttchar.wandb filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20251115_050557-37a3t1f4/run-37a3t1f4.wandb filter=lfs diff=lfs merge=lfs -text
+ 标注数据_更正后.xlsx filter=lfs diff=lfs merge=lfs -text
bert_finetue_task2.py ADDED
@@ -0,0 +1,223 @@
import os
import torch
import numpy as np
import json
from datasets import load_dataset
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
from transformers import TrainingArguments, Trainer
from torch import nn
from sklearn.metrics import f1_score, accuracy_score, jaccard_score, precision_score, recall_score

# --- Configuration ---
DATA_DIR = "./processed_data_task2_fixed"
# MODEL_NAME = "bert-base-chinese"
TOKENIZER_NAME = "valuesimplex-ai-lab/FinBERT2-base"
MODEL_NAME = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model"
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
# Total number of labels: Data, Action, Gain, Regu, Vague
NUM_LABELS = 5
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_multilabel_model_finetuned_from_dapt"
EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 2e-5  # NOTE: the TrainingArguments below hard-code learning_rate=3e-5 instead
SEED = 42

# ----------------------------------------------------
# A. BERT model with multi-label classification support
# ----------------------------------------------------
class BertForMultiLabelClassification(BertPreTrainedModel):
    """
    BERT-based multi-label classifier trained with BCEWithLogitsLoss.
    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)

        classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)

        # Linear layer output dimension = number of labels (5)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Take the [CLS] token's hidden state (the pooler output)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        # Pass through the classifier head; logits are pre-sigmoid scores
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss requires float labels
            loss = self.loss_fct(logits, labels.float())

        return (loss, logits) if loss is not None else (logits,)

# Label names, used in the metric report
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
PREDICTION_THRESHOLD = 0.5

def compute_metrics(p):
    """
    Compute multi-label evaluation metrics, both global and per class.
    Required: loss, F1, precision, accuracy, recall.
    """
    # Loss is handled and logged by the Trainer; this function covers the rest.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids

    # Sigmoid -> probabilities
    probs = 1 / (1 + np.exp(-logits))
    # Apply the 0.5 threshold to get binary predictions
    preds = (probs > PREDICTION_THRESHOLD).astype(int)

    # ----------------------------------------
    # 1. Global aggregate metrics (used for metric_for_best_model)
    # ----------------------------------------
    metrics = {}

    # Micro-F1 (commonly used to select the best model)
    metrics["f1_micro"] = f1_score(labels, preds, average='micro')
    # Macro-F1 (balances the contribution of each class)
    metrics["f1_macro"] = f1_score(labels, preds, average='macro')
    # Sample-wise Jaccard (overlap between predicted and true label sets per sample)
    metrics["jaccard_samples"] = jaccard_score(labels, preds, average='samples')

    # ----------------------------------------
    # 2. Per-class metrics
    # ----------------------------------------
    # Note: per-class metrics in the multi-label setting are just binary metrics
    # computed column by column (one column per label).
    for i, tag in enumerate(TAG_COLS):
        y_true_class = labels[:, i]  # ground truth for the i-th label
        y_pred_class = preds[:, i]   # predictions for the i-th label

        # Naming convention: {tag}_f1, {tag}_precision, {tag}_recall, {tag}_accuracy
        metrics[f"{tag}_f1"] = f1_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        metrics[f"{tag}_precision"] = precision_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        metrics[f"{tag}_recall"] = recall_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        # Accuracy for this label alone (correct samples / total samples),
        # not exact-match accuracy over whole label vectors
        metrics[f"{tag}_accuracy"] = accuracy_score(y_true_class, y_pred_class)

    return metrics

# ----------------------------------------------------
# C. Main fine-tuning function
# ----------------------------------------------------
def finetune_multilabel_bert():

    # 1. Load the datasets
    print("--- 1. Loading datasets ---")
    data_files = {
        "train": os.path.join(DATA_DIR, "train.csv"),
        "validation": os.path.join(DATA_DIR, "validation.csv"),
        "test": os.path.join(DATA_DIR, "test.csv")
    }
    raw_datasets = load_dataset("csv", data_files=data_files)

    # 2. Load tokenizer and custom model
    print("--- 2. Loading tokenizer and custom model ---")

    tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_NAME)

    # Use the custom BertForMultiLabelClassification
    model = BertForMultiLabelClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True
    )

    # 3. Tokenize the datasets
    def tokenize_function(examples):
        # Assumes the text lives in the 'text' column
        tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

        # Convert the stringified label list from the CSV (e.g. '[1, 0, 1, 0, 0]')
        # into a list of numbers; json.loads is safer than eval()
        label_list = json.loads(examples["labels"].replace("'", "\""))
        tokenized["labels"] = label_list
        return tokenized

    # Note: batched=False so that each label string is parsed individually
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)

    # Drop the raw 'text' column
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

    # Set the PyTorch tensor format
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments
    print("--- 3. Setting up training arguments and Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=200,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        seed=SEED,
        learning_rate=3e-5,
        report_to="wandb"
    )

    # 5. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print("--- 4. Starting training ---")
    trainer.train()

    # 7. Evaluate on the test set
    print("--- 5. Evaluating on the test set ---")
    results = trainer.evaluate(test_dataset)
    print(f"Test set results: {results}")

    # 8. Save the final model
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final"))
    print(f"Model and tokenizer saved to: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_multilabel_bert()
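
A quick way to sanity-check compute_metrics is to call it outside the Trainer on made-up predictions. In this minimal sketch, SimpleNamespace stands in for the EvalPrediction object the Trainer normally passes, and the logits and labels are invented:

# Sketch: exercising compute_metrics on toy data (illustrative values only).
from types import SimpleNamespace
import numpy as np

toy_logits = np.array([[ 2.0, -1.0,  0.3, -3.0, -0.5],
                       [-1.5,  1.2, -0.7,  0.9, -2.0]])
toy_labels = np.array([[1, 0, 1, 0, 0],
                       [0, 1, 0, 1, 0]])

p = SimpleNamespace(predictions=toy_logits, label_ids=toy_labels)
# Thresholded sigmoid predictions match the labels exactly here,
# so f1_micro, f1_macro, and jaccard_samples all come out to 1.0.
print(compute_metrics(p))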
bert_finetue_task2_froze.py ADDED
@@ -0,0 +1,247 @@
import os
import torch
import numpy as np
import json
from datasets import load_dataset
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
from transformers import TrainingArguments, Trainer
from torch import nn
from sklearn.metrics import f1_score, accuracy_score, jaccard_score, precision_score, recall_score

# --- Configuration ---
DATA_DIR = "./processed_data_task2_fixed"
MODEL_NAME = "bert-base-chinese"
# MODEL_NAME = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model"
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
# Total number of labels: Data, Action, Gain, Regu, Vague
NUM_LABELS = 5
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/bert_multilabel_frozen_classifier_finetuned_model"
EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 1e-4  # NOTE: the TrainingArguments below hard-code learning_rate=3e-5 instead
SEED = 42

# ----------------------------------------------------
# A. BERT model with multi-label classification support
# ----------------------------------------------------
class BertForMultiLabelClassification(BertPreTrainedModel):
    """
    BERT-based multi-label classifier trained with BCEWithLogitsLoss.
    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)

        classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)

        # Linear layer output dimension = number of labels (5)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Take the [CLS] token's hidden state (the pooler output)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        # Pass through the classifier head; logits are pre-sigmoid scores
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss requires float labels
            loss = self.loss_fct(logits, labels.float())

        return (loss, logits) if loss is not None else (logits,)

# ----------------------------------------------------
# B. Evaluation metrics (multi-label)
# ----------------------------------------------------

# Label names, used in the metric report
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
PREDICTION_THRESHOLD = 0.5

def compute_metrics(p):
    """
    Compute multi-label evaluation metrics, both global and per class.
    Required: loss, F1, precision, accuracy, recall.
    """
    # Loss is handled and logged by the Trainer; this function covers the rest.
    # predictions may be a tuple, so unwrap the logits if needed
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids

    # Sigmoid -> probabilities
    probs = 1 / (1 + np.exp(-logits))
    # Apply the 0.5 threshold to get binary predictions
    preds = (probs > PREDICTION_THRESHOLD).astype(int)

    # ----------------------------------------
    # 1. Global aggregate metrics (used for metric_for_best_model)
    # ----------------------------------------
    metrics = {}

    # Micro-F1 (commonly used to select the best model)
    metrics["f1_micro"] = f1_score(labels, preds, average='micro')
    # Macro-F1 (balances the contribution of each class)
    metrics["f1_macro"] = f1_score(labels, preds, average='macro')
    # Sample-wise Jaccard (overlap between predicted and true label sets per sample)
    metrics["jaccard_samples"] = jaccard_score(labels, preds, average='samples')

    # ----------------------------------------
    # 2. Per-class metrics
    # ----------------------------------------
    # Note: per-class metrics in the multi-label setting are just binary metrics
    # computed column by column (one column per label).
    for i, tag in enumerate(TAG_COLS):
        y_true_class = labels[:, i]  # ground truth for the i-th label
        y_pred_class = preds[:, i]   # predictions for the i-th label

        # Naming convention: {tag}_f1, {tag}_precision, {tag}_recall, {tag}_accuracy
        metrics[f"{tag}_f1"] = f1_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        metrics[f"{tag}_precision"] = precision_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        metrics[f"{tag}_recall"] = recall_score(y_true_class, y_pred_class, average='binary', zero_division=0)
        # Accuracy for this label alone (correct samples / total samples),
        # not exact-match accuracy over whole label vectors
        metrics[f"{tag}_accuracy"] = accuracy_score(y_true_class, y_pred_class)

    return metrics

# ----------------------------------------------------
# C. Main fine-tuning function
# ----------------------------------------------------
def finetune_multilabel_bert():

    # 1. Load the datasets
    print("--- 1. Loading datasets ---")
    data_files = {
        "train": os.path.join(DATA_DIR, "train.csv"),
        "validation": os.path.join(DATA_DIR, "validation.csv"),
        "test": os.path.join(DATA_DIR, "test.csv")
    }
    raw_datasets = load_dataset("csv", data_files=data_files)

    # 2. Load tokenizer and custom model
    print("--- 2. Loading tokenizer and custom model ---")

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

    # Use the custom BertForMultiLabelClassification
    model = BertForMultiLabelClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )

    print("--- 2.1. Freezing the BERT backbone (fixed version) ---")

    trainable_params_count = 0
    for name, param in model.named_parameters():
        # Backbone parameters have names starting with 'bert.'
        if name.startswith('bert.'):
            param.requires_grad = False
        else:
            # Everything else is the classifier head (classifier.weight/bias)
            param.requires_grad = True
            trainable_params_count += param.numel()  # count trainable parameters

    # Verify that the freeze took effect
    total_params = sum(p.numel() for p in model.parameters())
    # trainable_params should match trainable_params_count computed above
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Total parameters: {total_params / 1e6:.2f} M")
    print(f"Trainable parameters (classifier head only): {trainable_params / 1e6:.6f} M")

    # 3. Tokenize the datasets
    def tokenize_function(examples):
        # Assumes the text lives in the 'text' column
        tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

        # Convert the stringified label list from the CSV (e.g. '[1, 0, 1, 0, 0]')
        # into a list of numbers; json.loads is safer than eval()
        label_list = json.loads(examples["labels"].replace("'", "\""))
        tokenized["labels"] = label_list
        return tokenized

    # Note: batched=False so that each label string is parsed individually
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)

    # Drop the raw 'text' column
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

    # Set the PyTorch tensor format
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments
    print("--- 3. Setting up training arguments and Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=200,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        seed=SEED,
        learning_rate=3e-5,
        report_to="wandb"
    )

    # 5. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print("--- 4. Starting training ---")
    trainer.train()

    # 7. Evaluate on the test set
    print("--- 5. Evaluating on the test set ---")
    results = trainer.evaluate(test_dataset)
    print(f"Test set results: {results}")

    # 8. Save the final model
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final"))
    print(f"Model and tokenizer saved to: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_multilabel_bert()
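
The freezing loop above keys on parameter names that start with 'bert.'. A minimal sketch of the same pattern on a tiny stand-in module (no model download needed; the Toy class and its layer sizes are made up) shows how the trainable count drops to the head alone:

# Sketch: freeze-by-name-prefix on a hypothetical two-layer module.
import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = nn.Linear(8, 8)        # stands in for the BERT backbone
        self.classifier = nn.Linear(8, 5)  # stands in for the classification head

model = Toy()
for name, param in model.named_parameters():
    param.requires_grad = not name.startswith('bert.')

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable} / total: {total}")  # trainable: 45 / total: 117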
bert_finetune_task1.py ADDED
@@ -0,0 +1,126 @@
import os
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import TrainingArguments, Trainer

# --- Configuration ---
# Directory containing the data prepared in the previous step
DATA_DIR = "./processed_data_task1"
# MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
MODEL_NAME = '/home/hsichen/part_time/BERT_finetune/outputs/finbert2_dapt_model'
# Number of labels for the model (0 and 1)
NUM_LABELS = 2
# Output directory for the fine-tuned model
OUTPUT_DIR = "./finbert2_bilabel_finetuned_model_from_dapt"
# Training hyperparameters
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
SEED = 42

def compute_metrics(p):
    """
    Compute evaluation metrics (accuracy and F1).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    # Use the Hugging Face evaluate library
    metric = evaluate.load("accuracy")
    accuracy = metric.compute(predictions=preds, references=labels)["accuracy"]

    # F1; precision and recall could be computed the same way
    metric_f1 = evaluate.load("f1")
    f1 = metric_f1.compute(predictions=preds, references=labels, average="binary")["f1"]

    return {
        'accuracy': accuracy,
        'f1': f1,
    }


def finetune_bert():
    """
    Run BERT fine-tuning.
    """
    # 1. Load datasets
    print("--- 1. Loading datasets ---")
    try:
        # Load the CSV files into a DatasetDict
        data_files = {
            "train": os.path.join(DATA_DIR, "train.csv"),
            "validation": os.path.join(DATA_DIR, "validation.csv"),
            "test": os.path.join(DATA_DIR, "test.csv")
        }
        raw_datasets = load_dataset("csv", data_files=data_files)
        print(raw_datasets)
    except Exception as e:
        print(f"Failed to load the datasets; check the CSV files under {DATA_DIR}: {e}")
        return

    # 2. Load tokenizer and model
    print("--- 2. Loading tokenizer and model ---")
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS
    )

    # 3. Tokenize the datasets
    def tokenize_function(examples):
        # Assumes text in the 'text' column and labels in the 'label' column
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    # Select the train/validation/test subsets
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments
    print("--- 3. Setting up training arguments and Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="epoch",        # evaluate at the end of each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,  # reload the best checkpoint after training
        metric_for_best_model="f1",
        seed=SEED,
        learning_rate=LEARNING_RATE,
        report_to="wandb"             # log training to wandb
    )

    # 5. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print("--- 4. Starting training ---")
    trainer.train()

    # 7. Evaluate on the test set
    print("--- 5. Evaluating on the test set ---")
    results = trainer.evaluate(test_dataset)
    print(f"Test set results: {results}")

    # 8. Save the final model
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    print(f"Model and tokenizer saved to: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_bert()
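
To see the shape of the result, compute_metrics can be exercised on invented predictions. This sketch assumes the evaluate metrics ("accuracy", "f1") are available locally or can be fetched on first use; SimpleNamespace stands in for the Trainer's EvalPrediction:

# Sketch: toy binary predictions (3 samples, 2 classes); all values are made up.
from types import SimpleNamespace
import numpy as np

toy_logits = np.array([[0.2, 1.3], [2.1, -0.4], [-0.5, 0.9]])  # argmax -> [1, 0, 1]
p = SimpleNamespace(predictions=toy_logits, label_ids=np.array([1, 0, 0]))
print(compute_metrics(p))  # {'accuracy': 0.666..., 'f1': 0.666...}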
bert_finetune_task1_froze.py ADDED
@@ -0,0 +1,148 @@
import os
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import TrainingArguments, Trainer

# --- Configuration ---

DATA_DIR = "./processed_data_task1"
# BERT model to use; bert-base-chinese is the usual choice for Chinese tasks

MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
# MODEL_NAME = "bert-base-chinese"
# Number of labels for the model (0 and 1)
NUM_LABELS = 2
# Output directory for the fine-tuned model
OUTPUT_DIR = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_bilabel_frozen_classifier_finetuned_model"
# Training hyperparameters
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
SEED = 42

def compute_metrics(p):
    """
    Compute evaluation metrics (accuracy and F1).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    # Use the Hugging Face evaluate library
    metric = evaluate.load("accuracy")
    accuracy = metric.compute(predictions=preds, references=labels)["accuracy"]

    # F1; precision and recall could be computed the same way
    metric_f1 = evaluate.load("f1")
    f1 = metric_f1.compute(predictions=preds, references=labels, average="binary")["f1"]

    return {
        'accuracy': accuracy,
        'f1': f1,
        # other metrics such as precision and recall can be added as needed
    }


def finetune_bert():
    """
    Run BERT fine-tuning with the backbone frozen.
    """
    # 1. Load datasets
    print("--- 1. Loading datasets ---")
    try:
        # Load the CSV files into a DatasetDict
        data_files = {
            "train": os.path.join(DATA_DIR, "train.csv"),
            "validation": os.path.join(DATA_DIR, "validation.csv"),
            "test": os.path.join(DATA_DIR, "test.csv")
        }
        raw_datasets = load_dataset("csv", data_files=data_files)
        print(raw_datasets)
    except Exception as e:
        print(f"Failed to load the datasets; check the CSV files under {DATA_DIR}: {e}")
        return

    print("--- 2. Loading tokenizer and model ---")
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS
    )

    print("--- 2.1. Freezing the BERT backbone (fixed version) ---")

    trainable_params_count = 0
    for name, param in model.named_parameters():
        # Backbone parameters have names starting with 'bert.'
        if name.startswith('bert.'):
            param.requires_grad = False
        else:
            # Everything else is the classifier head (classifier.weight/bias)
            param.requires_grad = True
            trainable_params_count += param.numel()  # count trainable parameters

    # Verify that the freeze took effect
    total_params = sum(p.numel() for p in model.parameters())
    # trainable_params should match trainable_params_count computed above
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Total parameters: {total_params / 1e6:.2f} M")
    print(f"Trainable parameters (classifier head only): {trainable_params / 1e6:.6f} M")

    # 3. Tokenize the datasets
    def tokenize_function(examples):
        # Assumes text in the 'text' column and labels in the 'label' column
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    # Select the train/validation/test subsets
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
    test_dataset = tokenized_datasets["test"]

    # 4. Training arguments
    print("--- 3. Setting up training arguments and Trainer ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="epoch",        # evaluate at the end of each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,  # reload the best checkpoint after training
        metric_for_best_model="f1",
        seed=SEED,
        learning_rate=LEARNING_RATE,
        report_to="wandb"             # log training to wandb
    )

    # 5. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 6. Train
    print("--- 4. Starting training ---")
    trainer.train()

    # 7. Evaluate on the test set
    print("--- 5. Evaluating on the test set ---")
    results = trainer.evaluate(test_dataset)
    print(f"Test set results: {results}")

    # 8. Save the final model
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    print(f"Model and tokenizer saved to: {os.path.join(OUTPUT_DIR, 'final')}")

if __name__ == "__main__":
    finetune_bert()
continue_pretrain.py ADDED
@@ -0,0 +1,132 @@
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    set_seed
)

# --- Configuration ---
DOMAIN_TEXT_FILE = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/domain_corpus.txt"
MODEL_NAME = "valuesimplex-ai-lab/FinBERT2-base"
OUTPUT_DIR = "./bert_dapt_model"

# Pre-training hyperparameters
DAPT_LR = 1e-5            # low learning rate to avoid destroying pre-trained knowledge
DAPT_EPOCHS = 3           # a moderate number of epochs
BATCH_SIZE = 16           # batch size (adjust to your GPU memory)
MLM_PROBABILITY = 0.15    # masking ratio
SEED = 42
NUM_PROC = 64             # number of parallel preprocessing workers

# Fix the random seed for reproducibility
set_seed(SEED)

def domain_adaptive_pretrain():

    # Path check
    if not os.path.exists(DOMAIN_TEXT_FILE):
        print(f"Fatal error: domain corpus not found at {DOMAIN_TEXT_FILE}. Run the data preprocessing script first.")
        return

    # 1. Load model and tokenizer
    print("--- 1. Loading model and tokenizer ---")
    # AutoTokenizer resolves the tokenizer matching the model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # AutoModelForMaskedLM provides the MLM head needed for DAPT
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

    # 2. Load and process the text dataset
    print("--- 2. Loading and processing the text dataset ---")

    # Load the plain-text file with the datasets library;
    # it must be registered under the 'train' key for the Trainer
    raw_datasets = load_dataset("text", data_files={"train": DOMAIN_TEXT_FILE})

    # Tokenization function
    def tokenize_function(examples):
        # Truncate but do not pad; DataCollatorForLanguageModeling handles padding
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            return_special_tokens_mask=True
        )

    tokenized_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=["text"], num_proc=NUM_PROC
    )

    # Chunk and group long texts
    def group_texts(examples):
        # Concatenate all texts
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])

        # Chunk size
        chunk_size = 512

        # print(f"Total length: {total_length}, after chunking: {total_length // chunk_size}")

        # Drop the final partial chunk by truncating total_length
        total_length = (total_length // chunk_size) * chunk_size

        # Split the concatenated tokens into chunks of chunk_size (512)
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }
        # Set labels to input_ids; the data collator replaces unmasked positions with -100
        result["labels"] = result["input_ids"].copy()
        return result


    # Final DAPT training dataset
    lm_datasets = tokenized_datasets.map(
        group_texts, batched=True, num_proc=NUM_PROC
    )

    # 3. Data collator (dynamic masking)
    # Applies the 15% masking randomly within each batch
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=MLM_PROBABILITY
    )

    # 4. Training arguments
    print("--- 3. Setting up training arguments ---")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=DAPT_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=DAPT_LR,
        weight_decay=0.01,
        logging_steps=50,
        save_strategy="epoch",
        report_to="wandb",
    )

    # 5. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        data_collator=data_collator,
    )

    # 6. Continue pre-training
    print("--- 4. Starting continued pre-training ---")
    trainer.train()

    # 7. Save the DAPT model
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"DAPT model saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    domain_adaptive_pretrain()
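
The chunking logic in group_texts is easiest to see on toy data. This sketch replays its core steps with a chunk size of 4 instead of 512 (the token IDs are made up):

# Sketch: concatenate, truncate to a multiple of the chunk size, then slice.
examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
chunk_size = 4

concatenated = {k: sum(examples[k], []) for k in examples}  # {'input_ids': [1..9]}
total_length = (len(concatenated["input_ids"]) // chunk_size) * chunk_size  # 9 -> 8
chunks = {k: [t[i:i + chunk_size] for i in range(0, total_length, chunk_size)]
          for k, t in concatenated.items()}
print(chunks["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- trailing token 9 is dropped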
dapt_data_process.py ADDED
@@ -0,0 +1,69 @@
import pandas as pd
import os
from typing import List

# --- Configuration ---
INPUT_CSV_PATH = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/Experiment_sentences_training_filtered_part1.csv"
# Output plain-text file (DOMAIN_TEXT_FILE in the DAPT script)
OUTPUT_TXT_PATH = "/home/hsichen/part_time/BERT_finetune/dataset_pretrain/domain_corpus.txt"
# Encoding of the plain-text file
ENCODING = 'utf-8'

def prepare_dapt_data(input_csv_path: str, output_txt_path: str, encoding: str):
    """
    Extract the 'sentence' column from a CSV file and save it as a plain-text
    file, one sentence per line.

    Args:
        input_csv_path: path to the source CSV file.
        output_txt_path: path to the target plain-text file.
        encoding: file encoding.
    """
    print(f"--- 1. Reading data: {input_csv_path} ---")

    try:
        # Read the CSV file with default settings,
        # assuming no unusual encoding issues
        df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"Error: input file not found at: {input_csv_path}")
        return
    except Exception as e:
        print(f"Error while reading the CSV file: {e}")
        return

    # --- 2. Processing and cleaning ---

    # Check that the 'sentence' column exists
    if 'sentence' not in df.columns:
        print("Error: no 'sentence' column in the CSV file. Check the column names.")
        return

    # Extract the 'sentence' column and drop NaN values
    sentences: List[str] = df['sentence'].dropna().astype(str).tolist()

    if not sentences:
        print("Warning: the 'sentence' column contains no valid data; cannot build the corpus.")
        return

    # Light cleaning: strip stray whitespace and newlines, if any
    sentences = [s.strip() for s in sentences]

    print(f"Extracted {len(sentences)} valid sentences.")

    # --- 3. Save as plain text ---
    print(f"--- 3. Saving to plain-text file: {output_txt_path} ---")

    # Write the sentences to the file, one per line
    try:
        with open(output_txt_path, 'w', encoding=encoding) as f:
            f.write('\n'.join(sentences))

        print("Data saved successfully!")

    except Exception as e:
        print(f"Error while writing the file: {e}")

# --- Entry point ---
if __name__ == "__main__":
    # Make sure pandas is installed: pip install pandas
    prepare_dapt_data(INPUT_CSV_PATH, OUTPUT_TXT_PATH, ENCODING)
data_process_task1.py ADDED
@@ -0,0 +1,89 @@
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# --- Configuration ---
EXCEL_FILE_PATH = "/home/hsichen/part_time/BERT_finetune/data_source.xlsx"
# NOTE: the task-1 fine-tuning scripts read from ./processed_data_task1
OUTPUT_DIR = "./processed_data"
# Train/test split ratio
TEST_SIZE = 0.2
# Random seed so every split is reproducible
RANDOM_SEED = 42

def preprocess_data(excel_path: str, output_dir: str, test_size: float, random_seed: int):
    """
    Read the Excel data, clean and reformat it, and split it into
    training and test sets.

    Args:
        excel_path: path to the source Excel file.
        output_dir: directory for the processed CSV files.
        test_size: fraction of the data used for the test set.
        random_seed: random seed.
    """
    print(f"--- 1. Reading data: {excel_path} ---")
    try:
        # Read the first worksheet of the Excel file
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"Error: file not found at: {excel_path}")
        return
    except Exception as e:
        print(f"Error while reading the Excel file: {e}")
        return

    # --- 2. Cleaning and reformatting ---

    # Check that the required columns exist
    required_cols = ['sentence', 'Envir']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: required columns missing from the Excel file. Columns found: {df.columns.tolist()}")
        print(f"Required columns: {required_cols}")
        return

    # Rename the columns to the usual NLP task format (text and label)
    df = df.rename(columns={'sentence': 'text', 'Envir': 'label'})

    # Make sure the 'label' column is integer-typed (0 or 1)
    df['label'] = df['label'].astype(int)

    # Keep only the 'text' and 'label' columns
    df = df[['text', 'label']].dropna()
    print(f"Number of raw rows: {len(df)}")

    # --- 3. Splitting ---
    print(f"--- Splitting the data (train: {1-test_size}, test: {test_size}) ---")

    # Split into train and test with stratified sampling to keep label ratios equal
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        stratify=df['label']
    )

    val_size_from_train = 0.1 / (1 - test_size)
    train_df, val_df = train_test_split(
        train_df,
        test_size=val_size_from_train,
        random_state=random_seed,
        stratify=train_df['label']
    )

    # --- 4. Save as CSV ---
    os.makedirs(output_dir, exist_ok=True)

    train_output_path = os.path.join(output_dir, 'train.csv')
    val_output_path = os.path.join(output_dir, 'validation.csv')
    test_output_path = os.path.join(output_dir, 'test.csv')

    train_df.to_csv(train_output_path, index=False)
    val_df.to_csv(val_output_path, index=False)
    test_df.to_csv(test_output_path, index=False)

    print("--- Results saved successfully ---")
    print(f"Training rows: {len(train_df)}. Saved to: {train_output_path}")
    print(f"Validation rows: {len(val_df)}. Saved to: {val_output_path}")
    print(f"Test rows: {len(test_df)}. Saved to: {test_output_path}")

if __name__ == "__main__":
    preprocess_data(EXCEL_FILE_PATH, OUTPUT_DIR, TEST_SIZE, RANDOM_SEED)
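
The second split's ratio, 0.1 / (1 - test_size), is chosen so the validation set ends up at 10% of the original data rather than 10% of the remaining training pool. A minimal check of the arithmetic:

# Sketch: with TEST_SIZE = 0.2 the splits come out to 70/10/20.
test_size = 0.2
val_size_from_train = 0.1 / (1 - test_size)          # 0.125 of the remaining 80%
train = (1 - test_size) * (1 - val_size_from_train)  # 0.8 * 0.875 = 0.70
val = (1 - test_size) * val_size_from_train          # 0.8 * 0.125 = 0.10
print(f"train={train:.2f}, val={val:.2f}, test={test_size:.2f}")  # train=0.70, val=0.10, test=0.20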
data_process_task2.py ADDED
@@ -0,0 +1,95 @@
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# --- Configuration ---
EXCEL_FILE_PATH = "/home/hsichen/part_time/BERT_finetune/标注数据_更正后.xlsx"
OUTPUT_DIR = "./processed_data_task2_fixed"
# Train/test split ratio
TEST_SIZE = 0.2
# Random seed so every split is reproducible
RANDOM_SEED = 42

def preprocess_data(excel_path: str, output_dir: str, test_size: float, random_seed: int):
    """
    Read the Excel data, clean and reformat it, and split it into training,
    validation, and test sets, stratified on the exact label combination.
    """
    print(f"--- 1. Reading data: {excel_path} ---")
    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"Error: file not found at: {excel_path}")
        return
    except Exception as e:
        print(f"Error while reading the Excel file: {e}")
        return

    # --- 2. Cleaning and reformatting ---

    # 1. Filter: keep only rows with Envir == 1
    df = df[df['Envir'] == 1].copy()
    print(f"Rows remaining after filtering Envir == 1: {len(df)}")

    # 2. Assemble the labels
    TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']

    # Turn the label columns into one list per row
    df['labels'] = df[TAG_COLS].values.tolist()

    df = df.rename(columns={'sentence': 'text'})

    # 3. Count the label combinations (for analysis; the logic is kept)
    print("--- 3. Label combination statistics ---")

    # Convert each label list to a tuple
    df['label_tuple'] = df['labels'].apply(tuple)

    # Convert the tuple to a string for the stratify argument of train_test_split
    # [Added/changed]: string column used for stratification
    df['stratify_col'] = df['label_tuple'].astype(str)

    print("-" * 30)

    # Keep only 'text', 'labels', and 'stratify_col' for splitting
    # (the 'label_count' column is no longer needed)
    df = df[['text', 'labels', 'stratify_col']].copy()

    # --- 4. Splitting (stratified on stratify_col) ---
    print(f"--- Splitting the data (train: {1-test_size}, test: {test_size}) ---")

    # First split: train+validation vs. test
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        # [Key change]: stratify on 'stratify_col'
        stratify=df['stratify_col']
    )

    # Second split: train vs. validation
    val_size_from_train = 0.1 / (1 - test_size)

    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_from_train,
        random_state=random_seed,
        # [Key change]: stratify on 'stratify_col'
        stratify=train_val_df['stratify_col']
    )

    # --- 5. Save as CSV ---
    os.makedirs(output_dir, exist_ok=True)

    # Save only the 'text' and 'labels' columns that BERT needs
    train_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    val_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'validation.csv'), index=False)
    test_df[['text', 'labels']].to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    print("--- Results saved successfully ---")
    print(f"Training rows: {len(train_df)}. Saved to: {os.path.join(output_dir, 'train.csv')}")
    print(f"Validation rows: {len(val_df)}. Saved to: {os.path.join(output_dir, 'validation.csv')}")
    print(f"Test rows: {len(test_df)}. Saved to: {os.path.join(output_dir, 'test.csv')}")

if __name__ == "__main__":
    # Make sure the required libraries are installed: pip install pandas openpyxl scikit-learn
    preprocess_data(EXCEL_FILE_PATH, OUTPUT_DIR, TEST_SIZE, RANDOM_SEED)
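
Stratified splitting needs one hashable key per row, which is why the script serializes each label vector to a string. A minimal sketch of the key construction on made-up rows:

# Sketch: label vectors -> tuples -> strings usable as a stratify column.
import pandas as pd

toy = pd.DataFrame({
    "text": ["a", "b", "c"],
    "labels": [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 1, 0]],
})
toy["stratify_col"] = toy["labels"].apply(tuple).astype(str)
print(toy["stratify_col"].tolist())
# ['(1, 0, 0, 0, 0)', '(1, 0, 0, 0, 0)', '(0, 1, 0, 1, 0)']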
data_source.xlsx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d691e8657afbf64b7d7e51fb69293651106ee4e890f8046bdbb588593936b45
size 4571190
dataset_pretrain/Experiment_sentences_training_filtered_part1.csv ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:165ed0ca475a1c026c609cb441ce9969d1ccf33cbec744cfe4277deffd60228e
size 1365723082
dataset_pretrain/domain_corpus.txt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1337725c4e8ea9ca886466a88d9bb9185bdbafe1100465d368919a918519db4f
size 787886543
dataset_pretrain/预训练数据第二部分_年报.zip ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff51de07b828c9d4ea132ae5e91f66dac802187bc2598bcccb3ee58a4693b3c1
size 698809156
model_inference_task1.py ADDED
@@ -0,0 +1,162 @@
import os
import torch
import numpy as np
from torch import nn
from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast, AutoConfig

# Label names, matching the training task
TAG_COLS = ['Data', 'Action', 'Gain', 'Regu', 'Vague']
PREDICTION_THRESHOLD = 0.5  # prediction threshold

# ----------------------------------------------------
# A. Multi-label BERT model (must match the training-time definition)
# ----------------------------------------------------
class BertForMultiLabelClassification(BertPreTrainedModel):
    """
    BERT-based multi-label classifier trained with BCEWithLogitsLoss.
    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # BERT backbone
        self.bert = BertModel(config)

        # Dropout ratio used during training
        classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)

        # Classifier head used during training
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        # The loss function is unused at inference time but kept for structural parity
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Take the [CLS] token's hidden state (the pooler output)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        # Pass through the classifier head; logits are pre-sigmoid scores
        logits = self.classifier(pooled_output)

        # labels is None at inference time; return the logits directly
        return logits


# ----------------------------------------------------
# B. Inference function
# ----------------------------------------------------
def predict_multilabel(checkpoint_path: str, tokenizer_path: str, text_to_predict: str):
    """
    Load a model checkpoint and run multi-label prediction on a single text.

    Args:
        checkpoint_path: BERT checkpoint directory (with config.json, model.safetensors).
        tokenizer_path: tokenizer path or name.
        text_to_predict: input text to classify.

    Returns:
        A dictionary of predicted labels and probabilities.
    """
    print(f"--- 1. Loading model and tokenizer: {checkpoint_path} ---")

    try:
        config = AutoConfig.from_pretrained(checkpoint_path)
        # Make sure num_labels in the config matches the actual label count
        if config.num_labels != len(TAG_COLS):
            # Patch num_labels at runtime in case the checkpoint's config.json disagrees
            old_num_labels = config.num_labels
            config.num_labels = len(TAG_COLS)
            print(f"Warning: num_labels in the checkpoint config corrected from {old_num_labels} to {len(TAG_COLS)}")

        # Load the tokenizer (assumes the tokenizer files exist or were copied)
        tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

        # Load the weights with the custom model class
        model = BertForMultiLabelClassification.from_pretrained(
            checkpoint_path,
            config=config  # pass in the patched config
        )
    except Exception as e:
        print(f"Failed to load the model or tokenizer; check that the path contains all required files (model.safetensors, config.json, vocab.txt): {e}")
        return None

    model.eval()  # switch to eval mode (disables dropout, etc.)

    # 2. Encode the text
    inputs = tokenizer(
        text_to_predict,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # 3. Run inference
    with torch.no_grad():
        # The model returns logits
        outputs = model(**inputs)
        logits = outputs.cpu().numpy()  # move to CPU and convert to numpy

    # 4. Post-process: sigmoid and threshold
    # Apply the sigmoid to get probabilities
    probs = 1 / (1 + np.exp(-logits))
    # Apply the threshold to get binary predictions
    preds = (probs > PREDICTION_THRESHOLD).astype(int)

    # 5. Format the output
    result = {}

    # Record the prediction and probability for every label
    for i, tag in enumerate(TAG_COLS):
        # Single sample (batch size 1)
        is_predicted = preds[0][i] == 1
        probability = probs[0][i]

        result[tag] = {
            "predicted": is_predicted,
            "probability": float(f"{probability:.4f}")  # keep 4 decimal places
        }

    print("--- 5. Prediction results ---")

    # Collect every label predicted True
    predicted_tags = [tag for tag, info in result.items() if info["predicted"]]

    if predicted_tags:
        print(f"Predicted label classes: {predicted_tags}")
        print("Probabilities:")
        for tag in predicted_tags:
            print(f"  - {tag}: {result[tag]['probability']}")
    else:
        print("No label predicted (every label's probability is below 0.5).")
        print(f"Highest probability across all labels: {max(p['probability'] for p in result.values()):.4f}")


# ----------------------------------------------------
# C. Example run
# ----------------------------------------------------
if __name__ == "__main__":
    # Replace the three values below; TOKENIZER must match MODEL
    MODEL_CHECKPOINT = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_multilabel_model_finetuned_from_dapt/final"
    TOKENIZER = 'valuesimplex-ai-lab/FinBERT2-base'
    # TOKENIZER = 'bert-base-chinese'
    SAMPLE_TEXT = "密切关注安全环保对原料市场的影响,提前落实应对预案;"

    # Make sure the checkpoint directory exists
    if not os.path.exists(MODEL_CHECKPOINT):
        print(f"Error: model checkpoint directory does not exist: {MODEL_CHECKPOINT}")
    else:
        predict_multilabel(MODEL_CHECKPOINT, TOKENIZER, SAMPLE_TEXT)
model_inference_task2.py ADDED
@@ -0,0 +1,153 @@
import os
import torch
import numpy as np
from torch import nn
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, AutoConfig, BertPreTrainedModel, BertModel

# Label names, matching the training task
BINARY_LABELS = ['Non-Envir', 'Envir']
NUM_LABELS = 2

# ----------------------------------------------------
# A. Multi-label BERT model (must match the training-time definition;
#    kept here for reference — the binary prediction below uses
#    AutoModelForSequenceClassification instead)
# ----------------------------------------------------
class BertForMultiLabelClassification(BertPreTrainedModel):
    """
    BERT-based multi-label classifier trained with BCEWithLogitsLoss.
    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # BERT backbone
        self.bert = BertModel(config)

        # Dropout ratio used during training
        classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)

        # Classifier head used during training
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()
        # The loss function is unused at inference time but kept for structural parity
        self.loss_fct = nn.BCEWithLogitsLoss()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Take the [CLS] token's hidden state (the pooler output)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        # Pass through the classifier head; logits are pre-sigmoid scores
        logits = self.classifier(pooled_output)

        # labels is None at inference time; return the logits directly
        return logits


# ----------------------------------------------------
# B. Inference function
# ----------------------------------------------------
def predict_binary_classification(checkpoint_path: str, tokenizer_path: str, text_to_predict: str):
    """
    Load a BERT binary-classification checkpoint and classify a single text.

    Args:
        checkpoint_path: BERT checkpoint directory (with config.json, model.safetensors).
        tokenizer_path: tokenizer path or name.
        text_to_predict: input text to classify.

    Returns:
        A dictionary of the predicted label and probabilities.
    """
    print(f"--- 1. Loading binary classification model and tokenizer: {checkpoint_path} ---")

    try:
        # 1. Load the config and tokenizer
        config = AutoConfig.from_pretrained(checkpoint_path, num_labels=NUM_LABELS)
        tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

        # 2. Load the model with the standard AutoModelForSequenceClassification;
        # this handles loading and any classifier-head dimension mismatch
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path,
            config=config,
            ignore_mismatched_sizes=True  # tolerate classifier-head size mismatches on load
        )
    except Exception as e:
        print(f"Failed to load the model or tokenizer; check that the path contains all required files: {e}")
        return None

    model.eval()  # switch to eval mode

    # 3. Encode the text
    inputs = tokenizer(
        text_to_predict,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # 4. Run inference
    with torch.no_grad():
        # The model returns logits (shape is typically [1, 2])
        outputs = model(**inputs)
        logits = outputs.logits  # extract the logits

    # Apply softmax to get a probability distribution
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]

    # Predicted class index (0 or 1)
    predicted_index = np.argmax(probabilities)

    # 5. Format the output

    # Predicted class name
    predicted_label = BINARY_LABELS[predicted_index]
    # Probability of the predicted class
    predicted_prob = probabilities[predicted_index]

    # Print the results
    print("--- 5. Prediction results ---")
    print(f"Input text: {text_to_predict}")
    print(f"Predicted class: {predicted_label}")
    print(f"Probability: {predicted_prob:.4f}")

    # Return the probabilities of all classes
    result = {
        'prediction': predicted_label,
        'probability': float(f"{predicted_prob:.4f}"),
        'all_probabilities': {
            BINARY_LABELS[i]: float(f"{probabilities[i]:.4f}") for i in range(NUM_LABELS)
        }
    }
    return result


# ----------------------------------------------------
# C. Example run
# ----------------------------------------------------
if __name__ == "__main__":
    # Replace the three values below; TOKENIZER must match MODEL
    MODEL_CHECKPOINT = "/home/hsichen/part_time/BERT_finetune/outputs/finbert2_bilabel_finetuned_model_from_dapt/final"
    TOKENIZER = 'valuesimplex-ai-lab/FinBERT2-base'
    # TOKENIZER = 'bert-base-chinese'
    SAMPLE_TEXT = "密切关注安全环保对原料市场的影响,提前落实应对预案;"

    # Make sure the checkpoint directory exists
    if not os.path.exists(MODEL_CHECKPOINT):
        print(f"Error: model checkpoint directory does not exist: {MODEL_CHECKPOINT}")
    else:
        predict_binary_classification(MODEL_CHECKPOINT, TOKENIZER, SAMPLE_TEXT)
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "architectures": [
+     "BertForMultiLabelClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
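The id2label/label2id maps in this config carry only the generic LABEL_0..LABEL_4 placeholders, so downstream code has to supply the real task-2 category names itself. A minimal sketch, assuming hypothetical placeholder names (the true meanings come from the task-2 training data, not this file); note that "architectures" names the custom BertForMultiLabelClassification class, so loading the weights also requires that class definition:

from transformers import AutoConfig

final_dir = "outputs/bert2_multilabel_frozen_classifier_finetuned_model/final"
label_names = ["cat_0", "cat_1", "cat_2", "cat_3", "cat_4"]  # hypothetical names

config = AutoConfig.from_pretrained(final_dir)
config.id2label = {i: n for i, n in enumerate(label_names)}
config.label2id = {n: i for i, n in enumerate(label_names)}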
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db1b29e958367916a505a9c6b0c691768326cd696d2a1f18b4977621aff808d4
+ size 409109468
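The three lines above are a Git LFS pointer, not the weights themselves; the actual ~409 MB safetensors file lives in LFS storage. One way to resolve it locally, sketched with huggingface_hub (the repo_id below is a placeholder, not the real repository name):

from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="Riverise/your-repo-name",  # placeholder: substitute the actual Hub repo
    filename="outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/model.safetensors",
)
print(weights_path)  # local path to the resolved 409 MB file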
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
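A quick sanity check for the tokenizer files above, assuming the final/ directory has been downloaded locally. Since "tokenize_chinese_chars" is true, Chinese text is split per character before WordPiece:

from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained(
    "outputs/bert2_multilabel_frozen_classifier_finetuned_model/final"
)
enc = tok("密切关注原料市场", truncation=True, max_length=512)
print(tok.convert_ids_to_tokens(enc["input_ids"]))  # starts with [CLS], ends with [SEP]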
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6021ebdaf4d143aee6780f3f7323087af8fe80c7cadc2add939b077d330f0cc
+ size 5201
outputs/bert2_multilabel_frozen_classifier_finetuned_model/final/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
outputs/bert_bilabel_finetuned_model/checkpoint-1094/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
outputs/bert_bilabel_finetuned_model/checkpoint-1094/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:008f6adad10108d3cc7a5c01474525cd308971bbfaeab910af694124fbb12750
+ size 409100240
outputs/bert_bilabel_finetuned_model/checkpoint-1094/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84929c0ce67f5a4a63810f68dd4367f3a37b4648b4c6197ee21b5810ab0529b0
+ size 818324875
outputs/bert_bilabel_finetuned_model/checkpoint-1094/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7df46a9f83e371cdeb326e2171479963c0b2372be2b82e7056ff56b48e5999c
+ size 14645
outputs/bert_bilabel_finetuned_model/checkpoint-1094/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:819f3a38dcbba4f9f621d51359778f1704914a94d1d1ba3a7961e9fbf54ac1bb
+ size 1465
outputs/bert_bilabel_finetuned_model/checkpoint-1094/trainer_state.json ADDED
@@ -0,0 +1,191 @@
+ {
+   "best_global_step": 1094,
+   "best_metric": 0.9395770392749244,
+   "best_model_checkpoint": "./bert_finetuned_model/checkpoint-1094",
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 1094,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.04570383912248629,
+       "grad_norm": 6.597176551818848,
+       "learning_rate": 1.9600000000000003e-06,
+       "loss": 0.8315,
+       "step": 50
+     },
+     {
+       "epoch": 0.09140767824497258,
+       "grad_norm": 4.10335636138916,
+       "learning_rate": 3.96e-06,
+       "loss": 0.403,
+       "step": 100
+     },
+     {
+       "epoch": 0.13711151736745886,
+       "grad_norm": 5.460880756378174,
+       "learning_rate": 5.9600000000000005e-06,
+       "loss": 0.2138,
+       "step": 150
+     },
+     {
+       "epoch": 0.18281535648994515,
+       "grad_norm": 1.7257156372070312,
+       "learning_rate": 7.960000000000002e-06,
+       "loss": 0.0675,
+       "step": 200
+     },
+     {
+       "epoch": 0.22851919561243145,
+       "grad_norm": 0.3548933267593384,
+       "learning_rate": 9.960000000000001e-06,
+       "loss": 0.0887,
+       "step": 250
+     },
+     {
+       "epoch": 0.2742230347349177,
+       "grad_norm": 0.07574323564767838,
+       "learning_rate": 1.196e-05,
+       "loss": 0.0625,
+       "step": 300
+     },
+     {
+       "epoch": 0.31992687385740404,
+       "grad_norm": 0.27218344807624817,
+       "learning_rate": 1.396e-05,
+       "loss": 0.0909,
+       "step": 350
+     },
+     {
+       "epoch": 0.3656307129798903,
+       "grad_norm": 0.07252885401248932,
+       "learning_rate": 1.5960000000000003e-05,
+       "loss": 0.0388,
+       "step": 400
+     },
+     {
+       "epoch": 0.4113345521023766,
+       "grad_norm": 1.1499181985855103,
+       "learning_rate": 1.796e-05,
+       "loss": 0.0955,
+       "step": 450
+     },
+     {
+       "epoch": 0.4570383912248629,
+       "grad_norm": 13.650275230407715,
+       "learning_rate": 1.9960000000000002e-05,
+       "loss": 0.0869,
+       "step": 500
+     },
+     {
+       "epoch": 0.5027422303473492,
+       "grad_norm": 11.625408172607422,
+       "learning_rate": 1.9647735442127967e-05,
+       "loss": 0.0851,
+       "step": 550
+     },
+     {
+       "epoch": 0.5484460694698354,
+       "grad_norm": 0.3337002992630005,
+       "learning_rate": 1.92882818116463e-05,
+       "loss": 0.103,
+       "step": 600
+     },
+     {
+       "epoch": 0.5941499085923218,
+       "grad_norm": 7.300892353057861,
+       "learning_rate": 1.892882818116463e-05,
+       "loss": 0.082,
+       "step": 650
+     },
+     {
+       "epoch": 0.6398537477148081,
+       "grad_norm": 0.24430198967456818,
+       "learning_rate": 1.8569374550682964e-05,
+       "loss": 0.0711,
+       "step": 700
+     },
+     {
+       "epoch": 0.6855575868372943,
+       "grad_norm": 15.26744270324707,
+       "learning_rate": 1.8209920920201294e-05,
+       "loss": 0.0737,
+       "step": 750
+     },
+     {
+       "epoch": 0.7312614259597806,
+       "grad_norm": 0.24188373982906342,
+       "learning_rate": 1.7850467289719628e-05,
+       "loss": 0.0668,
+       "step": 800
+     },
+     {
+       "epoch": 0.7769652650822669,
+       "grad_norm": 0.1296696811914444,
+       "learning_rate": 1.7491013659237958e-05,
+       "loss": 0.0537,
+       "step": 850
+     },
+     {
+       "epoch": 0.8226691042047533,
+       "grad_norm": 0.13343055546283722,
+       "learning_rate": 1.7131560028756292e-05,
+       "loss": 0.0785,
+       "step": 900
+     },
+     {
+       "epoch": 0.8683729433272395,
+       "grad_norm": 4.3099517822265625,
+       "learning_rate": 1.6772106398274622e-05,
+       "loss": 0.1045,
+       "step": 950
+     },
+     {
+       "epoch": 0.9140767824497258,
+       "grad_norm": 0.024240005761384964,
+       "learning_rate": 1.6412652767792956e-05,
+       "loss": 0.023,
+       "step": 1000
+     },
+     {
+       "epoch": 0.9597806215722121,
+       "grad_norm": 1.5524265766143799,
+       "learning_rate": 1.605319913731129e-05,
+       "loss": 0.0541,
+       "step": 1050
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.984,
+       "eval_f1": 0.9395770392749244,
+       "eval_loss": 0.06908556073904037,
+       "eval_runtime": 28.0922,
+       "eval_samples_per_second": 88.993,
+       "eval_steps_per_second": 5.589,
+       "step": 1094
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 3282,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 4604443468800000.0,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
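trainer_state.json above is plain JSON, so the per-epoch metrics can be pulled out without transformers installed. A minimal sketch, assuming the checkpoint directory has been downloaded locally:

import json

path = "outputs/bert_bilabel_finetuned_model/checkpoint-1094/trainer_state.json"
with open(path) as f:
    state = json.load(f)

print(state["best_model_checkpoint"], state["best_metric"])
for entry in state["log_history"]:
    if "eval_f1" in entry:  # evaluation entries; the rest are training logs
        print(entry["epoch"], entry["eval_accuracy"], entry["eval_f1"])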
outputs/bert_bilabel_finetuned_model/checkpoint-1094/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
+ size 5137
outputs/bert_bilabel_finetuned_model/checkpoint-2188/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
outputs/bert_bilabel_finetuned_model/checkpoint-2188/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de2ee416e9b57c2f5950073423afa4ce4969acef04ab4fac2e67b511ad0d7828
+ size 409100240
outputs/bert_bilabel_finetuned_model/checkpoint-2188/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3782131594e410bca55a994fe35dea25cabcc7266f22a9ffe9530377ab90826
+ size 818324875
outputs/bert_bilabel_finetuned_model/checkpoint-2188/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1c014c8f02969df2fb6b09bfa058898bb6a730c9745ecc985b52eb65b54fddb
+ size 14645
outputs/bert_bilabel_finetuned_model/checkpoint-2188/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e292baa34fe268bf54671510dc8dca778a92537e26e986b9a3f9c6b5645bd29d
+ size 1465
outputs/bert_bilabel_finetuned_model/checkpoint-2188/trainer_state.json ADDED
@@ -0,0 +1,355 @@
+ {
+   "best_global_step": 2188,
+   "best_metric": 0.9396170839469808,
+   "best_model_checkpoint": "./bert_finetuned_model/checkpoint-2188",
+   "epoch": 2.0,
+   "eval_steps": 500,
+   "global_step": 2188,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.04570383912248629,
+       "grad_norm": 6.597176551818848,
+       "learning_rate": 1.9600000000000003e-06,
+       "loss": 0.8315,
+       "step": 50
+     },
+     {
+       "epoch": 0.09140767824497258,
+       "grad_norm": 4.10335636138916,
+       "learning_rate": 3.96e-06,
+       "loss": 0.403,
+       "step": 100
+     },
+     {
+       "epoch": 0.13711151736745886,
+       "grad_norm": 5.460880756378174,
+       "learning_rate": 5.9600000000000005e-06,
+       "loss": 0.2138,
+       "step": 150
+     },
+     {
+       "epoch": 0.18281535648994515,
+       "grad_norm": 1.7257156372070312,
+       "learning_rate": 7.960000000000002e-06,
+       "loss": 0.0675,
+       "step": 200
+     },
+     {
+       "epoch": 0.22851919561243145,
+       "grad_norm": 0.3548933267593384,
+       "learning_rate": 9.960000000000001e-06,
+       "loss": 0.0887,
+       "step": 250
+     },
+     {
+       "epoch": 0.2742230347349177,
+       "grad_norm": 0.07574323564767838,
+       "learning_rate": 1.196e-05,
+       "loss": 0.0625,
+       "step": 300
+     },
+     {
+       "epoch": 0.31992687385740404,
+       "grad_norm": 0.27218344807624817,
+       "learning_rate": 1.396e-05,
+       "loss": 0.0909,
+       "step": 350
+     },
+     {
+       "epoch": 0.3656307129798903,
+       "grad_norm": 0.07252885401248932,
+       "learning_rate": 1.5960000000000003e-05,
+       "loss": 0.0388,
+       "step": 400
+     },
+     {
+       "epoch": 0.4113345521023766,
+       "grad_norm": 1.1499181985855103,
+       "learning_rate": 1.796e-05,
+       "loss": 0.0955,
+       "step": 450
+     },
+     {
+       "epoch": 0.4570383912248629,
+       "grad_norm": 13.650275230407715,
+       "learning_rate": 1.9960000000000002e-05,
+       "loss": 0.0869,
+       "step": 500
+     },
+     {
+       "epoch": 0.5027422303473492,
+       "grad_norm": 11.625408172607422,
+       "learning_rate": 1.9647735442127967e-05,
+       "loss": 0.0851,
+       "step": 550
+     },
+     {
+       "epoch": 0.5484460694698354,
+       "grad_norm": 0.3337002992630005,
+       "learning_rate": 1.92882818116463e-05,
+       "loss": 0.103,
+       "step": 600
+     },
+     {
+       "epoch": 0.5941499085923218,
+       "grad_norm": 7.300892353057861,
+       "learning_rate": 1.892882818116463e-05,
+       "loss": 0.082,
+       "step": 650
+     },
+     {
+       "epoch": 0.6398537477148081,
+       "grad_norm": 0.24430198967456818,
+       "learning_rate": 1.8569374550682964e-05,
+       "loss": 0.0711,
+       "step": 700
+     },
+     {
+       "epoch": 0.6855575868372943,
+       "grad_norm": 15.26744270324707,
+       "learning_rate": 1.8209920920201294e-05,
+       "loss": 0.0737,
+       "step": 750
+     },
+     {
+       "epoch": 0.7312614259597806,
+       "grad_norm": 0.24188373982906342,
+       "learning_rate": 1.7850467289719628e-05,
+       "loss": 0.0668,
+       "step": 800
+     },
+     {
+       "epoch": 0.7769652650822669,
+       "grad_norm": 0.1296696811914444,
+       "learning_rate": 1.7491013659237958e-05,
+       "loss": 0.0537,
+       "step": 850
+     },
+     {
+       "epoch": 0.8226691042047533,
+       "grad_norm": 0.13343055546283722,
+       "learning_rate": 1.7131560028756292e-05,
+       "loss": 0.0785,
+       "step": 900
+     },
+     {
+       "epoch": 0.8683729433272395,
+       "grad_norm": 4.3099517822265625,
+       "learning_rate": 1.6772106398274622e-05,
+       "loss": 0.1045,
+       "step": 950
+     },
+     {
+       "epoch": 0.9140767824497258,
+       "grad_norm": 0.024240005761384964,
+       "learning_rate": 1.6412652767792956e-05,
+       "loss": 0.023,
+       "step": 1000
+     },
+     {
+       "epoch": 0.9597806215722121,
+       "grad_norm": 1.5524265766143799,
+       "learning_rate": 1.605319913731129e-05,
+       "loss": 0.0541,
+       "step": 1050
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.984,
+       "eval_f1": 0.9395770392749244,
+       "eval_loss": 0.06908556073904037,
+       "eval_runtime": 28.0922,
+       "eval_samples_per_second": 88.993,
+       "eval_steps_per_second": 5.589,
+       "step": 1094
+     },
+     {
+       "epoch": 1.0054844606946984,
+       "grad_norm": 0.1564575880765915,
+       "learning_rate": 1.569374550682962e-05,
+       "loss": 0.066,
+       "step": 1100
+     },
+     {
+       "epoch": 1.0511882998171846,
+       "grad_norm": 0.014012756757438183,
+       "learning_rate": 1.5334291876347953e-05,
+       "loss": 0.0309,
+       "step": 1150
+     },
+     {
+       "epoch": 1.0968921389396709,
+       "grad_norm": 0.023974154144525528,
+       "learning_rate": 1.4974838245866285e-05,
+       "loss": 0.0341,
+       "step": 1200
+     },
+     {
+       "epoch": 1.1425959780621573,
+       "grad_norm": 0.013898388482630253,
+       "learning_rate": 1.4615384615384615e-05,
+       "loss": 0.0335,
+       "step": 1250
+     },
+     {
+       "epoch": 1.1882998171846435,
+       "grad_norm": 0.07936646789312363,
+       "learning_rate": 1.4255930984902949e-05,
+       "loss": 0.0479,
+       "step": 1300
+     },
+     {
+       "epoch": 1.2340036563071297,
+       "grad_norm": 0.10548417270183563,
+       "learning_rate": 1.389647735442128e-05,
+       "loss": 0.0481,
+       "step": 1350
+     },
+     {
+       "epoch": 1.2797074954296161,
+       "grad_norm": 0.015461038798093796,
+       "learning_rate": 1.3537023723939613e-05,
+       "loss": 0.0302,
+       "step": 1400
+     },
+     {
+       "epoch": 1.3254113345521024,
+       "grad_norm": 0.03913908079266548,
+       "learning_rate": 1.3177570093457945e-05,
+       "loss": 0.0196,
+       "step": 1450
+     },
+     {
+       "epoch": 1.3711151736745886,
+       "grad_norm": 0.0657438263297081,
+       "learning_rate": 1.2818116462976278e-05,
+       "loss": 0.07,
+       "step": 1500
+     },
+     {
+       "epoch": 1.416819012797075,
+       "grad_norm": 0.08092936873435974,
+       "learning_rate": 1.245866283249461e-05,
+       "loss": 0.0372,
+       "step": 1550
+     },
+     {
+       "epoch": 1.4625228519195612,
+       "grad_norm": 0.019851330667734146,
+       "learning_rate": 1.209920920201294e-05,
+       "loss": 0.0337,
+       "step": 1600
+     },
+     {
+       "epoch": 1.5082266910420477,
+       "grad_norm": 0.013996358960866928,
+       "learning_rate": 1.1739755571531272e-05,
+       "loss": 0.038,
+       "step": 1650
+     },
+     {
+       "epoch": 1.5539305301645339,
+       "grad_norm": 0.011369767598807812,
+       "learning_rate": 1.1380301941049606e-05,
+       "loss": 0.0281,
+       "step": 1700
+     },
+     {
+       "epoch": 1.59963436928702,
+       "grad_norm": 0.07967428863048553,
+       "learning_rate": 1.1020848310567938e-05,
+       "loss": 0.0426,
+       "step": 1750
+     },
+     {
+       "epoch": 1.6453382084095063,
+       "grad_norm": 0.005350353196263313,
+       "learning_rate": 1.066139468008627e-05,
+       "loss": 0.0334,
+       "step": 1800
+     },
+     {
+       "epoch": 1.6910420475319927,
+       "grad_norm": 0.007268950808793306,
+       "learning_rate": 1.0301941049604602e-05,
+       "loss": 0.0341,
+       "step": 1850
+     },
+     {
+       "epoch": 1.736745886654479,
+       "grad_norm": 0.007129556033760309,
+       "learning_rate": 9.942487419122934e-06,
+       "loss": 0.0139,
+       "step": 1900
+     },
+     {
+       "epoch": 1.7824497257769654,
+       "grad_norm": 1.3157267570495605,
+       "learning_rate": 9.583033788641266e-06,
+       "loss": 0.0412,
+       "step": 1950
+     },
+     {
+       "epoch": 1.8281535648994516,
+       "grad_norm": 6.9985222816467285,
+       "learning_rate": 9.223580158159599e-06,
+       "loss": 0.0383,
+       "step": 2000
+     },
+     {
+       "epoch": 1.8738574040219378,
+       "grad_norm": 0.008648707531392574,
+       "learning_rate": 8.86412652767793e-06,
+       "loss": 0.0308,
+       "step": 2050
+     },
+     {
+       "epoch": 1.919561243144424,
+       "grad_norm": 11.036811828613281,
+       "learning_rate": 8.504672897196263e-06,
+       "loss": 0.0444,
+       "step": 2100
+     },
+     {
+       "epoch": 1.9652650822669104,
+       "grad_norm": 0.005460981745272875,
+       "learning_rate": 8.145219266714595e-06,
+       "loss": 0.0288,
+       "step": 2150
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.9836,
+       "eval_f1": 0.9396170839469808,
+       "eval_loss": 0.08339423686265945,
+       "eval_runtime": 28.9448,
+       "eval_samples_per_second": 86.371,
+       "eval_steps_per_second": 5.424,
+       "step": 2188
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 3282,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 9208886937600000.0,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
outputs/bert_bilabel_finetuned_model/checkpoint-2188/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
+ size 5137
outputs/bert_bilabel_finetuned_model/checkpoint-3282/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
outputs/bert_bilabel_finetuned_model/checkpoint-3282/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b58a46568ff450837fbf3ee0f51fa89fd82a450959464b503f893036b86b5a01
+ size 409100240
outputs/bert_bilabel_finetuned_model/checkpoint-3282/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb1ea6fc0ae5e09b8a3be646658110ca41c0e6fc08b68cab2ddeb74c0ae82d38
+ size 818324875
outputs/bert_bilabel_finetuned_model/checkpoint-3282/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5412faccf347e6ccc0399ab61829229374cd91c9d7662d44fcb0bb456d151a0d
+ size 14645
outputs/bert_bilabel_finetuned_model/checkpoint-3282/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73208a74cd8690df2f7dd0a81633cdfde37ecda9e183b1a86782b8075ba454d0
+ size 1465
outputs/bert_bilabel_finetuned_model/checkpoint-3282/trainer_state.json ADDED
@@ -0,0 +1,519 @@
+ {
+   "best_global_step": 3282,
+   "best_metric": 0.9413489736070382,
+   "best_model_checkpoint": "./bert_finetuned_model/checkpoint-3282",
+   "epoch": 3.0,
+   "eval_steps": 500,
+   "global_step": 3282,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.04570383912248629,
+       "grad_norm": 6.597176551818848,
+       "learning_rate": 1.9600000000000003e-06,
+       "loss": 0.8315,
+       "step": 50
+     },
+     {
+       "epoch": 0.09140767824497258,
+       "grad_norm": 4.10335636138916,
+       "learning_rate": 3.96e-06,
+       "loss": 0.403,
+       "step": 100
+     },
+     {
+       "epoch": 0.13711151736745886,
+       "grad_norm": 5.460880756378174,
+       "learning_rate": 5.9600000000000005e-06,
+       "loss": 0.2138,
+       "step": 150
+     },
+     {
+       "epoch": 0.18281535648994515,
+       "grad_norm": 1.7257156372070312,
+       "learning_rate": 7.960000000000002e-06,
+       "loss": 0.0675,
+       "step": 200
+     },
+     {
+       "epoch": 0.22851919561243145,
+       "grad_norm": 0.3548933267593384,
+       "learning_rate": 9.960000000000001e-06,
+       "loss": 0.0887,
+       "step": 250
+     },
+     {
+       "epoch": 0.2742230347349177,
+       "grad_norm": 0.07574323564767838,
+       "learning_rate": 1.196e-05,
+       "loss": 0.0625,
+       "step": 300
+     },
+     {
+       "epoch": 0.31992687385740404,
+       "grad_norm": 0.27218344807624817,
+       "learning_rate": 1.396e-05,
+       "loss": 0.0909,
+       "step": 350
+     },
+     {
+       "epoch": 0.3656307129798903,
+       "grad_norm": 0.07252885401248932,
+       "learning_rate": 1.5960000000000003e-05,
+       "loss": 0.0388,
+       "step": 400
+     },
+     {
+       "epoch": 0.4113345521023766,
+       "grad_norm": 1.1499181985855103,
+       "learning_rate": 1.796e-05,
+       "loss": 0.0955,
+       "step": 450
+     },
+     {
+       "epoch": 0.4570383912248629,
+       "grad_norm": 13.650275230407715,
+       "learning_rate": 1.9960000000000002e-05,
+       "loss": 0.0869,
+       "step": 500
+     },
+     {
+       "epoch": 0.5027422303473492,
+       "grad_norm": 11.625408172607422,
+       "learning_rate": 1.9647735442127967e-05,
+       "loss": 0.0851,
+       "step": 550
+     },
+     {
+       "epoch": 0.5484460694698354,
+       "grad_norm": 0.3337002992630005,
+       "learning_rate": 1.92882818116463e-05,
+       "loss": 0.103,
+       "step": 600
+     },
+     {
+       "epoch": 0.5941499085923218,
+       "grad_norm": 7.300892353057861,
+       "learning_rate": 1.892882818116463e-05,
+       "loss": 0.082,
+       "step": 650
+     },
+     {
+       "epoch": 0.6398537477148081,
+       "grad_norm": 0.24430198967456818,
+       "learning_rate": 1.8569374550682964e-05,
+       "loss": 0.0711,
+       "step": 700
+     },
+     {
+       "epoch": 0.6855575868372943,
+       "grad_norm": 15.26744270324707,
+       "learning_rate": 1.8209920920201294e-05,
+       "loss": 0.0737,
+       "step": 750
+     },
+     {
+       "epoch": 0.7312614259597806,
+       "grad_norm": 0.24188373982906342,
+       "learning_rate": 1.7850467289719628e-05,
+       "loss": 0.0668,
+       "step": 800
+     },
+     {
+       "epoch": 0.7769652650822669,
+       "grad_norm": 0.1296696811914444,
+       "learning_rate": 1.7491013659237958e-05,
+       "loss": 0.0537,
+       "step": 850
+     },
+     {
+       "epoch": 0.8226691042047533,
+       "grad_norm": 0.13343055546283722,
+       "learning_rate": 1.7131560028756292e-05,
+       "loss": 0.0785,
+       "step": 900
+     },
+     {
+       "epoch": 0.8683729433272395,
+       "grad_norm": 4.3099517822265625,
+       "learning_rate": 1.6772106398274622e-05,
+       "loss": 0.1045,
+       "step": 950
+     },
+     {
+       "epoch": 0.9140767824497258,
+       "grad_norm": 0.024240005761384964,
+       "learning_rate": 1.6412652767792956e-05,
+       "loss": 0.023,
+       "step": 1000
+     },
+     {
+       "epoch": 0.9597806215722121,
+       "grad_norm": 1.5524265766143799,
+       "learning_rate": 1.605319913731129e-05,
+       "loss": 0.0541,
+       "step": 1050
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.984,
+       "eval_f1": 0.9395770392749244,
+       "eval_loss": 0.06908556073904037,
+       "eval_runtime": 28.0922,
+       "eval_samples_per_second": 88.993,
+       "eval_steps_per_second": 5.589,
+       "step": 1094
+     },
+     {
+       "epoch": 1.0054844606946984,
+       "grad_norm": 0.1564575880765915,
+       "learning_rate": 1.569374550682962e-05,
+       "loss": 0.066,
+       "step": 1100
+     },
+     {
+       "epoch": 1.0511882998171846,
+       "grad_norm": 0.014012756757438183,
+       "learning_rate": 1.5334291876347953e-05,
+       "loss": 0.0309,
+       "step": 1150
+     },
+     {
+       "epoch": 1.0968921389396709,
+       "grad_norm": 0.023974154144525528,
+       "learning_rate": 1.4974838245866285e-05,
+       "loss": 0.0341,
+       "step": 1200
+     },
+     {
+       "epoch": 1.1425959780621573,
+       "grad_norm": 0.013898388482630253,
+       "learning_rate": 1.4615384615384615e-05,
+       "loss": 0.0335,
+       "step": 1250
+     },
+     {
+       "epoch": 1.1882998171846435,
+       "grad_norm": 0.07936646789312363,
+       "learning_rate": 1.4255930984902949e-05,
+       "loss": 0.0479,
+       "step": 1300
+     },
+     {
+       "epoch": 1.2340036563071297,
+       "grad_norm": 0.10548417270183563,
+       "learning_rate": 1.389647735442128e-05,
+       "loss": 0.0481,
+       "step": 1350
+     },
+     {
+       "epoch": 1.2797074954296161,
+       "grad_norm": 0.015461038798093796,
+       "learning_rate": 1.3537023723939613e-05,
+       "loss": 0.0302,
+       "step": 1400
+     },
+     {
+       "epoch": 1.3254113345521024,
+       "grad_norm": 0.03913908079266548,
+       "learning_rate": 1.3177570093457945e-05,
+       "loss": 0.0196,
+       "step": 1450
+     },
+     {
+       "epoch": 1.3711151736745886,
+       "grad_norm": 0.0657438263297081,
+       "learning_rate": 1.2818116462976278e-05,
+       "loss": 0.07,
+       "step": 1500
+     },
+     {
+       "epoch": 1.416819012797075,
+       "grad_norm": 0.08092936873435974,
+       "learning_rate": 1.245866283249461e-05,
+       "loss": 0.0372,
+       "step": 1550
+     },
+     {
+       "epoch": 1.4625228519195612,
+       "grad_norm": 0.019851330667734146,
+       "learning_rate": 1.209920920201294e-05,
+       "loss": 0.0337,
+       "step": 1600
+     },
+     {
+       "epoch": 1.5082266910420477,
+       "grad_norm": 0.013996358960866928,
+       "learning_rate": 1.1739755571531272e-05,
+       "loss": 0.038,
+       "step": 1650
+     },
+     {
+       "epoch": 1.5539305301645339,
+       "grad_norm": 0.011369767598807812,
+       "learning_rate": 1.1380301941049606e-05,
+       "loss": 0.0281,
+       "step": 1700
+     },
+     {
+       "epoch": 1.59963436928702,
+       "grad_norm": 0.07967428863048553,
+       "learning_rate": 1.1020848310567938e-05,
+       "loss": 0.0426,
+       "step": 1750
+     },
+     {
+       "epoch": 1.6453382084095063,
+       "grad_norm": 0.005350353196263313,
+       "learning_rate": 1.066139468008627e-05,
+       "loss": 0.0334,
+       "step": 1800
+     },
+     {
+       "epoch": 1.6910420475319927,
+       "grad_norm": 0.007268950808793306,
+       "learning_rate": 1.0301941049604602e-05,
+       "loss": 0.0341,
+       "step": 1850
+     },
+     {
+       "epoch": 1.736745886654479,
+       "grad_norm": 0.007129556033760309,
+       "learning_rate": 9.942487419122934e-06,
+       "loss": 0.0139,
+       "step": 1900
+     },
+     {
+       "epoch": 1.7824497257769654,
+       "grad_norm": 1.3157267570495605,
+       "learning_rate": 9.583033788641266e-06,
+       "loss": 0.0412,
+       "step": 1950
+     },
+     {
+       "epoch": 1.8281535648994516,
+       "grad_norm": 6.9985222816467285,
+       "learning_rate": 9.223580158159599e-06,
+       "loss": 0.0383,
+       "step": 2000
+     },
+     {
+       "epoch": 1.8738574040219378,
+       "grad_norm": 0.008648707531392574,
+       "learning_rate": 8.86412652767793e-06,
+       "loss": 0.0308,
+       "step": 2050
+     },
+     {
+       "epoch": 1.919561243144424,
+       "grad_norm": 11.036811828613281,
+       "learning_rate": 8.504672897196263e-06,
+       "loss": 0.0444,
+       "step": 2100
+     },
+     {
+       "epoch": 1.9652650822669104,
+       "grad_norm": 0.005460981745272875,
+       "learning_rate": 8.145219266714595e-06,
+       "loss": 0.0288,
+       "step": 2150
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.9836,
+       "eval_f1": 0.9396170839469808,
+       "eval_loss": 0.08339423686265945,
+       "eval_runtime": 28.9448,
+       "eval_samples_per_second": 86.371,
+       "eval_steps_per_second": 5.424,
+       "step": 2188
+     },
+     {
+       "epoch": 2.010968921389397,
+       "grad_norm": 0.8983257412910461,
+       "learning_rate": 7.785765636232927e-06,
+       "loss": 0.0354,
+       "step": 2200
+     },
+     {
+       "epoch": 2.056672760511883,
+       "grad_norm": 0.10194671899080276,
+       "learning_rate": 7.426312005751259e-06,
+       "loss": 0.0195,
+       "step": 2250
+     },
+     {
+       "epoch": 2.1023765996343693,
+       "grad_norm": 3.3761022090911865,
+       "learning_rate": 7.066858375269591e-06,
+       "loss": 0.0074,
+       "step": 2300
+     },
+     {
+       "epoch": 2.1480804387568555,
+       "grad_norm": 0.0022166408598423004,
+       "learning_rate": 6.707404744787923e-06,
+       "loss": 0.0116,
+       "step": 2350
+     },
+     {
+       "epoch": 2.1937842778793417,
+       "grad_norm": 0.007358817849308252,
+       "learning_rate": 6.347951114306255e-06,
+       "loss": 0.0038,
+       "step": 2400
+     },
+     {
+       "epoch": 2.2394881170018284,
+       "grad_norm": 0.004738911986351013,
+       "learning_rate": 5.988497483824587e-06,
+       "loss": 0.0224,
+       "step": 2450
+     },
+     {
+       "epoch": 2.2851919561243146,
+       "grad_norm": 0.003663586685433984,
+       "learning_rate": 5.629043853342919e-06,
+       "loss": 0.0122,
+       "step": 2500
+     },
+     {
+       "epoch": 2.330895795246801,
+       "grad_norm": 0.010519472882151604,
+       "learning_rate": 5.269590222861252e-06,
+       "loss": 0.0081,
+       "step": 2550
+     },
+     {
+       "epoch": 2.376599634369287,
+       "grad_norm": 0.007029661443084478,
+       "learning_rate": 4.910136592379584e-06,
+       "loss": 0.0302,
+       "step": 2600
+     },
+     {
+       "epoch": 2.422303473491773,
+       "grad_norm": 0.011014117859303951,
+       "learning_rate": 4.550682961897916e-06,
+       "loss": 0.0195,
+       "step": 2650
+     },
+     {
+       "epoch": 2.4680073126142594,
+       "grad_norm": 0.006674727890640497,
+       "learning_rate": 4.191229331416248e-06,
+       "loss": 0.0094,
+       "step": 2700
+     },
+     {
+       "epoch": 2.5137111517367456,
+       "grad_norm": 0.011101804673671722,
+       "learning_rate": 3.8317757009345796e-06,
+       "loss": 0.0292,
+       "step": 2750
+     },
+     {
+       "epoch": 2.5594149908592323,
+       "grad_norm": 0.032363053411245346,
+       "learning_rate": 3.472322070452912e-06,
+       "loss": 0.0074,
+       "step": 2800
+     },
+     {
+       "epoch": 2.6051188299817185,
+       "grad_norm": 0.0015758282970637083,
+       "learning_rate": 3.112868439971244e-06,
+       "loss": 0.0076,
+       "step": 2850
+     },
+     {
+       "epoch": 2.6508226691042047,
+       "grad_norm": 0.0019405486527830362,
+       "learning_rate": 2.753414809489576e-06,
+       "loss": 0.0178,
+       "step": 2900
+     },
+     {
+       "epoch": 2.696526508226691,
+       "grad_norm": 0.002395535819232464,
+       "learning_rate": 2.393961179007908e-06,
+       "loss": 0.0163,
+       "step": 2950
+     },
+     {
+       "epoch": 2.742230347349177,
+       "grad_norm": 0.051430843770504,
+       "learning_rate": 2.0345075485262404e-06,
+       "loss": 0.0281,
+       "step": 3000
+     },
+     {
+       "epoch": 2.787934186471664,
+       "grad_norm": 0.002579999854788184,
+       "learning_rate": 1.6750539180445723e-06,
+       "loss": 0.0241,
+       "step": 3050
+     },
+     {
+       "epoch": 2.83363802559415,
+       "grad_norm": 0.00829145684838295,
+       "learning_rate": 1.3156002875629045e-06,
+       "loss": 0.0229,
+       "step": 3100
+     },
+     {
+       "epoch": 2.8793418647166362,
+       "grad_norm": 0.003575286827981472,
+       "learning_rate": 9.561466570812366e-07,
+       "loss": 0.0016,
+       "step": 3150
+     },
+     {
+       "epoch": 2.9250457038391224,
+       "grad_norm": 0.00501601118594408,
+       "learning_rate": 5.966930265995687e-07,
+       "loss": 0.0069,
+       "step": 3200
+     },
+     {
+       "epoch": 2.9707495429616086,
+       "grad_norm": 0.01910424418747425,
+       "learning_rate": 2.3723939611790082e-07,
+       "loss": 0.016,
+       "step": 3250
+     },
+     {
+       "epoch": 3.0,
+       "eval_accuracy": 0.984,
+       "eval_f1": 0.9413489736070382,
+       "eval_loss": 0.08269735425710678,
+       "eval_runtime": 27.2195,
+       "eval_samples_per_second": 91.846,
+       "eval_steps_per_second": 5.768,
+       "step": 3282
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 3282,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.38133304064e+16,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
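This epoch-3 state holds the full loss curve for the run (eval F1 moves from 0.9396 to 0.9413 between epochs 2 and 3, which is why checkpoint-3282 is the best model). A sketch for plotting the curve, assuming matplotlib is installed and the checkpoint is local:

import json
import matplotlib.pyplot as plt

with open("outputs/bert_bilabel_finetuned_model/checkpoint-3282/trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]  # training entries only
plt.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs])
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("bilabel_loss_curve.png")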
outputs/bert_bilabel_finetuned_model/checkpoint-3282/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
+ size 5137
outputs/bert_bilabel_finetuned_model/final/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
outputs/bert_bilabel_finetuned_model/final/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b58a46568ff450837fbf3ee0f51fa89fd82a450959464b503f893036b86b5a01
+ size 409100240
outputs/bert_bilabel_finetuned_model/final/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525b07a66e715289db75a841e0609901e3ee221ba4268c678c362a7bbb781388
+ size 5137
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 21128
+ }
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9097c32c49180f4aa454c161c6d4f7836309cae1da6f9b0999742b6126e974a1
+ size 409100240
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:364f58fb6b7311d72cc67732e12056c8397d13aab84128e44fccc9a4f96440a9
+ size 15597
outputs/bert_bilabel_frozen_classifier_finetuned_model/checkpoint-1094/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7df46a9f83e371cdeb326e2171479963c0b2372be2b82e7056ff56b48e5999c
+ size 14645