heavyhelium committed on
Commit
568ff69
·
verified ·
1 Parent(s): 75edf99

Training in progress, epoch 5, checkpoint

Browse files
checkpoint-235/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "ElectraForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "embedding_size": 128,
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 256,
15
+ "id2label": {
16
+ "0": "valid",
17
+ "1": "fallacy"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 1024,
21
+ "is_decoder": false,
22
+ "label2id": {
23
+ "fallacy": 1,
24
+ "valid": 0
25
+ },
26
+ "layer_norm_eps": 1e-12,
27
+ "max_position_embeddings": 512,
28
+ "model_type": "electra",
29
+ "num_attention_heads": 4,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "summary_activation": "gelu",
35
+ "summary_last_dropout": 0.1,
36
+ "summary_type": "first",
37
+ "summary_use_proj": true,
38
+ "tie_word_embeddings": true,
39
+ "transformers_version": "5.9.0",
40
+ "type_vocab_size": 2,
41
+ "use_cache": false,
42
+ "vocab_size": 30522
43
+ }
checkpoint-235/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e74d60afcb36718d989af9c5824bb37f7abc35e87e5694d9a539c43839d15c2
3
+ size 54221200
checkpoint-235/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4afc547d5ed9672a6ea581cbcefe90301f991dab29500acea180f20bf8c06eaf
3
+ size 108567563
checkpoint-235/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c7022145e75b26fb5453057f190dec34ae3ed32e656c97df434cf0d46721ea2
3
+ size 14645
checkpoint-235/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ebf75bfdec9cd91636c8d83fe2e1ebad7190a36eafe90e3fd6e6f00a6e3b2ff
3
+ size 1383
checkpoint-235/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44fb94a88c15d58ffa1bf978fccff6134483f7613a240fb8f9d498a06f59dd4a
3
+ size 1465
checkpoint-235/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-235/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "local_files_only": false,
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
checkpoint-235/trainer_state.json ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 235,
3
+ "best_metric": 0.8399359743897559,
4
+ "best_model_checkpoint": "models/electra-small-touche-enhanced-binary/trainer/checkpoint-235",
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 235,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.2127659574468085,
14
+ "grad_norm": 0.7465218901634216,
15
+ "learning_rate": 1.125e-05,
16
+ "loss": 0.694000244140625,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.425531914893617,
21
+ "grad_norm": 1.2217258214950562,
22
+ "learning_rate": 2.3749999999999998e-05,
23
+ "loss": 0.6908843994140625,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.6382978723404256,
28
+ "grad_norm": 0.6178316473960876,
29
+ "learning_rate": 2.928909952606635e-05,
30
+ "loss": 0.6861663818359375,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.851063829787234,
35
+ "grad_norm": 2.0509798526763916,
36
+ "learning_rate": 2.7867298578199053e-05,
37
+ "loss": 0.6931976318359375,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_accuracy": 0.65,
43
+ "eval_fallacy_f1": 0.7307692307692307,
44
+ "eval_loss": 0.6805615425109863,
45
+ "eval_macro_f1": 0.6153846153846154,
46
+ "eval_runtime": 0.2623,
47
+ "eval_samples_per_second": 762.415,
48
+ "eval_steps_per_second": 15.248,
49
+ "step": 47
50
+ },
51
+ {
52
+ "epoch": 1.0638297872340425,
53
+ "grad_norm": 0.7195119261741638,
54
+ "learning_rate": 2.6445497630331753e-05,
55
+ "loss": 0.6897735595703125,
56
+ "step": 50
57
+ },
58
+ {
59
+ "epoch": 1.2765957446808511,
60
+ "grad_norm": 1.8138525485992432,
61
+ "learning_rate": 2.5023696682464456e-05,
62
+ "loss": 0.6711883544921875,
63
+ "step": 60
64
+ },
65
+ {
66
+ "epoch": 1.4893617021276595,
67
+ "grad_norm": 1.5502103567123413,
68
+ "learning_rate": 2.360189573459716e-05,
69
+ "loss": 0.66123046875,
70
+ "step": 70
71
+ },
72
+ {
73
+ "epoch": 1.702127659574468,
74
+ "grad_norm": 1.2442517280578613,
75
+ "learning_rate": 2.2180094786729858e-05,
76
+ "loss": 0.6327056884765625,
77
+ "step": 80
78
+ },
79
+ {
80
+ "epoch": 1.9148936170212765,
81
+ "grad_norm": 3.835188150405884,
82
+ "learning_rate": 2.075829383886256e-05,
83
+ "loss": 0.615692138671875,
84
+ "step": 90
85
+ },
86
+ {
87
+ "epoch": 2.0,
88
+ "eval_accuracy": 0.77,
89
+ "eval_fallacy_f1": 0.7604166666666666,
90
+ "eval_loss": 0.5881457328796387,
91
+ "eval_macro_f1": 0.7696314102564102,
92
+ "eval_runtime": 0.2664,
93
+ "eval_samples_per_second": 750.698,
94
+ "eval_steps_per_second": 15.014,
95
+ "step": 94
96
+ },
97
+ {
98
+ "epoch": 2.127659574468085,
99
+ "grad_norm": 1.659440279006958,
100
+ "learning_rate": 1.933649289099526e-05,
101
+ "loss": 0.6120407104492187,
102
+ "step": 100
103
+ },
104
+ {
105
+ "epoch": 2.3404255319148937,
106
+ "grad_norm": 1.9919922351837158,
107
+ "learning_rate": 1.791469194312796e-05,
108
+ "loss": 0.5680572509765625,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 2.5531914893617023,
113
+ "grad_norm": 2.1114556789398193,
114
+ "learning_rate": 1.6492890995260666e-05,
115
+ "loss": 0.5408950805664062,
116
+ "step": 120
117
+ },
118
+ {
119
+ "epoch": 2.7659574468085104,
120
+ "grad_norm": 2.3676552772521973,
121
+ "learning_rate": 1.5071090047393365e-05,
122
+ "loss": 0.49699249267578127,
123
+ "step": 130
124
+ },
125
+ {
126
+ "epoch": 2.978723404255319,
127
+ "grad_norm": 2.613635301589966,
128
+ "learning_rate": 1.3649289099526066e-05,
129
+ "loss": 0.4867919921875,
130
+ "step": 140
131
+ },
132
+ {
133
+ "epoch": 3.0,
134
+ "eval_accuracy": 0.82,
135
+ "eval_fallacy_f1": 0.8181818181818182,
136
+ "eval_loss": 0.4984106421470642,
137
+ "eval_macro_f1": 0.81998199819982,
138
+ "eval_runtime": 0.5325,
139
+ "eval_samples_per_second": 375.585,
140
+ "eval_steps_per_second": 7.512,
141
+ "step": 141
142
+ },
143
+ {
144
+ "epoch": 3.1914893617021276,
145
+ "grad_norm": 5.378300189971924,
146
+ "learning_rate": 1.2227488151658767e-05,
147
+ "loss": 0.47115631103515626,
148
+ "step": 150
149
+ },
150
+ {
151
+ "epoch": 3.404255319148936,
152
+ "grad_norm": 2.245528221130371,
153
+ "learning_rate": 1.080568720379147e-05,
154
+ "loss": 0.43046722412109373,
155
+ "step": 160
156
+ },
157
+ {
158
+ "epoch": 3.617021276595745,
159
+ "grad_norm": 3.435818672180176,
160
+ "learning_rate": 9.383886255924171e-06,
161
+ "loss": 0.4158058166503906,
162
+ "step": 170
163
+ },
164
+ {
165
+ "epoch": 3.829787234042553,
166
+ "grad_norm": 3.552385091781616,
167
+ "learning_rate": 7.962085308056872e-06,
168
+ "loss": 0.4091644287109375,
169
+ "step": 180
170
+ },
171
+ {
172
+ "epoch": 4.0,
173
+ "eval_accuracy": 0.81,
174
+ "eval_fallacy_f1": 0.8207547169811321,
175
+ "eval_loss": 0.4597650170326233,
176
+ "eval_macro_f1": 0.809313528703332,
177
+ "eval_runtime": 0.2614,
178
+ "eval_samples_per_second": 765.065,
179
+ "eval_steps_per_second": 15.301,
180
+ "step": 188
181
+ },
182
+ {
183
+ "epoch": 4.042553191489362,
184
+ "grad_norm": 3.47029447555542,
185
+ "learning_rate": 6.5402843601895735e-06,
186
+ "loss": 0.40631790161132814,
187
+ "step": 190
188
+ },
189
+ {
190
+ "epoch": 4.25531914893617,
191
+ "grad_norm": 5.288370609283447,
192
+ "learning_rate": 5.1184834123222755e-06,
193
+ "loss": 0.3656455993652344,
194
+ "step": 200
195
+ },
196
+ {
197
+ "epoch": 4.468085106382979,
198
+ "grad_norm": 3.9861114025115967,
199
+ "learning_rate": 3.696682464454976e-06,
200
+ "loss": 0.3499275207519531,
201
+ "step": 210
202
+ },
203
+ {
204
+ "epoch": 4.680851063829787,
205
+ "grad_norm": 3.6823275089263916,
206
+ "learning_rate": 2.274881516587678e-06,
207
+ "loss": 0.38124618530273435,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 4.8936170212765955,
212
+ "grad_norm": 2.9968783855438232,
213
+ "learning_rate": 8.530805687203791e-07,
214
+ "loss": 0.3715087890625,
215
+ "step": 230
216
+ },
217
+ {
218
+ "epoch": 5.0,
219
+ "eval_accuracy": 0.84,
220
+ "eval_fallacy_f1": 0.8431372549019608,
221
+ "eval_loss": 0.43988463282585144,
222
+ "eval_macro_f1": 0.8399359743897559,
223
+ "eval_runtime": 0.2604,
224
+ "eval_samples_per_second": 768.008,
225
+ "eval_steps_per_second": 15.36,
226
+ "step": 235
227
+ }
228
+ ],
229
+ "logging_steps": 10,
230
+ "max_steps": 235,
231
+ "num_input_tokens_seen": 0,
232
+ "num_train_epochs": 5,
233
+ "save_steps": 500,
234
+ "stateful_callbacks": {
235
+ "TrainerControl": {
236
+ "args": {
237
+ "should_epoch_stop": false,
238
+ "should_evaluate": false,
239
+ "should_log": false,
240
+ "should_save": true,
241
+ "should_training_stop": true
242
+ },
243
+ "attributes": {}
244
+ }
245
+ },
246
+ "total_flos": 58296167513328.0,
247
+ "train_batch_size": 16,
248
+ "trial_name": null,
249
+ "trial_params": null
250
+ }
checkpoint-235/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6cc86806948eeade9111e096da8ecaa7ffd0b7f89647756f199fb88d9a3903b
3
+ size 5393