heavyhelium committed on
Commit
d36744b
·
verified ·
1 Parent(s): a582077

Training in progress, epoch 5, checkpoint

Browse files
checkpoint-235/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "ElectraForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "embedding_size": 128,
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 256,
15
+ "id2label": {
16
+ "0": "valid",
17
+ "1": "fallacy"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 1024,
21
+ "is_decoder": false,
22
+ "label2id": {
23
+ "fallacy": 1,
24
+ "valid": 0
25
+ },
26
+ "layer_norm_eps": 1e-12,
27
+ "max_position_embeddings": 512,
28
+ "model_type": "electra",
29
+ "num_attention_heads": 4,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "summary_activation": "gelu",
35
+ "summary_last_dropout": 0.1,
36
+ "summary_type": "first",
37
+ "summary_use_proj": true,
38
+ "tie_word_embeddings": true,
39
+ "transformers_version": "5.9.0",
40
+ "type_vocab_size": 2,
41
+ "use_cache": false,
42
+ "vocab_size": 30522
43
+ }
checkpoint-235/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6f6a8505c36e37b1b5fcd38d4455a343639019d4e6f70f57493b8dc37fed351
3
+ size 54221200
checkpoint-235/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5853af63ceaea0b2d7b3ed21127b9b6e8ae5ecbece3d5738b4661f18ada1d05
3
+ size 108567563
checkpoint-235/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:678e4e519fa896a5f9c4841fa19a1e4dfd8dbe937f178c5f71171f52ac8d6a65
3
+ size 14645
checkpoint-235/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be0e932021d58d3c46733ca20dc12b546e8bd64bc0a440d710adc37c34a403eb
3
+ size 1383
checkpoint-235/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44fb94a88c15d58ffa1bf978fccff6134483f7613a240fb8f9d498a06f59dd4a
3
+ size 1465
checkpoint-235/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-235/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "local_files_only": false,
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
checkpoint-235/trainer_state.json ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 235,
3
+ "best_metric": 0.65996599659966,
4
+ "best_model_checkpoint": "models/electra-small-touche-base-binary/trainer/checkpoint-235",
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 235,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.2127659574468085,
14
+ "grad_norm": 0.7140693068504333,
15
+ "learning_rate": 1.125e-05,
16
+ "loss": 0.69647216796875,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.425531914893617,
21
+ "grad_norm": 1.1431223154067993,
22
+ "learning_rate": 2.3749999999999998e-05,
23
+ "loss": 0.6931365966796875,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.6382978723404256,
28
+ "grad_norm": 0.6381158828735352,
29
+ "learning_rate": 2.928909952606635e-05,
30
+ "loss": 0.69478759765625,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.851063829787234,
35
+ "grad_norm": 1.9318883419036865,
36
+ "learning_rate": 2.7867298578199053e-05,
37
+ "loss": 0.6951080322265625,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_accuracy": 0.555,
43
+ "eval_fallacy_f1": 0.3597122302158273,
44
+ "eval_loss": 0.6898486614227295,
45
+ "eval_macro_f1": 0.5093580308167259,
46
+ "eval_runtime": 0.1669,
47
+ "eval_samples_per_second": 1198.414,
48
+ "eval_steps_per_second": 23.968,
49
+ "step": 47
50
+ },
51
+ {
52
+ "epoch": 1.0638297872340425,
53
+ "grad_norm": 0.6443229913711548,
54
+ "learning_rate": 2.6445497630331753e-05,
55
+ "loss": 0.6960784912109375,
56
+ "step": 50
57
+ },
58
+ {
59
+ "epoch": 1.2765957446808511,
60
+ "grad_norm": 1.3156065940856934,
61
+ "learning_rate": 2.5023696682464456e-05,
62
+ "loss": 0.6871307373046875,
63
+ "step": 60
64
+ },
65
+ {
66
+ "epoch": 1.4893617021276595,
67
+ "grad_norm": 0.9663533568382263,
68
+ "learning_rate": 2.360189573459716e-05,
69
+ "loss": 0.69168701171875,
70
+ "step": 70
71
+ },
72
+ {
73
+ "epoch": 1.702127659574468,
74
+ "grad_norm": 0.7499622702598572,
75
+ "learning_rate": 2.2180094786729858e-05,
76
+ "loss": 0.6835113525390625,
77
+ "step": 80
78
+ },
79
+ {
80
+ "epoch": 1.9148936170212765,
81
+ "grad_norm": 1.1466385126113892,
82
+ "learning_rate": 2.075829383886256e-05,
83
+ "loss": 0.6876190185546875,
84
+ "step": 90
85
+ },
86
+ {
87
+ "epoch": 2.0,
88
+ "eval_accuracy": 0.64,
89
+ "eval_fallacy_f1": 0.64,
90
+ "eval_loss": 0.6786279082298279,
91
+ "eval_macro_f1": 0.64,
92
+ "eval_runtime": 0.2441,
93
+ "eval_samples_per_second": 819.224,
94
+ "eval_steps_per_second": 16.384,
95
+ "step": 94
96
+ },
97
+ {
98
+ "epoch": 2.127659574468085,
99
+ "grad_norm": 0.7131018042564392,
100
+ "learning_rate": 1.933649289099526e-05,
101
+ "loss": 0.6760498046875,
102
+ "step": 100
103
+ },
104
+ {
105
+ "epoch": 2.3404255319148937,
106
+ "grad_norm": 0.8485779762268066,
107
+ "learning_rate": 1.791469194312796e-05,
108
+ "loss": 0.67723388671875,
109
+ "step": 110
110
+ },
111
+ {
112
+ "epoch": 2.5531914893617023,
113
+ "grad_norm": 1.0046956539154053,
114
+ "learning_rate": 1.6492890995260666e-05,
115
+ "loss": 0.6680267333984375,
116
+ "step": 120
117
+ },
118
+ {
119
+ "epoch": 2.7659574468085104,
120
+ "grad_norm": 1.3051499128341675,
121
+ "learning_rate": 1.5071090047393365e-05,
122
+ "loss": 0.6558563232421875,
123
+ "step": 130
124
+ },
125
+ {
126
+ "epoch": 2.978723404255319,
127
+ "grad_norm": 1.3270968198776245,
128
+ "learning_rate": 1.3649289099526066e-05,
129
+ "loss": 0.6429000854492187,
130
+ "step": 140
131
+ },
132
+ {
133
+ "epoch": 3.0,
134
+ "eval_accuracy": 0.63,
135
+ "eval_fallacy_f1": 0.5432098765432098,
136
+ "eval_loss": 0.6571618914604187,
137
+ "eval_macro_f1": 0.6161427533976553,
138
+ "eval_runtime": 0.1787,
139
+ "eval_samples_per_second": 1119.018,
140
+ "eval_steps_per_second": 22.38,
141
+ "step": 141
142
+ },
143
+ {
144
+ "epoch": 3.1914893617021276,
145
+ "grad_norm": 1.5919787883758545,
146
+ "learning_rate": 1.2227488151658767e-05,
147
+ "loss": 0.6370620727539062,
148
+ "step": 150
149
+ },
150
+ {
151
+ "epoch": 3.404255319148936,
152
+ "grad_norm": 1.217308759689331,
153
+ "learning_rate": 1.080568720379147e-05,
154
+ "loss": 0.6328628540039063,
155
+ "step": 160
156
+ },
157
+ {
158
+ "epoch": 3.617021276595745,
159
+ "grad_norm": 2.489976406097412,
160
+ "learning_rate": 9.383886255924171e-06,
161
+ "loss": 0.6132949829101563,
162
+ "step": 170
163
+ },
164
+ {
165
+ "epoch": 3.829787234042553,
166
+ "grad_norm": 1.557442307472229,
167
+ "learning_rate": 7.962085308056872e-06,
168
+ "loss": 0.6257461547851563,
169
+ "step": 180
170
+ },
171
+ {
172
+ "epoch": 4.0,
173
+ "eval_accuracy": 0.645,
174
+ "eval_fallacy_f1": 0.6697674418604651,
175
+ "eval_loss": 0.6352868676185608,
176
+ "eval_macro_f1": 0.6429918290383407,
177
+ "eval_runtime": 0.1618,
178
+ "eval_samples_per_second": 1236.069,
179
+ "eval_steps_per_second": 24.721,
180
+ "step": 188
181
+ },
182
+ {
183
+ "epoch": 4.042553191489362,
184
+ "grad_norm": 2.8439695835113525,
185
+ "learning_rate": 6.5402843601895735e-06,
186
+ "loss": 0.60234375,
187
+ "step": 190
188
+ },
189
+ {
190
+ "epoch": 4.25531914893617,
191
+ "grad_norm": 1.99105703830719,
192
+ "learning_rate": 5.1184834123222755e-06,
193
+ "loss": 0.6104934692382813,
194
+ "step": 200
195
+ },
196
+ {
197
+ "epoch": 4.468085106382979,
198
+ "grad_norm": 1.5985376834869385,
199
+ "learning_rate": 3.696682464454976e-06,
200
+ "loss": 0.5836532592773438,
201
+ "step": 210
202
+ },
203
+ {
204
+ "epoch": 4.680851063829787,
205
+ "grad_norm": 1.6915240287780762,
206
+ "learning_rate": 2.274881516587678e-06,
207
+ "loss": 0.5967620849609375,
208
+ "step": 220
209
+ },
210
+ {
211
+ "epoch": 4.8936170212765955,
212
+ "grad_norm": 1.8389923572540283,
213
+ "learning_rate": 8.530805687203791e-07,
214
+ "loss": 0.5855438232421875,
215
+ "step": 230
216
+ },
217
+ {
218
+ "epoch": 5.0,
219
+ "eval_accuracy": 0.66,
220
+ "eval_fallacy_f1": 0.6633663366336634,
221
+ "eval_loss": 0.6288647651672363,
222
+ "eval_macro_f1": 0.65996599659966,
223
+ "eval_runtime": 0.1577,
224
+ "eval_samples_per_second": 1268.019,
225
+ "eval_steps_per_second": 25.36,
226
+ "step": 235
227
+ }
228
+ ],
229
+ "logging_steps": 10,
230
+ "max_steps": 235,
231
+ "num_input_tokens_seen": 0,
232
+ "num_train_epochs": 5,
233
+ "save_steps": 500,
234
+ "stateful_callbacks": {
235
+ "TrainerControl": {
236
+ "args": {
237
+ "should_epoch_stop": false,
238
+ "should_evaluate": false,
239
+ "should_log": false,
240
+ "should_save": true,
241
+ "should_training_stop": true
242
+ },
243
+ "attributes": {}
244
+ }
245
+ },
246
+ "total_flos": 40353289458552.0,
247
+ "train_batch_size": 16,
248
+ "trial_name": null,
249
+ "trial_params": null
250
+ }
checkpoint-235/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feed2b83d33087270d16ed7f3e10fa99ab935ebc4c954f924db9b9235a327ec4
3
+ size 5393