HamsterShiu committed on
Commit
7bfb539
·
verified ·
1 Parent(s): a1db0dc

Upload 10 files

Browse files
hf_bert_pro_20_epochs/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.50.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
hf_bert_pro_20_epochs/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c75dcaa4c0a8882aeff6076d9c9c831468dd5cf4705344abd39081a07d309e1d
3
+ size 438080896
hf_bert_pro_20_epochs/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a36a06c9bf3dce81b1985ab339b90923af77dbd4698852ef6da9fadaae6e97
3
+ size 876283258
hf_bert_pro_20_epochs/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684f71dc1df0e694759ed6bbef1ad2a0a5bae49b4730ac5c9f3ae44cd2457a6a
3
+ size 14244
hf_bert_pro_20_epochs/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4170c39a78ba24967ff6c2afbeb63788a3da9095ea34b1afdb0875d95d31831c
3
+ size 1064
hf_bert_pro_20_epochs/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
hf_bert_pro_20_epochs/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
hf_bert_pro_20_epochs/trainer_state.json ADDED
@@ -0,0 +1,575 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 26250,
3
+ "best_metric": 1.6726031303405762,
4
+ "best_model_checkpoint": "output/checkpoint-26250",
5
+ "epoch": 21.0,
6
+ "eval_steps": 500,
7
+ "global_step": 26250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.4,
14
+ "grad_norm": 3.792271852493286,
15
+ "learning_rate": 4.9800000000000004e-05,
16
+ "loss": 2.1127,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.8,
21
+ "grad_norm": 3.493194341659546,
22
+ "learning_rate": 4.96e-05,
23
+ "loss": 2.0322,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 1.0,
28
+ "eval_loss": 1.8727474212646484,
29
+ "eval_runtime": 79.5418,
30
+ "eval_samples_per_second": 125.72,
31
+ "eval_steps_per_second": 3.935,
32
+ "step": 1250
33
+ },
34
+ {
35
+ "epoch": 1.2,
36
+ "grad_norm": 3.216500759124756,
37
+ "learning_rate": 4.94e-05,
38
+ "loss": 1.9934,
39
+ "step": 1500
40
+ },
41
+ {
42
+ "epoch": 1.6,
43
+ "grad_norm": 3.296123743057251,
44
+ "learning_rate": 4.92e-05,
45
+ "loss": 1.9576,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "grad_norm": 3.778991460800171,
51
+ "learning_rate": 4.9e-05,
52
+ "loss": 1.9395,
53
+ "step": 2500
54
+ },
55
+ {
56
+ "epoch": 2.0,
57
+ "eval_loss": 1.8361996412277222,
58
+ "eval_runtime": 79.8291,
59
+ "eval_samples_per_second": 125.268,
60
+ "eval_steps_per_second": 3.921,
61
+ "step": 2500
62
+ },
63
+ {
64
+ "epoch": 2.4,
65
+ "grad_norm": 3.35451340675354,
66
+ "learning_rate": 4.88e-05,
67
+ "loss": 1.9039,
68
+ "step": 3000
69
+ },
70
+ {
71
+ "epoch": 2.8,
72
+ "grad_norm": 3.2007856369018555,
73
+ "learning_rate": 4.86e-05,
74
+ "loss": 1.899,
75
+ "step": 3500
76
+ },
77
+ {
78
+ "epoch": 3.0,
79
+ "eval_loss": 1.8026707172393799,
80
+ "eval_runtime": 79.3622,
81
+ "eval_samples_per_second": 126.005,
82
+ "eval_steps_per_second": 3.944,
83
+ "step": 3750
84
+ },
85
+ {
86
+ "epoch": 3.2,
87
+ "grad_norm": 3.4291458129882812,
88
+ "learning_rate": 4.8400000000000004e-05,
89
+ "loss": 1.8836,
90
+ "step": 4000
91
+ },
92
+ {
93
+ "epoch": 3.6,
94
+ "grad_norm": 3.329169988632202,
95
+ "learning_rate": 4.82e-05,
96
+ "loss": 1.8707,
97
+ "step": 4500
98
+ },
99
+ {
100
+ "epoch": 4.0,
101
+ "grad_norm": 3.29327392578125,
102
+ "learning_rate": 4.8e-05,
103
+ "loss": 1.8571,
104
+ "step": 5000
105
+ },
106
+ {
107
+ "epoch": 4.0,
108
+ "eval_loss": 1.793080449104309,
109
+ "eval_runtime": 79.2565,
110
+ "eval_samples_per_second": 126.173,
111
+ "eval_steps_per_second": 3.949,
112
+ "step": 5000
113
+ },
114
+ {
115
+ "epoch": 4.4,
116
+ "grad_norm": 3.511061429977417,
117
+ "learning_rate": 4.78e-05,
118
+ "loss": 1.8375,
119
+ "step": 5500
120
+ },
121
+ {
122
+ "epoch": 4.8,
123
+ "grad_norm": 2.9841179847717285,
124
+ "learning_rate": 4.76e-05,
125
+ "loss": 1.8261,
126
+ "step": 6000
127
+ },
128
+ {
129
+ "epoch": 5.0,
130
+ "eval_loss": 1.7694125175476074,
131
+ "eval_runtime": 79.3477,
132
+ "eval_samples_per_second": 126.028,
133
+ "eval_steps_per_second": 3.945,
134
+ "step": 6250
135
+ },
136
+ {
137
+ "epoch": 5.2,
138
+ "grad_norm": 3.2486345767974854,
139
+ "learning_rate": 4.74e-05,
140
+ "loss": 1.8106,
141
+ "step": 6500
142
+ },
143
+ {
144
+ "epoch": 5.6,
145
+ "grad_norm": 3.1133296489715576,
146
+ "learning_rate": 4.72e-05,
147
+ "loss": 1.797,
148
+ "step": 7000
149
+ },
150
+ {
151
+ "epoch": 6.0,
152
+ "grad_norm": 3.72696852684021,
153
+ "learning_rate": 4.7e-05,
154
+ "loss": 1.7989,
155
+ "step": 7500
156
+ },
157
+ {
158
+ "epoch": 6.0,
159
+ "eval_loss": 1.752471685409546,
160
+ "eval_runtime": 79.6162,
161
+ "eval_samples_per_second": 125.603,
162
+ "eval_steps_per_second": 3.931,
163
+ "step": 7500
164
+ },
165
+ {
166
+ "epoch": 6.4,
167
+ "grad_norm": 3.4890694618225098,
168
+ "learning_rate": 4.6800000000000006e-05,
169
+ "loss": 1.7788,
170
+ "step": 8000
171
+ },
172
+ {
173
+ "epoch": 6.8,
174
+ "grad_norm": 3.2990198135375977,
175
+ "learning_rate": 4.660000000000001e-05,
176
+ "loss": 1.7786,
177
+ "step": 8500
178
+ },
179
+ {
180
+ "epoch": 7.0,
181
+ "eval_loss": 1.743485927581787,
182
+ "eval_runtime": 79.3229,
183
+ "eval_samples_per_second": 126.067,
184
+ "eval_steps_per_second": 3.946,
185
+ "step": 8750
186
+ },
187
+ {
188
+ "epoch": 7.2,
189
+ "grad_norm": 3.170145034790039,
190
+ "learning_rate": 4.64e-05,
191
+ "loss": 1.7718,
192
+ "step": 9000
193
+ },
194
+ {
195
+ "epoch": 7.6,
196
+ "grad_norm": 3.1817779541015625,
197
+ "learning_rate": 4.6200000000000005e-05,
198
+ "loss": 1.7558,
199
+ "step": 9500
200
+ },
201
+ {
202
+ "epoch": 8.0,
203
+ "grad_norm": 3.174926519393921,
204
+ "learning_rate": 4.600000000000001e-05,
205
+ "loss": 1.756,
206
+ "step": 10000
207
+ },
208
+ {
209
+ "epoch": 8.0,
210
+ "eval_loss": 1.7325628995895386,
211
+ "eval_runtime": 79.3253,
212
+ "eval_samples_per_second": 126.063,
213
+ "eval_steps_per_second": 3.946,
214
+ "step": 10000
215
+ },
216
+ {
217
+ "epoch": 8.4,
218
+ "grad_norm": 3.2396745681762695,
219
+ "learning_rate": 4.58e-05,
220
+ "loss": 1.7342,
221
+ "step": 10500
222
+ },
223
+ {
224
+ "epoch": 8.8,
225
+ "grad_norm": 3.2978265285491943,
226
+ "learning_rate": 4.5600000000000004e-05,
227
+ "loss": 1.7325,
228
+ "step": 11000
229
+ },
230
+ {
231
+ "epoch": 9.0,
232
+ "eval_loss": 1.7237361669540405,
233
+ "eval_runtime": 79.3397,
234
+ "eval_samples_per_second": 126.04,
235
+ "eval_steps_per_second": 3.945,
236
+ "step": 11250
237
+ },
238
+ {
239
+ "epoch": 9.2,
240
+ "grad_norm": 3.6129040718078613,
241
+ "learning_rate": 4.5400000000000006e-05,
242
+ "loss": 1.7311,
243
+ "step": 11500
244
+ },
245
+ {
246
+ "epoch": 9.6,
247
+ "grad_norm": 3.5869312286376953,
248
+ "learning_rate": 4.52e-05,
249
+ "loss": 1.721,
250
+ "step": 12000
251
+ },
252
+ {
253
+ "epoch": 10.0,
254
+ "grad_norm": 3.2217297554016113,
255
+ "learning_rate": 4.5e-05,
256
+ "loss": 1.7256,
257
+ "step": 12500
258
+ },
259
+ {
260
+ "epoch": 10.0,
261
+ "eval_loss": 1.7248116731643677,
262
+ "eval_runtime": 79.9338,
263
+ "eval_samples_per_second": 125.104,
264
+ "eval_steps_per_second": 3.916,
265
+ "step": 12500
266
+ },
267
+ {
268
+ "epoch": 10.4,
269
+ "grad_norm": 3.1793668270111084,
270
+ "learning_rate": 4.4800000000000005e-05,
271
+ "loss": 1.6923,
272
+ "step": 13000
273
+ },
274
+ {
275
+ "epoch": 10.8,
276
+ "grad_norm": 2.925715684890747,
277
+ "learning_rate": 4.46e-05,
278
+ "loss": 1.7066,
279
+ "step": 13500
280
+ },
281
+ {
282
+ "epoch": 11.0,
283
+ "eval_loss": 1.7208353281021118,
284
+ "eval_runtime": 79.1759,
285
+ "eval_samples_per_second": 126.301,
286
+ "eval_steps_per_second": 3.953,
287
+ "step": 13750
288
+ },
289
+ {
290
+ "epoch": 11.2,
291
+ "grad_norm": 3.0651283264160156,
292
+ "learning_rate": 4.44e-05,
293
+ "loss": 1.6894,
294
+ "step": 14000
295
+ },
296
+ {
297
+ "epoch": 11.6,
298
+ "grad_norm": 2.897967576980591,
299
+ "learning_rate": 4.4200000000000004e-05,
300
+ "loss": 1.6852,
301
+ "step": 14500
302
+ },
303
+ {
304
+ "epoch": 12.0,
305
+ "grad_norm": 3.138188123703003,
306
+ "learning_rate": 4.4000000000000006e-05,
307
+ "loss": 1.6891,
308
+ "step": 15000
309
+ },
310
+ {
311
+ "epoch": 12.0,
312
+ "eval_loss": 1.7089905738830566,
313
+ "eval_runtime": 79.1389,
314
+ "eval_samples_per_second": 126.36,
315
+ "eval_steps_per_second": 3.955,
316
+ "step": 15000
317
+ },
318
+ {
319
+ "epoch": 12.4,
320
+ "grad_norm": 3.3055591583251953,
321
+ "learning_rate": 4.38e-05,
322
+ "loss": 1.6685,
323
+ "step": 15500
324
+ },
325
+ {
326
+ "epoch": 12.8,
327
+ "grad_norm": 3.092822551727295,
328
+ "learning_rate": 4.36e-05,
329
+ "loss": 1.6747,
330
+ "step": 16000
331
+ },
332
+ {
333
+ "epoch": 13.0,
334
+ "eval_loss": 1.7045754194259644,
335
+ "eval_runtime": 79.6544,
336
+ "eval_samples_per_second": 125.542,
337
+ "eval_steps_per_second": 3.929,
338
+ "step": 16250
339
+ },
340
+ {
341
+ "epoch": 13.2,
342
+ "grad_norm": 3.1481597423553467,
343
+ "learning_rate": 4.3400000000000005e-05,
344
+ "loss": 1.662,
345
+ "step": 16500
346
+ },
347
+ {
348
+ "epoch": 13.6,
349
+ "grad_norm": 3.1880903244018555,
350
+ "learning_rate": 4.32e-05,
351
+ "loss": 1.6573,
352
+ "step": 17000
353
+ },
354
+ {
355
+ "epoch": 14.0,
356
+ "grad_norm": 3.1915714740753174,
357
+ "learning_rate": 4.3e-05,
358
+ "loss": 1.656,
359
+ "step": 17500
360
+ },
361
+ {
362
+ "epoch": 14.0,
363
+ "eval_loss": 1.699405312538147,
364
+ "eval_runtime": 79.9022,
365
+ "eval_samples_per_second": 125.153,
366
+ "eval_steps_per_second": 3.917,
367
+ "step": 17500
368
+ },
369
+ {
370
+ "epoch": 14.4,
371
+ "grad_norm": 3.177797317504883,
372
+ "learning_rate": 4.2800000000000004e-05,
373
+ "loss": 1.6334,
374
+ "step": 18000
375
+ },
376
+ {
377
+ "epoch": 14.8,
378
+ "grad_norm": 3.0355215072631836,
379
+ "learning_rate": 4.26e-05,
380
+ "loss": 1.646,
381
+ "step": 18500
382
+ },
383
+ {
384
+ "epoch": 15.0,
385
+ "eval_loss": 1.6880792379379272,
386
+ "eval_runtime": 79.6882,
387
+ "eval_samples_per_second": 125.489,
388
+ "eval_steps_per_second": 3.928,
389
+ "step": 18750
390
+ },
391
+ {
392
+ "epoch": 15.2,
393
+ "grad_norm": 3.201802968978882,
394
+ "learning_rate": 4.24e-05,
395
+ "loss": 1.6369,
396
+ "step": 19000
397
+ },
398
+ {
399
+ "epoch": 15.6,
400
+ "grad_norm": 2.9140782356262207,
401
+ "learning_rate": 4.22e-05,
402
+ "loss": 1.6254,
403
+ "step": 19500
404
+ },
405
+ {
406
+ "epoch": 16.0,
407
+ "grad_norm": 2.9663515090942383,
408
+ "learning_rate": 4.2e-05,
409
+ "loss": 1.6272,
410
+ "step": 20000
411
+ },
412
+ {
413
+ "epoch": 16.0,
414
+ "eval_loss": 1.6963340044021606,
415
+ "eval_runtime": 79.4158,
416
+ "eval_samples_per_second": 125.919,
417
+ "eval_steps_per_second": 3.941,
418
+ "step": 20000
419
+ },
420
+ {
421
+ "epoch": 16.4,
422
+ "grad_norm": 3.0621337890625,
423
+ "learning_rate": 4.18e-05,
424
+ "loss": 1.6149,
425
+ "step": 20500
426
+ },
427
+ {
428
+ "epoch": 16.8,
429
+ "grad_norm": 3.031522750854492,
430
+ "learning_rate": 4.16e-05,
431
+ "loss": 1.6216,
432
+ "step": 21000
433
+ },
434
+ {
435
+ "epoch": 17.0,
436
+ "eval_loss": 1.6839699745178223,
437
+ "eval_runtime": 79.786,
438
+ "eval_samples_per_second": 125.335,
439
+ "eval_steps_per_second": 3.923,
440
+ "step": 21250
441
+ },
442
+ {
443
+ "epoch": 17.2,
444
+ "grad_norm": 3.0754151344299316,
445
+ "learning_rate": 4.14e-05,
446
+ "loss": 1.6137,
447
+ "step": 21500
448
+ },
449
+ {
450
+ "epoch": 17.6,
451
+ "grad_norm": 3.2484161853790283,
452
+ "learning_rate": 4.12e-05,
453
+ "loss": 1.601,
454
+ "step": 22000
455
+ },
456
+ {
457
+ "epoch": 18.0,
458
+ "grad_norm": 2.982337236404419,
459
+ "learning_rate": 4.1e-05,
460
+ "loss": 1.6163,
461
+ "step": 22500
462
+ },
463
+ {
464
+ "epoch": 18.0,
465
+ "eval_loss": 1.6896445751190186,
466
+ "eval_runtime": 79.3863,
467
+ "eval_samples_per_second": 125.966,
468
+ "eval_steps_per_second": 3.943,
469
+ "step": 22500
470
+ },
471
+ {
472
+ "epoch": 18.4,
473
+ "grad_norm": 3.2979774475097656,
474
+ "learning_rate": 4.08e-05,
475
+ "loss": 1.5941,
476
+ "step": 23000
477
+ },
478
+ {
479
+ "epoch": 18.8,
480
+ "grad_norm": 3.40487003326416,
481
+ "learning_rate": 4.0600000000000004e-05,
482
+ "loss": 1.5928,
483
+ "step": 23500
484
+ },
485
+ {
486
+ "epoch": 19.0,
487
+ "eval_loss": 1.6783804893493652,
488
+ "eval_runtime": 79.4878,
489
+ "eval_samples_per_second": 125.805,
490
+ "eval_steps_per_second": 3.938,
491
+ "step": 23750
492
+ },
493
+ {
494
+ "epoch": 19.2,
495
+ "grad_norm": 3.0854432582855225,
496
+ "learning_rate": 4.0400000000000006e-05,
497
+ "loss": 1.5911,
498
+ "step": 24000
499
+ },
500
+ {
501
+ "epoch": 19.6,
502
+ "grad_norm": 3.0497543811798096,
503
+ "learning_rate": 4.02e-05,
504
+ "loss": 1.5861,
505
+ "step": 24500
506
+ },
507
+ {
508
+ "epoch": 20.0,
509
+ "grad_norm": 3.2561333179473877,
510
+ "learning_rate": 4e-05,
511
+ "loss": 1.5906,
512
+ "step": 25000
513
+ },
514
+ {
515
+ "epoch": 20.0,
516
+ "eval_loss": 1.686535358428955,
517
+ "eval_runtime": 79.4354,
518
+ "eval_samples_per_second": 125.888,
519
+ "eval_steps_per_second": 3.94,
520
+ "step": 25000
521
+ },
522
+ {
523
+ "epoch": 20.4,
524
+ "grad_norm": 2.977836847305298,
525
+ "learning_rate": 3.9800000000000005e-05,
526
+ "loss": 1.5759,
527
+ "step": 25500
528
+ },
529
+ {
530
+ "epoch": 20.8,
531
+ "grad_norm": 3.172325849533081,
532
+ "learning_rate": 3.960000000000001e-05,
533
+ "loss": 1.5745,
534
+ "step": 26000
535
+ },
536
+ {
537
+ "epoch": 21.0,
538
+ "eval_loss": 1.6726031303405762,
539
+ "eval_runtime": 79.5286,
540
+ "eval_samples_per_second": 125.741,
541
+ "eval_steps_per_second": 3.936,
542
+ "step": 26250
543
+ }
544
+ ],
545
+ "logging_steps": 500,
546
+ "max_steps": 125000,
547
+ "num_input_tokens_seen": 0,
548
+ "num_train_epochs": 100,
549
+ "save_steps": 500,
550
+ "stateful_callbacks": {
551
+ "EarlyStoppingCallback": {
552
+ "args": {
553
+ "early_stopping_patience": 2,
554
+ "early_stopping_threshold": 0.0
555
+ },
556
+ "attributes": {
557
+ "early_stopping_patience_counter": 0
558
+ }
559
+ },
560
+ "TrainerControl": {
561
+ "args": {
562
+ "should_epoch_stop": false,
563
+ "should_evaluate": false,
564
+ "should_log": false,
565
+ "should_save": true,
566
+ "should_training_stop": false
567
+ },
568
+ "attributes": {}
569
+ }
570
+ },
571
+ "total_flos": 2.21092042752e+17,
572
+ "train_batch_size": 32,
573
+ "trial_name": null,
574
+ "trial_params": null
575
+ }
hf_bert_pro_20_epochs/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a869cfb5e8648cac80034c3c19dd58178378a28654fb923b5a7f7ee5de136ecf
3
+ size 5304
hf_bert_pro_20_epochs/vocab.txt ADDED
The diff for this file is too large to render. See raw diff