fpuentes commited on
Commit
1dc0043
·
1 Parent(s): 942ddd6

Model save

Browse files
last-checkpoint/config.json DELETED
@@ -1,27 +0,0 @@
1
- {
2
- "_name_or_path": "/home/pcjf/CESGA/works/lmodels/models/large",
3
- "architectures": [
4
- "RobertaForMaskedLM"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": 0,
8
- "classifier_dropout": null,
9
- "eos_token_id": 2,
10
- "hidden_act": "gelu",
11
- "hidden_dropout_prob": 0.1,
12
- "hidden_size": 768,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 3072,
15
- "layer_norm_eps": 1e-05,
16
- "max_position_embeddings": 514,
17
- "model_type": "roberta",
18
- "num_attention_heads": 12,
19
- "num_hidden_layers": 12,
20
- "pad_token_id": 1,
21
- "position_embedding_type": "absolute",
22
- "torch_dtype": "float32",
23
- "transformers_version": "4.24.0",
24
- "type_vocab_size": 1,
25
- "use_cache": true,
26
- "vocab_size": 50265
27
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd68ef232e3e68b2a523145fb4aab44e1178593f4ecebfe4a7fb7c2a61d39159
3
- size 997747845
 
 
 
 
last-checkpoint/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c293560287e12e9f8d6f988947bc53ce911420ce65ca91ba9813acb4185c488
3
- size 498863417
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a06bfe35557371cd4030124443e6805028c2a839df314289636cf0caa8997b79
3
- size 14575
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff1be153872ceab362bc8f896bf3f611b155e54edf151eccfc448653a32209d
3
- size 627
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "<s>",
3
- "cls_token": "<s>",
4
- "eos_token": "</s>",
5
- "mask_token": {
6
- "content": "<mask>",
7
- "lstrip": true,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "pad_token": "<pad>",
13
- "sep_token": "</s>",
14
- "unk_token": "<unk>"
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,17 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "bos_token": "<s>",
4
- "cls_token": "<s>",
5
- "eos_token": "</s>",
6
- "errors": "replace",
7
- "mask_token": "<mask>",
8
- "max_len": 512,
9
- "model_max_length": 512,
10
- "name_or_path": "/home/pcjf/CESGA/works/lmodels/models/large",
11
- "pad_token": "<pad>",
12
- "sep_token": "</s>",
13
- "special_tokens_map_file": null,
14
- "tokenizer_class": "RobertaTokenizer",
15
- "trim_offsets": true,
16
- "unk_token": "<unk>"
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,982 +0,0 @@
1
- {
2
- "best_metric": 1.0537773370742798,
3
- "best_model_checkpoint": "/home/pcjf/CESGA/works/lmodels/models/large/checkpoint-102000",
4
- "epoch": 14.902730598086016,
5
- "global_step": 103500,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.22,
12
- "learning_rate": 9.856011519078475e-06,
13
- "loss": 3.6976,
14
- "step": 1500
15
- },
16
- {
17
- "epoch": 0.22,
18
- "eval_loss": 2.2865896224975586,
19
- "eval_runtime": 84.2237,
20
- "eval_samples_per_second": 215.45,
21
- "eval_steps_per_second": 26.94,
22
- "step": 1500
23
- },
24
- {
25
- "epoch": 0.43,
26
- "learning_rate": 9.712023038156948e-06,
27
- "loss": 2.3057,
28
- "step": 3000
29
- },
30
- {
31
- "epoch": 0.43,
32
- "eval_loss": 1.9275821447372437,
33
- "eval_runtime": 83.9089,
34
- "eval_samples_per_second": 216.258,
35
- "eval_steps_per_second": 27.041,
36
- "step": 3000
37
- },
38
- {
39
- "epoch": 0.65,
40
- "learning_rate": 9.568034557235422e-06,
41
- "loss": 2.0428,
42
- "step": 4500
43
- },
44
- {
45
- "epoch": 0.65,
46
- "eval_loss": 1.7688409090042114,
47
- "eval_runtime": 83.2995,
48
- "eval_samples_per_second": 217.841,
49
- "eval_steps_per_second": 27.239,
50
- "step": 4500
51
- },
52
- {
53
- "epoch": 0.86,
54
- "learning_rate": 9.424046076313895e-06,
55
- "loss": 1.8963,
56
- "step": 6000
57
- },
58
- {
59
- "epoch": 0.86,
60
- "eval_loss": 1.651775598526001,
61
- "eval_runtime": 84.0186,
62
- "eval_samples_per_second": 215.976,
63
- "eval_steps_per_second": 27.006,
64
- "step": 6000
65
- },
66
- {
67
- "epoch": 1.08,
68
- "learning_rate": 9.28005759539237e-06,
69
- "loss": 1.8031,
70
- "step": 7500
71
- },
72
- {
73
- "epoch": 1.08,
74
- "eval_loss": 1.593921184539795,
75
- "eval_runtime": 84.6892,
76
- "eval_samples_per_second": 214.266,
77
- "eval_steps_per_second": 26.792,
78
- "step": 7500
79
- },
80
- {
81
- "epoch": 1.3,
82
- "learning_rate": 9.136069114470844e-06,
83
- "loss": 1.7288,
84
- "step": 9000
85
- },
86
- {
87
- "epoch": 1.3,
88
- "eval_loss": 1.5367190837860107,
89
- "eval_runtime": 89.3272,
90
- "eval_samples_per_second": 203.141,
91
- "eval_steps_per_second": 25.401,
92
- "step": 9000
93
- },
94
- {
95
- "epoch": 1.51,
96
- "learning_rate": 8.992080633549316e-06,
97
- "loss": 1.6743,
98
- "step": 10500
99
- },
100
- {
101
- "epoch": 1.51,
102
- "eval_loss": 1.4779834747314453,
103
- "eval_runtime": 81.597,
104
- "eval_samples_per_second": 222.386,
105
- "eval_steps_per_second": 27.807,
106
- "step": 10500
107
- },
108
- {
109
- "epoch": 1.73,
110
- "learning_rate": 8.84809215262779e-06,
111
- "loss": 1.6304,
112
- "step": 12000
113
- },
114
- {
115
- "epoch": 1.73,
116
- "eval_loss": 1.4477195739746094,
117
- "eval_runtime": 84.4637,
118
- "eval_samples_per_second": 214.838,
119
- "eval_steps_per_second": 26.864,
120
- "step": 12000
121
- },
122
- {
123
- "epoch": 1.94,
124
- "learning_rate": 8.704103671706265e-06,
125
- "loss": 1.5896,
126
- "step": 13500
127
- },
128
- {
129
- "epoch": 1.94,
130
- "eval_loss": 1.4108401536941528,
131
- "eval_runtime": 84.7175,
132
- "eval_samples_per_second": 214.194,
133
- "eval_steps_per_second": 26.783,
134
- "step": 13500
135
- },
136
- {
137
- "epoch": 2.16,
138
- "learning_rate": 8.560115190784738e-06,
139
- "loss": 1.5581,
140
- "step": 15000
141
- },
142
- {
143
- "epoch": 2.16,
144
- "eval_loss": 1.3877344131469727,
145
- "eval_runtime": 84.5715,
146
- "eval_samples_per_second": 214.564,
147
- "eval_steps_per_second": 26.829,
148
- "step": 15000
149
- },
150
- {
151
- "epoch": 2.38,
152
- "learning_rate": 8.416126709863212e-06,
153
- "loss": 1.5267,
154
- "step": 16500
155
- },
156
- {
157
- "epoch": 2.38,
158
- "eval_loss": 1.3697640895843506,
159
- "eval_runtime": 84.7025,
160
- "eval_samples_per_second": 214.232,
161
- "eval_steps_per_second": 26.788,
162
- "step": 16500
163
- },
164
- {
165
- "epoch": 2.59,
166
- "learning_rate": 8.272138228941685e-06,
167
- "loss": 1.5027,
168
- "step": 18000
169
- },
170
- {
171
- "epoch": 2.59,
172
- "eval_loss": 1.3324816226959229,
173
- "eval_runtime": 84.1197,
174
- "eval_samples_per_second": 215.716,
175
- "eval_steps_per_second": 26.973,
176
- "step": 18000
177
- },
178
- {
179
- "epoch": 2.81,
180
- "learning_rate": 8.12814974802016e-06,
181
- "loss": 1.4798,
182
- "step": 19500
183
- },
184
- {
185
- "epoch": 2.81,
186
- "eval_loss": 1.3138675689697266,
187
- "eval_runtime": 85.4337,
188
- "eval_samples_per_second": 212.399,
189
- "eval_steps_per_second": 26.559,
190
- "step": 19500
191
- },
192
- {
193
- "epoch": 3.02,
194
- "learning_rate": 7.984161267098632e-06,
195
- "loss": 1.461,
196
- "step": 21000
197
- },
198
- {
199
- "epoch": 3.02,
200
- "eval_loss": 1.298140525817871,
201
- "eval_runtime": 84.4026,
202
- "eval_samples_per_second": 214.993,
203
- "eval_steps_per_second": 26.883,
204
- "step": 21000
205
- },
206
- {
207
- "epoch": 3.24,
208
- "learning_rate": 7.840172786177106e-06,
209
- "loss": 1.4403,
210
- "step": 22500
211
- },
212
- {
213
- "epoch": 3.24,
214
- "eval_loss": 1.2826968431472778,
215
- "eval_runtime": 85.2074,
216
- "eval_samples_per_second": 212.963,
217
- "eval_steps_per_second": 26.629,
218
- "step": 22500
219
- },
220
- {
221
- "epoch": 3.46,
222
- "learning_rate": 7.69618430525558e-06,
223
- "loss": 1.4267,
224
- "step": 24000
225
- },
226
- {
227
- "epoch": 3.46,
228
- "eval_loss": 1.270477294921875,
229
- "eval_runtime": 84.6854,
230
- "eval_samples_per_second": 214.275,
231
- "eval_steps_per_second": 26.793,
232
- "step": 24000
233
- },
234
- {
235
- "epoch": 3.67,
236
- "learning_rate": 7.552195824334054e-06,
237
- "loss": 1.4095,
238
- "step": 25500
239
- },
240
- {
241
- "epoch": 3.67,
242
- "eval_loss": 1.2709885835647583,
243
- "eval_runtime": 84.9755,
244
- "eval_samples_per_second": 213.544,
245
- "eval_steps_per_second": 26.702,
246
- "step": 25500
247
- },
248
- {
249
- "epoch": 3.89,
250
- "learning_rate": 7.408207343412528e-06,
251
- "loss": 1.3988,
252
- "step": 27000
253
- },
254
- {
255
- "epoch": 3.89,
256
- "eval_loss": 1.2431179285049438,
257
- "eval_runtime": 84.5543,
258
- "eval_samples_per_second": 214.608,
259
- "eval_steps_per_second": 26.835,
260
- "step": 27000
261
- },
262
- {
263
- "epoch": 4.1,
264
- "learning_rate": 7.264218862491001e-06,
265
- "loss": 1.386,
266
- "step": 28500
267
- },
268
- {
269
- "epoch": 4.1,
270
- "eval_loss": 1.2419956922531128,
271
- "eval_runtime": 100.1186,
272
- "eval_samples_per_second": 181.245,
273
- "eval_steps_per_second": 22.663,
274
- "step": 28500
275
- },
276
- {
277
- "epoch": 4.32,
278
- "learning_rate": 7.1202303815694755e-06,
279
- "loss": 1.371,
280
- "step": 30000
281
- },
282
- {
283
- "epoch": 4.32,
284
- "eval_loss": 1.221591591835022,
285
- "eval_runtime": 84.502,
286
- "eval_samples_per_second": 214.741,
287
- "eval_steps_per_second": 26.851,
288
- "step": 30000
289
- },
290
- {
291
- "epoch": 4.54,
292
- "learning_rate": 6.976241900647949e-06,
293
- "loss": 1.3598,
294
- "step": 31500
295
- },
296
- {
297
- "epoch": 4.54,
298
- "eval_loss": 1.2219996452331543,
299
- "eval_runtime": 84.5085,
300
- "eval_samples_per_second": 214.724,
301
- "eval_steps_per_second": 26.849,
302
- "step": 31500
303
- },
304
- {
305
- "epoch": 4.75,
306
- "learning_rate": 6.8322534197264226e-06,
307
- "loss": 1.3537,
308
- "step": 33000
309
- },
310
- {
311
- "epoch": 4.75,
312
- "eval_loss": 1.2087223529815674,
313
- "eval_runtime": 85.2139,
314
- "eval_samples_per_second": 212.946,
315
- "eval_steps_per_second": 26.627,
316
- "step": 33000
317
- },
318
- {
319
- "epoch": 4.97,
320
- "learning_rate": 6.688264938804896e-06,
321
- "loss": 1.3435,
322
- "step": 34500
323
- },
324
- {
325
- "epoch": 4.97,
326
- "eval_loss": 1.1993805170059204,
327
- "eval_runtime": 84.8549,
328
- "eval_samples_per_second": 213.847,
329
- "eval_steps_per_second": 26.74,
330
- "step": 34500
331
- },
332
- {
333
- "epoch": 5.18,
334
- "learning_rate": 6.54427645788337e-06,
335
- "loss": 1.3324,
336
- "step": 36000
337
- },
338
- {
339
- "epoch": 5.18,
340
- "eval_loss": 1.1966261863708496,
341
- "eval_runtime": 85.1605,
342
- "eval_samples_per_second": 213.08,
343
- "eval_steps_per_second": 26.644,
344
- "step": 36000
345
- },
346
- {
347
- "epoch": 5.4,
348
- "learning_rate": 6.400287976961843e-06,
349
- "loss": 1.3247,
350
- "step": 37500
351
- },
352
- {
353
- "epoch": 5.4,
354
- "eval_loss": 1.179038643836975,
355
- "eval_runtime": 84.5668,
356
- "eval_samples_per_second": 214.576,
357
- "eval_steps_per_second": 26.831,
358
- "step": 37500
359
- },
360
- {
361
- "epoch": 5.62,
362
- "learning_rate": 6.2562994960403175e-06,
363
- "loss": 1.3189,
364
- "step": 39000
365
- },
366
- {
367
- "epoch": 5.62,
368
- "eval_loss": 1.1733150482177734,
369
- "eval_runtime": 86.5829,
370
- "eval_samples_per_second": 209.579,
371
- "eval_steps_per_second": 26.206,
372
- "step": 39000
373
- },
374
- {
375
- "epoch": 5.83,
376
- "learning_rate": 6.112311015118791e-06,
377
- "loss": 1.3118,
378
- "step": 40500
379
- },
380
- {
381
- "epoch": 5.83,
382
- "eval_loss": 1.1638059616088867,
383
- "eval_runtime": 85.3947,
384
- "eval_samples_per_second": 212.496,
385
- "eval_steps_per_second": 26.571,
386
- "step": 40500
387
- },
388
- {
389
- "epoch": 6.05,
390
- "learning_rate": 5.968322534197265e-06,
391
- "loss": 1.3033,
392
- "step": 42000
393
- },
394
- {
395
- "epoch": 6.05,
396
- "eval_loss": 1.166013240814209,
397
- "eval_runtime": 85.1093,
398
- "eval_samples_per_second": 213.208,
399
- "eval_steps_per_second": 26.66,
400
- "step": 42000
401
- },
402
- {
403
- "epoch": 6.26,
404
- "learning_rate": 5.824334053275739e-06,
405
- "loss": 1.2962,
406
- "step": 43500
407
- },
408
- {
409
- "epoch": 6.26,
410
- "eval_loss": 1.1626156568527222,
411
- "eval_runtime": 84.6549,
412
- "eval_samples_per_second": 214.353,
413
- "eval_steps_per_second": 26.803,
414
- "step": 43500
415
- },
416
- {
417
- "epoch": 6.48,
418
- "learning_rate": 5.6803455723542124e-06,
419
- "loss": 1.2939,
420
- "step": 45000
421
- },
422
- {
423
- "epoch": 6.48,
424
- "eval_loss": 1.1594172716140747,
425
- "eval_runtime": 85.287,
426
- "eval_samples_per_second": 212.764,
427
- "eval_steps_per_second": 26.604,
428
- "step": 45000
429
- },
430
- {
431
- "epoch": 6.7,
432
- "learning_rate": 5.536357091432686e-06,
433
- "loss": 1.2871,
434
- "step": 46500
435
- },
436
- {
437
- "epoch": 6.7,
438
- "eval_loss": 1.1451094150543213,
439
- "eval_runtime": 84.769,
440
- "eval_samples_per_second": 214.064,
441
- "eval_steps_per_second": 26.767,
442
- "step": 46500
443
- },
444
- {
445
- "epoch": 6.91,
446
- "learning_rate": 5.3923686105111595e-06,
447
- "loss": 1.2801,
448
- "step": 48000
449
- },
450
- {
451
- "epoch": 6.91,
452
- "eval_loss": 1.1349542140960693,
453
- "eval_runtime": 85.0192,
454
- "eval_samples_per_second": 213.434,
455
- "eval_steps_per_second": 26.688,
456
- "step": 48000
457
- },
458
- {
459
- "epoch": 7.13,
460
- "learning_rate": 5.248380129589633e-06,
461
- "loss": 1.2732,
462
- "step": 49500
463
- },
464
- {
465
- "epoch": 7.13,
466
- "eval_loss": 1.1414066553115845,
467
- "eval_runtime": 83.7422,
468
- "eval_samples_per_second": 216.689,
469
- "eval_steps_per_second": 27.095,
470
- "step": 49500
471
- },
472
- {
473
- "epoch": 7.34,
474
- "learning_rate": 5.1043916486681065e-06,
475
- "loss": 1.272,
476
- "step": 51000
477
- },
478
- {
479
- "epoch": 7.34,
480
- "eval_loss": 1.1416987180709839,
481
- "eval_runtime": 85.9362,
482
- "eval_samples_per_second": 211.157,
483
- "eval_steps_per_second": 26.403,
484
- "step": 51000
485
- },
486
- {
487
- "epoch": 7.56,
488
- "learning_rate": 4.960403167746581e-06,
489
- "loss": 1.2652,
490
- "step": 52500
491
- },
492
- {
493
- "epoch": 7.56,
494
- "eval_loss": 1.1306627988815308,
495
- "eval_runtime": 84.746,
496
- "eval_samples_per_second": 214.122,
497
- "eval_steps_per_second": 26.774,
498
- "step": 52500
499
- },
500
- {
501
- "epoch": 7.78,
502
- "learning_rate": 4.8164146868250544e-06,
503
- "loss": 1.2602,
504
- "step": 54000
505
- },
506
- {
507
- "epoch": 7.78,
508
- "eval_loss": 1.1231367588043213,
509
- "eval_runtime": 84.9153,
510
- "eval_samples_per_second": 213.695,
511
- "eval_steps_per_second": 26.721,
512
- "step": 54000
513
- },
514
- {
515
- "epoch": 7.99,
516
- "learning_rate": 4.672426205903528e-06,
517
- "loss": 1.2589,
518
- "step": 55500
519
- },
520
- {
521
- "epoch": 7.99,
522
- "eval_loss": 1.116618275642395,
523
- "eval_runtime": 85.1024,
524
- "eval_samples_per_second": 213.226,
525
- "eval_steps_per_second": 26.662,
526
- "step": 55500
527
- },
528
- {
529
- "epoch": 8.21,
530
- "learning_rate": 4.5284377249820015e-06,
531
- "loss": 1.2515,
532
- "step": 57000
533
- },
534
- {
535
- "epoch": 8.21,
536
- "eval_loss": 1.1146851778030396,
537
- "eval_runtime": 86.1091,
538
- "eval_samples_per_second": 210.733,
539
- "eval_steps_per_second": 26.35,
540
- "step": 57000
541
- },
542
- {
543
- "epoch": 8.42,
544
- "learning_rate": 4.384449244060476e-06,
545
- "loss": 1.2508,
546
- "step": 58500
547
- },
548
- {
549
- "epoch": 8.42,
550
- "eval_loss": 1.104642391204834,
551
- "eval_runtime": 86.8395,
552
- "eval_samples_per_second": 208.96,
553
- "eval_steps_per_second": 26.129,
554
- "step": 58500
555
- },
556
- {
557
- "epoch": 8.64,
558
- "learning_rate": 4.240460763138949e-06,
559
- "loss": 1.2483,
560
- "step": 60000
561
- },
562
- {
563
- "epoch": 8.64,
564
- "eval_loss": 1.1124722957611084,
565
- "eval_runtime": 92.3126,
566
- "eval_samples_per_second": 196.571,
567
- "eval_steps_per_second": 24.58,
568
- "step": 60000
569
- },
570
- {
571
- "epoch": 8.86,
572
- "learning_rate": 4.096472282217423e-06,
573
- "loss": 1.2439,
574
- "step": 61500
575
- },
576
- {
577
- "epoch": 8.86,
578
- "eval_loss": 1.1115002632141113,
579
- "eval_runtime": 85.7598,
580
- "eval_samples_per_second": 211.591,
581
- "eval_steps_per_second": 26.458,
582
- "step": 61500
583
- },
584
- {
585
- "epoch": 9.07,
586
- "learning_rate": 3.952483801295896e-06,
587
- "loss": 1.2393,
588
- "step": 63000
589
- },
590
- {
591
- "epoch": 9.07,
592
- "eval_loss": 1.0992404222488403,
593
- "eval_runtime": 85.8343,
594
- "eval_samples_per_second": 211.407,
595
- "eval_steps_per_second": 26.435,
596
- "step": 63000
597
- },
598
- {
599
- "epoch": 9.29,
600
- "learning_rate": 3.8084953203743704e-06,
601
- "loss": 1.2346,
602
- "step": 64500
603
- },
604
- {
605
- "epoch": 9.29,
606
- "eval_loss": 1.087247610092163,
607
- "eval_runtime": 85.1097,
608
- "eval_samples_per_second": 213.207,
609
- "eval_steps_per_second": 26.66,
610
- "step": 64500
611
- },
612
- {
613
- "epoch": 9.5,
614
- "learning_rate": 3.664506839452844e-06,
615
- "loss": 1.2319,
616
- "step": 66000
617
- },
618
- {
619
- "epoch": 9.5,
620
- "eval_loss": 1.1074174642562866,
621
- "eval_runtime": 94.0488,
622
- "eval_samples_per_second": 192.942,
623
- "eval_steps_per_second": 24.126,
624
- "step": 66000
625
- },
626
- {
627
- "epoch": 9.72,
628
- "learning_rate": 3.520518358531318e-06,
629
- "loss": 1.2275,
630
- "step": 67500
631
- },
632
- {
633
- "epoch": 9.72,
634
- "eval_loss": 1.0995101928710938,
635
- "eval_runtime": 86.1197,
636
- "eval_samples_per_second": 210.707,
637
- "eval_steps_per_second": 26.347,
638
- "step": 67500
639
- },
640
- {
641
- "epoch": 9.94,
642
- "learning_rate": 3.3765298776097914e-06,
643
- "loss": 1.2263,
644
- "step": 69000
645
- },
646
- {
647
- "epoch": 9.94,
648
- "eval_loss": 1.079862117767334,
649
- "eval_runtime": 86.1089,
650
- "eval_samples_per_second": 210.733,
651
- "eval_steps_per_second": 26.35,
652
- "step": 69000
653
- },
654
- {
655
- "epoch": 10.15,
656
- "learning_rate": 3.2325413966882653e-06,
657
- "loss": 1.2242,
658
- "step": 70500
659
- },
660
- {
661
- "epoch": 10.15,
662
- "eval_loss": 1.090984582901001,
663
- "eval_runtime": 89.7862,
664
- "eval_samples_per_second": 202.102,
665
- "eval_steps_per_second": 25.271,
666
- "step": 70500
667
- },
668
- {
669
- "epoch": 10.37,
670
- "learning_rate": 3.088552915766739e-06,
671
- "loss": 1.2189,
672
- "step": 72000
673
- },
674
- {
675
- "epoch": 10.37,
676
- "eval_loss": 1.0839877128601074,
677
- "eval_runtime": 84.5722,
678
- "eval_samples_per_second": 214.562,
679
- "eval_steps_per_second": 26.829,
680
- "step": 72000
681
- },
682
- {
683
- "epoch": 10.58,
684
- "learning_rate": 2.9445644348452123e-06,
685
- "loss": 1.2175,
686
- "step": 73500
687
- },
688
- {
689
- "epoch": 10.58,
690
- "eval_loss": 1.0865727663040161,
691
- "eval_runtime": 84.3078,
692
- "eval_samples_per_second": 215.235,
693
- "eval_steps_per_second": 26.913,
694
- "step": 73500
695
- },
696
- {
697
- "epoch": 10.8,
698
- "learning_rate": 2.8005759539236867e-06,
699
- "loss": 1.218,
700
- "step": 75000
701
- },
702
- {
703
- "epoch": 10.8,
704
- "eval_loss": 1.0715888738632202,
705
- "eval_runtime": 84.6924,
706
- "eval_samples_per_second": 214.258,
707
- "eval_steps_per_second": 26.791,
708
- "step": 75000
709
- },
710
- {
711
- "epoch": 11.02,
712
- "learning_rate": 2.6565874730021602e-06,
713
- "loss": 1.2159,
714
- "step": 76500
715
- },
716
- {
717
- "epoch": 11.02,
718
- "eval_loss": 1.0899019241333008,
719
- "eval_runtime": 85.705,
720
- "eval_samples_per_second": 211.726,
721
- "eval_steps_per_second": 26.475,
722
- "step": 76500
723
- },
724
- {
725
- "epoch": 11.23,
726
- "learning_rate": 2.5125989920806338e-06,
727
- "loss": 1.2153,
728
- "step": 78000
729
- },
730
- {
731
- "epoch": 11.23,
732
- "eval_loss": 1.087203860282898,
733
- "eval_runtime": 84.3785,
734
- "eval_samples_per_second": 215.055,
735
- "eval_steps_per_second": 26.891,
736
- "step": 78000
737
- },
738
- {
739
- "epoch": 11.45,
740
- "learning_rate": 2.3686105111591073e-06,
741
- "loss": 1.2129,
742
- "step": 79500
743
- },
744
- {
745
- "epoch": 11.45,
746
- "eval_loss": 1.0858579874038696,
747
- "eval_runtime": 95.1152,
748
- "eval_samples_per_second": 190.779,
749
- "eval_steps_per_second": 23.855,
750
- "step": 79500
751
- },
752
- {
753
- "epoch": 11.66,
754
- "learning_rate": 2.2246220302375812e-06,
755
- "loss": 1.2105,
756
- "step": 81000
757
- },
758
- {
759
- "epoch": 11.66,
760
- "eval_loss": 1.0726720094680786,
761
- "eval_runtime": 86.0753,
762
- "eval_samples_per_second": 210.815,
763
- "eval_steps_per_second": 26.361,
764
- "step": 81000
765
- },
766
- {
767
- "epoch": 11.88,
768
- "learning_rate": 2.0806335493160548e-06,
769
- "loss": 1.2081,
770
- "step": 82500
771
- },
772
- {
773
- "epoch": 11.88,
774
- "eval_loss": 1.0759787559509277,
775
- "eval_runtime": 85.7141,
776
- "eval_samples_per_second": 211.704,
777
- "eval_steps_per_second": 26.472,
778
- "step": 82500
779
- },
780
- {
781
- "epoch": 12.1,
782
- "learning_rate": 1.9366450683945287e-06,
783
- "loss": 1.2107,
784
- "step": 84000
785
- },
786
- {
787
- "epoch": 12.1,
788
- "eval_loss": 1.0708467960357666,
789
- "eval_runtime": 84.1645,
790
- "eval_samples_per_second": 215.602,
791
- "eval_steps_per_second": 26.959,
792
- "step": 84000
793
- },
794
- {
795
- "epoch": 12.31,
796
- "learning_rate": 1.7926565874730022e-06,
797
- "loss": 1.2033,
798
- "step": 85500
799
- },
800
- {
801
- "epoch": 12.31,
802
- "eval_loss": 1.072534441947937,
803
- "eval_runtime": 85.0352,
804
- "eval_samples_per_second": 213.394,
805
- "eval_steps_per_second": 26.683,
806
- "step": 85500
807
- },
808
- {
809
- "epoch": 12.53,
810
- "learning_rate": 1.648668106551476e-06,
811
- "loss": 1.2033,
812
- "step": 87000
813
- },
814
- {
815
- "epoch": 12.53,
816
- "eval_loss": 1.0773580074310303,
817
- "eval_runtime": 94.5251,
818
- "eval_samples_per_second": 191.97,
819
- "eval_steps_per_second": 24.004,
820
- "step": 87000
821
- },
822
- {
823
- "epoch": 12.74,
824
- "learning_rate": 1.5046796256299497e-06,
825
- "loss": 1.2052,
826
- "step": 88500
827
- },
828
- {
829
- "epoch": 12.74,
830
- "eval_loss": 1.0760116577148438,
831
- "eval_runtime": 84.4275,
832
- "eval_samples_per_second": 214.93,
833
- "eval_steps_per_second": 26.875,
834
- "step": 88500
835
- },
836
- {
837
- "epoch": 12.96,
838
- "learning_rate": 1.3606911447084234e-06,
839
- "loss": 1.2012,
840
- "step": 90000
841
- },
842
- {
843
- "epoch": 12.96,
844
- "eval_loss": 1.0765339136123657,
845
- "eval_runtime": 84.1883,
846
- "eval_samples_per_second": 215.541,
847
- "eval_steps_per_second": 26.951,
848
- "step": 90000
849
- },
850
- {
851
- "epoch": 13.17,
852
- "learning_rate": 1.2167026637868972e-06,
853
- "loss": 1.2011,
854
- "step": 91500
855
- },
856
- {
857
- "epoch": 13.17,
858
- "eval_loss": 1.0625150203704834,
859
- "eval_runtime": 84.0684,
860
- "eval_samples_per_second": 215.848,
861
- "eval_steps_per_second": 26.99,
862
- "step": 91500
863
- },
864
- {
865
- "epoch": 13.39,
866
- "learning_rate": 1.072714182865371e-06,
867
- "loss": 1.2015,
868
- "step": 93000
869
- },
870
- {
871
- "epoch": 13.39,
872
- "eval_loss": 1.0583701133728027,
873
- "eval_runtime": 84.3705,
874
- "eval_samples_per_second": 215.075,
875
- "eval_steps_per_second": 26.893,
876
- "step": 93000
877
- },
878
- {
879
- "epoch": 13.61,
880
- "learning_rate": 9.287257019438446e-07,
881
- "loss": 1.1986,
882
- "step": 94500
883
- },
884
- {
885
- "epoch": 13.61,
886
- "eval_loss": 1.0719395875930786,
887
- "eval_runtime": 84.3375,
888
- "eval_samples_per_second": 215.159,
889
- "eval_steps_per_second": 26.904,
890
- "step": 94500
891
- },
892
- {
893
- "epoch": 13.82,
894
- "learning_rate": 7.847372210223183e-07,
895
- "loss": 1.2012,
896
- "step": 96000
897
- },
898
- {
899
- "epoch": 13.82,
900
- "eval_loss": 1.0656870603561401,
901
- "eval_runtime": 84.0312,
902
- "eval_samples_per_second": 215.944,
903
- "eval_steps_per_second": 27.002,
904
- "step": 96000
905
- },
906
- {
907
- "epoch": 14.04,
908
- "learning_rate": 6.40748740100792e-07,
909
- "loss": 1.1983,
910
- "step": 97500
911
- },
912
- {
913
- "epoch": 14.04,
914
- "eval_loss": 1.07068932056427,
915
- "eval_runtime": 84.1653,
916
- "eval_samples_per_second": 215.6,
917
- "eval_steps_per_second": 26.959,
918
- "step": 97500
919
- },
920
- {
921
- "epoch": 14.25,
922
- "learning_rate": 4.967602591792657e-07,
923
- "loss": 1.1982,
924
- "step": 99000
925
- },
926
- {
927
- "epoch": 14.25,
928
- "eval_loss": 1.0600839853286743,
929
- "eval_runtime": 81.6267,
930
- "eval_samples_per_second": 222.305,
931
- "eval_steps_per_second": 27.797,
932
- "step": 99000
933
- },
934
- {
935
- "epoch": 14.47,
936
- "learning_rate": 3.5277177825773936e-07,
937
- "loss": 1.196,
938
- "step": 100500
939
- },
940
- {
941
- "epoch": 14.47,
942
- "eval_loss": 1.055431842803955,
943
- "eval_runtime": 84.0067,
944
- "eval_samples_per_second": 216.007,
945
- "eval_steps_per_second": 27.01,
946
- "step": 100500
947
- },
948
- {
949
- "epoch": 14.69,
950
- "learning_rate": 2.0878329733621312e-07,
951
- "loss": 1.1971,
952
- "step": 102000
953
- },
954
- {
955
- "epoch": 14.69,
956
- "eval_loss": 1.0537773370742798,
957
- "eval_runtime": 84.2231,
958
- "eval_samples_per_second": 215.452,
959
- "eval_steps_per_second": 26.94,
960
- "step": 102000
961
- },
962
- {
963
- "epoch": 14.9,
964
- "learning_rate": 6.479481641468683e-08,
965
- "loss": 1.1954,
966
- "step": 103500
967
- },
968
- {
969
- "epoch": 14.9,
970
- "eval_loss": 1.0612763166427612,
971
- "eval_runtime": 83.7888,
972
- "eval_samples_per_second": 216.568,
973
- "eval_steps_per_second": 27.08,
974
- "step": 103500
975
- }
976
- ],
977
- "max_steps": 104175,
978
- "num_train_epochs": 15,
979
- "total_flos": 1.5507603575881085e+18,
980
- "trial_name": null,
981
- "trial_params": null
982
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef3b1845b37bbb078cf8ca8a9159180a7e5d40d1d9b5ce146385526c2fd6c125
3
- size 3451
 
 
 
 
last-checkpoint/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c293560287e12e9f8d6f988947bc53ce911420ce65ca91ba9813acb4185c488
3
  size 498863417
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fc030a17cebe49f0bf7d940f707ed90b0474de6a2938d06804e0674cd69601
3
  size 498863417
runs/Feb06_10-35-49_turing/events.out.tfevents.1675676161.turing.964098.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a884945b4f71c57927acb7a14ee296e06185a165e27fa3bc7f979f8a5fcb8a39
3
- size 33857
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de672299d86b9671a5799a15a47643c3945a326870280d8528dacd442cd3b2f
3
+ size 34217