dq158 commited on
Commit
7efa135
·
1 Parent(s): 97b3de7

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72e09c96eac9a5a5d12013b41dda13b70836234bc8e79dede1d0e69838b32cd5
3
  size 3132668808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcae599684a4fbbc309d29596a13d1f8e60c8669d4f351f311b7e665d723fa40
3
  size 3132668808
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3e0cc1950821aee72f368717c0c4eaca6964baee54bfdd8fe9937bb25df79ba
3
  size 6265677800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72020de09af1159730a810cc5fea25580a0fc8159def1615cca3b3c80c1487c1
3
  size 6265677800
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:714f24020961e064909455be3a37151a09a776324c46a43c77ff89f66b058427
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40625c746bfa84c1633ce1b215fe2e2c14d62ad791af8011ae10ef5e56dcfb97
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bab7b51078791231733fbc4c23771b030e167e47650cf7b4cbbe893dfeb9092
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a1a678710554b909b7044c4e7143bd849126d09514f3bc102b14b7bf0893c3f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,669 +1,854 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
  "eval_steps": 500,
6
- "global_step": 51022,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
  "learning_rate": 0.0001,
14
- "loss": 3.8798,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.04,
19
- "learning_rate": 9.999617887970705e-05,
20
- "loss": 3.3824,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.06,
25
- "learning_rate": 9.998471610286659e-05,
26
- "loss": 3.3404,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.08,
31
- "learning_rate": 9.996561342150463e-05,
32
- "loss": 3.3427,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.1,
37
- "learning_rate": 9.993887375536685e-05,
38
- "loss": 3.2575,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.12,
43
- "learning_rate": 9.990450119147252e-05,
44
- "loss": 3.2676,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.14,
49
- "learning_rate": 9.98625009834897e-05,
50
- "loss": 3.1225,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.16,
55
- "learning_rate": 9.981287955093226e-05,
56
- "loss": 3.1038,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.18,
61
- "learning_rate": 9.97556444781787e-05,
62
- "loss": 3.132,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.2,
67
- "learning_rate": 9.969080451331299e-05,
68
- "loss": 3.1255,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.22,
73
- "learning_rate": 9.96183695667873e-05,
74
- "loss": 3.1358,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.24,
79
- "learning_rate": 9.95383507099074e-05,
80
- "loss": 3.1162,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.25,
85
- "learning_rate": 9.945076017314044e-05,
86
- "loss": 3.0475,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.27,
91
- "learning_rate": 9.935561134424548e-05,
92
- "loss": 3.0587,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.29,
97
- "learning_rate": 9.925291876622738e-05,
98
- "loss": 3.0822,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 0.31,
103
- "learning_rate": 9.914269813511388e-05,
104
- "loss": 3.0829,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 0.33,
109
- "learning_rate": 9.902496629755661e-05,
110
- "loss": 3.0594,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 0.35,
115
- "learning_rate": 9.88997412482561e-05,
116
- "loss": 3.0949,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 0.37,
121
- "learning_rate": 9.876704212721141e-05,
122
- "loss": 3.0807,
123
  "step": 9500
124
  },
125
  {
126
- "epoch": 0.39,
127
- "learning_rate": 9.862688921679476e-05,
128
- "loss": 2.9289,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 0.41,
133
- "learning_rate": 9.847930393865132e-05,
134
- "loss": 3.0059,
135
  "step": 10500
136
  },
137
  {
138
- "epoch": 0.43,
139
- "learning_rate": 9.832430885042515e-05,
140
- "loss": 3.0391,
141
  "step": 11000
142
  },
143
  {
144
- "epoch": 0.45,
145
- "learning_rate": 9.816192764231132e-05,
146
- "loss": 3.0225,
147
  "step": 11500
148
  },
149
  {
150
- "epoch": 0.47,
151
- "learning_rate": 9.799218513343504e-05,
152
- "loss": 3.0199,
153
  "step": 12000
154
  },
155
  {
156
- "epoch": 0.49,
157
- "learning_rate": 9.781510726805807e-05,
158
- "loss": 3.0201,
159
  "step": 12500
160
  },
161
  {
162
- "epoch": 0.51,
163
- "learning_rate": 9.763072111161344e-05,
164
- "loss": 3.0321,
165
  "step": 13000
166
  },
167
  {
168
- "epoch": 0.53,
169
- "learning_rate": 9.743905484656852e-05,
170
- "loss": 2.8965,
171
  "step": 13500
172
  },
173
  {
174
- "epoch": 0.55,
175
- "learning_rate": 9.724013776811747e-05,
176
- "loss": 3.0016,
177
  "step": 14000
178
  },
179
  {
180
- "epoch": 0.57,
181
- "learning_rate": 9.70340002797037e-05,
182
- "loss": 3.0327,
183
  "step": 14500
184
  },
185
  {
186
- "epoch": 0.59,
187
- "learning_rate": 9.682067388837286e-05,
188
- "loss": 3.036,
189
  "step": 15000
190
  },
191
  {
192
- "epoch": 0.61,
193
- "learning_rate": 9.660019119995702e-05,
194
- "loss": 3.0152,
195
  "step": 15500
196
  },
197
  {
198
- "epoch": 0.63,
199
- "learning_rate": 9.63725859140912e-05,
200
- "loss": 2.9673,
201
  "step": 16000
202
  },
203
  {
204
- "epoch": 0.65,
205
- "learning_rate": 9.613789281906243e-05,
206
- "loss": 3.0032,
207
  "step": 16500
208
  },
209
  {
210
- "epoch": 0.67,
211
- "learning_rate": 9.589614778649267e-05,
212
- "loss": 2.9865,
213
  "step": 17000
214
  },
215
  {
216
- "epoch": 0.69,
217
- "learning_rate": 9.564738776585591e-05,
218
- "loss": 3.0042,
219
  "step": 17500
220
  },
221
  {
222
- "epoch": 0.71,
223
- "learning_rate": 9.539165077883064e-05,
224
- "loss": 2.988,
225
  "step": 18000
226
  },
227
  {
228
- "epoch": 0.73,
229
- "learning_rate": 9.51289759134885e-05,
230
- "loss": 2.9729,
231
  "step": 18500
232
  },
233
  {
234
- "epoch": 0.74,
235
- "learning_rate": 9.485940331831984e-05,
236
- "loss": 2.9924,
237
  "step": 19000
238
  },
239
  {
240
- "epoch": 0.76,
241
- "learning_rate": 9.45829741960972e-05,
242
- "loss": 2.96,
243
  "step": 19500
244
  },
245
  {
246
- "epoch": 0.78,
247
- "learning_rate": 9.429973079757773e-05,
248
- "loss": 2.9702,
249
  "step": 20000
250
  },
251
  {
252
- "epoch": 0.8,
253
- "learning_rate": 9.400971641504533e-05,
254
- "loss": 2.9362,
255
  "step": 20500
256
  },
257
  {
258
- "epoch": 0.82,
259
- "learning_rate": 9.371297537569369e-05,
260
- "loss": 2.9421,
261
  "step": 21000
262
  },
263
  {
264
- "epoch": 0.84,
265
- "learning_rate": 9.340955303485112e-05,
266
- "loss": 2.8557,
267
  "step": 21500
268
  },
269
  {
270
- "epoch": 0.86,
271
- "learning_rate": 9.309949576904817e-05,
272
- "loss": 2.8443,
273
  "step": 22000
274
  },
275
  {
276
- "epoch": 0.88,
277
- "learning_rate": 9.278285096892927e-05,
278
- "loss": 2.8352,
279
  "step": 22500
280
  },
281
  {
282
- "epoch": 0.9,
283
- "learning_rate": 9.245966703200923e-05,
284
- "loss": 2.8187,
285
  "step": 23000
286
  },
287
  {
288
- "epoch": 0.92,
289
- "learning_rate": 9.212999335527607e-05,
290
- "loss": 2.887,
291
  "step": 23500
292
  },
293
  {
294
- "epoch": 0.94,
295
- "learning_rate": 9.179388032764086e-05,
296
- "loss": 2.9268,
297
  "step": 24000
298
  },
299
  {
300
- "epoch": 0.96,
301
- "learning_rate": 9.145137932223598e-05,
302
- "loss": 2.9458,
303
  "step": 24500
304
  },
305
  {
306
- "epoch": 0.98,
307
- "learning_rate": 9.110254268856312e-05,
308
- "loss": 2.8961,
309
  "step": 25000
310
  },
311
  {
312
- "epoch": 1.0,
313
- "learning_rate": 9.074742374449192e-05,
314
- "loss": 2.9421,
315
  "step": 25500
316
  },
317
  {
318
- "epoch": 1.0,
319
- "eval_bleu": 1.0,
320
- "eval_brevity_penalty": 1.0,
321
- "eval_length_ratio": 1.0,
322
- "eval_loss": 2.790889024734497,
323
- "eval_precisions": [
324
- 1.0,
325
- 1.0,
326
- 1.0,
327
- 1.0
328
- ],
329
- "eval_reference_length": 5805056,
330
- "eval_runtime": 9395.1429,
331
- "eval_samples_per_second": 1.207,
332
- "eval_steps_per_second": 0.302,
333
- "eval_translation_length": 5805056,
334
- "step": 25511
335
- },
336
- {
337
- "epoch": 1.02,
338
- "learning_rate": 9.038607676811049e-05,
339
- "loss": 2.6527,
340
  "step": 26000
341
  },
342
  {
343
- "epoch": 1.04,
344
- "learning_rate": 9.00185569894294e-05,
345
- "loss": 2.5876,
346
  "step": 26500
347
  },
348
  {
349
- "epoch": 1.06,
350
- "learning_rate": 8.964492058194002e-05,
351
- "loss": 2.6107,
352
  "step": 27000
353
  },
354
  {
355
- "epoch": 1.08,
356
- "learning_rate": 8.926522465402872e-05,
357
- "loss": 2.6123,
358
  "step": 27500
359
  },
360
  {
361
- "epoch": 1.1,
362
- "learning_rate": 8.887952724024808e-05,
363
- "loss": 2.5845,
364
  "step": 28000
365
  },
366
  {
367
- "epoch": 1.12,
368
- "learning_rate": 8.848788729244675e-05,
369
- "loss": 2.621,
370
  "step": 28500
371
  },
372
  {
373
- "epoch": 1.14,
374
- "learning_rate": 8.809036467075875e-05,
375
- "loss": 2.6317,
376
  "step": 29000
377
  },
378
  {
379
- "epoch": 1.16,
380
- "learning_rate": 8.768702013445438e-05,
381
- "loss": 2.6083,
382
  "step": 29500
383
  },
384
  {
385
- "epoch": 1.18,
386
- "learning_rate": 8.727791533265335e-05,
387
- "loss": 2.6469,
388
  "step": 30000
389
  },
390
  {
391
- "epoch": 1.2,
392
- "learning_rate": 8.686311279490205e-05,
393
- "loss": 2.6186,
394
  "step": 30500
395
  },
396
  {
397
- "epoch": 1.22,
398
- "learning_rate": 8.644267592161625e-05,
399
- "loss": 2.6418,
400
  "step": 31000
401
  },
402
  {
403
- "epoch": 1.23,
404
- "learning_rate": 8.601666897439072e-05,
405
- "loss": 2.6127,
406
  "step": 31500
407
  },
408
  {
409
- "epoch": 1.25,
410
- "learning_rate": 8.55851570661771e-05,
411
- "loss": 2.6585,
412
  "step": 32000
413
  },
414
  {
415
- "epoch": 1.27,
416
- "learning_rate": 8.514820615133171e-05,
417
- "loss": 2.6329,
418
  "step": 32500
419
  },
420
  {
421
- "epoch": 1.29,
422
- "learning_rate": 8.47058830155349e-05,
423
- "loss": 2.6495,
424
  "step": 33000
425
  },
426
  {
427
- "epoch": 1.31,
428
- "learning_rate": 8.425825526558306e-05,
429
- "loss": 2.6859,
430
  "step": 33500
431
  },
432
  {
433
- "epoch": 1.33,
434
- "learning_rate": 8.380539131905538e-05,
435
- "loss": 2.6083,
436
  "step": 34000
437
  },
438
  {
439
- "epoch": 1.35,
440
- "learning_rate": 8.334736039385647e-05,
441
- "loss": 2.6761,
442
  "step": 34500
443
  },
444
  {
445
- "epoch": 1.37,
446
- "learning_rate": 8.288423249763687e-05,
447
- "loss": 2.5866,
448
  "step": 35000
449
  },
450
  {
451
- "epoch": 1.39,
452
- "learning_rate": 8.241607841709266e-05,
453
- "loss": 2.7219,
454
  "step": 35500
455
  },
456
  {
457
- "epoch": 1.41,
458
- "learning_rate": 8.194296970714615e-05,
459
- "loss": 2.6395,
460
  "step": 36000
461
  },
462
  {
463
- "epoch": 1.43,
464
- "learning_rate": 8.146497868000903e-05,
465
- "loss": 2.6553,
466
  "step": 36500
467
  },
468
  {
469
- "epoch": 1.45,
470
- "learning_rate": 8.098217839412985e-05,
471
- "loss": 2.6139,
472
  "step": 37000
473
  },
474
  {
475
- "epoch": 1.47,
476
- "learning_rate": 8.049464264302741e-05,
477
- "loss": 2.6203,
478
  "step": 37500
479
  },
480
  {
481
- "epoch": 1.49,
482
- "learning_rate": 8.000244594401178e-05,
483
- "loss": 2.729,
484
  "step": 38000
485
  },
486
  {
487
- "epoch": 1.51,
488
- "learning_rate": 7.950566352679475e-05,
489
- "loss": 2.607,
490
  "step": 38500
491
  },
492
  {
493
- "epoch": 1.53,
494
- "learning_rate": 7.900437132199135e-05,
495
- "loss": 2.7186,
496
  "step": 39000
497
  },
498
  {
499
- "epoch": 1.55,
500
- "learning_rate": 7.849864594951422e-05,
501
- "loss": 2.6396,
502
  "step": 39500
503
  },
504
  {
505
- "epoch": 1.57,
506
- "learning_rate": 7.798856470686275e-05,
507
- "loss": 2.6618,
508
  "step": 40000
509
  },
510
  {
511
- "epoch": 1.59,
512
- "learning_rate": 7.747420555730837e-05,
513
- "loss": 2.595,
514
  "step": 40500
515
  },
516
  {
517
- "epoch": 1.61,
518
- "learning_rate": 7.695564711797849e-05,
519
- "loss": 2.6412,
520
  "step": 41000
521
  },
522
  {
523
- "epoch": 1.63,
524
- "learning_rate": 7.643296864784011e-05,
525
- "loss": 2.6468,
526
  "step": 41500
527
  },
528
  {
529
- "epoch": 1.65,
530
- "learning_rate": 7.590625003558561e-05,
531
- "loss": 2.5908,
532
  "step": 42000
533
  },
534
  {
535
- "epoch": 1.67,
536
- "learning_rate": 7.53755717874221e-05,
537
- "loss": 2.6129,
538
  "step": 42500
539
  },
540
  {
541
- "epoch": 1.69,
542
- "learning_rate": 7.484101501476649e-05,
543
- "loss": 2.6015,
544
  "step": 43000
545
  },
546
  {
547
- "epoch": 1.71,
548
- "learning_rate": 7.430266142184806e-05,
549
- "loss": 2.6249,
550
  "step": 43500
551
  },
552
  {
553
- "epoch": 1.72,
554
- "learning_rate": 7.376059329322036e-05,
555
- "loss": 2.6027,
556
  "step": 44000
557
  },
558
  {
559
- "epoch": 1.74,
560
- "learning_rate": 7.321489348118445e-05,
561
- "loss": 2.5851,
562
  "step": 44500
563
  },
564
  {
565
- "epoch": 1.76,
566
- "learning_rate": 7.266564539312535e-05,
567
- "loss": 2.6235,
568
  "step": 45000
569
  },
570
  {
571
- "epoch": 1.78,
572
- "learning_rate": 7.211293297876365e-05,
573
- "loss": 2.6393,
574
  "step": 45500
575
  },
576
  {
577
- "epoch": 1.8,
578
- "learning_rate": 7.155684071732431e-05,
579
- "loss": 2.5699,
580
  "step": 46000
581
  },
582
  {
583
- "epoch": 1.82,
584
- "learning_rate": 7.099745360462426e-05,
585
- "loss": 2.6722,
586
  "step": 46500
587
  },
588
  {
589
- "epoch": 1.84,
590
- "learning_rate": 7.043485714008147e-05,
591
- "loss": 2.6375,
592
  "step": 47000
593
  },
594
  {
595
- "epoch": 1.86,
596
- "learning_rate": 6.98691373136466e-05,
597
- "loss": 2.6718,
598
  "step": 47500
599
  },
600
  {
601
- "epoch": 1.88,
602
- "learning_rate": 6.930038059266004e-05,
603
- "loss": 2.6249,
604
  "step": 48000
605
  },
606
  {
607
- "epoch": 1.9,
608
- "learning_rate": 6.872867390863569e-05,
609
- "loss": 2.6526,
610
  "step": 48500
611
  },
612
  {
613
- "epoch": 1.92,
614
- "learning_rate": 6.815410464397405e-05,
615
- "loss": 2.6139,
616
  "step": 49000
617
  },
618
  {
619
- "epoch": 1.94,
620
- "learning_rate": 6.757676061860619e-05,
621
- "loss": 2.5463,
622
  "step": 49500
623
  },
624
  {
625
- "epoch": 1.96,
626
- "learning_rate": 6.699673007657097e-05,
627
- "loss": 2.6163,
628
  "step": 50000
629
  },
630
  {
631
- "epoch": 1.98,
632
- "learning_rate": 6.641410167252738e-05,
633
- "loss": 2.6282,
634
  "step": 50500
635
  },
636
  {
637
- "epoch": 2.0,
638
- "learning_rate": 6.582896445820412e-05,
639
- "loss": 2.5324,
640
  "step": 51000
641
  },
642
  {
643
- "epoch": 2.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
  "eval_bleu": 1.0,
645
  "eval_brevity_penalty": 1.0,
646
  "eval_length_ratio": 1.0,
647
- "eval_loss": 2.711536407470703,
648
  "eval_precisions": [
649
  1.0,
650
  1.0,
651
  1.0,
652
  1.0
653
  ],
654
- "eval_reference_length": 5805056,
655
- "eval_runtime": 9329.1164,
656
- "eval_samples_per_second": 1.215,
657
- "eval_steps_per_second": 0.304,
658
- "eval_translation_length": 5805056,
659
- "step": 51022
660
  }
661
  ],
662
  "logging_steps": 500,
663
- "max_steps": 127555,
664
  "num_train_epochs": 5,
665
  "save_steps": 500,
666
- "total_flos": 4.703662624808632e+17,
667
  "trial_name": null,
668
  "trial_params": null
669
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 68219,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
  "learning_rate": 0.0001,
14
+ "loss": 4.0519,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.01,
19
+ "learning_rate": 9.999946825617329e-05,
20
+ "loss": 3.6979,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.02,
25
+ "learning_rate": 9.99978730360032e-05,
26
+ "loss": 3.5393,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.03,
31
+ "learning_rate": 9.999521437341967e-05,
32
+ "loss": 3.4397,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.04,
37
+ "learning_rate": 9.999149232497183e-05,
38
+ "loss": 3.4883,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.04,
43
+ "learning_rate": 9.998670696982668e-05,
44
+ "loss": 3.5107,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.05,
49
+ "learning_rate": 9.998085840976759e-05,
50
+ "loss": 3.4742,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.06,
55
+ "learning_rate": 9.997394676919193e-05,
56
+ "loss": 3.3594,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.07,
61
+ "learning_rate": 9.996597219510866e-05,
62
+ "loss": 3.3098,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.07,
67
+ "learning_rate": 9.995693485713496e-05,
68
+ "loss": 3.4248,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.08,
73
+ "learning_rate": 9.994683494749277e-05,
74
+ "loss": 3.3875,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.09,
79
+ "learning_rate": 9.993567268100469e-05,
80
+ "loss": 3.3726,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 0.1,
85
+ "learning_rate": 9.992344829508938e-05,
86
+ "loss": 3.3911,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 0.1,
91
+ "learning_rate": 9.991016204975648e-05,
92
+ "loss": 3.3826,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 0.11,
97
+ "learning_rate": 9.989581422760117e-05,
98
+ "loss": 3.3095,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 0.12,
103
+ "learning_rate": 9.988040513379809e-05,
104
+ "loss": 3.3544,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 0.12,
109
+ "learning_rate": 9.986393509609485e-05,
110
+ "loss": 3.3007,
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 0.13,
115
+ "learning_rate": 9.984640446480509e-05,
116
+ "loss": 3.289,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.14,
121
+ "learning_rate": 9.9827813612801e-05,
122
+ "loss": 3.2892,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 0.15,
127
+ "learning_rate": 9.98081629355054e-05,
128
+ "loss": 3.3141,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.15,
133
+ "learning_rate": 9.978745285088338e-05,
134
+ "loss": 3.3381,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 0.16,
139
+ "learning_rate": 9.97656837994333e-05,
140
+ "loss": 3.2098,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 0.17,
145
+ "learning_rate": 9.974285624417751e-05,
146
+ "loss": 3.3139,
147
  "step": 11500
148
  },
149
  {
150
+ "epoch": 0.18,
151
+ "learning_rate": 9.971897067065248e-05,
152
+ "loss": 3.2457,
153
  "step": 12000
154
  },
155
  {
156
+ "epoch": 0.18,
157
+ "learning_rate": 9.969402758689845e-05,
158
+ "loss": 3.1359,
159
  "step": 12500
160
  },
161
  {
162
+ "epoch": 0.19,
163
+ "learning_rate": 9.966802752344868e-05,
164
+ "loss": 3.2499,
165
  "step": 13000
166
  },
167
  {
168
+ "epoch": 0.2,
169
+ "learning_rate": 9.964097103331806e-05,
170
+ "loss": 3.3144,
171
  "step": 13500
172
  },
173
  {
174
+ "epoch": 0.21,
175
+ "learning_rate": 9.961285869199149e-05,
176
+ "loss": 3.3416,
177
  "step": 14000
178
  },
179
  {
180
+ "epoch": 0.21,
181
+ "learning_rate": 9.95836910974115e-05,
182
+ "loss": 3.2466,
183
  "step": 14500
184
  },
185
  {
186
+ "epoch": 0.22,
187
+ "learning_rate": 9.955346886996564e-05,
188
+ "loss": 3.2055,
189
  "step": 15000
190
  },
191
  {
192
+ "epoch": 0.23,
193
+ "learning_rate": 9.952219265247323e-05,
194
+ "loss": 3.1853,
195
  "step": 15500
196
  },
197
  {
198
+ "epoch": 0.23,
199
+ "learning_rate": 9.948986311017168e-05,
200
+ "loss": 3.261,
201
  "step": 16000
202
  },
203
  {
204
+ "epoch": 0.24,
205
+ "learning_rate": 9.945648093070237e-05,
206
+ "loss": 3.1393,
207
  "step": 16500
208
  },
209
  {
210
+ "epoch": 0.25,
211
+ "learning_rate": 9.942204682409603e-05,
212
+ "loss": 3.3337,
213
  "step": 17000
214
  },
215
  {
216
+ "epoch": 0.26,
217
+ "learning_rate": 9.938656152275759e-05,
218
+ "loss": 3.1791,
219
  "step": 17500
220
  },
221
  {
222
+ "epoch": 0.26,
223
+ "learning_rate": 9.935002578145065e-05,
224
+ "loss": 3.1644,
225
  "step": 18000
226
  },
227
  {
228
+ "epoch": 0.27,
229
+ "learning_rate": 9.931244037728141e-05,
230
+ "loss": 3.2369,
231
  "step": 18500
232
  },
233
  {
234
+ "epoch": 0.28,
235
+ "learning_rate": 9.927380610968213e-05,
236
+ "loss": 3.2139,
237
  "step": 19000
238
  },
239
  {
240
+ "epoch": 0.29,
241
+ "learning_rate": 9.923412380039415e-05,
242
+ "loss": 3.1762,
243
  "step": 19500
244
  },
245
  {
246
+ "epoch": 0.29,
247
+ "learning_rate": 9.919339429345039e-05,
248
+ "loss": 3.2732,
249
  "step": 20000
250
  },
251
  {
252
+ "epoch": 0.3,
253
+ "learning_rate": 9.915161845515739e-05,
254
+ "loss": 3.197,
255
  "step": 20500
256
  },
257
  {
258
+ "epoch": 0.31,
259
+ "learning_rate": 9.910879717407693e-05,
260
+ "loss": 3.1034,
261
  "step": 21000
262
  },
263
  {
264
+ "epoch": 0.32,
265
+ "learning_rate": 9.906493136100707e-05,
266
+ "loss": 3.3108,
267
  "step": 21500
268
  },
269
  {
270
+ "epoch": 0.32,
271
+ "learning_rate": 9.902002194896285e-05,
272
+ "loss": 3.1394,
273
  "step": 22000
274
  },
275
  {
276
+ "epoch": 0.33,
277
+ "learning_rate": 9.897406989315634e-05,
278
+ "loss": 3.2385,
279
  "step": 22500
280
  },
281
  {
282
+ "epoch": 0.34,
283
+ "learning_rate": 9.892707617097645e-05,
284
+ "loss": 3.1855,
285
  "step": 23000
286
  },
287
  {
288
+ "epoch": 0.34,
289
+ "learning_rate": 9.887904178196804e-05,
290
+ "loss": 3.2088,
291
  "step": 23500
292
  },
293
  {
294
+ "epoch": 0.35,
295
+ "learning_rate": 9.882996774781066e-05,
296
+ "loss": 3.2111,
297
  "step": 24000
298
  },
299
  {
300
+ "epoch": 0.36,
301
+ "learning_rate": 9.877985511229697e-05,
302
+ "loss": 3.175,
303
  "step": 24500
304
  },
305
  {
306
+ "epoch": 0.37,
307
+ "learning_rate": 9.87287049413103e-05,
308
+ "loss": 3.1891,
309
  "step": 25000
310
  },
311
  {
312
+ "epoch": 0.37,
313
+ "learning_rate": 9.867651832280217e-05,
314
+ "loss": 3.182,
315
  "step": 25500
316
  },
317
  {
318
+ "epoch": 0.38,
319
+ "learning_rate": 9.86232963667691e-05,
320
+ "loss": 3.3346,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  "step": 26000
322
  },
323
  {
324
+ "epoch": 0.39,
325
+ "learning_rate": 9.85690402052289e-05,
326
+ "loss": 3.2496,
327
  "step": 26500
328
  },
329
  {
330
+ "epoch": 0.4,
331
+ "learning_rate": 9.851375099219677e-05,
332
+ "loss": 3.222,
333
  "step": 27000
334
  },
335
  {
336
+ "epoch": 0.4,
337
+ "learning_rate": 9.845742990366059e-05,
338
+ "loss": 3.2083,
339
  "step": 27500
340
  },
341
  {
342
+ "epoch": 0.41,
343
+ "learning_rate": 9.840007813755603e-05,
344
+ "loss": 3.233,
345
  "step": 28000
346
  },
347
  {
348
+ "epoch": 0.42,
349
+ "learning_rate": 9.834169691374098e-05,
350
+ "loss": 3.1732,
351
  "step": 28500
352
  },
353
  {
354
+ "epoch": 0.43,
355
+ "learning_rate": 9.828228747396964e-05,
356
+ "loss": 3.1922,
357
  "step": 29000
358
  },
359
  {
360
+ "epoch": 0.43,
361
+ "learning_rate": 9.822185108186616e-05,
362
+ "loss": 3.1923,
363
  "step": 29500
364
  },
365
  {
366
+ "epoch": 0.44,
367
+ "learning_rate": 9.816038902289763e-05,
368
+ "loss": 3.2879,
369
  "step": 30000
370
  },
371
  {
372
+ "epoch": 0.45,
373
+ "learning_rate": 9.809790260434693e-05,
374
+ "loss": 3.2816,
375
  "step": 30500
376
  },
377
  {
378
+ "epoch": 0.45,
379
+ "learning_rate": 9.803439315528469e-05,
380
+ "loss": 3.2343,
381
  "step": 31000
382
  },
383
  {
384
+ "epoch": 0.46,
385
+ "learning_rate": 9.796986202654124e-05,
386
+ "loss": 3.1372,
387
  "step": 31500
388
  },
389
  {
390
+ "epoch": 0.47,
391
+ "learning_rate": 9.790431059067775e-05,
392
+ "loss": 3.2111,
393
  "step": 32000
394
  },
395
  {
396
+ "epoch": 0.48,
397
+ "learning_rate": 9.783774024195709e-05,
398
+ "loss": 3.1488,
399
  "step": 32500
400
  },
401
  {
402
+ "epoch": 0.48,
403
+ "learning_rate": 9.77701523963141e-05,
404
+ "loss": 3.1637,
405
  "step": 33000
406
  },
407
  {
408
+ "epoch": 0.49,
409
+ "learning_rate": 9.77015484913256e-05,
410
+ "loss": 3.1593,
411
  "step": 33500
412
  },
413
  {
414
+ "epoch": 0.5,
415
+ "learning_rate": 9.763192998617969e-05,
416
+ "loss": 3.1399,
417
  "step": 34000
418
  },
419
  {
420
+ "epoch": 0.51,
421
+ "learning_rate": 9.75612983616448e-05,
422
+ "loss": 3.0628,
423
  "step": 34500
424
  },
425
  {
426
+ "epoch": 0.51,
427
+ "learning_rate": 9.748965512003812e-05,
428
+ "loss": 3.2634,
429
  "step": 35000
430
  },
431
  {
432
+ "epoch": 0.52,
433
+ "learning_rate": 9.741700178519374e-05,
434
+ "loss": 3.1562,
435
  "step": 35500
436
  },
437
  {
438
+ "epoch": 0.53,
439
+ "learning_rate": 9.734333990243012e-05,
440
+ "loss": 3.2411,
441
  "step": 36000
442
  },
443
  {
444
+ "epoch": 0.54,
445
+ "learning_rate": 9.726867103851735e-05,
446
+ "loss": 3.1336,
447
  "step": 36500
448
  },
449
  {
450
+ "epoch": 0.54,
451
+ "learning_rate": 9.719299678164369e-05,
452
+ "loss": 3.1557,
453
  "step": 37000
454
  },
455
  {
456
+ "epoch": 0.55,
457
+ "learning_rate": 9.711631874138192e-05,
458
+ "loss": 3.1368,
459
  "step": 37500
460
  },
461
  {
462
+ "epoch": 0.56,
463
+ "learning_rate": 9.703863854865502e-05,
464
+ "loss": 3.1296,
465
  "step": 38000
466
  },
467
  {
468
+ "epoch": 0.56,
469
+ "learning_rate": 9.69599578557015e-05,
470
+ "loss": 3.1308,
471
  "step": 38500
472
  },
473
  {
474
+ "epoch": 0.57,
475
+ "learning_rate": 9.688027833604027e-05,
476
+ "loss": 3.1526,
477
  "step": 39000
478
  },
479
  {
480
+ "epoch": 0.58,
481
+ "learning_rate": 9.679960168443507e-05,
482
+ "loss": 3.2699,
483
  "step": 39500
484
  },
485
  {
486
+ "epoch": 0.59,
487
+ "learning_rate": 9.671792961685831e-05,
488
+ "loss": 3.0819,
489
  "step": 40000
490
  },
491
  {
492
+ "epoch": 0.59,
493
+ "learning_rate": 9.663526387045473e-05,
494
+ "loss": 3.0947,
495
  "step": 40500
496
  },
497
  {
498
+ "epoch": 0.6,
499
+ "learning_rate": 9.655160620350434e-05,
500
+ "loss": 3.1903,
501
  "step": 41000
502
  },
503
  {
504
+ "epoch": 0.61,
505
+ "learning_rate": 9.646695839538503e-05,
506
+ "loss": 3.0587,
507
  "step": 41500
508
  },
509
  {
510
+ "epoch": 0.62,
511
+ "learning_rate": 9.638132224653482e-05,
512
+ "loss": 3.1778,
513
  "step": 42000
514
  },
515
  {
516
+ "epoch": 0.62,
517
+ "learning_rate": 9.629469957841341e-05,
518
+ "loss": 3.0616,
519
  "step": 42500
520
  },
521
  {
522
+ "epoch": 0.63,
523
+ "learning_rate": 9.62070922334636e-05,
524
+ "loss": 3.1816,
525
  "step": 43000
526
  },
527
  {
528
+ "epoch": 0.64,
529
+ "learning_rate": 9.611850207507196e-05,
530
+ "loss": 3.1625,
531
  "step": 43500
532
  },
533
  {
534
+ "epoch": 0.64,
535
+ "learning_rate": 9.602893098752929e-05,
536
+ "loss": 3.2755,
537
  "step": 44000
538
  },
539
  {
540
+ "epoch": 0.65,
541
+ "learning_rate": 9.59383808759905e-05,
542
+ "loss": 3.1046,
543
  "step": 44500
544
  },
545
  {
546
+ "epoch": 0.66,
547
+ "learning_rate": 9.584685366643411e-05,
548
+ "loss": 3.176,
549
  "step": 45000
550
  },
551
  {
552
+ "epoch": 0.67,
553
+ "learning_rate": 9.575435130562125e-05,
554
+ "loss": 3.1618,
555
  "step": 45500
556
  },
557
  {
558
+ "epoch": 0.67,
559
+ "learning_rate": 9.566087576105431e-05,
560
+ "loss": 3.2012,
561
  "step": 46000
562
  },
563
  {
564
+ "epoch": 0.68,
565
+ "learning_rate": 9.556642902093503e-05,
566
+ "loss": 3.2124,
567
  "step": 46500
568
  },
569
  {
570
+ "epoch": 0.69,
571
+ "learning_rate": 9.547101309412226e-05,
572
+ "loss": 3.1282,
573
  "step": 47000
574
  },
575
  {
576
+ "epoch": 0.7,
577
+ "learning_rate": 9.53746300100892e-05,
578
+ "loss": 3.1725,
579
  "step": 47500
580
  },
581
  {
582
+ "epoch": 0.7,
583
+ "learning_rate": 9.527728181888023e-05,
584
+ "loss": 3.1428,
585
  "step": 48000
586
  },
587
  {
588
+ "epoch": 0.71,
589
+ "learning_rate": 9.517897059106737e-05,
590
+ "loss": 3.1074,
591
  "step": 48500
592
  },
593
  {
594
+ "epoch": 0.72,
595
+ "learning_rate": 9.507969841770614e-05,
596
+ "loss": 3.2534,
597
  "step": 49000
598
  },
599
  {
600
+ "epoch": 0.73,
601
+ "learning_rate": 9.497946741029116e-05,
602
+ "loss": 3.1394,
603
  "step": 49500
604
  },
605
  {
606
+ "epoch": 0.73,
607
+ "learning_rate": 9.48782797007112e-05,
608
+ "loss": 3.1688,
609
  "step": 50000
610
  },
611
  {
612
+ "epoch": 0.74,
613
+ "learning_rate": 9.477613744120386e-05,
614
+ "loss": 3.2439,
615
  "step": 50500
616
  },
617
  {
618
+ "epoch": 0.75,
619
+ "learning_rate": 9.467304280430977e-05,
620
+ "loss": 3.0768,
621
  "step": 51000
622
  },
623
  {
624
+ "epoch": 0.75,
625
+ "learning_rate": 9.456899798282642e-05,
626
+ "loss": 3.082,
627
+ "step": 51500
628
+ },
629
+ {
630
+ "epoch": 0.76,
631
+ "learning_rate": 9.446400518976144e-05,
632
+ "loss": 3.1203,
633
+ "step": 52000
634
+ },
635
+ {
636
+ "epoch": 0.77,
637
+ "learning_rate": 9.435806665828566e-05,
638
+ "loss": 3.1243,
639
+ "step": 52500
640
+ },
641
+ {
642
+ "epoch": 0.78,
643
+ "learning_rate": 9.425118464168545e-05,
644
+ "loss": 3.1732,
645
+ "step": 53000
646
+ },
647
+ {
648
+ "epoch": 0.78,
649
+ "learning_rate": 9.414336141331491e-05,
650
+ "loss": 3.118,
651
+ "step": 53500
652
+ },
653
+ {
654
+ "epoch": 0.79,
655
+ "learning_rate": 9.403459926654748e-05,
656
+ "loss": 3.1597,
657
+ "step": 54000
658
+ },
659
+ {
660
+ "epoch": 0.8,
661
+ "learning_rate": 9.392490051472718e-05,
662
+ "loss": 3.1854,
663
+ "step": 54500
664
+ },
665
+ {
666
+ "epoch": 0.81,
667
+ "learning_rate": 9.381426749111936e-05,
668
+ "loss": 3.1857,
669
+ "step": 55000
670
+ },
671
+ {
672
+ "epoch": 0.81,
673
+ "learning_rate": 9.370270254886115e-05,
674
+ "loss": 3.1094,
675
+ "step": 55500
676
+ },
677
+ {
678
+ "epoch": 0.82,
679
+ "learning_rate": 9.359020806091126e-05,
680
+ "loss": 3.1459,
681
+ "step": 56000
682
+ },
683
+ {
684
+ "epoch": 0.83,
685
+ "learning_rate": 9.347678641999973e-05,
686
+ "loss": 3.063,
687
+ "step": 56500
688
+ },
689
+ {
690
+ "epoch": 0.84,
691
+ "learning_rate": 9.336244003857682e-05,
692
+ "loss": 3.0853,
693
+ "step": 57000
694
+ },
695
+ {
696
+ "epoch": 0.84,
697
+ "learning_rate": 9.324717134876182e-05,
698
+ "loss": 3.1004,
699
+ "step": 57500
700
+ },
701
+ {
702
+ "epoch": 0.85,
703
+ "learning_rate": 9.313098280229133e-05,
704
+ "loss": 3.0624,
705
+ "step": 58000
706
+ },
707
+ {
708
+ "epoch": 0.86,
709
+ "learning_rate": 9.301387687046704e-05,
710
+ "loss": 3.1182,
711
+ "step": 58500
712
+ },
713
+ {
714
+ "epoch": 0.86,
715
+ "learning_rate": 9.289585604410317e-05,
716
+ "loss": 3.0812,
717
+ "step": 59000
718
+ },
719
+ {
720
+ "epoch": 0.87,
721
+ "learning_rate": 9.277692283347357e-05,
722
+ "loss": 3.1594,
723
+ "step": 59500
724
+ },
725
+ {
726
+ "epoch": 0.88,
727
+ "learning_rate": 9.265707976825829e-05,
728
+ "loss": 3.0691,
729
+ "step": 60000
730
+ },
731
+ {
732
+ "epoch": 0.89,
733
+ "learning_rate": 9.253632939748968e-05,
734
+ "loss": 3.0989,
735
+ "step": 60500
736
+ },
737
+ {
738
+ "epoch": 0.89,
739
+ "learning_rate": 9.241467428949837e-05,
740
+ "loss": 3.1739,
741
+ "step": 61000
742
+ },
743
+ {
744
+ "epoch": 0.9,
745
+ "learning_rate": 9.229211703185842e-05,
746
+ "loss": 3.0593,
747
+ "step": 61500
748
+ },
749
+ {
750
+ "epoch": 0.91,
751
+ "learning_rate": 9.216866023133246e-05,
752
+ "loss": 3.0508,
753
+ "step": 62000
754
+ },
755
+ {
756
+ "epoch": 0.92,
757
+ "learning_rate": 9.204430651381613e-05,
758
+ "loss": 3.1162,
759
+ "step": 62500
760
+ },
761
+ {
762
+ "epoch": 0.92,
763
+ "learning_rate": 9.191905852428232e-05,
764
+ "loss": 3.1316,
765
+ "step": 63000
766
+ },
767
+ {
768
+ "epoch": 0.93,
769
+ "learning_rate": 9.179291892672484e-05,
770
+ "loss": 3.0565,
771
+ "step": 63500
772
+ },
773
+ {
774
+ "epoch": 0.94,
775
+ "learning_rate": 9.166589040410175e-05,
776
+ "loss": 3.1502,
777
+ "step": 64000
778
+ },
779
+ {
780
+ "epoch": 0.95,
781
+ "learning_rate": 9.153797565827839e-05,
782
+ "loss": 3.1613,
783
+ "step": 64500
784
+ },
785
+ {
786
+ "epoch": 0.95,
787
+ "learning_rate": 9.140917740996979e-05,
788
+ "loss": 2.9902,
789
+ "step": 65000
790
+ },
791
+ {
792
+ "epoch": 0.96,
793
+ "learning_rate": 9.127949839868292e-05,
794
+ "loss": 3.0026,
795
+ "step": 65500
796
+ },
797
+ {
798
+ "epoch": 0.97,
799
+ "learning_rate": 9.114894138265832e-05,
800
+ "loss": 3.1636,
801
+ "step": 66000
802
+ },
803
+ {
804
+ "epoch": 0.97,
805
+ "learning_rate": 9.101750913881147e-05,
806
+ "loss": 3.1233,
807
+ "step": 66500
808
+ },
809
+ {
810
+ "epoch": 0.98,
811
+ "learning_rate": 9.088520446267374e-05,
812
+ "loss": 3.0781,
813
+ "step": 67000
814
+ },
815
+ {
816
+ "epoch": 0.99,
817
+ "learning_rate": 9.075203016833295e-05,
818
+ "loss": 3.0872,
819
+ "step": 67500
820
+ },
821
+ {
822
+ "epoch": 1.0,
823
+ "learning_rate": 9.061798908837341e-05,
824
+ "loss": 3.1095,
825
+ "step": 68000
826
+ },
827
+ {
828
+ "epoch": 1.0,
829
  "eval_bleu": 1.0,
830
  "eval_brevity_penalty": 1.0,
831
  "eval_length_ratio": 1.0,
832
+ "eval_loss": 2.9751689434051514,
833
  "eval_precisions": [
834
  1.0,
835
  1.0,
836
  1.0,
837
  1.0
838
  ],
839
+ "eval_reference_length": 7761920,
840
+ "eval_runtime": 15377.8782,
841
+ "eval_samples_per_second": 0.986,
842
+ "eval_steps_per_second": 0.493,
843
+ "eval_translation_length": 7761920,
844
+ "step": 68219
845
  }
846
  ],
847
  "logging_steps": 500,
848
+ "max_steps": 341095,
849
  "num_train_epochs": 5,
850
  "save_steps": 500,
851
+ "total_flos": 3.144579296777994e+17,
852
  "trial_name": null,
853
  "trial_params": null
854
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c32a9c65221d103b02faab497de247bbdfc7b9598acc0b6597ea2949b830722
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a742f70b0846e59a06963ff7344d674f0f22eef8791af5874a171f202b5ca21
3
  size 4728