CocoRoF commited on
Commit
58cdf39
·
verified ·
1 Parent(s): 7470cef

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79f312030784e0c4c1ac477ec8be9baccaf31bb453a5f1f0ba47bae3a11d6736
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3eec04603345e3344b56349bbf3c8f2ab8c2870fb856bc03b802e4a9b84023a
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9f0413ff5d7932d1e3cf4853d666b874dcafb75ec0e3ed0aec08553a6ec61db
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e596f723566f73180179176318fe50c96148d6e69a806bcd5b46bd82b7f1cd4e
3
  size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68d8c53b42f31b6c7313464394859eb2afcdd967a1a813faffd1e0be60216485
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b2a49e94ec437b7eac846837816e3fa765a1213db3ca835c9332ec400042339
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,986 +1,720 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.999909559555033,
5
  "eval_steps": 500,
6
- "global_step": 691,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.007235235597359139,
13
- "grad_norm": 152.25,
14
  "learning_rate": 7.142857142857143e-07,
15
- "loss": 124.6481,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.014470471194718278,
20
- "grad_norm": 150.625,
21
  "learning_rate": 1.4285714285714286e-06,
22
- "loss": 119.964,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.021705706792077416,
27
- "grad_norm": 150.375,
28
  "learning_rate": 2.1428571428571427e-06,
29
- "loss": 118.5022,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.028940942389436557,
34
- "grad_norm": 168.125,
35
  "learning_rate": 2.8571428571428573e-06,
36
- "loss": 116.7498,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0361761779867957,
41
- "grad_norm": 143.375,
42
  "learning_rate": 3.5714285714285718e-06,
43
- "loss": 114.4503,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.04341141358415483,
48
- "grad_norm": 135.125,
49
  "learning_rate": 4.2857142857142855e-06,
50
- "loss": 114.6372,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.05064664918151397,
55
- "grad_norm": 149.875,
56
  "learning_rate": 5e-06,
57
- "loss": 111.9492,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.05788188477887311,
62
- "grad_norm": 141.75,
63
  "learning_rate": 5.7142857142857145e-06,
64
- "loss": 112.9367,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.06511712037623225,
69
- "grad_norm": 159.0,
70
  "learning_rate": 6.4285714285714295e-06,
71
- "loss": 113.2503,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.0723523559735914,
76
- "grad_norm": 147.0,
77
  "learning_rate": 7.1428571428571436e-06,
78
- "loss": 110.1401,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.07958759157095052,
83
- "grad_norm": 161.125,
84
  "learning_rate": 7.857142857142858e-06,
85
- "loss": 110.8394,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.08682282716830966,
90
- "grad_norm": 158.125,
91
  "learning_rate": 8.571428571428571e-06,
92
- "loss": 113.1161,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0940580627656688,
97
- "grad_norm": 159.0,
98
  "learning_rate": 9.285714285714288e-06,
99
- "loss": 111.898,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.10129329836302794,
104
- "grad_norm": 143.0,
105
  "learning_rate": 1e-05,
106
- "loss": 112.3912,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.10852853396038709,
111
- "grad_norm": 159.5,
112
  "learning_rate": 9.919484702093398e-06,
113
- "loss": 111.8873,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.11576376955774623,
118
- "grad_norm": 162.875,
119
  "learning_rate": 9.838969404186796e-06,
120
- "loss": 111.136,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.12299900515510537,
125
- "grad_norm": 159.0,
126
  "learning_rate": 9.758454106280194e-06,
127
- "loss": 113.4827,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.1302342407524645,
132
- "grad_norm": 155.625,
133
  "learning_rate": 9.677938808373591e-06,
134
- "loss": 111.4002,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.13746947634982365,
139
- "grad_norm": 167.75,
140
  "learning_rate": 9.59742351046699e-06,
141
- "loss": 111.9107,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.1447047119471828,
146
- "grad_norm": 153.375,
147
  "learning_rate": 9.516908212560388e-06,
148
- "loss": 113.0265,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.15193994754454193,
153
- "grad_norm": 166.0,
154
  "learning_rate": 9.436392914653784e-06,
155
- "loss": 110.3215,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 0.15917518314190104,
160
- "grad_norm": 141.0,
161
  "learning_rate": 9.355877616747183e-06,
162
- "loss": 110.7277,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.16641041873926019,
167
- "grad_norm": 147.5,
168
  "learning_rate": 9.275362318840581e-06,
169
- "loss": 112.1578,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 0.17364565433661933,
174
- "grad_norm": 155.125,
175
  "learning_rate": 9.194847020933978e-06,
176
- "loss": 111.7271,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.18088088993397847,
181
- "grad_norm": 149.25,
182
  "learning_rate": 9.114331723027376e-06,
183
- "loss": 109.4189,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 0.1881161255313376,
188
- "grad_norm": 159.125,
189
  "learning_rate": 9.033816425120775e-06,
190
- "loss": 110.2367,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.19535136112869675,
195
- "grad_norm": 146.75,
196
  "learning_rate": 8.953301127214171e-06,
197
- "loss": 110.2772,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 0.2025865967260559,
202
- "grad_norm": 143.375,
203
  "learning_rate": 8.87278582930757e-06,
204
- "loss": 111.3193,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.20982183232341503,
209
- "grad_norm": 153.375,
210
  "learning_rate": 8.792270531400966e-06,
211
- "loss": 108.8334,
212
  "step": 145
213
  },
214
  {
215
  "epoch": 0.21705706792077417,
216
- "grad_norm": 165.375,
217
  "learning_rate": 8.711755233494365e-06,
218
- "loss": 108.6805,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.2242923035181333,
223
- "grad_norm": 152.125,
224
  "learning_rate": 8.631239935587761e-06,
225
- "loss": 109.1355,
226
  "step": 155
227
  },
228
  {
229
  "epoch": 0.23152753911549245,
230
- "grad_norm": 157.375,
231
  "learning_rate": 8.55072463768116e-06,
232
- "loss": 108.0569,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.2387627747128516,
237
- "grad_norm": 146.625,
238
  "learning_rate": 8.470209339774558e-06,
239
- "loss": 109.2913,
240
  "step": 165
241
  },
242
  {
243
  "epoch": 0.24599801031021074,
244
- "grad_norm": 158.375,
245
  "learning_rate": 8.389694041867955e-06,
246
- "loss": 107.8965,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.25323324590756985,
251
- "grad_norm": 153.25,
252
  "learning_rate": 8.309178743961353e-06,
253
- "loss": 107.1813,
254
  "step": 175
255
  },
256
  {
257
  "epoch": 0.260468481504929,
258
- "grad_norm": 145.25,
259
  "learning_rate": 8.228663446054752e-06,
260
- "loss": 107.5406,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.26770371710228813,
265
- "grad_norm": 142.125,
266
  "learning_rate": 8.148148148148148e-06,
267
- "loss": 107.5163,
268
  "step": 185
269
  },
270
  {
271
  "epoch": 0.2749389526996473,
272
- "grad_norm": 151.5,
273
  "learning_rate": 8.067632850241547e-06,
274
- "loss": 107.8385,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.2821741882970064,
279
- "grad_norm": 143.125,
280
  "learning_rate": 7.987117552334945e-06,
281
- "loss": 106.562,
282
  "step": 195
283
  },
284
  {
285
  "epoch": 0.2894094238943656,
286
- "grad_norm": 147.75,
287
  "learning_rate": 7.906602254428342e-06,
288
- "loss": 104.995,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.2966446594917247,
293
- "grad_norm": 151.375,
294
  "learning_rate": 7.82608695652174e-06,
295
- "loss": 106.541,
296
  "step": 205
297
  },
298
  {
299
  "epoch": 0.30387989508908386,
300
- "grad_norm": 153.25,
301
  "learning_rate": 7.745571658615137e-06,
302
- "loss": 106.8991,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.311115130686443,
307
- "grad_norm": 160.25,
308
  "learning_rate": 7.665056360708535e-06,
309
- "loss": 104.9197,
310
  "step": 215
311
  },
312
  {
313
  "epoch": 0.3183503662838021,
314
- "grad_norm": 152.875,
315
  "learning_rate": 7.584541062801934e-06,
316
- "loss": 105.1669,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.32558560188116126,
321
- "grad_norm": 150.875,
322
  "learning_rate": 7.504025764895331e-06,
323
- "loss": 103.8021,
324
  "step": 225
325
  },
326
  {
327
  "epoch": 0.33282083747852037,
328
- "grad_norm": 135.125,
329
  "learning_rate": 7.423510466988728e-06,
330
- "loss": 104.6592,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.34005607307587954,
335
- "grad_norm": 145.5,
336
  "learning_rate": 7.342995169082127e-06,
337
- "loss": 103.5859,
338
  "step": 235
339
  },
340
  {
341
  "epoch": 0.34729130867323865,
342
- "grad_norm": 148.875,
343
  "learning_rate": 7.262479871175524e-06,
344
- "loss": 104.8537,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.3545265442705978,
349
- "grad_norm": 149.75,
350
  "learning_rate": 7.181964573268921e-06,
351
- "loss": 104.8267,
352
  "step": 245
353
  },
354
  {
355
  "epoch": 0.36176177986795693,
356
- "grad_norm": 162.625,
357
  "learning_rate": 7.10144927536232e-06,
358
- "loss": 104.704,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.3689970154653161,
363
- "grad_norm": 143.375,
364
  "learning_rate": 7.020933977455717e-06,
365
- "loss": 105.0362,
366
  "step": 255
367
  },
368
  {
369
  "epoch": 0.3762322510626752,
370
- "grad_norm": 148.375,
371
  "learning_rate": 6.940418679549115e-06,
372
- "loss": 102.2345,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.3834674866600344,
377
- "grad_norm": 152.375,
378
  "learning_rate": 6.859903381642513e-06,
379
- "loss": 105.5086,
380
  "step": 265
381
  },
382
  {
383
  "epoch": 0.3907027222573935,
384
- "grad_norm": 140.125,
385
  "learning_rate": 6.779388083735911e-06,
386
- "loss": 103.5476,
387
  "step": 270
388
  },
389
  {
390
  "epoch": 0.39793795785475267,
391
- "grad_norm": 148.375,
392
  "learning_rate": 6.698872785829308e-06,
393
- "loss": 103.0274,
394
  "step": 275
395
  },
396
  {
397
  "epoch": 0.4051731934521118,
398
- "grad_norm": 151.0,
399
  "learning_rate": 6.6183574879227065e-06,
400
- "loss": 103.2487,
401
  "step": 280
402
  },
403
  {
404
  "epoch": 0.41240842904947095,
405
- "grad_norm": 143.5,
406
  "learning_rate": 6.537842190016104e-06,
407
- "loss": 102.4844,
408
  "step": 285
409
  },
410
  {
411
  "epoch": 0.41964366464683006,
412
- "grad_norm": 139.625,
413
  "learning_rate": 6.457326892109501e-06,
414
- "loss": 102.6165,
415
  "step": 290
416
  },
417
  {
418
  "epoch": 0.4268789002441892,
419
- "grad_norm": 135.0,
420
  "learning_rate": 6.376811594202898e-06,
421
- "loss": 101.8949,
422
  "step": 295
423
  },
424
  {
425
  "epoch": 0.43411413584154834,
426
- "grad_norm": 159.125,
427
  "learning_rate": 6.296296296296297e-06,
428
- "loss": 101.1282,
429
  "step": 300
430
  },
431
  {
432
  "epoch": 0.44134937143890746,
433
- "grad_norm": 146.125,
434
  "learning_rate": 6.215780998389694e-06,
435
- "loss": 102.0691,
436
  "step": 305
437
  },
438
  {
439
  "epoch": 0.4485846070362666,
440
- "grad_norm": 145.625,
441
  "learning_rate": 6.135265700483092e-06,
442
- "loss": 101.359,
443
  "step": 310
444
  },
445
  {
446
  "epoch": 0.45581984263362574,
447
- "grad_norm": 148.5,
448
  "learning_rate": 6.05475040257649e-06,
449
- "loss": 101.5264,
450
  "step": 315
451
  },
452
  {
453
  "epoch": 0.4630550782309849,
454
- "grad_norm": 139.375,
455
  "learning_rate": 5.974235104669888e-06,
456
- "loss": 99.743,
457
  "step": 320
458
  },
459
  {
460
  "epoch": 0.470290313828344,
461
- "grad_norm": 145.0,
462
  "learning_rate": 5.893719806763285e-06,
463
- "loss": 103.0054,
464
  "step": 325
465
  },
466
  {
467
  "epoch": 0.4775255494257032,
468
- "grad_norm": 155.875,
469
  "learning_rate": 5.8132045088566835e-06,
470
- "loss": 99.6074,
471
  "step": 330
472
  },
473
  {
474
  "epoch": 0.4847607850230623,
475
- "grad_norm": 157.25,
476
  "learning_rate": 5.732689210950081e-06,
477
- "loss": 100.957,
478
  "step": 335
479
  },
480
  {
481
  "epoch": 0.49199602062042147,
482
- "grad_norm": 145.125,
483
  "learning_rate": 5.652173913043479e-06,
484
- "loss": 100.6761,
485
  "step": 340
486
  },
487
  {
488
  "epoch": 0.4992312562177806,
489
- "grad_norm": 144.5,
490
  "learning_rate": 5.571658615136877e-06,
491
- "loss": 99.3499,
492
  "step": 345
493
  },
494
  {
495
  "epoch": 0.5064664918151397,
496
- "grad_norm": 165.25,
497
  "learning_rate": 5.4911433172302745e-06,
498
- "loss": 101.0515,
499
  "step": 350
500
  },
501
  {
502
  "epoch": 0.5137017274124989,
503
- "grad_norm": 140.625,
504
  "learning_rate": 5.410628019323671e-06,
505
- "loss": 100.2075,
506
  "step": 355
507
  },
508
  {
509
  "epoch": 0.520936963009858,
510
- "grad_norm": 148.625,
511
  "learning_rate": 5.3301127214170704e-06,
512
- "loss": 101.0406,
513
  "step": 360
514
  },
515
  {
516
  "epoch": 0.5281721986072172,
517
- "grad_norm": 137.625,
518
  "learning_rate": 5.249597423510467e-06,
519
- "loss": 100.4312,
520
  "step": 365
521
  },
522
  {
523
  "epoch": 0.5354074342045763,
524
- "grad_norm": 143.25,
525
  "learning_rate": 5.169082125603865e-06,
526
- "loss": 98.7602,
527
  "step": 370
528
  },
529
  {
530
  "epoch": 0.5426426698019354,
531
- "grad_norm": 145.625,
532
  "learning_rate": 5.088566827697263e-06,
533
- "loss": 99.5661,
534
  "step": 375
535
  },
536
  {
537
  "epoch": 0.5498779053992946,
538
- "grad_norm": 130.875,
539
  "learning_rate": 5.0080515297906606e-06,
540
- "loss": 98.5447,
541
  "step": 380
542
  },
543
  {
544
  "epoch": 0.5571131409966537,
545
- "grad_norm": 149.25,
546
  "learning_rate": 4.927536231884059e-06,
547
- "loss": 98.0122,
548
  "step": 385
549
  },
550
  {
551
  "epoch": 0.5643483765940128,
552
- "grad_norm": 139.625,
553
  "learning_rate": 4.847020933977456e-06,
554
- "loss": 99.2832,
555
  "step": 390
556
  },
557
  {
558
  "epoch": 0.571583612191372,
559
- "grad_norm": 151.25,
560
  "learning_rate": 4.766505636070854e-06,
561
- "loss": 98.1141,
562
  "step": 395
563
  },
564
  {
565
  "epoch": 0.5788188477887312,
566
- "grad_norm": 153.75,
567
  "learning_rate": 4.6859903381642516e-06,
568
- "loss": 96.6736,
569
  "step": 400
570
  },
571
  {
572
  "epoch": 0.5860540833860902,
573
- "grad_norm": 153.375,
574
  "learning_rate": 4.605475040257649e-06,
575
- "loss": 98.9147,
576
  "step": 405
577
  },
578
  {
579
  "epoch": 0.5932893189834494,
580
- "grad_norm": 135.0,
581
  "learning_rate": 4.5249597423510475e-06,
582
- "loss": 97.6523,
583
  "step": 410
584
  },
585
  {
586
  "epoch": 0.6005245545808086,
587
- "grad_norm": 140.375,
588
  "learning_rate": 4.444444444444444e-06,
589
- "loss": 96.3539,
590
  "step": 415
591
  },
592
  {
593
  "epoch": 0.6077597901781677,
594
- "grad_norm": 143.125,
595
  "learning_rate": 4.3639291465378425e-06,
596
- "loss": 99.4855,
597
  "step": 420
598
  },
599
  {
600
  "epoch": 0.6149950257755268,
601
- "grad_norm": 150.5,
602
  "learning_rate": 4.28341384863124e-06,
603
- "loss": 98.0349,
604
  "step": 425
605
  },
606
  {
607
  "epoch": 0.622230261372886,
608
- "grad_norm": 135.5,
609
  "learning_rate": 4.202898550724638e-06,
610
- "loss": 97.4008,
611
  "step": 430
612
  },
613
  {
614
  "epoch": 0.6294654969702451,
615
- "grad_norm": 139.25,
616
  "learning_rate": 4.122383252818036e-06,
617
- "loss": 96.3096,
618
  "step": 435
619
  },
620
  {
621
  "epoch": 0.6367007325676042,
622
- "grad_norm": 138.0,
623
  "learning_rate": 4.0418679549114335e-06,
624
- "loss": 95.9571,
625
  "step": 440
626
  },
627
  {
628
  "epoch": 0.6439359681649633,
629
- "grad_norm": 148.125,
630
  "learning_rate": 3.961352657004831e-06,
631
- "loss": 95.6152,
632
  "step": 445
633
  },
634
  {
635
  "epoch": 0.6511712037623225,
636
- "grad_norm": 147.25,
637
  "learning_rate": 3.880837359098229e-06,
638
- "loss": 95.9353,
639
  "step": 450
640
  },
641
  {
642
  "epoch": 0.6584064393596817,
643
- "grad_norm": 140.25,
644
  "learning_rate": 3.800322061191627e-06,
645
- "loss": 97.5711,
646
  "step": 455
647
  },
648
  {
649
  "epoch": 0.6656416749570407,
650
- "grad_norm": 161.25,
651
  "learning_rate": 3.7198067632850245e-06,
652
- "loss": 94.4532,
653
  "step": 460
654
  },
655
  {
656
  "epoch": 0.6728769105543999,
657
- "grad_norm": 138.125,
658
  "learning_rate": 3.6392914653784224e-06,
659
- "loss": 95.4708,
660
  "step": 465
661
  },
662
  {
663
  "epoch": 0.6801121461517591,
664
- "grad_norm": 147.25,
665
  "learning_rate": 3.5587761674718204e-06,
666
- "loss": 93.6638,
667
  "step": 470
668
  },
669
  {
670
  "epoch": 0.6873473817491182,
671
- "grad_norm": 139.5,
672
  "learning_rate": 3.4782608695652175e-06,
673
- "loss": 95.4091,
674
  "step": 475
675
  },
676
  {
677
  "epoch": 0.6945826173464773,
678
- "grad_norm": 132.25,
679
  "learning_rate": 3.3977455716586155e-06,
680
- "loss": 93.6919,
681
  "step": 480
682
  },
683
  {
684
  "epoch": 0.7018178529438365,
685
- "grad_norm": 144.875,
686
  "learning_rate": 3.317230273752013e-06,
687
- "loss": 96.2361,
688
  "step": 485
689
  },
690
  {
691
  "epoch": 0.7090530885411956,
692
- "grad_norm": 133.625,
693
  "learning_rate": 3.236714975845411e-06,
694
- "loss": 95.2073,
695
  "step": 490
696
  },
697
  {
698
  "epoch": 0.7162883241385548,
699
- "grad_norm": 136.625,
700
  "learning_rate": 3.156199677938809e-06,
701
- "loss": 93.5076,
702
  "step": 495
703
  },
704
  {
705
  "epoch": 0.7235235597359139,
706
- "grad_norm": 137.5,
707
  "learning_rate": 3.075684380032206e-06,
708
- "loss": 94.2644,
709
  "step": 500
710
  },
711
  {
712
  "epoch": 0.7235235597359139,
713
- "eval_loss": NaN,
714
- "eval_runtime": 64.9086,
715
- "eval_samples_per_second": 573.792,
716
- "eval_steps_per_second": 35.866,
717
  "step": 500
718
- },
719
- {
720
- "epoch": 0.730758795333273,
721
- "grad_norm": 154.0,
722
- "learning_rate": 2.995169082125604e-06,
723
- "loss": 92.6857,
724
- "step": 505
725
- },
726
- {
727
- "epoch": 0.7379940309306322,
728
- "grad_norm": 148.625,
729
- "learning_rate": 2.914653784219002e-06,
730
- "loss": 94.1288,
731
- "step": 510
732
- },
733
- {
734
- "epoch": 0.7452292665279913,
735
- "grad_norm": 141.625,
736
- "learning_rate": 2.8341384863123995e-06,
737
- "loss": 93.0597,
738
- "step": 515
739
- },
740
- {
741
- "epoch": 0.7524645021253504,
742
- "grad_norm": 146.375,
743
- "learning_rate": 2.7536231884057974e-06,
744
- "loss": 94.0807,
745
- "step": 520
746
- },
747
- {
748
- "epoch": 0.7596997377227096,
749
- "grad_norm": 141.875,
750
- "learning_rate": 2.6731078904991954e-06,
751
- "loss": 95.1949,
752
- "step": 525
753
- },
754
- {
755
- "epoch": 0.7669349733200688,
756
- "grad_norm": 142.25,
757
- "learning_rate": 2.5925925925925925e-06,
758
- "loss": 94.323,
759
- "step": 530
760
- },
761
- {
762
- "epoch": 0.7741702089174278,
763
- "grad_norm": 137.125,
764
- "learning_rate": 2.5120772946859904e-06,
765
- "loss": 92.8978,
766
- "step": 535
767
- },
768
- {
769
- "epoch": 0.781405444514787,
770
- "grad_norm": 155.0,
771
- "learning_rate": 2.4315619967793884e-06,
772
- "loss": 92.656,
773
- "step": 540
774
- },
775
- {
776
- "epoch": 0.7886406801121462,
777
- "grad_norm": 136.875,
778
- "learning_rate": 2.351046698872786e-06,
779
- "loss": 94.4373,
780
- "step": 545
781
- },
782
- {
783
- "epoch": 0.7958759157095053,
784
- "grad_norm": 137.375,
785
- "learning_rate": 2.270531400966184e-06,
786
- "loss": 93.7375,
787
- "step": 550
788
- },
789
- {
790
- "epoch": 0.8031111513068644,
791
- "grad_norm": 141.125,
792
- "learning_rate": 2.1900161030595814e-06,
793
- "loss": 93.2835,
794
- "step": 555
795
- },
796
- {
797
- "epoch": 0.8103463869042236,
798
- "grad_norm": 136.125,
799
- "learning_rate": 2.109500805152979e-06,
800
- "loss": 92.7297,
801
- "step": 560
802
- },
803
- {
804
- "epoch": 0.8175816225015827,
805
- "grad_norm": 132.0,
806
- "learning_rate": 2.028985507246377e-06,
807
- "loss": 93.2055,
808
- "step": 565
809
- },
810
- {
811
- "epoch": 0.8248168580989419,
812
- "grad_norm": 142.0,
813
- "learning_rate": 1.948470209339775e-06,
814
- "loss": 92.6126,
815
- "step": 570
816
- },
817
- {
818
- "epoch": 0.832052093696301,
819
- "grad_norm": 128.375,
820
- "learning_rate": 1.8679549114331724e-06,
821
- "loss": 91.9927,
822
- "step": 575
823
- },
824
- {
825
- "epoch": 0.8392873292936601,
826
- "grad_norm": 142.625,
827
- "learning_rate": 1.7874396135265702e-06,
828
- "loss": 92.6083,
829
- "step": 580
830
- },
831
- {
832
- "epoch": 0.8465225648910193,
833
- "grad_norm": 142.5,
834
- "learning_rate": 1.7069243156199681e-06,
835
- "loss": 93.0809,
836
- "step": 585
837
- },
838
- {
839
- "epoch": 0.8537578004883783,
840
- "grad_norm": 131.625,
841
- "learning_rate": 1.6264090177133656e-06,
842
- "loss": 90.4466,
843
- "step": 590
844
- },
845
- {
846
- "epoch": 0.8609930360857375,
847
- "grad_norm": 153.0,
848
- "learning_rate": 1.5458937198067634e-06,
849
- "loss": 90.9484,
850
- "step": 595
851
- },
852
- {
853
- "epoch": 0.8682282716830967,
854
- "grad_norm": 141.75,
855
- "learning_rate": 1.4653784219001613e-06,
856
- "loss": 92.5591,
857
- "step": 600
858
- },
859
- {
860
- "epoch": 0.8754635072804559,
861
- "grad_norm": 149.25,
862
- "learning_rate": 1.3848631239935589e-06,
863
- "loss": 93.0069,
864
- "step": 605
865
- },
866
- {
867
- "epoch": 0.8826987428778149,
868
- "grad_norm": 131.625,
869
- "learning_rate": 1.3043478260869566e-06,
870
- "loss": 92.2291,
871
- "step": 610
872
- },
873
- {
874
- "epoch": 0.8899339784751741,
875
- "grad_norm": 129.875,
876
- "learning_rate": 1.2238325281803544e-06,
877
- "loss": 91.3298,
878
- "step": 615
879
- },
880
- {
881
- "epoch": 0.8971692140725332,
882
- "grad_norm": 139.25,
883
- "learning_rate": 1.1433172302737521e-06,
884
- "loss": 92.7737,
885
- "step": 620
886
- },
887
- {
888
- "epoch": 0.9044044496698924,
889
- "grad_norm": 134.375,
890
- "learning_rate": 1.0628019323671499e-06,
891
- "loss": 93.0011,
892
- "step": 625
893
- },
894
- {
895
- "epoch": 0.9116396852672515,
896
- "grad_norm": 131.625,
897
- "learning_rate": 9.822866344605476e-07,
898
- "loss": 92.2996,
899
- "step": 630
900
- },
901
- {
902
- "epoch": 0.9188749208646106,
903
- "grad_norm": 136.75,
904
- "learning_rate": 9.017713365539453e-07,
905
- "loss": 92.7781,
906
- "step": 635
907
- },
908
- {
909
- "epoch": 0.9261101564619698,
910
- "grad_norm": 139.75,
911
- "learning_rate": 8.212560386473431e-07,
912
- "loss": 91.0231,
913
- "step": 640
914
- },
915
- {
916
- "epoch": 0.933345392059329,
917
- "grad_norm": 136.875,
918
- "learning_rate": 7.407407407407407e-07,
919
- "loss": 91.6645,
920
- "step": 645
921
- },
922
- {
923
- "epoch": 0.940580627656688,
924
- "grad_norm": 144.875,
925
- "learning_rate": 6.602254428341386e-07,
926
- "loss": 90.7734,
927
- "step": 650
928
- },
929
- {
930
- "epoch": 0.9478158632540472,
931
- "grad_norm": 142.75,
932
- "learning_rate": 5.797101449275363e-07,
933
- "loss": 91.3177,
934
- "step": 655
935
- },
936
- {
937
- "epoch": 0.9550510988514064,
938
- "grad_norm": 148.75,
939
- "learning_rate": 4.991948470209341e-07,
940
- "loss": 90.913,
941
- "step": 660
942
- },
943
- {
944
- "epoch": 0.9622863344487654,
945
- "grad_norm": 128.125,
946
- "learning_rate": 4.1867954911433176e-07,
947
- "loss": 90.6037,
948
- "step": 665
949
- },
950
- {
951
- "epoch": 0.9695215700461246,
952
- "grad_norm": 132.25,
953
- "learning_rate": 3.3816425120772945e-07,
954
- "loss": 91.3471,
955
- "step": 670
956
- },
957
- {
958
- "epoch": 0.9767568056434838,
959
- "grad_norm": 126.4375,
960
- "learning_rate": 2.5764895330112725e-07,
961
- "loss": 91.4777,
962
- "step": 675
963
- },
964
- {
965
- "epoch": 0.9839920412408429,
966
- "grad_norm": 139.0,
967
- "learning_rate": 1.7713365539452497e-07,
968
- "loss": 92.0473,
969
- "step": 680
970
- },
971
- {
972
- "epoch": 0.991227276838202,
973
- "grad_norm": 134.875,
974
- "learning_rate": 9.661835748792271e-08,
975
- "loss": 90.7339,
976
- "step": 685
977
- },
978
- {
979
- "epoch": 0.9984625124355612,
980
- "grad_norm": 139.375,
981
- "learning_rate": 1.6103059581320453e-08,
982
- "loss": 92.0409,
983
- "step": 690
984
  }
985
  ],
986
  "logging_steps": 5,
@@ -995,12 +729,12 @@
995
  "should_evaluate": false,
996
  "should_log": false,
997
  "should_save": true,
998
- "should_training_stop": true
999
  },
1000
  "attributes": {}
1001
  }
1002
  },
1003
- "total_flos": 2.993659372963365e+18,
1004
  "train_batch_size": 4,
1005
  "trial_name": null,
1006
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7235235597359139,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.007235235597359139,
13
+ "grad_norm": 240.625,
14
  "learning_rate": 7.142857142857143e-07,
15
+ "loss": 119.8148,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.014470471194718278,
20
+ "grad_norm": 138.625,
21
  "learning_rate": 1.4285714285714286e-06,
22
+ "loss": 104.262,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.021705706792077416,
27
+ "grad_norm": 121.875,
28
  "learning_rate": 2.1428571428571427e-06,
29
+ "loss": 93.6603,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.028940942389436557,
34
+ "grad_norm": 120.625,
35
  "learning_rate": 2.8571428571428573e-06,
36
+ "loss": 90.7159,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.0361761779867957,
41
+ "grad_norm": 106.5625,
42
  "learning_rate": 3.5714285714285718e-06,
43
+ "loss": 86.0796,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.04341141358415483,
48
+ "grad_norm": 103.875,
49
  "learning_rate": 4.2857142857142855e-06,
50
+ "loss": 86.2538,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.05064664918151397,
55
+ "grad_norm": 108.375,
56
  "learning_rate": 5e-06,
57
+ "loss": 84.5117,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.05788188477887311,
62
+ "grad_norm": 101.0,
63
  "learning_rate": 5.7142857142857145e-06,
64
+ "loss": 83.5148,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.06511712037623225,
69
+ "grad_norm": 109.875,
70
  "learning_rate": 6.4285714285714295e-06,
71
+ "loss": 82.3289,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.0723523559735914,
76
+ "grad_norm": 102.5625,
77
  "learning_rate": 7.1428571428571436e-06,
78
+ "loss": 81.4988,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.07958759157095052,
83
+ "grad_norm": 115.375,
84
  "learning_rate": 7.857142857142858e-06,
85
+ "loss": 82.3074,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.08682282716830966,
90
+ "grad_norm": 115.625,
91
  "learning_rate": 8.571428571428571e-06,
92
+ "loss": 81.3627,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.0940580627656688,
97
+ "grad_norm": 103.0625,
98
  "learning_rate": 9.285714285714288e-06,
99
+ "loss": 83.3712,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.10129329836302794,
104
+ "grad_norm": 118.625,
105
  "learning_rate": 1e-05,
106
+ "loss": 81.1766,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.10852853396038709,
111
+ "grad_norm": 120.75,
112
  "learning_rate": 9.919484702093398e-06,
113
+ "loss": 80.6615,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.11576376955774623,
118
+ "grad_norm": 108.5,
119
  "learning_rate": 9.838969404186796e-06,
120
+ "loss": 78.3733,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.12299900515510537,
125
+ "grad_norm": 102.375,
126
  "learning_rate": 9.758454106280194e-06,
127
+ "loss": 80.2047,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.1302342407524645,
132
+ "grad_norm": 106.875,
133
  "learning_rate": 9.677938808373591e-06,
134
+ "loss": 79.1781,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.13746947634982365,
139
+ "grad_norm": 105.6875,
140
  "learning_rate": 9.59742351046699e-06,
141
+ "loss": 79.691,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.1447047119471828,
146
+ "grad_norm": 116.125,
147
  "learning_rate": 9.516908212560388e-06,
148
+ "loss": 80.0193,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.15193994754454193,
153
+ "grad_norm": 115.0,
154
  "learning_rate": 9.436392914653784e-06,
155
+ "loss": 80.1022,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 0.15917518314190104,
160
+ "grad_norm": 117.875,
161
  "learning_rate": 9.355877616747183e-06,
162
+ "loss": 78.4374,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.16641041873926019,
167
+ "grad_norm": 103.3125,
168
  "learning_rate": 9.275362318840581e-06,
169
+ "loss": 78.8168,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 0.17364565433661933,
174
+ "grad_norm": 102.125,
175
  "learning_rate": 9.194847020933978e-06,
176
+ "loss": 77.7842,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.18088088993397847,
181
+ "grad_norm": 105.375,
182
  "learning_rate": 9.114331723027376e-06,
183
+ "loss": 77.5425,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 0.1881161255313376,
188
+ "grad_norm": 105.5,
189
  "learning_rate": 9.033816425120775e-06,
190
+ "loss": 76.4404,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.19535136112869675,
195
+ "grad_norm": 105.125,
196
  "learning_rate": 8.953301127214171e-06,
197
+ "loss": 76.6364,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 0.2025865967260559,
202
+ "grad_norm": 124.375,
203
  "learning_rate": 8.87278582930757e-06,
204
+ "loss": 76.9181,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.20982183232341503,
209
+ "grad_norm": 103.0625,
210
  "learning_rate": 8.792270531400966e-06,
211
+ "loss": 77.4936,
212
  "step": 145
213
  },
214
  {
215
  "epoch": 0.21705706792077417,
216
+ "grad_norm": 104.875,
217
  "learning_rate": 8.711755233494365e-06,
218
+ "loss": 76.7636,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.2242923035181333,
223
+ "grad_norm": 113.8125,
224
  "learning_rate": 8.631239935587761e-06,
225
+ "loss": 76.7539,
226
  "step": 155
227
  },
228
  {
229
  "epoch": 0.23152753911549245,
230
+ "grad_norm": 111.875,
231
  "learning_rate": 8.55072463768116e-06,
232
+ "loss": 75.9734,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.2387627747128516,
237
+ "grad_norm": 98.75,
238
  "learning_rate": 8.470209339774558e-06,
239
+ "loss": 74.6409,
240
  "step": 165
241
  },
242
  {
243
  "epoch": 0.24599801031021074,
244
+ "grad_norm": 107.5625,
245
  "learning_rate": 8.389694041867955e-06,
246
+ "loss": 76.4473,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.25323324590756985,
251
+ "grad_norm": 109.5,
252
  "learning_rate": 8.309178743961353e-06,
253
+ "loss": 75.0732,
254
  "step": 175
255
  },
256
  {
257
  "epoch": 0.260468481504929,
258
+ "grad_norm": 114.5,
259
  "learning_rate": 8.228663446054752e-06,
260
+ "loss": 74.5601,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.26770371710228813,
265
+ "grad_norm": 101.9375,
266
  "learning_rate": 8.148148148148148e-06,
267
+ "loss": 77.3542,
268
  "step": 185
269
  },
270
  {
271
  "epoch": 0.2749389526996473,
272
+ "grad_norm": 107.75,
273
  "learning_rate": 8.067632850241547e-06,
274
+ "loss": 74.8798,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.2821741882970064,
279
+ "grad_norm": 107.75,
280
  "learning_rate": 7.987117552334945e-06,
281
+ "loss": 75.2804,
282
  "step": 195
283
  },
284
  {
285
  "epoch": 0.2894094238943656,
286
+ "grad_norm": 97.3125,
287
  "learning_rate": 7.906602254428342e-06,
288
+ "loss": 74.8395,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.2966446594917247,
293
+ "grad_norm": 96.9375,
294
  "learning_rate": 7.82608695652174e-06,
295
+ "loss": 72.9746,
296
  "step": 205
297
  },
298
  {
299
  "epoch": 0.30387989508908386,
300
+ "grad_norm": 102.25,
301
  "learning_rate": 7.745571658615137e-06,
302
+ "loss": 74.5221,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.311115130686443,
307
+ "grad_norm": 100.9375,
308
  "learning_rate": 7.665056360708535e-06,
309
+ "loss": 72.7894,
310
  "step": 215
311
  },
312
  {
313
  "epoch": 0.3183503662838021,
314
+ "grad_norm": 102.625,
315
  "learning_rate": 7.584541062801934e-06,
316
+ "loss": 73.3365,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.32558560188116126,
321
+ "grad_norm": 102.125,
322
  "learning_rate": 7.504025764895331e-06,
323
+ "loss": 73.3862,
324
  "step": 225
325
  },
326
  {
327
  "epoch": 0.33282083747852037,
328
+ "grad_norm": 100.25,
329
  "learning_rate": 7.423510466988728e-06,
330
+ "loss": 71.9684,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.34005607307587954,
335
+ "grad_norm": 102.5625,
336
  "learning_rate": 7.342995169082127e-06,
337
+ "loss": 72.8994,
338
  "step": 235
339
  },
340
  {
341
  "epoch": 0.34729130867323865,
342
+ "grad_norm": 105.1875,
343
  "learning_rate": 7.262479871175524e-06,
344
+ "loss": 71.1332,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.3545265442705978,
349
+ "grad_norm": 100.125,
350
  "learning_rate": 7.181964573268921e-06,
351
+ "loss": 70.3095,
352
  "step": 245
353
  },
354
  {
355
  "epoch": 0.36176177986795693,
356
+ "grad_norm": 99.375,
357
  "learning_rate": 7.10144927536232e-06,
358
+ "loss": 70.9021,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.3689970154653161,
363
+ "grad_norm": 110.25,
364
  "learning_rate": 7.020933977455717e-06,
365
+ "loss": 72.08,
366
  "step": 255
367
  },
368
  {
369
  "epoch": 0.3762322510626752,
370
+ "grad_norm": 95.0625,
371
  "learning_rate": 6.940418679549115e-06,
372
+ "loss": 70.8334,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.3834674866600344,
377
+ "grad_norm": 101.3125,
378
  "learning_rate": 6.859903381642513e-06,
379
+ "loss": 70.9255,
380
  "step": 265
381
  },
382
  {
383
  "epoch": 0.3907027222573935,
384
+ "grad_norm": 102.8125,
385
  "learning_rate": 6.779388083735911e-06,
386
+ "loss": 72.2681,
387
  "step": 270
388
  },
389
  {
390
  "epoch": 0.39793795785475267,
391
+ "grad_norm": 101.25,
392
  "learning_rate": 6.698872785829308e-06,
393
+ "loss": 70.6592,
394
  "step": 275
395
  },
396
  {
397
  "epoch": 0.4051731934521118,
398
+ "grad_norm": 104.0,
399
  "learning_rate": 6.6183574879227065e-06,
400
+ "loss": 70.5197,
401
  "step": 280
402
  },
403
  {
404
  "epoch": 0.41240842904947095,
405
+ "grad_norm": 103.5,
406
  "learning_rate": 6.537842190016104e-06,
407
+ "loss": 70.3267,
408
  "step": 285
409
  },
410
  {
411
  "epoch": 0.41964366464683006,
412
+ "grad_norm": 95.6875,
413
  "learning_rate": 6.457326892109501e-06,
414
+ "loss": 69.8351,
415
  "step": 290
416
  },
417
  {
418
  "epoch": 0.4268789002441892,
419
+ "grad_norm": 95.9375,
420
  "learning_rate": 6.376811594202898e-06,
421
+ "loss": 69.768,
422
  "step": 295
423
  },
424
  {
425
  "epoch": 0.43411413584154834,
426
+ "grad_norm": 99.9375,
427
  "learning_rate": 6.296296296296297e-06,
428
+ "loss": 68.9199,
429
  "step": 300
430
  },
431
  {
432
  "epoch": 0.44134937143890746,
433
+ "grad_norm": 103.8125,
434
  "learning_rate": 6.215780998389694e-06,
435
+ "loss": 70.4392,
436
  "step": 305
437
  },
438
  {
439
  "epoch": 0.4485846070362666,
440
+ "grad_norm": 94.6875,
441
  "learning_rate": 6.135265700483092e-06,
442
+ "loss": 69.9065,
443
  "step": 310
444
  },
445
  {
446
  "epoch": 0.45581984263362574,
447
+ "grad_norm": 110.9375,
448
  "learning_rate": 6.05475040257649e-06,
449
+ "loss": 68.7817,
450
  "step": 315
451
  },
452
  {
453
  "epoch": 0.4630550782309849,
454
+ "grad_norm": 95.25,
455
  "learning_rate": 5.974235104669888e-06,
456
+ "loss": 69.2291,
457
  "step": 320
458
  },
459
  {
460
  "epoch": 0.470290313828344,
461
+ "grad_norm": 94.5,
462
  "learning_rate": 5.893719806763285e-06,
463
+ "loss": 69.6696,
464
  "step": 325
465
  },
466
  {
467
  "epoch": 0.4775255494257032,
468
+ "grad_norm": 100.9375,
469
  "learning_rate": 5.8132045088566835e-06,
470
+ "loss": 68.0409,
471
  "step": 330
472
  },
473
  {
474
  "epoch": 0.4847607850230623,
475
+ "grad_norm": 95.5,
476
  "learning_rate": 5.732689210950081e-06,
477
+ "loss": 68.8582,
478
  "step": 335
479
  },
480
  {
481
  "epoch": 0.49199602062042147,
482
+ "grad_norm": 94.6875,
483
  "learning_rate": 5.652173913043479e-06,
484
+ "loss": 68.1767,
485
  "step": 340
486
  },
487
  {
488
  "epoch": 0.4992312562177806,
489
+ "grad_norm": 96.0625,
490
  "learning_rate": 5.571658615136877e-06,
491
+ "loss": 68.3804,
492
  "step": 345
493
  },
494
  {
495
  "epoch": 0.5064664918151397,
496
+ "grad_norm": 97.125,
497
  "learning_rate": 5.4911433172302745e-06,
498
+ "loss": 66.2784,
499
  "step": 350
500
  },
501
  {
502
  "epoch": 0.5137017274124989,
503
+ "grad_norm": 93.75,
504
  "learning_rate": 5.410628019323671e-06,
505
+ "loss": 66.8873,
506
  "step": 355
507
  },
508
  {
509
  "epoch": 0.520936963009858,
510
+ "grad_norm": 96.0,
511
  "learning_rate": 5.3301127214170704e-06,
512
+ "loss": 67.6164,
513
  "step": 360
514
  },
515
  {
516
  "epoch": 0.5281721986072172,
517
+ "grad_norm": 106.625,
518
  "learning_rate": 5.249597423510467e-06,
519
+ "loss": 68.5509,
520
  "step": 365
521
  },
522
  {
523
  "epoch": 0.5354074342045763,
524
+ "grad_norm": 102.125,
525
  "learning_rate": 5.169082125603865e-06,
526
+ "loss": 65.9846,
527
  "step": 370
528
  },
529
  {
530
  "epoch": 0.5426426698019354,
531
+ "grad_norm": 99.125,
532
  "learning_rate": 5.088566827697263e-06,
533
+ "loss": 67.4898,
534
  "step": 375
535
  },
536
  {
537
  "epoch": 0.5498779053992946,
538
+ "grad_norm": 98.25,
539
  "learning_rate": 5.0080515297906606e-06,
540
+ "loss": 68.5553,
541
  "step": 380
542
  },
543
  {
544
  "epoch": 0.5571131409966537,
545
+ "grad_norm": 99.5,
546
  "learning_rate": 4.927536231884059e-06,
547
+ "loss": 66.9819,
548
  "step": 385
549
  },
550
  {
551
  "epoch": 0.5643483765940128,
552
+ "grad_norm": 102.375,
553
  "learning_rate": 4.847020933977456e-06,
554
+ "loss": 65.8663,
555
  "step": 390
556
  },
557
  {
558
  "epoch": 0.571583612191372,
559
+ "grad_norm": 97.5,
560
  "learning_rate": 4.766505636070854e-06,
561
+ "loss": 67.4699,
562
  "step": 395
563
  },
564
  {
565
  "epoch": 0.5788188477887312,
566
+ "grad_norm": 104.4375,
567
  "learning_rate": 4.6859903381642516e-06,
568
+ "loss": 66.4532,
569
  "step": 400
570
  },
571
  {
572
  "epoch": 0.5860540833860902,
573
+ "grad_norm": 100.9375,
574
  "learning_rate": 4.605475040257649e-06,
575
+ "loss": 66.7904,
576
  "step": 405
577
  },
578
  {
579
  "epoch": 0.5932893189834494,
580
+ "grad_norm": 96.875,
581
  "learning_rate": 4.5249597423510475e-06,
582
+ "loss": 66.644,
583
  "step": 410
584
  },
585
  {
586
  "epoch": 0.6005245545808086,
587
+ "grad_norm": 95.8125,
588
  "learning_rate": 4.444444444444444e-06,
589
+ "loss": 64.846,
590
  "step": 415
591
  },
592
  {
593
  "epoch": 0.6077597901781677,
594
+ "grad_norm": 96.625,
595
  "learning_rate": 4.3639291465378425e-06,
596
+ "loss": 65.7853,
597
  "step": 420
598
  },
599
  {
600
  "epoch": 0.6149950257755268,
601
+ "grad_norm": 93.0625,
602
  "learning_rate": 4.28341384863124e-06,
603
+ "loss": 64.4889,
604
  "step": 425
605
  },
606
  {
607
  "epoch": 0.622230261372886,
608
+ "grad_norm": 106.5,
609
  "learning_rate": 4.202898550724638e-06,
610
+ "loss": 65.0533,
611
  "step": 430
612
  },
613
  {
614
  "epoch": 0.6294654969702451,
615
+ "grad_norm": 92.5,
616
  "learning_rate": 4.122383252818036e-06,
617
+ "loss": 65.3124,
618
  "step": 435
619
  },
620
  {
621
  "epoch": 0.6367007325676042,
622
+ "grad_norm": 94.875,
623
  "learning_rate": 4.0418679549114335e-06,
624
+ "loss": 65.9648,
625
  "step": 440
626
  },
627
  {
628
  "epoch": 0.6439359681649633,
629
+ "grad_norm": 98.125,
630
  "learning_rate": 3.961352657004831e-06,
631
+ "loss": 64.7386,
632
  "step": 445
633
  },
634
  {
635
  "epoch": 0.6511712037623225,
636
+ "grad_norm": 92.6875,
637
  "learning_rate": 3.880837359098229e-06,
638
+ "loss": 64.4612,
639
  "step": 450
640
  },
641
  {
642
  "epoch": 0.6584064393596817,
643
+ "grad_norm": 103.5,
644
  "learning_rate": 3.800322061191627e-06,
645
+ "loss": 64.202,
646
  "step": 455
647
  },
648
  {
649
  "epoch": 0.6656416749570407,
650
+ "grad_norm": 95.6875,
651
  "learning_rate": 3.7198067632850245e-06,
652
+ "loss": 63.8991,
653
  "step": 460
654
  },
655
  {
656
  "epoch": 0.6728769105543999,
657
+ "grad_norm": 95.625,
658
  "learning_rate": 3.6392914653784224e-06,
659
+ "loss": 63.4502,
660
  "step": 465
661
  },
662
  {
663
  "epoch": 0.6801121461517591,
664
+ "grad_norm": 92.4375,
665
  "learning_rate": 3.5587761674718204e-06,
666
+ "loss": 63.3427,
667
  "step": 470
668
  },
669
  {
670
  "epoch": 0.6873473817491182,
671
+ "grad_norm": 93.9375,
672
  "learning_rate": 3.4782608695652175e-06,
673
+ "loss": 64.2563,
674
  "step": 475
675
  },
676
  {
677
  "epoch": 0.6945826173464773,
678
+ "grad_norm": 97.25,
679
  "learning_rate": 3.3977455716586155e-06,
680
+ "loss": 63.9529,
681
  "step": 480
682
  },
683
  {
684
  "epoch": 0.7018178529438365,
685
+ "grad_norm": 94.375,
686
  "learning_rate": 3.317230273752013e-06,
687
+ "loss": 63.9575,
688
  "step": 485
689
  },
690
  {
691
  "epoch": 0.7090530885411956,
692
+ "grad_norm": 94.75,
693
  "learning_rate": 3.236714975845411e-06,
694
+ "loss": 64.8717,
695
  "step": 490
696
  },
697
  {
698
  "epoch": 0.7162883241385548,
699
+ "grad_norm": 97.0625,
700
  "learning_rate": 3.156199677938809e-06,
701
+ "loss": 63.5308,
702
  "step": 495
703
  },
704
  {
705
  "epoch": 0.7235235597359139,
706
+ "grad_norm": 95.25,
707
  "learning_rate": 3.075684380032206e-06,
708
+ "loss": 63.161,
709
  "step": 500
710
  },
711
  {
712
  "epoch": 0.7235235597359139,
713
+ "eval_loss": 1.7313103675842285,
714
+ "eval_runtime": 63.4514,
715
+ "eval_samples_per_second": 586.969,
716
+ "eval_steps_per_second": 36.69,
717
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 5,
 
729
  "should_evaluate": false,
730
  "should_log": false,
731
  "should_save": true,
732
+ "should_training_stop": false
733
  },
734
  "attributes": {}
735
  }
736
  },
737
+ "total_flos": 2.1661789963943936e+18,
738
  "train_batch_size": 4,
739
  "trial_name": null,
740
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c88d63b229521e393dd15e94a0dce5631f524c246b8dceda3a877ef2c9acdd7e
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ac479c0792c767bddcc93c08dda3764b8a2580cb4ea96fcc5be3a954af60ee
3
  size 5752