craa commited on
Commit
e54694a
·
verified ·
1 Parent(s): 2e46531

Training in progress, step 10000, checkpoint

Browse files
checkpoint-10000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1a5761ff85a9b925091004c9b5d326eab3b14b1e62bc9bd9f76ae25f65e33b3
3
  size 503128704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7139e359e39666d1d2bbc215bf672b98b6327c7dc8bd20c5fa16d1998711c2e
3
  size 503128704
checkpoint-10000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba453e42bb87ce014372643d0f5c4b1f1eee5148b2513b9fb8bd43125f4dbe56
3
  size 1006351290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d5288bb29698901a628a80c576994b82887c8b5b067878c54fd4a2404b71605
3
  size 1006351290
checkpoint-10000/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 3.7569897174835205,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_0_6910/checkpoint-10000",
4
  "epoch": 1.0781671159029649,
5
  "eval_steps": 1000,
@@ -10,1491 +10,1491 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.005390835579514825,
13
- "grad_norm": 2.8118252754211426,
14
  "learning_rate": 0.000276,
15
- "loss": 9.0225,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.01078167115902965,
20
- "grad_norm": 1.6856672763824463,
21
  "learning_rate": 0.0005759999999999999,
22
- "loss": 6.9773,
23
  "step": 100
24
  },
25
  {
26
  "epoch": 0.016172506738544475,
27
- "grad_norm": 0.9568011164665222,
28
  "learning_rate": 0.000599702104695089,
29
- "loss": 6.5177,
30
  "step": 150
31
  },
32
  {
33
  "epoch": 0.0215633423180593,
34
- "grad_norm": 1.2443560361862183,
35
  "learning_rate": 0.0005993783054506205,
36
- "loss": 6.2578,
37
  "step": 200
38
  },
39
  {
40
  "epoch": 0.026954177897574125,
41
- "grad_norm": 1.41825270652771,
42
  "learning_rate": 0.0005990545062061521,
43
- "loss": 6.0969,
44
  "step": 250
45
  },
46
  {
47
  "epoch": 0.03234501347708895,
48
- "grad_norm": 1.9072396755218506,
49
  "learning_rate": 0.0005987307069616836,
50
- "loss": 5.9635,
51
  "step": 300
52
  },
53
  {
54
  "epoch": 0.03773584905660377,
55
- "grad_norm": 1.5556931495666504,
56
  "learning_rate": 0.0005984069077172153,
57
- "loss": 5.8839,
58
  "step": 350
59
  },
60
  {
61
  "epoch": 0.0431266846361186,
62
- "grad_norm": 1.5125844478607178,
63
  "learning_rate": 0.0005980831084727469,
64
- "loss": 5.8238,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 0.04851752021563342,
69
- "grad_norm": 1.5804698467254639,
70
  "learning_rate": 0.0005977593092282784,
71
- "loss": 5.7627,
72
  "step": 450
73
  },
74
  {
75
  "epoch": 0.05390835579514825,
76
- "grad_norm": 1.1904308795928955,
77
  "learning_rate": 0.00059743550998381,
78
- "loss": 5.6518,
79
  "step": 500
80
  },
81
  {
82
  "epoch": 0.05929919137466307,
83
- "grad_norm": 1.009339690208435,
84
  "learning_rate": 0.0005971117107393416,
85
- "loss": 5.6142,
86
  "step": 550
87
  },
88
  {
89
  "epoch": 0.0646900269541779,
90
- "grad_norm": 1.3357371091842651,
91
  "learning_rate": 0.0005967879114948732,
92
- "loss": 5.5406,
93
  "step": 600
94
  },
95
  {
96
  "epoch": 0.07008086253369272,
97
- "grad_norm": 1.434008240699768,
98
  "learning_rate": 0.0005964641122504047,
99
- "loss": 5.4783,
100
  "step": 650
101
  },
102
  {
103
  "epoch": 0.07547169811320754,
104
- "grad_norm": 0.8490063548088074,
105
  "learning_rate": 0.0005961403130059363,
106
- "loss": 5.3996,
107
  "step": 700
108
  },
109
  {
110
  "epoch": 0.08086253369272237,
111
- "grad_norm": 1.1594209671020508,
112
  "learning_rate": 0.0005958165137614678,
113
- "loss": 5.331,
114
  "step": 750
115
  },
116
  {
117
  "epoch": 0.0862533692722372,
118
- "grad_norm": 0.9333789348602295,
119
  "learning_rate": 0.0005954927145169995,
120
- "loss": 5.2596,
121
  "step": 800
122
  },
123
  {
124
  "epoch": 0.09164420485175202,
125
- "grad_norm": 1.0127500295639038,
126
  "learning_rate": 0.0005951689152725309,
127
- "loss": 5.2418,
128
  "step": 850
129
  },
130
  {
131
  "epoch": 0.09703504043126684,
132
- "grad_norm": 1.113052487373352,
133
  "learning_rate": 0.0005948451160280626,
134
- "loss": 5.1815,
135
  "step": 900
136
  },
137
  {
138
  "epoch": 0.10242587601078167,
139
- "grad_norm": 0.9113388061523438,
140
  "learning_rate": 0.0005945213167835941,
141
- "loss": 5.1162,
142
  "step": 950
143
  },
144
  {
145
  "epoch": 0.1078167115902965,
146
- "grad_norm": 0.9989785552024841,
147
  "learning_rate": 0.0005941975175391257,
148
- "loss": 5.1216,
149
  "step": 1000
150
  },
151
  {
152
  "epoch": 0.1078167115902965,
153
- "eval_accuracy": 0.22690449769715637,
154
- "eval_loss": 5.032634735107422,
155
- "eval_runtime": 184.9778,
156
- "eval_samples_per_second": 97.368,
157
- "eval_steps_per_second": 6.087,
158
  "step": 1000
159
  },
160
  {
161
  "epoch": 0.11320754716981132,
162
- "grad_norm": 1.0346425771713257,
163
  "learning_rate": 0.0005938737182946572,
164
- "loss": 5.0533,
165
  "step": 1050
166
  },
167
  {
168
  "epoch": 0.11859838274932614,
169
- "grad_norm": 1.3372762203216553,
170
  "learning_rate": 0.0005935499190501888,
171
- "loss": 5.014,
172
  "step": 1100
173
  },
174
  {
175
  "epoch": 0.12398921832884097,
176
- "grad_norm": 1.185377836227417,
177
  "learning_rate": 0.0005932261198057204,
178
- "loss": 5.0192,
179
  "step": 1150
180
  },
181
  {
182
  "epoch": 0.1293800539083558,
183
- "grad_norm": 1.010504961013794,
184
  "learning_rate": 0.000592902320561252,
185
- "loss": 4.9761,
186
  "step": 1200
187
  },
188
  {
189
  "epoch": 0.1347708894878706,
190
- "grad_norm": 0.8940131068229675,
191
  "learning_rate": 0.0005925785213167835,
192
- "loss": 4.9547,
193
  "step": 1250
194
  },
195
  {
196
  "epoch": 0.14016172506738545,
197
- "grad_norm": 1.0484291315078735,
198
  "learning_rate": 0.0005922547220723151,
199
- "loss": 4.9102,
200
  "step": 1300
201
  },
202
  {
203
  "epoch": 0.14555256064690028,
204
- "grad_norm": 0.882337212562561,
205
  "learning_rate": 0.0005919309228278468,
206
- "loss": 4.8568,
207
  "step": 1350
208
  },
209
  {
210
  "epoch": 0.1509433962264151,
211
- "grad_norm": 1.2623690366744995,
212
  "learning_rate": 0.0005916071235833783,
213
- "loss": 4.8624,
214
  "step": 1400
215
  },
216
  {
217
  "epoch": 0.15633423180592992,
218
- "grad_norm": 0.7710188627243042,
219
  "learning_rate": 0.0005912833243389097,
220
- "loss": 4.8192,
221
  "step": 1450
222
  },
223
  {
224
  "epoch": 0.16172506738544473,
225
- "grad_norm": 0.933644711971283,
226
  "learning_rate": 0.0005909595250944414,
227
- "loss": 4.8205,
228
  "step": 1500
229
  },
230
  {
231
  "epoch": 0.16711590296495957,
232
- "grad_norm": 1.3466095924377441,
233
  "learning_rate": 0.000590635725849973,
234
- "loss": 4.8013,
235
  "step": 1550
236
  },
237
  {
238
  "epoch": 0.1725067385444744,
239
- "grad_norm": 1.1399619579315186,
240
  "learning_rate": 0.0005903119266055045,
241
- "loss": 4.7772,
242
  "step": 1600
243
  },
244
  {
245
  "epoch": 0.1778975741239892,
246
- "grad_norm": 0.8201636075973511,
247
  "learning_rate": 0.0005899881273610361,
248
- "loss": 4.7511,
249
  "step": 1650
250
  },
251
  {
252
  "epoch": 0.18328840970350405,
253
- "grad_norm": 0.902688205242157,
254
  "learning_rate": 0.0005896643281165677,
255
- "loss": 4.7286,
256
  "step": 1700
257
  },
258
  {
259
  "epoch": 0.18867924528301888,
260
- "grad_norm": 0.8644108176231384,
261
  "learning_rate": 0.0005893405288720993,
262
- "loss": 4.6775,
263
  "step": 1750
264
  },
265
  {
266
  "epoch": 0.1940700808625337,
267
- "grad_norm": 0.7629256844520569,
268
  "learning_rate": 0.0005890167296276308,
269
- "loss": 4.704,
270
  "step": 1800
271
  },
272
  {
273
  "epoch": 0.19946091644204852,
274
- "grad_norm": 1.2781596183776855,
275
  "learning_rate": 0.0005886929303831624,
276
- "loss": 4.6549,
277
  "step": 1850
278
  },
279
  {
280
  "epoch": 0.20485175202156333,
281
- "grad_norm": 0.7812705636024475,
282
  "learning_rate": 0.0005883691311386939,
283
- "loss": 4.632,
284
  "step": 1900
285
  },
286
  {
287
  "epoch": 0.21024258760107817,
288
- "grad_norm": 0.7588018178939819,
289
  "learning_rate": 0.0005880453318942256,
290
- "loss": 4.608,
291
  "step": 1950
292
  },
293
  {
294
  "epoch": 0.215633423180593,
295
- "grad_norm": 0.7466468811035156,
296
  "learning_rate": 0.0005877215326497571,
297
- "loss": 4.5906,
298
  "step": 2000
299
  },
300
  {
301
  "epoch": 0.215633423180593,
302
- "eval_accuracy": 0.26987845327398857,
303
- "eval_loss": 4.5157904624938965,
304
- "eval_runtime": 183.7342,
305
- "eval_samples_per_second": 98.027,
306
- "eval_steps_per_second": 6.128,
307
  "step": 2000
308
  },
309
  {
310
  "epoch": 0.2210242587601078,
311
- "grad_norm": 0.826682448387146,
312
  "learning_rate": 0.0005873977334052887,
313
  "loss": 4.576,
314
  "step": 2050
315
  },
316
  {
317
  "epoch": 0.22641509433962265,
318
- "grad_norm": 0.9647506475448608,
319
  "learning_rate": 0.0005870739341608202,
320
- "loss": 4.5599,
321
  "step": 2100
322
  },
323
  {
324
  "epoch": 0.23180592991913745,
325
- "grad_norm": 1.005610466003418,
326
  "learning_rate": 0.0005867501349163519,
327
- "loss": 4.5335,
328
  "step": 2150
329
  },
330
  {
331
  "epoch": 0.2371967654986523,
332
- "grad_norm": 1.0172079801559448,
333
  "learning_rate": 0.0005864263356718833,
334
- "loss": 4.5304,
335
  "step": 2200
336
  },
337
  {
338
  "epoch": 0.24258760107816713,
339
- "grad_norm": 0.8978919982910156,
340
  "learning_rate": 0.000586102536427415,
341
- "loss": 4.5045,
342
  "step": 2250
343
  },
344
  {
345
  "epoch": 0.24797843665768193,
346
- "grad_norm": 0.9270089268684387,
347
  "learning_rate": 0.0005857787371829465,
348
- "loss": 4.5004,
349
  "step": 2300
350
  },
351
  {
352
  "epoch": 0.25336927223719674,
353
- "grad_norm": 1.124731183052063,
354
  "learning_rate": 0.0005854549379384781,
355
- "loss": 4.4774,
356
  "step": 2350
357
  },
358
  {
359
  "epoch": 0.2587601078167116,
360
- "grad_norm": 0.7949519157409668,
361
  "learning_rate": 0.0005851311386940096,
362
- "loss": 4.4615,
363
  "step": 2400
364
  },
365
  {
366
  "epoch": 0.2641509433962264,
367
- "grad_norm": 0.7883573770523071,
368
  "learning_rate": 0.0005848073394495412,
369
- "loss": 4.4503,
370
  "step": 2450
371
  },
372
  {
373
  "epoch": 0.2695417789757412,
374
- "grad_norm": 0.9283429384231567,
375
  "learning_rate": 0.0005844835402050728,
376
- "loss": 4.4294,
377
  "step": 2500
378
  },
379
  {
380
  "epoch": 0.2749326145552561,
381
- "grad_norm": 0.703683078289032,
382
  "learning_rate": 0.0005841597409606044,
383
- "loss": 4.3969,
384
  "step": 2550
385
  },
386
  {
387
  "epoch": 0.2803234501347709,
388
- "grad_norm": 0.812880277633667,
389
  "learning_rate": 0.000583835941716136,
390
- "loss": 4.4119,
391
  "step": 2600
392
  },
393
  {
394
  "epoch": 0.2857142857142857,
395
- "grad_norm": 0.9256618618965149,
396
  "learning_rate": 0.0005835121424716675,
397
- "loss": 4.3917,
398
  "step": 2650
399
  },
400
  {
401
  "epoch": 0.29110512129380056,
402
- "grad_norm": 0.7537206411361694,
403
  "learning_rate": 0.0005831883432271992,
404
- "loss": 4.3791,
405
  "step": 2700
406
  },
407
  {
408
  "epoch": 0.29649595687331537,
409
- "grad_norm": 0.9869926571846008,
410
  "learning_rate": 0.0005828645439827307,
411
- "loss": 4.3759,
412
  "step": 2750
413
  },
414
  {
415
  "epoch": 0.3018867924528302,
416
- "grad_norm": 0.8514196276664734,
417
  "learning_rate": 0.0005825407447382622,
418
- "loss": 4.3314,
419
  "step": 2800
420
  },
421
  {
422
  "epoch": 0.30727762803234504,
423
- "grad_norm": 0.7071971893310547,
424
  "learning_rate": 0.0005822169454937938,
425
- "loss": 4.3577,
426
  "step": 2850
427
  },
428
  {
429
  "epoch": 0.31266846361185985,
430
- "grad_norm": 0.8087393045425415,
431
  "learning_rate": 0.0005818931462493254,
432
- "loss": 4.326,
433
  "step": 2900
434
  },
435
  {
436
  "epoch": 0.31805929919137466,
437
- "grad_norm": 0.8092687129974365,
438
  "learning_rate": 0.0005815693470048569,
439
- "loss": 4.3304,
440
  "step": 2950
441
  },
442
  {
443
  "epoch": 0.32345013477088946,
444
- "grad_norm": 0.7119380235671997,
445
  "learning_rate": 0.0005812455477603885,
446
- "loss": 4.3198,
447
  "step": 3000
448
  },
449
  {
450
  "epoch": 0.32345013477088946,
451
- "eval_accuracy": 0.29812342337868974,
452
- "eval_loss": 4.240254878997803,
453
- "eval_runtime": 183.7893,
454
- "eval_samples_per_second": 97.998,
455
- "eval_steps_per_second": 6.127,
456
  "step": 3000
457
  },
458
  {
459
  "epoch": 0.3288409703504043,
460
- "grad_norm": 0.7455107569694519,
461
  "learning_rate": 0.0005809217485159201,
462
- "loss": 4.3188,
463
  "step": 3050
464
  },
465
  {
466
  "epoch": 0.33423180592991913,
467
- "grad_norm": 0.8890029191970825,
468
  "learning_rate": 0.0005805979492714517,
469
- "loss": 4.2983,
470
  "step": 3100
471
  },
472
  {
473
  "epoch": 0.33962264150943394,
474
- "grad_norm": 0.8936235308647156,
475
  "learning_rate": 0.0005802741500269832,
476
- "loss": 4.2817,
477
  "step": 3150
478
  },
479
  {
480
  "epoch": 0.3450134770889488,
481
- "grad_norm": 0.6431916356086731,
482
  "learning_rate": 0.0005799503507825148,
483
  "loss": 4.2802,
484
  "step": 3200
485
  },
486
  {
487
  "epoch": 0.3504043126684636,
488
- "grad_norm": 0.7159081697463989,
489
  "learning_rate": 0.0005796265515380463,
490
- "loss": 4.2707,
491
  "step": 3250
492
  },
493
  {
494
  "epoch": 0.3557951482479784,
495
- "grad_norm": 0.9267504215240479,
496
  "learning_rate": 0.000579302752293578,
497
- "loss": 4.277,
498
  "step": 3300
499
  },
500
  {
501
  "epoch": 0.3611859838274933,
502
- "grad_norm": 0.8296390771865845,
503
  "learning_rate": 0.0005789789530491095,
504
- "loss": 4.2509,
505
  "step": 3350
506
  },
507
  {
508
  "epoch": 0.3665768194070081,
509
- "grad_norm": 0.7630621790885925,
510
  "learning_rate": 0.0005786551538046411,
511
- "loss": 4.2406,
512
  "step": 3400
513
  },
514
  {
515
  "epoch": 0.3719676549865229,
516
- "grad_norm": 0.6547260284423828,
517
  "learning_rate": 0.0005783313545601726,
518
- "loss": 4.2344,
519
  "step": 3450
520
  },
521
  {
522
  "epoch": 0.37735849056603776,
523
- "grad_norm": 0.8587298393249512,
524
  "learning_rate": 0.0005780075553157043,
525
- "loss": 4.2341,
526
  "step": 3500
527
  },
528
  {
529
  "epoch": 0.38274932614555257,
530
- "grad_norm": 0.7555488348007202,
531
  "learning_rate": 0.0005776837560712357,
532
- "loss": 4.2299,
533
  "step": 3550
534
  },
535
  {
536
  "epoch": 0.3881401617250674,
537
- "grad_norm": 0.8421213626861572,
538
  "learning_rate": 0.0005773599568267673,
539
- "loss": 4.2202,
540
  "step": 3600
541
  },
542
  {
543
  "epoch": 0.3935309973045822,
544
- "grad_norm": 0.7566924095153809,
545
  "learning_rate": 0.0005770361575822989,
546
- "loss": 4.2103,
547
  "step": 3650
548
  },
549
  {
550
  "epoch": 0.39892183288409705,
551
- "grad_norm": 0.7638437747955322,
552
  "learning_rate": 0.0005767123583378305,
553
- "loss": 4.1973,
554
  "step": 3700
555
  },
556
  {
557
  "epoch": 0.40431266846361186,
558
- "grad_norm": 0.6439513564109802,
559
  "learning_rate": 0.000576388559093362,
560
- "loss": 4.199,
561
  "step": 3750
562
  },
563
  {
564
  "epoch": 0.40970350404312667,
565
- "grad_norm": 0.7719266414642334,
566
  "learning_rate": 0.0005760647598488936,
567
- "loss": 4.1635,
568
  "step": 3800
569
  },
570
  {
571
  "epoch": 0.41509433962264153,
572
- "grad_norm": 0.6647982597351074,
573
  "learning_rate": 0.0005757409606044253,
574
- "loss": 4.1739,
575
  "step": 3850
576
  },
577
  {
578
  "epoch": 0.42048517520215634,
579
- "grad_norm": 0.7858614325523376,
580
  "learning_rate": 0.0005754171613599568,
581
- "loss": 4.1932,
582
  "step": 3900
583
  },
584
  {
585
  "epoch": 0.42587601078167114,
586
- "grad_norm": 1.070395588874817,
587
  "learning_rate": 0.0005750933621154884,
588
- "loss": 4.1588,
589
  "step": 3950
590
  },
591
  {
592
  "epoch": 0.431266846361186,
593
- "grad_norm": 0.6882054805755615,
594
  "learning_rate": 0.0005747695628710199,
595
- "loss": 4.1634,
596
  "step": 4000
597
  },
598
  {
599
  "epoch": 0.431266846361186,
600
- "eval_accuracy": 0.3118610599024015,
601
- "eval_loss": 4.099079608917236,
602
- "eval_runtime": 183.6446,
603
- "eval_samples_per_second": 98.075,
604
- "eval_steps_per_second": 6.131,
605
  "step": 4000
606
  },
607
  {
608
  "epoch": 0.4366576819407008,
609
- "grad_norm": 0.8163891434669495,
610
  "learning_rate": 0.0005744457636265515,
611
- "loss": 4.1703,
612
  "step": 4050
613
  },
614
  {
615
  "epoch": 0.4420485175202156,
616
- "grad_norm": 0.7172017097473145,
617
  "learning_rate": 0.0005741219643820831,
618
- "loss": 4.1633,
619
  "step": 4100
620
  },
621
  {
622
  "epoch": 0.4474393530997305,
623
- "grad_norm": 0.7089101672172546,
624
  "learning_rate": 0.0005737981651376146,
625
- "loss": 4.1486,
626
  "step": 4150
627
  },
628
  {
629
  "epoch": 0.4528301886792453,
630
- "grad_norm": 0.6500125527381897,
631
  "learning_rate": 0.0005734743658931462,
632
- "loss": 4.1541,
633
  "step": 4200
634
  },
635
  {
636
  "epoch": 0.4582210242587601,
637
- "grad_norm": 0.6067988276481628,
638
  "learning_rate": 0.0005731505666486778,
639
- "loss": 4.1386,
640
  "step": 4250
641
  },
642
  {
643
  "epoch": 0.4636118598382749,
644
- "grad_norm": 0.8405300974845886,
645
  "learning_rate": 0.0005728267674042093,
646
- "loss": 4.1407,
647
  "step": 4300
648
  },
649
  {
650
  "epoch": 0.46900269541778977,
651
- "grad_norm": 0.65191650390625,
652
  "learning_rate": 0.0005725029681597409,
653
- "loss": 4.1283,
654
  "step": 4350
655
  },
656
  {
657
  "epoch": 0.4743935309973046,
658
- "grad_norm": 0.674238920211792,
659
  "learning_rate": 0.0005721791689152725,
660
- "loss": 4.1114,
661
  "step": 4400
662
  },
663
  {
664
  "epoch": 0.4797843665768194,
665
- "grad_norm": 0.660973072052002,
666
  "learning_rate": 0.0005718553696708041,
667
- "loss": 4.1208,
668
  "step": 4450
669
  },
670
  {
671
  "epoch": 0.48517520215633425,
672
- "grad_norm": 0.6465425491333008,
673
  "learning_rate": 0.0005715315704263356,
674
- "loss": 4.1158,
675
  "step": 4500
676
  },
677
  {
678
  "epoch": 0.49056603773584906,
679
- "grad_norm": 0.7483091950416565,
680
  "learning_rate": 0.0005712077711818672,
681
- "loss": 4.1276,
682
  "step": 4550
683
  },
684
  {
685
  "epoch": 0.49595687331536387,
686
- "grad_norm": 0.845150351524353,
687
  "learning_rate": 0.0005708839719373987,
688
- "loss": 4.1192,
689
  "step": 4600
690
  },
691
  {
692
  "epoch": 0.5013477088948787,
693
- "grad_norm": 0.634871244430542,
694
  "learning_rate": 0.0005705601726929304,
695
- "loss": 4.0755,
696
  "step": 4650
697
  },
698
  {
699
  "epoch": 0.5067385444743935,
700
- "grad_norm": 0.6169816851615906,
701
  "learning_rate": 0.0005702363734484619,
702
- "loss": 4.078,
703
  "step": 4700
704
  },
705
  {
706
  "epoch": 0.5121293800539084,
707
- "grad_norm": 0.8197508454322815,
708
  "learning_rate": 0.0005699125742039935,
709
- "loss": 4.084,
710
  "step": 4750
711
  },
712
  {
713
  "epoch": 0.5175202156334232,
714
- "grad_norm": 0.733070969581604,
715
  "learning_rate": 0.000569588774959525,
716
- "loss": 4.0813,
717
  "step": 4800
718
  },
719
  {
720
  "epoch": 0.522911051212938,
721
- "grad_norm": 0.6208024024963379,
722
  "learning_rate": 0.0005692649757150567,
723
- "loss": 4.0748,
724
  "step": 4850
725
  },
726
  {
727
  "epoch": 0.5283018867924528,
728
- "grad_norm": 0.7824249863624573,
729
  "learning_rate": 0.0005689411764705881,
730
- "loss": 4.0771,
731
  "step": 4900
732
  },
733
  {
734
  "epoch": 0.5336927223719676,
735
- "grad_norm": 0.6890459656715393,
736
  "learning_rate": 0.0005686173772261197,
737
- "loss": 4.0671,
738
  "step": 4950
739
  },
740
  {
741
  "epoch": 0.5390835579514824,
742
- "grad_norm": 0.7617940902709961,
743
  "learning_rate": 0.0005682935779816514,
744
  "loss": 4.0659,
745
  "step": 5000
746
  },
747
  {
748
  "epoch": 0.5390835579514824,
749
- "eval_accuracy": 0.32073376337421977,
750
- "eval_loss": 3.998711347579956,
751
- "eval_runtime": 183.5304,
752
- "eval_samples_per_second": 98.136,
753
- "eval_steps_per_second": 6.135,
754
  "step": 5000
755
  },
756
  {
757
  "epoch": 0.5444743935309974,
758
- "grad_norm": 0.6309065222740173,
759
  "learning_rate": 0.0005679697787371829,
760
- "loss": 4.0746,
761
  "step": 5050
762
  },
763
  {
764
  "epoch": 0.5498652291105122,
765
- "grad_norm": 0.5925028920173645,
766
  "learning_rate": 0.0005676459794927145,
767
- "loss": 4.0574,
768
  "step": 5100
769
  },
770
  {
771
  "epoch": 0.555256064690027,
772
- "grad_norm": 0.6035439968109131,
773
  "learning_rate": 0.000567322180248246,
774
- "loss": 4.0516,
775
  "step": 5150
776
  },
777
  {
778
  "epoch": 0.5606469002695418,
779
- "grad_norm": 0.7275799512863159,
780
  "learning_rate": 0.0005669983810037777,
781
- "loss": 4.0651,
782
  "step": 5200
783
  },
784
  {
785
  "epoch": 0.5660377358490566,
786
- "grad_norm": 0.6090968251228333,
787
  "learning_rate": 0.0005666745817593092,
788
- "loss": 4.0379,
789
  "step": 5250
790
  },
791
  {
792
  "epoch": 0.5714285714285714,
793
- "grad_norm": 0.632185161113739,
794
  "learning_rate": 0.0005663507825148408,
795
- "loss": 4.0381,
796
  "step": 5300
797
  },
798
  {
799
  "epoch": 0.5768194070080862,
800
- "grad_norm": 0.6599447131156921,
801
  "learning_rate": 0.0005660269832703723,
802
- "loss": 4.0278,
803
  "step": 5350
804
  },
805
  {
806
  "epoch": 0.5822102425876011,
807
- "grad_norm": 0.648209810256958,
808
  "learning_rate": 0.0005657031840259039,
809
- "loss": 4.0327,
810
  "step": 5400
811
  },
812
  {
813
  "epoch": 0.5876010781671159,
814
- "grad_norm": 0.6686100363731384,
815
  "learning_rate": 0.0005653793847814355,
816
- "loss": 4.0357,
817
  "step": 5450
818
  },
819
  {
820
  "epoch": 0.5929919137466307,
821
- "grad_norm": 0.7332231998443604,
822
  "learning_rate": 0.000565055585536967,
823
- "loss": 4.0131,
824
  "step": 5500
825
  },
826
  {
827
  "epoch": 0.5983827493261455,
828
- "grad_norm": 0.6814959645271301,
829
  "learning_rate": 0.0005647317862924986,
830
- "loss": 4.033,
831
  "step": 5550
832
  },
833
  {
834
  "epoch": 0.6037735849056604,
835
- "grad_norm": 0.6917067766189575,
836
  "learning_rate": 0.0005644079870480302,
837
- "loss": 3.9815,
838
  "step": 5600
839
  },
840
  {
841
  "epoch": 0.6091644204851752,
842
- "grad_norm": 0.6626110672950745,
843
  "learning_rate": 0.0005640841878035617,
844
- "loss": 4.0186,
845
  "step": 5650
846
  },
847
  {
848
  "epoch": 0.6145552560646901,
849
- "grad_norm": 0.7377511262893677,
850
  "learning_rate": 0.0005637603885590933,
851
- "loss": 4.0184,
852
  "step": 5700
853
  },
854
  {
855
  "epoch": 0.6199460916442049,
856
- "grad_norm": 0.6328345537185669,
857
  "learning_rate": 0.0005634365893146248,
858
- "loss": 4.019,
859
  "step": 5750
860
  },
861
  {
862
  "epoch": 0.6253369272237197,
863
- "grad_norm": 0.6522849798202515,
864
  "learning_rate": 0.0005631127900701565,
865
- "loss": 4.01,
866
  "step": 5800
867
  },
868
  {
869
  "epoch": 0.6307277628032345,
870
- "grad_norm": 0.6383638978004456,
871
  "learning_rate": 0.000562788990825688,
872
- "loss": 3.9816,
873
  "step": 5850
874
  },
875
  {
876
  "epoch": 0.6361185983827493,
877
- "grad_norm": 0.593140721321106,
878
  "learning_rate": 0.0005624651915812196,
879
- "loss": 3.9949,
880
  "step": 5900
881
  },
882
  {
883
  "epoch": 0.6415094339622641,
884
- "grad_norm": 0.7360444068908691,
885
  "learning_rate": 0.0005621413923367511,
886
- "loss": 4.0016,
887
  "step": 5950
888
  },
889
  {
890
  "epoch": 0.6469002695417789,
891
- "grad_norm": 0.6608056426048279,
892
  "learning_rate": 0.0005618175930922828,
893
- "loss": 4.0076,
894
  "step": 6000
895
  },
896
  {
897
  "epoch": 0.6469002695417789,
898
- "eval_accuracy": 0.32758476256247404,
899
- "eval_loss": 3.921957492828369,
900
- "eval_runtime": 183.5766,
901
- "eval_samples_per_second": 98.112,
902
- "eval_steps_per_second": 6.134,
903
  "step": 6000
904
  },
905
  {
906
  "epoch": 0.6522911051212938,
907
- "grad_norm": 0.6179393529891968,
908
  "learning_rate": 0.0005614937938478143,
909
- "loss": 3.9939,
910
  "step": 6050
911
  },
912
  {
913
  "epoch": 0.6576819407008087,
914
- "grad_norm": 0.7146060466766357,
915
  "learning_rate": 0.0005611699946033459,
916
- "loss": 3.9929,
917
  "step": 6100
918
  },
919
  {
920
  "epoch": 0.6630727762803235,
921
- "grad_norm": 0.601253867149353,
922
  "learning_rate": 0.0005608461953588774,
923
- "loss": 3.9838,
924
  "step": 6150
925
  },
926
  {
927
  "epoch": 0.6684636118598383,
928
- "grad_norm": 0.6216392517089844,
929
  "learning_rate": 0.000560522396114409,
930
- "loss": 3.9788,
931
  "step": 6200
932
  },
933
  {
934
  "epoch": 0.6738544474393531,
935
- "grad_norm": 0.6294983625411987,
936
  "learning_rate": 0.0005601985968699405,
937
- "loss": 3.9608,
938
  "step": 6250
939
  },
940
  {
941
  "epoch": 0.6792452830188679,
942
- "grad_norm": 0.7225786447525024,
943
  "learning_rate": 0.0005598747976254721,
944
- "loss": 3.9794,
945
  "step": 6300
946
  },
947
  {
948
  "epoch": 0.6846361185983828,
949
- "grad_norm": 0.6607632637023926,
950
  "learning_rate": 0.0005595509983810038,
951
- "loss": 3.9496,
952
  "step": 6350
953
  },
954
  {
955
  "epoch": 0.6900269541778976,
956
- "grad_norm": 0.5790310502052307,
957
  "learning_rate": 0.0005592271991365353,
958
- "loss": 3.9592,
959
  "step": 6400
960
  },
961
  {
962
  "epoch": 0.6954177897574124,
963
- "grad_norm": 0.6292189955711365,
964
  "learning_rate": 0.0005589033998920669,
965
- "loss": 3.9773,
966
  "step": 6450
967
  },
968
  {
969
  "epoch": 0.7008086253369272,
970
- "grad_norm": 0.6256137490272522,
971
  "learning_rate": 0.0005585796006475984,
972
- "loss": 3.9487,
973
  "step": 6500
974
  },
975
  {
976
  "epoch": 0.706199460916442,
977
- "grad_norm": 0.6231578588485718,
978
  "learning_rate": 0.0005582558014031301,
979
- "loss": 3.9727,
980
  "step": 6550
981
  },
982
  {
983
  "epoch": 0.7115902964959568,
984
- "grad_norm": 0.6470305323600769,
985
  "learning_rate": 0.0005579320021586616,
986
- "loss": 3.9563,
987
  "step": 6600
988
  },
989
  {
990
  "epoch": 0.7169811320754716,
991
- "grad_norm": 0.5552076697349548,
992
  "learning_rate": 0.0005576082029141932,
993
- "loss": 3.951,
994
  "step": 6650
995
  },
996
  {
997
  "epoch": 0.7223719676549866,
998
- "grad_norm": 0.5381990671157837,
999
  "learning_rate": 0.0005572844036697247,
1000
- "loss": 3.9356,
1001
  "step": 6700
1002
  },
1003
  {
1004
  "epoch": 0.7277628032345014,
1005
- "grad_norm": 0.6558448076248169,
1006
  "learning_rate": 0.0005569606044252563,
1007
- "loss": 3.9426,
1008
  "step": 6750
1009
  },
1010
  {
1011
  "epoch": 0.7331536388140162,
1012
- "grad_norm": 0.8135426640510559,
1013
  "learning_rate": 0.0005566368051807879,
1014
- "loss": 3.9613,
1015
  "step": 6800
1016
  },
1017
  {
1018
  "epoch": 0.738544474393531,
1019
- "grad_norm": 0.6013303995132446,
1020
  "learning_rate": 0.0005563130059363194,
1021
- "loss": 3.9451,
1022
  "step": 6850
1023
  },
1024
  {
1025
  "epoch": 0.7439353099730458,
1026
- "grad_norm": 0.5324015617370605,
1027
  "learning_rate": 0.000555989206691851,
1028
- "loss": 3.9444,
1029
  "step": 6900
1030
  },
1031
  {
1032
  "epoch": 0.7493261455525606,
1033
- "grad_norm": 0.6945801377296448,
1034
  "learning_rate": 0.0005556654074473826,
1035
- "loss": 3.9473,
1036
  "step": 6950
1037
  },
1038
  {
1039
  "epoch": 0.7547169811320755,
1040
- "grad_norm": 0.7069705128669739,
1041
  "learning_rate": 0.0005553416082029141,
1042
- "loss": 3.9328,
1043
  "step": 7000
1044
  },
1045
  {
1046
  "epoch": 0.7547169811320755,
1047
- "eval_accuracy": 0.3323933047655917,
1048
- "eval_loss": 3.8696444034576416,
1049
- "eval_runtime": 183.451,
1050
- "eval_samples_per_second": 98.179,
1051
- "eval_steps_per_second": 6.138,
1052
  "step": 7000
1053
  },
1054
  {
1055
  "epoch": 0.7601078167115903,
1056
- "grad_norm": 0.6576606631278992,
1057
  "learning_rate": 0.0005550178089584457,
1058
- "loss": 3.9266,
1059
  "step": 7050
1060
  },
1061
  {
1062
  "epoch": 0.7654986522911051,
1063
- "grad_norm": 0.5154832005500793,
1064
  "learning_rate": 0.0005546940097139772,
1065
- "loss": 3.9252,
1066
  "step": 7100
1067
  },
1068
  {
1069
  "epoch": 0.77088948787062,
1070
- "grad_norm": 0.6892321109771729,
1071
  "learning_rate": 0.0005543702104695089,
1072
- "loss": 3.9271,
1073
  "step": 7150
1074
  },
1075
  {
1076
  "epoch": 0.7762803234501348,
1077
- "grad_norm": 0.6380577087402344,
1078
  "learning_rate": 0.0005540464112250404,
1079
- "loss": 3.9261,
1080
  "step": 7200
1081
  },
1082
  {
1083
  "epoch": 0.7816711590296496,
1084
- "grad_norm": 0.652199923992157,
1085
  "learning_rate": 0.000553722611980572,
1086
- "loss": 3.9387,
1087
  "step": 7250
1088
  },
1089
  {
1090
  "epoch": 0.7870619946091644,
1091
- "grad_norm": 0.5706573724746704,
1092
  "learning_rate": 0.0005533988127361035,
1093
- "loss": 3.9201,
1094
  "step": 7300
1095
  },
1096
  {
1097
  "epoch": 0.7924528301886793,
1098
- "grad_norm": 0.5596190690994263,
1099
  "learning_rate": 0.0005530750134916352,
1100
- "loss": 3.9361,
1101
  "step": 7350
1102
  },
1103
  {
1104
  "epoch": 0.7978436657681941,
1105
- "grad_norm": 0.6239616274833679,
1106
  "learning_rate": 0.0005527512142471668,
1107
- "loss": 3.9104,
1108
  "step": 7400
1109
  },
1110
  {
1111
  "epoch": 0.8032345013477089,
1112
- "grad_norm": 0.5858375430107117,
1113
  "learning_rate": 0.0005524274150026982,
1114
- "loss": 3.9105,
1115
  "step": 7450
1116
  },
1117
  {
1118
  "epoch": 0.8086253369272237,
1119
- "grad_norm": 0.5788413286209106,
1120
  "learning_rate": 0.0005521036157582299,
1121
- "loss": 3.904,
1122
  "step": 7500
1123
  },
1124
  {
1125
  "epoch": 0.8140161725067385,
1126
- "grad_norm": 0.6172971725463867,
1127
  "learning_rate": 0.0005517798165137614,
1128
- "loss": 3.9068,
1129
  "step": 7550
1130
  },
1131
  {
1132
  "epoch": 0.8194070080862533,
1133
- "grad_norm": 0.6352159976959229,
1134
  "learning_rate": 0.000551456017269293,
1135
- "loss": 3.8812,
1136
  "step": 7600
1137
  },
1138
  {
1139
  "epoch": 0.8247978436657682,
1140
- "grad_norm": 0.6148518323898315,
1141
  "learning_rate": 0.0005511322180248245,
1142
- "loss": 3.8997,
1143
  "step": 7650
1144
  },
1145
  {
1146
  "epoch": 0.8301886792452831,
1147
- "grad_norm": 0.6033445000648499,
1148
  "learning_rate": 0.0005508084187803562,
1149
- "loss": 3.9111,
1150
  "step": 7700
1151
  },
1152
  {
1153
  "epoch": 0.8355795148247979,
1154
- "grad_norm": 0.5412169694900513,
1155
  "learning_rate": 0.0005504846195358877,
1156
- "loss": 3.9042,
1157
  "step": 7750
1158
  },
1159
  {
1160
  "epoch": 0.8409703504043127,
1161
- "grad_norm": 0.5904088616371155,
1162
  "learning_rate": 0.0005501608202914193,
1163
- "loss": 3.892,
1164
  "step": 7800
1165
  },
1166
  {
1167
  "epoch": 0.8463611859838275,
1168
- "grad_norm": 0.6405267715454102,
1169
  "learning_rate": 0.0005498370210469508,
1170
- "loss": 3.8977,
1171
  "step": 7850
1172
  },
1173
  {
1174
  "epoch": 0.8517520215633423,
1175
- "grad_norm": 0.6236185431480408,
1176
  "learning_rate": 0.0005495132218024824,
1177
- "loss": 3.8806,
1178
  "step": 7900
1179
  },
1180
  {
1181
  "epoch": 0.8571428571428571,
1182
- "grad_norm": 0.6019570231437683,
1183
  "learning_rate": 0.000549189422558014,
1184
- "loss": 3.8888,
1185
  "step": 7950
1186
  },
1187
  {
1188
  "epoch": 0.862533692722372,
1189
- "grad_norm": 0.5633127093315125,
1190
  "learning_rate": 0.0005488656233135456,
1191
- "loss": 3.8875,
1192
  "step": 8000
1193
  },
1194
  {
1195
  "epoch": 0.862533692722372,
1196
- "eval_accuracy": 0.33722933614932643,
1197
- "eval_loss": 3.821709632873535,
1198
- "eval_runtime": 183.4138,
1199
- "eval_samples_per_second": 98.199,
1200
- "eval_steps_per_second": 6.139,
1201
  "step": 8000
1202
  },
1203
  {
1204
  "epoch": 0.8679245283018868,
1205
- "grad_norm": 0.5992864966392517,
1206
  "learning_rate": 0.0005485418240690771,
1207
- "loss": 3.8707,
1208
  "step": 8050
1209
  },
1210
  {
1211
  "epoch": 0.8733153638814016,
1212
- "grad_norm": 0.6274523735046387,
1213
  "learning_rate": 0.0005482180248246087,
1214
- "loss": 3.8864,
1215
  "step": 8100
1216
  },
1217
  {
1218
  "epoch": 0.8787061994609164,
1219
- "grad_norm": 0.6176576614379883,
1220
  "learning_rate": 0.0005478942255801403,
1221
- "loss": 3.8807,
1222
  "step": 8150
1223
  },
1224
  {
1225
  "epoch": 0.8840970350404312,
1226
- "grad_norm": 0.5266938805580139,
1227
  "learning_rate": 0.0005475704263356718,
1228
- "loss": 3.8706,
1229
  "step": 8200
1230
  },
1231
  {
1232
  "epoch": 0.889487870619946,
1233
- "grad_norm": 0.5737940073013306,
1234
  "learning_rate": 0.0005472466270912034,
1235
- "loss": 3.8805,
1236
  "step": 8250
1237
  },
1238
  {
1239
  "epoch": 0.894878706199461,
1240
- "grad_norm": 0.6148428320884705,
1241
  "learning_rate": 0.000546922827846735,
1242
- "loss": 3.8693,
1243
  "step": 8300
1244
  },
1245
  {
1246
  "epoch": 0.9002695417789758,
1247
- "grad_norm": 0.5487964749336243,
1248
  "learning_rate": 0.0005465990286022665,
1249
- "loss": 3.869,
1250
  "step": 8350
1251
  },
1252
  {
1253
  "epoch": 0.9056603773584906,
1254
- "grad_norm": 0.6526573896408081,
1255
  "learning_rate": 0.0005462752293577981,
1256
- "loss": 3.8841,
1257
  "step": 8400
1258
  },
1259
  {
1260
  "epoch": 0.9110512129380054,
1261
- "grad_norm": 0.601149320602417,
1262
  "learning_rate": 0.0005459514301133296,
1263
- "loss": 3.8793,
1264
  "step": 8450
1265
  },
1266
  {
1267
  "epoch": 0.9164420485175202,
1268
- "grad_norm": 0.5263657569885254,
1269
  "learning_rate": 0.0005456276308688613,
1270
- "loss": 3.8761,
1271
  "step": 8500
1272
  },
1273
  {
1274
  "epoch": 0.921832884097035,
1275
- "grad_norm": 0.5656020045280457,
1276
  "learning_rate": 0.0005453038316243929,
1277
- "loss": 3.8672,
1278
  "step": 8550
1279
  },
1280
  {
1281
  "epoch": 0.9272237196765498,
1282
- "grad_norm": 0.565776526927948,
1283
  "learning_rate": 0.0005449800323799244,
1284
- "loss": 3.8777,
1285
  "step": 8600
1286
  },
1287
  {
1288
  "epoch": 0.9326145552560647,
1289
- "grad_norm": 0.5656868815422058,
1290
  "learning_rate": 0.000544656233135456,
1291
- "loss": 3.8577,
1292
  "step": 8650
1293
  },
1294
  {
1295
  "epoch": 0.9380053908355795,
1296
- "grad_norm": 0.5700314044952393,
1297
  "learning_rate": 0.0005443324338909875,
1298
- "loss": 3.8624,
1299
  "step": 8700
1300
  },
1301
  {
1302
  "epoch": 0.9433962264150944,
1303
- "grad_norm": 0.5940127968788147,
1304
  "learning_rate": 0.0005440086346465192,
1305
- "loss": 3.8726,
1306
  "step": 8750
1307
  },
1308
  {
1309
  "epoch": 0.9487870619946092,
1310
- "grad_norm": 0.5483199954032898,
1311
  "learning_rate": 0.0005436848354020506,
1312
- "loss": 3.8541,
1313
  "step": 8800
1314
  },
1315
  {
1316
  "epoch": 0.954177897574124,
1317
- "grad_norm": 0.6202383041381836,
1318
  "learning_rate": 0.0005433610361575823,
1319
- "loss": 3.8599,
1320
  "step": 8850
1321
  },
1322
  {
1323
  "epoch": 0.9595687331536388,
1324
- "grad_norm": 0.5427079200744629,
1325
  "learning_rate": 0.0005430372369131138,
1326
- "loss": 3.8437,
1327
  "step": 8900
1328
  },
1329
  {
1330
  "epoch": 0.9649595687331537,
1331
- "grad_norm": 0.5505421757698059,
1332
  "learning_rate": 0.0005427134376686454,
1333
- "loss": 3.858,
1334
  "step": 8950
1335
  },
1336
  {
1337
  "epoch": 0.9703504043126685,
1338
- "grad_norm": 0.6305214166641235,
1339
  "learning_rate": 0.0005423896384241769,
1340
- "loss": 3.8437,
1341
  "step": 9000
1342
  },
1343
  {
1344
  "epoch": 0.9703504043126685,
1345
- "eval_accuracy": 0.3406345176534323,
1346
- "eval_loss": 3.7832400798797607,
1347
- "eval_runtime": 183.7219,
1348
- "eval_samples_per_second": 98.034,
1349
- "eval_steps_per_second": 6.129,
1350
  "step": 9000
1351
  },
1352
  {
1353
  "epoch": 0.9757412398921833,
1354
- "grad_norm": 0.7392898797988892,
1355
  "learning_rate": 0.0005420658391797086,
1356
- "loss": 3.8372,
1357
  "step": 9050
1358
  },
1359
  {
1360
  "epoch": 0.9811320754716981,
1361
- "grad_norm": 0.587247908115387,
1362
  "learning_rate": 0.0005417420399352401,
1363
- "loss": 3.8504,
1364
  "step": 9100
1365
  },
1366
  {
1367
  "epoch": 0.9865229110512129,
1368
- "grad_norm": 0.5904769897460938,
1369
  "learning_rate": 0.0005414182406907717,
1370
- "loss": 3.844,
1371
  "step": 9150
1372
  },
1373
  {
1374
  "epoch": 0.9919137466307277,
1375
- "grad_norm": 0.632688581943512,
1376
  "learning_rate": 0.0005410944414463032,
1377
- "loss": 3.8564,
1378
  "step": 9200
1379
  },
1380
  {
1381
  "epoch": 0.9973045822102425,
1382
- "grad_norm": 0.5667609572410583,
1383
  "learning_rate": 0.0005407706422018348,
1384
- "loss": 3.8552,
1385
  "step": 9250
1386
  },
1387
  {
1388
  "epoch": 1.0026954177897573,
1389
- "grad_norm": 0.6239280700683594,
1390
  "learning_rate": 0.0005404468429573664,
1391
- "loss": 3.8025,
1392
  "step": 9300
1393
  },
1394
  {
1395
  "epoch": 1.0080862533692723,
1396
- "grad_norm": 0.6430540680885315,
1397
  "learning_rate": 0.000540123043712898,
1398
- "loss": 3.7743,
1399
  "step": 9350
1400
  },
1401
  {
1402
  "epoch": 1.013477088948787,
1403
- "grad_norm": 0.5992752909660339,
1404
  "learning_rate": 0.0005397992444684295,
1405
- "loss": 3.7575,
1406
  "step": 9400
1407
  },
1408
  {
1409
  "epoch": 1.0188679245283019,
1410
- "grad_norm": 0.6434339284896851,
1411
  "learning_rate": 0.0005394754452239611,
1412
- "loss": 3.7703,
1413
  "step": 9450
1414
  },
1415
  {
1416
  "epoch": 1.0242587601078168,
1417
- "grad_norm": 0.5548680424690247,
1418
  "learning_rate": 0.0005391516459794927,
1419
- "loss": 3.7811,
1420
  "step": 9500
1421
  },
1422
  {
1423
  "epoch": 1.0296495956873315,
1424
- "grad_norm": 0.5591529011726379,
1425
  "learning_rate": 0.0005388278467350242,
1426
- "loss": 3.7885,
1427
  "step": 9550
1428
  },
1429
  {
1430
  "epoch": 1.0350404312668464,
1431
- "grad_norm": 0.5492196083068848,
1432
  "learning_rate": 0.0005385040474905557,
1433
- "loss": 3.7816,
1434
  "step": 9600
1435
  },
1436
  {
1437
  "epoch": 1.0404312668463611,
1438
- "grad_norm": 0.5632776618003845,
1439
  "learning_rate": 0.0005381802482460874,
1440
- "loss": 3.7914,
1441
  "step": 9650
1442
  },
1443
  {
1444
  "epoch": 1.045822102425876,
1445
- "grad_norm": 0.5463435053825378,
1446
  "learning_rate": 0.000537856449001619,
1447
- "loss": 3.7925,
1448
  "step": 9700
1449
  },
1450
  {
1451
  "epoch": 1.0512129380053907,
1452
- "grad_norm": 0.5662521719932556,
1453
  "learning_rate": 0.0005375326497571505,
1454
- "loss": 3.7623,
1455
  "step": 9750
1456
  },
1457
  {
1458
  "epoch": 1.0566037735849056,
1459
- "grad_norm": 0.6173110008239746,
1460
  "learning_rate": 0.000537208850512682,
1461
- "loss": 3.7692,
1462
  "step": 9800
1463
  },
1464
  {
1465
  "epoch": 1.0619946091644206,
1466
- "grad_norm": 0.5675989389419556,
1467
  "learning_rate": 0.0005368850512682137,
1468
  "loss": 3.7665,
1469
  "step": 9850
1470
  },
1471
  {
1472
  "epoch": 1.0673854447439353,
1473
- "grad_norm": 0.5368490815162659,
1474
  "learning_rate": 0.0005365612520237453,
1475
- "loss": 3.7797,
1476
  "step": 9900
1477
  },
1478
  {
1479
  "epoch": 1.0727762803234502,
1480
- "grad_norm": 0.5896486639976501,
1481
  "learning_rate": 0.0005362374527792768,
1482
- "loss": 3.7825,
1483
  "step": 9950
1484
  },
1485
  {
1486
  "epoch": 1.0781671159029649,
1487
- "grad_norm": 0.5884218215942383,
1488
  "learning_rate": 0.0005359136535348084,
1489
- "loss": 3.7795,
1490
  "step": 10000
1491
  },
1492
  {
1493
  "epoch": 1.0781671159029649,
1494
- "eval_accuracy": 0.3442962286256681,
1495
- "eval_loss": 3.7569897174835205,
1496
- "eval_runtime": 183.6339,
1497
- "eval_samples_per_second": 98.081,
1498
  "eval_steps_per_second": 6.132,
1499
  "step": 10000
1500
  }
 
1
  {
2
+ "best_metric": 3.7604787349700928,
3
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_0_6910/checkpoint-10000",
4
  "epoch": 1.0781671159029649,
5
  "eval_steps": 1000,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.005390835579514825,
13
+ "grad_norm": 4.480591297149658,
14
  "learning_rate": 0.000276,
15
+ "loss": 9.0235,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.01078167115902965,
20
+ "grad_norm": 11.022015571594238,
21
  "learning_rate": 0.0005759999999999999,
22
+ "loss": 6.9569,
23
  "step": 100
24
  },
25
  {
26
  "epoch": 0.016172506738544475,
27
+ "grad_norm": 1.1546674966812134,
28
  "learning_rate": 0.000599702104695089,
29
+ "loss": 6.5399,
30
  "step": 150
31
  },
32
  {
33
  "epoch": 0.0215633423180593,
34
+ "grad_norm": 3.1353228092193604,
35
  "learning_rate": 0.0005993783054506205,
36
+ "loss": 6.248,
37
  "step": 200
38
  },
39
  {
40
  "epoch": 0.026954177897574125,
41
+ "grad_norm": 1.6697328090667725,
42
  "learning_rate": 0.0005990545062061521,
43
+ "loss": 6.0862,
44
  "step": 250
45
  },
46
  {
47
  "epoch": 0.03234501347708895,
48
+ "grad_norm": 2.0377206802368164,
49
  "learning_rate": 0.0005987307069616836,
50
+ "loss": 5.9479,
51
  "step": 300
52
  },
53
  {
54
  "epoch": 0.03773584905660377,
55
+ "grad_norm": 1.4318047761917114,
56
  "learning_rate": 0.0005984069077172153,
57
+ "loss": 5.8817,
58
  "step": 350
59
  },
60
  {
61
  "epoch": 0.0431266846361186,
62
+ "grad_norm": 1.5913349390029907,
63
  "learning_rate": 0.0005980831084727469,
64
+ "loss": 5.8152,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 0.04851752021563342,
69
+ "grad_norm": 1.8550465106964111,
70
  "learning_rate": 0.0005977593092282784,
71
+ "loss": 5.7519,
72
  "step": 450
73
  },
74
  {
75
  "epoch": 0.05390835579514825,
76
+ "grad_norm": 0.9737682342529297,
77
  "learning_rate": 0.00059743550998381,
78
+ "loss": 5.6406,
79
  "step": 500
80
  },
81
  {
82
  "epoch": 0.05929919137466307,
83
+ "grad_norm": 0.9562398791313171,
84
  "learning_rate": 0.0005971117107393416,
85
+ "loss": 5.6015,
86
  "step": 550
87
  },
88
  {
89
  "epoch": 0.0646900269541779,
90
+ "grad_norm": 1.5083309412002563,
91
  "learning_rate": 0.0005967879114948732,
92
+ "loss": 5.5299,
93
  "step": 600
94
  },
95
  {
96
  "epoch": 0.07008086253369272,
97
+ "grad_norm": 1.215437650680542,
98
  "learning_rate": 0.0005964641122504047,
99
+ "loss": 5.4602,
100
  "step": 650
101
  },
102
  {
103
  "epoch": 0.07547169811320754,
104
+ "grad_norm": 0.867893636226654,
105
  "learning_rate": 0.0005961403130059363,
106
+ "loss": 5.3926,
107
  "step": 700
108
  },
109
  {
110
  "epoch": 0.08086253369272237,
111
+ "grad_norm": 0.9043947458267212,
112
  "learning_rate": 0.0005958165137614678,
113
+ "loss": 5.3218,
114
  "step": 750
115
  },
116
  {
117
  "epoch": 0.0862533692722372,
118
+ "grad_norm": 1.0625038146972656,
119
  "learning_rate": 0.0005954927145169995,
120
+ "loss": 5.2494,
121
  "step": 800
122
  },
123
  {
124
  "epoch": 0.09164420485175202,
125
+ "grad_norm": 1.1064561605453491,
126
  "learning_rate": 0.0005951689152725309,
127
+ "loss": 5.2301,
128
  "step": 850
129
  },
130
  {
131
  "epoch": 0.09703504043126684,
132
+ "grad_norm": 1.3318357467651367,
133
  "learning_rate": 0.0005948451160280626,
134
+ "loss": 5.1705,
135
  "step": 900
136
  },
137
  {
138
  "epoch": 0.10242587601078167,
139
+ "grad_norm": 0.7937541007995605,
140
  "learning_rate": 0.0005945213167835941,
141
+ "loss": 5.1053,
142
  "step": 950
143
  },
144
  {
145
  "epoch": 0.1078167115902965,
146
+ "grad_norm": 1.1719013452529907,
147
  "learning_rate": 0.0005941975175391257,
148
+ "loss": 5.1141,
149
  "step": 1000
150
  },
151
  {
152
  "epoch": 0.1078167115902965,
153
+ "eval_accuracy": 0.2272003595106752,
154
+ "eval_loss": 5.0241169929504395,
155
+ "eval_runtime": 184.1996,
156
+ "eval_samples_per_second": 97.78,
157
+ "eval_steps_per_second": 6.113,
158
  "step": 1000
159
  },
160
  {
161
  "epoch": 0.11320754716981132,
162
+ "grad_norm": 1.2634905576705933,
163
  "learning_rate": 0.0005938737182946572,
164
+ "loss": 5.0469,
165
  "step": 1050
166
  },
167
  {
168
  "epoch": 0.11859838274932614,
169
+ "grad_norm": 1.3801699876785278,
170
  "learning_rate": 0.0005935499190501888,
171
+ "loss": 5.0061,
172
  "step": 1100
173
  },
174
  {
175
  "epoch": 0.12398921832884097,
176
+ "grad_norm": 0.9360342025756836,
177
  "learning_rate": 0.0005932261198057204,
178
+ "loss": 5.0103,
179
  "step": 1150
180
  },
181
  {
182
  "epoch": 0.1293800539083558,
183
+ "grad_norm": 0.822311282157898,
184
  "learning_rate": 0.000592902320561252,
185
+ "loss": 4.9697,
186
  "step": 1200
187
  },
188
  {
189
  "epoch": 0.1347708894878706,
190
+ "grad_norm": 0.9512536525726318,
191
  "learning_rate": 0.0005925785213167835,
192
+ "loss": 4.9463,
193
  "step": 1250
194
  },
195
  {
196
  "epoch": 0.14016172506738545,
197
+ "grad_norm": 0.9103826284408569,
198
  "learning_rate": 0.0005922547220723151,
199
+ "loss": 4.9043,
200
  "step": 1300
201
  },
202
  {
203
  "epoch": 0.14555256064690028,
204
+ "grad_norm": 1.1774672269821167,
205
  "learning_rate": 0.0005919309228278468,
206
+ "loss": 4.8499,
207
  "step": 1350
208
  },
209
  {
210
  "epoch": 0.1509433962264151,
211
+ "grad_norm": 1.0059438943862915,
212
  "learning_rate": 0.0005916071235833783,
213
+ "loss": 4.8548,
214
  "step": 1400
215
  },
216
  {
217
  "epoch": 0.15633423180592992,
218
+ "grad_norm": 1.1289772987365723,
219
  "learning_rate": 0.0005912833243389097,
220
+ "loss": 4.8155,
221
  "step": 1450
222
  },
223
  {
224
  "epoch": 0.16172506738544473,
225
+ "grad_norm": 0.8365297317504883,
226
  "learning_rate": 0.0005909595250944414,
227
+ "loss": 4.8128,
228
  "step": 1500
229
  },
230
  {
231
  "epoch": 0.16711590296495957,
232
+ "grad_norm": 0.9716570377349854,
233
  "learning_rate": 0.000590635725849973,
234
+ "loss": 4.7983,
235
  "step": 1550
236
  },
237
  {
238
  "epoch": 0.1725067385444744,
239
+ "grad_norm": 1.0259066820144653,
240
  "learning_rate": 0.0005903119266055045,
241
+ "loss": 4.7741,
242
  "step": 1600
243
  },
244
  {
245
  "epoch": 0.1778975741239892,
246
+ "grad_norm": 1.1290489435195923,
247
  "learning_rate": 0.0005899881273610361,
248
+ "loss": 4.7448,
249
  "step": 1650
250
  },
251
  {
252
  "epoch": 0.18328840970350405,
253
+ "grad_norm": 1.0750524997711182,
254
  "learning_rate": 0.0005896643281165677,
255
+ "loss": 4.7288,
256
  "step": 1700
257
  },
258
  {
259
  "epoch": 0.18867924528301888,
260
+ "grad_norm": 0.9852170944213867,
261
  "learning_rate": 0.0005893405288720993,
262
+ "loss": 4.6785,
263
  "step": 1750
264
  },
265
  {
266
  "epoch": 0.1940700808625337,
267
+ "grad_norm": 0.7680540680885315,
268
  "learning_rate": 0.0005890167296276308,
269
+ "loss": 4.7077,
270
  "step": 1800
271
  },
272
  {
273
  "epoch": 0.19946091644204852,
274
+ "grad_norm": 1.04122793674469,
275
  "learning_rate": 0.0005886929303831624,
276
+ "loss": 4.6545,
277
  "step": 1850
278
  },
279
  {
280
  "epoch": 0.20485175202156333,
281
+ "grad_norm": 0.8569570779800415,
282
  "learning_rate": 0.0005883691311386939,
283
+ "loss": 4.6332,
284
  "step": 1900
285
  },
286
  {
287
  "epoch": 0.21024258760107817,
288
+ "grad_norm": 0.6798708438873291,
289
  "learning_rate": 0.0005880453318942256,
290
+ "loss": 4.6122,
291
  "step": 1950
292
  },
293
  {
294
  "epoch": 0.215633423180593,
295
+ "grad_norm": 0.9140726327896118,
296
  "learning_rate": 0.0005877215326497571,
297
+ "loss": 4.5896,
298
  "step": 2000
299
  },
300
  {
301
  "epoch": 0.215633423180593,
302
+ "eval_accuracy": 0.2689640305701419,
303
+ "eval_loss": 4.519481182098389,
304
+ "eval_runtime": 183.592,
305
+ "eval_samples_per_second": 98.103,
306
+ "eval_steps_per_second": 6.133,
307
  "step": 2000
308
  },
309
  {
310
  "epoch": 0.2210242587601078,
311
+ "grad_norm": 0.9053829908370972,
312
  "learning_rate": 0.0005873977334052887,
313
  "loss": 4.576,
314
  "step": 2050
315
  },
316
  {
317
  "epoch": 0.22641509433962265,
318
+ "grad_norm": 1.0720793008804321,
319
  "learning_rate": 0.0005870739341608202,
320
+ "loss": 4.5561,
321
  "step": 2100
322
  },
323
  {
324
  "epoch": 0.23180592991913745,
325
+ "grad_norm": 0.9383276104927063,
326
  "learning_rate": 0.0005867501349163519,
327
+ "loss": 4.5334,
328
  "step": 2150
329
  },
330
  {
331
  "epoch": 0.2371967654986523,
332
+ "grad_norm": 0.7491163015365601,
333
  "learning_rate": 0.0005864263356718833,
334
+ "loss": 4.5287,
335
  "step": 2200
336
  },
337
  {
338
  "epoch": 0.24258760107816713,
339
+ "grad_norm": 0.8868927955627441,
340
  "learning_rate": 0.000586102536427415,
341
+ "loss": 4.5065,
342
  "step": 2250
343
  },
344
  {
345
  "epoch": 0.24797843665768193,
346
+ "grad_norm": 0.7057210206985474,
347
  "learning_rate": 0.0005857787371829465,
348
+ "loss": 4.4984,
349
  "step": 2300
350
  },
351
  {
352
  "epoch": 0.25336927223719674,
353
+ "grad_norm": 1.031919002532959,
354
  "learning_rate": 0.0005854549379384781,
355
+ "loss": 4.4739,
356
  "step": 2350
357
  },
358
  {
359
  "epoch": 0.2587601078167116,
360
+ "grad_norm": 0.7643184065818787,
361
  "learning_rate": 0.0005851311386940096,
362
+ "loss": 4.4625,
363
  "step": 2400
364
  },
365
  {
366
  "epoch": 0.2641509433962264,
367
+ "grad_norm": 0.7940760850906372,
368
  "learning_rate": 0.0005848073394495412,
369
+ "loss": 4.4487,
370
  "step": 2450
371
  },
372
  {
373
  "epoch": 0.2695417789757412,
374
+ "grad_norm": 1.0397628545761108,
375
  "learning_rate": 0.0005844835402050728,
376
+ "loss": 4.427,
377
  "step": 2500
378
  },
379
  {
380
  "epoch": 0.2749326145552561,
381
+ "grad_norm": 0.8003842830657959,
382
  "learning_rate": 0.0005841597409606044,
383
+ "loss": 4.3974,
384
  "step": 2550
385
  },
386
  {
387
  "epoch": 0.2803234501347709,
388
+ "grad_norm": 0.8629958629608154,
389
  "learning_rate": 0.000583835941716136,
390
+ "loss": 4.4104,
391
  "step": 2600
392
  },
393
  {
394
  "epoch": 0.2857142857142857,
395
+ "grad_norm": 0.8870101571083069,
396
  "learning_rate": 0.0005835121424716675,
397
+ "loss": 4.3916,
398
  "step": 2650
399
  },
400
  {
401
  "epoch": 0.29110512129380056,
402
+ "grad_norm": 0.826655387878418,
403
  "learning_rate": 0.0005831883432271992,
404
+ "loss": 4.377,
405
  "step": 2700
406
  },
407
  {
408
  "epoch": 0.29649595687331537,
409
+ "grad_norm": 0.9174228310585022,
410
  "learning_rate": 0.0005828645439827307,
411
+ "loss": 4.3734,
412
  "step": 2750
413
  },
414
  {
415
  "epoch": 0.3018867924528302,
416
+ "grad_norm": 0.7232673168182373,
417
  "learning_rate": 0.0005825407447382622,
418
+ "loss": 4.3265,
419
  "step": 2800
420
  },
421
  {
422
  "epoch": 0.30727762803234504,
423
+ "grad_norm": 0.6589747071266174,
424
  "learning_rate": 0.0005822169454937938,
425
+ "loss": 4.3549,
426
  "step": 2850
427
  },
428
  {
429
  "epoch": 0.31266846361185985,
430
+ "grad_norm": 0.718672513961792,
431
  "learning_rate": 0.0005818931462493254,
432
+ "loss": 4.3224,
433
  "step": 2900
434
  },
435
  {
436
  "epoch": 0.31805929919137466,
437
+ "grad_norm": 0.7860566973686218,
438
  "learning_rate": 0.0005815693470048569,
439
+ "loss": 4.3266,
440
  "step": 2950
441
  },
442
  {
443
  "epoch": 0.32345013477088946,
444
+ "grad_norm": 0.7988869547843933,
445
  "learning_rate": 0.0005812455477603885,
446
+ "loss": 4.3176,
447
  "step": 3000
448
  },
449
  {
450
  "epoch": 0.32345013477088946,
451
+ "eval_accuracy": 0.2978062655991593,
452
+ "eval_loss": 4.240555286407471,
453
+ "eval_runtime": 183.5428,
454
+ "eval_samples_per_second": 98.13,
455
+ "eval_steps_per_second": 6.135,
456
  "step": 3000
457
  },
458
  {
459
  "epoch": 0.3288409703504043,
460
+ "grad_norm": 0.6903765797615051,
461
  "learning_rate": 0.0005809217485159201,
462
+ "loss": 4.3178,
463
  "step": 3050
464
  },
465
  {
466
  "epoch": 0.33423180592991913,
467
+ "grad_norm": 0.8304448127746582,
468
  "learning_rate": 0.0005805979492714517,
469
+ "loss": 4.2967,
470
  "step": 3100
471
  },
472
  {
473
  "epoch": 0.33962264150943394,
474
+ "grad_norm": 0.7549655437469482,
475
  "learning_rate": 0.0005802741500269832,
476
+ "loss": 4.2781,
477
  "step": 3150
478
  },
479
  {
480
  "epoch": 0.3450134770889488,
481
+ "grad_norm": 0.7030351758003235,
482
  "learning_rate": 0.0005799503507825148,
483
  "loss": 4.2802,
484
  "step": 3200
485
  },
486
  {
487
  "epoch": 0.3504043126684636,
488
+ "grad_norm": 0.7830392122268677,
489
  "learning_rate": 0.0005796265515380463,
490
+ "loss": 4.2733,
491
  "step": 3250
492
  },
493
  {
494
  "epoch": 0.3557951482479784,
495
+ "grad_norm": 1.0291404724121094,
496
  "learning_rate": 0.000579302752293578,
497
+ "loss": 4.2733,
498
  "step": 3300
499
  },
500
  {
501
  "epoch": 0.3611859838274933,
502
+ "grad_norm": 0.9816632866859436,
503
  "learning_rate": 0.0005789789530491095,
504
+ "loss": 4.2498,
505
  "step": 3350
506
  },
507
  {
508
  "epoch": 0.3665768194070081,
509
+ "grad_norm": 0.7603627443313599,
510
  "learning_rate": 0.0005786551538046411,
511
+ "loss": 4.243,
512
  "step": 3400
513
  },
514
  {
515
  "epoch": 0.3719676549865229,
516
+ "grad_norm": 0.6449446082115173,
517
  "learning_rate": 0.0005783313545601726,
518
+ "loss": 4.2318,
519
  "step": 3450
520
  },
521
  {
522
  "epoch": 0.37735849056603776,
523
+ "grad_norm": 0.8295075297355652,
524
  "learning_rate": 0.0005780075553157043,
525
+ "loss": 4.2306,
526
  "step": 3500
527
  },
528
  {
529
  "epoch": 0.38274932614555257,
530
+ "grad_norm": 0.7270027995109558,
531
  "learning_rate": 0.0005776837560712357,
532
+ "loss": 4.229,
533
  "step": 3550
534
  },
535
  {
536
  "epoch": 0.3881401617250674,
537
+ "grad_norm": 0.7727758884429932,
538
  "learning_rate": 0.0005773599568267673,
539
+ "loss": 4.2168,
540
  "step": 3600
541
  },
542
  {
543
  "epoch": 0.3935309973045822,
544
+ "grad_norm": 0.7819605469703674,
545
  "learning_rate": 0.0005770361575822989,
546
+ "loss": 4.2084,
547
  "step": 3650
548
  },
549
  {
550
  "epoch": 0.39892183288409705,
551
+ "grad_norm": 0.6820802092552185,
552
  "learning_rate": 0.0005767123583378305,
553
+ "loss": 4.1978,
554
  "step": 3700
555
  },
556
  {
557
  "epoch": 0.40431266846361186,
558
+ "grad_norm": 0.7236999273300171,
559
  "learning_rate": 0.000576388559093362,
560
+ "loss": 4.2005,
561
  "step": 3750
562
  },
563
  {
564
  "epoch": 0.40970350404312667,
565
+ "grad_norm": 0.6337780952453613,
566
  "learning_rate": 0.0005760647598488936,
567
+ "loss": 4.1639,
568
  "step": 3800
569
  },
570
  {
571
  "epoch": 0.41509433962264153,
572
+ "grad_norm": 0.6012069582939148,
573
  "learning_rate": 0.0005757409606044253,
574
+ "loss": 4.1724,
575
  "step": 3850
576
  },
577
  {
578
  "epoch": 0.42048517520215634,
579
+ "grad_norm": 0.702266275882721,
580
  "learning_rate": 0.0005754171613599568,
581
+ "loss": 4.1909,
582
  "step": 3900
583
  },
584
  {
585
  "epoch": 0.42587601078167114,
586
+ "grad_norm": 1.7298035621643066,
587
  "learning_rate": 0.0005750933621154884,
588
+ "loss": 4.1606,
589
  "step": 3950
590
  },
591
  {
592
  "epoch": 0.431266846361186,
593
+ "grad_norm": 0.6099041104316711,
594
  "learning_rate": 0.0005747695628710199,
595
+ "loss": 4.1612,
596
  "step": 4000
597
  },
598
  {
599
  "epoch": 0.431266846361186,
600
+ "eval_accuracy": 0.3116489694653876,
601
+ "eval_loss": 4.0956339836120605,
602
+ "eval_runtime": 183.5386,
603
+ "eval_samples_per_second": 98.132,
604
+ "eval_steps_per_second": 6.135,
605
  "step": 4000
606
  },
607
  {
608
  "epoch": 0.4366576819407008,
609
+ "grad_norm": 0.7810817360877991,
610
  "learning_rate": 0.0005744457636265515,
611
+ "loss": 4.1682,
612
  "step": 4050
613
  },
614
  {
615
  "epoch": 0.4420485175202156,
616
+ "grad_norm": 0.7356455326080322,
617
  "learning_rate": 0.0005741219643820831,
618
+ "loss": 4.1615,
619
  "step": 4100
620
  },
621
  {
622
  "epoch": 0.4474393530997305,
623
+ "grad_norm": 0.6011427044868469,
624
  "learning_rate": 0.0005737981651376146,
625
+ "loss": 4.1428,
626
  "step": 4150
627
  },
628
  {
629
  "epoch": 0.4528301886792453,
630
+ "grad_norm": 0.6384567618370056,
631
  "learning_rate": 0.0005734743658931462,
632
+ "loss": 4.1515,
633
  "step": 4200
634
  },
635
  {
636
  "epoch": 0.4582210242587601,
637
+ "grad_norm": 0.60776686668396,
638
  "learning_rate": 0.0005731505666486778,
639
+ "loss": 4.1383,
640
  "step": 4250
641
  },
642
  {
643
  "epoch": 0.4636118598382749,
644
+ "grad_norm": 0.7365734577178955,
645
  "learning_rate": 0.0005728267674042093,
646
+ "loss": 4.1402,
647
  "step": 4300
648
  },
649
  {
650
  "epoch": 0.46900269541778977,
651
+ "grad_norm": 0.9101449251174927,
652
  "learning_rate": 0.0005725029681597409,
653
+ "loss": 4.1279,
654
  "step": 4350
655
  },
656
  {
657
  "epoch": 0.4743935309973046,
658
+ "grad_norm": 0.7431625127792358,
659
  "learning_rate": 0.0005721791689152725,
660
+ "loss": 4.1123,
661
  "step": 4400
662
  },
663
  {
664
  "epoch": 0.4797843665768194,
665
+ "grad_norm": 0.6213470101356506,
666
  "learning_rate": 0.0005718553696708041,
667
+ "loss": 4.1218,
668
  "step": 4450
669
  },
670
  {
671
  "epoch": 0.48517520215633425,
672
+ "grad_norm": 0.5886948108673096,
673
  "learning_rate": 0.0005715315704263356,
674
+ "loss": 4.115,
675
  "step": 4500
676
  },
677
  {
678
  "epoch": 0.49056603773584906,
679
+ "grad_norm": 0.7804937958717346,
680
  "learning_rate": 0.0005712077711818672,
681
+ "loss": 4.1258,
682
  "step": 4550
683
  },
684
  {
685
  "epoch": 0.49595687331536387,
686
+ "grad_norm": 0.8453713059425354,
687
  "learning_rate": 0.0005708839719373987,
688
+ "loss": 4.1191,
689
  "step": 4600
690
  },
691
  {
692
  "epoch": 0.5013477088948787,
693
+ "grad_norm": 0.6225478053092957,
694
  "learning_rate": 0.0005705601726929304,
695
+ "loss": 4.077,
696
  "step": 4650
697
  },
698
  {
699
  "epoch": 0.5067385444743935,
700
+ "grad_norm": 0.6805531978607178,
701
  "learning_rate": 0.0005702363734484619,
702
+ "loss": 4.0773,
703
  "step": 4700
704
  },
705
  {
706
  "epoch": 0.5121293800539084,
707
+ "grad_norm": 0.5881138443946838,
708
  "learning_rate": 0.0005699125742039935,
709
+ "loss": 4.0845,
710
  "step": 4750
711
  },
712
  {
713
  "epoch": 0.5175202156334232,
714
+ "grad_norm": 0.7222368121147156,
715
  "learning_rate": 0.000569588774959525,
716
+ "loss": 4.0792,
717
  "step": 4800
718
  },
719
  {
720
  "epoch": 0.522911051212938,
721
+ "grad_norm": 0.6774405837059021,
722
  "learning_rate": 0.0005692649757150567,
723
+ "loss": 4.0735,
724
  "step": 4850
725
  },
726
  {
727
  "epoch": 0.5283018867924528,
728
+ "grad_norm": 0.6772297024726868,
729
  "learning_rate": 0.0005689411764705881,
730
+ "loss": 4.0765,
731
  "step": 4900
732
  },
733
  {
734
  "epoch": 0.5336927223719676,
735
+ "grad_norm": 0.6554037928581238,
736
  "learning_rate": 0.0005686173772261197,
737
+ "loss": 4.0658,
738
  "step": 4950
739
  },
740
  {
741
  "epoch": 0.5390835579514824,
742
+ "grad_norm": 0.8091623783111572,
743
  "learning_rate": 0.0005682935779816514,
744
  "loss": 4.0659,
745
  "step": 5000
746
  },
747
  {
748
  "epoch": 0.5390835579514824,
749
+ "eval_accuracy": 0.32031305939260213,
750
+ "eval_loss": 4.003881454467773,
751
+ "eval_runtime": 183.6428,
752
+ "eval_samples_per_second": 98.076,
753
+ "eval_steps_per_second": 6.131,
754
  "step": 5000
755
  },
756
  {
757
  "epoch": 0.5444743935309974,
758
+ "grad_norm": 0.683763861656189,
759
  "learning_rate": 0.0005679697787371829,
760
+ "loss": 4.073,
761
  "step": 5050
762
  },
763
  {
764
  "epoch": 0.5498652291105122,
765
+ "grad_norm": 0.6603772640228271,
766
  "learning_rate": 0.0005676459794927145,
767
+ "loss": 4.061,
768
  "step": 5100
769
  },
770
  {
771
  "epoch": 0.555256064690027,
772
+ "grad_norm": 0.6838295459747314,
773
  "learning_rate": 0.000567322180248246,
774
+ "loss": 4.0538,
775
  "step": 5150
776
  },
777
  {
778
  "epoch": 0.5606469002695418,
779
+ "grad_norm": 0.633878231048584,
780
  "learning_rate": 0.0005669983810037777,
781
+ "loss": 4.0601,
782
  "step": 5200
783
  },
784
  {
785
  "epoch": 0.5660377358490566,
786
+ "grad_norm": 0.6317396759986877,
787
  "learning_rate": 0.0005666745817593092,
788
+ "loss": 4.0392,
789
  "step": 5250
790
  },
791
  {
792
  "epoch": 0.5714285714285714,
793
+ "grad_norm": 0.5789533257484436,
794
  "learning_rate": 0.0005663507825148408,
795
+ "loss": 4.0392,
796
  "step": 5300
797
  },
798
  {
799
  "epoch": 0.5768194070080862,
800
+ "grad_norm": 0.6972223520278931,
801
  "learning_rate": 0.0005660269832703723,
802
+ "loss": 4.0281,
803
  "step": 5350
804
  },
805
  {
806
  "epoch": 0.5822102425876011,
807
+ "grad_norm": 0.6561431288719177,
808
  "learning_rate": 0.0005657031840259039,
809
+ "loss": 4.0341,
810
  "step": 5400
811
  },
812
  {
813
  "epoch": 0.5876010781671159,
814
+ "grad_norm": 0.7065162062644958,
815
  "learning_rate": 0.0005653793847814355,
816
+ "loss": 4.0336,
817
  "step": 5450
818
  },
819
  {
820
  "epoch": 0.5929919137466307,
821
+ "grad_norm": 0.7364255785942078,
822
  "learning_rate": 0.000565055585536967,
823
+ "loss": 4.0121,
824
  "step": 5500
825
  },
826
  {
827
  "epoch": 0.5983827493261455,
828
+ "grad_norm": 0.6912387013435364,
829
  "learning_rate": 0.0005647317862924986,
830
+ "loss": 4.0371,
831
  "step": 5550
832
  },
833
  {
834
  "epoch": 0.6037735849056604,
835
+ "grad_norm": 0.6509950757026672,
836
  "learning_rate": 0.0005644079870480302,
837
+ "loss": 3.9827,
838
  "step": 5600
839
  },
840
  {
841
  "epoch": 0.6091644204851752,
842
+ "grad_norm": 0.583453893661499,
843
  "learning_rate": 0.0005640841878035617,
844
+ "loss": 4.0182,
845
  "step": 5650
846
  },
847
  {
848
  "epoch": 0.6145552560646901,
849
+ "grad_norm": 0.652877926826477,
850
  "learning_rate": 0.0005637603885590933,
851
+ "loss": 4.0188,
852
  "step": 5700
853
  },
854
  {
855
  "epoch": 0.6199460916442049,
856
+ "grad_norm": 0.6168596148490906,
857
  "learning_rate": 0.0005634365893146248,
858
+ "loss": 4.0192,
859
  "step": 5750
860
  },
861
  {
862
  "epoch": 0.6253369272237197,
863
+ "grad_norm": 0.5807291865348816,
864
  "learning_rate": 0.0005631127900701565,
865
+ "loss": 4.008,
866
  "step": 5800
867
  },
868
  {
869
  "epoch": 0.6307277628032345,
870
+ "grad_norm": 0.6488702893257141,
871
  "learning_rate": 0.000562788990825688,
872
+ "loss": 3.9801,
873
  "step": 5850
874
  },
875
  {
876
  "epoch": 0.6361185983827493,
877
+ "grad_norm": 0.5922713279724121,
878
  "learning_rate": 0.0005624651915812196,
879
+ "loss": 3.9942,
880
  "step": 5900
881
  },
882
  {
883
  "epoch": 0.6415094339622641,
884
+ "grad_norm": 0.6832694411277771,
885
  "learning_rate": 0.0005621413923367511,
886
+ "loss": 4.0023,
887
  "step": 5950
888
  },
889
  {
890
  "epoch": 0.6469002695417789,
891
+ "grad_norm": 0.5798869729042053,
892
  "learning_rate": 0.0005618175930922828,
893
+ "loss": 4.0073,
894
  "step": 6000
895
  },
896
  {
897
  "epoch": 0.6469002695417789,
898
+ "eval_accuracy": 0.32747524045155707,
899
+ "eval_loss": 3.9221885204315186,
900
+ "eval_runtime": 183.4523,
901
+ "eval_samples_per_second": 98.178,
902
+ "eval_steps_per_second": 6.138,
903
  "step": 6000
904
  },
905
  {
906
  "epoch": 0.6522911051212938,
907
+ "grad_norm": 0.6464242935180664,
908
  "learning_rate": 0.0005614937938478143,
909
+ "loss": 3.9966,
910
  "step": 6050
911
  },
912
  {
913
  "epoch": 0.6576819407008087,
914
+ "grad_norm": 0.658988356590271,
915
  "learning_rate": 0.0005611699946033459,
916
+ "loss": 3.9887,
917
  "step": 6100
918
  },
919
  {
920
  "epoch": 0.6630727762803235,
921
+ "grad_norm": 0.6025015711784363,
922
  "learning_rate": 0.0005608461953588774,
923
+ "loss": 3.9854,
924
  "step": 6150
925
  },
926
  {
927
  "epoch": 0.6684636118598383,
928
+ "grad_norm": 0.5225794315338135,
929
  "learning_rate": 0.000560522396114409,
930
+ "loss": 3.9778,
931
  "step": 6200
932
  },
933
  {
934
  "epoch": 0.6738544474393531,
935
+ "grad_norm": 0.6130637526512146,
936
  "learning_rate": 0.0005601985968699405,
937
+ "loss": 3.9589,
938
  "step": 6250
939
  },
940
  {
941
  "epoch": 0.6792452830188679,
942
+ "grad_norm": 0.7012153267860413,
943
  "learning_rate": 0.0005598747976254721,
944
+ "loss": 3.9756,
945
  "step": 6300
946
  },
947
  {
948
  "epoch": 0.6846361185983828,
949
+ "grad_norm": 0.6216394901275635,
950
  "learning_rate": 0.0005595509983810038,
951
+ "loss": 3.9472,
952
  "step": 6350
953
  },
954
  {
955
  "epoch": 0.6900269541778976,
956
+ "grad_norm": 0.5851438641548157,
957
  "learning_rate": 0.0005592271991365353,
958
+ "loss": 3.961,
959
  "step": 6400
960
  },
961
  {
962
  "epoch": 0.6954177897574124,
963
+ "grad_norm": 0.6050034761428833,
964
  "learning_rate": 0.0005589033998920669,
965
+ "loss": 3.977,
966
  "step": 6450
967
  },
968
  {
969
  "epoch": 0.7008086253369272,
970
+ "grad_norm": 0.7108364105224609,
971
  "learning_rate": 0.0005585796006475984,
972
+ "loss": 3.9476,
973
  "step": 6500
974
  },
975
  {
976
  "epoch": 0.706199460916442,
977
+ "grad_norm": 0.6024855971336365,
978
  "learning_rate": 0.0005582558014031301,
979
+ "loss": 3.9737,
980
  "step": 6550
981
  },
982
  {
983
  "epoch": 0.7115902964959568,
984
+ "grad_norm": 0.6015711426734924,
985
  "learning_rate": 0.0005579320021586616,
986
+ "loss": 3.9551,
987
  "step": 6600
988
  },
989
  {
990
  "epoch": 0.7169811320754716,
991
+ "grad_norm": 0.5948218107223511,
992
  "learning_rate": 0.0005576082029141932,
993
+ "loss": 3.9501,
994
  "step": 6650
995
  },
996
  {
997
  "epoch": 0.7223719676549866,
998
+ "grad_norm": 0.5604109168052673,
999
  "learning_rate": 0.0005572844036697247,
1000
+ "loss": 3.9359,
1001
  "step": 6700
1002
  },
1003
  {
1004
  "epoch": 0.7277628032345014,
1005
+ "grad_norm": 0.5681114196777344,
1006
  "learning_rate": 0.0005569606044252563,
1007
+ "loss": 3.9409,
1008
  "step": 6750
1009
  },
1010
  {
1011
  "epoch": 0.7331536388140162,
1012
+ "grad_norm": 0.7211326360702515,
1013
  "learning_rate": 0.0005566368051807879,
1014
+ "loss": 3.959,
1015
  "step": 6800
1016
  },
1017
  {
1018
  "epoch": 0.738544474393531,
1019
+ "grad_norm": 0.7264713644981384,
1020
  "learning_rate": 0.0005563130059363194,
1021
+ "loss": 3.946,
1022
  "step": 6850
1023
  },
1024
  {
1025
  "epoch": 0.7439353099730458,
1026
+ "grad_norm": 0.537442684173584,
1027
  "learning_rate": 0.000555989206691851,
1028
+ "loss": 3.9459,
1029
  "step": 6900
1030
  },
1031
  {
1032
  "epoch": 0.7493261455525606,
1033
+ "grad_norm": 0.6321280002593994,
1034
  "learning_rate": 0.0005556654074473826,
1035
+ "loss": 3.9496,
1036
  "step": 6950
1037
  },
1038
  {
1039
  "epoch": 0.7547169811320755,
1040
+ "grad_norm": 0.6744154691696167,
1041
  "learning_rate": 0.0005553416082029141,
1042
+ "loss": 3.9322,
1043
  "step": 7000
1044
  },
1045
  {
1046
  "epoch": 0.7547169811320755,
1047
+ "eval_accuracy": 0.33246686277064214,
1048
+ "eval_loss": 3.870067596435547,
1049
+ "eval_runtime": 183.7087,
1050
+ "eval_samples_per_second": 98.041,
1051
+ "eval_steps_per_second": 6.129,
1052
  "step": 7000
1053
  },
1054
  {
1055
  "epoch": 0.7601078167115903,
1056
+ "grad_norm": 0.5471804141998291,
1057
  "learning_rate": 0.0005550178089584457,
1058
+ "loss": 3.925,
1059
  "step": 7050
1060
  },
1061
  {
1062
  "epoch": 0.7654986522911051,
1063
+ "grad_norm": 0.5467572808265686,
1064
  "learning_rate": 0.0005546940097139772,
1065
+ "loss": 3.9244,
1066
  "step": 7100
1067
  },
1068
  {
1069
  "epoch": 0.77088948787062,
1070
+ "grad_norm": 0.6606742143630981,
1071
  "learning_rate": 0.0005543702104695089,
1072
+ "loss": 3.928,
1073
  "step": 7150
1074
  },
1075
  {
1076
  "epoch": 0.7762803234501348,
1077
+ "grad_norm": 0.5569108724594116,
1078
  "learning_rate": 0.0005540464112250404,
1079
+ "loss": 3.9235,
1080
  "step": 7200
1081
  },
1082
  {
1083
  "epoch": 0.7816711590296496,
1084
+ "grad_norm": 0.6275643706321716,
1085
  "learning_rate": 0.000553722611980572,
1086
+ "loss": 3.9371,
1087
  "step": 7250
1088
  },
1089
  {
1090
  "epoch": 0.7870619946091644,
1091
+ "grad_norm": 0.5649941563606262,
1092
  "learning_rate": 0.0005533988127361035,
1093
+ "loss": 3.9205,
1094
  "step": 7300
1095
  },
1096
  {
1097
  "epoch": 0.7924528301886793,
1098
+ "grad_norm": 0.6518858671188354,
1099
  "learning_rate": 0.0005530750134916352,
1100
+ "loss": 3.9381,
1101
  "step": 7350
1102
  },
1103
  {
1104
  "epoch": 0.7978436657681941,
1105
+ "grad_norm": 0.5718905329704285,
1106
  "learning_rate": 0.0005527512142471668,
1107
+ "loss": 3.9101,
1108
  "step": 7400
1109
  },
1110
  {
1111
  "epoch": 0.8032345013477089,
1112
+ "grad_norm": 0.5825619101524353,
1113
  "learning_rate": 0.0005524274150026982,
1114
+ "loss": 3.9122,
1115
  "step": 7450
1116
  },
1117
  {
1118
  "epoch": 0.8086253369272237,
1119
+ "grad_norm": 0.5970243215560913,
1120
  "learning_rate": 0.0005521036157582299,
1121
+ "loss": 3.9021,
1122
  "step": 7500
1123
  },
1124
  {
1125
  "epoch": 0.8140161725067385,
1126
+ "grad_norm": 0.6513302326202393,
1127
  "learning_rate": 0.0005517798165137614,
1128
+ "loss": 3.9071,
1129
  "step": 7550
1130
  },
1131
  {
1132
  "epoch": 0.8194070080862533,
1133
+ "grad_norm": 0.6460273861885071,
1134
  "learning_rate": 0.000551456017269293,
1135
+ "loss": 3.8825,
1136
  "step": 7600
1137
  },
1138
  {
1139
  "epoch": 0.8247978436657682,
1140
+ "grad_norm": 0.5874283909797668,
1141
  "learning_rate": 0.0005511322180248245,
1142
+ "loss": 3.9013,
1143
  "step": 7650
1144
  },
1145
  {
1146
  "epoch": 0.8301886792452831,
1147
+ "grad_norm": 0.5879489183425903,
1148
  "learning_rate": 0.0005508084187803562,
1149
+ "loss": 3.9105,
1150
  "step": 7700
1151
  },
1152
  {
1153
  "epoch": 0.8355795148247979,
1154
+ "grad_norm": 0.5399644374847412,
1155
  "learning_rate": 0.0005504846195358877,
1156
+ "loss": 3.9043,
1157
  "step": 7750
1158
  },
1159
  {
1160
  "epoch": 0.8409703504043127,
1161
+ "grad_norm": 0.6143382787704468,
1162
  "learning_rate": 0.0005501608202914193,
1163
+ "loss": 3.894,
1164
  "step": 7800
1165
  },
1166
  {
1167
  "epoch": 0.8463611859838275,
1168
+ "grad_norm": 0.6967446804046631,
1169
  "learning_rate": 0.0005498370210469508,
1170
+ "loss": 3.8981,
1171
  "step": 7850
1172
  },
1173
  {
1174
  "epoch": 0.8517520215633423,
1175
+ "grad_norm": 0.6088925004005432,
1176
  "learning_rate": 0.0005495132218024824,
1177
+ "loss": 3.8815,
1178
  "step": 7900
1179
  },
1180
  {
1181
  "epoch": 0.8571428571428571,
1182
+ "grad_norm": 0.6350536346435547,
1183
  "learning_rate": 0.000549189422558014,
1184
+ "loss": 3.892,
1185
  "step": 7950
1186
  },
1187
  {
1188
  "epoch": 0.862533692722372,
1189
+ "grad_norm": 0.6048186421394348,
1190
  "learning_rate": 0.0005488656233135456,
1191
+ "loss": 3.8887,
1192
  "step": 8000
1193
  },
1194
  {
1195
  "epoch": 0.862533692722372,
1196
+ "eval_accuracy": 0.3370834153209916,
1197
+ "eval_loss": 3.8229870796203613,
1198
+ "eval_runtime": 183.3909,
1199
+ "eval_samples_per_second": 98.211,
1200
+ "eval_steps_per_second": 6.14,
1201
  "step": 8000
1202
  },
1203
  {
1204
  "epoch": 0.8679245283018868,
1205
+ "grad_norm": 0.5503471493721008,
1206
  "learning_rate": 0.0005485418240690771,
1207
+ "loss": 3.8711,
1208
  "step": 8050
1209
  },
1210
  {
1211
  "epoch": 0.8733153638814016,
1212
+ "grad_norm": 0.6174845695495605,
1213
  "learning_rate": 0.0005482180248246087,
1214
+ "loss": 3.8863,
1215
  "step": 8100
1216
  },
1217
  {
1218
  "epoch": 0.8787061994609164,
1219
+ "grad_norm": 0.6125220656394958,
1220
  "learning_rate": 0.0005478942255801403,
1221
+ "loss": 3.8837,
1222
  "step": 8150
1223
  },
1224
  {
1225
  "epoch": 0.8840970350404312,
1226
+ "grad_norm": 0.5201207399368286,
1227
  "learning_rate": 0.0005475704263356718,
1228
+ "loss": 3.8717,
1229
  "step": 8200
1230
  },
1231
  {
1232
  "epoch": 0.889487870619946,
1233
+ "grad_norm": 0.5790812373161316,
1234
  "learning_rate": 0.0005472466270912034,
1235
+ "loss": 3.882,
1236
  "step": 8250
1237
  },
1238
  {
1239
  "epoch": 0.894878706199461,
1240
+ "grad_norm": 0.6342065334320068,
1241
  "learning_rate": 0.000546922827846735,
1242
+ "loss": 3.8698,
1243
  "step": 8300
1244
  },
1245
  {
1246
  "epoch": 0.9002695417789758,
1247
+ "grad_norm": 0.564558744430542,
1248
  "learning_rate": 0.0005465990286022665,
1249
+ "loss": 3.8684,
1250
  "step": 8350
1251
  },
1252
  {
1253
  "epoch": 0.9056603773584906,
1254
+ "grad_norm": 0.5782247185707092,
1255
  "learning_rate": 0.0005462752293577981,
1256
+ "loss": 3.8869,
1257
  "step": 8400
1258
  },
1259
  {
1260
  "epoch": 0.9110512129380054,
1261
+ "grad_norm": 0.586661696434021,
1262
  "learning_rate": 0.0005459514301133296,
1263
+ "loss": 3.8805,
1264
  "step": 8450
1265
  },
1266
  {
1267
  "epoch": 0.9164420485175202,
1268
+ "grad_norm": 0.5306695699691772,
1269
  "learning_rate": 0.0005456276308688613,
1270
+ "loss": 3.8768,
1271
  "step": 8500
1272
  },
1273
  {
1274
  "epoch": 0.921832884097035,
1275
+ "grad_norm": 0.6053674221038818,
1276
  "learning_rate": 0.0005453038316243929,
1277
+ "loss": 3.8682,
1278
  "step": 8550
1279
  },
1280
  {
1281
  "epoch": 0.9272237196765498,
1282
+ "grad_norm": 0.5706995129585266,
1283
  "learning_rate": 0.0005449800323799244,
1284
+ "loss": 3.8806,
1285
  "step": 8600
1286
  },
1287
  {
1288
  "epoch": 0.9326145552560647,
1289
+ "grad_norm": 0.5597892999649048,
1290
  "learning_rate": 0.000544656233135456,
1291
+ "loss": 3.8589,
1292
  "step": 8650
1293
  },
1294
  {
1295
  "epoch": 0.9380053908355795,
1296
+ "grad_norm": 0.5851758122444153,
1297
  "learning_rate": 0.0005443324338909875,
1298
+ "loss": 3.8635,
1299
  "step": 8700
1300
  },
1301
  {
1302
  "epoch": 0.9433962264150944,
1303
+ "grad_norm": 0.564349889755249,
1304
  "learning_rate": 0.0005440086346465192,
1305
+ "loss": 3.8742,
1306
  "step": 8750
1307
  },
1308
  {
1309
  "epoch": 0.9487870619946092,
1310
+ "grad_norm": 0.5862805843353271,
1311
  "learning_rate": 0.0005436848354020506,
1312
+ "loss": 3.855,
1313
  "step": 8800
1314
  },
1315
  {
1316
  "epoch": 0.954177897574124,
1317
+ "grad_norm": 0.5932011008262634,
1318
  "learning_rate": 0.0005433610361575823,
1319
+ "loss": 3.8588,
1320
  "step": 8850
1321
  },
1322
  {
1323
  "epoch": 0.9595687331536388,
1324
+ "grad_norm": 0.57476806640625,
1325
  "learning_rate": 0.0005430372369131138,
1326
+ "loss": 3.8431,
1327
  "step": 8900
1328
  },
1329
  {
1330
  "epoch": 0.9649595687331537,
1331
+ "grad_norm": 0.6483927369117737,
1332
  "learning_rate": 0.0005427134376686454,
1333
+ "loss": 3.8577,
1334
  "step": 8950
1335
  },
1336
  {
1337
  "epoch": 0.9703504043126685,
1338
+ "grad_norm": 0.6311809420585632,
1339
  "learning_rate": 0.0005423896384241769,
1340
+ "loss": 3.8463,
1341
  "step": 9000
1342
  },
1343
  {
1344
  "epoch": 0.9703504043126685,
1345
+ "eval_accuracy": 0.3406645058504691,
1346
+ "eval_loss": 3.783052682876587,
1347
+ "eval_runtime": 183.7352,
1348
+ "eval_samples_per_second": 98.027,
1349
+ "eval_steps_per_second": 6.128,
1350
  "step": 9000
1351
  },
1352
  {
1353
  "epoch": 0.9757412398921833,
1354
+ "grad_norm": 0.585279643535614,
1355
  "learning_rate": 0.0005420658391797086,
1356
+ "loss": 3.8383,
1357
  "step": 9050
1358
  },
1359
  {
1360
  "epoch": 0.9811320754716981,
1361
+ "grad_norm": 0.5927286744117737,
1362
  "learning_rate": 0.0005417420399352401,
1363
+ "loss": 3.8525,
1364
  "step": 9100
1365
  },
1366
  {
1367
  "epoch": 0.9865229110512129,
1368
+ "grad_norm": 0.6152717471122742,
1369
  "learning_rate": 0.0005414182406907717,
1370
+ "loss": 3.8451,
1371
  "step": 9150
1372
  },
1373
  {
1374
  "epoch": 0.9919137466307277,
1375
+ "grad_norm": 0.6622257828712463,
1376
  "learning_rate": 0.0005410944414463032,
1377
+ "loss": 3.8608,
1378
  "step": 9200
1379
  },
1380
  {
1381
  "epoch": 0.9973045822102425,
1382
+ "grad_norm": 0.5549481511116028,
1383
  "learning_rate": 0.0005407706422018348,
1384
+ "loss": 3.8575,
1385
  "step": 9250
1386
  },
1387
  {
1388
  "epoch": 1.0026954177897573,
1389
+ "grad_norm": 0.5387691855430603,
1390
  "learning_rate": 0.0005404468429573664,
1391
+ "loss": 3.8051,
1392
  "step": 9300
1393
  },
1394
  {
1395
  "epoch": 1.0080862533692723,
1396
+ "grad_norm": 0.6694597005844116,
1397
  "learning_rate": 0.000540123043712898,
1398
+ "loss": 3.7746,
1399
  "step": 9350
1400
  },
1401
  {
1402
  "epoch": 1.013477088948787,
1403
+ "grad_norm": 0.5881537199020386,
1404
  "learning_rate": 0.0005397992444684295,
1405
+ "loss": 3.7565,
1406
  "step": 9400
1407
  },
1408
  {
1409
  "epoch": 1.0188679245283019,
1410
+ "grad_norm": 0.6585460901260376,
1411
  "learning_rate": 0.0005394754452239611,
1412
+ "loss": 3.7706,
1413
  "step": 9450
1414
  },
1415
  {
1416
  "epoch": 1.0242587601078168,
1417
+ "grad_norm": 0.5887331366539001,
1418
  "learning_rate": 0.0005391516459794927,
1419
+ "loss": 3.7821,
1420
  "step": 9500
1421
  },
1422
  {
1423
  "epoch": 1.0296495956873315,
1424
+ "grad_norm": 0.563365638256073,
1425
  "learning_rate": 0.0005388278467350242,
1426
+ "loss": 3.7902,
1427
  "step": 9550
1428
  },
1429
  {
1430
  "epoch": 1.0350404312668464,
1431
+ "grad_norm": 0.5882139801979065,
1432
  "learning_rate": 0.0005385040474905557,
1433
+ "loss": 3.7837,
1434
  "step": 9600
1435
  },
1436
  {
1437
  "epoch": 1.0404312668463611,
1438
+ "grad_norm": 0.5479844808578491,
1439
  "learning_rate": 0.0005381802482460874,
1440
+ "loss": 3.7941,
1441
  "step": 9650
1442
  },
1443
  {
1444
  "epoch": 1.045822102425876,
1445
+ "grad_norm": 0.530982494354248,
1446
  "learning_rate": 0.000537856449001619,
1447
+ "loss": 3.7935,
1448
  "step": 9700
1449
  },
1450
  {
1451
  "epoch": 1.0512129380053907,
1452
+ "grad_norm": 0.5939108729362488,
1453
  "learning_rate": 0.0005375326497571505,
1454
+ "loss": 3.7627,
1455
  "step": 9750
1456
  },
1457
  {
1458
  "epoch": 1.0566037735849056,
1459
+ "grad_norm": 0.6456679105758667,
1460
  "learning_rate": 0.000537208850512682,
1461
+ "loss": 3.7711,
1462
  "step": 9800
1463
  },
1464
  {
1465
  "epoch": 1.0619946091644206,
1466
+ "grad_norm": 0.5563388466835022,
1467
  "learning_rate": 0.0005368850512682137,
1468
  "loss": 3.7665,
1469
  "step": 9850
1470
  },
1471
  {
1472
  "epoch": 1.0673854447439353,
1473
+ "grad_norm": 0.5349035859107971,
1474
  "learning_rate": 0.0005365612520237453,
1475
+ "loss": 3.7807,
1476
  "step": 9900
1477
  },
1478
  {
1479
  "epoch": 1.0727762803234502,
1480
+ "grad_norm": 0.5764602422714233,
1481
  "learning_rate": 0.0005362374527792768,
1482
+ "loss": 3.7823,
1483
  "step": 9950
1484
  },
1485
  {
1486
  "epoch": 1.0781671159029649,
1487
+ "grad_norm": 0.5731992721557617,
1488
  "learning_rate": 0.0005359136535348084,
1489
+ "loss": 3.7831,
1490
  "step": 10000
1491
  },
1492
  {
1493
  "epoch": 1.0781671159029649,
1494
+ "eval_accuracy": 0.3440578441898031,
1495
+ "eval_loss": 3.7604787349700928,
1496
+ "eval_runtime": 183.6154,
1497
+ "eval_samples_per_second": 98.091,
1498
  "eval_steps_per_second": 6.132,
1499
  "step": 10000
1500
  }
checkpoint-10000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a817c7e11417e737801a5f84486a890a03f5efa8f82d52f39fcbcc53ecc8de5
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5446cb3556d5a7a5827b24898b2585968b6498d558252beee1814da016bbe785
3
  size 5304