c10 committed on
Commit
8c2441c
·
verified ·
1 Parent(s): c6d4908

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +455 -455
README.md CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.3944
21
 
22
  ## Model description
23
 
@@ -53,7 +53,7 @@ The following hyperparameters were used during training:
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
- | 0.3027 | 1.0 | 233 | 0.3944 |
57
 
58
 
59
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [yanolja/EEVE-Korean-Instruct-2.8B-v1.0](https://huggingface.co/yanolja/EEVE-Korean-Instruct-2.8B-v1.0) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.3960
21
 
22
  ## Model description
23
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.3026 | 1.0 | 233 | 0.3960 |
57
 
58
 
59
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
- "train_loss": 0.4337234101837797,
5
- "train_runtime": 302.365,
6
  "train_samples": 11140,
7
- "train_samples_per_second": 36.843,
8
- "train_steps_per_second": 0.771
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
+ "train_loss": 0.43378888076978694,
5
+ "train_runtime": 303.3238,
6
  "train_samples": 11140,
7
+ "train_samples_per_second": 36.726,
8
+ "train_steps_per_second": 0.768
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
- "train_loss": 0.4337234101837797,
5
- "train_runtime": 302.365,
6
  "train_samples": 11140,
7
- "train_samples_per_second": 36.843,
8
- "train_steps_per_second": 0.771
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.933383782157517e+16,
4
+ "train_loss": 0.43378888076978694,
5
+ "train_runtime": 303.3238,
6
  "train_samples": 11140,
7
+ "train_samples_per_second": 36.726,
8
+ "train_steps_per_second": 0.768
9
  }
trainer_state.json CHANGED
@@ -10,1651 +10,1651 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
- "grad_norm": 6.858874320983887,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
- "grad_norm": 7.190430164337158,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
- "grad_norm": 7.033503532409668,
28
  "learning_rate": 1.25e-06,
29
- "loss": 2.3588,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
- "grad_norm": 6.961109161376953,
35
  "learning_rate": 1.6666666666666667e-06,
36
- "loss": 2.3643,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
- "grad_norm": 6.919872760772705,
42
  "learning_rate": 2.0833333333333334e-06,
43
- "loss": 2.3491,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
- "grad_norm": 6.795478820800781,
49
  "learning_rate": 2.5e-06,
50
- "loss": 2.2798,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
- "grad_norm": 6.641313552856445,
56
  "learning_rate": 2.916666666666667e-06,
57
- "loss": 2.2737,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
- "grad_norm": 5.885939598083496,
63
  "learning_rate": 3.3333333333333333e-06,
64
- "loss": 2.144,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
- "grad_norm": 5.8235673904418945,
70
  "learning_rate": 3.7500000000000005e-06,
71
- "loss": 2.1174,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
- "grad_norm": 5.560035228729248,
77
  "learning_rate": 4.166666666666667e-06,
78
- "loss": 2.0906,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
- "grad_norm": 4.423542022705078,
84
  "learning_rate": 4.583333333333333e-06,
85
- "loss": 1.7747,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
- "grad_norm": 4.328518867492676,
91
  "learning_rate": 5e-06,
92
- "loss": 1.6776,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
- "grad_norm": 4.233107566833496,
98
  "learning_rate": 5.416666666666667e-06,
99
- "loss": 1.623,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
- "grad_norm": 3.4439170360565186,
105
  "learning_rate": 5.833333333333334e-06,
106
- "loss": 1.1542,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
- "grad_norm": 3.5954792499542236,
112
  "learning_rate": 6.25e-06,
113
- "loss": 1.0954,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
- "grad_norm": 3.38552188873291,
119
  "learning_rate": 6.666666666666667e-06,
120
- "loss": 1.0228,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
- "grad_norm": 2.881553888320923,
126
  "learning_rate": 7.083333333333335e-06,
127
- "loss": 0.904,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
- "grad_norm": 2.4994957447052,
133
  "learning_rate": 7.500000000000001e-06,
134
- "loss": 0.8495,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
- "grad_norm": 2.0014519691467285,
140
  "learning_rate": 7.916666666666667e-06,
141
- "loss": 0.5866,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
- "grad_norm": 1.3816783428192139,
147
  "learning_rate": 8.333333333333334e-06,
148
- "loss": 0.4787,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
- "grad_norm": 1.0646474361419678,
154
  "learning_rate": 8.750000000000001e-06,
155
- "loss": 0.4628,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
- "grad_norm": 0.8993541598320007,
161
  "learning_rate": 9.166666666666666e-06,
162
- "loss": 0.4228,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
- "grad_norm": 0.7342298626899719,
168
  "learning_rate": 9.583333333333335e-06,
169
- "loss": 0.415,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
- "grad_norm": 0.6955525279045105,
175
  "learning_rate": 1e-05,
176
- "loss": 0.4173,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
- "grad_norm": 0.669081449508667,
182
  "learning_rate": 9.999435142363484e-06,
183
- "loss": 0.3829,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
- "grad_norm": 0.7821445465087891,
189
  "learning_rate": 9.997740697079595e-06,
190
- "loss": 0.4204,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
- "grad_norm": 0.6644569635391235,
196
  "learning_rate": 9.994917046996472e-06,
197
- "loss": 0.3668,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
- "grad_norm": 0.6669219732284546,
203
  "learning_rate": 9.990964830098246e-06,
204
- "loss": 0.3782,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
- "grad_norm": 0.6587529182434082,
210
  "learning_rate": 9.985884939360873e-06,
211
- "loss": 0.4079,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
- "grad_norm": 0.5538156628608704,
217
  "learning_rate": 9.979678522550382e-06,
218
- "loss": 0.3829,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
- "grad_norm": 0.5344951152801514,
224
  "learning_rate": 9.972346981963546e-06,
225
- "loss": 0.3661,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
- "grad_norm": 0.5352959632873535,
231
  "learning_rate": 9.963891974111042e-06,
232
- "loss": 0.3744,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
- "grad_norm": 0.5779032707214355,
238
  "learning_rate": 9.95431540934317e-06,
239
- "loss": 0.3734,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
- "grad_norm": 0.614485502243042,
245
  "learning_rate": 9.943619451418225e-06,
246
- "loss": 0.3987,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
- "grad_norm": 0.5526390671730042,
252
  "learning_rate": 9.931806517013612e-06,
253
- "loss": 0.3577,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
- "grad_norm": 0.5361288189888,
259
  "learning_rate": 9.918879275179819e-06,
260
- "loss": 0.3412,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
- "grad_norm": 0.5739436745643616,
266
  "learning_rate": 9.904840646737346e-06,
267
- "loss": 0.3737,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
- "grad_norm": 0.506790816783905,
273
  "learning_rate": 9.889693803616793e-06,
274
- "loss": 0.3686,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
- "grad_norm": 0.5421477556228638,
280
  "learning_rate": 9.873442168142158e-06,
281
- "loss": 0.3615,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
- "grad_norm": 0.5257068276405334,
287
  "learning_rate": 9.856089412257605e-06,
288
- "loss": 0.3349,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
- "grad_norm": 0.6092216968536377,
294
  "learning_rate": 9.837639456697802e-06,
295
- "loss": 0.382,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
- "grad_norm": 0.4873773753643036,
301
  "learning_rate": 9.818096470102067e-06,
302
  "loss": 0.3345,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
- "grad_norm": 0.4960627555847168,
308
  "learning_rate": 9.797464868072489e-06,
309
- "loss": 0.3521,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
- "grad_norm": 0.5022900104522705,
315
  "learning_rate": 9.775749312176249e-06,
316
- "loss": 0.3293,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
- "grad_norm": 0.47045403718948364,
322
  "learning_rate": 9.752954708892379e-06,
323
- "loss": 0.3392,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
- "grad_norm": 0.5006973147392273,
329
  "learning_rate": 9.729086208503174e-06,
330
- "loss": 0.3409,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
- "grad_norm": 0.4814106523990631,
336
  "learning_rate": 9.704149203930522e-06,
337
- "loss": 0.364,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
- "grad_norm": 0.5069060921669006,
343
  "learning_rate": 9.67814932951741e-06,
344
- "loss": 0.372,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
- "grad_norm": 0.5332397818565369,
350
  "learning_rate": 9.651092459754879e-06,
351
- "loss": 0.373,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
- "grad_norm": 0.46188411116600037,
357
  "learning_rate": 9.622984707954732e-06,
358
- "loss": 0.3478,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
- "grad_norm": 0.46651139855384827,
364
  "learning_rate": 9.593832424868271e-06,
365
- "loss": 0.3317,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
- "grad_norm": 0.5351216197013855,
371
  "learning_rate": 9.563642197251382e-06,
372
- "loss": 0.3714,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
- "grad_norm": 0.4725414514541626,
378
  "learning_rate": 9.532420846376316e-06,
379
- "loss": 0.3278,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
- "grad_norm": 0.47782063484191895,
385
  "learning_rate": 9.500175426490455e-06,
386
- "loss": 0.3166,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
- "grad_norm": 0.48254090547561646,
392
  "learning_rate": 9.466913223222467e-06,
393
- "loss": 0.3494,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
- "grad_norm": 0.46600666642189026,
399
  "learning_rate": 9.432641751936162e-06,
400
- "loss": 0.3297,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
- "grad_norm": 0.5330336093902588,
406
  "learning_rate": 9.397368756032445e-06,
407
- "loss": 0.343,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
- "grad_norm": 0.5342456698417664,
413
  "learning_rate": 9.361102205199762e-06,
414
  "loss": 0.3458,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
- "grad_norm": 0.48358458280563354,
420
  "learning_rate": 9.32385029361338e-06,
421
- "loss": 0.324,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
- "grad_norm": 0.47148555517196655,
427
  "learning_rate": 9.285621438083997e-06,
428
- "loss": 0.318,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
- "grad_norm": 0.5061048865318298,
434
  "learning_rate": 9.246424276156008e-06,
435
- "loss": 0.3294,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
- "grad_norm": 0.47446560859680176,
441
  "learning_rate": 9.206267664155906e-06,
442
- "loss": 0.3275,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
- "grad_norm": 0.5104511976242065,
448
  "learning_rate": 9.165160675191272e-06,
449
  "loss": 0.3323,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
- "grad_norm": 0.5101497769355774,
455
  "learning_rate": 9.123112597100759e-06,
456
- "loss": 0.3369,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
- "grad_norm": 0.5170594453811646,
462
  "learning_rate": 9.080132930355567e-06,
463
- "loss": 0.3333,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
- "grad_norm": 0.5066542029380798,
469
  "learning_rate": 9.03623138591289e-06,
470
- "loss": 0.3078,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
- "grad_norm": 0.47962620854377747,
476
  "learning_rate": 8.99141788302178e-06,
477
- "loss": 0.3069,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
- "grad_norm": 0.41906121373176575,
483
  "learning_rate": 8.94570254698197e-06,
484
- "loss": 0.2995,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
- "grad_norm": 0.46970024704933167,
490
  "learning_rate": 8.899095706856122e-06,
491
- "loss": 0.3224,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
- "grad_norm": 0.6495445370674133,
497
  "learning_rate": 8.851607893136065e-06,
498
- "loss": 0.3528,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
- "grad_norm": 0.559937596321106,
504
  "learning_rate": 8.803249835363486e-06,
505
- "loss": 0.297,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
- "grad_norm": 0.5208443999290466,
511
  "learning_rate": 8.754032459705672e-06,
512
- "loss": 0.3465,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
- "grad_norm": 0.4508489668369293,
518
  "learning_rate": 8.703966886486819e-06,
519
  "loss": 0.3296,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
- "grad_norm": 0.4906875789165497,
525
  "learning_rate": 8.65306442767547e-06,
526
- "loss": 0.3207,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
- "grad_norm": 0.49397507309913635,
532
  "learning_rate": 8.601336584328659e-06,
533
- "loss": 0.3363,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
- "grad_norm": 0.4641498923301697,
539
  "learning_rate": 8.548795043993316e-06,
540
- "loss": 0.2956,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
- "grad_norm": 0.478019118309021,
546
  "learning_rate": 8.495451678065563e-06,
547
- "loss": 0.3334,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
- "grad_norm": 0.6079752445220947,
553
  "learning_rate": 8.441318539108433e-06,
554
- "loss": 0.3451,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
- "grad_norm": 0.46105825901031494,
560
  "learning_rate": 8.386407858128707e-06,
561
- "loss": 0.3378,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
- "grad_norm": 0.46352678537368774,
567
  "learning_rate": 8.330732041813367e-06,
568
- "loss": 0.3141,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
- "grad_norm": 0.5445449352264404,
574
  "learning_rate": 8.274303669726427e-06,
575
- "loss": 0.3447,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
- "grad_norm": 0.47890031337738037,
581
  "learning_rate": 8.217135491466636e-06,
582
- "loss": 0.3472,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
- "grad_norm": 0.5047430396080017,
588
  "learning_rate": 8.15924042378682e-06,
589
  "loss": 0.3049,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
- "grad_norm": 0.423581600189209,
595
  "learning_rate": 8.100631547675417e-06,
596
- "loss": 0.304,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
- "grad_norm": 0.4484553039073944,
602
  "learning_rate": 8.041322105400923e-06,
603
- "loss": 0.3354,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
- "grad_norm": 0.48292070627212524,
609
  "learning_rate": 7.981325497519892e-06,
610
- "loss": 0.3192,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
- "grad_norm": 0.46491944789886475,
616
  "learning_rate": 7.920655279849173e-06,
617
- "loss": 0.3224,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
- "grad_norm": 0.43498605489730835,
623
  "learning_rate": 7.859325160403073e-06,
624
- "loss": 0.34,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
- "grad_norm": 0.4773942232131958,
630
  "learning_rate": 7.797348996296116e-06,
631
- "loss": 0.3203,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
- "grad_norm": 0.48013803362846375,
637
  "learning_rate": 7.734740790612137e-06,
638
- "loss": 0.3408,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
- "grad_norm": 0.4329844117164612,
644
  "learning_rate": 7.671514689240366e-06,
645
- "loss": 0.3088,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
- "grad_norm": 0.45222362875938416,
651
  "learning_rate": 7.607684977679284e-06,
652
- "loss": 0.2995,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
- "grad_norm": 0.4020434021949768,
658
  "learning_rate": 7.543266077808893e-06,
659
- "loss": 0.2854,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
- "grad_norm": 0.46024152636528015,
665
  "learning_rate": 7.478272544632204e-06,
666
  "loss": 0.32,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
- "grad_norm": 0.44250139594078064,
672
  "learning_rate": 7.412719062986632e-06,
673
- "loss": 0.3385,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
- "grad_norm": 0.43623730540275574,
679
  "learning_rate": 7.3466204442260605e-06,
680
- "loss": 0.2766,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
- "grad_norm": 0.5541526675224304,
686
  "learning_rate": 7.279991622874319e-06,
687
- "loss": 0.3426,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
- "grad_norm": 0.44901198148727417,
693
  "learning_rate": 7.212847653250828e-06,
694
- "loss": 0.3006,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
- "grad_norm": 0.4395318329334259,
700
  "learning_rate": 7.145203706069183e-06,
701
  "loss": 0.2864,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
- "grad_norm": 0.457040935754776,
707
  "learning_rate": 7.0770750650094335e-06,
708
- "loss": 0.3379,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
- "grad_norm": 0.4515530467033386,
714
  "learning_rate": 7.008477123264849e-06,
715
- "loss": 0.3446,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
- "grad_norm": 0.4430055022239685,
721
  "learning_rate": 6.939425380063924e-06,
722
- "loss": 0.3149,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
- "grad_norm": 0.43485042452812195,
728
  "learning_rate": 6.869935437168449e-06,
729
  "loss": 0.3228,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
- "grad_norm": 0.4601809084415436,
735
  "learning_rate": 6.800022995348381e-06,
736
- "loss": 0.3181,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
- "grad_norm": 0.48045915365219116,
742
  "learning_rate": 6.729703850834381e-06,
743
- "loss": 0.3249,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
- "grad_norm": 0.40369826555252075,
749
  "learning_rate": 6.65899389174876e-06,
750
- "loss": 0.2829,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
- "grad_norm": 0.4204097390174866,
756
  "learning_rate": 6.587909094515663e-06,
757
- "loss": 0.2964,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
- "grad_norm": 0.39988335967063904,
763
  "learning_rate": 6.5164655202513135e-06,
764
- "loss": 0.2794,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
- "grad_norm": 0.5971567630767822,
770
  "learning_rate": 6.444679311135112e-06,
771
- "loss": 0.3404,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
- "grad_norm": 0.46785640716552734,
777
  "learning_rate": 6.372566686762427e-06,
778
  "loss": 0.3413,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
- "grad_norm": 0.4731470048427582,
784
  "learning_rate": 6.300143940479881e-06,
785
- "loss": 0.3255,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
- "grad_norm": 0.41854605078697205,
791
  "learning_rate": 6.227427435703997e-06,
792
- "loss": 0.2891,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
- "grad_norm": 0.43789684772491455,
798
  "learning_rate": 6.154433602223979e-06,
799
  "loss": 0.2692,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
- "grad_norm": 0.4322194755077362,
805
  "learning_rate": 6.0811789324895365e-06,
806
- "loss": 0.3002,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
- "grad_norm": 0.4425448775291443,
812
  "learning_rate": 6.0076799778845105e-06,
813
- "loss": 0.3067,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
- "grad_norm": 0.4232672154903412,
819
  "learning_rate": 5.933953344987215e-06,
820
- "loss": 0.312,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
- "grad_norm": 0.46017172932624817,
826
  "learning_rate": 5.860015691818292e-06,
827
- "loss": 0.3317,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
- "grad_norm": 0.44368740916252136,
833
  "learning_rate": 5.78588372407695e-06,
834
- "loss": 0.2917,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
- "grad_norm": 0.4277245104312897,
840
  "learning_rate": 5.711574191366427e-06,
841
- "loss": 0.3069,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
- "grad_norm": 0.4424058794975281,
847
  "learning_rate": 5.637103883409525e-06,
848
- "loss": 0.3045,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
- "grad_norm": 0.4672929644584656,
854
  "learning_rate": 5.562489626255104e-06,
855
- "loss": 0.3125,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
- "grad_norm": 0.438452810049057,
861
  "learning_rate": 5.487748278476342e-06,
862
- "loss": 0.3135,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
- "grad_norm": 0.3892000615596771,
868
  "learning_rate": 5.412896727361663e-06,
869
- "loss": 0.2516,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
- "grad_norm": 0.43070968985557556,
875
  "learning_rate": 5.337951885099167e-06,
876
- "loss": 0.2903,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
- "grad_norm": 0.4564039409160614,
882
  "learning_rate": 5.262930684955439e-06,
883
- "loss": 0.2932,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
- "grad_norm": 0.4456172585487366,
889
  "learning_rate": 5.187850077449604e-06,
890
- "loss": 0.3176,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
- "grad_norm": 0.4061397314071655,
896
  "learning_rate": 5.112727026523461e-06,
897
- "loss": 0.2757,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
- "grad_norm": 0.43948251008987427,
903
  "learning_rate": 5.03757850570861e-06,
904
- "loss": 0.3079,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
- "grad_norm": 0.41549184918403625,
910
  "learning_rate": 4.9624214942913916e-06,
911
- "loss": 0.3015,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
- "grad_norm": 0.41364341974258423,
917
  "learning_rate": 4.88727297347654e-06,
918
- "loss": 0.3065,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
- "grad_norm": 0.41310611367225647,
924
  "learning_rate": 4.8121499225503974e-06,
925
- "loss": 0.2883,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
- "grad_norm": 0.4096148908138275,
931
  "learning_rate": 4.737069315044562e-06,
932
- "loss": 0.2955,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
- "grad_norm": 0.46152743697166443,
938
  "learning_rate": 4.662048114900837e-06,
939
- "loss": 0.3237,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
- "grad_norm": 0.458767831325531,
945
  "learning_rate": 4.587103272638339e-06,
946
- "loss": 0.3269,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
- "grad_norm": 0.4218490719795227,
952
  "learning_rate": 4.512251721523659e-06,
953
- "loss": 0.2685,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
- "grad_norm": 0.45311009883880615,
959
  "learning_rate": 4.437510373744897e-06,
960
- "loss": 0.3089,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
- "grad_norm": 0.4288144111633301,
966
  "learning_rate": 4.362896116590475e-06,
967
- "loss": 0.3043,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
- "grad_norm": 0.49440011382102966,
973
  "learning_rate": 4.2884258086335755e-06,
974
- "loss": 0.3233,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
- "grad_norm": 0.41329360008239746,
980
  "learning_rate": 4.214116275923051e-06,
981
- "loss": 0.2735,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
- "grad_norm": 0.4687763452529907,
987
  "learning_rate": 4.1399843081817085e-06,
988
- "loss": 0.3124,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
- "grad_norm": 0.4186963438987732,
994
  "learning_rate": 4.066046655012786e-06,
995
- "loss": 0.3046,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
- "grad_norm": 0.4234992563724518,
1001
  "learning_rate": 3.992320022115492e-06,
1002
- "loss": 0.3161,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
- "grad_norm": 0.4364720284938812,
1008
  "learning_rate": 3.918821067510464e-06,
1009
- "loss": 0.2929,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
- "grad_norm": 0.45965394377708435,
1015
  "learning_rate": 3.845566397776022e-06,
1016
- "loss": 0.3302,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
- "grad_norm": 0.4237099289894104,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
- "loss": 0.3058,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
- "grad_norm": 0.45285725593566895,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
- "loss": 0.2914,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
- "grad_norm": 0.42853862047195435,
1036
  "learning_rate": 3.627433313237576e-06,
1037
- "loss": 0.2959,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
- "grad_norm": 0.4184584319591522,
1043
  "learning_rate": 3.555320688864889e-06,
1044
- "loss": 0.291,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
- "grad_norm": 0.4067162573337555,
1050
  "learning_rate": 3.483534479748688e-06,
1051
- "loss": 0.2916,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
- "grad_norm": 0.4165968894958496,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
- "loss": 0.2836,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
- "grad_norm": 0.3848138451576233,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
- "loss": 0.2589,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
- "grad_norm": 0.40843644738197327,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
  "loss": 0.2864,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
- "grad_norm": 0.3916611969470978,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
- "loss": 0.2829,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
- "grad_norm": 0.40662676095962524,
1085
  "learning_rate": 3.130064562831553e-06,
1086
- "loss": 0.276,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
- "grad_norm": 0.4375065565109253,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
- "loss": 0.298,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
- "grad_norm": 0.41338327527046204,
1099
  "learning_rate": 2.991522876735154e-06,
1100
- "loss": 0.2864,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
- "grad_norm": 0.4776081442832947,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
- "loss": 0.3015,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
- "grad_norm": 0.4213579297065735,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
- "loss": 0.2831,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
- "grad_norm": 0.44098085165023804,
1120
  "learning_rate": 2.787152346749173e-06,
1121
- "loss": 0.3002,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
- "grad_norm": 0.4195074737071991,
1127
  "learning_rate": 2.720008377125682e-06,
1128
- "loss": 0.2817,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
- "grad_norm": 0.4369998574256897,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
- "loss": 0.3246,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
- "grad_norm": 0.43651026487350464,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
- "loss": 0.2949,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
- "grad_norm": 0.42907676100730896,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
- "loss": 0.2984,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
- "grad_norm": 0.4614652395248413,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
- "loss": 0.2932,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
- "grad_norm": 0.43435779213905334,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
- "loss": 0.3116,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
- "grad_norm": 0.4169783294200897,
1169
  "learning_rate": 2.328485310759635e-06,
1170
- "loss": 0.2868,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
- "grad_norm": 0.44295355677604675,
1176
  "learning_rate": 2.265259209387867e-06,
1177
- "loss": 0.3053,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
- "grad_norm": 0.4090026319026947,
1183
  "learning_rate": 2.202651003703885e-06,
1184
- "loss": 0.2807,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
- "grad_norm": 0.3898420035839081,
1190
  "learning_rate": 2.140674839596931e-06,
1191
- "loss": 0.2658,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
- "grad_norm": 0.4384562075138092,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
- "loss": 0.3157,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
- "grad_norm": 0.41513383388519287,
1204
  "learning_rate": 2.01867450248011e-06,
1205
- "loss": 0.3028,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
- "grad_norm": 0.39594322443008423,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
- "loss": 0.2819,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
- "grad_norm": 0.4369785189628601,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
- "loss": 0.2913,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
- "grad_norm": 0.47406381368637085,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
- "loss": 0.3417,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
- "grad_norm": 0.47587499022483826,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
- "loss": 0.3476,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
- "grad_norm": 0.4360049068927765,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
- "loss": 0.318,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
- "grad_norm": 0.4094788134098053,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
- "loss": 0.3051,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
- "grad_norm": 0.42082950472831726,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
  "loss": 0.274,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
- "grad_norm": 0.4242251217365265,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
- "loss": 0.314,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
- "grad_norm": 0.41716471314430237,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
- "loss": 0.2798,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
- "grad_norm": 0.41137608885765076,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
- "loss": 0.2758,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
- "grad_norm": 0.41746678948402405,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
- "loss": 0.2842,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
- "grad_norm": 0.41145697236061096,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
- "loss": 0.2861,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
- "grad_norm": 0.42715349793434143,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
- "loss": 0.329,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
- "grad_norm": 0.4114415943622589,
1302
  "learning_rate": 1.245967540294329e-06,
1303
  "loss": 0.2851,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
- "grad_norm": 0.3830616772174835,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
  "loss": 0.2759,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
- "grad_norm": 0.4197385609149933,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
- "loss": 0.276,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
- "grad_norm": 0.42288023233413696,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
- "loss": 0.2989,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
- "grad_norm": 0.44527870416641235,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
- "loss": 0.3076,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
- "grad_norm": 0.393606960773468,
1337
  "learning_rate": 1.00858211697822e-06,
1338
- "loss": 0.2663,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
- "grad_norm": 0.4073619246482849,
1344
  "learning_rate": 9.637686140871121e-07,
1345
- "loss": 0.2906,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
- "grad_norm": 0.39690256118774414,
1351
  "learning_rate": 9.198670696444339e-07,
1352
- "loss": 0.2814,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
- "grad_norm": 0.5071620345115662,
1358
  "learning_rate": 8.768874028992431e-07,
1359
- "loss": 0.319,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
- "grad_norm": 0.4257042706012726,
1365
  "learning_rate": 8.348393248087289e-07,
1366
- "loss": 0.319,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
- "grad_norm": 0.4489593803882599,
1372
  "learning_rate": 7.937323358440935e-07,
1373
- "loss": 0.3408,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
- "grad_norm": 0.43908488750457764,
1379
  "learning_rate": 7.535757238439939e-07,
1380
- "loss": 0.3121,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
- "grad_norm": 0.3910050690174103,
1386
  "learning_rate": 7.143785619160026e-07,
1387
- "loss": 0.2791,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
- "grad_norm": 0.39390695095062256,
1393
  "learning_rate": 6.761497063866207e-07,
1394
- "loss": 0.2845,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
- "grad_norm": 0.4456288516521454,
1400
  "learning_rate": 6.388977948002406e-07,
1401
  "loss": 0.3031,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
- "grad_norm": 0.4015798568725586,
1407
  "learning_rate": 6.026312439675553e-07,
1408
- "loss": 0.2753,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
- "grad_norm": 0.41456156969070435,
1414
  "learning_rate": 5.673582480638395e-07,
1415
- "loss": 0.3022,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
- "grad_norm": 0.4418617784976959,
1421
  "learning_rate": 5.330867767775333e-07,
1422
- "loss": 0.3403,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
- "grad_norm": 0.41422247886657715,
1428
  "learning_rate": 4.998245735095459e-07,
1429
- "loss": 0.3045,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
- "grad_norm": 0.4306741952896118,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
- "loss": 0.2879,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
- "grad_norm": 0.4205842614173889,
1442
  "learning_rate": 4.363578027486187e-07,
1443
- "loss": 0.3026,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
- "grad_norm": 0.4199366271495819,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
- "loss": 0.2836,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
- "grad_norm": 0.4141881465911865,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
- "loss": 0.2945,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
- "grad_norm": 0.40555790066719055,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
- "loss": 0.3027,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
- "grad_norm": 0.4243229329586029,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
  "loss": 0.2932,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
- "grad_norm": 0.4138815402984619,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
- "loss": 0.2976,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
- "grad_norm": 0.4239174425601959,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
- "loss": 0.2749,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
- "grad_norm": 0.384939968585968,
1491
  "learning_rate": 2.470452911076227e-07,
1492
- "loss": 0.2764,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
- "grad_norm": 0.4070870280265808,
1498
  "learning_rate": 2.242506878237538e-07,
1499
- "loss": 0.295,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
- "grad_norm": 0.3996812701225281,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
- "loss": 0.2829,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
- "grad_norm": 0.4206875264644623,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
- "loss": 0.2818,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
- "grad_norm": 0.43261945247650146,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
- "loss": 0.301,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
- "grad_norm": 0.410249799489975,
1526
  "learning_rate": 1.439105877423963e-07,
1527
- "loss": 0.2964,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
- "grad_norm": 0.4331822693347931,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
- "loss": 0.2978,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
- "grad_norm": 0.4318710267543793,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
- "loss": 0.2959,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
- "grad_norm": 0.43562668561935425,
1547
  "learning_rate": 9.51593532626538e-08,
1548
- "loss": 0.2807,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
- "grad_norm": 0.416435569524765,
1554
  "learning_rate": 8.11207248201834e-08,
1555
- "loss": 0.2914,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
- "grad_norm": 0.44661349058151245,
1561
  "learning_rate": 6.819348298638839e-08,
1562
- "loss": 0.3264,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
- "grad_norm": 0.42408230900764465,
1568
  "learning_rate": 5.638054858177644e-08,
1569
- "loss": 0.2793,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
- "grad_norm": 0.43619269132614136,
1575
  "learning_rate": 4.568459065683206e-08,
1576
- "loss": 0.2876,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
- "grad_norm": 0.4247714877128601,
1582
  "learning_rate": 3.610802588895845e-08,
1583
- "loss": 0.3178,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
- "grad_norm": 0.3977547883987427,
1589
  "learning_rate": 2.765301803645426e-08,
1590
  "loss": 0.2593,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
- "grad_norm": 0.4386114776134491,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
- "loss": 0.3098,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
- "grad_norm": 0.4722432494163513,
1603
  "learning_rate": 1.411506063912882e-08,
1604
- "loss": 0.323,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
- "grad_norm": 0.4180794060230255,
1610
  "learning_rate": 9.035169901754902e-09,
1611
- "loss": 0.2839,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
- "grad_norm": 0.4218989610671997,
1617
  "learning_rate": 5.082953003528457e-09,
1618
- "loss": 0.2761,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
- "grad_norm": 0.3851815462112427,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
- "loss": 0.2794,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
- "grad_norm": 0.4388996362686157,
1631
  "learning_rate": 5.648576365169245e-10,
1632
- "loss": 0.308,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
- "grad_norm": 0.40494707226753235,
1638
  "learning_rate": 0.0,
1639
- "loss": 0.3027,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
- "eval_loss": 0.39438411593437195,
1645
- "eval_runtime": 1.5211,
1646
- "eval_samples_per_second": 151.203,
1647
- "eval_steps_per_second": 6.574,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
- "train_loss": 0.4337234101837797,
1655
- "train_runtime": 302.365,
1656
- "train_samples_per_second": 36.843,
1657
- "train_steps_per_second": 0.771
1658
  }
1659
  ],
1660
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004291845493562232,
13
+ "grad_norm": 6.85894775390625,
14
  "learning_rate": 4.1666666666666667e-07,
15
  "loss": 2.3401,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.008583690987124463,
20
+ "grad_norm": 7.190288543701172,
21
  "learning_rate": 8.333333333333333e-07,
22
  "loss": 2.3774,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.012875536480686695,
27
+ "grad_norm": 6.9984612464904785,
28
  "learning_rate": 1.25e-06,
29
+ "loss": 2.3585,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.017167381974248927,
34
+ "grad_norm": 6.962902069091797,
35
  "learning_rate": 1.6666666666666667e-06,
36
+ "loss": 2.3644,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.02145922746781116,
41
+ "grad_norm": 6.923016548156738,
42
  "learning_rate": 2.0833333333333334e-06,
43
+ "loss": 2.3494,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.02575107296137339,
48
+ "grad_norm": 6.84339714050293,
49
  "learning_rate": 2.5e-06,
50
+ "loss": 2.2799,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.030042918454935622,
55
+ "grad_norm": 6.630039691925049,
56
  "learning_rate": 2.916666666666667e-06,
57
+ "loss": 2.2745,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.034334763948497854,
62
+ "grad_norm": 5.8876824378967285,
63
  "learning_rate": 3.3333333333333333e-06,
64
+ "loss": 2.1438,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03862660944206009,
69
+ "grad_norm": 5.809715270996094,
70
  "learning_rate": 3.7500000000000005e-06,
71
+ "loss": 2.1167,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.04291845493562232,
76
+ "grad_norm": 5.581740856170654,
77
  "learning_rate": 4.166666666666667e-06,
78
+ "loss": 2.0908,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04721030042918455,
83
+ "grad_norm": 4.400636672973633,
84
  "learning_rate": 4.583333333333333e-06,
85
+ "loss": 1.7756,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.05150214592274678,
90
+ "grad_norm": 4.300999641418457,
91
  "learning_rate": 5e-06,
92
+ "loss": 1.6775,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.055793991416309016,
97
+ "grad_norm": 4.224782466888428,
98
  "learning_rate": 5.416666666666667e-06,
99
+ "loss": 1.6229,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.060085836909871244,
104
+ "grad_norm": 3.458139181137085,
105
  "learning_rate": 5.833333333333334e-06,
106
+ "loss": 1.1551,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.06437768240343347,
111
+ "grad_norm": 3.579143762588501,
112
  "learning_rate": 6.25e-06,
113
+ "loss": 1.0968,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06866952789699571,
118
+ "grad_norm": 3.3714845180511475,
119
  "learning_rate": 6.666666666666667e-06,
120
+ "loss": 1.0247,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.07296137339055794,
125
+ "grad_norm": 2.885876417160034,
126
  "learning_rate": 7.083333333333335e-06,
127
+ "loss": 0.9058,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07725321888412018,
132
+ "grad_norm": 2.5267300605773926,
133
  "learning_rate": 7.500000000000001e-06,
134
+ "loss": 0.8515,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0815450643776824,
139
+ "grad_norm": 2.0233078002929688,
140
  "learning_rate": 7.916666666666667e-06,
141
+ "loss": 0.5884,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.08583690987124463,
146
+ "grad_norm": 1.3672244548797607,
147
  "learning_rate": 8.333333333333334e-06,
148
+ "loss": 0.4798,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.09012875536480687,
153
+ "grad_norm": 1.0522749423980713,
154
  "learning_rate": 8.750000000000001e-06,
155
+ "loss": 0.4636,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0944206008583691,
160
+ "grad_norm": 0.857843816280365,
161
  "learning_rate": 9.166666666666666e-06,
162
+ "loss": 0.4232,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09871244635193133,
167
+ "grad_norm": 0.7717468738555908,
168
  "learning_rate": 9.583333333333335e-06,
169
+ "loss": 0.4153,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.10300429184549356,
174
+ "grad_norm": 0.7489557862281799,
175
  "learning_rate": 1e-05,
176
+ "loss": 0.4172,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.1072961373390558,
181
+ "grad_norm": 0.6532961130142212,
182
  "learning_rate": 9.999435142363484e-06,
183
+ "loss": 0.3827,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.11158798283261803,
188
+ "grad_norm": 0.8831511735916138,
189
  "learning_rate": 9.997740697079595e-06,
190
+ "loss": 0.4201,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.11587982832618025,
195
+ "grad_norm": 0.6500473618507385,
196
  "learning_rate": 9.994917046996472e-06,
197
+ "loss": 0.3676,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.12017167381974249,
202
+ "grad_norm": 0.6477255821228027,
203
  "learning_rate": 9.990964830098246e-06,
204
+ "loss": 0.3787,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.12446351931330472,
209
+ "grad_norm": 0.6460564136505127,
210
  "learning_rate": 9.985884939360873e-06,
211
+ "loss": 0.408,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.12875536480686695,
216
+ "grad_norm": 0.6947969198226929,
217
  "learning_rate": 9.979678522550382e-06,
218
+ "loss": 0.3834,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.13304721030042918,
223
+ "grad_norm": 0.648025393486023,
224
  "learning_rate": 9.972346981963546e-06,
225
+ "loss": 0.3656,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.13733905579399142,
230
+ "grad_norm": 0.528627336025238,
231
  "learning_rate": 9.963891974111042e-06,
232
+ "loss": 0.3739,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.14163090128755365,
237
+ "grad_norm": 0.5402371287345886,
238
  "learning_rate": 9.95431540934317e-06,
239
+ "loss": 0.3732,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1459227467811159,
244
+ "grad_norm": 0.5850553512573242,
245
  "learning_rate": 9.943619451418225e-06,
246
+ "loss": 0.3984,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.15021459227467812,
251
+ "grad_norm": 0.562908411026001,
252
  "learning_rate": 9.931806517013612e-06,
253
+ "loss": 0.3582,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.15450643776824036,
258
+ "grad_norm": 0.5080158710479736,
259
  "learning_rate": 9.918879275179819e-06,
260
+ "loss": 0.3419,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.15879828326180256,
265
+ "grad_norm": 0.5503780841827393,
266
  "learning_rate": 9.904840646737346e-06,
267
+ "loss": 0.3729,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.1630901287553648,
272
+ "grad_norm": 0.48786163330078125,
273
  "learning_rate": 9.889693803616793e-06,
274
+ "loss": 0.3682,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.16738197424892703,
279
+ "grad_norm": 0.4980204105377197,
280
  "learning_rate": 9.873442168142158e-06,
281
+ "loss": 0.3612,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.17167381974248927,
286
+ "grad_norm": 0.5012345314025879,
287
  "learning_rate": 9.856089412257605e-06,
288
+ "loss": 0.336,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.1759656652360515,
293
+ "grad_norm": 0.5027839541435242,
294
  "learning_rate": 9.837639456697802e-06,
295
+ "loss": 0.3825,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.18025751072961374,
300
+ "grad_norm": 0.47596386075019836,
301
  "learning_rate": 9.818096470102067e-06,
302
  "loss": 0.3345,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.18454935622317598,
307
+ "grad_norm": 0.6094445586204529,
308
  "learning_rate": 9.797464868072489e-06,
309
+ "loss": 0.3522,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.1888412017167382,
314
+ "grad_norm": 0.4871525764465332,
315
  "learning_rate": 9.775749312176249e-06,
316
+ "loss": 0.3297,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.19313304721030042,
321
+ "grad_norm": 0.6575652360916138,
322
  "learning_rate": 9.752954708892379e-06,
323
+ "loss": 0.3387,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.19742489270386265,
328
+ "grad_norm": 0.6938695311546326,
329
  "learning_rate": 9.729086208503174e-06,
330
+ "loss": 0.3417,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.2017167381974249,
335
+ "grad_norm": 0.7864469289779663,
336
  "learning_rate": 9.704149203930522e-06,
337
+ "loss": 0.3642,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.20600858369098712,
342
+ "grad_norm": 0.6004130244255066,
343
  "learning_rate": 9.67814932951741e-06,
344
+ "loss": 0.3717,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.21030042918454936,
349
+ "grad_norm": 0.5369061231613159,
350
  "learning_rate": 9.651092459754879e-06,
351
+ "loss": 0.3727,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.2145922746781116,
356
+ "grad_norm": 0.6097694039344788,
357
  "learning_rate": 9.622984707954732e-06,
358
+ "loss": 0.3487,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.21888412017167383,
363
+ "grad_norm": 0.4255381226539612,
364
  "learning_rate": 9.593832424868271e-06,
365
+ "loss": 0.3326,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.22317596566523606,
370
+ "grad_norm": 0.49553343653678894,
371
  "learning_rate": 9.563642197251382e-06,
372
+ "loss": 0.3706,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.22746781115879827,
377
+ "grad_norm": 0.6784220933914185,
378
  "learning_rate": 9.532420846376316e-06,
379
+ "loss": 0.3283,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.2317596566523605,
384
+ "grad_norm": 0.4617113173007965,
385
  "learning_rate": 9.500175426490455e-06,
386
+ "loss": 0.3175,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.23605150214592274,
391
+ "grad_norm": 0.5140352845191956,
392
  "learning_rate": 9.466913223222467e-06,
393
+ "loss": 0.3491,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.24034334763948498,
398
+ "grad_norm": 0.5974839329719543,
399
  "learning_rate": 9.432641751936162e-06,
400
+ "loss": 0.328,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.2446351931330472,
405
+ "grad_norm": 0.5429268479347229,
406
  "learning_rate": 9.397368756032445e-06,
407
+ "loss": 0.3438,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.24892703862660945,
412
+ "grad_norm": 0.5929590463638306,
413
  "learning_rate": 9.361102205199762e-06,
414
  "loss": 0.3458,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.2532188841201717,
419
+ "grad_norm": 0.5589198470115662,
420
  "learning_rate": 9.32385029361338e-06,
421
+ "loss": 0.3246,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.2575107296137339,
426
+ "grad_norm": 0.5041264295578003,
427
  "learning_rate": 9.285621438083997e-06,
428
+ "loss": 0.3191,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.26180257510729615,
433
+ "grad_norm": 0.5429166555404663,
434
  "learning_rate": 9.246424276156008e-06,
435
+ "loss": 0.3291,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.26609442060085836,
440
+ "grad_norm": 0.4658041298389435,
441
  "learning_rate": 9.206267664155906e-06,
442
+ "loss": 0.3282,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2703862660944206,
447
+ "grad_norm": 0.5400846004486084,
448
  "learning_rate": 9.165160675191272e-06,
449
  "loss": 0.3323,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.27467811158798283,
454
+ "grad_norm": 0.8674777746200562,
455
  "learning_rate": 9.123112597100759e-06,
456
+ "loss": 0.3367,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.27896995708154504,
461
+ "grad_norm": 0.5036664009094238,
462
  "learning_rate": 9.080132930355567e-06,
463
+ "loss": 0.3335,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2832618025751073,
468
+ "grad_norm": 0.47038599848747253,
469
  "learning_rate": 9.03623138591289e-06,
470
+ "loss": 0.3085,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2875536480686695,
475
+ "grad_norm": 0.4432419538497925,
476
  "learning_rate": 8.99141788302178e-06,
477
+ "loss": 0.3075,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2918454935622318,
482
+ "grad_norm": 0.4220591187477112,
483
  "learning_rate": 8.94570254698197e-06,
484
+ "loss": 0.2999,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.296137339055794,
489
+ "grad_norm": 0.5135315656661987,
490
  "learning_rate": 8.899095706856122e-06,
491
+ "loss": 0.322,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.30042918454935624,
496
+ "grad_norm": 0.5353755354881287,
497
  "learning_rate": 8.851607893136065e-06,
498
+ "loss": 0.3527,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.30472103004291845,
503
+ "grad_norm": 0.44178667664527893,
504
  "learning_rate": 8.803249835363486e-06,
505
+ "loss": 0.2963,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.3090128755364807,
510
+ "grad_norm": 0.4627586007118225,
511
  "learning_rate": 8.754032459705672e-06,
512
+ "loss": 0.3453,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.3133047210300429,
517
+ "grad_norm": 0.47342514991760254,
518
  "learning_rate": 8.703966886486819e-06,
519
  "loss": 0.3296,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.31759656652360513,
524
+ "grad_norm": 0.4531494081020355,
525
  "learning_rate": 8.65306442767547e-06,
526
+ "loss": 0.3205,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.3218884120171674,
531
+ "grad_norm": 0.4715803265571594,
532
  "learning_rate": 8.601336584328659e-06,
533
+ "loss": 0.3367,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3261802575107296,
538
+ "grad_norm": 0.5289749503135681,
539
  "learning_rate": 8.548795043993316e-06,
540
+ "loss": 0.2952,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.33047210300429186,
545
+ "grad_norm": 0.47284507751464844,
546
  "learning_rate": 8.495451678065563e-06,
547
+ "loss": 0.3327,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.33476394849785407,
552
+ "grad_norm": 0.44787925481796265,
553
  "learning_rate": 8.441318539108433e-06,
554
+ "loss": 0.3455,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.33905579399141633,
559
+ "grad_norm": 0.4576720595359802,
560
  "learning_rate": 8.386407858128707e-06,
561
+ "loss": 0.338,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.34334763948497854,
566
+ "grad_norm": 0.4470963180065155,
567
  "learning_rate": 8.330732041813367e-06,
568
+ "loss": 0.3136,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.34763948497854075,
573
+ "grad_norm": 0.4846912622451782,
574
  "learning_rate": 8.274303669726427e-06,
575
+ "loss": 0.345,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.351931330472103,
580
+ "grad_norm": 0.4856449067592621,
581
  "learning_rate": 8.217135491466636e-06,
582
+ "loss": 0.3474,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.3562231759656652,
587
+ "grad_norm": 0.41884860396385193,
588
  "learning_rate": 8.15924042378682e-06,
589
  "loss": 0.3049,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.3605150214592275,
594
+ "grad_norm": 0.4210873246192932,
595
  "learning_rate": 8.100631547675417e-06,
596
+ "loss": 0.3045,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3648068669527897,
601
+ "grad_norm": 0.42840081453323364,
602
  "learning_rate": 8.041322105400923e-06,
603
+ "loss": 0.3345,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.36909871244635195,
608
+ "grad_norm": 0.4452343285083771,
609
  "learning_rate": 7.981325497519892e-06,
610
+ "loss": 0.3191,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.37339055793991416,
615
+ "grad_norm": 0.4384053945541382,
616
  "learning_rate": 7.920655279849173e-06,
617
+ "loss": 0.3212,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3776824034334764,
622
+ "grad_norm": 0.44782280921936035,
623
  "learning_rate": 7.859325160403073e-06,
624
+ "loss": 0.3406,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.38197424892703863,
629
+ "grad_norm": 0.4817277789115906,
630
  "learning_rate": 7.797348996296116e-06,
631
+ "loss": 0.3207,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.38626609442060084,
636
+ "grad_norm": 0.449329137802124,
637
  "learning_rate": 7.734740790612137e-06,
638
+ "loss": 0.3413,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.3905579399141631,
643
+ "grad_norm": 0.45743170380592346,
644
  "learning_rate": 7.671514689240366e-06,
645
+ "loss": 0.3086,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3948497854077253,
650
+ "grad_norm": 0.46335718035697937,
651
  "learning_rate": 7.607684977679284e-06,
652
+ "loss": 0.2999,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.39914163090128757,
657
+ "grad_norm": 0.4085894227027893,
658
  "learning_rate": 7.543266077808893e-06,
659
+ "loss": 0.285,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.4034334763948498,
664
+ "grad_norm": 0.44108864665031433,
665
  "learning_rate": 7.478272544632204e-06,
666
  "loss": 0.32,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.40772532188841204,
671
+ "grad_norm": 0.42785921692848206,
672
  "learning_rate": 7.412719062986632e-06,
673
+ "loss": 0.3376,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.41201716738197425,
678
+ "grad_norm": 0.41579216718673706,
679
  "learning_rate": 7.3466204442260605e-06,
680
+ "loss": 0.2756,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.41630901287553645,
685
+ "grad_norm": 0.4566468596458435,
686
  "learning_rate": 7.279991622874319e-06,
687
+ "loss": 0.342,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.4206008583690987,
692
+ "grad_norm": 0.4558704197406769,
693
  "learning_rate": 7.212847653250828e-06,
694
+ "loss": 0.2999,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.4248927038626609,
699
+ "grad_norm": 0.4498668611049652,
700
  "learning_rate": 7.145203706069183e-06,
701
  "loss": 0.2864,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.4291845493562232,
706
+ "grad_norm": 0.4367908835411072,
707
  "learning_rate": 7.0770750650094335e-06,
708
+ "loss": 0.3386,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4334763948497854,
713
+ "grad_norm": 0.4558912515640259,
714
  "learning_rate": 7.008477123264849e-06,
715
+ "loss": 0.3457,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.43776824034334766,
720
+ "grad_norm": 0.453532338142395,
721
  "learning_rate": 6.939425380063924e-06,
722
+ "loss": 0.3155,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.44206008583690987,
727
+ "grad_norm": 0.4329833686351776,
728
  "learning_rate": 6.869935437168449e-06,
729
  "loss": 0.3228,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.44635193133047213,
734
+ "grad_norm": 0.4507530927658081,
735
  "learning_rate": 6.800022995348381e-06,
736
+ "loss": 0.3185,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.45064377682403434,
741
+ "grad_norm": 0.437532514333725,
742
  "learning_rate": 6.729703850834381e-06,
743
+ "loss": 0.3255,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.45493562231759654,
748
+ "grad_norm": 0.4122028052806854,
749
  "learning_rate": 6.65899389174876e-06,
750
+ "loss": 0.2825,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.4592274678111588,
755
+ "grad_norm": 0.41888755559921265,
756
  "learning_rate": 6.587909094515663e-06,
757
+ "loss": 0.2961,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.463519313304721,
762
+ "grad_norm": 0.3912885785102844,
763
  "learning_rate": 6.5164655202513135e-06,
764
+ "loss": 0.279,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4678111587982833,
769
+ "grad_norm": 0.46730318665504456,
770
  "learning_rate": 6.444679311135112e-06,
771
+ "loss": 0.3406,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.4721030042918455,
776
+ "grad_norm": 0.4653036594390869,
777
  "learning_rate": 6.372566686762427e-06,
778
  "loss": 0.3413,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.47639484978540775,
783
+ "grad_norm": 0.4568288028240204,
784
  "learning_rate": 6.300143940479881e-06,
785
+ "loss": 0.3262,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.48068669527896996,
790
+ "grad_norm": 0.4192184805870056,
791
  "learning_rate": 6.227427435703997e-06,
792
+ "loss": 0.29,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.48497854077253216,
797
+ "grad_norm": 0.4163326621055603,
798
  "learning_rate": 6.154433602223979e-06,
799
  "loss": 0.2692,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4892703862660944,
804
+ "grad_norm": 0.41952961683273315,
805
  "learning_rate": 6.0811789324895365e-06,
806
+ "loss": 0.3,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.49356223175965663,
811
+ "grad_norm": 0.43658506870269775,
812
  "learning_rate": 6.0076799778845105e-06,
813
+ "loss": 0.3061,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4978540772532189,
818
+ "grad_norm": 0.4165233373641968,
819
  "learning_rate": 5.933953344987215e-06,
820
+ "loss": 0.3126,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.5021459227467812,
825
+ "grad_norm": 0.4537828862667084,
826
  "learning_rate": 5.860015691818292e-06,
827
+ "loss": 0.3312,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.5064377682403434,
832
+ "grad_norm": 0.42886224389076233,
833
  "learning_rate": 5.78588372407695e-06,
834
+ "loss": 0.2903,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.5107296137339056,
839
+ "grad_norm": 0.4168955385684967,
840
  "learning_rate": 5.711574191366427e-06,
841
+ "loss": 0.3079,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.5150214592274678,
846
+ "grad_norm": 0.4388049840927124,
847
  "learning_rate": 5.637103883409525e-06,
848
+ "loss": 0.3042,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.51931330472103,
853
+ "grad_norm": 0.473834365606308,
854
  "learning_rate": 5.562489626255104e-06,
855
+ "loss": 0.3133,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.5236051502145923,
860
+ "grad_norm": 0.4318896532058716,
861
  "learning_rate": 5.487748278476342e-06,
862
+ "loss": 0.3121,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.5278969957081545,
867
+ "grad_norm": 0.40922820568084717,
868
  "learning_rate": 5.412896727361663e-06,
869
+ "loss": 0.2525,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.5321888412017167,
874
+ "grad_norm": 0.424252986907959,
875
  "learning_rate": 5.337951885099167e-06,
876
+ "loss": 0.2907,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.5364806866952789,
881
+ "grad_norm": 0.43667683005332947,
882
  "learning_rate": 5.262930684955439e-06,
883
+ "loss": 0.2934,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5407725321888412,
888
+ "grad_norm": 0.4362064599990845,
889
  "learning_rate": 5.187850077449604e-06,
890
+ "loss": 0.3177,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5450643776824035,
895
+ "grad_norm": 0.4123782813549042,
896
  "learning_rate": 5.112727026523461e-06,
897
+ "loss": 0.2759,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5493562231759657,
902
+ "grad_norm": 0.4588283896446228,
903
  "learning_rate": 5.03757850570861e-06,
904
+ "loss": 0.3074,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5536480686695279,
909
+ "grad_norm": 0.43132638931274414,
910
  "learning_rate": 4.9624214942913916e-06,
911
+ "loss": 0.3022,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5579399141630901,
916
+ "grad_norm": 0.42370718717575073,
917
  "learning_rate": 4.88727297347654e-06,
918
+ "loss": 0.3062,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5622317596566524,
923
+ "grad_norm": 0.41679641604423523,
924
  "learning_rate": 4.8121499225503974e-06,
925
+ "loss": 0.2876,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5665236051502146,
930
+ "grad_norm": 0.41238632798194885,
931
  "learning_rate": 4.737069315044562e-06,
932
+ "loss": 0.2957,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5708154506437768,
937
+ "grad_norm": 0.44139111042022705,
938
  "learning_rate": 4.662048114900837e-06,
939
+ "loss": 0.3243,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.575107296137339,
944
+ "grad_norm": 0.4432527720928192,
945
  "learning_rate": 4.587103272638339e-06,
946
+ "loss": 0.327,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5793991416309013,
951
+ "grad_norm": 0.41122639179229736,
952
  "learning_rate": 4.512251721523659e-06,
953
+ "loss": 0.2675,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5836909871244635,
958
+ "grad_norm": 0.44076064229011536,
959
  "learning_rate": 4.437510373744897e-06,
960
+ "loss": 0.3093,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.5879828326180258,
965
+ "grad_norm": 0.43945518136024475,
966
  "learning_rate": 4.362896116590475e-06,
967
+ "loss": 0.3038,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.592274678111588,
972
+ "grad_norm": 0.43592870235443115,
973
  "learning_rate": 4.2884258086335755e-06,
974
+ "loss": 0.3236,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5965665236051502,
979
+ "grad_norm": 0.3952674865722656,
980
  "learning_rate": 4.214116275923051e-06,
981
+ "loss": 0.2734,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.6008583690987125,
986
+ "grad_norm": 0.440900593996048,
987
  "learning_rate": 4.1399843081817085e-06,
988
+ "loss": 0.3125,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.6051502145922747,
993
+ "grad_norm": 0.41623416543006897,
994
  "learning_rate": 4.066046655012786e-06,
995
+ "loss": 0.3048,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.6094420600858369,
1000
+ "grad_norm": 0.4618232548236847,
1001
  "learning_rate": 3.992320022115492e-06,
1002
+ "loss": 0.3158,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.6137339055793991,
1007
+ "grad_norm": 0.5157040357589722,
1008
  "learning_rate": 3.918821067510464e-06,
1009
+ "loss": 0.2926,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.6180257510729614,
1014
+ "grad_norm": 0.45628827810287476,
1015
  "learning_rate": 3.845566397776022e-06,
1016
+ "loss": 0.3306,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.6223175965665236,
1021
+ "grad_norm": 0.4237128496170044,
1022
  "learning_rate": 3.7725725642960047e-06,
1023
+ "loss": 0.3061,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.6266094420600858,
1028
+ "grad_norm": 0.44287946820259094,
1029
  "learning_rate": 3.6998560595201188e-06,
1030
+ "loss": 0.2913,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.630901287553648,
1035
+ "grad_norm": 0.42727428674697876,
1036
  "learning_rate": 3.627433313237576e-06,
1037
+ "loss": 0.2956,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.6351931330472103,
1042
+ "grad_norm": 0.42002803087234497,
1043
  "learning_rate": 3.555320688864889e-06,
1044
+ "loss": 0.2908,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.6394849785407726,
1049
+ "grad_norm": 0.4202839136123657,
1050
  "learning_rate": 3.483534479748688e-06,
1051
+ "loss": 0.2924,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.6437768240343348,
1056
+ "grad_norm": 0.4546595513820648,
1057
  "learning_rate": 3.4120909054843375e-06,
1058
+ "loss": 0.2834,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.648068669527897,
1063
+ "grad_norm": 0.3927839994430542,
1064
  "learning_rate": 3.3410061082512422e-06,
1065
+ "loss": 0.2592,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6523605150214592,
1070
+ "grad_norm": 0.41537243127822876,
1071
  "learning_rate": 3.2702961491656197e-06,
1072
  "loss": 0.2864,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.6566523605150214,
1077
+ "grad_norm": 0.41363805532455444,
1078
  "learning_rate": 3.1999770046516198e-06,
1079
+ "loss": 0.2826,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6609442060085837,
1084
+ "grad_norm": 0.4108889698982239,
1085
  "learning_rate": 3.130064562831553e-06,
1086
+ "loss": 0.2764,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6652360515021459,
1091
+ "grad_norm": 0.4345560371875763,
1092
  "learning_rate": 3.0605746199360755e-06,
1093
+ "loss": 0.2988,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6695278969957081,
1098
+ "grad_norm": 0.4138844907283783,
1099
  "learning_rate": 2.991522876735154e-06,
1100
+ "loss": 0.2868,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6738197424892703,
1105
+ "grad_norm": 0.4299696981906891,
1106
  "learning_rate": 2.9229249349905686e-06,
1107
+ "loss": 0.3025,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6781115879828327,
1112
+ "grad_norm": 0.4283064305782318,
1113
  "learning_rate": 2.8547962939308187e-06,
1114
+ "loss": 0.2841,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6824034334763949,
1119
+ "grad_norm": 0.44353562593460083,
1120
  "learning_rate": 2.787152346749173e-06,
1121
+ "loss": 0.2998,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6866952789699571,
1126
+ "grad_norm": 0.41590532660484314,
1127
  "learning_rate": 2.720008377125682e-06,
1128
+ "loss": 0.2809,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6909871244635193,
1133
+ "grad_norm": 0.45160338282585144,
1134
  "learning_rate": 2.6533795557739407e-06,
1135
+ "loss": 0.3248,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6952789699570815,
1140
+ "grad_norm": 0.4410744309425354,
1141
  "learning_rate": 2.5872809370133704e-06,
1142
+ "loss": 0.2939,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6995708154506438,
1147
+ "grad_norm": 0.43647775053977966,
1148
  "learning_rate": 2.5217274553677975e-06,
1149
+ "loss": 0.2988,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.703862660944206,
1154
+ "grad_norm": 0.42308133840560913,
1155
  "learning_rate": 2.4567339221911086e-06,
1156
+ "loss": 0.2924,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.7081545064377682,
1161
+ "grad_norm": 0.43360158801078796,
1162
  "learning_rate": 2.3923150223207176e-06,
1163
+ "loss": 0.3108,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.7124463519313304,
1168
+ "grad_norm": 0.4196073114871979,
1169
  "learning_rate": 2.328485310759635e-06,
1170
+ "loss": 0.2873,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.7167381974248928,
1175
+ "grad_norm": 0.475564569234848,
1176
  "learning_rate": 2.265259209387867e-06,
1177
+ "loss": 0.3046,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.721030042918455,
1182
+ "grad_norm": 0.4262593686580658,
1183
  "learning_rate": 2.202651003703885e-06,
1184
+ "loss": 0.2805,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.7253218884120172,
1189
+ "grad_norm": 0.4053172767162323,
1190
  "learning_rate": 2.140674839596931e-06,
1191
+ "loss": 0.2662,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.7296137339055794,
1196
+ "grad_norm": 0.45334163308143616,
1197
  "learning_rate": 2.0793447201508288e-06,
1198
+ "loss": 0.3155,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.7339055793991416,
1203
+ "grad_norm": 0.44115695357322693,
1204
  "learning_rate": 2.01867450248011e-06,
1205
+ "loss": 0.3022,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.7381974248927039,
1210
+ "grad_norm": 0.425508975982666,
1211
  "learning_rate": 1.9586778945990785e-06,
1212
+ "loss": 0.282,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.7424892703862661,
1217
+ "grad_norm": 0.44059520959854126,
1218
  "learning_rate": 1.8993684523245842e-06,
1219
+ "loss": 0.291,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.7467811158798283,
1224
+ "grad_norm": 0.5246726274490356,
1225
  "learning_rate": 1.8407595762131814e-06,
1226
+ "loss": 0.3419,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.7510729613733905,
1231
+ "grad_norm": 0.46658292412757874,
1232
  "learning_rate": 1.7828645085333645e-06,
1233
+ "loss": 0.3474,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.7553648068669528,
1238
+ "grad_norm": 0.43256646394729614,
1239
  "learning_rate": 1.7256963302735752e-06,
1240
+ "loss": 0.3177,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.759656652360515,
1245
+ "grad_norm": 0.4265493154525757,
1246
  "learning_rate": 1.6692679581866334e-06,
1247
+ "loss": 0.3048,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.7639484978540773,
1252
+ "grad_norm": 0.43324798345565796,
1253
  "learning_rate": 1.6135921418712959e-06,
1254
  "loss": 0.274,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7682403433476395,
1259
+ "grad_norm": 0.4457760155200958,
1260
  "learning_rate": 1.5586814608915673e-06,
1261
+ "loss": 0.3137,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7725321888412017,
1266
+ "grad_norm": 0.4235510230064392,
1267
  "learning_rate": 1.5045483219344387e-06,
1268
+ "loss": 0.2794,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.776824034334764,
1273
+ "grad_norm": 0.4203346073627472,
1274
  "learning_rate": 1.4512049560066837e-06,
1275
+ "loss": 0.2757,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7811158798283262,
1280
+ "grad_norm": 0.44381627440452576,
1281
  "learning_rate": 1.3986634156713418e-06,
1282
+ "loss": 0.2848,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7854077253218884,
1287
+ "grad_norm": 0.43037548661231995,
1288
  "learning_rate": 1.3469355723245303e-06,
1289
+ "loss": 0.2873,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7896995708154506,
1294
+ "grad_norm": 0.4308881163597107,
1295
  "learning_rate": 1.2960331135131826e-06,
1296
+ "loss": 0.3289,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.7939914163090128,
1301
+ "grad_norm": 0.41342395544052124,
1302
  "learning_rate": 1.245967540294329e-06,
1303
  "loss": 0.2851,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7982832618025751,
1308
+ "grad_norm": 0.39916035532951355,
1309
  "learning_rate": 1.1967501646365147e-06,
1310
  "loss": 0.2759,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.8025751072961373,
1315
+ "grad_norm": 0.43077540397644043,
1316
  "learning_rate": 1.1483921068639353e-06,
1317
+ "loss": 0.2757,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.8068669527896996,
1322
+ "grad_norm": 0.4402754604816437,
1323
  "learning_rate": 1.1009042931438784e-06,
1324
+ "loss": 0.2986,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.8111587982832618,
1329
+ "grad_norm": 0.433856338262558,
1330
  "learning_rate": 1.0542974530180327e-06,
1331
+ "loss": 0.3087,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.8154506437768241,
1336
+ "grad_norm": 0.3864525556564331,
1337
  "learning_rate": 1.00858211697822e-06,
1338
+ "loss": 0.2653,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.8197424892703863,
1343
+ "grad_norm": 0.4217948019504547,
1344
  "learning_rate": 9.637686140871121e-07,
1345
+ "loss": 0.291,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.8240343347639485,
1350
+ "grad_norm": 0.41786640882492065,
1351
  "learning_rate": 9.198670696444339e-07,
1352
+ "loss": 0.281,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.8283261802575107,
1357
+ "grad_norm": 0.4900873601436615,
1358
  "learning_rate": 8.768874028992431e-07,
1359
+ "loss": 0.3202,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.8326180257510729,
1364
+ "grad_norm": 0.42931240797042847,
1365
  "learning_rate": 8.348393248087289e-07,
1366
+ "loss": 0.3186,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.8369098712446352,
1371
+ "grad_norm": 0.47779154777526855,
1372
  "learning_rate": 7.937323358440935e-07,
1373
+ "loss": 0.3418,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.8412017167381974,
1378
+ "grad_norm": 0.4264460504055023,
1379
  "learning_rate": 7.535757238439939e-07,
1380
+ "loss": 0.3114,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.8454935622317596,
1385
+ "grad_norm": 0.4112091660499573,
1386
  "learning_rate": 7.143785619160026e-07,
1387
+ "loss": 0.2787,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.8497854077253219,
1392
+ "grad_norm": 0.40943408012390137,
1393
  "learning_rate": 6.761497063866207e-07,
1394
+ "loss": 0.2851,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.8540772532188842,
1399
+ "grad_norm": 0.4719178080558777,
1400
  "learning_rate": 6.388977948002406e-07,
1401
  "loss": 0.3031,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.8583690987124464,
1406
+ "grad_norm": 0.4523196518421173,
1407
  "learning_rate": 6.026312439675553e-07,
1408
+ "loss": 0.2752,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8626609442060086,
1413
+ "grad_norm": 0.4043319523334503,
1414
  "learning_rate": 5.673582480638395e-07,
1415
+ "loss": 0.3032,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8669527896995708,
1420
+ "grad_norm": 0.44797205924987793,
1421
  "learning_rate": 5.330867767775333e-07,
1422
+ "loss": 0.3399,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.871244635193133,
1427
+ "grad_norm": 0.42670944333076477,
1428
  "learning_rate": 4.998245735095459e-07,
1429
+ "loss": 0.3049,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8755364806866953,
1434
+ "grad_norm": 0.4361429214477539,
1435
  "learning_rate": 4.6757915362368567e-07,
1436
+ "loss": 0.2873,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8798283261802575,
1441
+ "grad_norm": 0.42739784717559814,
1442
  "learning_rate": 4.363578027486187e-07,
1443
+ "loss": 0.3029,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8841201716738197,
1448
+ "grad_norm": 0.4033448100090027,
1449
  "learning_rate": 4.0616757513173123e-07,
1450
+ "loss": 0.2834,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8884120171673819,
1455
+ "grad_norm": 0.42973992228507996,
1456
  "learning_rate": 3.7701529204526856e-07,
1457
+ "loss": 0.2944,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8927038626609443,
1462
+ "grad_norm": 0.4102339446544647,
1463
  "learning_rate": 3.4890754024512254e-07,
1464
+ "loss": 0.303,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8969957081545065,
1469
+ "grad_norm": 0.4557758569717407,
1470
  "learning_rate": 3.2185067048259245e-07,
1471
  "loss": 0.2932,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.9012875536480687,
1476
+ "grad_norm": 0.4004939794540405,
1477
  "learning_rate": 2.9585079606947843e-07,
1478
+ "loss": 0.2979,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.9055793991416309,
1483
+ "grad_norm": 0.43249982595443726,
1484
  "learning_rate": 2.7091379149682683e-07,
1485
+ "loss": 0.2745,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.9098712446351931,
1490
+ "grad_norm": 0.4003370702266693,
1491
  "learning_rate": 2.470452911076227e-07,
1492
+ "loss": 0.2775,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.9141630901287554,
1497
+ "grad_norm": 0.40859144926071167,
1498
  "learning_rate": 2.242506878237538e-07,
1499
+ "loss": 0.2951,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.9184549356223176,
1504
+ "grad_norm": 0.39313316345214844,
1505
  "learning_rate": 2.0253513192751374e-07,
1506
+ "loss": 0.2826,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.9227467811158798,
1511
+ "grad_norm": 0.4221353530883789,
1512
  "learning_rate": 1.8190352989793325e-07,
1513
+ "loss": 0.2819,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.927038626609442,
1518
+ "grad_norm": 0.4318053424358368,
1519
  "learning_rate": 1.6236054330219853e-07,
1520
+ "loss": 0.3003,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.9313304721030042,
1525
+ "grad_norm": 0.428039014339447,
1526
  "learning_rate": 1.439105877423963e-07,
1527
+ "loss": 0.2963,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.9356223175965666,
1532
+ "grad_norm": 0.4233408272266388,
1533
  "learning_rate": 1.2655783185784253e-07,
1534
+ "loss": 0.2975,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.9399141630901288,
1539
+ "grad_norm": 0.4406803250312805,
1540
  "learning_rate": 1.1030619638320805e-07,
1541
+ "loss": 0.2955,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.944206008583691,
1546
+ "grad_norm": 0.46127110719680786,
1547
  "learning_rate": 9.51593532626538e-08,
1548
+ "loss": 0.2805,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.9484978540772532,
1553
+ "grad_norm": 0.416983038187027,
1554
  "learning_rate": 8.11207248201834e-08,
1555
+ "loss": 0.2911,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.9527896995708155,
1560
+ "grad_norm": 0.44284337759017944,
1561
  "learning_rate": 6.819348298638839e-08,
1562
+ "loss": 0.326,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.9570815450643777,
1567
+ "grad_norm": 0.4572809338569641,
1568
  "learning_rate": 5.638054858177644e-08,
1569
+ "loss": 0.278,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.9613733905579399,
1574
+ "grad_norm": 0.43752995133399963,
1575
  "learning_rate": 4.568459065683206e-08,
1576
+ "loss": 0.2866,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.9656652360515021,
1581
+ "grad_norm": 0.4339517056941986,
1582
  "learning_rate": 3.610802588895845e-08,
1583
+ "loss": 0.3182,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.9699570815450643,
1588
+ "grad_norm": 0.4011162519454956,
1589
  "learning_rate": 2.765301803645426e-08,
1590
  "loss": 0.2593,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9742489270386266,
1595
+ "grad_norm": 0.4327218234539032,
1596
  "learning_rate": 2.0321477449619098e-08,
1597
+ "loss": 0.31,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9785407725321889,
1602
+ "grad_norm": 0.46699926257133484,
1603
  "learning_rate": 1.411506063912882e-08,
1604
+ "loss": 0.3234,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9828326180257511,
1609
+ "grad_norm": 0.4302132725715637,
1610
  "learning_rate": 9.035169901754902e-09,
1611
+ "loss": 0.2837,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9871244635193133,
1616
+ "grad_norm": 0.4342568516731262,
1617
  "learning_rate": 5.082953003528457e-09,
1618
+ "loss": 0.276,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9914163090128756,
1623
+ "grad_norm": 0.41555383801460266,
1624
  "learning_rate": 2.2593029204076578e-09,
1625
+ "loss": 0.2795,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9957081545064378,
1630
+ "grad_norm": 0.4377802610397339,
1631
  "learning_rate": 5.648576365169245e-10,
1632
+ "loss": 0.3075,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 1.0,
1637
+ "grad_norm": 0.4171040654182434,
1638
  "learning_rate": 0.0,
1639
+ "loss": 0.3026,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 1.0,
1644
+ "eval_loss": 0.39601820707321167,
1645
+ "eval_runtime": 1.525,
1646
+ "eval_samples_per_second": 150.817,
1647
+ "eval_steps_per_second": 6.557,
1648
  "step": 233
1649
  },
1650
  {
1651
  "epoch": 1.0,
1652
  "step": 233,
1653
  "total_flos": 1.933383782157517e+16,
1654
+ "train_loss": 0.43378888076978694,
1655
+ "train_runtime": 303.3238,
1656
+ "train_samples_per_second": 36.726,
1657
+ "train_steps_per_second": 0.768
1658
  }
1659
  ],
1660
  "logging_steps": 1,