{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999700320656897,
"eval_steps": 500,
"global_step": 8342,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005993586862057599,
"grad_norm": 108.0,
"learning_rate": 5.988023952095808e-09,
"loss": 101.4288,
"step": 5
},
{
"epoch": 0.0011987173724115198,
"grad_norm": 106.75,
"learning_rate": 1.1976047904191617e-08,
"loss": 102.372,
"step": 10
},
{
"epoch": 0.0017980760586172794,
"grad_norm": 106.6875,
"learning_rate": 1.7964071856287425e-08,
"loss": 100.4109,
"step": 15
},
{
"epoch": 0.0023974347448230396,
"grad_norm": 107.375,
"learning_rate": 2.3952095808383233e-08,
"loss": 102.0437,
"step": 20
},
{
"epoch": 0.0029967934310287992,
"grad_norm": 102.5,
"learning_rate": 2.994011976047904e-08,
"loss": 100.0221,
"step": 25
},
{
"epoch": 0.003596152117234559,
"grad_norm": 104.9375,
"learning_rate": 3.592814371257485e-08,
"loss": 102.2138,
"step": 30
},
{
"epoch": 0.004195510803440319,
"grad_norm": 104.625,
"learning_rate": 4.191616766467065e-08,
"loss": 100.7864,
"step": 35
},
{
"epoch": 0.004794869489646079,
"grad_norm": 101.5625,
"learning_rate": 4.7904191616766466e-08,
"loss": 100.7124,
"step": 40
},
{
"epoch": 0.005394228175851838,
"grad_norm": 105.8125,
"learning_rate": 5.3892215568862274e-08,
"loss": 100.6161,
"step": 45
},
{
"epoch": 0.0059935868620575984,
"grad_norm": 100.875,
"learning_rate": 5.988023952095808e-08,
"loss": 100.9569,
"step": 50
},
{
"epoch": 0.0065929455482633586,
"grad_norm": 104.5625,
"learning_rate": 6.586826347305389e-08,
"loss": 99.5128,
"step": 55
},
{
"epoch": 0.007192304234469118,
"grad_norm": 107.1875,
"learning_rate": 7.18562874251497e-08,
"loss": 100.0423,
"step": 60
},
{
"epoch": 0.007791662920674878,
"grad_norm": 103.1875,
"learning_rate": 7.784431137724551e-08,
"loss": 99.8547,
"step": 65
},
{
"epoch": 0.008391021606880638,
"grad_norm": 103.4375,
"learning_rate": 8.38323353293413e-08,
"loss": 99.0963,
"step": 70
},
{
"epoch": 0.008990380293086398,
"grad_norm": 103.625,
"learning_rate": 8.982035928143712e-08,
"loss": 101.1167,
"step": 75
},
{
"epoch": 0.009589738979292158,
"grad_norm": 105.25,
"learning_rate": 9.580838323353293e-08,
"loss": 98.1266,
"step": 80
},
{
"epoch": 0.010189097665497917,
"grad_norm": 103.3125,
"learning_rate": 1.0179640718562874e-07,
"loss": 100.9546,
"step": 85
},
{
"epoch": 0.010788456351703677,
"grad_norm": 104.4375,
"learning_rate": 1.0778443113772455e-07,
"loss": 99.5327,
"step": 90
},
{
"epoch": 0.011387815037909437,
"grad_norm": 104.875,
"learning_rate": 1.1377245508982034e-07,
"loss": 99.6909,
"step": 95
},
{
"epoch": 0.011987173724115197,
"grad_norm": 107.25,
"learning_rate": 1.1976047904191617e-07,
"loss": 100.4652,
"step": 100
},
{
"epoch": 0.012586532410320957,
"grad_norm": 107.625,
"learning_rate": 1.2574850299401197e-07,
"loss": 100.3513,
"step": 105
},
{
"epoch": 0.013185891096526717,
"grad_norm": 107.625,
"learning_rate": 1.3173652694610778e-07,
"loss": 99.1105,
"step": 110
},
{
"epoch": 0.013785249782732475,
"grad_norm": 106.8125,
"learning_rate": 1.377245508982036e-07,
"loss": 99.3605,
"step": 115
},
{
"epoch": 0.014384608468938236,
"grad_norm": 104.6875,
"learning_rate": 1.437125748502994e-07,
"loss": 100.7677,
"step": 120
},
{
"epoch": 0.014983967155143996,
"grad_norm": 102.125,
"learning_rate": 1.4970059880239518e-07,
"loss": 98.4561,
"step": 125
},
{
"epoch": 0.015583325841349756,
"grad_norm": 103.9375,
"learning_rate": 1.5568862275449102e-07,
"loss": 97.9862,
"step": 130
},
{
"epoch": 0.016182684527555516,
"grad_norm": 105.625,
"learning_rate": 1.6167664670658682e-07,
"loss": 99.3458,
"step": 135
},
{
"epoch": 0.016782043213761276,
"grad_norm": 105.5,
"learning_rate": 1.676646706586826e-07,
"loss": 98.1435,
"step": 140
},
{
"epoch": 0.017381401899967036,
"grad_norm": 106.875,
"learning_rate": 1.7365269461077844e-07,
"loss": 98.1245,
"step": 145
},
{
"epoch": 0.017980760586172796,
"grad_norm": 106.0,
"learning_rate": 1.7964071856287425e-07,
"loss": 97.9571,
"step": 150
},
{
"epoch": 0.018580119272378556,
"grad_norm": 107.125,
"learning_rate": 1.8562874251497006e-07,
"loss": 99.0638,
"step": 155
},
{
"epoch": 0.019179477958584316,
"grad_norm": 104.875,
"learning_rate": 1.9161676646706586e-07,
"loss": 100.1442,
"step": 160
},
{
"epoch": 0.019778836644790073,
"grad_norm": 103.0625,
"learning_rate": 1.9760479041916167e-07,
"loss": 99.5631,
"step": 165
},
{
"epoch": 0.020378195330995833,
"grad_norm": 103.25,
"learning_rate": 2.0359281437125748e-07,
"loss": 99.3583,
"step": 170
},
{
"epoch": 0.020977554017201593,
"grad_norm": 111.0,
"learning_rate": 2.0958083832335326e-07,
"loss": 99.157,
"step": 175
},
{
"epoch": 0.021576912703407353,
"grad_norm": 108.0625,
"learning_rate": 2.155688622754491e-07,
"loss": 98.6387,
"step": 180
},
{
"epoch": 0.022176271389613113,
"grad_norm": 107.625,
"learning_rate": 2.215568862275449e-07,
"loss": 98.7289,
"step": 185
},
{
"epoch": 0.022775630075818874,
"grad_norm": 106.0,
"learning_rate": 2.275449101796407e-07,
"loss": 99.7657,
"step": 190
},
{
"epoch": 0.023374988762024634,
"grad_norm": 108.875,
"learning_rate": 2.3353293413173652e-07,
"loss": 99.5338,
"step": 195
},
{
"epoch": 0.023974347448230394,
"grad_norm": 105.3125,
"learning_rate": 2.3952095808383233e-07,
"loss": 98.2597,
"step": 200
},
{
"epoch": 0.024573706134436154,
"grad_norm": 106.25,
"learning_rate": 2.455089820359281e-07,
"loss": 98.0168,
"step": 205
},
{
"epoch": 0.025173064820641914,
"grad_norm": 105.1875,
"learning_rate": 2.5149700598802395e-07,
"loss": 98.9083,
"step": 210
},
{
"epoch": 0.025772423506847674,
"grad_norm": 102.9375,
"learning_rate": 2.5748502994011973e-07,
"loss": 98.8039,
"step": 215
},
{
"epoch": 0.026371782193053434,
"grad_norm": 102.875,
"learning_rate": 2.6347305389221556e-07,
"loss": 96.897,
"step": 220
},
{
"epoch": 0.026971140879259194,
"grad_norm": 106.0,
"learning_rate": 2.694610778443114e-07,
"loss": 98.8567,
"step": 225
},
{
"epoch": 0.02757049956546495,
"grad_norm": 104.4375,
"learning_rate": 2.754491017964072e-07,
"loss": 99.9957,
"step": 230
},
{
"epoch": 0.02816985825167071,
"grad_norm": 103.875,
"learning_rate": 2.8143712574850296e-07,
"loss": 98.0264,
"step": 235
},
{
"epoch": 0.02876921693787647,
"grad_norm": 107.0625,
"learning_rate": 2.874251497005988e-07,
"loss": 97.1543,
"step": 240
},
{
"epoch": 0.02936857562408223,
"grad_norm": 103.3125,
"learning_rate": 2.934131736526946e-07,
"loss": 98.1825,
"step": 245
},
{
"epoch": 0.02996793431028799,
"grad_norm": 105.875,
"learning_rate": 2.9940119760479036e-07,
"loss": 98.2953,
"step": 250
},
{
"epoch": 0.03056729299649375,
"grad_norm": 107.1875,
"learning_rate": 3.0538922155688625e-07,
"loss": 99.6612,
"step": 255
},
{
"epoch": 0.03116665168269951,
"grad_norm": 107.375,
"learning_rate": 3.1137724550898203e-07,
"loss": 97.6773,
"step": 260
},
{
"epoch": 0.03176601036890527,
"grad_norm": 106.1875,
"learning_rate": 3.173652694610778e-07,
"loss": 98.2748,
"step": 265
},
{
"epoch": 0.03236536905511103,
"grad_norm": 106.0625,
"learning_rate": 3.2335329341317365e-07,
"loss": 97.7014,
"step": 270
},
{
"epoch": 0.03296472774131679,
"grad_norm": 109.8125,
"learning_rate": 3.2934131736526943e-07,
"loss": 99.1549,
"step": 275
},
{
"epoch": 0.03356408642752255,
"grad_norm": 106.0,
"learning_rate": 3.353293413173652e-07,
"loss": 98.5528,
"step": 280
},
{
"epoch": 0.03416344511372831,
"grad_norm": 107.3125,
"learning_rate": 3.413173652694611e-07,
"loss": 98.652,
"step": 285
},
{
"epoch": 0.03476280379993407,
"grad_norm": 102.625,
"learning_rate": 3.473053892215569e-07,
"loss": 99.1912,
"step": 290
},
{
"epoch": 0.03536216248613983,
"grad_norm": 103.8125,
"learning_rate": 3.5329341317365266e-07,
"loss": 98.7347,
"step": 295
},
{
"epoch": 0.03596152117234559,
"grad_norm": 104.3125,
"learning_rate": 3.592814371257485e-07,
"loss": 98.0729,
"step": 300
},
{
"epoch": 0.03656087985855135,
"grad_norm": 108.75,
"learning_rate": 3.652694610778443e-07,
"loss": 98.3793,
"step": 305
},
{
"epoch": 0.03716023854475711,
"grad_norm": 105.3125,
"learning_rate": 3.712574850299401e-07,
"loss": 99.3356,
"step": 310
},
{
"epoch": 0.03775959723096287,
"grad_norm": 107.8125,
"learning_rate": 3.772455089820359e-07,
"loss": 96.9595,
"step": 315
},
{
"epoch": 0.03835895591716863,
"grad_norm": 107.6875,
"learning_rate": 3.8323353293413173e-07,
"loss": 97.5843,
"step": 320
},
{
"epoch": 0.038958314603374386,
"grad_norm": 105.125,
"learning_rate": 3.8922155688622756e-07,
"loss": 100.0804,
"step": 325
},
{
"epoch": 0.039557673289580146,
"grad_norm": 108.9375,
"learning_rate": 3.9520958083832335e-07,
"loss": 96.4505,
"step": 330
},
{
"epoch": 0.040157031975785906,
"grad_norm": 104.625,
"learning_rate": 4.0119760479041913e-07,
"loss": 98.6832,
"step": 335
},
{
"epoch": 0.040756390661991666,
"grad_norm": 109.1875,
"learning_rate": 4.0718562874251496e-07,
"loss": 97.3873,
"step": 340
},
{
"epoch": 0.041355749348197426,
"grad_norm": 105.375,
"learning_rate": 4.1317365269461074e-07,
"loss": 96.7431,
"step": 345
},
{
"epoch": 0.04195510803440319,
"grad_norm": 108.0625,
"learning_rate": 4.191616766467065e-07,
"loss": 96.5581,
"step": 350
},
{
"epoch": 0.04255446672060895,
"grad_norm": 105.875,
"learning_rate": 4.251497005988024e-07,
"loss": 97.4509,
"step": 355
},
{
"epoch": 0.04315382540681471,
"grad_norm": 104.375,
"learning_rate": 4.311377245508982e-07,
"loss": 98.1399,
"step": 360
},
{
"epoch": 0.04375318409302047,
"grad_norm": 108.125,
"learning_rate": 4.37125748502994e-07,
"loss": 97.8897,
"step": 365
},
{
"epoch": 0.04435254277922623,
"grad_norm": 105.25,
"learning_rate": 4.431137724550898e-07,
"loss": 98.8462,
"step": 370
},
{
"epoch": 0.04495190146543199,
"grad_norm": 107.625,
"learning_rate": 4.491017964071856e-07,
"loss": 98.7402,
"step": 375
},
{
"epoch": 0.04555126015163775,
"grad_norm": 109.4375,
"learning_rate": 4.550898203592814e-07,
"loss": 98.7856,
"step": 380
},
{
"epoch": 0.04615061883784351,
"grad_norm": 108.3125,
"learning_rate": 4.6107784431137726e-07,
"loss": 98.0607,
"step": 385
},
{
"epoch": 0.04674997752404927,
"grad_norm": 106.25,
"learning_rate": 4.6706586826347305e-07,
"loss": 98.49,
"step": 390
},
{
"epoch": 0.04734933621025503,
"grad_norm": 108.25,
"learning_rate": 4.7305389221556883e-07,
"loss": 97.775,
"step": 395
},
{
"epoch": 0.04794869489646079,
"grad_norm": 108.5,
"learning_rate": 4.790419161676647e-07,
"loss": 96.6219,
"step": 400
},
{
"epoch": 0.04854805358266655,
"grad_norm": 105.875,
"learning_rate": 4.850299401197605e-07,
"loss": 98.3288,
"step": 405
},
{
"epoch": 0.04914741226887231,
"grad_norm": 104.875,
"learning_rate": 4.910179640718562e-07,
"loss": 98.0089,
"step": 410
},
{
"epoch": 0.04974677095507807,
"grad_norm": 106.5625,
"learning_rate": 4.970059880239521e-07,
"loss": 98.042,
"step": 415
},
{
"epoch": 0.05034612964128383,
"grad_norm": 107.4375,
"learning_rate": 5.029940119760479e-07,
"loss": 98.1441,
"step": 420
},
{
"epoch": 0.05094548832748959,
"grad_norm": 109.9375,
"learning_rate": 5.089820359281437e-07,
"loss": 98.7648,
"step": 425
},
{
"epoch": 0.05154484701369535,
"grad_norm": 110.4375,
"learning_rate": 5.149700598802395e-07,
"loss": 98.7029,
"step": 430
},
{
"epoch": 0.05214420569990111,
"grad_norm": 107.8125,
"learning_rate": 5.209580838323353e-07,
"loss": 97.6725,
"step": 435
},
{
"epoch": 0.05274356438610687,
"grad_norm": 109.125,
"learning_rate": 5.269461077844311e-07,
"loss": 97.5752,
"step": 440
},
{
"epoch": 0.05334292307231263,
"grad_norm": 103.4375,
"learning_rate": 5.329341317365269e-07,
"loss": 97.6427,
"step": 445
},
{
"epoch": 0.05394228175851839,
"grad_norm": 104.5625,
"learning_rate": 5.389221556886228e-07,
"loss": 99.5309,
"step": 450
},
{
"epoch": 0.05454164044472414,
"grad_norm": 107.25,
"learning_rate": 5.449101796407185e-07,
"loss": 98.6651,
"step": 455
},
{
"epoch": 0.0551409991309299,
"grad_norm": 108.25,
"learning_rate": 5.508982035928144e-07,
"loss": 97.7356,
"step": 460
},
{
"epoch": 0.05574035781713566,
"grad_norm": 108.3125,
"learning_rate": 5.568862275449101e-07,
"loss": 97.3537,
"step": 465
},
{
"epoch": 0.05633971650334142,
"grad_norm": 106.25,
"learning_rate": 5.628742514970059e-07,
"loss": 97.6986,
"step": 470
},
{
"epoch": 0.05693907518954718,
"grad_norm": 107.0,
"learning_rate": 5.688622754491019e-07,
"loss": 97.4101,
"step": 475
},
{
"epoch": 0.05753843387575294,
"grad_norm": 106.25,
"learning_rate": 5.748502994011976e-07,
"loss": 98.0793,
"step": 480
},
{
"epoch": 0.0581377925619587,
"grad_norm": 103.75,
"learning_rate": 5.808383233532934e-07,
"loss": 99.4187,
"step": 485
},
{
"epoch": 0.05873715124816446,
"grad_norm": 103.3125,
"learning_rate": 5.868263473053892e-07,
"loss": 98.6429,
"step": 490
},
{
"epoch": 0.05933650993437022,
"grad_norm": 107.5625,
"learning_rate": 5.92814371257485e-07,
"loss": 97.5,
"step": 495
},
{
"epoch": 0.05993586862057598,
"grad_norm": 105.5625,
"learning_rate": 5.988023952095807e-07,
"loss": 97.07,
"step": 500
},
{
"epoch": 0.05993586862057598,
"eval_loss": 3.0657827854156494,
"eval_runtime": 402.7563,
"eval_samples_per_second": 1116.315,
"eval_steps_per_second": 34.887,
"step": 500
},
{
"epoch": 0.06053522730678174,
"grad_norm": 109.75,
"learning_rate": 6.047904191616767e-07,
"loss": 97.7991,
"step": 505
},
{
"epoch": 0.0611345859929875,
"grad_norm": 108.125,
"learning_rate": 6.107784431137725e-07,
"loss": 97.4808,
"step": 510
},
{
"epoch": 0.06173394467919326,
"grad_norm": 103.8125,
"learning_rate": 6.167664670658682e-07,
"loss": 97.707,
"step": 515
},
{
"epoch": 0.06233330336539902,
"grad_norm": 108.375,
"learning_rate": 6.227544910179641e-07,
"loss": 98.3281,
"step": 520
},
{
"epoch": 0.06293266205160478,
"grad_norm": 110.3125,
"learning_rate": 6.287425149700598e-07,
"loss": 99.0089,
"step": 525
},
{
"epoch": 0.06353202073781054,
"grad_norm": 105.75,
"learning_rate": 6.347305389221556e-07,
"loss": 98.7936,
"step": 530
},
{
"epoch": 0.0641313794240163,
"grad_norm": 105.5,
"learning_rate": 6.407185628742516e-07,
"loss": 98.6227,
"step": 535
},
{
"epoch": 0.06473073811022206,
"grad_norm": 104.3125,
"learning_rate": 6.467065868263473e-07,
"loss": 97.955,
"step": 540
},
{
"epoch": 0.06533009679642782,
"grad_norm": 107.75,
"learning_rate": 6.526946107784431e-07,
"loss": 98.5175,
"step": 545
},
{
"epoch": 0.06592945548263358,
"grad_norm": 107.125,
"learning_rate": 6.586826347305389e-07,
"loss": 97.9932,
"step": 550
},
{
"epoch": 0.06652881416883934,
"grad_norm": 105.75,
"learning_rate": 6.646706586826347e-07,
"loss": 99.2275,
"step": 555
},
{
"epoch": 0.0671281728550451,
"grad_norm": 104.9375,
"learning_rate": 6.706586826347304e-07,
"loss": 98.1961,
"step": 560
},
{
"epoch": 0.06772753154125086,
"grad_norm": 105.6875,
"learning_rate": 6.766467065868264e-07,
"loss": 98.7839,
"step": 565
},
{
"epoch": 0.06832689022745662,
"grad_norm": 110.25,
"learning_rate": 6.826347305389222e-07,
"loss": 98.6332,
"step": 570
},
{
"epoch": 0.06892624891366238,
"grad_norm": 105.25,
"learning_rate": 6.886227544910179e-07,
"loss": 100.4485,
"step": 575
},
{
"epoch": 0.06952560759986814,
"grad_norm": 111.75,
"learning_rate": 6.946107784431138e-07,
"loss": 98.7815,
"step": 580
},
{
"epoch": 0.0701249662860739,
"grad_norm": 107.125,
"learning_rate": 7.005988023952095e-07,
"loss": 98.5106,
"step": 585
},
{
"epoch": 0.07072432497227966,
"grad_norm": 106.375,
"learning_rate": 7.065868263473053e-07,
"loss": 97.8071,
"step": 590
},
{
"epoch": 0.07132368365848542,
"grad_norm": 106.875,
"learning_rate": 7.125748502994012e-07,
"loss": 98.2991,
"step": 595
},
{
"epoch": 0.07192304234469118,
"grad_norm": 105.9375,
"learning_rate": 7.18562874251497e-07,
"loss": 97.3966,
"step": 600
},
{
"epoch": 0.07252240103089694,
"grad_norm": 110.0,
"learning_rate": 7.245508982035928e-07,
"loss": 100.179,
"step": 605
},
{
"epoch": 0.0731217597171027,
"grad_norm": 110.0625,
"learning_rate": 7.305389221556886e-07,
"loss": 98.9886,
"step": 610
},
{
"epoch": 0.07372111840330846,
"grad_norm": 106.25,
"learning_rate": 7.365269461077844e-07,
"loss": 98.379,
"step": 615
},
{
"epoch": 0.07432047708951423,
"grad_norm": 108.1875,
"learning_rate": 7.425149700598802e-07,
"loss": 98.8186,
"step": 620
},
{
"epoch": 0.07491983577571998,
"grad_norm": 109.375,
"learning_rate": 7.485029940119761e-07,
"loss": 98.6054,
"step": 625
},
{
"epoch": 0.07551919446192575,
"grad_norm": 111.0625,
"learning_rate": 7.544910179640718e-07,
"loss": 99.7864,
"step": 630
},
{
"epoch": 0.0761185531481315,
"grad_norm": 109.3125,
"learning_rate": 7.604790419161676e-07,
"loss": 99.6344,
"step": 635
},
{
"epoch": 0.07671791183433727,
"grad_norm": 107.25,
"learning_rate": 7.664670658682635e-07,
"loss": 98.9922,
"step": 640
},
{
"epoch": 0.07731727052054302,
"grad_norm": 104.25,
"learning_rate": 7.724550898203592e-07,
"loss": 100.2282,
"step": 645
},
{
"epoch": 0.07791662920674877,
"grad_norm": 108.9375,
"learning_rate": 7.784431137724551e-07,
"loss": 97.6066,
"step": 650
},
{
"epoch": 0.07851598789295454,
"grad_norm": 109.1875,
"learning_rate": 7.844311377245509e-07,
"loss": 99.5096,
"step": 655
},
{
"epoch": 0.07911534657916029,
"grad_norm": 110.0,
"learning_rate": 7.904191616766467e-07,
"loss": 98.4398,
"step": 660
},
{
"epoch": 0.07971470526536606,
"grad_norm": 107.3125,
"learning_rate": 7.964071856287424e-07,
"loss": 97.4769,
"step": 665
},
{
"epoch": 0.08031406395157181,
"grad_norm": 104.4375,
"learning_rate": 8.023952095808383e-07,
"loss": 98.9293,
"step": 670
},
{
"epoch": 0.08091342263777758,
"grad_norm": 106.0,
"learning_rate": 8.083832335329341e-07,
"loss": 98.9894,
"step": 675
},
{
"epoch": 0.08151278132398333,
"grad_norm": 108.125,
"learning_rate": 8.143712574850299e-07,
"loss": 99.3078,
"step": 680
},
{
"epoch": 0.0821121400101891,
"grad_norm": 108.0,
"learning_rate": 8.203592814371258e-07,
"loss": 99.7428,
"step": 685
},
{
"epoch": 0.08271149869639485,
"grad_norm": 107.75,
"learning_rate": 8.263473053892215e-07,
"loss": 100.1608,
"step": 690
},
{
"epoch": 0.08331085738260062,
"grad_norm": 109.25,
"learning_rate": 8.323353293413173e-07,
"loss": 98.0089,
"step": 695
},
{
"epoch": 0.08391021606880637,
"grad_norm": 109.875,
"learning_rate": 8.38323353293413e-07,
"loss": 99.6416,
"step": 700
},
{
"epoch": 0.08450957475501214,
"grad_norm": 106.5,
"learning_rate": 8.443113772455089e-07,
"loss": 98.5848,
"step": 705
},
{
"epoch": 0.0851089334412179,
"grad_norm": 106.625,
"learning_rate": 8.502994011976048e-07,
"loss": 99.8258,
"step": 710
},
{
"epoch": 0.08570829212742366,
"grad_norm": 106.75,
"learning_rate": 8.562874251497006e-07,
"loss": 98.2121,
"step": 715
},
{
"epoch": 0.08630765081362941,
"grad_norm": 108.75,
"learning_rate": 8.622754491017964e-07,
"loss": 98.832,
"step": 720
},
{
"epoch": 0.08690700949983518,
"grad_norm": 105.1875,
"learning_rate": 8.682634730538921e-07,
"loss": 99.0368,
"step": 725
},
{
"epoch": 0.08750636818604093,
"grad_norm": 106.3125,
"learning_rate": 8.74251497005988e-07,
"loss": 99.2618,
"step": 730
},
{
"epoch": 0.0881057268722467,
"grad_norm": 108.875,
"learning_rate": 8.802395209580839e-07,
"loss": 100.3378,
"step": 735
},
{
"epoch": 0.08870508555845245,
"grad_norm": 104.1875,
"learning_rate": 8.862275449101796e-07,
"loss": 99.1748,
"step": 740
},
{
"epoch": 0.08930444424465822,
"grad_norm": 109.375,
"learning_rate": 8.922155688622755e-07,
"loss": 100.3106,
"step": 745
},
{
"epoch": 0.08990380293086397,
"grad_norm": 106.1875,
"learning_rate": 8.982035928143712e-07,
"loss": 97.4286,
"step": 750
},
{
"epoch": 0.09050316161706974,
"grad_norm": 107.3125,
"learning_rate": 9.04191616766467e-07,
"loss": 99.5878,
"step": 755
},
{
"epoch": 0.0911025203032755,
"grad_norm": 105.375,
"learning_rate": 9.101796407185628e-07,
"loss": 99.2444,
"step": 760
},
{
"epoch": 0.09170187898948126,
"grad_norm": 109.9375,
"learning_rate": 9.161676646706587e-07,
"loss": 100.2878,
"step": 765
},
{
"epoch": 0.09230123767568701,
"grad_norm": 106.375,
"learning_rate": 9.221556886227545e-07,
"loss": 99.3472,
"step": 770
},
{
"epoch": 0.09290059636189278,
"grad_norm": 107.5,
"learning_rate": 9.281437125748503e-07,
"loss": 99.7771,
"step": 775
},
{
"epoch": 0.09349995504809853,
"grad_norm": 110.75,
"learning_rate": 9.341317365269461e-07,
"loss": 101.1653,
"step": 780
},
{
"epoch": 0.09409931373430429,
"grad_norm": 107.0625,
"learning_rate": 9.401197604790418e-07,
"loss": 98.7351,
"step": 785
},
{
"epoch": 0.09469867242051005,
"grad_norm": 109.8125,
"learning_rate": 9.461077844311377e-07,
"loss": 98.4055,
"step": 790
},
{
"epoch": 0.09529803110671581,
"grad_norm": 105.6875,
"learning_rate": 9.520958083832335e-07,
"loss": 100.6708,
"step": 795
},
{
"epoch": 0.09589738979292158,
"grad_norm": 107.9375,
"learning_rate": 9.580838323353293e-07,
"loss": 99.4253,
"step": 800
},
{
"epoch": 0.09649674847912733,
"grad_norm": 102.1875,
"learning_rate": 9.640718562874252e-07,
"loss": 99.4844,
"step": 805
},
{
"epoch": 0.0970961071653331,
"grad_norm": 104.3125,
"learning_rate": 9.70059880239521e-07,
"loss": 100.4691,
"step": 810
},
{
"epoch": 0.09769546585153885,
"grad_norm": 104.8125,
"learning_rate": 9.760479041916168e-07,
"loss": 98.4424,
"step": 815
},
{
"epoch": 0.09829482453774462,
"grad_norm": 106.375,
"learning_rate": 9.820359281437125e-07,
"loss": 99.2889,
"step": 820
},
{
"epoch": 0.09889418322395037,
"grad_norm": 104.4375,
"learning_rate": 9.880239520958083e-07,
"loss": 98.5602,
"step": 825
},
{
"epoch": 0.09949354191015614,
"grad_norm": 104.0625,
"learning_rate": 9.940119760479041e-07,
"loss": 98.6925,
"step": 830
},
{
"epoch": 0.10009290059636189,
"grad_norm": 106.0,
"learning_rate": 1e-06,
"loss": 99.9813,
"step": 835
},
{
"epoch": 0.10069225928256766,
"grad_norm": 104.3125,
"learning_rate": 9.993339549753564e-07,
"loss": 100.2518,
"step": 840
},
{
"epoch": 0.10129161796877341,
"grad_norm": 107.5625,
"learning_rate": 9.986679099507126e-07,
"loss": 101.2464,
"step": 845
},
{
"epoch": 0.10189097665497918,
"grad_norm": 110.25,
"learning_rate": 9.98001864926069e-07,
"loss": 98.2911,
"step": 850
},
{
"epoch": 0.10249033534118493,
"grad_norm": 104.4375,
"learning_rate": 9.973358199014254e-07,
"loss": 98.8816,
"step": 855
},
{
"epoch": 0.1030896940273907,
"grad_norm": 106.5625,
"learning_rate": 9.966697748767816e-07,
"loss": 100.1502,
"step": 860
},
{
"epoch": 0.10368905271359645,
"grad_norm": 104.1875,
"learning_rate": 9.96003729852138e-07,
"loss": 99.8585,
"step": 865
},
{
"epoch": 0.10428841139980222,
"grad_norm": 102.375,
"learning_rate": 9.953376848274942e-07,
"loss": 99.9457,
"step": 870
},
{
"epoch": 0.10488777008600797,
"grad_norm": 104.375,
"learning_rate": 9.946716398028506e-07,
"loss": 100.6051,
"step": 875
},
{
"epoch": 0.10548712877221374,
"grad_norm": 105.75,
"learning_rate": 9.94005594778207e-07,
"loss": 98.5651,
"step": 880
},
{
"epoch": 0.10608648745841949,
"grad_norm": 105.5625,
"learning_rate": 9.933395497535634e-07,
"loss": 99.5896,
"step": 885
},
{
"epoch": 0.10668584614462526,
"grad_norm": 104.1875,
"learning_rate": 9.926735047289196e-07,
"loss": 101.2675,
"step": 890
},
{
"epoch": 0.10728520483083101,
"grad_norm": 110.75,
"learning_rate": 9.92007459704276e-07,
"loss": 100.4223,
"step": 895
},
{
"epoch": 0.10788456351703678,
"grad_norm": 107.4375,
"learning_rate": 9.913414146796324e-07,
"loss": 99.0593,
"step": 900
},
{
"epoch": 0.10848392220324253,
"grad_norm": 107.625,
"learning_rate": 9.906753696549886e-07,
"loss": 99.1331,
"step": 905
},
{
"epoch": 0.10908328088944828,
"grad_norm": 109.25,
"learning_rate": 9.90009324630345e-07,
"loss": 98.8885,
"step": 910
},
{
"epoch": 0.10968263957565405,
"grad_norm": 102.875,
"learning_rate": 9.893432796057014e-07,
"loss": 99.1233,
"step": 915
},
{
"epoch": 0.1102819982618598,
"grad_norm": 109.5,
"learning_rate": 9.886772345810576e-07,
"loss": 98.3095,
"step": 920
},
{
"epoch": 0.11088135694806557,
"grad_norm": 105.5,
"learning_rate": 9.88011189556414e-07,
"loss": 99.468,
"step": 925
},
{
"epoch": 0.11148071563427132,
"grad_norm": 100.5625,
"learning_rate": 9.873451445317704e-07,
"loss": 99.1593,
"step": 930
},
{
"epoch": 0.11208007432047709,
"grad_norm": 107.6875,
"learning_rate": 9.866790995071268e-07,
"loss": 100.0329,
"step": 935
},
{
"epoch": 0.11267943300668284,
"grad_norm": 106.125,
"learning_rate": 9.86013054482483e-07,
"loss": 98.3336,
"step": 940
},
{
"epoch": 0.11327879169288861,
"grad_norm": 106.0625,
"learning_rate": 9.853470094578394e-07,
"loss": 100.0068,
"step": 945
},
{
"epoch": 0.11387815037909436,
"grad_norm": 101.0625,
"learning_rate": 9.846809644331956e-07,
"loss": 97.691,
"step": 950
},
{
"epoch": 0.11447750906530013,
"grad_norm": 103.5,
"learning_rate": 9.84014919408552e-07,
"loss": 98.8459,
"step": 955
},
{
"epoch": 0.11507686775150588,
"grad_norm": 106.125,
"learning_rate": 9.833488743839084e-07,
"loss": 99.4046,
"step": 960
},
{
"epoch": 0.11567622643771165,
"grad_norm": 103.4375,
"learning_rate": 9.826828293592646e-07,
"loss": 97.2283,
"step": 965
},
{
"epoch": 0.1162755851239174,
"grad_norm": 106.3125,
"learning_rate": 9.82016784334621e-07,
"loss": 99.0077,
"step": 970
},
{
"epoch": 0.11687494381012317,
"grad_norm": 108.375,
"learning_rate": 9.813507393099774e-07,
"loss": 99.1617,
"step": 975
},
{
"epoch": 0.11747430249632893,
"grad_norm": 107.125,
"learning_rate": 9.806846942853336e-07,
"loss": 99.5582,
"step": 980
},
{
"epoch": 0.11807366118253469,
"grad_norm": 108.0625,
"learning_rate": 9.8001864926069e-07,
"loss": 99.7954,
"step": 985
},
{
"epoch": 0.11867301986874045,
"grad_norm": 108.0625,
"learning_rate": 9.793526042360462e-07,
"loss": 99.1493,
"step": 990
},
{
"epoch": 0.11927237855494621,
"grad_norm": 104.5625,
"learning_rate": 9.786865592114026e-07,
"loss": 97.6637,
"step": 995
},
{
"epoch": 0.11987173724115197,
"grad_norm": 107.375,
"learning_rate": 9.78020514186759e-07,
"loss": 97.6329,
"step": 1000
},
{
"epoch": 0.11987173724115197,
"eval_loss": 3.089021682739258,
"eval_runtime": 401.7854,
"eval_samples_per_second": 1119.013,
"eval_steps_per_second": 34.971,
"step": 1000
},
{
"epoch": 0.12047109592735773,
"grad_norm": 105.375,
"learning_rate": 9.773544691621152e-07,
"loss": 98.9716,
"step": 1005
},
{
"epoch": 0.12107045461356349,
"grad_norm": 106.75,
"learning_rate": 9.766884241374716e-07,
"loss": 99.2064,
"step": 1010
},
{
"epoch": 0.12166981329976925,
"grad_norm": 104.8125,
"learning_rate": 9.76022379112828e-07,
"loss": 100.1851,
"step": 1015
},
{
"epoch": 0.122269171985975,
"grad_norm": 103.5625,
"learning_rate": 9.753563340881844e-07,
"loss": 97.3658,
"step": 1020
},
{
"epoch": 0.12286853067218077,
"grad_norm": 106.75,
"learning_rate": 9.746902890635406e-07,
"loss": 99.1893,
"step": 1025
},
{
"epoch": 0.12346788935838653,
"grad_norm": 107.0625,
"learning_rate": 9.74024244038897e-07,
"loss": 98.0062,
"step": 1030
},
{
"epoch": 0.1240672480445923,
"grad_norm": 105.9375,
"learning_rate": 9.733581990142534e-07,
"loss": 98.6342,
"step": 1035
},
{
"epoch": 0.12466660673079805,
"grad_norm": 105.1875,
"learning_rate": 9.726921539896096e-07,
"loss": 98.5687,
"step": 1040
},
{
"epoch": 0.1252659654170038,
"grad_norm": 100.75,
"learning_rate": 9.72026108964966e-07,
"loss": 96.8965,
"step": 1045
},
{
"epoch": 0.12586532410320955,
"grad_norm": 105.0625,
"learning_rate": 9.713600639403224e-07,
"loss": 97.2172,
"step": 1050
},
{
"epoch": 0.12646468278941533,
"grad_norm": 105.625,
"learning_rate": 9.706940189156786e-07,
"loss": 98.1943,
"step": 1055
},
{
"epoch": 0.1270640414756211,
"grad_norm": 106.5,
"learning_rate": 9.70027973891035e-07,
"loss": 97.6631,
"step": 1060
},
{
"epoch": 0.12766340016182684,
"grad_norm": 106.5,
"learning_rate": 9.693619288663914e-07,
"loss": 96.5269,
"step": 1065
},
{
"epoch": 0.1282627588480326,
"grad_norm": 106.1875,
"learning_rate": 9.686958838417476e-07,
"loss": 99.6469,
"step": 1070
},
{
"epoch": 0.12886211753423837,
"grad_norm": 108.75,
"learning_rate": 9.68029838817104e-07,
"loss": 97.0663,
"step": 1075
},
{
"epoch": 0.12946147622044413,
"grad_norm": 101.4375,
"learning_rate": 9.673637937924604e-07,
"loss": 97.5067,
"step": 1080
},
{
"epoch": 0.13006083490664988,
"grad_norm": 108.375,
"learning_rate": 9.666977487678166e-07,
"loss": 98.6691,
"step": 1085
},
{
"epoch": 0.13066019359285563,
"grad_norm": 108.0,
"learning_rate": 9.66031703743173e-07,
"loss": 98.2915,
"step": 1090
},
{
"epoch": 0.13125955227906141,
"grad_norm": 103.1875,
"learning_rate": 9.653656587185294e-07,
"loss": 98.3848,
"step": 1095
},
{
"epoch": 0.13185891096526717,
"grad_norm": 104.1875,
"learning_rate": 9.646996136938856e-07,
"loss": 98.996,
"step": 1100
},
{
"epoch": 0.13245826965147292,
"grad_norm": 104.125,
"learning_rate": 9.64033568669242e-07,
"loss": 99.8995,
"step": 1105
},
{
"epoch": 0.13305762833767867,
"grad_norm": 106.1875,
"learning_rate": 9.633675236445982e-07,
"loss": 98.3694,
"step": 1110
},
{
"epoch": 0.13365698702388445,
"grad_norm": 103.125,
"learning_rate": 9.627014786199546e-07,
"loss": 97.7132,
"step": 1115
},
{
"epoch": 0.1342563457100902,
"grad_norm": 104.125,
"learning_rate": 9.62035433595311e-07,
"loss": 96.71,
"step": 1120
},
{
"epoch": 0.13485570439629596,
"grad_norm": 103.625,
"learning_rate": 9.613693885706672e-07,
"loss": 98.2567,
"step": 1125
},
{
"epoch": 0.13545506308250171,
"grad_norm": 103.625,
"learning_rate": 9.607033435460236e-07,
"loss": 97.9151,
"step": 1130
},
{
"epoch": 0.1360544217687075,
"grad_norm": 103.0,
"learning_rate": 9.6003729852138e-07,
"loss": 97.817,
"step": 1135
},
{
"epoch": 0.13665378045491325,
"grad_norm": 103.25,
"learning_rate": 9.593712534967362e-07,
"loss": 98.3279,
"step": 1140
},
{
"epoch": 0.137253139141119,
"grad_norm": 104.75,
"learning_rate": 9.587052084720926e-07,
"loss": 97.5354,
"step": 1145
},
{
"epoch": 0.13785249782732475,
"grad_norm": 103.875,
"learning_rate": 9.58039163447449e-07,
"loss": 97.3573,
"step": 1150
},
{
"epoch": 0.13845185651353054,
"grad_norm": 105.5,
"learning_rate": 9.573731184228054e-07,
"loss": 97.9208,
"step": 1155
},
{
"epoch": 0.1390512151997363,
"grad_norm": 106.5,
"learning_rate": 9.567070733981616e-07,
"loss": 96.5023,
"step": 1160
},
{
"epoch": 0.13965057388594204,
"grad_norm": 106.5625,
"learning_rate": 9.56041028373518e-07,
"loss": 96.8592,
"step": 1165
},
{
"epoch": 0.1402499325721478,
"grad_norm": 106.75,
"learning_rate": 9.553749833488744e-07,
"loss": 97.1946,
"step": 1170
},
{
"epoch": 0.14084929125835355,
"grad_norm": 106.875,
"learning_rate": 9.547089383242306e-07,
"loss": 97.9764,
"step": 1175
},
{
"epoch": 0.14144864994455933,
"grad_norm": 106.0,
"learning_rate": 9.54042893299587e-07,
"loss": 98.4564,
"step": 1180
},
{
"epoch": 0.14204800863076508,
"grad_norm": 105.5625,
"learning_rate": 9.533768482749433e-07,
"loss": 96.7205,
"step": 1185
},
{
"epoch": 0.14264736731697084,
"grad_norm": 105.6875,
"learning_rate": 9.527108032502996e-07,
"loss": 98.374,
"step": 1190
},
{
"epoch": 0.1432467260031766,
"grad_norm": 106.5625,
"learning_rate": 9.52044758225656e-07,
"loss": 97.8351,
"step": 1195
},
{
"epoch": 0.14384608468938237,
"grad_norm": 104.1875,
"learning_rate": 9.513787132010123e-07,
"loss": 97.8296,
"step": 1200
},
{
"epoch": 0.14444544337558812,
"grad_norm": 103.5625,
"learning_rate": 9.507126681763686e-07,
"loss": 98.0513,
"step": 1205
},
{
"epoch": 0.14504480206179388,
"grad_norm": 107.6875,
"learning_rate": 9.50046623151725e-07,
"loss": 96.9166,
"step": 1210
},
{
"epoch": 0.14564416074799963,
"grad_norm": 105.375,
"learning_rate": 9.493805781270814e-07,
"loss": 98.0969,
"step": 1215
},
{
"epoch": 0.1462435194342054,
"grad_norm": 105.0,
"learning_rate": 9.487145331024376e-07,
"loss": 96.9892,
"step": 1220
},
{
"epoch": 0.14684287812041116,
"grad_norm": 102.625,
"learning_rate": 9.48048488077794e-07,
"loss": 97.9786,
"step": 1225
},
{
"epoch": 0.14744223680661692,
"grad_norm": 101.625,
"learning_rate": 9.473824430531504e-07,
"loss": 96.3052,
"step": 1230
},
{
"epoch": 0.14804159549282267,
"grad_norm": 107.8125,
"learning_rate": 9.467163980285066e-07,
"loss": 97.7565,
"step": 1235
},
{
"epoch": 0.14864095417902845,
"grad_norm": 107.5,
"learning_rate": 9.46050353003863e-07,
"loss": 96.6391,
"step": 1240
},
{
"epoch": 0.1492403128652342,
"grad_norm": 103.375,
"learning_rate": 9.453843079792193e-07,
"loss": 95.5644,
"step": 1245
},
{
"epoch": 0.14983967155143996,
"grad_norm": 106.125,
"learning_rate": 9.447182629545756e-07,
"loss": 95.803,
"step": 1250
},
{
"epoch": 0.1504390302376457,
"grad_norm": 105.875,
"learning_rate": 9.44052217929932e-07,
"loss": 96.3256,
"step": 1255
},
{
"epoch": 0.1510383889238515,
"grad_norm": 103.25,
"learning_rate": 9.433861729052883e-07,
"loss": 98.1857,
"step": 1260
},
{
"epoch": 0.15163774761005724,
"grad_norm": 104.5625,
"learning_rate": 9.427201278806447e-07,
"loss": 97.9819,
"step": 1265
},
{
"epoch": 0.152237106296263,
"grad_norm": 102.6875,
"learning_rate": 9.42054082856001e-07,
"loss": 96.2453,
"step": 1270
},
{
"epoch": 0.15283646498246875,
"grad_norm": 102.75,
"learning_rate": 9.413880378313573e-07,
"loss": 96.0352,
"step": 1275
},
{
"epoch": 0.15343582366867453,
"grad_norm": 104.125,
"learning_rate": 9.407219928067138e-07,
"loss": 97.2261,
"step": 1280
},
{
"epoch": 0.15403518235488028,
"grad_norm": 101.5625,
"learning_rate": 9.400559477820699e-07,
"loss": 95.9944,
"step": 1285
},
{
"epoch": 0.15463454104108604,
"grad_norm": 105.5,
"learning_rate": 9.393899027574263e-07,
"loss": 96.8925,
"step": 1290
},
{
"epoch": 0.1552338997272918,
"grad_norm": 102.0,
"learning_rate": 9.387238577327828e-07,
"loss": 97.0403,
"step": 1295
},
{
"epoch": 0.15583325841349754,
"grad_norm": 102.625,
"learning_rate": 9.38057812708139e-07,
"loss": 95.9254,
"step": 1300
},
{
"epoch": 0.15643261709970332,
"grad_norm": 106.0625,
"learning_rate": 9.373917676834954e-07,
"loss": 97.3174,
"step": 1305
},
{
"epoch": 0.15703197578590908,
"grad_norm": 103.5,
"learning_rate": 9.367257226588518e-07,
"loss": 95.8913,
"step": 1310
},
{
"epoch": 0.15763133447211483,
"grad_norm": 104.375,
"learning_rate": 9.360596776342081e-07,
"loss": 96.4431,
"step": 1315
},
{
"epoch": 0.15823069315832058,
"grad_norm": 105.1875,
"learning_rate": 9.353936326095644e-07,
"loss": 97.7829,
"step": 1320
},
{
"epoch": 0.15883005184452637,
"grad_norm": 103.9375,
"learning_rate": 9.347275875849207e-07,
"loss": 97.3946,
"step": 1325
},
{
"epoch": 0.15942941053073212,
"grad_norm": 103.5625,
"learning_rate": 9.340615425602771e-07,
"loss": 96.9318,
"step": 1330
},
{
"epoch": 0.16002876921693787,
"grad_norm": 103.6875,
"learning_rate": 9.333954975356334e-07,
"loss": 97.0651,
"step": 1335
},
{
"epoch": 0.16062812790314362,
"grad_norm": 103.9375,
"learning_rate": 9.327294525109897e-07,
"loss": 96.876,
"step": 1340
},
{
"epoch": 0.1612274865893494,
"grad_norm": 103.8125,
"learning_rate": 9.320634074863461e-07,
"loss": 96.045,
"step": 1345
},
{
"epoch": 0.16182684527555516,
"grad_norm": 106.375,
"learning_rate": 9.313973624617025e-07,
"loss": 96.8118,
"step": 1350
},
{
"epoch": 0.1624262039617609,
"grad_norm": 104.9375,
"learning_rate": 9.307313174370587e-07,
"loss": 96.7171,
"step": 1355
},
{
"epoch": 0.16302556264796667,
"grad_norm": 104.125,
"learning_rate": 9.300652724124151e-07,
"loss": 94.9321,
"step": 1360
},
{
"epoch": 0.16362492133417245,
"grad_norm": 103.8125,
"learning_rate": 9.293992273877714e-07,
"loss": 95.2336,
"step": 1365
},
{
"epoch": 0.1642242800203782,
"grad_norm": 106.9375,
"learning_rate": 9.287331823631277e-07,
"loss": 96.755,
"step": 1370
},
{
"epoch": 0.16482363870658395,
"grad_norm": 103.5,
"learning_rate": 9.280671373384841e-07,
"loss": 95.191,
"step": 1375
},
{
"epoch": 0.1654229973927897,
"grad_norm": 103.5625,
"learning_rate": 9.274010923138404e-07,
"loss": 96.1609,
"step": 1380
},
{
"epoch": 0.1660223560789955,
"grad_norm": 105.125,
"learning_rate": 9.267350472891967e-07,
"loss": 95.8075,
"step": 1385
},
{
"epoch": 0.16662171476520124,
"grad_norm": 101.75,
"learning_rate": 9.260690022645531e-07,
"loss": 96.2969,
"step": 1390
},
{
"epoch": 0.167221073451407,
"grad_norm": 100.75,
"learning_rate": 9.254029572399094e-07,
"loss": 96.6542,
"step": 1395
},
{
"epoch": 0.16782043213761275,
"grad_norm": 103.3125,
"learning_rate": 9.247369122152658e-07,
"loss": 96.97,
"step": 1400
},
{
"epoch": 0.16841979082381853,
"grad_norm": 101.6875,
"learning_rate": 9.24070867190622e-07,
"loss": 97.02,
"step": 1405
},
{
"epoch": 0.16901914951002428,
"grad_norm": 105.0,
"learning_rate": 9.234048221659784e-07,
"loss": 94.5289,
"step": 1410
},
{
"epoch": 0.16961850819623003,
"grad_norm": 106.5,
"learning_rate": 9.227387771413348e-07,
"loss": 95.6256,
"step": 1415
},
{
"epoch": 0.1702178668824358,
"grad_norm": 102.875,
"learning_rate": 9.22072732116691e-07,
"loss": 95.2031,
"step": 1420
},
{
"epoch": 0.17081722556864154,
"grad_norm": 98.4375,
"learning_rate": 9.214066870920474e-07,
"loss": 96.182,
"step": 1425
},
{
"epoch": 0.17141658425484732,
"grad_norm": 103.0,
"learning_rate": 9.207406420674038e-07,
"loss": 96.1099,
"step": 1430
},
{
"epoch": 0.17201594294105307,
"grad_norm": 104.75,
"learning_rate": 9.2007459704276e-07,
"loss": 96.3921,
"step": 1435
},
{
"epoch": 0.17261530162725883,
"grad_norm": 106.8125,
"learning_rate": 9.194085520181164e-07,
"loss": 95.0675,
"step": 1440
},
{
"epoch": 0.17321466031346458,
"grad_norm": 98.625,
"learning_rate": 9.187425069934727e-07,
"loss": 96.4779,
"step": 1445
},
{
"epoch": 0.17381401899967036,
"grad_norm": 104.0,
"learning_rate": 9.180764619688291e-07,
"loss": 96.3266,
"step": 1450
},
{
"epoch": 0.17441337768587611,
"grad_norm": 101.6875,
"learning_rate": 9.174104169441854e-07,
"loss": 95.1507,
"step": 1455
},
{
"epoch": 0.17501273637208187,
"grad_norm": 105.4375,
"learning_rate": 9.167443719195417e-07,
"loss": 96.3075,
"step": 1460
},
{
"epoch": 0.17561209505828762,
"grad_norm": 104.1875,
"learning_rate": 9.160783268948981e-07,
"loss": 95.6688,
"step": 1465
},
{
"epoch": 0.1762114537444934,
"grad_norm": 106.875,
"learning_rate": 9.154122818702544e-07,
"loss": 96.2328,
"step": 1470
},
{
"epoch": 0.17681081243069915,
"grad_norm": 102.9375,
"learning_rate": 9.147462368456107e-07,
"loss": 95.5972,
"step": 1475
},
{
"epoch": 0.1774101711169049,
"grad_norm": 102.4375,
"learning_rate": 9.140801918209671e-07,
"loss": 96.3281,
"step": 1480
},
{
"epoch": 0.17800952980311066,
"grad_norm": 106.3125,
"learning_rate": 9.134141467963233e-07,
"loss": 95.6763,
"step": 1485
},
{
"epoch": 0.17860888848931644,
"grad_norm": 106.25,
"learning_rate": 9.127481017716797e-07,
"loss": 96.2383,
"step": 1490
},
{
"epoch": 0.1792082471755222,
"grad_norm": 104.625,
"learning_rate": 9.120820567470361e-07,
"loss": 95.6165,
"step": 1495
},
{
"epoch": 0.17980760586172795,
"grad_norm": 108.25,
"learning_rate": 9.114160117223924e-07,
"loss": 94.2036,
"step": 1500
},
{
"epoch": 0.17980760586172795,
"eval_loss": 2.9848318099975586,
"eval_runtime": 403.3245,
"eval_samples_per_second": 1114.743,
"eval_steps_per_second": 34.838,
"step": 1500
},
{
"epoch": 0.1804069645479337,
"grad_norm": 106.5,
"learning_rate": 9.107499666977487e-07,
"loss": 95.92,
"step": 1505
},
{
"epoch": 0.18100632323413948,
"grad_norm": 105.5,
"learning_rate": 9.100839216731051e-07,
"loss": 96.1439,
"step": 1510
},
{
"epoch": 0.18160568192034524,
"grad_norm": 104.0,
"learning_rate": 9.094178766484614e-07,
"loss": 94.9684,
"step": 1515
},
{
"epoch": 0.182205040606551,
"grad_norm": 103.8125,
"learning_rate": 9.087518316238177e-07,
"loss": 95.09,
"step": 1520
},
{
"epoch": 0.18280439929275674,
"grad_norm": 104.6875,
"learning_rate": 9.08085786599174e-07,
"loss": 93.9746,
"step": 1525
},
{
"epoch": 0.18340375797896252,
"grad_norm": 105.5625,
"learning_rate": 9.074197415745304e-07,
"loss": 93.8851,
"step": 1530
},
{
"epoch": 0.18400311666516828,
"grad_norm": 104.125,
"learning_rate": 9.067536965498868e-07,
"loss": 95.6037,
"step": 1535
},
{
"epoch": 0.18460247535137403,
"grad_norm": 104.75,
"learning_rate": 9.06087651525243e-07,
"loss": 94.7618,
"step": 1540
},
{
"epoch": 0.18520183403757978,
"grad_norm": 103.0,
"learning_rate": 9.054216065005994e-07,
"loss": 94.8233,
"step": 1545
},
{
"epoch": 0.18580119272378556,
"grad_norm": 105.375,
"learning_rate": 9.047555614759558e-07,
"loss": 95.4594,
"step": 1550
},
{
"epoch": 0.18640055140999132,
"grad_norm": 105.75,
"learning_rate": 9.04089516451312e-07,
"loss": 94.7827,
"step": 1555
},
{
"epoch": 0.18699991009619707,
"grad_norm": 108.875,
"learning_rate": 9.034234714266684e-07,
"loss": 95.8091,
"step": 1560
},
{
"epoch": 0.18759926878240282,
"grad_norm": 105.9375,
"learning_rate": 9.027574264020248e-07,
"loss": 94.6553,
"step": 1565
},
{
"epoch": 0.18819862746860858,
"grad_norm": 103.5625,
"learning_rate": 9.02091381377381e-07,
"loss": 93.9361,
"step": 1570
},
{
"epoch": 0.18879798615481436,
"grad_norm": 103.5625,
"learning_rate": 9.014253363527374e-07,
"loss": 95.9433,
"step": 1575
},
{
"epoch": 0.1893973448410201,
"grad_norm": 103.875,
"learning_rate": 9.007592913280937e-07,
"loss": 95.7227,
"step": 1580
},
{
"epoch": 0.18999670352722586,
"grad_norm": 106.375,
"learning_rate": 9.000932463034501e-07,
"loss": 94.9062,
"step": 1585
},
{
"epoch": 0.19059606221343162,
"grad_norm": 105.1875,
"learning_rate": 8.994272012788064e-07,
"loss": 96.6315,
"step": 1590
},
{
"epoch": 0.1911954208996374,
"grad_norm": 102.4375,
"learning_rate": 8.987611562541627e-07,
"loss": 96.9965,
"step": 1595
},
{
"epoch": 0.19179477958584315,
"grad_norm": 101.8125,
"learning_rate": 8.980951112295191e-07,
"loss": 95.0741,
"step": 1600
},
{
"epoch": 0.1923941382720489,
"grad_norm": 102.3125,
"learning_rate": 8.974290662048754e-07,
"loss": 94.5174,
"step": 1605
},
{
"epoch": 0.19299349695825466,
"grad_norm": 103.8125,
"learning_rate": 8.967630211802317e-07,
"loss": 94.2278,
"step": 1610
},
{
"epoch": 0.19359285564446044,
"grad_norm": 105.5625,
"learning_rate": 8.960969761555881e-07,
"loss": 94.2003,
"step": 1615
},
{
"epoch": 0.1941922143306662,
"grad_norm": 105.5625,
"learning_rate": 8.954309311309444e-07,
"loss": 94.3427,
"step": 1620
},
{
"epoch": 0.19479157301687194,
"grad_norm": 107.75,
"learning_rate": 8.947648861063007e-07,
"loss": 94.4432,
"step": 1625
},
{
"epoch": 0.1953909317030777,
"grad_norm": 106.5,
"learning_rate": 8.940988410816571e-07,
"loss": 95.272,
"step": 1630
},
{
"epoch": 0.19599029038928348,
"grad_norm": 102.375,
"learning_rate": 8.934327960570134e-07,
"loss": 94.1873,
"step": 1635
},
{
"epoch": 0.19658964907548923,
"grad_norm": 103.375,
"learning_rate": 8.927667510323697e-07,
"loss": 94.2735,
"step": 1640
},
{
"epoch": 0.19718900776169498,
"grad_norm": 102.6875,
"learning_rate": 8.921007060077261e-07,
"loss": 94.2252,
"step": 1645
},
{
"epoch": 0.19778836644790074,
"grad_norm": 106.75,
"learning_rate": 8.914346609830824e-07,
"loss": 94.3895,
"step": 1650
},
{
"epoch": 0.19838772513410652,
"grad_norm": 103.1875,
"learning_rate": 8.907686159584387e-07,
"loss": 95.1937,
"step": 1655
},
{
"epoch": 0.19898708382031227,
"grad_norm": 102.4375,
"learning_rate": 8.90102570933795e-07,
"loss": 94.2603,
"step": 1660
},
{
"epoch": 0.19958644250651802,
"grad_norm": 102.8125,
"learning_rate": 8.894365259091514e-07,
"loss": 94.7452,
"step": 1665
},
{
"epoch": 0.20018580119272378,
"grad_norm": 101.5,
"learning_rate": 8.887704808845078e-07,
"loss": 93.8857,
"step": 1670
},
{
"epoch": 0.20078515987892956,
"grad_norm": 102.9375,
"learning_rate": 8.88104435859864e-07,
"loss": 94.3981,
"step": 1675
},
{
"epoch": 0.2013845185651353,
"grad_norm": 105.375,
"learning_rate": 8.874383908352204e-07,
"loss": 94.9569,
"step": 1680
},
{
"epoch": 0.20198387725134107,
"grad_norm": 106.3125,
"learning_rate": 8.867723458105768e-07,
"loss": 93.7188,
"step": 1685
},
{
"epoch": 0.20258323593754682,
"grad_norm": 102.375,
"learning_rate": 8.86106300785933e-07,
"loss": 95.8018,
"step": 1690
},
{
"epoch": 0.20318259462375257,
"grad_norm": 103.875,
"learning_rate": 8.854402557612894e-07,
"loss": 94.5524,
"step": 1695
},
{
"epoch": 0.20378195330995835,
"grad_norm": 105.0,
"learning_rate": 8.847742107366457e-07,
"loss": 94.7519,
"step": 1700
},
{
"epoch": 0.2043813119961641,
"grad_norm": 103.9375,
"learning_rate": 8.84108165712002e-07,
"loss": 94.7115,
"step": 1705
},
{
"epoch": 0.20498067068236986,
"grad_norm": 100.4375,
"learning_rate": 8.834421206873584e-07,
"loss": 95.3129,
"step": 1710
},
{
"epoch": 0.2055800293685756,
"grad_norm": 103.25,
"learning_rate": 8.827760756627147e-07,
"loss": 94.4636,
"step": 1715
},
{
"epoch": 0.2061793880547814,
"grad_norm": 106.3125,
"learning_rate": 8.821100306380711e-07,
"loss": 94.4248,
"step": 1720
},
{
"epoch": 0.20677874674098715,
"grad_norm": 103.1875,
"learning_rate": 8.814439856134274e-07,
"loss": 93.4922,
"step": 1725
},
{
"epoch": 0.2073781054271929,
"grad_norm": 103.75,
"learning_rate": 8.807779405887837e-07,
"loss": 94.1299,
"step": 1730
},
{
"epoch": 0.20797746411339865,
"grad_norm": 105.75,
"learning_rate": 8.801118955641401e-07,
"loss": 92.188,
"step": 1735
},
{
"epoch": 0.20857682279960443,
"grad_norm": 105.9375,
"learning_rate": 8.794458505394963e-07,
"loss": 93.6489,
"step": 1740
},
{
"epoch": 0.2091761814858102,
"grad_norm": 104.875,
"learning_rate": 8.787798055148527e-07,
"loss": 94.0485,
"step": 1745
},
{
"epoch": 0.20977554017201594,
"grad_norm": 105.75,
"learning_rate": 8.781137604902091e-07,
"loss": 94.2215,
"step": 1750
},
{
"epoch": 0.2103748988582217,
"grad_norm": 103.4375,
"learning_rate": 8.774477154655654e-07,
"loss": 93.8523,
"step": 1755
},
{
"epoch": 0.21097425754442747,
"grad_norm": 99.9375,
"learning_rate": 8.767816704409217e-07,
"loss": 92.7818,
"step": 1760
},
{
"epoch": 0.21157361623063323,
"grad_norm": 104.375,
"learning_rate": 8.761156254162782e-07,
"loss": 93.718,
"step": 1765
},
{
"epoch": 0.21217297491683898,
"grad_norm": 103.375,
"learning_rate": 8.754495803916345e-07,
"loss": 93.3778,
"step": 1770
},
{
"epoch": 0.21277233360304473,
"grad_norm": 103.125,
"learning_rate": 8.747835353669907e-07,
"loss": 93.6893,
"step": 1775
},
{
"epoch": 0.21337169228925051,
"grad_norm": 105.0,
"learning_rate": 8.74117490342347e-07,
"loss": 93.1915,
"step": 1780
},
{
"epoch": 0.21397105097545627,
"grad_norm": 110.9375,
"learning_rate": 8.734514453177035e-07,
"loss": 93.6866,
"step": 1785
},
{
"epoch": 0.21457040966166202,
"grad_norm": 101.625,
"learning_rate": 8.727854002930598e-07,
"loss": 93.3116,
"step": 1790
},
{
"epoch": 0.21516976834786777,
"grad_norm": 105.4375,
"learning_rate": 8.72119355268416e-07,
"loss": 94.4551,
"step": 1795
},
{
"epoch": 0.21576912703407355,
"grad_norm": 101.75,
"learning_rate": 8.714533102437725e-07,
"loss": 93.7837,
"step": 1800
},
{
"epoch": 0.2163684857202793,
"grad_norm": 101.9375,
"learning_rate": 8.707872652191289e-07,
"loss": 94.558,
"step": 1805
},
{
"epoch": 0.21696784440648506,
"grad_norm": 103.6875,
"learning_rate": 8.701212201944851e-07,
"loss": 94.2175,
"step": 1810
},
{
"epoch": 0.21756720309269081,
"grad_norm": 106.125,
"learning_rate": 8.694551751698415e-07,
"loss": 94.034,
"step": 1815
},
{
"epoch": 0.21816656177889657,
"grad_norm": 104.0625,
"learning_rate": 8.687891301451978e-07,
"loss": 92.8112,
"step": 1820
},
{
"epoch": 0.21876592046510235,
"grad_norm": 102.75,
"learning_rate": 8.681230851205541e-07,
"loss": 92.7093,
"step": 1825
},
{
"epoch": 0.2193652791513081,
"grad_norm": 101.4375,
"learning_rate": 8.674570400959105e-07,
"loss": 93.7157,
"step": 1830
},
{
"epoch": 0.21996463783751385,
"grad_norm": 104.0,
"learning_rate": 8.667909950712668e-07,
"loss": 92.883,
"step": 1835
},
{
"epoch": 0.2205639965237196,
"grad_norm": 107.6875,
"learning_rate": 8.661249500466232e-07,
"loss": 93.1181,
"step": 1840
},
{
"epoch": 0.2211633552099254,
"grad_norm": 102.0625,
"learning_rate": 8.654589050219795e-07,
"loss": 92.2143,
"step": 1845
},
{
"epoch": 0.22176271389613114,
"grad_norm": 107.625,
"learning_rate": 8.647928599973358e-07,
"loss": 93.7465,
"step": 1850
},
{
"epoch": 0.2223620725823369,
"grad_norm": 102.375,
"learning_rate": 8.641268149726922e-07,
"loss": 92.4702,
"step": 1855
},
{
"epoch": 0.22296143126854265,
"grad_norm": 104.0625,
"learning_rate": 8.634607699480484e-07,
"loss": 94.137,
"step": 1860
},
{
"epoch": 0.22356078995474843,
"grad_norm": 102.8125,
"learning_rate": 8.627947249234048e-07,
"loss": 92.3921,
"step": 1865
},
{
"epoch": 0.22416014864095418,
"grad_norm": 101.875,
"learning_rate": 8.621286798987612e-07,
"loss": 93.8057,
"step": 1870
},
{
"epoch": 0.22475950732715994,
"grad_norm": 104.9375,
"learning_rate": 8.614626348741174e-07,
"loss": 92.852,
"step": 1875
},
{
"epoch": 0.2253588660133657,
"grad_norm": 103.25,
"learning_rate": 8.607965898494738e-07,
"loss": 92.9432,
"step": 1880
},
{
"epoch": 0.22595822469957147,
"grad_norm": 103.0625,
"learning_rate": 8.601305448248302e-07,
"loss": 91.3228,
"step": 1885
},
{
"epoch": 0.22655758338577722,
"grad_norm": 103.8125,
"learning_rate": 8.594644998001865e-07,
"loss": 92.478,
"step": 1890
},
{
"epoch": 0.22715694207198298,
"grad_norm": 104.5625,
"learning_rate": 8.587984547755428e-07,
"loss": 93.0927,
"step": 1895
},
{
"epoch": 0.22775630075818873,
"grad_norm": 102.25,
"learning_rate": 8.581324097508991e-07,
"loss": 94.3626,
"step": 1900
},
{
"epoch": 0.2283556594443945,
"grad_norm": 104.625,
"learning_rate": 8.574663647262555e-07,
"loss": 92.3862,
"step": 1905
},
{
"epoch": 0.22895501813060026,
"grad_norm": 102.8125,
"learning_rate": 8.568003197016118e-07,
"loss": 92.2181,
"step": 1910
},
{
"epoch": 0.22955437681680602,
"grad_norm": 107.5,
"learning_rate": 8.561342746769681e-07,
"loss": 94.641,
"step": 1915
},
{
"epoch": 0.23015373550301177,
"grad_norm": 100.625,
"learning_rate": 8.554682296523245e-07,
"loss": 92.6945,
"step": 1920
},
{
"epoch": 0.23075309418921755,
"grad_norm": 103.6875,
"learning_rate": 8.548021846276808e-07,
"loss": 94.1129,
"step": 1925
},
{
"epoch": 0.2313524528754233,
"grad_norm": 104.0,
"learning_rate": 8.541361396030371e-07,
"loss": 92.2068,
"step": 1930
},
{
"epoch": 0.23195181156162906,
"grad_norm": 104.875,
"learning_rate": 8.534700945783935e-07,
"loss": 93.1549,
"step": 1935
},
{
"epoch": 0.2325511702478348,
"grad_norm": 104.625,
"learning_rate": 8.528040495537499e-07,
"loss": 90.6698,
"step": 1940
},
{
"epoch": 0.23315052893404056,
"grad_norm": 103.25,
"learning_rate": 8.521380045291061e-07,
"loss": 93.3605,
"step": 1945
},
{
"epoch": 0.23374988762024634,
"grad_norm": 106.0625,
"learning_rate": 8.514719595044625e-07,
"loss": 93.7277,
"step": 1950
},
{
"epoch": 0.2343492463064521,
"grad_norm": 100.25,
"learning_rate": 8.508059144798188e-07,
"loss": 92.605,
"step": 1955
},
{
"epoch": 0.23494860499265785,
"grad_norm": 102.3125,
"learning_rate": 8.501398694551751e-07,
"loss": 93.0283,
"step": 1960
},
{
"epoch": 0.2355479636788636,
"grad_norm": 107.5625,
"learning_rate": 8.494738244305315e-07,
"loss": 91.955,
"step": 1965
},
{
"epoch": 0.23614732236506938,
"grad_norm": 99.375,
"learning_rate": 8.488077794058878e-07,
"loss": 92.1366,
"step": 1970
},
{
"epoch": 0.23674668105127514,
"grad_norm": 104.8125,
"learning_rate": 8.481417343812442e-07,
"loss": 91.6786,
"step": 1975
},
{
"epoch": 0.2373460397374809,
"grad_norm": 103.875,
"learning_rate": 8.474756893566005e-07,
"loss": 91.9743,
"step": 1980
},
{
"epoch": 0.23794539842368664,
"grad_norm": 103.25,
"learning_rate": 8.468096443319568e-07,
"loss": 91.3638,
"step": 1985
},
{
"epoch": 0.23854475710989242,
"grad_norm": 103.3125,
"learning_rate": 8.461435993073132e-07,
"loss": 92.2037,
"step": 1990
},
{
"epoch": 0.23914411579609818,
"grad_norm": 109.25,
"learning_rate": 8.454775542826694e-07,
"loss": 92.2242,
"step": 1995
},
{
"epoch": 0.23974347448230393,
"grad_norm": 103.4375,
"learning_rate": 8.448115092580258e-07,
"loss": 93.88,
"step": 2000
},
{
"epoch": 0.23974347448230393,
"eval_loss": 2.8944661617279053,
"eval_runtime": 404.8634,
"eval_samples_per_second": 1110.505,
"eval_steps_per_second": 34.706,
"step": 2000
},
{
"epoch": 0.24034283316850968,
"grad_norm": 101.5625,
"learning_rate": 8.441454642333822e-07,
"loss": 93.8037,
"step": 2005
},
{
"epoch": 0.24094219185471547,
"grad_norm": 103.5,
"learning_rate": 8.434794192087384e-07,
"loss": 92.7435,
"step": 2010
},
{
"epoch": 0.24154155054092122,
"grad_norm": 100.4375,
"learning_rate": 8.428133741840948e-07,
"loss": 93.3825,
"step": 2015
},
{
"epoch": 0.24214090922712697,
"grad_norm": 103.0,
"learning_rate": 8.421473291594512e-07,
"loss": 91.97,
"step": 2020
},
{
"epoch": 0.24274026791333272,
"grad_norm": 105.75,
"learning_rate": 8.414812841348075e-07,
"loss": 92.3056,
"step": 2025
},
{
"epoch": 0.2433396265995385,
"grad_norm": 101.5625,
"learning_rate": 8.408152391101638e-07,
"loss": 93.6722,
"step": 2030
},
{
"epoch": 0.24393898528574426,
"grad_norm": 104.3125,
"learning_rate": 8.401491940855201e-07,
"loss": 92.8126,
"step": 2035
},
{
"epoch": 0.24453834397195,
"grad_norm": 106.25,
"learning_rate": 8.394831490608765e-07,
"loss": 92.2455,
"step": 2040
},
{
"epoch": 0.24513770265815576,
"grad_norm": 103.8125,
"learning_rate": 8.388171040362328e-07,
"loss": 93.0326,
"step": 2045
},
{
"epoch": 0.24573706134436155,
"grad_norm": 102.4375,
"learning_rate": 8.381510590115891e-07,
"loss": 92.0434,
"step": 2050
},
{
"epoch": 0.2463364200305673,
"grad_norm": 103.9375,
"learning_rate": 8.374850139869455e-07,
"loss": 92.13,
"step": 2055
},
{
"epoch": 0.24693577871677305,
"grad_norm": 103.4375,
"learning_rate": 8.368189689623019e-07,
"loss": 92.0325,
"step": 2060
},
{
"epoch": 0.2475351374029788,
"grad_norm": 105.875,
"learning_rate": 8.361529239376581e-07,
"loss": 92.9597,
"step": 2065
},
{
"epoch": 0.2481344960891846,
"grad_norm": 102.25,
"learning_rate": 8.354868789130145e-07,
"loss": 91.3815,
"step": 2070
},
{
"epoch": 0.24873385477539034,
"grad_norm": 107.0,
"learning_rate": 8.348208338883708e-07,
"loss": 92.8083,
"step": 2075
},
{
"epoch": 0.2493332134615961,
"grad_norm": 106.0,
"learning_rate": 8.341547888637271e-07,
"loss": 90.4569,
"step": 2080
},
{
"epoch": 0.24993257214780185,
"grad_norm": 104.375,
"learning_rate": 8.334887438390835e-07,
"loss": 92.4561,
"step": 2085
},
{
"epoch": 0.2505319308340076,
"grad_norm": 105.125,
"learning_rate": 8.328226988144398e-07,
"loss": 93.1541,
"step": 2090
},
{
"epoch": 0.25113128952021335,
"grad_norm": 103.5625,
"learning_rate": 8.321566537897961e-07,
"loss": 92.0643,
"step": 2095
},
{
"epoch": 0.2517306482064191,
"grad_norm": 103.625,
"learning_rate": 8.314906087651525e-07,
"loss": 92.059,
"step": 2100
},
{
"epoch": 0.2523300068926249,
"grad_norm": 104.0,
"learning_rate": 8.308245637405088e-07,
"loss": 92.2131,
"step": 2105
},
{
"epoch": 0.25292936557883067,
"grad_norm": 101.0,
"learning_rate": 8.301585187158652e-07,
"loss": 92.5779,
"step": 2110
},
{
"epoch": 0.2535287242650364,
"grad_norm": 105.625,
"learning_rate": 8.294924736912214e-07,
"loss": 93.5033,
"step": 2115
},
{
"epoch": 0.2541280829512422,
"grad_norm": 105.0,
"learning_rate": 8.288264286665778e-07,
"loss": 92.5464,
"step": 2120
},
{
"epoch": 0.2547274416374479,
"grad_norm": 105.625,
"learning_rate": 8.281603836419342e-07,
"loss": 91.3447,
"step": 2125
},
{
"epoch": 0.2553268003236537,
"grad_norm": 105.5,
"learning_rate": 8.274943386172904e-07,
"loss": 91.7229,
"step": 2130
},
{
"epoch": 0.25592615900985943,
"grad_norm": 106.125,
"learning_rate": 8.268282935926468e-07,
"loss": 90.8414,
"step": 2135
},
{
"epoch": 0.2565255176960652,
"grad_norm": 103.875,
"learning_rate": 8.261622485680032e-07,
"loss": 91.5145,
"step": 2140
},
{
"epoch": 0.257124876382271,
"grad_norm": 101.5,
"learning_rate": 8.254962035433594e-07,
"loss": 91.8491,
"step": 2145
},
{
"epoch": 0.25772423506847675,
"grad_norm": 105.0,
"learning_rate": 8.248301585187158e-07,
"loss": 92.1283,
"step": 2150
},
{
"epoch": 0.2583235937546825,
"grad_norm": 105.1875,
"learning_rate": 8.241641134940721e-07,
"loss": 91.1556,
"step": 2155
},
{
"epoch": 0.25892295244088825,
"grad_norm": 102.25,
"learning_rate": 8.234980684694285e-07,
"loss": 93.0634,
"step": 2160
},
{
"epoch": 0.259522311127094,
"grad_norm": 101.25,
"learning_rate": 8.228320234447848e-07,
"loss": 91.6604,
"step": 2165
},
{
"epoch": 0.26012166981329976,
"grad_norm": 108.1875,
"learning_rate": 8.221659784201411e-07,
"loss": 90.7036,
"step": 2170
},
{
"epoch": 0.2607210284995055,
"grad_norm": 101.0,
"learning_rate": 8.214999333954975e-07,
"loss": 91.9036,
"step": 2175
},
{
"epoch": 0.26132038718571127,
"grad_norm": 103.9375,
"learning_rate": 8.208338883708538e-07,
"loss": 92.7012,
"step": 2180
},
{
"epoch": 0.2619197458719171,
"grad_norm": 107.0,
"learning_rate": 8.201678433462101e-07,
"loss": 91.6033,
"step": 2185
},
{
"epoch": 0.26251910455812283,
"grad_norm": 99.25,
"learning_rate": 8.195017983215665e-07,
"loss": 91.3827,
"step": 2190
},
{
"epoch": 0.2631184632443286,
"grad_norm": 103.375,
"learning_rate": 8.188357532969227e-07,
"loss": 91.5665,
"step": 2195
},
{
"epoch": 0.26371782193053434,
"grad_norm": 102.4375,
"learning_rate": 8.181697082722791e-07,
"loss": 91.2502,
"step": 2200
},
{
"epoch": 0.2643171806167401,
"grad_norm": 105.25,
"learning_rate": 8.175036632476355e-07,
"loss": 91.9588,
"step": 2205
},
{
"epoch": 0.26491653930294584,
"grad_norm": 102.9375,
"learning_rate": 8.168376182229918e-07,
"loss": 92.1473,
"step": 2210
},
{
"epoch": 0.2655158979891516,
"grad_norm": 101.6875,
"learning_rate": 8.161715731983481e-07,
"loss": 91.0215,
"step": 2215
},
{
"epoch": 0.26611525667535735,
"grad_norm": 103.25,
"learning_rate": 8.155055281737045e-07,
"loss": 91.4128,
"step": 2220
},
{
"epoch": 0.2667146153615631,
"grad_norm": 104.0625,
"learning_rate": 8.148394831490608e-07,
"loss": 91.6982,
"step": 2225
},
{
"epoch": 0.2673139740477689,
"grad_norm": 103.1875,
"learning_rate": 8.141734381244171e-07,
"loss": 91.3595,
"step": 2230
},
{
"epoch": 0.26791333273397466,
"grad_norm": 106.0,
"learning_rate": 8.135073930997734e-07,
"loss": 91.1334,
"step": 2235
},
{
"epoch": 0.2685126914201804,
"grad_norm": 105.8125,
"learning_rate": 8.128413480751298e-07,
"loss": 91.4205,
"step": 2240
},
{
"epoch": 0.26911205010638617,
"grad_norm": 105.125,
"learning_rate": 8.121753030504863e-07,
"loss": 91.8378,
"step": 2245
},
{
"epoch": 0.2697114087925919,
"grad_norm": 101.875,
"learning_rate": 8.115092580258424e-07,
"loss": 90.8697,
"step": 2250
},
{
"epoch": 0.2703107674787977,
"grad_norm": 101.75,
"learning_rate": 8.108432130011989e-07,
"loss": 90.6172,
"step": 2255
},
{
"epoch": 0.27091012616500343,
"grad_norm": 103.6875,
"learning_rate": 8.101771679765553e-07,
"loss": 90.3885,
"step": 2260
},
{
"epoch": 0.2715094848512092,
"grad_norm": 101.6875,
"learning_rate": 8.095111229519114e-07,
"loss": 91.2055,
"step": 2265
},
{
"epoch": 0.272108843537415,
"grad_norm": 104.3125,
"learning_rate": 8.088450779272679e-07,
"loss": 90.61,
"step": 2270
},
{
"epoch": 0.27270820222362074,
"grad_norm": 103.375,
"learning_rate": 8.081790329026243e-07,
"loss": 91.2788,
"step": 2275
},
{
"epoch": 0.2733075609098265,
"grad_norm": 104.1875,
"learning_rate": 8.075129878779805e-07,
"loss": 92.0951,
"step": 2280
},
{
"epoch": 0.27390691959603225,
"grad_norm": 104.375,
"learning_rate": 8.068469428533369e-07,
"loss": 90.7074,
"step": 2285
},
{
"epoch": 0.274506278282238,
"grad_norm": 104.625,
"learning_rate": 8.061808978286932e-07,
"loss": 89.8515,
"step": 2290
},
{
"epoch": 0.27510563696844376,
"grad_norm": 104.25,
"learning_rate": 8.055148528040496e-07,
"loss": 89.7757,
"step": 2295
},
{
"epoch": 0.2757049956546495,
"grad_norm": 102.8125,
"learning_rate": 8.048488077794059e-07,
"loss": 90.4891,
"step": 2300
},
{
"epoch": 0.27630435434085526,
"grad_norm": 102.0,
"learning_rate": 8.041827627547622e-07,
"loss": 90.5568,
"step": 2305
},
{
"epoch": 0.27690371302706107,
"grad_norm": 104.375,
"learning_rate": 8.035167177301186e-07,
"loss": 90.9858,
"step": 2310
},
{
"epoch": 0.2775030717132668,
"grad_norm": 106.8125,
"learning_rate": 8.028506727054749e-07,
"loss": 90.1024,
"step": 2315
},
{
"epoch": 0.2781024303994726,
"grad_norm": 102.25,
"learning_rate": 8.021846276808312e-07,
"loss": 91.8137,
"step": 2320
},
{
"epoch": 0.27870178908567833,
"grad_norm": 102.375,
"learning_rate": 8.015185826561876e-07,
"loss": 90.8296,
"step": 2325
},
{
"epoch": 0.2793011477718841,
"grad_norm": 99.4375,
"learning_rate": 8.008525376315438e-07,
"loss": 89.5984,
"step": 2330
},
{
"epoch": 0.27990050645808984,
"grad_norm": 104.0625,
"learning_rate": 8.001864926069002e-07,
"loss": 91.0086,
"step": 2335
},
{
"epoch": 0.2804998651442956,
"grad_norm": 104.6875,
"learning_rate": 7.995204475822566e-07,
"loss": 90.5621,
"step": 2340
},
{
"epoch": 0.28109922383050134,
"grad_norm": 102.5625,
"learning_rate": 7.988544025576129e-07,
"loss": 90.5947,
"step": 2345
},
{
"epoch": 0.2816985825167071,
"grad_norm": 103.0625,
"learning_rate": 7.981883575329692e-07,
"loss": 91.9352,
"step": 2350
},
{
"epoch": 0.2822979412029129,
"grad_norm": 102.8125,
"learning_rate": 7.975223125083256e-07,
"loss": 90.7241,
"step": 2355
},
{
"epoch": 0.28289729988911866,
"grad_norm": 103.1875,
"learning_rate": 7.968562674836819e-07,
"loss": 90.8843,
"step": 2360
},
{
"epoch": 0.2834966585753244,
"grad_norm": 105.6875,
"learning_rate": 7.961902224590382e-07,
"loss": 91.3989,
"step": 2365
},
{
"epoch": 0.28409601726153016,
"grad_norm": 100.6875,
"learning_rate": 7.955241774343945e-07,
"loss": 89.8323,
"step": 2370
},
{
"epoch": 0.2846953759477359,
"grad_norm": 101.5,
"learning_rate": 7.948581324097509e-07,
"loss": 90.1746,
"step": 2375
},
{
"epoch": 0.28529473463394167,
"grad_norm": 104.0,
"learning_rate": 7.941920873851073e-07,
"loss": 90.6545,
"step": 2380
},
{
"epoch": 0.2858940933201474,
"grad_norm": 104.6875,
"learning_rate": 7.935260423604635e-07,
"loss": 90.5792,
"step": 2385
},
{
"epoch": 0.2864934520063532,
"grad_norm": 104.625,
"learning_rate": 7.928599973358199e-07,
"loss": 91.1984,
"step": 2390
},
{
"epoch": 0.287092810692559,
"grad_norm": 104.5,
"learning_rate": 7.921939523111763e-07,
"loss": 90.3363,
"step": 2395
},
{
"epoch": 0.28769216937876474,
"grad_norm": 101.5,
"learning_rate": 7.915279072865325e-07,
"loss": 90.6117,
"step": 2400
},
{
"epoch": 0.2882915280649705,
"grad_norm": 104.875,
"learning_rate": 7.908618622618889e-07,
"loss": 89.2634,
"step": 2405
},
{
"epoch": 0.28889088675117625,
"grad_norm": 105.625,
"learning_rate": 7.901958172372452e-07,
"loss": 90.1148,
"step": 2410
},
{
"epoch": 0.289490245437382,
"grad_norm": 104.5625,
"learning_rate": 7.895297722126015e-07,
"loss": 91.1194,
"step": 2415
},
{
"epoch": 0.29008960412358775,
"grad_norm": 105.3125,
"learning_rate": 7.888637271879579e-07,
"loss": 90.17,
"step": 2420
},
{
"epoch": 0.2906889628097935,
"grad_norm": 105.6875,
"learning_rate": 7.881976821633142e-07,
"loss": 91.4705,
"step": 2425
},
{
"epoch": 0.29128832149599926,
"grad_norm": 101.8125,
"learning_rate": 7.875316371386706e-07,
"loss": 90.0706,
"step": 2430
},
{
"epoch": 0.29188768018220507,
"grad_norm": 102.5,
"learning_rate": 7.868655921140269e-07,
"loss": 89.9069,
"step": 2435
},
{
"epoch": 0.2924870388684108,
"grad_norm": 103.0625,
"learning_rate": 7.861995470893832e-07,
"loss": 89.592,
"step": 2440
},
{
"epoch": 0.2930863975546166,
"grad_norm": 101.5,
"learning_rate": 7.855335020647396e-07,
"loss": 90.5722,
"step": 2445
},
{
"epoch": 0.2936857562408223,
"grad_norm": 103.9375,
"learning_rate": 7.848674570400958e-07,
"loss": 90.2673,
"step": 2450
},
{
"epoch": 0.2942851149270281,
"grad_norm": 101.6875,
"learning_rate": 7.842014120154522e-07,
"loss": 90.2366,
"step": 2455
},
{
"epoch": 0.29488447361323383,
"grad_norm": 102.25,
"learning_rate": 7.835353669908086e-07,
"loss": 90.1811,
"step": 2460
},
{
"epoch": 0.2954838322994396,
"grad_norm": 102.375,
"learning_rate": 7.828693219661648e-07,
"loss": 91.0067,
"step": 2465
},
{
"epoch": 0.29608319098564534,
"grad_norm": 102.25,
"learning_rate": 7.822032769415212e-07,
"loss": 89.0682,
"step": 2470
},
{
"epoch": 0.2966825496718511,
"grad_norm": 103.25,
"learning_rate": 7.815372319168776e-07,
"loss": 90.7641,
"step": 2475
},
{
"epoch": 0.2972819083580569,
"grad_norm": 106.375,
"learning_rate": 7.808711868922339e-07,
"loss": 92.3741,
"step": 2480
},
{
"epoch": 0.29788126704426265,
"grad_norm": 108.0,
"learning_rate": 7.802051418675902e-07,
"loss": 90.1461,
"step": 2485
},
{
"epoch": 0.2984806257304684,
"grad_norm": 105.9375,
"learning_rate": 7.795390968429465e-07,
"loss": 91.2824,
"step": 2490
},
{
"epoch": 0.29907998441667416,
"grad_norm": 101.625,
"learning_rate": 7.788730518183029e-07,
"loss": 89.7306,
"step": 2495
},
{
"epoch": 0.2996793431028799,
"grad_norm": 103.3125,
"learning_rate": 7.782070067936592e-07,
"loss": 91.2063,
"step": 2500
},
{
"epoch": 0.2996793431028799,
"eval_loss": 2.8105251789093018,
"eval_runtime": 402.9918,
"eval_samples_per_second": 1115.663,
"eval_steps_per_second": 34.867,
"step": 2500
},
{
"epoch": 0.30027870178908567,
"grad_norm": 105.9375,
"learning_rate": 7.775409617690155e-07,
"loss": 89.4656,
"step": 2505
},
{
"epoch": 0.3008780604752914,
"grad_norm": 107.3125,
"learning_rate": 7.768749167443719e-07,
"loss": 89.8508,
"step": 2510
},
{
"epoch": 0.3014774191614972,
"grad_norm": 101.125,
"learning_rate": 7.762088717197283e-07,
"loss": 89.322,
"step": 2515
},
{
"epoch": 0.302076777847703,
"grad_norm": 105.875,
"learning_rate": 7.755428266950845e-07,
"loss": 89.2538,
"step": 2520
},
{
"epoch": 0.30267613653390874,
"grad_norm": 102.1875,
"learning_rate": 7.748767816704409e-07,
"loss": 89.4337,
"step": 2525
},
{
"epoch": 0.3032754952201145,
"grad_norm": 102.5,
"learning_rate": 7.742107366457972e-07,
"loss": 89.8735,
"step": 2530
},
{
"epoch": 0.30387485390632024,
"grad_norm": 102.375,
"learning_rate": 7.735446916211535e-07,
"loss": 90.7425,
"step": 2535
},
{
"epoch": 0.304474212592526,
"grad_norm": 105.75,
"learning_rate": 7.728786465965099e-07,
"loss": 89.6295,
"step": 2540
},
{
"epoch": 0.30507357127873175,
"grad_norm": 100.5625,
"learning_rate": 7.722126015718662e-07,
"loss": 89.3076,
"step": 2545
},
{
"epoch": 0.3056729299649375,
"grad_norm": 103.8125,
"learning_rate": 7.715465565472225e-07,
"loss": 89.8791,
"step": 2550
},
{
"epoch": 0.30627228865114325,
"grad_norm": 101.1875,
"learning_rate": 7.708805115225789e-07,
"loss": 89.6218,
"step": 2555
},
{
"epoch": 0.30687164733734906,
"grad_norm": 103.875,
"learning_rate": 7.702144664979352e-07,
"loss": 90.6524,
"step": 2560
},
{
"epoch": 0.3074710060235548,
"grad_norm": 101.0625,
"learning_rate": 7.695484214732916e-07,
"loss": 89.677,
"step": 2565
},
{
"epoch": 0.30807036470976057,
"grad_norm": 106.75,
"learning_rate": 7.688823764486478e-07,
"loss": 88.4502,
"step": 2570
},
{
"epoch": 0.3086697233959663,
"grad_norm": 105.125,
"learning_rate": 7.682163314240042e-07,
"loss": 91.3921,
"step": 2575
},
{
"epoch": 0.3092690820821721,
"grad_norm": 99.375,
"learning_rate": 7.675502863993606e-07,
"loss": 90.8827,
"step": 2580
},
{
"epoch": 0.30986844076837783,
"grad_norm": 101.6875,
"learning_rate": 7.668842413747168e-07,
"loss": 88.3922,
"step": 2585
},
{
"epoch": 0.3104677994545836,
"grad_norm": 102.4375,
"learning_rate": 7.662181963500732e-07,
"loss": 89.6833,
"step": 2590
},
{
"epoch": 0.31106715814078933,
"grad_norm": 103.125,
"learning_rate": 7.655521513254296e-07,
"loss": 88.9644,
"step": 2595
},
{
"epoch": 0.3116665168269951,
"grad_norm": 102.1875,
"learning_rate": 7.648861063007859e-07,
"loss": 88.5355,
"step": 2600
},
{
"epoch": 0.3122658755132009,
"grad_norm": 105.6875,
"learning_rate": 7.642200612761422e-07,
"loss": 89.3148,
"step": 2605
},
{
"epoch": 0.31286523419940665,
"grad_norm": 100.9375,
"learning_rate": 7.635540162514985e-07,
"loss": 89.9472,
"step": 2610
},
{
"epoch": 0.3134645928856124,
"grad_norm": 103.0,
"learning_rate": 7.628879712268549e-07,
"loss": 89.6862,
"step": 2615
},
{
"epoch": 0.31406395157181816,
"grad_norm": 102.6875,
"learning_rate": 7.622219262022112e-07,
"loss": 88.9315,
"step": 2620
},
{
"epoch": 0.3146633102580239,
"grad_norm": 102.4375,
"learning_rate": 7.615558811775675e-07,
"loss": 89.8684,
"step": 2625
},
{
"epoch": 0.31526266894422966,
"grad_norm": 102.75,
"learning_rate": 7.608898361529239e-07,
"loss": 89.971,
"step": 2630
},
{
"epoch": 0.3158620276304354,
"grad_norm": 104.6875,
"learning_rate": 7.602237911282802e-07,
"loss": 90.4864,
"step": 2635
},
{
"epoch": 0.31646138631664117,
"grad_norm": 102.25,
"learning_rate": 7.595577461036365e-07,
"loss": 89.3933,
"step": 2640
},
{
"epoch": 0.317060745002847,
"grad_norm": 107.5625,
"learning_rate": 7.588917010789929e-07,
"loss": 91.2062,
"step": 2645
},
{
"epoch": 0.31766010368905273,
"grad_norm": 104.1875,
"learning_rate": 7.582256560543493e-07,
"loss": 88.6704,
"step": 2650
},
{
"epoch": 0.3182594623752585,
"grad_norm": 100.75,
"learning_rate": 7.575596110297055e-07,
"loss": 88.1193,
"step": 2655
},
{
"epoch": 0.31885882106146424,
"grad_norm": 103.1875,
"learning_rate": 7.568935660050619e-07,
"loss": 89.7495,
"step": 2660
},
{
"epoch": 0.31945817974767,
"grad_norm": 104.3125,
"learning_rate": 7.562275209804182e-07,
"loss": 89.872,
"step": 2665
},
{
"epoch": 0.32005753843387574,
"grad_norm": 108.1875,
"learning_rate": 7.555614759557745e-07,
"loss": 88.5228,
"step": 2670
},
{
"epoch": 0.3206568971200815,
"grad_norm": 103.5625,
"learning_rate": 7.548954309311309e-07,
"loss": 89.145,
"step": 2675
},
{
"epoch": 0.32125625580628725,
"grad_norm": 101.625,
"learning_rate": 7.542293859064872e-07,
"loss": 87.3988,
"step": 2680
},
{
"epoch": 0.32185561449249306,
"grad_norm": 103.25,
"learning_rate": 7.535633408818435e-07,
"loss": 89.5909,
"step": 2685
},
{
"epoch": 0.3224549731786988,
"grad_norm": 102.5625,
"learning_rate": 7.528972958571999e-07,
"loss": 88.2594,
"step": 2690
},
{
"epoch": 0.32305433186490456,
"grad_norm": 103.125,
"learning_rate": 7.522312508325562e-07,
"loss": 90.1901,
"step": 2695
},
{
"epoch": 0.3236536905511103,
"grad_norm": 105.0,
"learning_rate": 7.515652058079126e-07,
"loss": 88.9892,
"step": 2700
},
{
"epoch": 0.32425304923731607,
"grad_norm": 106.0,
"learning_rate": 7.508991607832688e-07,
"loss": 88.7119,
"step": 2705
},
{
"epoch": 0.3248524079235218,
"grad_norm": 106.8125,
"learning_rate": 7.502331157586252e-07,
"loss": 90.3967,
"step": 2710
},
{
"epoch": 0.3254517666097276,
"grad_norm": 102.0,
"learning_rate": 7.495670707339817e-07,
"loss": 89.0769,
"step": 2715
},
{
"epoch": 0.32605112529593333,
"grad_norm": 101.375,
"learning_rate": 7.489010257093378e-07,
"loss": 88.1126,
"step": 2720
},
{
"epoch": 0.3266504839821391,
"grad_norm": 104.75,
"learning_rate": 7.482349806846942e-07,
"loss": 88.2526,
"step": 2725
},
{
"epoch": 0.3272498426683449,
"grad_norm": 99.625,
"learning_rate": 7.475689356600507e-07,
"loss": 88.5508,
"step": 2730
},
{
"epoch": 0.32784920135455065,
"grad_norm": 106.875,
"learning_rate": 7.46902890635407e-07,
"loss": 89.857,
"step": 2735
},
{
"epoch": 0.3284485600407564,
"grad_norm": 102.0625,
"learning_rate": 7.462368456107633e-07,
"loss": 88.2153,
"step": 2740
},
{
"epoch": 0.32904791872696215,
"grad_norm": 104.4375,
"learning_rate": 7.455708005861196e-07,
"loss": 89.4912,
"step": 2745
},
{
"epoch": 0.3296472774131679,
"grad_norm": 102.625,
"learning_rate": 7.44904755561476e-07,
"loss": 89.2615,
"step": 2750
},
{
"epoch": 0.33024663609937366,
"grad_norm": 104.8125,
"learning_rate": 7.442387105368323e-07,
"loss": 87.596,
"step": 2755
},
{
"epoch": 0.3308459947855794,
"grad_norm": 101.625,
"learning_rate": 7.435726655121886e-07,
"loss": 89.6667,
"step": 2760
},
{
"epoch": 0.33144535347178516,
"grad_norm": 106.3125,
"learning_rate": 7.42906620487545e-07,
"loss": 89.072,
"step": 2765
},
{
"epoch": 0.332044712157991,
"grad_norm": 102.3125,
"learning_rate": 7.422405754629013e-07,
"loss": 89.1722,
"step": 2770
},
{
"epoch": 0.3326440708441967,
"grad_norm": 102.8125,
"learning_rate": 7.415745304382576e-07,
"loss": 88.0948,
"step": 2775
},
{
"epoch": 0.3332434295304025,
"grad_norm": 105.5625,
"learning_rate": 7.40908485413614e-07,
"loss": 88.5523,
"step": 2780
},
{
"epoch": 0.33384278821660823,
"grad_norm": 99.75,
"learning_rate": 7.402424403889703e-07,
"loss": 88.9792,
"step": 2785
},
{
"epoch": 0.334442146902814,
"grad_norm": 101.5,
"learning_rate": 7.395763953643266e-07,
"loss": 89.6106,
"step": 2790
},
{
"epoch": 0.33504150558901974,
"grad_norm": 103.25,
"learning_rate": 7.38910350339683e-07,
"loss": 89.6081,
"step": 2795
},
{
"epoch": 0.3356408642752255,
"grad_norm": 105.625,
"learning_rate": 7.382443053150393e-07,
"loss": 89.297,
"step": 2800
},
{
"epoch": 0.33624022296143125,
"grad_norm": 104.25,
"learning_rate": 7.375782602903956e-07,
"loss": 87.779,
"step": 2805
},
{
"epoch": 0.33683958164763705,
"grad_norm": 102.0625,
"learning_rate": 7.36912215265752e-07,
"loss": 86.9398,
"step": 2810
},
{
"epoch": 0.3374389403338428,
"grad_norm": 106.5625,
"learning_rate": 7.362461702411083e-07,
"loss": 87.6691,
"step": 2815
},
{
"epoch": 0.33803829902004856,
"grad_norm": 104.6875,
"learning_rate": 7.355801252164647e-07,
"loss": 90.144,
"step": 2820
},
{
"epoch": 0.3386376577062543,
"grad_norm": 105.8125,
"learning_rate": 7.349140801918209e-07,
"loss": 88.8413,
"step": 2825
},
{
"epoch": 0.33923701639246007,
"grad_norm": 106.875,
"learning_rate": 7.342480351671773e-07,
"loss": 88.062,
"step": 2830
},
{
"epoch": 0.3398363750786658,
"grad_norm": 105.3125,
"learning_rate": 7.335819901425337e-07,
"loss": 89.8285,
"step": 2835
},
{
"epoch": 0.3404357337648716,
"grad_norm": 104.0625,
"learning_rate": 7.329159451178899e-07,
"loss": 88.6042,
"step": 2840
},
{
"epoch": 0.3410350924510773,
"grad_norm": 104.75,
"learning_rate": 7.322499000932463e-07,
"loss": 89.5081,
"step": 2845
},
{
"epoch": 0.3416344511372831,
"grad_norm": 103.0625,
"learning_rate": 7.315838550686027e-07,
"loss": 88.0691,
"step": 2850
},
{
"epoch": 0.3422338098234889,
"grad_norm": 103.6875,
"learning_rate": 7.309178100439589e-07,
"loss": 87.786,
"step": 2855
},
{
"epoch": 0.34283316850969464,
"grad_norm": 102.6875,
"learning_rate": 7.302517650193153e-07,
"loss": 88.005,
"step": 2860
},
{
"epoch": 0.3434325271959004,
"grad_norm": 101.3125,
"learning_rate": 7.295857199946716e-07,
"loss": 90.2978,
"step": 2865
},
{
"epoch": 0.34403188588210615,
"grad_norm": 104.8125,
"learning_rate": 7.28919674970028e-07,
"loss": 88.8875,
"step": 2870
},
{
"epoch": 0.3446312445683119,
"grad_norm": 104.375,
"learning_rate": 7.282536299453843e-07,
"loss": 88.095,
"step": 2875
},
{
"epoch": 0.34523060325451765,
"grad_norm": 105.5625,
"learning_rate": 7.275875849207406e-07,
"loss": 88.96,
"step": 2880
},
{
"epoch": 0.3458299619407234,
"grad_norm": 104.375,
"learning_rate": 7.26921539896097e-07,
"loss": 88.8805,
"step": 2885
},
{
"epoch": 0.34642932062692916,
"grad_norm": 104.1875,
"learning_rate": 7.262554948714533e-07,
"loss": 88.451,
"step": 2890
},
{
"epoch": 0.34702867931313497,
"grad_norm": 109.0,
"learning_rate": 7.255894498468096e-07,
"loss": 87.8166,
"step": 2895
},
{
"epoch": 0.3476280379993407,
"grad_norm": 101.25,
"learning_rate": 7.24923404822166e-07,
"loss": 87.2797,
"step": 2900
},
{
"epoch": 0.3482273966855465,
"grad_norm": 105.3125,
"learning_rate": 7.242573597975222e-07,
"loss": 87.5472,
"step": 2905
},
{
"epoch": 0.34882675537175223,
"grad_norm": 106.125,
"learning_rate": 7.235913147728786e-07,
"loss": 88.0739,
"step": 2910
},
{
"epoch": 0.349426114057958,
"grad_norm": 103.6875,
"learning_rate": 7.22925269748235e-07,
"loss": 88.6875,
"step": 2915
},
{
"epoch": 0.35002547274416373,
"grad_norm": 102.25,
"learning_rate": 7.222592247235913e-07,
"loss": 88.5786,
"step": 2920
},
{
"epoch": 0.3506248314303695,
"grad_norm": 108.1875,
"learning_rate": 7.215931796989476e-07,
"loss": 88.9486,
"step": 2925
},
{
"epoch": 0.35122419011657524,
"grad_norm": 104.25,
"learning_rate": 7.20927134674304e-07,
"loss": 88.3493,
"step": 2930
},
{
"epoch": 0.35182354880278105,
"grad_norm": 103.25,
"learning_rate": 7.202610896496603e-07,
"loss": 88.4265,
"step": 2935
},
{
"epoch": 0.3524229074889868,
"grad_norm": 101.8125,
"learning_rate": 7.195950446250166e-07,
"loss": 87.42,
"step": 2940
},
{
"epoch": 0.35302226617519256,
"grad_norm": 102.0625,
"learning_rate": 7.189289996003729e-07,
"loss": 86.7885,
"step": 2945
},
{
"epoch": 0.3536216248613983,
"grad_norm": 104.0,
"learning_rate": 7.182629545757293e-07,
"loss": 88.2069,
"step": 2950
},
{
"epoch": 0.35422098354760406,
"grad_norm": 104.875,
"learning_rate": 7.175969095510857e-07,
"loss": 86.0218,
"step": 2955
},
{
"epoch": 0.3548203422338098,
"grad_norm": 100.8125,
"learning_rate": 7.169308645264419e-07,
"loss": 87.1083,
"step": 2960
},
{
"epoch": 0.35541970092001557,
"grad_norm": 106.1875,
"learning_rate": 7.162648195017983e-07,
"loss": 88.2362,
"step": 2965
},
{
"epoch": 0.3560190596062213,
"grad_norm": 103.8125,
"learning_rate": 7.155987744771547e-07,
"loss": 87.8695,
"step": 2970
},
{
"epoch": 0.35661841829242713,
"grad_norm": 100.125,
"learning_rate": 7.149327294525109e-07,
"loss": 87.0618,
"step": 2975
},
{
"epoch": 0.3572177769786329,
"grad_norm": 104.5,
"learning_rate": 7.142666844278673e-07,
"loss": 88.6391,
"step": 2980
},
{
"epoch": 0.35781713566483864,
"grad_norm": 106.125,
"learning_rate": 7.136006394032237e-07,
"loss": 89.9931,
"step": 2985
},
{
"epoch": 0.3584164943510444,
"grad_norm": 105.9375,
"learning_rate": 7.129345943785799e-07,
"loss": 87.4074,
"step": 2990
},
{
"epoch": 0.35901585303725014,
"grad_norm": 104.5625,
"learning_rate": 7.122685493539363e-07,
"loss": 88.1837,
"step": 2995
},
{
"epoch": 0.3596152117234559,
"grad_norm": 106.9375,
"learning_rate": 7.116025043292926e-07,
"loss": 88.1768,
"step": 3000
},
{
"epoch": 0.3596152117234559,
"eval_loss": 2.7459769248962402,
"eval_runtime": 403.4871,
"eval_samples_per_second": 1114.293,
"eval_steps_per_second": 34.824,
"step": 3000
},
{
"epoch": 0.36021457040966165,
"grad_norm": 103.625,
"learning_rate": 7.10936459304649e-07,
"loss": 88.046,
"step": 3005
},
{
"epoch": 0.3608139290958674,
"grad_norm": 103.4375,
"learning_rate": 7.102704142800053e-07,
"loss": 88.8067,
"step": 3010
},
{
"epoch": 0.36141328778207316,
"grad_norm": 106.25,
"learning_rate": 7.096043692553616e-07,
"loss": 88.1824,
"step": 3015
},
{
"epoch": 0.36201264646827896,
"grad_norm": 101.0,
"learning_rate": 7.08938324230718e-07,
"loss": 87.2302,
"step": 3020
},
{
"epoch": 0.3626120051544847,
"grad_norm": 105.75,
"learning_rate": 7.082722792060743e-07,
"loss": 88.3142,
"step": 3025
},
{
"epoch": 0.36321136384069047,
"grad_norm": 105.5625,
"learning_rate": 7.076062341814306e-07,
"loss": 87.6098,
"step": 3030
},
{
"epoch": 0.3638107225268962,
"grad_norm": 103.125,
"learning_rate": 7.06940189156787e-07,
"loss": 88.7042,
"step": 3035
},
{
"epoch": 0.364410081213102,
"grad_norm": 104.0625,
"learning_rate": 7.062741441321432e-07,
"loss": 87.8503,
"step": 3040
},
{
"epoch": 0.36500943989930773,
"grad_norm": 102.75,
"learning_rate": 7.056080991074996e-07,
"loss": 87.639,
"step": 3045
},
{
"epoch": 0.3656087985855135,
"grad_norm": 105.1875,
"learning_rate": 7.04942054082856e-07,
"loss": 87.0323,
"step": 3050
},
{
"epoch": 0.36620815727171924,
"grad_norm": 102.0625,
"learning_rate": 7.042760090582123e-07,
"loss": 87.2938,
"step": 3055
},
{
"epoch": 0.36680751595792505,
"grad_norm": 102.625,
"learning_rate": 7.036099640335686e-07,
"loss": 87.2754,
"step": 3060
},
{
"epoch": 0.3674068746441308,
"grad_norm": 103.6875,
"learning_rate": 7.02943919008925e-07,
"loss": 87.5661,
"step": 3065
},
{
"epoch": 0.36800623333033655,
"grad_norm": 105.4375,
"learning_rate": 7.022778739842813e-07,
"loss": 87.7275,
"step": 3070
},
{
"epoch": 0.3686055920165423,
"grad_norm": 103.75,
"learning_rate": 7.016118289596376e-07,
"loss": 86.8911,
"step": 3075
},
{
"epoch": 0.36920495070274806,
"grad_norm": 103.6875,
"learning_rate": 7.009457839349939e-07,
"loss": 88.436,
"step": 3080
},
{
"epoch": 0.3698043093889538,
"grad_norm": 102.1875,
"learning_rate": 7.002797389103503e-07,
"loss": 87.3927,
"step": 3085
},
{
"epoch": 0.37040366807515956,
"grad_norm": 102.3125,
"learning_rate": 6.996136938857067e-07,
"loss": 86.9897,
"step": 3090
},
{
"epoch": 0.3710030267613653,
"grad_norm": 104.9375,
"learning_rate": 6.989476488610629e-07,
"loss": 87.0566,
"step": 3095
},
{
"epoch": 0.3716023854475711,
"grad_norm": 106.5,
"learning_rate": 6.982816038364193e-07,
"loss": 86.4962,
"step": 3100
},
{
"epoch": 0.3722017441337769,
"grad_norm": 104.75,
"learning_rate": 6.976155588117757e-07,
"loss": 87.2792,
"step": 3105
},
{
"epoch": 0.37280110281998263,
"grad_norm": 104.1875,
"learning_rate": 6.969495137871319e-07,
"loss": 88.572,
"step": 3110
},
{
"epoch": 0.3734004615061884,
"grad_norm": 105.3125,
"learning_rate": 6.962834687624883e-07,
"loss": 86.962,
"step": 3115
},
{
"epoch": 0.37399982019239414,
"grad_norm": 105.5,
"learning_rate": 6.956174237378446e-07,
"loss": 88.2113,
"step": 3120
},
{
"epoch": 0.3745991788785999,
"grad_norm": 107.125,
"learning_rate": 6.949513787132009e-07,
"loss": 86.343,
"step": 3125
},
{
"epoch": 0.37519853756480565,
"grad_norm": 106.875,
"learning_rate": 6.942853336885573e-07,
"loss": 85.8898,
"step": 3130
},
{
"epoch": 0.3757978962510114,
"grad_norm": 102.625,
"learning_rate": 6.936192886639136e-07,
"loss": 86.6348,
"step": 3135
},
{
"epoch": 0.37639725493721715,
"grad_norm": 103.3125,
"learning_rate": 6.9295324363927e-07,
"loss": 87.8125,
"step": 3140
},
{
"epoch": 0.37699661362342296,
"grad_norm": 102.1875,
"learning_rate": 6.922871986146263e-07,
"loss": 88.9906,
"step": 3145
},
{
"epoch": 0.3775959723096287,
"grad_norm": 101.1875,
"learning_rate": 6.916211535899826e-07,
"loss": 87.7328,
"step": 3150
},
{
"epoch": 0.37819533099583447,
"grad_norm": 105.875,
"learning_rate": 6.90955108565339e-07,
"loss": 87.4569,
"step": 3155
},
{
"epoch": 0.3787946896820402,
"grad_norm": 104.0625,
"learning_rate": 6.902890635406952e-07,
"loss": 87.0657,
"step": 3160
},
{
"epoch": 0.379394048368246,
"grad_norm": 108.6875,
"learning_rate": 6.896230185160516e-07,
"loss": 88.1625,
"step": 3165
},
{
"epoch": 0.3799934070544517,
"grad_norm": 103.875,
"learning_rate": 6.88956973491408e-07,
"loss": 87.7685,
"step": 3170
},
{
"epoch": 0.3805927657406575,
"grad_norm": 103.9375,
"learning_rate": 6.882909284667642e-07,
"loss": 86.1103,
"step": 3175
},
{
"epoch": 0.38119212442686323,
"grad_norm": 103.9375,
"learning_rate": 6.876248834421206e-07,
"loss": 87.3419,
"step": 3180
},
{
"epoch": 0.38179148311306904,
"grad_norm": 100.8125,
"learning_rate": 6.86958838417477e-07,
"loss": 87.24,
"step": 3185
},
{
"epoch": 0.3823908417992748,
"grad_norm": 105.8125,
"learning_rate": 6.862927933928333e-07,
"loss": 87.5693,
"step": 3190
},
{
"epoch": 0.38299020048548055,
"grad_norm": 103.9375,
"learning_rate": 6.856267483681896e-07,
"loss": 85.8992,
"step": 3195
},
{
"epoch": 0.3835895591716863,
"grad_norm": 102.0,
"learning_rate": 6.849607033435459e-07,
"loss": 86.0182,
"step": 3200
},
{
"epoch": 0.38418891785789205,
"grad_norm": 101.5,
"learning_rate": 6.842946583189024e-07,
"loss": 86.133,
"step": 3205
},
{
"epoch": 0.3847882765440978,
"grad_norm": 100.3125,
"learning_rate": 6.836286132942586e-07,
"loss": 86.924,
"step": 3210
},
{
"epoch": 0.38538763523030356,
"grad_norm": 105.5625,
"learning_rate": 6.82962568269615e-07,
"loss": 87.4621,
"step": 3215
},
{
"epoch": 0.3859869939165093,
"grad_norm": 104.3125,
"learning_rate": 6.822965232449714e-07,
"loss": 86.111,
"step": 3220
},
{
"epoch": 0.3865863526027151,
"grad_norm": 103.75,
"learning_rate": 6.816304782203278e-07,
"loss": 87.4626,
"step": 3225
},
{
"epoch": 0.3871857112889209,
"grad_norm": 106.0,
"learning_rate": 6.80964433195684e-07,
"loss": 88.3997,
"step": 3230
},
{
"epoch": 0.38778506997512663,
"grad_norm": 106.4375,
"learning_rate": 6.802983881710404e-07,
"loss": 87.3667,
"step": 3235
},
{
"epoch": 0.3883844286613324,
"grad_norm": 103.9375,
"learning_rate": 6.796323431463967e-07,
"loss": 86.2968,
"step": 3240
},
{
"epoch": 0.38898378734753813,
"grad_norm": 105.875,
"learning_rate": 6.78966298121753e-07,
"loss": 89.0274,
"step": 3245
},
{
"epoch": 0.3895831460337439,
"grad_norm": 101.75,
"learning_rate": 6.783002530971094e-07,
"loss": 85.2452,
"step": 3250
},
{
"epoch": 0.39018250471994964,
"grad_norm": 104.25,
"learning_rate": 6.776342080724657e-07,
"loss": 86.6051,
"step": 3255
},
{
"epoch": 0.3907818634061554,
"grad_norm": 104.875,
"learning_rate": 6.76968163047822e-07,
"loss": 85.579,
"step": 3260
},
{
"epoch": 0.39138122209236115,
"grad_norm": 103.4375,
"learning_rate": 6.763021180231784e-07,
"loss": 86.8788,
"step": 3265
},
{
"epoch": 0.39198058077856696,
"grad_norm": 105.0625,
"learning_rate": 6.756360729985347e-07,
"loss": 88.0645,
"step": 3270
},
{
"epoch": 0.3925799394647727,
"grad_norm": 103.25,
"learning_rate": 6.749700279738911e-07,
"loss": 87.0825,
"step": 3275
},
{
"epoch": 0.39317929815097846,
"grad_norm": 105.5,
"learning_rate": 6.743039829492473e-07,
"loss": 86.4466,
"step": 3280
},
{
"epoch": 0.3937786568371842,
"grad_norm": 105.0,
"learning_rate": 6.736379379246037e-07,
"loss": 87.9878,
"step": 3285
},
{
"epoch": 0.39437801552338997,
"grad_norm": 106.3125,
"learning_rate": 6.729718928999601e-07,
"loss": 87.5672,
"step": 3290
},
{
"epoch": 0.3949773742095957,
"grad_norm": 103.0625,
"learning_rate": 6.723058478753163e-07,
"loss": 87.317,
"step": 3295
},
{
"epoch": 0.3955767328958015,
"grad_norm": 102.1875,
"learning_rate": 6.716398028506727e-07,
"loss": 86.4539,
"step": 3300
},
{
"epoch": 0.39617609158200723,
"grad_norm": 99.5625,
"learning_rate": 6.709737578260291e-07,
"loss": 85.5183,
"step": 3305
},
{
"epoch": 0.39677545026821304,
"grad_norm": 101.5,
"learning_rate": 6.703077128013853e-07,
"loss": 86.3448,
"step": 3310
},
{
"epoch": 0.3973748089544188,
"grad_norm": 101.4375,
"learning_rate": 6.696416677767417e-07,
"loss": 87.9496,
"step": 3315
},
{
"epoch": 0.39797416764062454,
"grad_norm": 101.4375,
"learning_rate": 6.68975622752098e-07,
"loss": 86.568,
"step": 3320
},
{
"epoch": 0.3985735263268303,
"grad_norm": 100.75,
"learning_rate": 6.683095777274544e-07,
"loss": 84.8123,
"step": 3325
},
{
"epoch": 0.39917288501303605,
"grad_norm": 105.4375,
"learning_rate": 6.676435327028107e-07,
"loss": 87.7344,
"step": 3330
},
{
"epoch": 0.3997722436992418,
"grad_norm": 101.375,
"learning_rate": 6.66977487678167e-07,
"loss": 85.0206,
"step": 3335
},
{
"epoch": 0.40037160238544756,
"grad_norm": 105.3125,
"learning_rate": 6.663114426535234e-07,
"loss": 87.3425,
"step": 3340
},
{
"epoch": 0.4009709610716533,
"grad_norm": 104.25,
"learning_rate": 6.656453976288797e-07,
"loss": 86.7892,
"step": 3345
},
{
"epoch": 0.4015703197578591,
"grad_norm": 104.9375,
"learning_rate": 6.64979352604236e-07,
"loss": 85.9361,
"step": 3350
},
{
"epoch": 0.40216967844406487,
"grad_norm": 105.875,
"learning_rate": 6.643133075795924e-07,
"loss": 87.2027,
"step": 3355
},
{
"epoch": 0.4027690371302706,
"grad_norm": 102.6875,
"learning_rate": 6.636472625549488e-07,
"loss": 85.9342,
"step": 3360
},
{
"epoch": 0.4033683958164764,
"grad_norm": 104.875,
"learning_rate": 6.62981217530305e-07,
"loss": 85.8879,
"step": 3365
},
{
"epoch": 0.40396775450268213,
"grad_norm": 105.8125,
"learning_rate": 6.623151725056614e-07,
"loss": 86.2892,
"step": 3370
},
{
"epoch": 0.4045671131888879,
"grad_norm": 103.125,
"learning_rate": 6.616491274810177e-07,
"loss": 86.4097,
"step": 3375
},
{
"epoch": 0.40516647187509364,
"grad_norm": 106.625,
"learning_rate": 6.60983082456374e-07,
"loss": 86.6329,
"step": 3380
},
{
"epoch": 0.4057658305612994,
"grad_norm": 105.5625,
"learning_rate": 6.603170374317304e-07,
"loss": 85.9894,
"step": 3385
},
{
"epoch": 0.40636518924750514,
"grad_norm": 103.5625,
"learning_rate": 6.596509924070867e-07,
"loss": 86.8589,
"step": 3390
},
{
"epoch": 0.40696454793371095,
"grad_norm": 104.6875,
"learning_rate": 6.58984947382443e-07,
"loss": 85.3977,
"step": 3395
},
{
"epoch": 0.4075639066199167,
"grad_norm": 104.0625,
"learning_rate": 6.583189023577994e-07,
"loss": 85.5662,
"step": 3400
},
{
"epoch": 0.40816326530612246,
"grad_norm": 107.0625,
"learning_rate": 6.576528573331557e-07,
"loss": 87.1105,
"step": 3405
},
{
"epoch": 0.4087626239923282,
"grad_norm": 104.1875,
"learning_rate": 6.569868123085121e-07,
"loss": 86.211,
"step": 3410
},
{
"epoch": 0.40936198267853396,
"grad_norm": 102.5625,
"learning_rate": 6.563207672838683e-07,
"loss": 86.7257,
"step": 3415
},
{
"epoch": 0.4099613413647397,
"grad_norm": 106.1875,
"learning_rate": 6.556547222592247e-07,
"loss": 85.0465,
"step": 3420
},
{
"epoch": 0.41056070005094547,
"grad_norm": 104.1875,
"learning_rate": 6.549886772345811e-07,
"loss": 86.0544,
"step": 3425
},
{
"epoch": 0.4111600587371512,
"grad_norm": 100.5,
"learning_rate": 6.543226322099373e-07,
"loss": 84.8226,
"step": 3430
},
{
"epoch": 0.41175941742335703,
"grad_norm": 105.1875,
"learning_rate": 6.536565871852937e-07,
"loss": 86.4019,
"step": 3435
},
{
"epoch": 0.4123587761095628,
"grad_norm": 101.875,
"learning_rate": 6.529905421606501e-07,
"loss": 85.6632,
"step": 3440
},
{
"epoch": 0.41295813479576854,
"grad_norm": 103.75,
"learning_rate": 6.523244971360063e-07,
"loss": 87.07,
"step": 3445
},
{
"epoch": 0.4135574934819743,
"grad_norm": 111.1875,
"learning_rate": 6.516584521113627e-07,
"loss": 86.681,
"step": 3450
},
{
"epoch": 0.41415685216818005,
"grad_norm": 102.5,
"learning_rate": 6.50992407086719e-07,
"loss": 86.0613,
"step": 3455
},
{
"epoch": 0.4147562108543858,
"grad_norm": 103.4375,
"learning_rate": 6.503263620620754e-07,
"loss": 87.7511,
"step": 3460
},
{
"epoch": 0.41535556954059155,
"grad_norm": 103.5625,
"learning_rate": 6.496603170374317e-07,
"loss": 85.9475,
"step": 3465
},
{
"epoch": 0.4159549282267973,
"grad_norm": 104.75,
"learning_rate": 6.48994272012788e-07,
"loss": 84.7282,
"step": 3470
},
{
"epoch": 0.4165542869130031,
"grad_norm": 101.5625,
"learning_rate": 6.483282269881444e-07,
"loss": 85.3223,
"step": 3475
},
{
"epoch": 0.41715364559920887,
"grad_norm": 105.3125,
"learning_rate": 6.476621819635007e-07,
"loss": 85.0698,
"step": 3480
},
{
"epoch": 0.4177530042854146,
"grad_norm": 102.8125,
"learning_rate": 6.46996136938857e-07,
"loss": 86.7545,
"step": 3485
},
{
"epoch": 0.4183523629716204,
"grad_norm": 107.75,
"learning_rate": 6.463300919142134e-07,
"loss": 86.1996,
"step": 3490
},
{
"epoch": 0.4189517216578261,
"grad_norm": 101.125,
"learning_rate": 6.456640468895697e-07,
"loss": 86.6891,
"step": 3495
},
{
"epoch": 0.4195510803440319,
"grad_norm": 103.375,
"learning_rate": 6.44998001864926e-07,
"loss": 86.8633,
"step": 3500
},
{
"epoch": 0.4195510803440319,
"eval_loss": 2.689061403274536,
"eval_runtime": 405.5103,
"eval_samples_per_second": 1108.734,
"eval_steps_per_second": 34.65,
"step": 3500
},
{
"epoch": 0.42015043903023763,
"grad_norm": 104.875,
"learning_rate": 6.443319568402824e-07,
"loss": 86.7122,
"step": 3505
},
{
"epoch": 0.4207497977164434,
"grad_norm": 102.1875,
"learning_rate": 6.436659118156387e-07,
"loss": 85.7782,
"step": 3510
},
{
"epoch": 0.42134915640264914,
"grad_norm": 102.0,
"learning_rate": 6.42999866790995e-07,
"loss": 86.5698,
"step": 3515
},
{
"epoch": 0.42194851508885495,
"grad_norm": 106.0,
"learning_rate": 6.423338217663514e-07,
"loss": 85.1612,
"step": 3520
},
{
"epoch": 0.4225478737750607,
"grad_norm": 102.4375,
"learning_rate": 6.416677767417077e-07,
"loss": 86.2275,
"step": 3525
},
{
"epoch": 0.42314723246126645,
"grad_norm": 107.1875,
"learning_rate": 6.41001731717064e-07,
"loss": 84.2113,
"step": 3530
},
{
"epoch": 0.4237465911474722,
"grad_norm": 103.5,
"learning_rate": 6.403356866924203e-07,
"loss": 86.9985,
"step": 3535
},
{
"epoch": 0.42434594983367796,
"grad_norm": 105.875,
"learning_rate": 6.396696416677767e-07,
"loss": 84.5887,
"step": 3540
},
{
"epoch": 0.4249453085198837,
"grad_norm": 103.375,
"learning_rate": 6.390035966431331e-07,
"loss": 86.1314,
"step": 3545
},
{
"epoch": 0.42554466720608947,
"grad_norm": 107.25,
"learning_rate": 6.383375516184893e-07,
"loss": 86.0269,
"step": 3550
},
{
"epoch": 0.4261440258922952,
"grad_norm": 106.0,
"learning_rate": 6.376715065938457e-07,
"loss": 86.8127,
"step": 3555
},
{
"epoch": 0.42674338457850103,
"grad_norm": 105.6875,
"learning_rate": 6.370054615692021e-07,
"loss": 86.3978,
"step": 3560
},
{
"epoch": 0.4273427432647068,
"grad_norm": 103.9375,
"learning_rate": 6.363394165445583e-07,
"loss": 83.8842,
"step": 3565
},
{
"epoch": 0.42794210195091253,
"grad_norm": 104.6875,
"learning_rate": 6.356733715199147e-07,
"loss": 85.6649,
"step": 3570
},
{
"epoch": 0.4285414606371183,
"grad_norm": 106.3125,
"learning_rate": 6.35007326495271e-07,
"loss": 86.1648,
"step": 3575
},
{
"epoch": 0.42914081932332404,
"grad_norm": 104.5,
"learning_rate": 6.343412814706273e-07,
"loss": 84.8243,
"step": 3580
},
{
"epoch": 0.4297401780095298,
"grad_norm": 103.125,
"learning_rate": 6.336752364459837e-07,
"loss": 86.0243,
"step": 3585
},
{
"epoch": 0.43033953669573555,
"grad_norm": 102.9375,
"learning_rate": 6.3300919142134e-07,
"loss": 85.5283,
"step": 3590
},
{
"epoch": 0.4309388953819413,
"grad_norm": 100.0625,
"learning_rate": 6.323431463966964e-07,
"loss": 84.9534,
"step": 3595
},
{
"epoch": 0.4315382540681471,
"grad_norm": 102.0,
"learning_rate": 6.316771013720527e-07,
"loss": 86.3156,
"step": 3600
},
{
"epoch": 0.43213761275435286,
"grad_norm": 102.0625,
"learning_rate": 6.31011056347409e-07,
"loss": 85.5763,
"step": 3605
},
{
"epoch": 0.4327369714405586,
"grad_norm": 101.0625,
"learning_rate": 6.303450113227654e-07,
"loss": 86.5275,
"step": 3610
},
{
"epoch": 0.43333633012676437,
"grad_norm": 108.0,
"learning_rate": 6.296789662981216e-07,
"loss": 85.2912,
"step": 3615
},
{
"epoch": 0.4339356888129701,
"grad_norm": 102.875,
"learning_rate": 6.29012921273478e-07,
"loss": 85.6715,
"step": 3620
},
{
"epoch": 0.4345350474991759,
"grad_norm": 102.8125,
"learning_rate": 6.283468762488344e-07,
"loss": 85.735,
"step": 3625
},
{
"epoch": 0.43513440618538163,
"grad_norm": 103.25,
"learning_rate": 6.276808312241907e-07,
"loss": 84.4985,
"step": 3630
},
{
"epoch": 0.4357337648715874,
"grad_norm": 101.5625,
"learning_rate": 6.27014786199547e-07,
"loss": 85.695,
"step": 3635
},
{
"epoch": 0.43633312355779313,
"grad_norm": 104.875,
"learning_rate": 6.263487411749034e-07,
"loss": 85.4679,
"step": 3640
},
{
"epoch": 0.43693248224399894,
"grad_norm": 101.625,
"learning_rate": 6.256826961502597e-07,
"loss": 86.5071,
"step": 3645
},
{
"epoch": 0.4375318409302047,
"grad_norm": 108.875,
"learning_rate": 6.25016651125616e-07,
"loss": 85.3634,
"step": 3650
},
{
"epoch": 0.43813119961641045,
"grad_norm": 104.0625,
"learning_rate": 6.243506061009723e-07,
"loss": 84.9664,
"step": 3655
},
{
"epoch": 0.4387305583026162,
"grad_norm": 103.3125,
"learning_rate": 6.236845610763287e-07,
"loss": 85.6819,
"step": 3660
},
{
"epoch": 0.43932991698882196,
"grad_norm": 108.125,
"learning_rate": 6.23018516051685e-07,
"loss": 86.4368,
"step": 3665
},
{
"epoch": 0.4399292756750277,
"grad_norm": 107.375,
"learning_rate": 6.223524710270413e-07,
"loss": 84.9657,
"step": 3670
},
{
"epoch": 0.44052863436123346,
"grad_norm": 103.1875,
"learning_rate": 6.216864260023977e-07,
"loss": 84.2119,
"step": 3675
},
{
"epoch": 0.4411279930474392,
"grad_norm": 103.6875,
"learning_rate": 6.210203809777542e-07,
"loss": 85.9094,
"step": 3680
},
{
"epoch": 0.441727351733645,
"grad_norm": 102.5,
"learning_rate": 6.203543359531103e-07,
"loss": 84.6232,
"step": 3685
},
{
"epoch": 0.4423267104198508,
"grad_norm": 103.5625,
"learning_rate": 6.196882909284668e-07,
"loss": 86.2957,
"step": 3690
},
{
"epoch": 0.44292606910605653,
"grad_norm": 105.0625,
"learning_rate": 6.190222459038232e-07,
"loss": 84.6409,
"step": 3695
},
{
"epoch": 0.4435254277922623,
"grad_norm": 109.5625,
"learning_rate": 6.183562008791793e-07,
"loss": 86.1193,
"step": 3700
},
{
"epoch": 0.44412478647846804,
"grad_norm": 103.125,
"learning_rate": 6.176901558545358e-07,
"loss": 85.6869,
"step": 3705
},
{
"epoch": 0.4447241451646738,
"grad_norm": 106.5625,
"learning_rate": 6.170241108298921e-07,
"loss": 86.3559,
"step": 3710
},
{
"epoch": 0.44532350385087954,
"grad_norm": 103.0,
"learning_rate": 6.163580658052485e-07,
"loss": 84.8422,
"step": 3715
},
{
"epoch": 0.4459228625370853,
"grad_norm": 107.5625,
"learning_rate": 6.156920207806048e-07,
"loss": 85.5576,
"step": 3720
},
{
"epoch": 0.4465222212232911,
"grad_norm": 104.125,
"learning_rate": 6.150259757559611e-07,
"loss": 84.6006,
"step": 3725
},
{
"epoch": 0.44712157990949686,
"grad_norm": 105.875,
"learning_rate": 6.143599307313175e-07,
"loss": 86.1678,
"step": 3730
},
{
"epoch": 0.4477209385957026,
"grad_norm": 104.25,
"learning_rate": 6.136938857066738e-07,
"loss": 84.4279,
"step": 3735
},
{
"epoch": 0.44832029728190836,
"grad_norm": 105.3125,
"learning_rate": 6.130278406820301e-07,
"loss": 84.0661,
"step": 3740
},
{
"epoch": 0.4489196559681141,
"grad_norm": 103.625,
"learning_rate": 6.123617956573865e-07,
"loss": 84.5492,
"step": 3745
},
{
"epoch": 0.44951901465431987,
"grad_norm": 102.375,
"learning_rate": 6.116957506327427e-07,
"loss": 85.9095,
"step": 3750
},
{
"epoch": 0.4501183733405256,
"grad_norm": 108.3125,
"learning_rate": 6.110297056080991e-07,
"loss": 85.9708,
"step": 3755
},
{
"epoch": 0.4507177320267314,
"grad_norm": 103.875,
"learning_rate": 6.103636605834555e-07,
"loss": 86.4198,
"step": 3760
},
{
"epoch": 0.45131709071293713,
"grad_norm": 103.1875,
"learning_rate": 6.096976155588118e-07,
"loss": 86.0834,
"step": 3765
},
{
"epoch": 0.45191644939914294,
"grad_norm": 106.875,
"learning_rate": 6.090315705341681e-07,
"loss": 84.9955,
"step": 3770
},
{
"epoch": 0.4525158080853487,
"grad_norm": 105.625,
"learning_rate": 6.083655255095245e-07,
"loss": 85.3397,
"step": 3775
},
{
"epoch": 0.45311516677155445,
"grad_norm": 104.5,
"learning_rate": 6.076994804848808e-07,
"loss": 86.0351,
"step": 3780
},
{
"epoch": 0.4537145254577602,
"grad_norm": 100.875,
"learning_rate": 6.070334354602371e-07,
"loss": 84.0522,
"step": 3785
},
{
"epoch": 0.45431388414396595,
"grad_norm": 104.75,
"learning_rate": 6.063673904355934e-07,
"loss": 83.8673,
"step": 3790
},
{
"epoch": 0.4549132428301717,
"grad_norm": 105.625,
"learning_rate": 6.057013454109498e-07,
"loss": 84.8843,
"step": 3795
},
{
"epoch": 0.45551260151637746,
"grad_norm": 102.375,
"learning_rate": 6.050353003863061e-07,
"loss": 85.575,
"step": 3800
},
{
"epoch": 0.4561119602025832,
"grad_norm": 105.375,
"learning_rate": 6.043692553616624e-07,
"loss": 85.2,
"step": 3805
},
{
"epoch": 0.456711318888789,
"grad_norm": 103.125,
"learning_rate": 6.037032103370188e-07,
"loss": 86.2768,
"step": 3810
},
{
"epoch": 0.4573106775749948,
"grad_norm": 104.5,
"learning_rate": 6.030371653123752e-07,
"loss": 85.3179,
"step": 3815
},
{
"epoch": 0.4579100362612005,
"grad_norm": 105.6875,
"learning_rate": 6.023711202877314e-07,
"loss": 85.2562,
"step": 3820
},
{
"epoch": 0.4585093949474063,
"grad_norm": 104.625,
"learning_rate": 6.017050752630878e-07,
"loss": 84.96,
"step": 3825
},
{
"epoch": 0.45910875363361203,
"grad_norm": 107.75,
"learning_rate": 6.010390302384441e-07,
"loss": 84.189,
"step": 3830
},
{
"epoch": 0.4597081123198178,
"grad_norm": 101.875,
"learning_rate": 6.003729852138004e-07,
"loss": 83.2708,
"step": 3835
},
{
"epoch": 0.46030747100602354,
"grad_norm": 103.5,
"learning_rate": 5.997069401891568e-07,
"loss": 87.3057,
"step": 3840
},
{
"epoch": 0.4609068296922293,
"grad_norm": 103.9375,
"learning_rate": 5.990408951645131e-07,
"loss": 84.9138,
"step": 3845
},
{
"epoch": 0.4615061883784351,
"grad_norm": 103.9375,
"learning_rate": 5.983748501398695e-07,
"loss": 85.4947,
"step": 3850
},
{
"epoch": 0.46210554706464085,
"grad_norm": 101.9375,
"learning_rate": 5.977088051152258e-07,
"loss": 84.7751,
"step": 3855
},
{
"epoch": 0.4627049057508466,
"grad_norm": 104.0,
"learning_rate": 5.970427600905821e-07,
"loss": 84.721,
"step": 3860
},
{
"epoch": 0.46330426443705236,
"grad_norm": 103.3125,
"learning_rate": 5.963767150659385e-07,
"loss": 84.749,
"step": 3865
},
{
"epoch": 0.4639036231232581,
"grad_norm": 109.5,
"learning_rate": 5.957106700412947e-07,
"loss": 83.0867,
"step": 3870
},
{
"epoch": 0.46450298180946387,
"grad_norm": 105.9375,
"learning_rate": 5.950446250166511e-07,
"loss": 84.9955,
"step": 3875
},
{
"epoch": 0.4651023404956696,
"grad_norm": 107.1875,
"learning_rate": 5.943785799920075e-07,
"loss": 85.8307,
"step": 3880
},
{
"epoch": 0.4657016991818754,
"grad_norm": 101.9375,
"learning_rate": 5.937125349673637e-07,
"loss": 84.0253,
"step": 3885
},
{
"epoch": 0.4663010578680811,
"grad_norm": 103.1875,
"learning_rate": 5.930464899427201e-07,
"loss": 85.0021,
"step": 3890
},
{
"epoch": 0.46690041655428693,
"grad_norm": 102.0625,
"learning_rate": 5.923804449180765e-07,
"loss": 84.1141,
"step": 3895
},
{
"epoch": 0.4674997752404927,
"grad_norm": 103.375,
"learning_rate": 5.917143998934328e-07,
"loss": 82.9194,
"step": 3900
},
{
"epoch": 0.46809913392669844,
"grad_norm": 105.25,
"learning_rate": 5.910483548687891e-07,
"loss": 84.037,
"step": 3905
},
{
"epoch": 0.4686984926129042,
"grad_norm": 106.25,
"learning_rate": 5.903823098441454e-07,
"loss": 84.8373,
"step": 3910
},
{
"epoch": 0.46929785129910995,
"grad_norm": 105.75,
"learning_rate": 5.897162648195018e-07,
"loss": 85.4891,
"step": 3915
},
{
"epoch": 0.4698972099853157,
"grad_norm": 100.9375,
"learning_rate": 5.890502197948581e-07,
"loss": 84.3028,
"step": 3920
},
{
"epoch": 0.47049656867152145,
"grad_norm": 106.0625,
"learning_rate": 5.883841747702144e-07,
"loss": 84.8452,
"step": 3925
},
{
"epoch": 0.4710959273577272,
"grad_norm": 105.5625,
"learning_rate": 5.877181297455708e-07,
"loss": 84.6139,
"step": 3930
},
{
"epoch": 0.471695286043933,
"grad_norm": 107.0,
"learning_rate": 5.870520847209272e-07,
"loss": 84.4856,
"step": 3935
},
{
"epoch": 0.47229464473013877,
"grad_norm": 103.5,
"learning_rate": 5.863860396962834e-07,
"loss": 85.6233,
"step": 3940
},
{
"epoch": 0.4728940034163445,
"grad_norm": 102.375,
"learning_rate": 5.857199946716398e-07,
"loss": 84.1834,
"step": 3945
},
{
"epoch": 0.4734933621025503,
"grad_norm": 108.25,
"learning_rate": 5.850539496469961e-07,
"loss": 83.889,
"step": 3950
},
{
"epoch": 0.47409272078875603,
"grad_norm": 102.9375,
"learning_rate": 5.843879046223524e-07,
"loss": 83.3476,
"step": 3955
},
{
"epoch": 0.4746920794749618,
"grad_norm": 103.125,
"learning_rate": 5.837218595977088e-07,
"loss": 84.898,
"step": 3960
},
{
"epoch": 0.47529143816116753,
"grad_norm": 102.625,
"learning_rate": 5.830558145730651e-07,
"loss": 84.5597,
"step": 3965
},
{
"epoch": 0.4758907968473733,
"grad_norm": 104.0625,
"learning_rate": 5.823897695484214e-07,
"loss": 84.7598,
"step": 3970
},
{
"epoch": 0.4764901555335791,
"grad_norm": 101.6875,
"learning_rate": 5.817237245237778e-07,
"loss": 84.7258,
"step": 3975
},
{
"epoch": 0.47708951421978485,
"grad_norm": 105.375,
"learning_rate": 5.810576794991341e-07,
"loss": 84.1123,
"step": 3980
},
{
"epoch": 0.4776888729059906,
"grad_norm": 107.9375,
"learning_rate": 5.803916344744905e-07,
"loss": 83.9246,
"step": 3985
},
{
"epoch": 0.47828823159219636,
"grad_norm": 102.4375,
"learning_rate": 5.797255894498467e-07,
"loss": 83.9059,
"step": 3990
},
{
"epoch": 0.4788875902784021,
"grad_norm": 104.1875,
"learning_rate": 5.790595444252031e-07,
"loss": 83.2477,
"step": 3995
},
{
"epoch": 0.47948694896460786,
"grad_norm": 105.9375,
"learning_rate": 5.783934994005595e-07,
"loss": 84.4198,
"step": 4000
},
{
"epoch": 0.47948694896460786,
"eval_loss": 2.6376805305480957,
"eval_runtime": 403.2703,
"eval_samples_per_second": 1114.892,
"eval_steps_per_second": 34.843,
"step": 4000
},
{
"epoch": 0.4800863076508136,
"grad_norm": 103.5625,
"learning_rate": 5.777274543759157e-07,
"loss": 83.7404,
"step": 4005
},
{
"epoch": 0.48068566633701937,
"grad_norm": 99.5625,
"learning_rate": 5.770614093512721e-07,
"loss": 84.5335,
"step": 4010
},
{
"epoch": 0.4812850250232251,
"grad_norm": 102.1875,
"learning_rate": 5.763953643266285e-07,
"loss": 84.0931,
"step": 4015
},
{
"epoch": 0.48188438370943093,
"grad_norm": 106.8125,
"learning_rate": 5.757293193019847e-07,
"loss": 83.4464,
"step": 4020
},
{
"epoch": 0.4824837423956367,
"grad_norm": 105.25,
"learning_rate": 5.750632742773411e-07,
"loss": 84.1755,
"step": 4025
},
{
"epoch": 0.48308310108184244,
"grad_norm": 105.25,
"learning_rate": 5.743972292526974e-07,
"loss": 82.9136,
"step": 4030
},
{
"epoch": 0.4836824597680482,
"grad_norm": 103.3125,
"learning_rate": 5.737311842280538e-07,
"loss": 84.5602,
"step": 4035
},
{
"epoch": 0.48428181845425394,
"grad_norm": 103.75,
"learning_rate": 5.730651392034101e-07,
"loss": 84.7828,
"step": 4040
},
{
"epoch": 0.4848811771404597,
"grad_norm": 106.5,
"learning_rate": 5.723990941787664e-07,
"loss": 83.2263,
"step": 4045
},
{
"epoch": 0.48548053582666545,
"grad_norm": 107.5,
"learning_rate": 5.717330491541228e-07,
"loss": 82.3737,
"step": 4050
},
{
"epoch": 0.4860798945128712,
"grad_norm": 104.8125,
"learning_rate": 5.710670041294791e-07,
"loss": 83.2381,
"step": 4055
},
{
"epoch": 0.486679253199077,
"grad_norm": 107.6875,
"learning_rate": 5.704009591048354e-07,
"loss": 85.2892,
"step": 4060
},
{
"epoch": 0.48727861188528276,
"grad_norm": 106.0,
"learning_rate": 5.697349140801918e-07,
"loss": 85.1233,
"step": 4065
},
{
"epoch": 0.4878779705714885,
"grad_norm": 105.1875,
"learning_rate": 5.690688690555482e-07,
"loss": 83.7537,
"step": 4070
},
{
"epoch": 0.48847732925769427,
"grad_norm": 103.8125,
"learning_rate": 5.684028240309044e-07,
"loss": 84.1038,
"step": 4075
},
{
"epoch": 0.4890766879439,
"grad_norm": 107.5625,
"learning_rate": 5.677367790062608e-07,
"loss": 84.5125,
"step": 4080
},
{
"epoch": 0.4896760466301058,
"grad_norm": 102.0625,
"learning_rate": 5.670707339816171e-07,
"loss": 82.8591,
"step": 4085
},
{
"epoch": 0.49027540531631153,
"grad_norm": 104.25,
"learning_rate": 5.664046889569734e-07,
"loss": 84.0393,
"step": 4090
},
{
"epoch": 0.4908747640025173,
"grad_norm": 105.6875,
"learning_rate": 5.657386439323298e-07,
"loss": 82.1754,
"step": 4095
},
{
"epoch": 0.4914741226887231,
"grad_norm": 107.25,
"learning_rate": 5.650725989076861e-07,
"loss": 85.7898,
"step": 4100
},
{
"epoch": 0.49207348137492884,
"grad_norm": 103.375,
"learning_rate": 5.644065538830424e-07,
"loss": 83.8735,
"step": 4105
},
{
"epoch": 0.4926728400611346,
"grad_norm": 102.375,
"learning_rate": 5.637405088583988e-07,
"loss": 83.6745,
"step": 4110
},
{
"epoch": 0.49327219874734035,
"grad_norm": 104.0625,
"learning_rate": 5.630744638337551e-07,
"loss": 83.4683,
"step": 4115
},
{
"epoch": 0.4938715574335461,
"grad_norm": 101.3125,
"learning_rate": 5.624084188091115e-07,
"loss": 87.2967,
"step": 4120
},
{
"epoch": 0.49447091611975186,
"grad_norm": 103.875,
"learning_rate": 5.617423737844677e-07,
"loss": 84.2752,
"step": 4125
},
{
"epoch": 0.4950702748059576,
"grad_norm": 106.375,
"learning_rate": 5.610763287598241e-07,
"loss": 84.2686,
"step": 4130
},
{
"epoch": 0.49566963349216336,
"grad_norm": 103.375,
"learning_rate": 5.604102837351805e-07,
"loss": 82.353,
"step": 4135
},
{
"epoch": 0.4962689921783692,
"grad_norm": 101.625,
"learning_rate": 5.597442387105367e-07,
"loss": 82.047,
"step": 4140
},
{
"epoch": 0.4968683508645749,
"grad_norm": 106.75,
"learning_rate": 5.590781936858931e-07,
"loss": 83.8074,
"step": 4145
},
{
"epoch": 0.4974677095507807,
"grad_norm": 104.3125,
"learning_rate": 5.584121486612496e-07,
"loss": 83.597,
"step": 4150
},
{
"epoch": 0.49806706823698643,
"grad_norm": 111.5625,
"learning_rate": 5.577461036366057e-07,
"loss": 84.8574,
"step": 4155
},
{
"epoch": 0.4986664269231922,
"grad_norm": 105.375,
"learning_rate": 5.570800586119621e-07,
"loss": 83.1505,
"step": 4160
},
{
"epoch": 0.49926578560939794,
"grad_norm": 105.6875,
"learning_rate": 5.564140135873184e-07,
"loss": 84.2874,
"step": 4165
},
{
"epoch": 0.4998651442956037,
"grad_norm": 104.625,
"learning_rate": 5.557479685626749e-07,
"loss": 82.8634,
"step": 4170
},
{
"epoch": 0.5004645029818094,
"grad_norm": 104.5,
"learning_rate": 5.550819235380312e-07,
"loss": 82.2823,
"step": 4175
},
{
"epoch": 0.5010638616680152,
"grad_norm": 104.9375,
"learning_rate": 5.544158785133875e-07,
"loss": 83.2534,
"step": 4180
},
{
"epoch": 0.501663220354221,
"grad_norm": 103.4375,
"learning_rate": 5.537498334887439e-07,
"loss": 82.9911,
"step": 4185
},
{
"epoch": 0.5022625790404267,
"grad_norm": 106.4375,
"learning_rate": 5.530837884641002e-07,
"loss": 82.9007,
"step": 4190
},
{
"epoch": 0.5028619377266325,
"grad_norm": 104.25,
"learning_rate": 5.524177434394565e-07,
"loss": 83.3259,
"step": 4195
},
{
"epoch": 0.5034612964128382,
"grad_norm": 105.1875,
"learning_rate": 5.517516984148129e-07,
"loss": 83.7739,
"step": 4200
},
{
"epoch": 0.5040606550990441,
"grad_norm": 103.5625,
"learning_rate": 5.51085653390169e-07,
"loss": 83.3372,
"step": 4205
},
{
"epoch": 0.5046600137852498,
"grad_norm": 106.875,
"learning_rate": 5.504196083655255e-07,
"loss": 83.7161,
"step": 4210
},
{
"epoch": 0.5052593724714556,
"grad_norm": 105.25,
"learning_rate": 5.497535633408819e-07,
"loss": 82.8205,
"step": 4215
},
{
"epoch": 0.5058587311576613,
"grad_norm": 102.4375,
"learning_rate": 5.490875183162382e-07,
"loss": 83.3096,
"step": 4220
},
{
"epoch": 0.5064580898438671,
"grad_norm": 102.4375,
"learning_rate": 5.484214732915945e-07,
"loss": 84.2793,
"step": 4225
},
{
"epoch": 0.5070574485300728,
"grad_norm": 106.875,
"learning_rate": 5.477554282669509e-07,
"loss": 83.4831,
"step": 4230
},
{
"epoch": 0.5076568072162786,
"grad_norm": 105.25,
"learning_rate": 5.470893832423072e-07,
"loss": 85.2621,
"step": 4235
},
{
"epoch": 0.5082561659024843,
"grad_norm": 106.5625,
"learning_rate": 5.464233382176635e-07,
"loss": 82.8462,
"step": 4240
},
{
"epoch": 0.5088555245886901,
"grad_norm": 102.6875,
"learning_rate": 5.457572931930198e-07,
"loss": 83.4198,
"step": 4245
},
{
"epoch": 0.5094548832748959,
"grad_norm": 104.5,
"learning_rate": 5.450912481683762e-07,
"loss": 83.3093,
"step": 4250
},
{
"epoch": 0.5100542419611016,
"grad_norm": 107.8125,
"learning_rate": 5.444252031437326e-07,
"loss": 83.5302,
"step": 4255
},
{
"epoch": 0.5106536006473074,
"grad_norm": 109.875,
"learning_rate": 5.437591581190888e-07,
"loss": 83.8509,
"step": 4260
},
{
"epoch": 0.5112529593335131,
"grad_norm": 105.5,
"learning_rate": 5.430931130944452e-07,
"loss": 83.2915,
"step": 4265
},
{
"epoch": 0.5118523180197189,
"grad_norm": 105.0625,
"learning_rate": 5.424270680698016e-07,
"loss": 84.0008,
"step": 4270
},
{
"epoch": 0.5124516767059246,
"grad_norm": 102.4375,
"learning_rate": 5.417610230451578e-07,
"loss": 82.4604,
"step": 4275
},
{
"epoch": 0.5130510353921304,
"grad_norm": 103.9375,
"learning_rate": 5.410949780205142e-07,
"loss": 82.8647,
"step": 4280
},
{
"epoch": 0.5136503940783361,
"grad_norm": 104.5625,
"learning_rate": 5.404289329958705e-07,
"loss": 83.5644,
"step": 4285
},
{
"epoch": 0.514249752764542,
"grad_norm": 106.25,
"learning_rate": 5.397628879712268e-07,
"loss": 83.7008,
"step": 4290
},
{
"epoch": 0.5148491114507477,
"grad_norm": 102.875,
"learning_rate": 5.390968429465832e-07,
"loss": 83.0174,
"step": 4295
},
{
"epoch": 0.5154484701369535,
"grad_norm": 105.875,
"learning_rate": 5.384307979219395e-07,
"loss": 84.0506,
"step": 4300
},
{
"epoch": 0.5160478288231592,
"grad_norm": 104.375,
"learning_rate": 5.377647528972959e-07,
"loss": 82.8537,
"step": 4305
},
{
"epoch": 0.516647187509365,
"grad_norm": 108.5625,
"learning_rate": 5.370987078726522e-07,
"loss": 84.2384,
"step": 4310
},
{
"epoch": 0.5172465461955708,
"grad_norm": 104.4375,
"learning_rate": 5.364326628480085e-07,
"loss": 82.8151,
"step": 4315
},
{
"epoch": 0.5178459048817765,
"grad_norm": 102.25,
"learning_rate": 5.357666178233649e-07,
"loss": 83.6291,
"step": 4320
},
{
"epoch": 0.5184452635679823,
"grad_norm": 101.875,
"learning_rate": 5.351005727987211e-07,
"loss": 82.3904,
"step": 4325
},
{
"epoch": 0.519044622254188,
"grad_norm": 103.125,
"learning_rate": 5.344345277740775e-07,
"loss": 84.4222,
"step": 4330
},
{
"epoch": 0.5196439809403938,
"grad_norm": 103.75,
"learning_rate": 5.337684827494339e-07,
"loss": 83.1161,
"step": 4335
},
{
"epoch": 0.5202433396265995,
"grad_norm": 108.1875,
"learning_rate": 5.331024377247901e-07,
"loss": 82.2274,
"step": 4340
},
{
"epoch": 0.5208426983128053,
"grad_norm": 103.375,
"learning_rate": 5.324363927001465e-07,
"loss": 81.779,
"step": 4345
},
{
"epoch": 0.521442056999011,
"grad_norm": 104.0,
"learning_rate": 5.317703476755029e-07,
"loss": 83.6576,
"step": 4350
},
{
"epoch": 0.5220414156852168,
"grad_norm": 107.0625,
"learning_rate": 5.311043026508592e-07,
"loss": 84.1208,
"step": 4355
},
{
"epoch": 0.5226407743714225,
"grad_norm": 105.875,
"learning_rate": 5.304382576262155e-07,
"loss": 83.0397,
"step": 4360
},
{
"epoch": 0.5232401330576283,
"grad_norm": 103.375,
"learning_rate": 5.297722126015718e-07,
"loss": 83.5394,
"step": 4365
},
{
"epoch": 0.5238394917438342,
"grad_norm": 108.0625,
"learning_rate": 5.291061675769282e-07,
"loss": 82.6936,
"step": 4370
},
{
"epoch": 0.5244388504300399,
"grad_norm": 105.125,
"learning_rate": 5.284401225522845e-07,
"loss": 83.234,
"step": 4375
},
{
"epoch": 0.5250382091162457,
"grad_norm": 105.3125,
"learning_rate": 5.277740775276408e-07,
"loss": 82.665,
"step": 4380
},
{
"epoch": 0.5256375678024514,
"grad_norm": 103.9375,
"learning_rate": 5.271080325029972e-07,
"loss": 81.9667,
"step": 4385
},
{
"epoch": 0.5262369264886572,
"grad_norm": 102.25,
"learning_rate": 5.264419874783536e-07,
"loss": 81.5035,
"step": 4390
},
{
"epoch": 0.5268362851748629,
"grad_norm": 104.625,
"learning_rate": 5.257759424537098e-07,
"loss": 84.3656,
"step": 4395
},
{
"epoch": 0.5274356438610687,
"grad_norm": 104.0,
"learning_rate": 5.251098974290662e-07,
"loss": 83.848,
"step": 4400
},
{
"epoch": 0.5280350025472744,
"grad_norm": 107.4375,
"learning_rate": 5.244438524044225e-07,
"loss": 82.2172,
"step": 4405
},
{
"epoch": 0.5286343612334802,
"grad_norm": 104.875,
"learning_rate": 5.237778073797788e-07,
"loss": 81.9961,
"step": 4410
},
{
"epoch": 0.5292337199196859,
"grad_norm": 103.9375,
"learning_rate": 5.231117623551352e-07,
"loss": 83.5917,
"step": 4415
},
{
"epoch": 0.5298330786058917,
"grad_norm": 106.375,
"learning_rate": 5.224457173304915e-07,
"loss": 83.7527,
"step": 4420
},
{
"epoch": 0.5304324372920974,
"grad_norm": 101.9375,
"learning_rate": 5.217796723058478e-07,
"loss": 83.1019,
"step": 4425
},
{
"epoch": 0.5310317959783032,
"grad_norm": 107.6875,
"learning_rate": 5.211136272812042e-07,
"loss": 82.3363,
"step": 4430
},
{
"epoch": 0.5316311546645089,
"grad_norm": 106.125,
"learning_rate": 5.204475822565605e-07,
"loss": 83.6656,
"step": 4435
},
{
"epoch": 0.5322305133507147,
"grad_norm": 103.375,
"learning_rate": 5.197815372319169e-07,
"loss": 84.1844,
"step": 4440
},
{
"epoch": 0.5328298720369204,
"grad_norm": 104.5,
"learning_rate": 5.191154922072732e-07,
"loss": 82.8866,
"step": 4445
},
{
"epoch": 0.5334292307231262,
"grad_norm": 104.5,
"learning_rate": 5.184494471826295e-07,
"loss": 82.2072,
"step": 4450
},
{
"epoch": 0.5340285894093321,
"grad_norm": 101.6875,
"learning_rate": 5.177834021579859e-07,
"loss": 83.5113,
"step": 4455
},
{
"epoch": 0.5346279480955378,
"grad_norm": 105.3125,
"learning_rate": 5.171173571333421e-07,
"loss": 83.4531,
"step": 4460
},
{
"epoch": 0.5352273067817436,
"grad_norm": 104.375,
"learning_rate": 5.164513121086985e-07,
"loss": 81.8829,
"step": 4465
},
{
"epoch": 0.5358266654679493,
"grad_norm": 104.3125,
"learning_rate": 5.157852670840549e-07,
"loss": 83.2843,
"step": 4470
},
{
"epoch": 0.5364260241541551,
"grad_norm": 104.25,
"learning_rate": 5.151192220594112e-07,
"loss": 82.3948,
"step": 4475
},
{
"epoch": 0.5370253828403608,
"grad_norm": 102.1875,
"learning_rate": 5.144531770347675e-07,
"loss": 82.8727,
"step": 4480
},
{
"epoch": 0.5376247415265666,
"grad_norm": 103.0625,
"learning_rate": 5.137871320101239e-07,
"loss": 82.7623,
"step": 4485
},
{
"epoch": 0.5382241002127723,
"grad_norm": 102.1875,
"learning_rate": 5.131210869854802e-07,
"loss": 81.87,
"step": 4490
},
{
"epoch": 0.5388234588989781,
"grad_norm": 103.875,
"learning_rate": 5.124550419608365e-07,
"loss": 82.4659,
"step": 4495
},
{
"epoch": 0.5394228175851838,
"grad_norm": 106.3125,
"learning_rate": 5.117889969361928e-07,
"loss": 81.6633,
"step": 4500
},
{
"epoch": 0.5394228175851838,
"eval_loss": 2.588843584060669,
"eval_runtime": 398.8047,
"eval_samples_per_second": 1127.376,
"eval_steps_per_second": 35.233,
"step": 4500
},
{
"epoch": 0.5400221762713896,
"grad_norm": 101.25,
"learning_rate": 5.111229519115492e-07,
"loss": 81.4664,
"step": 4505
},
{
"epoch": 0.5406215349575954,
"grad_norm": 104.1875,
"learning_rate": 5.104569068869055e-07,
"loss": 82.9599,
"step": 4510
},
{
"epoch": 0.5412208936438011,
"grad_norm": 106.6875,
"learning_rate": 5.097908618622618e-07,
"loss": 82.806,
"step": 4515
},
{
"epoch": 0.5418202523300069,
"grad_norm": 105.0,
"learning_rate": 5.091248168376182e-07,
"loss": 82.4233,
"step": 4520
},
{
"epoch": 0.5424196110162126,
"grad_norm": 102.8125,
"learning_rate": 5.084587718129746e-07,
"loss": 84.2593,
"step": 4525
},
{
"epoch": 0.5430189697024184,
"grad_norm": 108.5625,
"learning_rate": 5.077927267883308e-07,
"loss": 83.7284,
"step": 4530
},
{
"epoch": 0.5436183283886241,
"grad_norm": 105.375,
"learning_rate": 5.071266817636872e-07,
"loss": 82.6245,
"step": 4535
},
{
"epoch": 0.54421768707483,
"grad_norm": 105.4375,
"learning_rate": 5.064606367390435e-07,
"loss": 82.7108,
"step": 4540
},
{
"epoch": 0.5448170457610357,
"grad_norm": 104.875,
"learning_rate": 5.057945917143998e-07,
"loss": 81.2744,
"step": 4545
},
{
"epoch": 0.5454164044472415,
"grad_norm": 104.0625,
"learning_rate": 5.051285466897562e-07,
"loss": 81.4598,
"step": 4550
},
{
"epoch": 0.5460157631334472,
"grad_norm": 105.75,
"learning_rate": 5.044625016651125e-07,
"loss": 82.4933,
"step": 4555
},
{
"epoch": 0.546615121819653,
"grad_norm": 106.0,
"learning_rate": 5.037964566404688e-07,
"loss": 83.762,
"step": 4560
},
{
"epoch": 0.5472144805058587,
"grad_norm": 104.0625,
"learning_rate": 5.031304116158252e-07,
"loss": 81.7877,
"step": 4565
},
{
"epoch": 0.5478138391920645,
"grad_norm": 107.375,
"learning_rate": 5.024643665911815e-07,
"loss": 83.3971,
"step": 4570
},
{
"epoch": 0.5484131978782703,
"grad_norm": 106.0625,
"learning_rate": 5.017983215665379e-07,
"loss": 82.3262,
"step": 4575
},
{
"epoch": 0.549012556564476,
"grad_norm": 100.8125,
"learning_rate": 5.011322765418941e-07,
"loss": 83.2061,
"step": 4580
},
{
"epoch": 0.5496119152506818,
"grad_norm": 105.8125,
"learning_rate": 5.004662315172505e-07,
"loss": 81.5443,
"step": 4585
},
{
"epoch": 0.5502112739368875,
"grad_norm": 104.0,
"learning_rate": 4.998001864926068e-07,
"loss": 82.7867,
"step": 4590
},
{
"epoch": 0.5508106326230933,
"grad_norm": 106.0,
"learning_rate": 4.991341414679632e-07,
"loss": 82.5244,
"step": 4595
},
{
"epoch": 0.551409991309299,
"grad_norm": 106.625,
"learning_rate": 4.984680964433195e-07,
"loss": 82.5752,
"step": 4600
},
{
"epoch": 0.5520093499955048,
"grad_norm": 106.25,
"learning_rate": 4.978020514186758e-07,
"loss": 83.1504,
"step": 4605
},
{
"epoch": 0.5526087086817105,
"grad_norm": 106.375,
"learning_rate": 4.971360063940322e-07,
"loss": 81.3867,
"step": 4610
},
{
"epoch": 0.5532080673679163,
"grad_norm": 105.25,
"learning_rate": 4.964699613693885e-07,
"loss": 83.0015,
"step": 4615
},
{
"epoch": 0.5538074260541221,
"grad_norm": 104.0625,
"learning_rate": 4.958039163447448e-07,
"loss": 81.9788,
"step": 4620
},
{
"epoch": 0.5544067847403279,
"grad_norm": 101.625,
"learning_rate": 4.951378713201012e-07,
"loss": 81.9656,
"step": 4625
},
{
"epoch": 0.5550061434265336,
"grad_norm": 104.1875,
"learning_rate": 4.944718262954575e-07,
"loss": 80.6905,
"step": 4630
},
{
"epoch": 0.5556055021127394,
"grad_norm": 105.125,
"learning_rate": 4.93805781270814e-07,
"loss": 82.6242,
"step": 4635
},
{
"epoch": 0.5562048607989452,
"grad_norm": 105.125,
"learning_rate": 4.931397362461702e-07,
"loss": 81.465,
"step": 4640
},
{
"epoch": 0.5568042194851509,
"grad_norm": 103.9375,
"learning_rate": 4.924736912215265e-07,
"loss": 82.8664,
"step": 4645
},
{
"epoch": 0.5574035781713567,
"grad_norm": 105.25,
"learning_rate": 4.918076461968828e-07,
"loss": 83.367,
"step": 4650
},
{
"epoch": 0.5580029368575624,
"grad_norm": 106.125,
"learning_rate": 4.911416011722393e-07,
"loss": 81.668,
"step": 4655
},
{
"epoch": 0.5586022955437682,
"grad_norm": 104.875,
"learning_rate": 4.904755561475956e-07,
"loss": 82.4106,
"step": 4660
},
{
"epoch": 0.5592016542299739,
"grad_norm": 106.25,
"learning_rate": 4.898095111229519e-07,
"loss": 83.5437,
"step": 4665
},
{
"epoch": 0.5598010129161797,
"grad_norm": 106.8125,
"learning_rate": 4.891434660983082e-07,
"loss": 81.4972,
"step": 4670
},
{
"epoch": 0.5604003716023854,
"grad_norm": 108.75,
"learning_rate": 4.884774210736646e-07,
"loss": 81.9168,
"step": 4675
},
{
"epoch": 0.5609997302885912,
"grad_norm": 105.5625,
"learning_rate": 4.878113760490209e-07,
"loss": 83.4338,
"step": 4680
},
{
"epoch": 0.5615990889747969,
"grad_norm": 99.9375,
"learning_rate": 4.871453310243773e-07,
"loss": 81.1651,
"step": 4685
},
{
"epoch": 0.5621984476610027,
"grad_norm": 106.6875,
"learning_rate": 4.864792859997336e-07,
"loss": 82.8396,
"step": 4690
},
{
"epoch": 0.5627978063472084,
"grad_norm": 101.0,
"learning_rate": 4.8581324097509e-07,
"loss": 83.3178,
"step": 4695
},
{
"epoch": 0.5633971650334142,
"grad_norm": 105.5,
"learning_rate": 4.851471959504463e-07,
"loss": 82.1475,
"step": 4700
},
{
"epoch": 0.5639965237196201,
"grad_norm": 104.75,
"learning_rate": 4.844811509258026e-07,
"loss": 81.1286,
"step": 4705
},
{
"epoch": 0.5645958824058258,
"grad_norm": 104.6875,
"learning_rate": 4.838151059011589e-07,
"loss": 82.0347,
"step": 4710
},
{
"epoch": 0.5651952410920316,
"grad_norm": 103.875,
"learning_rate": 4.831490608765153e-07,
"loss": 82.6172,
"step": 4715
},
{
"epoch": 0.5657945997782373,
"grad_norm": 104.5,
"learning_rate": 4.824830158518716e-07,
"loss": 81.2462,
"step": 4720
},
{
"epoch": 0.5663939584644431,
"grad_norm": 104.0,
"learning_rate": 4.818169708272279e-07,
"loss": 81.1761,
"step": 4725
},
{
"epoch": 0.5669933171506488,
"grad_norm": 105.3125,
"learning_rate": 4.811509258025842e-07,
"loss": 83.4811,
"step": 4730
},
{
"epoch": 0.5675926758368546,
"grad_norm": 104.1875,
"learning_rate": 4.804848807779406e-07,
"loss": 82.4757,
"step": 4735
},
{
"epoch": 0.5681920345230603,
"grad_norm": 107.3125,
"learning_rate": 4.798188357532969e-07,
"loss": 80.6124,
"step": 4740
},
{
"epoch": 0.5687913932092661,
"grad_norm": 104.75,
"learning_rate": 4.791527907286533e-07,
"loss": 82.6368,
"step": 4745
},
{
"epoch": 0.5693907518954718,
"grad_norm": 104.75,
"learning_rate": 4.784867457040096e-07,
"loss": 81.416,
"step": 4750
},
{
"epoch": 0.5699901105816776,
"grad_norm": 104.125,
"learning_rate": 4.778207006793659e-07,
"loss": 81.6732,
"step": 4755
},
{
"epoch": 0.5705894692678833,
"grad_norm": 103.1875,
"learning_rate": 4.771546556547223e-07,
"loss": 80.9805,
"step": 4760
},
{
"epoch": 0.5711888279540891,
"grad_norm": 102.8125,
"learning_rate": 4.764886106300786e-07,
"loss": 80.7926,
"step": 4765
},
{
"epoch": 0.5717881866402948,
"grad_norm": 106.0625,
"learning_rate": 4.758225656054349e-07,
"loss": 82.4188,
"step": 4770
},
{
"epoch": 0.5723875453265006,
"grad_norm": 104.6875,
"learning_rate": 4.751565205807913e-07,
"loss": 81.5341,
"step": 4775
},
{
"epoch": 0.5729869040127064,
"grad_norm": 106.6875,
"learning_rate": 4.744904755561476e-07,
"loss": 80.7328,
"step": 4780
},
{
"epoch": 0.5735862626989122,
"grad_norm": 105.5625,
"learning_rate": 4.738244305315039e-07,
"loss": 81.8097,
"step": 4785
},
{
"epoch": 0.574185621385118,
"grad_norm": 105.875,
"learning_rate": 4.731583855068603e-07,
"loss": 81.057,
"step": 4790
},
{
"epoch": 0.5747849800713237,
"grad_norm": 104.5625,
"learning_rate": 4.724923404822166e-07,
"loss": 82.8115,
"step": 4795
},
{
"epoch": 0.5753843387575295,
"grad_norm": 105.9375,
"learning_rate": 4.7182629545757293e-07,
"loss": 81.8745,
"step": 4800
},
{
"epoch": 0.5759836974437352,
"grad_norm": 107.9375,
"learning_rate": 4.7116025043292923e-07,
"loss": 81.5439,
"step": 4805
},
{
"epoch": 0.576583056129941,
"grad_norm": 104.0625,
"learning_rate": 4.704942054082856e-07,
"loss": 82.1254,
"step": 4810
},
{
"epoch": 0.5771824148161467,
"grad_norm": 103.6875,
"learning_rate": 4.6982816038364194e-07,
"loss": 81.1297,
"step": 4815
},
{
"epoch": 0.5777817735023525,
"grad_norm": 104.1875,
"learning_rate": 4.6916211535899823e-07,
"loss": 82.4339,
"step": 4820
},
{
"epoch": 0.5783811321885582,
"grad_norm": 104.125,
"learning_rate": 4.684960703343546e-07,
"loss": 81.5759,
"step": 4825
},
{
"epoch": 0.578980490874764,
"grad_norm": 106.5,
"learning_rate": 4.6783002530971094e-07,
"loss": 81.4303,
"step": 4830
},
{
"epoch": 0.5795798495609698,
"grad_norm": 102.75,
"learning_rate": 4.6716398028506724e-07,
"loss": 80.9401,
"step": 4835
},
{
"epoch": 0.5801792082471755,
"grad_norm": 106.0,
"learning_rate": 4.664979352604236e-07,
"loss": 82.6096,
"step": 4840
},
{
"epoch": 0.5807785669333813,
"grad_norm": 103.5,
"learning_rate": 4.658318902357799e-07,
"loss": 80.9871,
"step": 4845
},
{
"epoch": 0.581377925619587,
"grad_norm": 108.1875,
"learning_rate": 4.651658452111363e-07,
"loss": 82.111,
"step": 4850
},
{
"epoch": 0.5819772843057928,
"grad_norm": 103.375,
"learning_rate": 4.644998001864926e-07,
"loss": 81.8643,
"step": 4855
},
{
"epoch": 0.5825766429919985,
"grad_norm": 105.5,
"learning_rate": 4.638337551618489e-07,
"loss": 81.4741,
"step": 4860
},
{
"epoch": 0.5831760016782043,
"grad_norm": 107.25,
"learning_rate": 4.6316771013720524e-07,
"loss": 82.0387,
"step": 4865
},
{
"epoch": 0.5837753603644101,
"grad_norm": 103.9375,
"learning_rate": 4.625016651125616e-07,
"loss": 81.4705,
"step": 4870
},
{
"epoch": 0.5843747190506159,
"grad_norm": 105.125,
"learning_rate": 4.6183562008791795e-07,
"loss": 81.175,
"step": 4875
},
{
"epoch": 0.5849740777368216,
"grad_norm": 103.0625,
"learning_rate": 4.6116957506327425e-07,
"loss": 81.0439,
"step": 4880
},
{
"epoch": 0.5855734364230274,
"grad_norm": 107.4375,
"learning_rate": 4.6050353003863055e-07,
"loss": 80.9275,
"step": 4885
},
{
"epoch": 0.5861727951092331,
"grad_norm": 102.9375,
"learning_rate": 4.5983748501398695e-07,
"loss": 82.4049,
"step": 4890
},
{
"epoch": 0.5867721537954389,
"grad_norm": 106.125,
"learning_rate": 4.5917143998934325e-07,
"loss": 82.7499,
"step": 4895
},
{
"epoch": 0.5873715124816447,
"grad_norm": 104.3125,
"learning_rate": 4.585053949646996e-07,
"loss": 82.2027,
"step": 4900
},
{
"epoch": 0.5879708711678504,
"grad_norm": 106.5,
"learning_rate": 4.578393499400559e-07,
"loss": 82.3243,
"step": 4905
},
{
"epoch": 0.5885702298540562,
"grad_norm": 106.0625,
"learning_rate": 4.571733049154123e-07,
"loss": 82.9383,
"step": 4910
},
{
"epoch": 0.5891695885402619,
"grad_norm": 107.3125,
"learning_rate": 4.565072598907686e-07,
"loss": 81.5135,
"step": 4915
},
{
"epoch": 0.5897689472264677,
"grad_norm": 103.8125,
"learning_rate": 4.558412148661249e-07,
"loss": 82.6818,
"step": 4920
},
{
"epoch": 0.5903683059126734,
"grad_norm": 104.6875,
"learning_rate": 4.5517516984148126e-07,
"loss": 81.2648,
"step": 4925
},
{
"epoch": 0.5909676645988792,
"grad_norm": 101.875,
"learning_rate": 4.545091248168376e-07,
"loss": 82.1621,
"step": 4930
},
{
"epoch": 0.5915670232850849,
"grad_norm": 105.9375,
"learning_rate": 4.5384307979219396e-07,
"loss": 82.1844,
"step": 4935
},
{
"epoch": 0.5921663819712907,
"grad_norm": 104.4375,
"learning_rate": 4.5317703476755026e-07,
"loss": 81.5659,
"step": 4940
},
{
"epoch": 0.5927657406574964,
"grad_norm": 106.25,
"learning_rate": 4.5251098974290656e-07,
"loss": 82.0492,
"step": 4945
},
{
"epoch": 0.5933650993437022,
"grad_norm": 109.0625,
"learning_rate": 4.5184494471826296e-07,
"loss": 81.1946,
"step": 4950
},
{
"epoch": 0.593964458029908,
"grad_norm": 105.875,
"learning_rate": 4.5117889969361926e-07,
"loss": 82.4996,
"step": 4955
},
{
"epoch": 0.5945638167161138,
"grad_norm": 105.125,
"learning_rate": 4.505128546689756e-07,
"loss": 81.4744,
"step": 4960
},
{
"epoch": 0.5951631754023196,
"grad_norm": 106.3125,
"learning_rate": 4.498468096443319e-07,
"loss": 80.7131,
"step": 4965
},
{
"epoch": 0.5957625340885253,
"grad_norm": 111.3125,
"learning_rate": 4.4918076461968827e-07,
"loss": 82.425,
"step": 4970
},
{
"epoch": 0.5963618927747311,
"grad_norm": 108.0,
"learning_rate": 4.485147195950446e-07,
"loss": 79.8071,
"step": 4975
},
{
"epoch": 0.5969612514609368,
"grad_norm": 103.0625,
"learning_rate": 4.478486745704009e-07,
"loss": 82.7906,
"step": 4980
},
{
"epoch": 0.5975606101471426,
"grad_norm": 105.1875,
"learning_rate": 4.4718262954575727e-07,
"loss": 81.2079,
"step": 4985
},
{
"epoch": 0.5981599688333483,
"grad_norm": 107.25,
"learning_rate": 4.465165845211136e-07,
"loss": 80.6633,
"step": 4990
},
{
"epoch": 0.5987593275195541,
"grad_norm": 105.0,
"learning_rate": 4.458505394964699e-07,
"loss": 82.2591,
"step": 4995
},
{
"epoch": 0.5993586862057598,
"grad_norm": 101.875,
"learning_rate": 4.4518449447182627e-07,
"loss": 81.2498,
"step": 5000
},
{
"epoch": 0.5993586862057598,
"eval_loss": 2.542724609375,
"eval_runtime": 401.2295,
"eval_samples_per_second": 1120.563,
"eval_steps_per_second": 35.02,
"step": 5000
},
{
"epoch": 0.5999580448919656,
"grad_norm": 107.0625,
"learning_rate": 4.4451844944718257e-07,
"loss": 81.491,
"step": 5005
},
{
"epoch": 0.6005574035781713,
"grad_norm": 106.5625,
"learning_rate": 4.43852404422539e-07,
"loss": 80.9435,
"step": 5010
},
{
"epoch": 0.6011567622643771,
"grad_norm": 106.0625,
"learning_rate": 4.431863593978953e-07,
"loss": 80.1556,
"step": 5015
},
{
"epoch": 0.6017561209505828,
"grad_norm": 104.5625,
"learning_rate": 4.4252031437325163e-07,
"loss": 81.2597,
"step": 5020
},
{
"epoch": 0.6023554796367886,
"grad_norm": 103.0625,
"learning_rate": 4.4185426934860793e-07,
"loss": 80.4449,
"step": 5025
},
{
"epoch": 0.6029548383229943,
"grad_norm": 105.4375,
"learning_rate": 4.411882243239643e-07,
"loss": 79.8876,
"step": 5030
},
{
"epoch": 0.6035541970092002,
"grad_norm": 102.8125,
"learning_rate": 4.4052217929932063e-07,
"loss": 81.1896,
"step": 5035
},
{
"epoch": 0.604153555695406,
"grad_norm": 106.5,
"learning_rate": 4.3985613427467693e-07,
"loss": 80.7079,
"step": 5040
},
{
"epoch": 0.6047529143816117,
"grad_norm": 106.3125,
"learning_rate": 4.391900892500333e-07,
"loss": 80.8943,
"step": 5045
},
{
"epoch": 0.6053522730678175,
"grad_norm": 103.9375,
"learning_rate": 4.3852404422538963e-07,
"loss": 80.9722,
"step": 5050
},
{
"epoch": 0.6059516317540232,
"grad_norm": 103.0,
"learning_rate": 4.3785799920074593e-07,
"loss": 80.274,
"step": 5055
},
{
"epoch": 0.606550990440229,
"grad_norm": 103.0,
"learning_rate": 4.371919541761023e-07,
"loss": 81.1316,
"step": 5060
},
{
"epoch": 0.6071503491264347,
"grad_norm": 107.1875,
"learning_rate": 4.365259091514586e-07,
"loss": 81.8072,
"step": 5065
},
{
"epoch": 0.6077497078126405,
"grad_norm": 104.5,
"learning_rate": 4.35859864126815e-07,
"loss": 81.996,
"step": 5070
},
{
"epoch": 0.6083490664988462,
"grad_norm": 107.5625,
"learning_rate": 4.351938191021713e-07,
"loss": 81.1877,
"step": 5075
},
{
"epoch": 0.608948425185052,
"grad_norm": 108.8125,
"learning_rate": 4.345277740775276e-07,
"loss": 80.1332,
"step": 5080
},
{
"epoch": 0.6095477838712577,
"grad_norm": 108.9375,
"learning_rate": 4.3386172905288394e-07,
"loss": 82.2149,
"step": 5085
},
{
"epoch": 0.6101471425574635,
"grad_norm": 100.5625,
"learning_rate": 4.331956840282403e-07,
"loss": 80.578,
"step": 5090
},
{
"epoch": 0.6107465012436692,
"grad_norm": 106.6875,
"learning_rate": 4.3252963900359664e-07,
"loss": 79.6262,
"step": 5095
},
{
"epoch": 0.611345859929875,
"grad_norm": 105.3125,
"learning_rate": 4.3186359397895294e-07,
"loss": 81.7019,
"step": 5100
},
{
"epoch": 0.6119452186160808,
"grad_norm": 101.375,
"learning_rate": 4.3119754895430924e-07,
"loss": 80.9652,
"step": 5105
},
{
"epoch": 0.6125445773022865,
"grad_norm": 104.125,
"learning_rate": 4.3053150392966565e-07,
"loss": 79.8046,
"step": 5110
},
{
"epoch": 0.6131439359884923,
"grad_norm": 107.1875,
"learning_rate": 4.2986545890502195e-07,
"loss": 80.7197,
"step": 5115
},
{
"epoch": 0.6137432946746981,
"grad_norm": 106.25,
"learning_rate": 4.291994138803783e-07,
"loss": 81.8292,
"step": 5120
},
{
"epoch": 0.6143426533609039,
"grad_norm": 105.0,
"learning_rate": 4.2853336885573465e-07,
"loss": 80.7553,
"step": 5125
},
{
"epoch": 0.6149420120471096,
"grad_norm": 106.3125,
"learning_rate": 4.27867323831091e-07,
"loss": 82.0579,
"step": 5130
},
{
"epoch": 0.6155413707333154,
"grad_norm": 104.125,
"learning_rate": 4.272012788064473e-07,
"loss": 81.464,
"step": 5135
},
{
"epoch": 0.6161407294195211,
"grad_norm": 104.9375,
"learning_rate": 4.265352337818036e-07,
"loss": 81.48,
"step": 5140
},
{
"epoch": 0.6167400881057269,
"grad_norm": 101.5,
"learning_rate": 4.2586918875716e-07,
"loss": 80.81,
"step": 5145
},
{
"epoch": 0.6173394467919326,
"grad_norm": 108.5625,
"learning_rate": 4.252031437325163e-07,
"loss": 82.4736,
"step": 5150
},
{
"epoch": 0.6179388054781384,
"grad_norm": 105.375,
"learning_rate": 4.2453709870787266e-07,
"loss": 80.7478,
"step": 5155
},
{
"epoch": 0.6185381641643442,
"grad_norm": 105.0625,
"learning_rate": 4.2387105368322896e-07,
"loss": 81.2949,
"step": 5160
},
{
"epoch": 0.6191375228505499,
"grad_norm": 102.5,
"learning_rate": 4.232050086585853e-07,
"loss": 82.3897,
"step": 5165
},
{
"epoch": 0.6197368815367557,
"grad_norm": 106.9375,
"learning_rate": 4.2253896363394166e-07,
"loss": 80.5457,
"step": 5170
},
{
"epoch": 0.6203362402229614,
"grad_norm": 106.375,
"learning_rate": 4.2187291860929796e-07,
"loss": 81.6608,
"step": 5175
},
{
"epoch": 0.6209355989091672,
"grad_norm": 108.8125,
"learning_rate": 4.212068735846543e-07,
"loss": 80.4467,
"step": 5180
},
{
"epoch": 0.6215349575953729,
"grad_norm": 105.4375,
"learning_rate": 4.2054082856001066e-07,
"loss": 81.4107,
"step": 5185
},
{
"epoch": 0.6221343162815787,
"grad_norm": 105.1875,
"learning_rate": 4.1987478353536696e-07,
"loss": 79.603,
"step": 5190
},
{
"epoch": 0.6227336749677844,
"grad_norm": 107.875,
"learning_rate": 4.192087385107233e-07,
"loss": 79.1541,
"step": 5195
},
{
"epoch": 0.6233330336539902,
"grad_norm": 108.625,
"learning_rate": 4.185426934860796e-07,
"loss": 81.4973,
"step": 5200
},
{
"epoch": 0.623932392340196,
"grad_norm": 104.6875,
"learning_rate": 4.17876648461436e-07,
"loss": 81.097,
"step": 5205
},
{
"epoch": 0.6245317510264018,
"grad_norm": 106.9375,
"learning_rate": 4.172106034367923e-07,
"loss": 80.2684,
"step": 5210
},
{
"epoch": 0.6251311097126075,
"grad_norm": 105.625,
"learning_rate": 4.165445584121486e-07,
"loss": 80.8177,
"step": 5215
},
{
"epoch": 0.6257304683988133,
"grad_norm": 103.25,
"learning_rate": 4.1587851338750497e-07,
"loss": 81.7705,
"step": 5220
},
{
"epoch": 0.626329827085019,
"grad_norm": 107.0,
"learning_rate": 4.152124683628613e-07,
"loss": 81.2222,
"step": 5225
},
{
"epoch": 0.6269291857712248,
"grad_norm": 106.625,
"learning_rate": 4.1454642333821767e-07,
"loss": 81.7065,
"step": 5230
},
{
"epoch": 0.6275285444574306,
"grad_norm": 105.0625,
"learning_rate": 4.1388037831357397e-07,
"loss": 80.9833,
"step": 5235
},
{
"epoch": 0.6281279031436363,
"grad_norm": 107.625,
"learning_rate": 4.1321433328893027e-07,
"loss": 80.2564,
"step": 5240
},
{
"epoch": 0.6287272618298421,
"grad_norm": 104.875,
"learning_rate": 4.125482882642867e-07,
"loss": 80.1978,
"step": 5245
},
{
"epoch": 0.6293266205160478,
"grad_norm": 107.4375,
"learning_rate": 4.11882243239643e-07,
"loss": 80.0157,
"step": 5250
},
{
"epoch": 0.6299259792022536,
"grad_norm": 106.5625,
"learning_rate": 4.1121619821499933e-07,
"loss": 79.8329,
"step": 5255
},
{
"epoch": 0.6305253378884593,
"grad_norm": 104.9375,
"learning_rate": 4.105501531903556e-07,
"loss": 80.725,
"step": 5260
},
{
"epoch": 0.6311246965746651,
"grad_norm": 104.6875,
"learning_rate": 4.0988410816571203e-07,
"loss": 81.1629,
"step": 5265
},
{
"epoch": 0.6317240552608708,
"grad_norm": 106.875,
"learning_rate": 4.0921806314106833e-07,
"loss": 79.0839,
"step": 5270
},
{
"epoch": 0.6323234139470766,
"grad_norm": 102.375,
"learning_rate": 4.0855201811642463e-07,
"loss": 81.4344,
"step": 5275
},
{
"epoch": 0.6329227726332823,
"grad_norm": 103.125,
"learning_rate": 4.07885973091781e-07,
"loss": 79.7944,
"step": 5280
},
{
"epoch": 0.6335221313194882,
"grad_norm": 107.6875,
"learning_rate": 4.0721992806713733e-07,
"loss": 80.6244,
"step": 5285
},
{
"epoch": 0.634121490005694,
"grad_norm": 105.1875,
"learning_rate": 4.065538830424937e-07,
"loss": 82.5008,
"step": 5290
},
{
"epoch": 0.6347208486918997,
"grad_norm": 102.625,
"learning_rate": 4.0588783801785e-07,
"loss": 80.2262,
"step": 5295
},
{
"epoch": 0.6353202073781055,
"grad_norm": 103.875,
"learning_rate": 4.052217929932063e-07,
"loss": 80.5795,
"step": 5300
},
{
"epoch": 0.6359195660643112,
"grad_norm": 106.5,
"learning_rate": 4.045557479685627e-07,
"loss": 80.6724,
"step": 5305
},
{
"epoch": 0.636518924750517,
"grad_norm": 108.3125,
"learning_rate": 4.03889702943919e-07,
"loss": 80.1853,
"step": 5310
},
{
"epoch": 0.6371182834367227,
"grad_norm": 106.6875,
"learning_rate": 4.0322365791927534e-07,
"loss": 81.406,
"step": 5315
},
{
"epoch": 0.6377176421229285,
"grad_norm": 105.25,
"learning_rate": 4.0255761289463164e-07,
"loss": 79.3839,
"step": 5320
},
{
"epoch": 0.6383170008091342,
"grad_norm": 105.0,
"learning_rate": 4.01891567869988e-07,
"loss": 81.6353,
"step": 5325
},
{
"epoch": 0.63891635949534,
"grad_norm": 104.5,
"learning_rate": 4.0122552284534434e-07,
"loss": 79.2809,
"step": 5330
},
{
"epoch": 0.6395157181815457,
"grad_norm": 103.4375,
"learning_rate": 4.0055947782070064e-07,
"loss": 80.5541,
"step": 5335
},
{
"epoch": 0.6401150768677515,
"grad_norm": 107.3125,
"learning_rate": 3.99893432796057e-07,
"loss": 80.4581,
"step": 5340
},
{
"epoch": 0.6407144355539572,
"grad_norm": 106.5625,
"learning_rate": 3.9922738777141335e-07,
"loss": 79.9925,
"step": 5345
},
{
"epoch": 0.641313794240163,
"grad_norm": 106.1875,
"learning_rate": 3.9856134274676965e-07,
"loss": 80.1636,
"step": 5350
},
{
"epoch": 0.6419131529263687,
"grad_norm": 105.3125,
"learning_rate": 3.97895297722126e-07,
"loss": 80.483,
"step": 5355
},
{
"epoch": 0.6425125116125745,
"grad_norm": 107.8125,
"learning_rate": 3.972292526974823e-07,
"loss": 80.7774,
"step": 5360
},
{
"epoch": 0.6431118702987803,
"grad_norm": 107.375,
"learning_rate": 3.965632076728387e-07,
"loss": 81.4123,
"step": 5365
},
{
"epoch": 0.6437112289849861,
"grad_norm": 104.875,
"learning_rate": 3.95897162648195e-07,
"loss": 80.1176,
"step": 5370
},
{
"epoch": 0.6443105876711919,
"grad_norm": 105.8125,
"learning_rate": 3.952311176235513e-07,
"loss": 80.2847,
"step": 5375
},
{
"epoch": 0.6449099463573976,
"grad_norm": 105.0625,
"learning_rate": 3.9456507259890765e-07,
"loss": 80.2917,
"step": 5380
},
{
"epoch": 0.6455093050436034,
"grad_norm": 105.125,
"learning_rate": 3.93899027574264e-07,
"loss": 80.3692,
"step": 5385
},
{
"epoch": 0.6461086637298091,
"grad_norm": 105.8125,
"learning_rate": 3.9323298254962036e-07,
"loss": 79.7248,
"step": 5390
},
{
"epoch": 0.6467080224160149,
"grad_norm": 109.75,
"learning_rate": 3.9256693752497665e-07,
"loss": 79.5496,
"step": 5395
},
{
"epoch": 0.6473073811022206,
"grad_norm": 105.5625,
"learning_rate": 3.91900892500333e-07,
"loss": 79.3702,
"step": 5400
},
{
"epoch": 0.6479067397884264,
"grad_norm": 104.75,
"learning_rate": 3.9123484747568936e-07,
"loss": 81.0996,
"step": 5405
},
{
"epoch": 0.6485060984746321,
"grad_norm": 104.75,
"learning_rate": 3.9056880245104566e-07,
"loss": 79.2324,
"step": 5410
},
{
"epoch": 0.6491054571608379,
"grad_norm": 108.1875,
"learning_rate": 3.89902757426402e-07,
"loss": 80.0891,
"step": 5415
},
{
"epoch": 0.6497048158470436,
"grad_norm": 107.375,
"learning_rate": 3.892367124017583e-07,
"loss": 79.303,
"step": 5420
},
{
"epoch": 0.6503041745332494,
"grad_norm": 107.1875,
"learning_rate": 3.885706673771147e-07,
"loss": 80.5127,
"step": 5425
},
{
"epoch": 0.6509035332194552,
"grad_norm": 102.9375,
"learning_rate": 3.87904622352471e-07,
"loss": 80.0789,
"step": 5430
},
{
"epoch": 0.6515028919056609,
"grad_norm": 104.625,
"learning_rate": 3.872385773278273e-07,
"loss": 80.4981,
"step": 5435
},
{
"epoch": 0.6521022505918667,
"grad_norm": 105.6875,
"learning_rate": 3.8657253230318366e-07,
"loss": 79.4556,
"step": 5440
},
{
"epoch": 0.6527016092780724,
"grad_norm": 104.625,
"learning_rate": 3.8590648727854e-07,
"loss": 81.356,
"step": 5445
},
{
"epoch": 0.6533009679642782,
"grad_norm": 105.125,
"learning_rate": 3.8524044225389637e-07,
"loss": 80.1964,
"step": 5450
},
{
"epoch": 0.653900326650484,
"grad_norm": 105.6875,
"learning_rate": 3.8457439722925267e-07,
"loss": 80.0771,
"step": 5455
},
{
"epoch": 0.6544996853366898,
"grad_norm": 106.0625,
"learning_rate": 3.8390835220460897e-07,
"loss": 81.1686,
"step": 5460
},
{
"epoch": 0.6550990440228955,
"grad_norm": 107.1875,
"learning_rate": 3.8324230717996537e-07,
"loss": 79.9953,
"step": 5465
},
{
"epoch": 0.6556984027091013,
"grad_norm": 109.6875,
"learning_rate": 3.8257626215532167e-07,
"loss": 80.6024,
"step": 5470
},
{
"epoch": 0.656297761395307,
"grad_norm": 103.625,
"learning_rate": 3.81910217130678e-07,
"loss": 81.1354,
"step": 5475
},
{
"epoch": 0.6568971200815128,
"grad_norm": 106.8125,
"learning_rate": 3.812441721060343e-07,
"loss": 79.0769,
"step": 5480
},
{
"epoch": 0.6574964787677186,
"grad_norm": 104.75,
"learning_rate": 3.805781270813907e-07,
"loss": 80.8966,
"step": 5485
},
{
"epoch": 0.6580958374539243,
"grad_norm": 105.25,
"learning_rate": 3.79912082056747e-07,
"loss": 78.8852,
"step": 5490
},
{
"epoch": 0.6586951961401301,
"grad_norm": 107.25,
"learning_rate": 3.792460370321033e-07,
"loss": 80.7936,
"step": 5495
},
{
"epoch": 0.6592945548263358,
"grad_norm": 106.6875,
"learning_rate": 3.7857999200745973e-07,
"loss": 79.082,
"step": 5500
},
{
"epoch": 0.6592945548263358,
"eval_loss": 2.5001039505004883,
"eval_runtime": 405.6723,
"eval_samples_per_second": 1108.291,
"eval_steps_per_second": 34.636,
"step": 5500
},
{
"epoch": 0.6598939135125416,
"grad_norm": 104.375,
"learning_rate": 3.7791394698281603e-07,
"loss": 79.8162,
"step": 5505
},
{
"epoch": 0.6604932721987473,
"grad_norm": 108.25,
"learning_rate": 3.7724790195817233e-07,
"loss": 80.6216,
"step": 5510
},
{
"epoch": 0.6610926308849531,
"grad_norm": 106.25,
"learning_rate": 3.765818569335287e-07,
"loss": 80.1501,
"step": 5515
},
{
"epoch": 0.6616919895711588,
"grad_norm": 105.9375,
"learning_rate": 3.7591581190888503e-07,
"loss": 80.254,
"step": 5520
},
{
"epoch": 0.6622913482573646,
"grad_norm": 105.9375,
"learning_rate": 3.752497668842414e-07,
"loss": 80.2296,
"step": 5525
},
{
"epoch": 0.6628907069435703,
"grad_norm": 104.75,
"learning_rate": 3.745837218595977e-07,
"loss": 79.7098,
"step": 5530
},
{
"epoch": 0.6634900656297762,
"grad_norm": 105.9375,
"learning_rate": 3.7391767683495404e-07,
"loss": 80.7595,
"step": 5535
},
{
"epoch": 0.664089424315982,
"grad_norm": 106.0,
"learning_rate": 3.732516318103104e-07,
"loss": 79.3832,
"step": 5540
},
{
"epoch": 0.6646887830021877,
"grad_norm": 104.0,
"learning_rate": 3.725855867856667e-07,
"loss": 80.1191,
"step": 5545
},
{
"epoch": 0.6652881416883935,
"grad_norm": 104.0625,
"learning_rate": 3.7191954176102304e-07,
"loss": 79.0603,
"step": 5550
},
{
"epoch": 0.6658875003745992,
"grad_norm": 109.0,
"learning_rate": 3.7125349673637934e-07,
"loss": 79.1256,
"step": 5555
},
{
"epoch": 0.666486859060805,
"grad_norm": 104.5625,
"learning_rate": 3.7058745171173574e-07,
"loss": 79.9635,
"step": 5560
},
{
"epoch": 0.6670862177470107,
"grad_norm": 105.4375,
"learning_rate": 3.6992140668709204e-07,
"loss": 80.7606,
"step": 5565
},
{
"epoch": 0.6676855764332165,
"grad_norm": 103.375,
"learning_rate": 3.6925536166244834e-07,
"loss": 78.173,
"step": 5570
},
{
"epoch": 0.6682849351194222,
"grad_norm": 103.125,
"learning_rate": 3.685893166378047e-07,
"loss": 79.259,
"step": 5575
},
{
"epoch": 0.668884293805628,
"grad_norm": 107.5625,
"learning_rate": 3.6792327161316104e-07,
"loss": 80.321,
"step": 5580
},
{
"epoch": 0.6694836524918337,
"grad_norm": 104.0,
"learning_rate": 3.672572265885174e-07,
"loss": 79.958,
"step": 5585
},
{
"epoch": 0.6700830111780395,
"grad_norm": 106.25,
"learning_rate": 3.665911815638737e-07,
"loss": 79.1322,
"step": 5590
},
{
"epoch": 0.6706823698642452,
"grad_norm": 106.125,
"learning_rate": 3.6592513653923e-07,
"loss": 79.3344,
"step": 5595
},
{
"epoch": 0.671281728550451,
"grad_norm": 108.0,
"learning_rate": 3.652590915145864e-07,
"loss": 79.35,
"step": 5600
},
{
"epoch": 0.6718810872366567,
"grad_norm": 101.375,
"learning_rate": 3.645930464899427e-07,
"loss": 78.8445,
"step": 5605
},
{
"epoch": 0.6724804459228625,
"grad_norm": 104.5625,
"learning_rate": 3.6392700146529905e-07,
"loss": 80.2318,
"step": 5610
},
{
"epoch": 0.6730798046090682,
"grad_norm": 105.8125,
"learning_rate": 3.6326095644065535e-07,
"loss": 79.9447,
"step": 5615
},
{
"epoch": 0.6736791632952741,
"grad_norm": 103.5625,
"learning_rate": 3.625949114160117e-07,
"loss": 79.6821,
"step": 5620
},
{
"epoch": 0.6742785219814799,
"grad_norm": 105.625,
"learning_rate": 3.6192886639136805e-07,
"loss": 79.4833,
"step": 5625
},
{
"epoch": 0.6748778806676856,
"grad_norm": 103.5625,
"learning_rate": 3.6126282136672435e-07,
"loss": 80.9992,
"step": 5630
},
{
"epoch": 0.6754772393538914,
"grad_norm": 106.6875,
"learning_rate": 3.605967763420807e-07,
"loss": 80.4254,
"step": 5635
},
{
"epoch": 0.6760765980400971,
"grad_norm": 105.625,
"learning_rate": 3.5993073131743706e-07,
"loss": 79.7526,
"step": 5640
},
{
"epoch": 0.6766759567263029,
"grad_norm": 107.75,
"learning_rate": 3.592646862927934e-07,
"loss": 78.7197,
"step": 5645
},
{
"epoch": 0.6772753154125086,
"grad_norm": 104.375,
"learning_rate": 3.585986412681497e-07,
"loss": 79.867,
"step": 5650
},
{
"epoch": 0.6778746740987144,
"grad_norm": 104.0,
"learning_rate": 3.57932596243506e-07,
"loss": 79.7559,
"step": 5655
},
{
"epoch": 0.6784740327849201,
"grad_norm": 104.875,
"learning_rate": 3.572665512188624e-07,
"loss": 79.7349,
"step": 5660
},
{
"epoch": 0.6790733914711259,
"grad_norm": 106.6875,
"learning_rate": 3.566005061942187e-07,
"loss": 80.9384,
"step": 5665
},
{
"epoch": 0.6796727501573316,
"grad_norm": 107.625,
"learning_rate": 3.5593446116957506e-07,
"loss": 79.7798,
"step": 5670
},
{
"epoch": 0.6802721088435374,
"grad_norm": 107.5625,
"learning_rate": 3.5526841614493136e-07,
"loss": 79.8944,
"step": 5675
},
{
"epoch": 0.6808714675297431,
"grad_norm": 104.3125,
"learning_rate": 3.546023711202877e-07,
"loss": 79.2222,
"step": 5680
},
{
"epoch": 0.6814708262159489,
"grad_norm": 105.0,
"learning_rate": 3.5393632609564407e-07,
"loss": 79.4649,
"step": 5685
},
{
"epoch": 0.6820701849021547,
"grad_norm": 106.1875,
"learning_rate": 3.5327028107100037e-07,
"loss": 79.6086,
"step": 5690
},
{
"epoch": 0.6826695435883604,
"grad_norm": 105.9375,
"learning_rate": 3.526042360463567e-07,
"loss": 81.1845,
"step": 5695
},
{
"epoch": 0.6832689022745662,
"grad_norm": 104.5625,
"learning_rate": 3.5193819102171307e-07,
"loss": 80.7964,
"step": 5700
},
{
"epoch": 0.683868260960772,
"grad_norm": 106.8125,
"learning_rate": 3.5127214599706937e-07,
"loss": 80.6059,
"step": 5705
},
{
"epoch": 0.6844676196469778,
"grad_norm": 107.875,
"learning_rate": 3.506061009724257e-07,
"loss": 79.1676,
"step": 5710
},
{
"epoch": 0.6850669783331835,
"grad_norm": 107.375,
"learning_rate": 3.49940055947782e-07,
"loss": 78.7386,
"step": 5715
},
{
"epoch": 0.6856663370193893,
"grad_norm": 107.4375,
"learning_rate": 3.492740109231384e-07,
"loss": 80.582,
"step": 5720
},
{
"epoch": 0.686265695705595,
"grad_norm": 106.0,
"learning_rate": 3.486079658984947e-07,
"loss": 78.4127,
"step": 5725
},
{
"epoch": 0.6868650543918008,
"grad_norm": 103.375,
"learning_rate": 3.47941920873851e-07,
"loss": 79.0533,
"step": 5730
},
{
"epoch": 0.6874644130780065,
"grad_norm": 105.125,
"learning_rate": 3.472758758492074e-07,
"loss": 80.4304,
"step": 5735
},
{
"epoch": 0.6880637717642123,
"grad_norm": 106.125,
"learning_rate": 3.4660983082456373e-07,
"loss": 80.7424,
"step": 5740
},
{
"epoch": 0.688663130450418,
"grad_norm": 107.0,
"learning_rate": 3.459437857999201e-07,
"loss": 80.3244,
"step": 5745
},
{
"epoch": 0.6892624891366238,
"grad_norm": 103.1875,
"learning_rate": 3.452777407752764e-07,
"loss": 79.961,
"step": 5750
},
{
"epoch": 0.6898618478228296,
"grad_norm": 104.9375,
"learning_rate": 3.446116957506327e-07,
"loss": 78.8729,
"step": 5755
},
{
"epoch": 0.6904612065090353,
"grad_norm": 109.0625,
"learning_rate": 3.439456507259891e-07,
"loss": 80.4733,
"step": 5760
},
{
"epoch": 0.6910605651952411,
"grad_norm": 106.4375,
"learning_rate": 3.432796057013454e-07,
"loss": 78.2223,
"step": 5765
},
{
"epoch": 0.6916599238814468,
"grad_norm": 106.3125,
"learning_rate": 3.4261356067670173e-07,
"loss": 80.0241,
"step": 5770
},
{
"epoch": 0.6922592825676526,
"grad_norm": 103.6875,
"learning_rate": 3.4194751565205803e-07,
"loss": 78.9867,
"step": 5775
},
{
"epoch": 0.6928586412538583,
"grad_norm": 105.75,
"learning_rate": 3.4128147062741444e-07,
"loss": 78.7172,
"step": 5780
},
{
"epoch": 0.6934579999400642,
"grad_norm": 104.25,
"learning_rate": 3.4061542560277074e-07,
"loss": 79.7752,
"step": 5785
},
{
"epoch": 0.6940573586262699,
"grad_norm": 106.5625,
"learning_rate": 3.3994938057812704e-07,
"loss": 79.1381,
"step": 5790
},
{
"epoch": 0.6946567173124757,
"grad_norm": 105.3125,
"learning_rate": 3.392833355534834e-07,
"loss": 78.426,
"step": 5795
},
{
"epoch": 0.6952560759986814,
"grad_norm": 106.8125,
"learning_rate": 3.3861729052883974e-07,
"loss": 79.6124,
"step": 5800
},
{
"epoch": 0.6958554346848872,
"grad_norm": 104.875,
"learning_rate": 3.379512455041961e-07,
"loss": 79.5376,
"step": 5805
},
{
"epoch": 0.696454793371093,
"grad_norm": 105.0625,
"learning_rate": 3.372852004795524e-07,
"loss": 77.9224,
"step": 5810
},
{
"epoch": 0.6970541520572987,
"grad_norm": 104.125,
"learning_rate": 3.366191554549087e-07,
"loss": 79.9884,
"step": 5815
},
{
"epoch": 0.6976535107435045,
"grad_norm": 105.5,
"learning_rate": 3.359531104302651e-07,
"loss": 80.5143,
"step": 5820
},
{
"epoch": 0.6982528694297102,
"grad_norm": 108.8125,
"learning_rate": 3.352870654056214e-07,
"loss": 79.1192,
"step": 5825
},
{
"epoch": 0.698852228115916,
"grad_norm": 106.125,
"learning_rate": 3.3462102038097775e-07,
"loss": 79.9099,
"step": 5830
},
{
"epoch": 0.6994515868021217,
"grad_norm": 106.3125,
"learning_rate": 3.3395497535633405e-07,
"loss": 79.8999,
"step": 5835
},
{
"epoch": 0.7000509454883275,
"grad_norm": 103.75,
"learning_rate": 3.332889303316904e-07,
"loss": 79.162,
"step": 5840
},
{
"epoch": 0.7006503041745332,
"grad_norm": 105.625,
"learning_rate": 3.3262288530704675e-07,
"loss": 78.1294,
"step": 5845
},
{
"epoch": 0.701249662860739,
"grad_norm": 105.125,
"learning_rate": 3.3195684028240305e-07,
"loss": 78.6101,
"step": 5850
},
{
"epoch": 0.7018490215469447,
"grad_norm": 105.375,
"learning_rate": 3.3129079525775945e-07,
"loss": 77.9655,
"step": 5855
},
{
"epoch": 0.7024483802331505,
"grad_norm": 107.9375,
"learning_rate": 3.3062475023311575e-07,
"loss": 77.3438,
"step": 5860
},
{
"epoch": 0.7030477389193562,
"grad_norm": 105.5,
"learning_rate": 3.2995870520847205e-07,
"loss": 78.5834,
"step": 5865
},
{
"epoch": 0.7036470976055621,
"grad_norm": 105.8125,
"learning_rate": 3.292926601838284e-07,
"loss": 77.6612,
"step": 5870
},
{
"epoch": 0.7042464562917679,
"grad_norm": 103.375,
"learning_rate": 3.2862661515918476e-07,
"loss": 77.863,
"step": 5875
},
{
"epoch": 0.7048458149779736,
"grad_norm": 103.625,
"learning_rate": 3.279605701345411e-07,
"loss": 80.0449,
"step": 5880
},
{
"epoch": 0.7054451736641794,
"grad_norm": 109.125,
"learning_rate": 3.272945251098974e-07,
"loss": 78.6147,
"step": 5885
},
{
"epoch": 0.7060445323503851,
"grad_norm": 104.75,
"learning_rate": 3.266284800852537e-07,
"loss": 79.8518,
"step": 5890
},
{
"epoch": 0.7066438910365909,
"grad_norm": 104.0,
"learning_rate": 3.259624350606101e-07,
"loss": 79.7573,
"step": 5895
},
{
"epoch": 0.7072432497227966,
"grad_norm": 108.0,
"learning_rate": 3.252963900359664e-07,
"loss": 79.0408,
"step": 5900
},
{
"epoch": 0.7078426084090024,
"grad_norm": 107.0625,
"learning_rate": 3.2463034501132276e-07,
"loss": 78.054,
"step": 5905
},
{
"epoch": 0.7084419670952081,
"grad_norm": 109.4375,
"learning_rate": 3.2396429998667906e-07,
"loss": 78.9491,
"step": 5910
},
{
"epoch": 0.7090413257814139,
"grad_norm": 105.6875,
"learning_rate": 3.2329825496203547e-07,
"loss": 78.7078,
"step": 5915
},
{
"epoch": 0.7096406844676196,
"grad_norm": 107.25,
"learning_rate": 3.2263220993739177e-07,
"loss": 79.0629,
"step": 5920
},
{
"epoch": 0.7102400431538254,
"grad_norm": 106.875,
"learning_rate": 3.2196616491274806e-07,
"loss": 77.7746,
"step": 5925
},
{
"epoch": 0.7108394018400311,
"grad_norm": 108.375,
"learning_rate": 3.213001198881044e-07,
"loss": 78.0724,
"step": 5930
},
{
"epoch": 0.7114387605262369,
"grad_norm": 106.125,
"learning_rate": 3.2063407486346077e-07,
"loss": 78.7447,
"step": 5935
},
{
"epoch": 0.7120381192124426,
"grad_norm": 106.0625,
"learning_rate": 3.199680298388171e-07,
"loss": 77.4366,
"step": 5940
},
{
"epoch": 0.7126374778986484,
"grad_norm": 105.125,
"learning_rate": 3.193019848141734e-07,
"loss": 79.7885,
"step": 5945
},
{
"epoch": 0.7132368365848543,
"grad_norm": 102.8125,
"learning_rate": 3.186359397895297e-07,
"loss": 78.1937,
"step": 5950
},
{
"epoch": 0.71383619527106,
"grad_norm": 105.875,
"learning_rate": 3.179698947648861e-07,
"loss": 80.1111,
"step": 5955
},
{
"epoch": 0.7144355539572658,
"grad_norm": 105.8125,
"learning_rate": 3.173038497402424e-07,
"loss": 78.5599,
"step": 5960
},
{
"epoch": 0.7150349126434715,
"grad_norm": 106.25,
"learning_rate": 3.166378047155988e-07,
"loss": 78.213,
"step": 5965
},
{
"epoch": 0.7156342713296773,
"grad_norm": 108.3125,
"learning_rate": 3.159717596909551e-07,
"loss": 79.0787,
"step": 5970
},
{
"epoch": 0.716233630015883,
"grad_norm": 105.0,
"learning_rate": 3.153057146663114e-07,
"loss": 79.603,
"step": 5975
},
{
"epoch": 0.7168329887020888,
"grad_norm": 106.0,
"learning_rate": 3.146396696416678e-07,
"loss": 78.3208,
"step": 5980
},
{
"epoch": 0.7174323473882945,
"grad_norm": 109.4375,
"learning_rate": 3.139736246170241e-07,
"loss": 79.0942,
"step": 5985
},
{
"epoch": 0.7180317060745003,
"grad_norm": 107.25,
"learning_rate": 3.1330757959238043e-07,
"loss": 78.9856,
"step": 5990
},
{
"epoch": 0.718631064760706,
"grad_norm": 104.4375,
"learning_rate": 3.126415345677368e-07,
"loss": 79.374,
"step": 5995
},
{
"epoch": 0.7192304234469118,
"grad_norm": 106.4375,
"learning_rate": 3.119754895430931e-07,
"loss": 78.3493,
"step": 6000
},
{
"epoch": 0.7192304234469118,
"eval_loss": 2.469325542449951,
"eval_runtime": 404.8602,
"eval_samples_per_second": 1110.514,
"eval_steps_per_second": 34.706,
"step": 6000
},
{
"epoch": 0.7198297821331175,
"grad_norm": 106.875,
"learning_rate": 3.1130944451844943e-07,
"loss": 77.937,
"step": 6005
},
{
"epoch": 0.7204291408193233,
"grad_norm": 107.0625,
"learning_rate": 3.1064339949380573e-07,
"loss": 79.7142,
"step": 6010
},
{
"epoch": 0.721028499505529,
"grad_norm": 108.875,
"learning_rate": 3.0997735446916214e-07,
"loss": 78.1682,
"step": 6015
},
{
"epoch": 0.7216278581917348,
"grad_norm": 106.75,
"learning_rate": 3.0931130944451844e-07,
"loss": 77.3161,
"step": 6020
},
{
"epoch": 0.7222272168779406,
"grad_norm": 105.5,
"learning_rate": 3.086452644198748e-07,
"loss": 79.6543,
"step": 6025
},
{
"epoch": 0.7228265755641463,
"grad_norm": 112.125,
"learning_rate": 3.079792193952311e-07,
"loss": 78.0319,
"step": 6030
},
{
"epoch": 0.7234259342503522,
"grad_norm": 105.875,
"learning_rate": 3.0731317437058744e-07,
"loss": 78.8495,
"step": 6035
},
{
"epoch": 0.7240252929365579,
"grad_norm": 104.3125,
"learning_rate": 3.066471293459438e-07,
"loss": 78.3328,
"step": 6040
},
{
"epoch": 0.7246246516227637,
"grad_norm": 104.375,
"learning_rate": 3.059810843213001e-07,
"loss": 78.7412,
"step": 6045
},
{
"epoch": 0.7252240103089694,
"grad_norm": 107.25,
"learning_rate": 3.0531503929665644e-07,
"loss": 79.4384,
"step": 6050
},
{
"epoch": 0.7258233689951752,
"grad_norm": 103.0,
"learning_rate": 3.046489942720128e-07,
"loss": 77.85,
"step": 6055
},
{
"epoch": 0.7264227276813809,
"grad_norm": 107.3125,
"learning_rate": 3.039829492473691e-07,
"loss": 78.9541,
"step": 6060
},
{
"epoch": 0.7270220863675867,
"grad_norm": 105.4375,
"learning_rate": 3.0331690422272545e-07,
"loss": 80.1152,
"step": 6065
},
{
"epoch": 0.7276214450537924,
"grad_norm": 106.8125,
"learning_rate": 3.0265085919808174e-07,
"loss": 79.0895,
"step": 6070
},
{
"epoch": 0.7282208037399982,
"grad_norm": 108.0625,
"learning_rate": 3.0198481417343815e-07,
"loss": 79.5219,
"step": 6075
},
{
"epoch": 0.728820162426204,
"grad_norm": 107.25,
"learning_rate": 3.0131876914879445e-07,
"loss": 78.7073,
"step": 6080
},
{
"epoch": 0.7294195211124097,
"grad_norm": 106.875,
"learning_rate": 3.0065272412415075e-07,
"loss": 78.4252,
"step": 6085
},
{
"epoch": 0.7300188797986155,
"grad_norm": 106.125,
"learning_rate": 2.999866790995071e-07,
"loss": 80.2118,
"step": 6090
},
{
"epoch": 0.7306182384848212,
"grad_norm": 109.125,
"learning_rate": 2.9932063407486345e-07,
"loss": 78.9803,
"step": 6095
},
{
"epoch": 0.731217597171027,
"grad_norm": 106.0625,
"learning_rate": 2.986545890502198e-07,
"loss": 79.6833,
"step": 6100
},
{
"epoch": 0.7318169558572327,
"grad_norm": 107.375,
"learning_rate": 2.979885440255761e-07,
"loss": 78.3041,
"step": 6105
},
{
"epoch": 0.7324163145434385,
"grad_norm": 108.3125,
"learning_rate": 2.973224990009324e-07,
"loss": 77.6695,
"step": 6110
},
{
"epoch": 0.7330156732296442,
"grad_norm": 106.3125,
"learning_rate": 2.966564539762888e-07,
"loss": 78.813,
"step": 6115
},
{
"epoch": 0.7336150319158501,
"grad_norm": 104.625,
"learning_rate": 2.959904089516451e-07,
"loss": 78.1065,
"step": 6120
},
{
"epoch": 0.7342143906020558,
"grad_norm": 102.25,
"learning_rate": 2.9532436392700146e-07,
"loss": 78.4899,
"step": 6125
},
{
"epoch": 0.7348137492882616,
"grad_norm": 110.1875,
"learning_rate": 2.9465831890235776e-07,
"loss": 79.3901,
"step": 6130
},
{
"epoch": 0.7354131079744674,
"grad_norm": 105.875,
"learning_rate": 2.9399227387771416e-07,
"loss": 79.4775,
"step": 6135
},
{
"epoch": 0.7360124666606731,
"grad_norm": 108.625,
"learning_rate": 2.9332622885307046e-07,
"loss": 78.8485,
"step": 6140
},
{
"epoch": 0.7366118253468789,
"grad_norm": 106.125,
"learning_rate": 2.9266018382842676e-07,
"loss": 78.7774,
"step": 6145
},
{
"epoch": 0.7372111840330846,
"grad_norm": 101.625,
"learning_rate": 2.919941388037831e-07,
"loss": 77.9222,
"step": 6150
},
{
"epoch": 0.7378105427192904,
"grad_norm": 106.0625,
"learning_rate": 2.9132809377913946e-07,
"loss": 78.3709,
"step": 6155
},
{
"epoch": 0.7384099014054961,
"grad_norm": 105.125,
"learning_rate": 2.906620487544958e-07,
"loss": 77.9696,
"step": 6160
},
{
"epoch": 0.7390092600917019,
"grad_norm": 106.5,
"learning_rate": 2.899960037298521e-07,
"loss": 78.3595,
"step": 6165
},
{
"epoch": 0.7396086187779076,
"grad_norm": 104.6875,
"learning_rate": 2.893299587052084e-07,
"loss": 76.3922,
"step": 6170
},
{
"epoch": 0.7402079774641134,
"grad_norm": 105.25,
"learning_rate": 2.886639136805648e-07,
"loss": 78.9993,
"step": 6175
},
{
"epoch": 0.7408073361503191,
"grad_norm": 103.6875,
"learning_rate": 2.879978686559211e-07,
"loss": 78.1283,
"step": 6180
},
{
"epoch": 0.7414066948365249,
"grad_norm": 105.5,
"learning_rate": 2.8733182363127747e-07,
"loss": 78.5354,
"step": 6185
},
{
"epoch": 0.7420060535227306,
"grad_norm": 111.375,
"learning_rate": 2.8666577860663377e-07,
"loss": 78.5119,
"step": 6190
},
{
"epoch": 0.7426054122089364,
"grad_norm": 104.75,
"learning_rate": 2.859997335819901e-07,
"loss": 79.4887,
"step": 6195
},
{
"epoch": 0.7432047708951423,
"grad_norm": 105.1875,
"learning_rate": 2.853336885573465e-07,
"loss": 77.8327,
"step": 6200
},
{
"epoch": 0.743804129581348,
"grad_norm": 108.75,
"learning_rate": 2.8466764353270277e-07,
"loss": 78.3946,
"step": 6205
},
{
"epoch": 0.7444034882675538,
"grad_norm": 105.875,
"learning_rate": 2.840015985080592e-07,
"loss": 79.9381,
"step": 6210
},
{
"epoch": 0.7450028469537595,
"grad_norm": 105.6875,
"learning_rate": 2.833355534834155e-07,
"loss": 79.5875,
"step": 6215
},
{
"epoch": 0.7456022056399653,
"grad_norm": 102.9375,
"learning_rate": 2.826695084587718e-07,
"loss": 77.8823,
"step": 6220
},
{
"epoch": 0.746201564326171,
"grad_norm": 106.875,
"learning_rate": 2.8200346343412813e-07,
"loss": 78.1676,
"step": 6225
},
{
"epoch": 0.7468009230123768,
"grad_norm": 106.0,
"learning_rate": 2.813374184094845e-07,
"loss": 77.9816,
"step": 6230
},
{
"epoch": 0.7474002816985825,
"grad_norm": 106.3125,
"learning_rate": 2.8067137338484083e-07,
"loss": 78.3035,
"step": 6235
},
{
"epoch": 0.7479996403847883,
"grad_norm": 106.5,
"learning_rate": 2.8000532836019713e-07,
"loss": 80.3571,
"step": 6240
},
{
"epoch": 0.748598999070994,
"grad_norm": 106.5625,
"learning_rate": 2.7933928333555343e-07,
"loss": 79.3642,
"step": 6245
},
{
"epoch": 0.7491983577571998,
"grad_norm": 105.25,
"learning_rate": 2.7867323831090984e-07,
"loss": 77.0638,
"step": 6250
},
{
"epoch": 0.7497977164434055,
"grad_norm": 109.125,
"learning_rate": 2.7800719328626613e-07,
"loss": 78.2273,
"step": 6255
},
{
"epoch": 0.7503970751296113,
"grad_norm": 103.125,
"learning_rate": 2.773411482616225e-07,
"loss": 79.4584,
"step": 6260
},
{
"epoch": 0.750996433815817,
"grad_norm": 105.6875,
"learning_rate": 2.766751032369788e-07,
"loss": 77.9623,
"step": 6265
},
{
"epoch": 0.7515957925020228,
"grad_norm": 108.8125,
"learning_rate": 2.760090582123352e-07,
"loss": 77.3997,
"step": 6270
},
{
"epoch": 0.7521951511882286,
"grad_norm": 108.25,
"learning_rate": 2.753430131876915e-07,
"loss": 78.4736,
"step": 6275
},
{
"epoch": 0.7527945098744343,
"grad_norm": 108.6875,
"learning_rate": 2.746769681630478e-07,
"loss": 77.6046,
"step": 6280
},
{
"epoch": 0.7533938685606402,
"grad_norm": 107.25,
"learning_rate": 2.7401092313840414e-07,
"loss": 79.3138,
"step": 6285
},
{
"epoch": 0.7539932272468459,
"grad_norm": 106.4375,
"learning_rate": 2.733448781137605e-07,
"loss": 78.1768,
"step": 6290
},
{
"epoch": 0.7545925859330517,
"grad_norm": 108.25,
"learning_rate": 2.7267883308911685e-07,
"loss": 79.0016,
"step": 6295
},
{
"epoch": 0.7551919446192574,
"grad_norm": 103.9375,
"learning_rate": 2.7201278806447314e-07,
"loss": 78.4366,
"step": 6300
},
{
"epoch": 0.7557913033054632,
"grad_norm": 109.75,
"learning_rate": 2.7134674303982944e-07,
"loss": 78.4717,
"step": 6305
},
{
"epoch": 0.7563906619916689,
"grad_norm": 105.25,
"learning_rate": 2.7068069801518585e-07,
"loss": 77.4006,
"step": 6310
},
{
"epoch": 0.7569900206778747,
"grad_norm": 108.1875,
"learning_rate": 2.7001465299054215e-07,
"loss": 79.0828,
"step": 6315
},
{
"epoch": 0.7575893793640804,
"grad_norm": 106.4375,
"learning_rate": 2.693486079658985e-07,
"loss": 78.3332,
"step": 6320
},
{
"epoch": 0.7581887380502862,
"grad_norm": 106.25,
"learning_rate": 2.686825629412548e-07,
"loss": 79.0802,
"step": 6325
},
{
"epoch": 0.758788096736492,
"grad_norm": 106.1875,
"learning_rate": 2.6801651791661115e-07,
"loss": 77.4329,
"step": 6330
},
{
"epoch": 0.7593874554226977,
"grad_norm": 103.4375,
"learning_rate": 2.673504728919675e-07,
"loss": 77.6086,
"step": 6335
},
{
"epoch": 0.7599868141089035,
"grad_norm": 107.75,
"learning_rate": 2.666844278673238e-07,
"loss": 77.9435,
"step": 6340
},
{
"epoch": 0.7605861727951092,
"grad_norm": 108.5,
"learning_rate": 2.6601838284268015e-07,
"loss": 78.1191,
"step": 6345
},
{
"epoch": 0.761185531481315,
"grad_norm": 107.0,
"learning_rate": 2.653523378180365e-07,
"loss": 77.3275,
"step": 6350
},
{
"epoch": 0.7617848901675207,
"grad_norm": 105.0625,
"learning_rate": 2.646862927933928e-07,
"loss": 78.2275,
"step": 6355
},
{
"epoch": 0.7623842488537265,
"grad_norm": 107.3125,
"learning_rate": 2.6402024776874916e-07,
"loss": 77.8997,
"step": 6360
},
{
"epoch": 0.7629836075399322,
"grad_norm": 111.0,
"learning_rate": 2.6335420274410546e-07,
"loss": 78.2208,
"step": 6365
},
{
"epoch": 0.7635829662261381,
"grad_norm": 109.375,
"learning_rate": 2.6268815771946186e-07,
"loss": 80.8852,
"step": 6370
},
{
"epoch": 0.7641823249123438,
"grad_norm": 104.0625,
"learning_rate": 2.6202211269481816e-07,
"loss": 77.9402,
"step": 6375
},
{
"epoch": 0.7647816835985496,
"grad_norm": 106.3125,
"learning_rate": 2.6135606767017446e-07,
"loss": 79.3084,
"step": 6380
},
{
"epoch": 0.7653810422847553,
"grad_norm": 108.6875,
"learning_rate": 2.606900226455308e-07,
"loss": 78.8786,
"step": 6385
},
{
"epoch": 0.7659804009709611,
"grad_norm": 109.125,
"learning_rate": 2.6002397762088716e-07,
"loss": 78.2904,
"step": 6390
},
{
"epoch": 0.7665797596571668,
"grad_norm": 104.5,
"learning_rate": 2.593579325962435e-07,
"loss": 77.6137,
"step": 6395
},
{
"epoch": 0.7671791183433726,
"grad_norm": 106.25,
"learning_rate": 2.586918875715998e-07,
"loss": 78.5451,
"step": 6400
},
{
"epoch": 0.7677784770295784,
"grad_norm": 106.375,
"learning_rate": 2.5802584254695617e-07,
"loss": 79.5644,
"step": 6405
},
{
"epoch": 0.7683778357157841,
"grad_norm": 103.8125,
"learning_rate": 2.573597975223125e-07,
"loss": 77.4564,
"step": 6410
},
{
"epoch": 0.7689771944019899,
"grad_norm": 105.3125,
"learning_rate": 2.566937524976688e-07,
"loss": 77.9053,
"step": 6415
},
{
"epoch": 0.7695765530881956,
"grad_norm": 106.5,
"learning_rate": 2.5602770747302517e-07,
"loss": 79.797,
"step": 6420
},
{
"epoch": 0.7701759117744014,
"grad_norm": 107.125,
"learning_rate": 2.5536166244838147e-07,
"loss": 77.8786,
"step": 6425
},
{
"epoch": 0.7707752704606071,
"grad_norm": 106.25,
"learning_rate": 2.546956174237379e-07,
"loss": 79.659,
"step": 6430
},
{
"epoch": 0.7713746291468129,
"grad_norm": 103.875,
"learning_rate": 2.5402957239909417e-07,
"loss": 77.1865,
"step": 6435
},
{
"epoch": 0.7719739878330186,
"grad_norm": 108.8125,
"learning_rate": 2.5336352737445047e-07,
"loss": 77.6755,
"step": 6440
},
{
"epoch": 0.7725733465192244,
"grad_norm": 106.625,
"learning_rate": 2.526974823498068e-07,
"loss": 79.6433,
"step": 6445
},
{
"epoch": 0.7731727052054302,
"grad_norm": 106.8125,
"learning_rate": 2.520314373251632e-07,
"loss": 78.4122,
"step": 6450
},
{
"epoch": 0.773772063891636,
"grad_norm": 107.4375,
"learning_rate": 2.5136539230051953e-07,
"loss": 77.1019,
"step": 6455
},
{
"epoch": 0.7743714225778418,
"grad_norm": 105.1875,
"learning_rate": 2.5069934727587583e-07,
"loss": 79.4794,
"step": 6460
},
{
"epoch": 0.7749707812640475,
"grad_norm": 106.5,
"learning_rate": 2.500333022512321e-07,
"loss": 77.2468,
"step": 6465
},
{
"epoch": 0.7755701399502533,
"grad_norm": 103.5625,
"learning_rate": 2.493672572265885e-07,
"loss": 78.32,
"step": 6470
},
{
"epoch": 0.776169498636459,
"grad_norm": 107.0625,
"learning_rate": 2.4870121220194483e-07,
"loss": 78.8655,
"step": 6475
},
{
"epoch": 0.7767688573226648,
"grad_norm": 106.75,
"learning_rate": 2.480351671773012e-07,
"loss": 77.82,
"step": 6480
},
{
"epoch": 0.7773682160088705,
"grad_norm": 107.125,
"learning_rate": 2.4736912215265753e-07,
"loss": 77.6949,
"step": 6485
},
{
"epoch": 0.7779675746950763,
"grad_norm": 104.75,
"learning_rate": 2.4670307712801383e-07,
"loss": 77.4096,
"step": 6490
},
{
"epoch": 0.778566933381282,
"grad_norm": 108.875,
"learning_rate": 2.460370321033702e-07,
"loss": 78.1701,
"step": 6495
},
{
"epoch": 0.7791662920674878,
"grad_norm": 106.8125,
"learning_rate": 2.453709870787265e-07,
"loss": 77.7304,
"step": 6500
},
{
"epoch": 0.7791662920674878,
"eval_loss": 2.4339072704315186,
"eval_runtime": 403.9267,
"eval_samples_per_second": 1113.081,
"eval_steps_per_second": 34.786,
"step": 6500
},
{
"epoch": 0.7797656507536935,
"grad_norm": 107.1875,
"learning_rate": 2.4470494205408284e-07,
"loss": 78.4344,
"step": 6505
},
{
"epoch": 0.7803650094398993,
"grad_norm": 106.6875,
"learning_rate": 2.440388970294392e-07,
"loss": 78.3033,
"step": 6510
},
{
"epoch": 0.780964368126105,
"grad_norm": 104.375,
"learning_rate": 2.4337285200479554e-07,
"loss": 77.7646,
"step": 6515
},
{
"epoch": 0.7815637268123108,
"grad_norm": 107.875,
"learning_rate": 2.4270680698015184e-07,
"loss": 77.9291,
"step": 6520
},
{
"epoch": 0.7821630854985165,
"grad_norm": 109.0,
"learning_rate": 2.420407619555082e-07,
"loss": 78.5846,
"step": 6525
},
{
"epoch": 0.7827624441847223,
"grad_norm": 108.6875,
"learning_rate": 2.413747169308645e-07,
"loss": 77.4087,
"step": 6530
},
{
"epoch": 0.7833618028709282,
"grad_norm": 107.625,
"learning_rate": 2.4070867190622084e-07,
"loss": 78.0319,
"step": 6535
},
{
"epoch": 0.7839611615571339,
"grad_norm": 109.625,
"learning_rate": 2.400426268815772e-07,
"loss": 77.3543,
"step": 6540
},
{
"epoch": 0.7845605202433397,
"grad_norm": 102.375,
"learning_rate": 2.393765818569335e-07,
"loss": 77.9248,
"step": 6545
},
{
"epoch": 0.7851598789295454,
"grad_norm": 106.6875,
"learning_rate": 2.3871053683228985e-07,
"loss": 77.9559,
"step": 6550
},
{
"epoch": 0.7857592376157512,
"grad_norm": 104.5625,
"learning_rate": 2.380444918076462e-07,
"loss": 78.4013,
"step": 6555
},
{
"epoch": 0.7863585963019569,
"grad_norm": 106.6875,
"learning_rate": 2.3737844678300252e-07,
"loss": 77.4872,
"step": 6560
},
{
"epoch": 0.7869579549881627,
"grad_norm": 105.0,
"learning_rate": 2.3671240175835885e-07,
"loss": 77.9336,
"step": 6565
},
{
"epoch": 0.7875573136743684,
"grad_norm": 108.75,
"learning_rate": 2.360463567337152e-07,
"loss": 77.6178,
"step": 6570
},
{
"epoch": 0.7881566723605742,
"grad_norm": 106.9375,
"learning_rate": 2.3538031170907153e-07,
"loss": 77.6397,
"step": 6575
},
{
"epoch": 0.7887560310467799,
"grad_norm": 102.875,
"learning_rate": 2.3471426668442788e-07,
"loss": 77.6935,
"step": 6580
},
{
"epoch": 0.7893553897329857,
"grad_norm": 102.5625,
"learning_rate": 2.3404822165978418e-07,
"loss": 78.3942,
"step": 6585
},
{
"epoch": 0.7899547484191914,
"grad_norm": 108.4375,
"learning_rate": 2.3338217663514053e-07,
"loss": 77.9903,
"step": 6590
},
{
"epoch": 0.7905541071053972,
"grad_norm": 105.0625,
"learning_rate": 2.3271613161049686e-07,
"loss": 76.4352,
"step": 6595
},
{
"epoch": 0.791153465791603,
"grad_norm": 104.5625,
"learning_rate": 2.320500865858532e-07,
"loss": 77.667,
"step": 6600
},
{
"epoch": 0.7917528244778087,
"grad_norm": 107.4375,
"learning_rate": 2.3138404156120953e-07,
"loss": 77.5867,
"step": 6605
},
{
"epoch": 0.7923521831640145,
"grad_norm": 105.0,
"learning_rate": 2.3071799653656589e-07,
"loss": 77.8278,
"step": 6610
},
{
"epoch": 0.7929515418502202,
"grad_norm": 104.625,
"learning_rate": 2.3005195151192218e-07,
"loss": 77.053,
"step": 6615
},
{
"epoch": 0.7935509005364261,
"grad_norm": 105.9375,
"learning_rate": 2.2938590648727854e-07,
"loss": 78.3603,
"step": 6620
},
{
"epoch": 0.7941502592226318,
"grad_norm": 105.6875,
"learning_rate": 2.2871986146263486e-07,
"loss": 76.1169,
"step": 6625
},
{
"epoch": 0.7947496179088376,
"grad_norm": 107.125,
"learning_rate": 2.2805381643799121e-07,
"loss": 77.0539,
"step": 6630
},
{
"epoch": 0.7953489765950433,
"grad_norm": 105.6875,
"learning_rate": 2.2738777141334754e-07,
"loss": 77.8971,
"step": 6635
},
{
"epoch": 0.7959483352812491,
"grad_norm": 102.8125,
"learning_rate": 2.2672172638870387e-07,
"loss": 76.208,
"step": 6640
},
{
"epoch": 0.7965476939674548,
"grad_norm": 107.1875,
"learning_rate": 2.260556813640602e-07,
"loss": 77.1146,
"step": 6645
},
{
"epoch": 0.7971470526536606,
"grad_norm": 106.125,
"learning_rate": 2.2538963633941654e-07,
"loss": 77.5436,
"step": 6650
},
{
"epoch": 0.7977464113398663,
"grad_norm": 104.75,
"learning_rate": 2.2472359131477287e-07,
"loss": 77.1789,
"step": 6655
},
{
"epoch": 0.7983457700260721,
"grad_norm": 107.25,
"learning_rate": 2.2405754629012922e-07,
"loss": 76.5571,
"step": 6660
},
{
"epoch": 0.7989451287122779,
"grad_norm": 107.75,
"learning_rate": 2.2339150126548552e-07,
"loss": 77.4697,
"step": 6665
},
{
"epoch": 0.7995444873984836,
"grad_norm": 103.9375,
"learning_rate": 2.2272545624084187e-07,
"loss": 78.0711,
"step": 6670
},
{
"epoch": 0.8001438460846894,
"grad_norm": 107.25,
"learning_rate": 2.220594112161982e-07,
"loss": 77.3697,
"step": 6675
},
{
"epoch": 0.8007432047708951,
"grad_norm": 104.1875,
"learning_rate": 2.2139336619155455e-07,
"loss": 77.4097,
"step": 6680
},
{
"epoch": 0.8013425634571009,
"grad_norm": 106.0,
"learning_rate": 2.2072732116691087e-07,
"loss": 77.1239,
"step": 6685
},
{
"epoch": 0.8019419221433066,
"grad_norm": 107.0625,
"learning_rate": 2.2006127614226723e-07,
"loss": 77.2553,
"step": 6690
},
{
"epoch": 0.8025412808295124,
"grad_norm": 107.4375,
"learning_rate": 2.1939523111762353e-07,
"loss": 78.1408,
"step": 6695
},
{
"epoch": 0.8031406395157182,
"grad_norm": 109.5,
"learning_rate": 2.1872918609297988e-07,
"loss": 78.2807,
"step": 6700
},
{
"epoch": 0.803739998201924,
"grad_norm": 106.75,
"learning_rate": 2.180631410683362e-07,
"loss": 76.696,
"step": 6705
},
{
"epoch": 0.8043393568881297,
"grad_norm": 108.875,
"learning_rate": 2.1739709604369256e-07,
"loss": 77.1577,
"step": 6710
},
{
"epoch": 0.8049387155743355,
"grad_norm": 107.0,
"learning_rate": 2.1673105101904888e-07,
"loss": 77.7407,
"step": 6715
},
{
"epoch": 0.8055380742605412,
"grad_norm": 108.0,
"learning_rate": 2.160650059944052e-07,
"loss": 78.0718,
"step": 6720
},
{
"epoch": 0.806137432946747,
"grad_norm": 106.6875,
"learning_rate": 2.1539896096976153e-07,
"loss": 77.663,
"step": 6725
},
{
"epoch": 0.8067367916329528,
"grad_norm": 108.375,
"learning_rate": 2.1473291594511788e-07,
"loss": 78.4291,
"step": 6730
},
{
"epoch": 0.8073361503191585,
"grad_norm": 109.375,
"learning_rate": 2.140668709204742e-07,
"loss": 77.1584,
"step": 6735
},
{
"epoch": 0.8079355090053643,
"grad_norm": 105.3125,
"learning_rate": 2.1340082589583056e-07,
"loss": 76.6091,
"step": 6740
},
{
"epoch": 0.80853486769157,
"grad_norm": 106.625,
"learning_rate": 2.127347808711869e-07,
"loss": 76.3898,
"step": 6745
},
{
"epoch": 0.8091342263777758,
"grad_norm": 107.3125,
"learning_rate": 2.120687358465432e-07,
"loss": 76.942,
"step": 6750
},
{
"epoch": 0.8097335850639815,
"grad_norm": 106.5625,
"learning_rate": 2.1140269082189954e-07,
"loss": 76.9349,
"step": 6755
},
{
"epoch": 0.8103329437501873,
"grad_norm": 105.375,
"learning_rate": 2.107366457972559e-07,
"loss": 77.6708,
"step": 6760
},
{
"epoch": 0.810932302436393,
"grad_norm": 105.8125,
"learning_rate": 2.1007060077261222e-07,
"loss": 78.7127,
"step": 6765
},
{
"epoch": 0.8115316611225988,
"grad_norm": 105.0,
"learning_rate": 2.0940455574796857e-07,
"loss": 76.8139,
"step": 6770
},
{
"epoch": 0.8121310198088045,
"grad_norm": 106.9375,
"learning_rate": 2.0873851072332487e-07,
"loss": 77.4944,
"step": 6775
},
{
"epoch": 0.8127303784950103,
"grad_norm": 105.9375,
"learning_rate": 2.0807246569868122e-07,
"loss": 77.3446,
"step": 6780
},
{
"epoch": 0.8133297371812162,
"grad_norm": 106.375,
"learning_rate": 2.0740642067403754e-07,
"loss": 76.7164,
"step": 6785
},
{
"epoch": 0.8139290958674219,
"grad_norm": 105.0,
"learning_rate": 2.067403756493939e-07,
"loss": 76.5707,
"step": 6790
},
{
"epoch": 0.8145284545536277,
"grad_norm": 109.625,
"learning_rate": 2.0607433062475022e-07,
"loss": 78.0642,
"step": 6795
},
{
"epoch": 0.8151278132398334,
"grad_norm": 105.5625,
"learning_rate": 2.0540828560010657e-07,
"loss": 76.432,
"step": 6800
},
{
"epoch": 0.8157271719260392,
"grad_norm": 106.4375,
"learning_rate": 2.0474224057546287e-07,
"loss": 77.7618,
"step": 6805
},
{
"epoch": 0.8163265306122449,
"grad_norm": 108.625,
"learning_rate": 2.0407619555081923e-07,
"loss": 77.1692,
"step": 6810
},
{
"epoch": 0.8169258892984507,
"grad_norm": 108.6875,
"learning_rate": 2.0341015052617555e-07,
"loss": 77.461,
"step": 6815
},
{
"epoch": 0.8175252479846564,
"grad_norm": 106.3125,
"learning_rate": 2.027441055015319e-07,
"loss": 77.479,
"step": 6820
},
{
"epoch": 0.8181246066708622,
"grad_norm": 108.25,
"learning_rate": 2.0207806047688823e-07,
"loss": 77.7447,
"step": 6825
},
{
"epoch": 0.8187239653570679,
"grad_norm": 107.25,
"learning_rate": 2.0141201545224455e-07,
"loss": 78.9139,
"step": 6830
},
{
"epoch": 0.8193233240432737,
"grad_norm": 105.25,
"learning_rate": 2.0074597042760088e-07,
"loss": 76.8411,
"step": 6835
},
{
"epoch": 0.8199226827294794,
"grad_norm": 109.1875,
"learning_rate": 2.0007992540295723e-07,
"loss": 77.406,
"step": 6840
},
{
"epoch": 0.8205220414156852,
"grad_norm": 110.4375,
"learning_rate": 1.9941388037831356e-07,
"loss": 77.1655,
"step": 6845
},
{
"epoch": 0.8211214001018909,
"grad_norm": 107.1875,
"learning_rate": 1.987478353536699e-07,
"loss": 76.9701,
"step": 6850
},
{
"epoch": 0.8217207587880967,
"grad_norm": 104.0625,
"learning_rate": 1.980817903290262e-07,
"loss": 76.9663,
"step": 6855
},
{
"epoch": 0.8223201174743024,
"grad_norm": 106.125,
"learning_rate": 1.9741574530438256e-07,
"loss": 77.8956,
"step": 6860
},
{
"epoch": 0.8229194761605082,
"grad_norm": 103.25,
"learning_rate": 1.9674970027973889e-07,
"loss": 77.8959,
"step": 6865
},
{
"epoch": 0.8235188348467141,
"grad_norm": 106.125,
"learning_rate": 1.9608365525509524e-07,
"loss": 76.7267,
"step": 6870
},
{
"epoch": 0.8241181935329198,
"grad_norm": 107.0625,
"learning_rate": 1.9541761023045156e-07,
"loss": 76.7561,
"step": 6875
},
{
"epoch": 0.8247175522191256,
"grad_norm": 107.0625,
"learning_rate": 1.9475156520580792e-07,
"loss": 77.7667,
"step": 6880
},
{
"epoch": 0.8253169109053313,
"grad_norm": 106.3125,
"learning_rate": 1.9408552018116421e-07,
"loss": 76.7068,
"step": 6885
},
{
"epoch": 0.8259162695915371,
"grad_norm": 105.6875,
"learning_rate": 1.9341947515652057e-07,
"loss": 77.1725,
"step": 6890
},
{
"epoch": 0.8265156282777428,
"grad_norm": 107.125,
"learning_rate": 1.927534301318769e-07,
"loss": 76.7159,
"step": 6895
},
{
"epoch": 0.8271149869639486,
"grad_norm": 111.75,
"learning_rate": 1.9208738510723324e-07,
"loss": 76.9128,
"step": 6900
},
{
"epoch": 0.8277143456501543,
"grad_norm": 109.1875,
"learning_rate": 1.9142134008258957e-07,
"loss": 77.1265,
"step": 6905
},
{
"epoch": 0.8283137043363601,
"grad_norm": 105.4375,
"learning_rate": 1.907552950579459e-07,
"loss": 76.9765,
"step": 6910
},
{
"epoch": 0.8289130630225658,
"grad_norm": 106.5625,
"learning_rate": 1.9008925003330225e-07,
"loss": 76.0758,
"step": 6915
},
{
"epoch": 0.8295124217087716,
"grad_norm": 108.625,
"learning_rate": 1.8942320500865857e-07,
"loss": 77.2638,
"step": 6920
},
{
"epoch": 0.8301117803949773,
"grad_norm": 102.5,
"learning_rate": 1.8875715998401493e-07,
"loss": 76.5221,
"step": 6925
},
{
"epoch": 0.8307111390811831,
"grad_norm": 104.5,
"learning_rate": 1.8809111495937125e-07,
"loss": 77.0011,
"step": 6930
},
{
"epoch": 0.8313104977673889,
"grad_norm": 107.25,
"learning_rate": 1.874250699347276e-07,
"loss": 77.0331,
"step": 6935
},
{
"epoch": 0.8319098564535946,
"grad_norm": 106.1875,
"learning_rate": 1.867590249100839e-07,
"loss": 77.2594,
"step": 6940
},
{
"epoch": 0.8325092151398004,
"grad_norm": 106.125,
"learning_rate": 1.8609297988544025e-07,
"loss": 76.9138,
"step": 6945
},
{
"epoch": 0.8331085738260062,
"grad_norm": 104.75,
"learning_rate": 1.8542693486079658e-07,
"loss": 76.312,
"step": 6950
},
{
"epoch": 0.833707932512212,
"grad_norm": 107.6875,
"learning_rate": 1.8476088983615293e-07,
"loss": 75.6801,
"step": 6955
},
{
"epoch": 0.8343072911984177,
"grad_norm": 102.125,
"learning_rate": 1.8409484481150926e-07,
"loss": 78.2077,
"step": 6960
},
{
"epoch": 0.8349066498846235,
"grad_norm": 107.5,
"learning_rate": 1.8342879978686558e-07,
"loss": 77.1983,
"step": 6965
},
{
"epoch": 0.8355060085708292,
"grad_norm": 106.1875,
"learning_rate": 1.827627547622219e-07,
"loss": 78.3859,
"step": 6970
},
{
"epoch": 0.836105367257035,
"grad_norm": 107.3125,
"learning_rate": 1.8209670973757826e-07,
"loss": 76.9793,
"step": 6975
},
{
"epoch": 0.8367047259432407,
"grad_norm": 109.5625,
"learning_rate": 1.8143066471293459e-07,
"loss": 75.6328,
"step": 6980
},
{
"epoch": 0.8373040846294465,
"grad_norm": 108.0625,
"learning_rate": 1.8076461968829094e-07,
"loss": 76.4836,
"step": 6985
},
{
"epoch": 0.8379034433156523,
"grad_norm": 107.0,
"learning_rate": 1.8009857466364726e-07,
"loss": 76.2991,
"step": 6990
},
{
"epoch": 0.838502802001858,
"grad_norm": 107.4375,
"learning_rate": 1.794325296390036e-07,
"loss": 75.9192,
"step": 6995
},
{
"epoch": 0.8391021606880638,
"grad_norm": 108.4375,
"learning_rate": 1.7876648461435991e-07,
"loss": 77.1332,
"step": 7000
},
{
"epoch": 0.8391021606880638,
"eval_loss": 2.4002504348754883,
"eval_runtime": 400.1746,
"eval_samples_per_second": 1123.517,
"eval_steps_per_second": 35.112,
"step": 7000
},
{
"epoch": 0.8397015193742695,
"grad_norm": 105.375,
"learning_rate": 1.7810043958971627e-07,
"loss": 76.3652,
"step": 7005
},
{
"epoch": 0.8403008780604753,
"grad_norm": 107.0,
"learning_rate": 1.774343945650726e-07,
"loss": 76.7703,
"step": 7010
},
{
"epoch": 0.840900236746681,
"grad_norm": 107.0,
"learning_rate": 1.7676834954042894e-07,
"loss": 75.7328,
"step": 7015
},
{
"epoch": 0.8414995954328868,
"grad_norm": 106.3125,
"learning_rate": 1.7610230451578524e-07,
"loss": 76.7758,
"step": 7020
},
{
"epoch": 0.8420989541190925,
"grad_norm": 107.9375,
"learning_rate": 1.754362594911416e-07,
"loss": 77.1049,
"step": 7025
},
{
"epoch": 0.8426983128052983,
"grad_norm": 107.375,
"learning_rate": 1.7477021446649792e-07,
"loss": 77.5039,
"step": 7030
},
{
"epoch": 0.8432976714915041,
"grad_norm": 110.0,
"learning_rate": 1.7410416944185427e-07,
"loss": 77.7867,
"step": 7035
},
{
"epoch": 0.8438970301777099,
"grad_norm": 105.6875,
"learning_rate": 1.734381244172106e-07,
"loss": 76.1607,
"step": 7040
},
{
"epoch": 0.8444963888639156,
"grad_norm": 109.1875,
"learning_rate": 1.7277207939256695e-07,
"loss": 77.016,
"step": 7045
},
{
"epoch": 0.8450957475501214,
"grad_norm": 104.875,
"learning_rate": 1.7210603436792325e-07,
"loss": 75.7198,
"step": 7050
},
{
"epoch": 0.8456951062363272,
"grad_norm": 106.1875,
"learning_rate": 1.714399893432796e-07,
"loss": 75.8812,
"step": 7055
},
{
"epoch": 0.8462944649225329,
"grad_norm": 107.125,
"learning_rate": 1.7077394431863593e-07,
"loss": 75.089,
"step": 7060
},
{
"epoch": 0.8468938236087387,
"grad_norm": 107.5,
"learning_rate": 1.7010789929399228e-07,
"loss": 76.6488,
"step": 7065
},
{
"epoch": 0.8474931822949444,
"grad_norm": 103.3125,
"learning_rate": 1.694418542693486e-07,
"loss": 77.9913,
"step": 7070
},
{
"epoch": 0.8480925409811502,
"grad_norm": 106.5,
"learning_rate": 1.6877580924470493e-07,
"loss": 75.9304,
"step": 7075
},
{
"epoch": 0.8486918996673559,
"grad_norm": 107.5,
"learning_rate": 1.6810976422006126e-07,
"loss": 77.1558,
"step": 7080
},
{
"epoch": 0.8492912583535617,
"grad_norm": 110.4375,
"learning_rate": 1.674437191954176e-07,
"loss": 75.9749,
"step": 7085
},
{
"epoch": 0.8498906170397674,
"grad_norm": 103.75,
"learning_rate": 1.6677767417077393e-07,
"loss": 76.0139,
"step": 7090
},
{
"epoch": 0.8504899757259732,
"grad_norm": 106.9375,
"learning_rate": 1.6611162914613029e-07,
"loss": 76.1673,
"step": 7095
},
{
"epoch": 0.8510893344121789,
"grad_norm": 106.9375,
"learning_rate": 1.6544558412148658e-07,
"loss": 76.9603,
"step": 7100
},
{
"epoch": 0.8516886930983847,
"grad_norm": 105.125,
"learning_rate": 1.6477953909684294e-07,
"loss": 75.6582,
"step": 7105
},
{
"epoch": 0.8522880517845904,
"grad_norm": 104.0,
"learning_rate": 1.6411349407219926e-07,
"loss": 76.3708,
"step": 7110
},
{
"epoch": 0.8528874104707963,
"grad_norm": 107.0,
"learning_rate": 1.6344744904755561e-07,
"loss": 76.4872,
"step": 7115
},
{
"epoch": 0.8534867691570021,
"grad_norm": 108.875,
"learning_rate": 1.6278140402291194e-07,
"loss": 76.6676,
"step": 7120
},
{
"epoch": 0.8540861278432078,
"grad_norm": 105.5,
"learning_rate": 1.621153589982683e-07,
"loss": 76.4368,
"step": 7125
},
{
"epoch": 0.8546854865294136,
"grad_norm": 107.25,
"learning_rate": 1.614493139736246e-07,
"loss": 77.9496,
"step": 7130
},
{
"epoch": 0.8552848452156193,
"grad_norm": 107.0,
"learning_rate": 1.6078326894898094e-07,
"loss": 76.4692,
"step": 7135
},
{
"epoch": 0.8558842039018251,
"grad_norm": 106.8125,
"learning_rate": 1.6011722392433727e-07,
"loss": 76.6703,
"step": 7140
},
{
"epoch": 0.8564835625880308,
"grad_norm": 106.75,
"learning_rate": 1.5945117889969362e-07,
"loss": 77.6701,
"step": 7145
},
{
"epoch": 0.8570829212742366,
"grad_norm": 106.0625,
"learning_rate": 1.5878513387504995e-07,
"loss": 75.0485,
"step": 7150
},
{
"epoch": 0.8576822799604423,
"grad_norm": 103.0,
"learning_rate": 1.5811908885040627e-07,
"loss": 75.7822,
"step": 7155
},
{
"epoch": 0.8582816386466481,
"grad_norm": 107.0625,
"learning_rate": 1.574530438257626e-07,
"loss": 77.6251,
"step": 7160
},
{
"epoch": 0.8588809973328538,
"grad_norm": 104.3125,
"learning_rate": 1.5678699880111895e-07,
"loss": 75.9858,
"step": 7165
},
{
"epoch": 0.8594803560190596,
"grad_norm": 107.5625,
"learning_rate": 1.5612095377647528e-07,
"loss": 76.3392,
"step": 7170
},
{
"epoch": 0.8600797147052653,
"grad_norm": 108.375,
"learning_rate": 1.5545490875183163e-07,
"loss": 77.0148,
"step": 7175
},
{
"epoch": 0.8606790733914711,
"grad_norm": 107.8125,
"learning_rate": 1.5478886372718795e-07,
"loss": 76.0265,
"step": 7180
},
{
"epoch": 0.8612784320776768,
"grad_norm": 105.625,
"learning_rate": 1.5412281870254428e-07,
"loss": 76.7633,
"step": 7185
},
{
"epoch": 0.8618777907638826,
"grad_norm": 107.75,
"learning_rate": 1.534567736779006e-07,
"loss": 76.8605,
"step": 7190
},
{
"epoch": 0.8624771494500884,
"grad_norm": 106.875,
"learning_rate": 1.5279072865325696e-07,
"loss": 77.3806,
"step": 7195
},
{
"epoch": 0.8630765081362942,
"grad_norm": 105.5,
"learning_rate": 1.5212468362861328e-07,
"loss": 75.9591,
"step": 7200
},
{
"epoch": 0.8636758668225,
"grad_norm": 106.875,
"learning_rate": 1.5145863860396963e-07,
"loss": 76.9259,
"step": 7205
},
{
"epoch": 0.8642752255087057,
"grad_norm": 110.0,
"learning_rate": 1.5079259357932593e-07,
"loss": 77.0442,
"step": 7210
},
{
"epoch": 0.8648745841949115,
"grad_norm": 105.625,
"learning_rate": 1.5012654855468228e-07,
"loss": 76.0748,
"step": 7215
},
{
"epoch": 0.8654739428811172,
"grad_norm": 105.5,
"learning_rate": 1.494605035300386e-07,
"loss": 75.3836,
"step": 7220
},
{
"epoch": 0.866073301567323,
"grad_norm": 105.375,
"learning_rate": 1.4879445850539496e-07,
"loss": 75.3334,
"step": 7225
},
{
"epoch": 0.8666726602535287,
"grad_norm": 109.375,
"learning_rate": 1.481284134807513e-07,
"loss": 76.1473,
"step": 7230
},
{
"epoch": 0.8672720189397345,
"grad_norm": 108.1875,
"learning_rate": 1.4746236845610764e-07,
"loss": 76.3155,
"step": 7235
},
{
"epoch": 0.8678713776259402,
"grad_norm": 110.5,
"learning_rate": 1.4679632343146394e-07,
"loss": 76.8435,
"step": 7240
},
{
"epoch": 0.868470736312146,
"grad_norm": 104.75,
"learning_rate": 1.461302784068203e-07,
"loss": 77.2809,
"step": 7245
},
{
"epoch": 0.8690700949983517,
"grad_norm": 106.9375,
"learning_rate": 1.4546423338217662e-07,
"loss": 77.4855,
"step": 7250
},
{
"epoch": 0.8696694536845575,
"grad_norm": 106.375,
"learning_rate": 1.4479818835753297e-07,
"loss": 76.6354,
"step": 7255
},
{
"epoch": 0.8702688123707633,
"grad_norm": 104.3125,
"learning_rate": 1.441321433328893e-07,
"loss": 75.2417,
"step": 7260
},
{
"epoch": 0.870868171056969,
"grad_norm": 107.8125,
"learning_rate": 1.4346609830824562e-07,
"loss": 77.3617,
"step": 7265
},
{
"epoch": 0.8714675297431748,
"grad_norm": 107.3125,
"learning_rate": 1.4280005328360197e-07,
"loss": 75.9692,
"step": 7270
},
{
"epoch": 0.8720668884293805,
"grad_norm": 106.3125,
"learning_rate": 1.421340082589583e-07,
"loss": 77.6722,
"step": 7275
},
{
"epoch": 0.8726662471155863,
"grad_norm": 108.375,
"learning_rate": 1.4146796323431465e-07,
"loss": 77.6011,
"step": 7280
},
{
"epoch": 0.8732656058017921,
"grad_norm": 105.5,
"learning_rate": 1.4080191820967098e-07,
"loss": 76.7692,
"step": 7285
},
{
"epoch": 0.8738649644879979,
"grad_norm": 105.125,
"learning_rate": 1.4013587318502733e-07,
"loss": 77.2192,
"step": 7290
},
{
"epoch": 0.8744643231742036,
"grad_norm": 109.125,
"learning_rate": 1.3946982816038363e-07,
"loss": 76.9221,
"step": 7295
},
{
"epoch": 0.8750636818604094,
"grad_norm": 107.9375,
"learning_rate": 1.3880378313573998e-07,
"loss": 76.3955,
"step": 7300
},
{
"epoch": 0.8756630405466151,
"grad_norm": 107.0,
"learning_rate": 1.381377381110963e-07,
"loss": 75.523,
"step": 7305
},
{
"epoch": 0.8762623992328209,
"grad_norm": 107.625,
"learning_rate": 1.3747169308645266e-07,
"loss": 77.1425,
"step": 7310
},
{
"epoch": 0.8768617579190267,
"grad_norm": 109.875,
"learning_rate": 1.3680564806180898e-07,
"loss": 76.5368,
"step": 7315
},
{
"epoch": 0.8774611166052324,
"grad_norm": 107.25,
"learning_rate": 1.361396030371653e-07,
"loss": 76.536,
"step": 7320
},
{
"epoch": 0.8780604752914382,
"grad_norm": 108.8125,
"learning_rate": 1.3547355801252163e-07,
"loss": 76.2158,
"step": 7325
},
{
"epoch": 0.8786598339776439,
"grad_norm": 108.625,
"learning_rate": 1.3480751298787798e-07,
"loss": 76.2001,
"step": 7330
},
{
"epoch": 0.8792591926638497,
"grad_norm": 105.6875,
"learning_rate": 1.341414679632343e-07,
"loss": 74.994,
"step": 7335
},
{
"epoch": 0.8798585513500554,
"grad_norm": 105.4375,
"learning_rate": 1.3347542293859066e-07,
"loss": 76.1967,
"step": 7340
},
{
"epoch": 0.8804579100362612,
"grad_norm": 106.0625,
"learning_rate": 1.3280937791394696e-07,
"loss": 76.307,
"step": 7345
},
{
"epoch": 0.8810572687224669,
"grad_norm": 108.125,
"learning_rate": 1.3214333288930331e-07,
"loss": 76.0177,
"step": 7350
},
{
"epoch": 0.8816566274086727,
"grad_norm": 106.4375,
"learning_rate": 1.3147728786465964e-07,
"loss": 77.1298,
"step": 7355
},
{
"epoch": 0.8822559860948784,
"grad_norm": 107.3125,
"learning_rate": 1.30811242840016e-07,
"loss": 74.9574,
"step": 7360
},
{
"epoch": 0.8828553447810843,
"grad_norm": 109.0,
"learning_rate": 1.3014519781537232e-07,
"loss": 77.0243,
"step": 7365
},
{
"epoch": 0.88345470346729,
"grad_norm": 107.5625,
"learning_rate": 1.2947915279072867e-07,
"loss": 76.6947,
"step": 7370
},
{
"epoch": 0.8840540621534958,
"grad_norm": 109.6875,
"learning_rate": 1.2881310776608497e-07,
"loss": 76.0506,
"step": 7375
},
{
"epoch": 0.8846534208397016,
"grad_norm": 111.5,
"learning_rate": 1.2814706274144132e-07,
"loss": 75.8644,
"step": 7380
},
{
"epoch": 0.8852527795259073,
"grad_norm": 107.5625,
"learning_rate": 1.2748101771679765e-07,
"loss": 76.0396,
"step": 7385
},
{
"epoch": 0.8858521382121131,
"grad_norm": 108.5,
"learning_rate": 1.26814972692154e-07,
"loss": 75.8297,
"step": 7390
},
{
"epoch": 0.8864514968983188,
"grad_norm": 104.625,
"learning_rate": 1.2614892766751032e-07,
"loss": 76.7324,
"step": 7395
},
{
"epoch": 0.8870508555845246,
"grad_norm": 106.625,
"learning_rate": 1.2548288264286665e-07,
"loss": 75.7794,
"step": 7400
},
{
"epoch": 0.8876502142707303,
"grad_norm": 107.5,
"learning_rate": 1.2481683761822297e-07,
"loss": 75.7284,
"step": 7405
},
{
"epoch": 0.8882495729569361,
"grad_norm": 106.375,
"learning_rate": 1.2415079259357933e-07,
"loss": 76.7833,
"step": 7410
},
{
"epoch": 0.8888489316431418,
"grad_norm": 110.5,
"learning_rate": 1.2348474756893565e-07,
"loss": 75.3238,
"step": 7415
},
{
"epoch": 0.8894482903293476,
"grad_norm": 107.5,
"learning_rate": 1.2281870254429198e-07,
"loss": 74.8932,
"step": 7420
},
{
"epoch": 0.8900476490155533,
"grad_norm": 110.4375,
"learning_rate": 1.2215265751964833e-07,
"loss": 76.3159,
"step": 7425
},
{
"epoch": 0.8906470077017591,
"grad_norm": 109.125,
"learning_rate": 1.2148661249500465e-07,
"loss": 76.1404,
"step": 7430
},
{
"epoch": 0.8912463663879648,
"grad_norm": 108.875,
"learning_rate": 1.2082056747036098e-07,
"loss": 76.7653,
"step": 7435
},
{
"epoch": 0.8918457250741706,
"grad_norm": 107.625,
"learning_rate": 1.201545224457173e-07,
"loss": 77.1986,
"step": 7440
},
{
"epoch": 0.8924450837603763,
"grad_norm": 107.875,
"learning_rate": 1.1948847742107366e-07,
"loss": 76.5664,
"step": 7445
},
{
"epoch": 0.8930444424465822,
"grad_norm": 104.5625,
"learning_rate": 1.1882243239643e-07,
"loss": 76.0746,
"step": 7450
},
{
"epoch": 0.893643801132788,
"grad_norm": 105.75,
"learning_rate": 1.1815638737178634e-07,
"loss": 75.1047,
"step": 7455
},
{
"epoch": 0.8942431598189937,
"grad_norm": 105.75,
"learning_rate": 1.1749034234714267e-07,
"loss": 74.8203,
"step": 7460
},
{
"epoch": 0.8948425185051995,
"grad_norm": 105.5,
"learning_rate": 1.16824297322499e-07,
"loss": 76.4513,
"step": 7465
},
{
"epoch": 0.8954418771914052,
"grad_norm": 108.875,
"learning_rate": 1.1615825229785534e-07,
"loss": 75.8262,
"step": 7470
},
{
"epoch": 0.896041235877611,
"grad_norm": 107.6875,
"learning_rate": 1.1549220727321166e-07,
"loss": 75.3952,
"step": 7475
},
{
"epoch": 0.8966405945638167,
"grad_norm": 106.0,
"learning_rate": 1.14826162248568e-07,
"loss": 75.8961,
"step": 7480
},
{
"epoch": 0.8972399532500225,
"grad_norm": 111.1875,
"learning_rate": 1.1416011722392434e-07,
"loss": 76.1357,
"step": 7485
},
{
"epoch": 0.8978393119362282,
"grad_norm": 107.8125,
"learning_rate": 1.1349407219928067e-07,
"loss": 76.3188,
"step": 7490
},
{
"epoch": 0.898438670622434,
"grad_norm": 105.5,
"learning_rate": 1.12828027174637e-07,
"loss": 76.0742,
"step": 7495
},
{
"epoch": 0.8990380293086397,
"grad_norm": 109.625,
"learning_rate": 1.1216198214999335e-07,
"loss": 75.6454,
"step": 7500
},
{
"epoch": 0.8990380293086397,
"eval_loss": 2.3795347213745117,
"eval_runtime": 400.1917,
"eval_samples_per_second": 1123.469,
"eval_steps_per_second": 35.111,
"step": 7500
},
{
"epoch": 0.8996373879948455,
"grad_norm": 106.25,
"learning_rate": 1.1149593712534967e-07,
"loss": 75.9472,
"step": 7505
},
{
"epoch": 0.9002367466810512,
"grad_norm": 109.3125,
"learning_rate": 1.1082989210070601e-07,
"loss": 76.2319,
"step": 7510
},
{
"epoch": 0.900836105367257,
"grad_norm": 108.1875,
"learning_rate": 1.1016384707606233e-07,
"loss": 76.4846,
"step": 7515
},
{
"epoch": 0.9014354640534628,
"grad_norm": 108.75,
"learning_rate": 1.0949780205141867e-07,
"loss": 77.3349,
"step": 7520
},
{
"epoch": 0.9020348227396685,
"grad_norm": 104.9375,
"learning_rate": 1.0883175702677501e-07,
"loss": 74.6866,
"step": 7525
},
{
"epoch": 0.9026341814258743,
"grad_norm": 110.25,
"learning_rate": 1.0816571200213134e-07,
"loss": 75.6139,
"step": 7530
},
{
"epoch": 0.9032335401120801,
"grad_norm": 108.4375,
"learning_rate": 1.0749966697748768e-07,
"loss": 76.1253,
"step": 7535
},
{
"epoch": 0.9038328987982859,
"grad_norm": 107.5,
"learning_rate": 1.0683362195284402e-07,
"loss": 75.2204,
"step": 7540
},
{
"epoch": 0.9044322574844916,
"grad_norm": 105.5,
"learning_rate": 1.0616757692820034e-07,
"loss": 75.8417,
"step": 7545
},
{
"epoch": 0.9050316161706974,
"grad_norm": 105.1875,
"learning_rate": 1.0550153190355668e-07,
"loss": 75.7311,
"step": 7550
},
{
"epoch": 0.9056309748569031,
"grad_norm": 101.625,
"learning_rate": 1.0483548687891302e-07,
"loss": 74.4821,
"step": 7555
},
{
"epoch": 0.9062303335431089,
"grad_norm": 106.4375,
"learning_rate": 1.0416944185426934e-07,
"loss": 75.5503,
"step": 7560
},
{
"epoch": 0.9068296922293146,
"grad_norm": 104.5,
"learning_rate": 1.0350339682962568e-07,
"loss": 76.8925,
"step": 7565
},
{
"epoch": 0.9074290509155204,
"grad_norm": 111.125,
"learning_rate": 1.0283735180498201e-07,
"loss": 76.4836,
"step": 7570
},
{
"epoch": 0.9080284096017261,
"grad_norm": 104.75,
"learning_rate": 1.0217130678033835e-07,
"loss": 77.4133,
"step": 7575
},
{
"epoch": 0.9086277682879319,
"grad_norm": 110.5,
"learning_rate": 1.0150526175569469e-07,
"loss": 76.1813,
"step": 7580
},
{
"epoch": 0.9092271269741377,
"grad_norm": 106.8125,
"learning_rate": 1.0083921673105101e-07,
"loss": 75.3809,
"step": 7585
},
{
"epoch": 0.9098264856603434,
"grad_norm": 104.5625,
"learning_rate": 1.0017317170640735e-07,
"loss": 76.3577,
"step": 7590
},
{
"epoch": 0.9104258443465492,
"grad_norm": 106.8125,
"learning_rate": 9.950712668176369e-08,
"loss": 75.8196,
"step": 7595
},
{
"epoch": 0.9110252030327549,
"grad_norm": 107.0625,
"learning_rate": 9.884108165712002e-08,
"loss": 76.5003,
"step": 7600
},
{
"epoch": 0.9116245617189607,
"grad_norm": 107.125,
"learning_rate": 9.817503663247635e-08,
"loss": 75.7129,
"step": 7605
},
{
"epoch": 0.9122239204051664,
"grad_norm": 106.9375,
"learning_rate": 9.750899160783268e-08,
"loss": 77.2414,
"step": 7610
},
{
"epoch": 0.9128232790913723,
"grad_norm": 105.3125,
"learning_rate": 9.684294658318902e-08,
"loss": 75.8869,
"step": 7615
},
{
"epoch": 0.913422637777578,
"grad_norm": 107.1875,
"learning_rate": 9.617690155854536e-08,
"loss": 74.8393,
"step": 7620
},
{
"epoch": 0.9140219964637838,
"grad_norm": 107.25,
"learning_rate": 9.551085653390168e-08,
"loss": 75.3144,
"step": 7625
},
{
"epoch": 0.9146213551499895,
"grad_norm": 106.5625,
"learning_rate": 9.484481150925802e-08,
"loss": 75.7677,
"step": 7630
},
{
"epoch": 0.9152207138361953,
"grad_norm": 106.625,
"learning_rate": 9.417876648461436e-08,
"loss": 75.8197,
"step": 7635
},
{
"epoch": 0.915820072522401,
"grad_norm": 108.875,
"learning_rate": 9.351272145997069e-08,
"loss": 75.7213,
"step": 7640
},
{
"epoch": 0.9164194312086068,
"grad_norm": 105.125,
"learning_rate": 9.284667643532702e-08,
"loss": 74.8678,
"step": 7645
},
{
"epoch": 0.9170187898948126,
"grad_norm": 109.5625,
"learning_rate": 9.218063141068336e-08,
"loss": 77.0629,
"step": 7650
},
{
"epoch": 0.9176181485810183,
"grad_norm": 106.6875,
"learning_rate": 9.151458638603969e-08,
"loss": 76.5193,
"step": 7655
},
{
"epoch": 0.9182175072672241,
"grad_norm": 109.1875,
"learning_rate": 9.084854136139603e-08,
"loss": 76.135,
"step": 7660
},
{
"epoch": 0.9188168659534298,
"grad_norm": 108.1875,
"learning_rate": 9.018249633675235e-08,
"loss": 75.7558,
"step": 7665
},
{
"epoch": 0.9194162246396356,
"grad_norm": 106.0625,
"learning_rate": 8.951645131210869e-08,
"loss": 76.6425,
"step": 7670
},
{
"epoch": 0.9200155833258413,
"grad_norm": 110.625,
"learning_rate": 8.885040628746503e-08,
"loss": 75.9904,
"step": 7675
},
{
"epoch": 0.9206149420120471,
"grad_norm": 106.8125,
"learning_rate": 8.818436126282136e-08,
"loss": 76.6544,
"step": 7680
},
{
"epoch": 0.9212143006982528,
"grad_norm": 105.375,
"learning_rate": 8.75183162381777e-08,
"loss": 76.6804,
"step": 7685
},
{
"epoch": 0.9218136593844586,
"grad_norm": 107.75,
"learning_rate": 8.685227121353403e-08,
"loss": 75.9191,
"step": 7690
},
{
"epoch": 0.9224130180706643,
"grad_norm": 105.9375,
"learning_rate": 8.618622618889036e-08,
"loss": 76.0447,
"step": 7695
},
{
"epoch": 0.9230123767568702,
"grad_norm": 107.75,
"learning_rate": 8.55201811642467e-08,
"loss": 75.6943,
"step": 7700
},
{
"epoch": 0.923611735443076,
"grad_norm": 108.4375,
"learning_rate": 8.485413613960302e-08,
"loss": 76.1021,
"step": 7705
},
{
"epoch": 0.9242110941292817,
"grad_norm": 106.3125,
"learning_rate": 8.418809111495936e-08,
"loss": 75.5128,
"step": 7710
},
{
"epoch": 0.9248104528154875,
"grad_norm": 107.6875,
"learning_rate": 8.35220460903157e-08,
"loss": 75.4414,
"step": 7715
},
{
"epoch": 0.9254098115016932,
"grad_norm": 107.9375,
"learning_rate": 8.285600106567203e-08,
"loss": 76.4457,
"step": 7720
},
{
"epoch": 0.926009170187899,
"grad_norm": 110.375,
"learning_rate": 8.218995604102837e-08,
"loss": 75.3731,
"step": 7725
},
{
"epoch": 0.9266085288741047,
"grad_norm": 106.25,
"learning_rate": 8.15239110163847e-08,
"loss": 75.2117,
"step": 7730
},
{
"epoch": 0.9272078875603105,
"grad_norm": 107.8125,
"learning_rate": 8.085786599174103e-08,
"loss": 76.5782,
"step": 7735
},
{
"epoch": 0.9278072462465162,
"grad_norm": 107.75,
"learning_rate": 8.019182096709737e-08,
"loss": 76.7815,
"step": 7740
},
{
"epoch": 0.928406604932722,
"grad_norm": 109.4375,
"learning_rate": 7.952577594245371e-08,
"loss": 76.2764,
"step": 7745
},
{
"epoch": 0.9290059636189277,
"grad_norm": 108.0,
"learning_rate": 7.885973091781003e-08,
"loss": 75.9319,
"step": 7750
},
{
"epoch": 0.9296053223051335,
"grad_norm": 107.5625,
"learning_rate": 7.819368589316637e-08,
"loss": 75.9743,
"step": 7755
},
{
"epoch": 0.9302046809913392,
"grad_norm": 109.5,
"learning_rate": 7.75276408685227e-08,
"loss": 74.1208,
"step": 7760
},
{
"epoch": 0.930804039677545,
"grad_norm": 108.75,
"learning_rate": 7.686159584387904e-08,
"loss": 75.7255,
"step": 7765
},
{
"epoch": 0.9314033983637507,
"grad_norm": 104.6875,
"learning_rate": 7.619555081923538e-08,
"loss": 75.9585,
"step": 7770
},
{
"epoch": 0.9320027570499565,
"grad_norm": 103.75,
"learning_rate": 7.55295057945917e-08,
"loss": 75.6327,
"step": 7775
},
{
"epoch": 0.9326021157361623,
"grad_norm": 105.5625,
"learning_rate": 7.486346076994804e-08,
"loss": 76.4223,
"step": 7780
},
{
"epoch": 0.9332014744223681,
"grad_norm": 107.1875,
"learning_rate": 7.419741574530438e-08,
"loss": 74.8369,
"step": 7785
},
{
"epoch": 0.9338008331085739,
"grad_norm": 108.3125,
"learning_rate": 7.35313707206607e-08,
"loss": 75.6269,
"step": 7790
},
{
"epoch": 0.9344001917947796,
"grad_norm": 105.8125,
"learning_rate": 7.286532569601704e-08,
"loss": 76.3296,
"step": 7795
},
{
"epoch": 0.9349995504809854,
"grad_norm": 105.875,
"learning_rate": 7.219928067137337e-08,
"loss": 73.8214,
"step": 7800
},
{
"epoch": 0.9355989091671911,
"grad_norm": 104.0625,
"learning_rate": 7.153323564672971e-08,
"loss": 74.8871,
"step": 7805
},
{
"epoch": 0.9361982678533969,
"grad_norm": 107.25,
"learning_rate": 7.086719062208606e-08,
"loss": 76.116,
"step": 7810
},
{
"epoch": 0.9367976265396026,
"grad_norm": 105.0625,
"learning_rate": 7.020114559744239e-08,
"loss": 75.9282,
"step": 7815
},
{
"epoch": 0.9373969852258084,
"grad_norm": 108.8125,
"learning_rate": 6.953510057279872e-08,
"loss": 75.725,
"step": 7820
},
{
"epoch": 0.9379963439120141,
"grad_norm": 108.125,
"learning_rate": 6.886905554815506e-08,
"loss": 75.2024,
"step": 7825
},
{
"epoch": 0.9385957025982199,
"grad_norm": 105.0625,
"learning_rate": 6.820301052351139e-08,
"loss": 74.4366,
"step": 7830
},
{
"epoch": 0.9391950612844256,
"grad_norm": 105.3125,
"learning_rate": 6.753696549886773e-08,
"loss": 75.688,
"step": 7835
},
{
"epoch": 0.9397944199706314,
"grad_norm": 105.9375,
"learning_rate": 6.687092047422407e-08,
"loss": 75.9839,
"step": 7840
},
{
"epoch": 0.9403937786568372,
"grad_norm": 106.875,
"learning_rate": 6.620487544958039e-08,
"loss": 75.7041,
"step": 7845
},
{
"epoch": 0.9409931373430429,
"grad_norm": 107.4375,
"learning_rate": 6.553883042493673e-08,
"loss": 75.4809,
"step": 7850
},
{
"epoch": 0.9415924960292487,
"grad_norm": 106.1875,
"learning_rate": 6.487278540029306e-08,
"loss": 75.421,
"step": 7855
},
{
"epoch": 0.9421918547154544,
"grad_norm": 108.4375,
"learning_rate": 6.42067403756494e-08,
"loss": 76.3878,
"step": 7860
},
{
"epoch": 0.9427912134016603,
"grad_norm": 102.6875,
"learning_rate": 6.354069535100573e-08,
"loss": 75.1176,
"step": 7865
},
{
"epoch": 0.943390572087866,
"grad_norm": 107.6875,
"learning_rate": 6.287465032636206e-08,
"loss": 75.2976,
"step": 7870
},
{
"epoch": 0.9439899307740718,
"grad_norm": 106.375,
"learning_rate": 6.220860530171838e-08,
"loss": 75.4411,
"step": 7875
},
{
"epoch": 0.9445892894602775,
"grad_norm": 110.125,
"learning_rate": 6.154256027707472e-08,
"loss": 75.5738,
"step": 7880
},
{
"epoch": 0.9451886481464833,
"grad_norm": 104.75,
"learning_rate": 6.087651525243106e-08,
"loss": 75.0076,
"step": 7885
},
{
"epoch": 0.945788006832689,
"grad_norm": 106.3125,
"learning_rate": 6.021047022778739e-08,
"loss": 75.3758,
"step": 7890
},
{
"epoch": 0.9463873655188948,
"grad_norm": 103.875,
"learning_rate": 5.9544425203143727e-08,
"loss": 75.9719,
"step": 7895
},
{
"epoch": 0.9469867242051005,
"grad_norm": 106.75,
"learning_rate": 5.8878380178500065e-08,
"loss": 75.7151,
"step": 7900
},
{
"epoch": 0.9475860828913063,
"grad_norm": 106.4375,
"learning_rate": 5.8212335153856404e-08,
"loss": 74.1712,
"step": 7905
},
{
"epoch": 0.9481854415775121,
"grad_norm": 107.0,
"learning_rate": 5.7546290129212736e-08,
"loss": 76.8725,
"step": 7910
},
{
"epoch": 0.9487848002637178,
"grad_norm": 105.125,
"learning_rate": 5.688024510456907e-08,
"loss": 76.729,
"step": 7915
},
{
"epoch": 0.9493841589499236,
"grad_norm": 110.8125,
"learning_rate": 5.62142000799254e-08,
"loss": 75.4301,
"step": 7920
},
{
"epoch": 0.9499835176361293,
"grad_norm": 109.9375,
"learning_rate": 5.554815505528174e-08,
"loss": 75.4277,
"step": 7925
},
{
"epoch": 0.9505828763223351,
"grad_norm": 109.0625,
"learning_rate": 5.488211003063807e-08,
"loss": 74.834,
"step": 7930
},
{
"epoch": 0.9511822350085408,
"grad_norm": 104.0,
"learning_rate": 5.4216065005994404e-08,
"loss": 75.1636,
"step": 7935
},
{
"epoch": 0.9517815936947466,
"grad_norm": 108.625,
"learning_rate": 5.3550019981350736e-08,
"loss": 76.5781,
"step": 7940
},
{
"epoch": 0.9523809523809523,
"grad_norm": 108.125,
"learning_rate": 5.2883974956707075e-08,
"loss": 75.4816,
"step": 7945
},
{
"epoch": 0.9529803110671582,
"grad_norm": 104.625,
"learning_rate": 5.221792993206341e-08,
"loss": 75.2837,
"step": 7950
},
{
"epoch": 0.953579669753364,
"grad_norm": 105.5625,
"learning_rate": 5.155188490741974e-08,
"loss": 75.0366,
"step": 7955
},
{
"epoch": 0.9541790284395697,
"grad_norm": 110.5,
"learning_rate": 5.088583988277607e-08,
"loss": 74.8486,
"step": 7960
},
{
"epoch": 0.9547783871257755,
"grad_norm": 106.125,
"learning_rate": 5.021979485813241e-08,
"loss": 74.3942,
"step": 7965
},
{
"epoch": 0.9553777458119812,
"grad_norm": 108.1875,
"learning_rate": 4.955374983348874e-08,
"loss": 75.9698,
"step": 7970
},
{
"epoch": 0.955977104498187,
"grad_norm": 105.5,
"learning_rate": 4.8887704808845075e-08,
"loss": 74.0582,
"step": 7975
},
{
"epoch": 0.9565764631843927,
"grad_norm": 108.125,
"learning_rate": 4.822165978420141e-08,
"loss": 75.5124,
"step": 7980
},
{
"epoch": 0.9571758218705985,
"grad_norm": 109.0625,
"learning_rate": 4.7555614759557746e-08,
"loss": 75.5287,
"step": 7985
},
{
"epoch": 0.9577751805568042,
"grad_norm": 105.25,
"learning_rate": 4.688956973491408e-08,
"loss": 74.6588,
"step": 7990
},
{
"epoch": 0.95837453924301,
"grad_norm": 108.25,
"learning_rate": 4.622352471027041e-08,
"loss": 74.4629,
"step": 7995
},
{
"epoch": 0.9589738979292157,
"grad_norm": 107.0625,
"learning_rate": 4.555747968562675e-08,
"loss": 74.7334,
"step": 8000
},
{
"epoch": 0.9589738979292157,
"eval_loss": 2.360534191131592,
"eval_runtime": 408.2462,
"eval_samples_per_second": 1101.304,
"eval_steps_per_second": 34.418,
"step": 8000
},
{
"epoch": 0.9595732566154215,
"grad_norm": 102.625,
"learning_rate": 4.489143466098308e-08,
"loss": 76.2619,
"step": 8005
},
{
"epoch": 0.9601726153016272,
"grad_norm": 111.1875,
"learning_rate": 4.422538963633941e-08,
"loss": 74.1844,
"step": 8010
},
{
"epoch": 0.960771973987833,
"grad_norm": 104.0,
"learning_rate": 4.3559344611695745e-08,
"loss": 76.1778,
"step": 8015
},
{
"epoch": 0.9613713326740387,
"grad_norm": 106.9375,
"learning_rate": 4.2893299587052084e-08,
"loss": 77.1481,
"step": 8020
},
{
"epoch": 0.9619706913602445,
"grad_norm": 105.9375,
"learning_rate": 4.2227254562408416e-08,
"loss": 75.8017,
"step": 8025
},
{
"epoch": 0.9625700500464502,
"grad_norm": 106.0,
"learning_rate": 4.156120953776475e-08,
"loss": 75.1555,
"step": 8030
},
{
"epoch": 0.9631694087326561,
"grad_norm": 108.375,
"learning_rate": 4.089516451312108e-08,
"loss": 75.7826,
"step": 8035
},
{
"epoch": 0.9637687674188619,
"grad_norm": 106.5,
"learning_rate": 4.022911948847742e-08,
"loss": 73.2987,
"step": 8040
},
{
"epoch": 0.9643681261050676,
"grad_norm": 105.0,
"learning_rate": 3.956307446383375e-08,
"loss": 74.986,
"step": 8045
},
{
"epoch": 0.9649674847912734,
"grad_norm": 104.5,
"learning_rate": 3.8897029439190084e-08,
"loss": 75.9831,
"step": 8050
},
{
"epoch": 0.9655668434774791,
"grad_norm": 107.0,
"learning_rate": 3.8230984414546416e-08,
"loss": 75.217,
"step": 8055
},
{
"epoch": 0.9661662021636849,
"grad_norm": 108.625,
"learning_rate": 3.7564939389902755e-08,
"loss": 76.5886,
"step": 8060
},
{
"epoch": 0.9667655608498906,
"grad_norm": 110.6875,
"learning_rate": 3.689889436525909e-08,
"loss": 75.2561,
"step": 8065
},
{
"epoch": 0.9673649195360964,
"grad_norm": 107.0625,
"learning_rate": 3.623284934061542e-08,
"loss": 75.0127,
"step": 8070
},
{
"epoch": 0.9679642782223021,
"grad_norm": 106.8125,
"learning_rate": 3.5566804315971765e-08,
"loss": 76.2585,
"step": 8075
},
{
"epoch": 0.9685636369085079,
"grad_norm": 102.5625,
"learning_rate": 3.49007592913281e-08,
"loss": 75.7222,
"step": 8080
},
{
"epoch": 0.9691629955947136,
"grad_norm": 105.625,
"learning_rate": 3.423471426668443e-08,
"loss": 75.6719,
"step": 8085
},
{
"epoch": 0.9697623542809194,
"grad_norm": 106.1875,
"learning_rate": 3.356866924204076e-08,
"loss": 74.9915,
"step": 8090
},
{
"epoch": 0.9703617129671251,
"grad_norm": 106.4375,
"learning_rate": 3.29026242173971e-08,
"loss": 75.2533,
"step": 8095
},
{
"epoch": 0.9709610716533309,
"grad_norm": 102.875,
"learning_rate": 3.223657919275343e-08,
"loss": 74.9591,
"step": 8100
},
{
"epoch": 0.9715604303395367,
"grad_norm": 106.0,
"learning_rate": 3.1570534168109764e-08,
"loss": 75.4972,
"step": 8105
},
{
"epoch": 0.9721597890257424,
"grad_norm": 110.0,
"learning_rate": 3.0904489143466097e-08,
"loss": 75.5078,
"step": 8110
},
{
"epoch": 0.9727591477119483,
"grad_norm": 109.375,
"learning_rate": 3.023844411882243e-08,
"loss": 75.4631,
"step": 8115
},
{
"epoch": 0.973358506398154,
"grad_norm": 104.6875,
"learning_rate": 2.9572399094178768e-08,
"loss": 73.9664,
"step": 8120
},
{
"epoch": 0.9739578650843598,
"grad_norm": 107.1875,
"learning_rate": 2.89063540695351e-08,
"loss": 74.6196,
"step": 8125
},
{
"epoch": 0.9745572237705655,
"grad_norm": 104.8125,
"learning_rate": 2.8240309044891435e-08,
"loss": 73.8321,
"step": 8130
},
{
"epoch": 0.9751565824567713,
"grad_norm": 106.875,
"learning_rate": 2.7574264020247767e-08,
"loss": 73.9742,
"step": 8135
},
{
"epoch": 0.975755941142977,
"grad_norm": 105.4375,
"learning_rate": 2.6908218995604103e-08,
"loss": 74.6413,
"step": 8140
},
{
"epoch": 0.9763552998291828,
"grad_norm": 112.0625,
"learning_rate": 2.6242173970960435e-08,
"loss": 76.6869,
"step": 8145
},
{
"epoch": 0.9769546585153885,
"grad_norm": 109.125,
"learning_rate": 2.557612894631677e-08,
"loss": 75.1903,
"step": 8150
},
{
"epoch": 0.9775540172015943,
"grad_norm": 106.0625,
"learning_rate": 2.4910083921673103e-08,
"loss": 75.9438,
"step": 8155
},
{
"epoch": 0.9781533758878,
"grad_norm": 105.375,
"learning_rate": 2.4244038897029438e-08,
"loss": 75.6281,
"step": 8160
},
{
"epoch": 0.9787527345740058,
"grad_norm": 105.75,
"learning_rate": 2.357799387238577e-08,
"loss": 75.2257,
"step": 8165
},
{
"epoch": 0.9793520932602116,
"grad_norm": 106.5625,
"learning_rate": 2.2911948847742106e-08,
"loss": 75.081,
"step": 8170
},
{
"epoch": 0.9799514519464173,
"grad_norm": 107.5,
"learning_rate": 2.224590382309844e-08,
"loss": 74.4403,
"step": 8175
},
{
"epoch": 0.9805508106326231,
"grad_norm": 108.625,
"learning_rate": 2.1579858798454774e-08,
"loss": 75.8791,
"step": 8180
},
{
"epoch": 0.9811501693188288,
"grad_norm": 106.375,
"learning_rate": 2.091381377381111e-08,
"loss": 75.0459,
"step": 8185
},
{
"epoch": 0.9817495280050346,
"grad_norm": 103.625,
"learning_rate": 2.024776874916744e-08,
"loss": 74.4976,
"step": 8190
},
{
"epoch": 0.9823488866912403,
"grad_norm": 105.3125,
"learning_rate": 1.9581723724523777e-08,
"loss": 75.4353,
"step": 8195
},
{
"epoch": 0.9829482453774462,
"grad_norm": 107.5625,
"learning_rate": 1.891567869988011e-08,
"loss": 74.7069,
"step": 8200
},
{
"epoch": 0.9835476040636519,
"grad_norm": 109.3125,
"learning_rate": 1.8249633675236445e-08,
"loss": 76.061,
"step": 8205
},
{
"epoch": 0.9841469627498577,
"grad_norm": 108.3125,
"learning_rate": 1.758358865059278e-08,
"loss": 75.0838,
"step": 8210
},
{
"epoch": 0.9847463214360634,
"grad_norm": 107.0,
"learning_rate": 1.6917543625949116e-08,
"loss": 75.832,
"step": 8215
},
{
"epoch": 0.9853456801222692,
"grad_norm": 103.1875,
"learning_rate": 1.6251498601305448e-08,
"loss": 75.316,
"step": 8220
},
{
"epoch": 0.985945038808475,
"grad_norm": 104.9375,
"learning_rate": 1.558545357666178e-08,
"loss": 75.5353,
"step": 8225
},
{
"epoch": 0.9865443974946807,
"grad_norm": 108.6875,
"learning_rate": 1.4919408552018115e-08,
"loss": 75.2492,
"step": 8230
},
{
"epoch": 0.9871437561808865,
"grad_norm": 106.25,
"learning_rate": 1.4253363527374451e-08,
"loss": 74.484,
"step": 8235
},
{
"epoch": 0.9877431148670922,
"grad_norm": 110.6875,
"learning_rate": 1.3587318502730785e-08,
"loss": 75.8064,
"step": 8240
},
{
"epoch": 0.988342473553298,
"grad_norm": 109.5,
"learning_rate": 1.2921273478087119e-08,
"loss": 74.8354,
"step": 8245
},
{
"epoch": 0.9889418322395037,
"grad_norm": 108.0,
"learning_rate": 1.2255228453443452e-08,
"loss": 75.0737,
"step": 8250
},
{
"epoch": 0.9895411909257095,
"grad_norm": 105.8125,
"learning_rate": 1.1589183428799786e-08,
"loss": 75.3057,
"step": 8255
},
{
"epoch": 0.9901405496119152,
"grad_norm": 106.875,
"learning_rate": 1.092313840415612e-08,
"loss": 74.6023,
"step": 8260
},
{
"epoch": 0.990739908298121,
"grad_norm": 108.25,
"learning_rate": 1.0257093379512454e-08,
"loss": 75.8229,
"step": 8265
},
{
"epoch": 0.9913392669843267,
"grad_norm": 105.125,
"learning_rate": 9.591048354868788e-09,
"loss": 75.3764,
"step": 8270
},
{
"epoch": 0.9919386256705325,
"grad_norm": 105.0,
"learning_rate": 8.925003330225123e-09,
"loss": 75.4075,
"step": 8275
},
{
"epoch": 0.9925379843567383,
"grad_norm": 106.5625,
"learning_rate": 8.258958305581457e-09,
"loss": 75.7693,
"step": 8280
},
{
"epoch": 0.9931373430429441,
"grad_norm": 111.6875,
"learning_rate": 7.592913280937791e-09,
"loss": 76.1424,
"step": 8285
},
{
"epoch": 0.9937367017291499,
"grad_norm": 105.625,
"learning_rate": 6.926868256294126e-09,
"loss": 74.6477,
"step": 8290
},
{
"epoch": 0.9943360604153556,
"grad_norm": 106.0,
"learning_rate": 6.2608232316504594e-09,
"loss": 75.2063,
"step": 8295
},
{
"epoch": 0.9949354191015614,
"grad_norm": 104.1875,
"learning_rate": 5.594778207006793e-09,
"loss": 75.8802,
"step": 8300
},
{
"epoch": 0.9955347777877671,
"grad_norm": 108.75,
"learning_rate": 4.928733182363127e-09,
"loss": 74.0212,
"step": 8305
},
{
"epoch": 0.9961341364739729,
"grad_norm": 107.9375,
"learning_rate": 4.262688157719462e-09,
"loss": 75.8792,
"step": 8310
},
{
"epoch": 0.9967334951601786,
"grad_norm": 102.875,
"learning_rate": 3.596643133075796e-09,
"loss": 74.9816,
"step": 8315
},
{
"epoch": 0.9973328538463844,
"grad_norm": 105.3125,
"learning_rate": 2.93059810843213e-09,
"loss": 75.1708,
"step": 8320
},
{
"epoch": 0.9979322125325901,
"grad_norm": 108.8125,
"learning_rate": 2.2645530837884637e-09,
"loss": 75.8946,
"step": 8325
},
{
"epoch": 0.9985315712187959,
"grad_norm": 106.875,
"learning_rate": 1.5985080591447982e-09,
"loss": 75.4009,
"step": 8330
},
{
"epoch": 0.9991309299050016,
"grad_norm": 103.625,
"learning_rate": 9.324630345011322e-10,
"loss": 73.9491,
"step": 8335
},
{
"epoch": 0.9997302885912074,
"grad_norm": 106.25,
"learning_rate": 2.6641800985746636e-10,
"loss": 74.3839,
"step": 8340
}
],
"logging_steps": 5,
"max_steps": 8342,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.614053037573669e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}