{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9855511166646135,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014079301666637335,
"grad_norm": 114.625,
"learning_rate": 2.8089887640449436e-08,
"loss": 85.8853,
"step": 5
},
{
"epoch": 0.002815860333327467,
"grad_norm": 111.6875,
"learning_rate": 5.617977528089887e-08,
"loss": 85.9658,
"step": 10
},
{
"epoch": 0.0042237904999912,
"grad_norm": 112.0625,
"learning_rate": 8.426966292134831e-08,
"loss": 84.4067,
"step": 15
},
{
"epoch": 0.005631720666654934,
"grad_norm": 114.375,
"learning_rate": 1.1235955056179774e-07,
"loss": 85.7776,
"step": 20
},
{
"epoch": 0.007039650833318667,
"grad_norm": 110.25,
"learning_rate": 1.4044943820224718e-07,
"loss": 83.5889,
"step": 25
},
{
"epoch": 0.0084475809999824,
"grad_norm": 112.375,
"learning_rate": 1.6853932584269663e-07,
"loss": 84.3387,
"step": 30
},
{
"epoch": 0.009855511166646135,
"grad_norm": 109.5625,
"learning_rate": 1.9662921348314607e-07,
"loss": 84.4387,
"step": 35
},
{
"epoch": 0.011263441333309868,
"grad_norm": 114.8125,
"learning_rate": 2.2471910112359549e-07,
"loss": 83.7717,
"step": 40
},
{
"epoch": 0.012671371499973601,
"grad_norm": 110.875,
"learning_rate": 2.5280898876404493e-07,
"loss": 83.5306,
"step": 45
},
{
"epoch": 0.014079301666637335,
"grad_norm": 106.3125,
"learning_rate": 2.8089887640449437e-07,
"loss": 83.6832,
"step": 50
},
{
"epoch": 0.015487231833301068,
"grad_norm": 112.25,
"learning_rate": 3.0898876404494376e-07,
"loss": 83.6929,
"step": 55
},
{
"epoch": 0.0168951619999648,
"grad_norm": 111.8125,
"learning_rate": 3.3707865168539325e-07,
"loss": 84.1018,
"step": 60
},
{
"epoch": 0.018303092166628534,
"grad_norm": 109.3125,
"learning_rate": 3.651685393258427e-07,
"loss": 84.1149,
"step": 65
},
{
"epoch": 0.01971102233329227,
"grad_norm": 109.875,
"learning_rate": 3.9325842696629214e-07,
"loss": 82.8841,
"step": 70
},
{
"epoch": 0.021118952499956004,
"grad_norm": 107.9375,
"learning_rate": 4.2134831460674153e-07,
"loss": 84.9085,
"step": 75
},
{
"epoch": 0.022526882666619737,
"grad_norm": 113.875,
"learning_rate": 4.4943820224719097e-07,
"loss": 83.177,
"step": 80
},
{
"epoch": 0.02393481283328347,
"grad_norm": 112.5625,
"learning_rate": 4.775280898876405e-07,
"loss": 84.905,
"step": 85
},
{
"epoch": 0.025342742999947203,
"grad_norm": 111.125,
"learning_rate": 5.056179775280899e-07,
"loss": 83.7053,
"step": 90
},
{
"epoch": 0.026750673166610936,
"grad_norm": 110.3125,
"learning_rate": 5.337078651685392e-07,
"loss": 84.249,
"step": 95
},
{
"epoch": 0.02815860333327467,
"grad_norm": 115.0,
"learning_rate": 5.617977528089887e-07,
"loss": 83.1926,
"step": 100
},
{
"epoch": 0.029566533499938402,
"grad_norm": 113.25,
"learning_rate": 5.898876404494381e-07,
"loss": 84.164,
"step": 105
},
{
"epoch": 0.030974463666602135,
"grad_norm": 112.9375,
"learning_rate": 6.179775280898875e-07,
"loss": 83.7127,
"step": 110
},
{
"epoch": 0.03238239383326587,
"grad_norm": 114.5,
"learning_rate": 6.460674157303371e-07,
"loss": 85.8182,
"step": 115
},
{
"epoch": 0.0337903239999296,
"grad_norm": 114.125,
"learning_rate": 6.741573033707865e-07,
"loss": 86.1688,
"step": 120
},
{
"epoch": 0.035198254166593335,
"grad_norm": 108.6875,
"learning_rate": 7.02247191011236e-07,
"loss": 83.0914,
"step": 125
},
{
"epoch": 0.03660618433325707,
"grad_norm": 109.75,
"learning_rate": 7.303370786516854e-07,
"loss": 86.4069,
"step": 130
},
{
"epoch": 0.0380141144999208,
"grad_norm": 111.8125,
"learning_rate": 7.584269662921348e-07,
"loss": 85.471,
"step": 135
},
{
"epoch": 0.03942204466658454,
"grad_norm": 109.75,
"learning_rate": 7.865168539325843e-07,
"loss": 84.2073,
"step": 140
},
{
"epoch": 0.040829974833248274,
"grad_norm": 112.6875,
"learning_rate": 8.146067415730337e-07,
"loss": 85.0642,
"step": 145
},
{
"epoch": 0.04223790499991201,
"grad_norm": 112.5,
"learning_rate": 8.426966292134831e-07,
"loss": 86.0327,
"step": 150
},
{
"epoch": 0.04364583516657574,
"grad_norm": 111.625,
"learning_rate": 8.707865168539326e-07,
"loss": 86.5769,
"step": 155
},
{
"epoch": 0.04505376533323947,
"grad_norm": 110.1875,
"learning_rate": 8.988764044943819e-07,
"loss": 87.2292,
"step": 160
},
{
"epoch": 0.046461695499903206,
"grad_norm": 108.4375,
"learning_rate": 9.269662921348314e-07,
"loss": 87.8398,
"step": 165
},
{
"epoch": 0.04786962566656694,
"grad_norm": 107.5,
"learning_rate": 9.55056179775281e-07,
"loss": 87.6348,
"step": 170
},
{
"epoch": 0.04927755583323067,
"grad_norm": 115.625,
"learning_rate": 9.831460674157302e-07,
"loss": 87.7073,
"step": 175
},
{
"epoch": 0.050685485999894406,
"grad_norm": 115.125,
"learning_rate": 1.0112359550561797e-06,
"loss": 86.4599,
"step": 180
},
{
"epoch": 0.05209341616655814,
"grad_norm": 109.625,
"learning_rate": 1.0393258426966292e-06,
"loss": 85.6867,
"step": 185
},
{
"epoch": 0.05350134633322187,
"grad_norm": 111.1875,
"learning_rate": 1.0674157303370785e-06,
"loss": 88.4886,
"step": 190
},
{
"epoch": 0.054909276499885605,
"grad_norm": 113.6875,
"learning_rate": 1.095505617977528e-06,
"loss": 89.7883,
"step": 195
},
{
"epoch": 0.05631720666654934,
"grad_norm": 112.1875,
"learning_rate": 1.1235955056179775e-06,
"loss": 86.8666,
"step": 200
},
{
"epoch": 0.05772513683321307,
"grad_norm": 113.875,
"learning_rate": 1.151685393258427e-06,
"loss": 88.1594,
"step": 205
},
{
"epoch": 0.059133066999876804,
"grad_norm": 112.5625,
"learning_rate": 1.1797752808988763e-06,
"loss": 87.9299,
"step": 210
},
{
"epoch": 0.06054099716654054,
"grad_norm": 111.1875,
"learning_rate": 1.2078651685393258e-06,
"loss": 88.8051,
"step": 215
},
{
"epoch": 0.06194892733320427,
"grad_norm": 108.125,
"learning_rate": 1.235955056179775e-06,
"loss": 86.5864,
"step": 220
},
{
"epoch": 0.06335685749986801,
"grad_norm": 115.5625,
"learning_rate": 1.2640449438202247e-06,
"loss": 89.1844,
"step": 225
},
{
"epoch": 0.06476478766653174,
"grad_norm": 112.875,
"learning_rate": 1.2921348314606742e-06,
"loss": 89.5238,
"step": 230
},
{
"epoch": 0.06617271783319548,
"grad_norm": 111.1875,
"learning_rate": 1.3202247191011235e-06,
"loss": 89.9019,
"step": 235
},
{
"epoch": 0.0675806479998592,
"grad_norm": 114.0625,
"learning_rate": 1.348314606741573e-06,
"loss": 89.5135,
"step": 240
},
{
"epoch": 0.06898857816652294,
"grad_norm": 111.9375,
"learning_rate": 1.3764044943820223e-06,
"loss": 89.6118,
"step": 245
},
{
"epoch": 0.07039650833318667,
"grad_norm": 113.875,
"learning_rate": 1.404494382022472e-06,
"loss": 89.3294,
"step": 250
},
{
"epoch": 0.07180443849985041,
"grad_norm": 108.6875,
"learning_rate": 1.4325842696629213e-06,
"loss": 91.4349,
"step": 255
},
{
"epoch": 0.07321236866651414,
"grad_norm": 115.125,
"learning_rate": 1.4606741573033708e-06,
"loss": 89.6878,
"step": 260
},
{
"epoch": 0.07462029883317788,
"grad_norm": 111.3125,
"learning_rate": 1.48876404494382e-06,
"loss": 91.3542,
"step": 265
},
{
"epoch": 0.0760282289998416,
"grad_norm": 114.0625,
"learning_rate": 1.5168539325842696e-06,
"loss": 90.9409,
"step": 270
},
{
"epoch": 0.07743615916650534,
"grad_norm": 110.875,
"learning_rate": 1.544943820224719e-06,
"loss": 91.2244,
"step": 275
},
{
"epoch": 0.07884408933316908,
"grad_norm": 114.125,
"learning_rate": 1.5730337078651686e-06,
"loss": 90.888,
"step": 280
},
{
"epoch": 0.08025201949983281,
"grad_norm": 113.4375,
"learning_rate": 1.6011235955056178e-06,
"loss": 91.2134,
"step": 285
},
{
"epoch": 0.08165994966649655,
"grad_norm": 108.375,
"learning_rate": 1.6292134831460673e-06,
"loss": 91.87,
"step": 290
},
{
"epoch": 0.08306787983316027,
"grad_norm": 108.6875,
"learning_rate": 1.6573033707865166e-06,
"loss": 91.9418,
"step": 295
},
{
"epoch": 0.08447580999982401,
"grad_norm": 108.1875,
"learning_rate": 1.6853932584269661e-06,
"loss": 91.6362,
"step": 300
},
{
"epoch": 0.08588374016648774,
"grad_norm": 114.9375,
"learning_rate": 1.7134831460674158e-06,
"loss": 92.6335,
"step": 305
},
{
"epoch": 0.08729167033315148,
"grad_norm": 107.1875,
"learning_rate": 1.741573033707865e-06,
"loss": 92.5334,
"step": 310
},
{
"epoch": 0.0886996004998152,
"grad_norm": 114.6875,
"learning_rate": 1.7696629213483146e-06,
"loss": 91.7963,
"step": 315
},
{
"epoch": 0.09010753066647895,
"grad_norm": 109.1875,
"learning_rate": 1.7977528089887639e-06,
"loss": 92.5923,
"step": 320
},
{
"epoch": 0.09151546083314267,
"grad_norm": 114.9375,
"learning_rate": 1.8258426966292134e-06,
"loss": 93.9665,
"step": 325
},
{
"epoch": 0.09292339099980641,
"grad_norm": 118.1875,
"learning_rate": 1.8539325842696629e-06,
"loss": 91.7156,
"step": 330
},
{
"epoch": 0.09433132116647014,
"grad_norm": 116.0,
"learning_rate": 1.8820224719101124e-06,
"loss": 94.3289,
"step": 335
},
{
"epoch": 0.09573925133313388,
"grad_norm": 116.75,
"learning_rate": 1.910112359550562e-06,
"loss": 94.187,
"step": 340
},
{
"epoch": 0.0971471814997976,
"grad_norm": 111.875,
"learning_rate": 1.938202247191011e-06,
"loss": 93.7581,
"step": 345
},
{
"epoch": 0.09855511166646135,
"grad_norm": 114.9375,
"learning_rate": 1.9662921348314604e-06,
"loss": 93.413,
"step": 350
},
{
"epoch": 0.09996304183312507,
"grad_norm": 112.625,
"learning_rate": 1.99438202247191e-06,
"loss": 94.2749,
"step": 355
},
{
"epoch": 0.10137097199978881,
"grad_norm": 107.6875,
"learning_rate": 1.9974960876369326e-06,
"loss": 94.6595,
"step": 360
},
{
"epoch": 0.10277890216645254,
"grad_norm": 115.125,
"learning_rate": 1.9943661971830985e-06,
"loss": 94.8831,
"step": 365
},
{
"epoch": 0.10418683233311628,
"grad_norm": 111.9375,
"learning_rate": 1.9912363067292643e-06,
"loss": 94.5983,
"step": 370
},
{
"epoch": 0.10559476249978002,
"grad_norm": 113.875,
"learning_rate": 1.98810641627543e-06,
"loss": 96.0132,
"step": 375
},
{
"epoch": 0.10700269266644374,
"grad_norm": 111.8125,
"learning_rate": 1.984976525821596e-06,
"loss": 96.9668,
"step": 380
},
{
"epoch": 0.10841062283310748,
"grad_norm": 109.5625,
"learning_rate": 1.981846635367762e-06,
"loss": 94.2092,
"step": 385
},
{
"epoch": 0.10981855299977121,
"grad_norm": 113.6875,
"learning_rate": 1.9787167449139278e-06,
"loss": 93.9071,
"step": 390
},
{
"epoch": 0.11122648316643495,
"grad_norm": 113.625,
"learning_rate": 1.9755868544600936e-06,
"loss": 94.9607,
"step": 395
},
{
"epoch": 0.11263441333309868,
"grad_norm": 110.75,
"learning_rate": 1.9724569640062595e-06,
"loss": 92.9988,
"step": 400
},
{
"epoch": 0.11404234349976242,
"grad_norm": 114.25,
"learning_rate": 1.9693270735524258e-06,
"loss": 95.0537,
"step": 405
},
{
"epoch": 0.11545027366642614,
"grad_norm": 110.9375,
"learning_rate": 1.9661971830985916e-06,
"loss": 94.5083,
"step": 410
},
{
"epoch": 0.11685820383308988,
"grad_norm": 113.375,
"learning_rate": 1.9630672926447575e-06,
"loss": 94.8912,
"step": 415
},
{
"epoch": 0.11826613399975361,
"grad_norm": 112.125,
"learning_rate": 1.9599374021909234e-06,
"loss": 95.3204,
"step": 420
},
{
"epoch": 0.11967406416641735,
"grad_norm": 113.625,
"learning_rate": 1.9568075117370892e-06,
"loss": 95.2065,
"step": 425
},
{
"epoch": 0.12108199433308107,
"grad_norm": 111.5625,
"learning_rate": 1.953677621283255e-06,
"loss": 94.5451,
"step": 430
},
{
"epoch": 0.12248992449974482,
"grad_norm": 108.75,
"learning_rate": 1.950547730829421e-06,
"loss": 95.1698,
"step": 435
},
{
"epoch": 0.12389785466640854,
"grad_norm": 107.9375,
"learning_rate": 1.947417840375587e-06,
"loss": 94.7059,
"step": 440
},
{
"epoch": 0.12530578483307228,
"grad_norm": 108.875,
"learning_rate": 1.9442879499217527e-06,
"loss": 94.4242,
"step": 445
},
{
"epoch": 0.12671371499973602,
"grad_norm": 110.875,
"learning_rate": 1.9411580594679185e-06,
"loss": 95.4208,
"step": 450
},
{
"epoch": 0.12812164516639973,
"grad_norm": 112.4375,
"learning_rate": 1.9380281690140844e-06,
"loss": 96.0422,
"step": 455
},
{
"epoch": 0.12952957533306347,
"grad_norm": 111.25,
"learning_rate": 1.9348982785602502e-06,
"loss": 93.9782,
"step": 460
},
{
"epoch": 0.1309375054997272,
"grad_norm": 112.4375,
"learning_rate": 1.931768388106416e-06,
"loss": 93.49,
"step": 465
},
{
"epoch": 0.13234543566639095,
"grad_norm": 111.25,
"learning_rate": 1.928638497652582e-06,
"loss": 93.6705,
"step": 470
},
{
"epoch": 0.1337533658330547,
"grad_norm": 109.75,
"learning_rate": 1.925508607198748e-06,
"loss": 94.0692,
"step": 475
},
{
"epoch": 0.1351612959997184,
"grad_norm": 106.875,
"learning_rate": 1.922378716744914e-06,
"loss": 94.4334,
"step": 480
},
{
"epoch": 0.13656922616638215,
"grad_norm": 110.4375,
"learning_rate": 1.9192488262910796e-06,
"loss": 95.9341,
"step": 485
},
{
"epoch": 0.13797715633304589,
"grad_norm": 108.5,
"learning_rate": 1.9161189358372454e-06,
"loss": 94.6785,
"step": 490
},
{
"epoch": 0.13938508649970963,
"grad_norm": 109.5,
"learning_rate": 1.9129890453834113e-06,
"loss": 95.5102,
"step": 495
},
{
"epoch": 0.14079301666637334,
"grad_norm": 110.3125,
"learning_rate": 1.909859154929577e-06,
"loss": 93.6705,
"step": 500
},
{
"epoch": 0.14079301666637334,
"eval_loss": 2.953507661819458,
"eval_runtime": 174.8095,
"eval_samples_per_second": 1094.883,
"eval_steps_per_second": 34.22,
"step": 500
},
{
"epoch": 0.14220094683303708,
"grad_norm": 116.3125,
"learning_rate": 1.9067292644757434e-06,
"loss": 93.6612,
"step": 505
},
{
"epoch": 0.14360887699970082,
"grad_norm": 107.5625,
"learning_rate": 1.9035993740219093e-06,
"loss": 93.6054,
"step": 510
},
{
"epoch": 0.14501680716636456,
"grad_norm": 107.4375,
"learning_rate": 1.900469483568075e-06,
"loss": 93.6585,
"step": 515
},
{
"epoch": 0.14642473733302827,
"grad_norm": 111.3125,
"learning_rate": 1.8973395931142408e-06,
"loss": 93.5845,
"step": 520
},
{
"epoch": 0.147832667499692,
"grad_norm": 109.8125,
"learning_rate": 1.8942097026604067e-06,
"loss": 96.1556,
"step": 525
},
{
"epoch": 0.14924059766635575,
"grad_norm": 112.3125,
"learning_rate": 1.8910798122065727e-06,
"loss": 95.0694,
"step": 530
},
{
"epoch": 0.1506485278330195,
"grad_norm": 108.0625,
"learning_rate": 1.8879499217527386e-06,
"loss": 94.7092,
"step": 535
},
{
"epoch": 0.1520564579996832,
"grad_norm": 107.0,
"learning_rate": 1.8848200312989044e-06,
"loss": 92.7553,
"step": 540
},
{
"epoch": 0.15346438816634694,
"grad_norm": 110.4375,
"learning_rate": 1.8816901408450703e-06,
"loss": 95.0633,
"step": 545
},
{
"epoch": 0.15487231833301068,
"grad_norm": 105.8125,
"learning_rate": 1.8785602503912362e-06,
"loss": 93.2634,
"step": 550
},
{
"epoch": 0.15628024849967442,
"grad_norm": 109.9375,
"learning_rate": 1.8754303599374022e-06,
"loss": 94.7607,
"step": 555
},
{
"epoch": 0.15768817866633816,
"grad_norm": 107.8125,
"learning_rate": 1.872300469483568e-06,
"loss": 94.321,
"step": 560
},
{
"epoch": 0.15909610883300188,
"grad_norm": 104.5625,
"learning_rate": 1.8691705790297338e-06,
"loss": 94.108,
"step": 565
},
{
"epoch": 0.16050403899966562,
"grad_norm": 106.25,
"learning_rate": 1.8660406885758996e-06,
"loss": 94.7094,
"step": 570
},
{
"epoch": 0.16191196916632936,
"grad_norm": 103.8125,
"learning_rate": 1.8629107981220657e-06,
"loss": 95.3564,
"step": 575
},
{
"epoch": 0.1633198993329931,
"grad_norm": 109.375,
"learning_rate": 1.8597809076682315e-06,
"loss": 94.0117,
"step": 580
},
{
"epoch": 0.1647278294996568,
"grad_norm": 111.0,
"learning_rate": 1.8566510172143974e-06,
"loss": 94.3991,
"step": 585
},
{
"epoch": 0.16613575966632055,
"grad_norm": 107.9375,
"learning_rate": 1.8535211267605633e-06,
"loss": 93.3577,
"step": 590
},
{
"epoch": 0.1675436898329843,
"grad_norm": 113.0,
"learning_rate": 1.8503912363067291e-06,
"loss": 92.6764,
"step": 595
},
{
"epoch": 0.16895161999964803,
"grad_norm": 108.875,
"learning_rate": 1.8472613458528952e-06,
"loss": 93.5743,
"step": 600
},
{
"epoch": 0.17035955016631174,
"grad_norm": 110.75,
"learning_rate": 1.844131455399061e-06,
"loss": 95.4879,
"step": 605
},
{
"epoch": 0.17176748033297548,
"grad_norm": 108.1875,
"learning_rate": 1.841001564945227e-06,
"loss": 94.0815,
"step": 610
},
{
"epoch": 0.17317541049963922,
"grad_norm": 107.6875,
"learning_rate": 1.8378716744913928e-06,
"loss": 93.5862,
"step": 615
},
{
"epoch": 0.17458334066630296,
"grad_norm": 110.6875,
"learning_rate": 1.8347417840375584e-06,
"loss": 94.7766,
"step": 620
},
{
"epoch": 0.17599127083296667,
"grad_norm": 107.75,
"learning_rate": 1.8316118935837245e-06,
"loss": 93.7132,
"step": 625
},
{
"epoch": 0.1773992009996304,
"grad_norm": 108.5,
"learning_rate": 1.8284820031298904e-06,
"loss": 95.441,
"step": 630
},
{
"epoch": 0.17880713116629415,
"grad_norm": 109.875,
"learning_rate": 1.8253521126760562e-06,
"loss": 94.4422,
"step": 635
},
{
"epoch": 0.1802150613329579,
"grad_norm": 108.9375,
"learning_rate": 1.822222222222222e-06,
"loss": 92.7065,
"step": 640
},
{
"epoch": 0.1816229914996216,
"grad_norm": 108.0,
"learning_rate": 1.819092331768388e-06,
"loss": 95.6058,
"step": 645
},
{
"epoch": 0.18303092166628535,
"grad_norm": 113.4375,
"learning_rate": 1.815962441314554e-06,
"loss": 93.1739,
"step": 650
},
{
"epoch": 0.18443885183294909,
"grad_norm": 105.375,
"learning_rate": 1.8128325508607199e-06,
"loss": 94.3629,
"step": 655
},
{
"epoch": 0.18584678199961283,
"grad_norm": 109.3125,
"learning_rate": 1.8097026604068857e-06,
"loss": 93.1163,
"step": 660
},
{
"epoch": 0.18725471216627657,
"grad_norm": 109.75,
"learning_rate": 1.8065727699530516e-06,
"loss": 91.9717,
"step": 665
},
{
"epoch": 0.18866264233294028,
"grad_norm": 107.0,
"learning_rate": 1.8034428794992173e-06,
"loss": 93.1724,
"step": 670
},
{
"epoch": 0.19007057249960402,
"grad_norm": 111.125,
"learning_rate": 1.8003129890453833e-06,
"loss": 93.8705,
"step": 675
},
{
"epoch": 0.19147850266626776,
"grad_norm": 106.6875,
"learning_rate": 1.7971830985915492e-06,
"loss": 95.3723,
"step": 680
},
{
"epoch": 0.1928864328329315,
"grad_norm": 105.75,
"learning_rate": 1.794053208137715e-06,
"loss": 94.0826,
"step": 685
},
{
"epoch": 0.1942943629995952,
"grad_norm": 108.8125,
"learning_rate": 1.790923317683881e-06,
"loss": 93.8173,
"step": 690
},
{
"epoch": 0.19570229316625895,
"grad_norm": 108.5625,
"learning_rate": 1.7877934272300468e-06,
"loss": 93.0598,
"step": 695
},
{
"epoch": 0.1971102233329227,
"grad_norm": 105.1875,
"learning_rate": 1.7846635367762128e-06,
"loss": 93.9662,
"step": 700
},
{
"epoch": 0.19851815349958643,
"grad_norm": 109.0625,
"learning_rate": 1.7815336463223787e-06,
"loss": 91.824,
"step": 705
},
{
"epoch": 0.19992608366625014,
"grad_norm": 109.625,
"learning_rate": 1.7784037558685446e-06,
"loss": 93.3993,
"step": 710
},
{
"epoch": 0.20133401383291388,
"grad_norm": 108.125,
"learning_rate": 1.7752738654147104e-06,
"loss": 92.4667,
"step": 715
},
{
"epoch": 0.20274194399957762,
"grad_norm": 111.0,
"learning_rate": 1.7721439749608763e-06,
"loss": 92.8489,
"step": 720
},
{
"epoch": 0.20414987416624136,
"grad_norm": 109.125,
"learning_rate": 1.7690140845070422e-06,
"loss": 93.2275,
"step": 725
},
{
"epoch": 0.20555780433290508,
"grad_norm": 108.0625,
"learning_rate": 1.765884194053208e-06,
"loss": 92.2952,
"step": 730
},
{
"epoch": 0.20696573449956882,
"grad_norm": 107.375,
"learning_rate": 1.7627543035993739e-06,
"loss": 93.8364,
"step": 735
},
{
"epoch": 0.20837366466623256,
"grad_norm": 110.0625,
"learning_rate": 1.7596244131455397e-06,
"loss": 93.7801,
"step": 740
},
{
"epoch": 0.2097815948328963,
"grad_norm": 109.8125,
"learning_rate": 1.7564945226917056e-06,
"loss": 93.1315,
"step": 745
},
{
"epoch": 0.21118952499956004,
"grad_norm": 111.0625,
"learning_rate": 1.7533646322378717e-06,
"loss": 91.7111,
"step": 750
},
{
"epoch": 0.21259745516622375,
"grad_norm": 108.25,
"learning_rate": 1.7502347417840375e-06,
"loss": 93.8666,
"step": 755
},
{
"epoch": 0.2140053853328875,
"grad_norm": 108.875,
"learning_rate": 1.7471048513302034e-06,
"loss": 91.9754,
"step": 760
},
{
"epoch": 0.21541331549955123,
"grad_norm": 108.0,
"learning_rate": 1.7439749608763693e-06,
"loss": 94.4781,
"step": 765
},
{
"epoch": 0.21682124566621497,
"grad_norm": 109.1875,
"learning_rate": 1.7408450704225351e-06,
"loss": 92.3431,
"step": 770
},
{
"epoch": 0.21822917583287868,
"grad_norm": 108.0625,
"learning_rate": 1.7377151799687012e-06,
"loss": 93.1383,
"step": 775
},
{
"epoch": 0.21963710599954242,
"grad_norm": 107.4375,
"learning_rate": 1.7345852895148668e-06,
"loss": 93.4093,
"step": 780
},
{
"epoch": 0.22104503616620616,
"grad_norm": 107.875,
"learning_rate": 1.7314553990610327e-06,
"loss": 93.3635,
"step": 785
},
{
"epoch": 0.2224529663328699,
"grad_norm": 109.0625,
"learning_rate": 1.7283255086071986e-06,
"loss": 90.5487,
"step": 790
},
{
"epoch": 0.2238608964995336,
"grad_norm": 105.9375,
"learning_rate": 1.7251956181533644e-06,
"loss": 93.1931,
"step": 795
},
{
"epoch": 0.22526882666619735,
"grad_norm": 109.5625,
"learning_rate": 1.7220657276995305e-06,
"loss": 93.2803,
"step": 800
},
{
"epoch": 0.2266767568328611,
"grad_norm": 105.75,
"learning_rate": 1.7189358372456964e-06,
"loss": 92.0815,
"step": 805
},
{
"epoch": 0.22808468699952483,
"grad_norm": 106.0625,
"learning_rate": 1.7158059467918622e-06,
"loss": 92.8134,
"step": 810
},
{
"epoch": 0.22949261716618855,
"grad_norm": 105.0625,
"learning_rate": 1.712676056338028e-06,
"loss": 91.2653,
"step": 815
},
{
"epoch": 0.23090054733285229,
"grad_norm": 107.4375,
"learning_rate": 1.709546165884194e-06,
"loss": 92.2337,
"step": 820
},
{
"epoch": 0.23230847749951603,
"grad_norm": 105.75,
"learning_rate": 1.70641627543036e-06,
"loss": 91.8069,
"step": 825
},
{
"epoch": 0.23371640766617977,
"grad_norm": 112.0,
"learning_rate": 1.7032863849765259e-06,
"loss": 90.8042,
"step": 830
},
{
"epoch": 0.23512433783284348,
"grad_norm": 107.25,
"learning_rate": 1.7001564945226915e-06,
"loss": 92.4549,
"step": 835
},
{
"epoch": 0.23653226799950722,
"grad_norm": 104.0625,
"learning_rate": 1.6970266040688574e-06,
"loss": 93.1807,
"step": 840
},
{
"epoch": 0.23794019816617096,
"grad_norm": 108.75,
"learning_rate": 1.6938967136150232e-06,
"loss": 92.848,
"step": 845
},
{
"epoch": 0.2393481283328347,
"grad_norm": 110.9375,
"learning_rate": 1.6907668231611893e-06,
"loss": 91.5934,
"step": 850
},
{
"epoch": 0.24075605849949844,
"grad_norm": 105.1875,
"learning_rate": 1.6876369327073552e-06,
"loss": 91.8347,
"step": 855
},
{
"epoch": 0.24216398866616215,
"grad_norm": 109.6875,
"learning_rate": 1.684507042253521e-06,
"loss": 91.4695,
"step": 860
},
{
"epoch": 0.2435719188328259,
"grad_norm": 108.3125,
"learning_rate": 1.681377151799687e-06,
"loss": 92.2532,
"step": 865
},
{
"epoch": 0.24497984899948963,
"grad_norm": 105.9375,
"learning_rate": 1.678247261345853e-06,
"loss": 91.6567,
"step": 870
},
{
"epoch": 0.24638777916615337,
"grad_norm": 107.4375,
"learning_rate": 1.6751173708920188e-06,
"loss": 91.7404,
"step": 875
},
{
"epoch": 0.24779570933281708,
"grad_norm": 108.125,
"learning_rate": 1.6719874804381847e-06,
"loss": 91.8094,
"step": 880
},
{
"epoch": 0.24920363949948082,
"grad_norm": 105.8125,
"learning_rate": 1.6688575899843503e-06,
"loss": 91.615,
"step": 885
},
{
"epoch": 0.25061156966614456,
"grad_norm": 108.0625,
"learning_rate": 1.6657276995305162e-06,
"loss": 93.0504,
"step": 890
},
{
"epoch": 0.2520194998328083,
"grad_norm": 112.375,
"learning_rate": 1.6625978090766823e-06,
"loss": 91.0786,
"step": 895
},
{
"epoch": 0.25342742999947204,
"grad_norm": 107.875,
"learning_rate": 1.6594679186228481e-06,
"loss": 90.6298,
"step": 900
},
{
"epoch": 0.2548353601661358,
"grad_norm": 106.1875,
"learning_rate": 1.656338028169014e-06,
"loss": 91.8793,
"step": 905
},
{
"epoch": 0.25624329033279947,
"grad_norm": 110.5,
"learning_rate": 1.6532081377151799e-06,
"loss": 91.6597,
"step": 910
},
{
"epoch": 0.2576512204994632,
"grad_norm": 106.125,
"learning_rate": 1.6500782472613457e-06,
"loss": 91.8785,
"step": 915
},
{
"epoch": 0.25905915066612695,
"grad_norm": 106.0625,
"learning_rate": 1.6469483568075118e-06,
"loss": 90.6928,
"step": 920
},
{
"epoch": 0.2604670808327907,
"grad_norm": 105.4375,
"learning_rate": 1.6438184663536777e-06,
"loss": 90.5155,
"step": 925
},
{
"epoch": 0.2618750109994544,
"grad_norm": 105.4375,
"learning_rate": 1.6406885758998435e-06,
"loss": 91.1331,
"step": 930
},
{
"epoch": 0.26328294116611817,
"grad_norm": 104.875,
"learning_rate": 1.6375586854460094e-06,
"loss": 92.0714,
"step": 935
},
{
"epoch": 0.2646908713327819,
"grad_norm": 106.375,
"learning_rate": 1.634428794992175e-06,
"loss": 91.0242,
"step": 940
},
{
"epoch": 0.26609880149944565,
"grad_norm": 108.6875,
"learning_rate": 1.631298904538341e-06,
"loss": 91.612,
"step": 945
},
{
"epoch": 0.2675067316661094,
"grad_norm": 104.6875,
"learning_rate": 1.628169014084507e-06,
"loss": 90.1659,
"step": 950
},
{
"epoch": 0.26891466183277307,
"grad_norm": 105.1875,
"learning_rate": 1.6250391236306728e-06,
"loss": 91.2171,
"step": 955
},
{
"epoch": 0.2703225919994368,
"grad_norm": 107.375,
"learning_rate": 1.6219092331768387e-06,
"loss": 91.4,
"step": 960
},
{
"epoch": 0.27173052216610055,
"grad_norm": 105.5625,
"learning_rate": 1.6187793427230045e-06,
"loss": 90.2747,
"step": 965
},
{
"epoch": 0.2731384523327643,
"grad_norm": 113.0,
"learning_rate": 1.6156494522691706e-06,
"loss": 90.9026,
"step": 970
},
{
"epoch": 0.27454638249942803,
"grad_norm": 108.8125,
"learning_rate": 1.6125195618153365e-06,
"loss": 90.2174,
"step": 975
},
{
"epoch": 0.27595431266609177,
"grad_norm": 107.0625,
"learning_rate": 1.6093896713615023e-06,
"loss": 92.164,
"step": 980
},
{
"epoch": 0.2773622428327555,
"grad_norm": 109.0,
"learning_rate": 1.6062597809076682e-06,
"loss": 90.952,
"step": 985
},
{
"epoch": 0.27877017299941925,
"grad_norm": 109.3125,
"learning_rate": 1.6031298904538339e-06,
"loss": 91.6762,
"step": 990
},
{
"epoch": 0.28017810316608294,
"grad_norm": 108.6875,
"learning_rate": 1.6e-06,
"loss": 89.3236,
"step": 995
},
{
"epoch": 0.2815860333327467,
"grad_norm": 105.0,
"learning_rate": 1.5968701095461658e-06,
"loss": 89.8639,
"step": 1000
},
{
"epoch": 0.2815860333327467,
"eval_loss": 2.838271141052246,
"eval_runtime": 172.5248,
"eval_samples_per_second": 1109.383,
"eval_steps_per_second": 34.673,
"step": 1000
},
{
"epoch": 0.2829939634994104,
"grad_norm": 106.125,
"learning_rate": 1.5937402190923316e-06,
"loss": 89.9307,
"step": 1005
},
{
"epoch": 0.28440189366607416,
"grad_norm": 106.0,
"learning_rate": 1.5906103286384975e-06,
"loss": 92.5395,
"step": 1010
},
{
"epoch": 0.2858098238327379,
"grad_norm": 105.5,
"learning_rate": 1.5874804381846634e-06,
"loss": 92.4412,
"step": 1015
},
{
"epoch": 0.28721775399940164,
"grad_norm": 105.25,
"learning_rate": 1.5843505477308294e-06,
"loss": 89.2497,
"step": 1020
},
{
"epoch": 0.2886256841660654,
"grad_norm": 104.5,
"learning_rate": 1.5812206572769953e-06,
"loss": 90.8058,
"step": 1025
},
{
"epoch": 0.2900336143327291,
"grad_norm": 106.8125,
"learning_rate": 1.5780907668231612e-06,
"loss": 90.845,
"step": 1030
},
{
"epoch": 0.29144154449939286,
"grad_norm": 108.8125,
"learning_rate": 1.574960876369327e-06,
"loss": 89.4672,
"step": 1035
},
{
"epoch": 0.29284947466605654,
"grad_norm": 105.4375,
"learning_rate": 1.5718309859154929e-06,
"loss": 90.9758,
"step": 1040
},
{
"epoch": 0.2942574048327203,
"grad_norm": 102.875,
"learning_rate": 1.5687010954616588e-06,
"loss": 88.5602,
"step": 1045
},
{
"epoch": 0.295665334999384,
"grad_norm": 108.6875,
"learning_rate": 1.5655712050078246e-06,
"loss": 89.8782,
"step": 1050
},
{
"epoch": 0.29707326516604776,
"grad_norm": 104.5,
"learning_rate": 1.5624413145539905e-06,
"loss": 89.0,
"step": 1055
},
{
"epoch": 0.2984811953327115,
"grad_norm": 105.9375,
"learning_rate": 1.5593114241001563e-06,
"loss": 90.7312,
"step": 1060
},
{
"epoch": 0.29988912549937524,
"grad_norm": 103.625,
"learning_rate": 1.5561815336463222e-06,
"loss": 89.4973,
"step": 1065
},
{
"epoch": 0.301297055666039,
"grad_norm": 106.25,
"learning_rate": 1.5530516431924883e-06,
"loss": 90.8279,
"step": 1070
},
{
"epoch": 0.3027049858327027,
"grad_norm": 110.9375,
"learning_rate": 1.5499217527386541e-06,
"loss": 89.7147,
"step": 1075
},
{
"epoch": 0.3041129159993664,
"grad_norm": 103.25,
"learning_rate": 1.54679186228482e-06,
"loss": 89.0186,
"step": 1080
},
{
"epoch": 0.30552084616603015,
"grad_norm": 108.6875,
"learning_rate": 1.5436619718309859e-06,
"loss": 89.8325,
"step": 1085
},
{
"epoch": 0.3069287763326939,
"grad_norm": 104.9375,
"learning_rate": 1.5405320813771517e-06,
"loss": 90.2746,
"step": 1090
},
{
"epoch": 0.3083367064993576,
"grad_norm": 103.4375,
"learning_rate": 1.5374021909233178e-06,
"loss": 91.3873,
"step": 1095
},
{
"epoch": 0.30974463666602137,
"grad_norm": 105.0625,
"learning_rate": 1.5342723004694834e-06,
"loss": 90.5268,
"step": 1100
},
{
"epoch": 0.3111525668326851,
"grad_norm": 108.25,
"learning_rate": 1.5311424100156493e-06,
"loss": 90.6294,
"step": 1105
},
{
"epoch": 0.31256049699934885,
"grad_norm": 109.5,
"learning_rate": 1.5280125195618152e-06,
"loss": 90.3081,
"step": 1110
},
{
"epoch": 0.3139684271660126,
"grad_norm": 102.8125,
"learning_rate": 1.524882629107981e-06,
"loss": 90.183,
"step": 1115
},
{
"epoch": 0.3153763573326763,
"grad_norm": 103.0625,
"learning_rate": 1.521752738654147e-06,
"loss": 89.3388,
"step": 1120
},
{
"epoch": 0.31678428749934,
"grad_norm": 108.3125,
"learning_rate": 1.518622848200313e-06,
"loss": 91.1364,
"step": 1125
},
{
"epoch": 0.31819221766600375,
"grad_norm": 103.25,
"learning_rate": 1.5154929577464788e-06,
"loss": 89.0373,
"step": 1130
},
{
"epoch": 0.3196001478326675,
"grad_norm": 102.5,
"learning_rate": 1.5123630672926447e-06,
"loss": 89.7825,
"step": 1135
},
{
"epoch": 0.32100807799933123,
"grad_norm": 106.625,
"learning_rate": 1.5092331768388105e-06,
"loss": 89.4166,
"step": 1140
},
{
"epoch": 0.32241600816599497,
"grad_norm": 107.875,
"learning_rate": 1.5061032863849766e-06,
"loss": 88.7669,
"step": 1145
},
{
"epoch": 0.3238239383326587,
"grad_norm": 104.6875,
"learning_rate": 1.5029733959311423e-06,
"loss": 89.7136,
"step": 1150
},
{
"epoch": 0.32523186849932245,
"grad_norm": 106.0,
"learning_rate": 1.4998435054773081e-06,
"loss": 88.9745,
"step": 1155
},
{
"epoch": 0.3266397986659862,
"grad_norm": 109.625,
"learning_rate": 1.496713615023474e-06,
"loss": 89.3217,
"step": 1160
},
{
"epoch": 0.3280477288326499,
"grad_norm": 104.9375,
"learning_rate": 1.49358372456964e-06,
"loss": 88.7268,
"step": 1165
},
{
"epoch": 0.3294556589993136,
"grad_norm": 107.25,
"learning_rate": 1.490453834115806e-06,
"loss": 88.8883,
"step": 1170
},
{
"epoch": 0.33086358916597736,
"grad_norm": 108.8125,
"learning_rate": 1.4873239436619718e-06,
"loss": 88.9828,
"step": 1175
},
{
"epoch": 0.3322715193326411,
"grad_norm": 105.5,
"learning_rate": 1.4841940532081376e-06,
"loss": 90.4691,
"step": 1180
},
{
"epoch": 0.33367944949930484,
"grad_norm": 104.0625,
"learning_rate": 1.4810641627543035e-06,
"loss": 88.7875,
"step": 1185
},
{
"epoch": 0.3350873796659686,
"grad_norm": 106.0625,
"learning_rate": 1.4779342723004696e-06,
"loss": 89.6179,
"step": 1190
},
{
"epoch": 0.3364953098326323,
"grad_norm": 106.5,
"learning_rate": 1.4748043818466354e-06,
"loss": 89.5033,
"step": 1195
},
{
"epoch": 0.33790323999929606,
"grad_norm": 105.625,
"learning_rate": 1.4716744913928013e-06,
"loss": 90.4939,
"step": 1200
},
{
"epoch": 0.33931117016595974,
"grad_norm": 107.3125,
"learning_rate": 1.468544600938967e-06,
"loss": 89.3375,
"step": 1205
},
{
"epoch": 0.3407191003326235,
"grad_norm": 105.6875,
"learning_rate": 1.4654147104851328e-06,
"loss": 89.1358,
"step": 1210
},
{
"epoch": 0.3421270304992872,
"grad_norm": 106.25,
"learning_rate": 1.4622848200312989e-06,
"loss": 89.1661,
"step": 1215
},
{
"epoch": 0.34353496066595096,
"grad_norm": 106.4375,
"learning_rate": 1.4591549295774647e-06,
"loss": 87.8099,
"step": 1220
},
{
"epoch": 0.3449428908326147,
"grad_norm": 103.875,
"learning_rate": 1.4560250391236306e-06,
"loss": 89.5182,
"step": 1225
},
{
"epoch": 0.34635082099927844,
"grad_norm": 105.375,
"learning_rate": 1.4528951486697965e-06,
"loss": 88.9682,
"step": 1230
},
{
"epoch": 0.3477587511659422,
"grad_norm": 106.75,
"learning_rate": 1.4497652582159623e-06,
"loss": 90.2759,
"step": 1235
},
{
"epoch": 0.3491666813326059,
"grad_norm": 104.4375,
"learning_rate": 1.4466353677621284e-06,
"loss": 88.1414,
"step": 1240
},
{
"epoch": 0.35057461149926966,
"grad_norm": 104.6875,
"learning_rate": 1.4435054773082943e-06,
"loss": 87.1446,
"step": 1245
},
{
"epoch": 0.35198254166593335,
"grad_norm": 109.625,
"learning_rate": 1.4403755868544601e-06,
"loss": 88.6361,
"step": 1250
},
{
"epoch": 0.3533904718325971,
"grad_norm": 104.375,
"learning_rate": 1.437245696400626e-06,
"loss": 87.653,
"step": 1255
},
{
"epoch": 0.3547984019992608,
"grad_norm": 106.1875,
"learning_rate": 1.4341158059467916e-06,
"loss": 89.5021,
"step": 1260
},
{
"epoch": 0.35620633216592457,
"grad_norm": 107.5,
"learning_rate": 1.4309859154929577e-06,
"loss": 89.5801,
"step": 1265
},
{
"epoch": 0.3576142623325883,
"grad_norm": 104.0,
"learning_rate": 1.4278560250391236e-06,
"loss": 88.0105,
"step": 1270
},
{
"epoch": 0.35902219249925205,
"grad_norm": 100.875,
"learning_rate": 1.4247261345852894e-06,
"loss": 87.0041,
"step": 1275
},
{
"epoch": 0.3604301226659158,
"grad_norm": 106.875,
"learning_rate": 1.4215962441314553e-06,
"loss": 88.9566,
"step": 1280
},
{
"epoch": 0.3618380528325795,
"grad_norm": 104.0,
"learning_rate": 1.4184663536776211e-06,
"loss": 88.0401,
"step": 1285
},
{
"epoch": 0.3632459829992432,
"grad_norm": 106.6875,
"learning_rate": 1.4153364632237872e-06,
"loss": 88.6244,
"step": 1290
},
{
"epoch": 0.36465391316590695,
"grad_norm": 108.5,
"learning_rate": 1.412206572769953e-06,
"loss": 89.1999,
"step": 1295
},
{
"epoch": 0.3660618433325707,
"grad_norm": 106.9375,
"learning_rate": 1.409076682316119e-06,
"loss": 87.5746,
"step": 1300
},
{
"epoch": 0.36746977349923443,
"grad_norm": 106.875,
"learning_rate": 1.4059467918622848e-06,
"loss": 88.8466,
"step": 1305
},
{
"epoch": 0.36887770366589817,
"grad_norm": 109.375,
"learning_rate": 1.4028169014084504e-06,
"loss": 88.3115,
"step": 1310
},
{
"epoch": 0.3702856338325619,
"grad_norm": 105.375,
"learning_rate": 1.3996870109546165e-06,
"loss": 88.6426,
"step": 1315
},
{
"epoch": 0.37169356399922565,
"grad_norm": 105.125,
"learning_rate": 1.3965571205007824e-06,
"loss": 88.8493,
"step": 1320
},
{
"epoch": 0.3731014941658894,
"grad_norm": 107.4375,
"learning_rate": 1.3934272300469482e-06,
"loss": 88.7368,
"step": 1325
},
{
"epoch": 0.37450942433255313,
"grad_norm": 107.75,
"learning_rate": 1.3902973395931141e-06,
"loss": 87.2699,
"step": 1330
},
{
"epoch": 0.3759173544992168,
"grad_norm": 106.8125,
"learning_rate": 1.38716744913928e-06,
"loss": 88.8466,
"step": 1335
},
{
"epoch": 0.37732528466588056,
"grad_norm": 102.875,
"learning_rate": 1.384037558685446e-06,
"loss": 86.9346,
"step": 1340
},
{
"epoch": 0.3787332148325443,
"grad_norm": 108.0625,
"learning_rate": 1.380907668231612e-06,
"loss": 88.0844,
"step": 1345
},
{
"epoch": 0.38014114499920804,
"grad_norm": 107.5,
"learning_rate": 1.3777777777777778e-06,
"loss": 87.3932,
"step": 1350
},
{
"epoch": 0.3815490751658718,
"grad_norm": 106.3125,
"learning_rate": 1.3746478873239436e-06,
"loss": 87.1559,
"step": 1355
},
{
"epoch": 0.3829570053325355,
"grad_norm": 107.0,
"learning_rate": 1.3715179968701095e-06,
"loss": 87.3616,
"step": 1360
},
{
"epoch": 0.38436493549919926,
"grad_norm": 109.0,
"learning_rate": 1.3683881064162753e-06,
"loss": 86.6208,
"step": 1365
},
{
"epoch": 0.385772865665863,
"grad_norm": 107.25,
"learning_rate": 1.3652582159624412e-06,
"loss": 87.1467,
"step": 1370
},
{
"epoch": 0.3871807958325267,
"grad_norm": 104.5625,
"learning_rate": 1.362128325508607e-06,
"loss": 86.8382,
"step": 1375
},
{
"epoch": 0.3885887259991904,
"grad_norm": 103.125,
"learning_rate": 1.358998435054773e-06,
"loss": 88.2134,
"step": 1380
},
{
"epoch": 0.38999665616585416,
"grad_norm": 110.1875,
"learning_rate": 1.3558685446009388e-06,
"loss": 87.1116,
"step": 1385
},
{
"epoch": 0.3914045863325179,
"grad_norm": 106.3125,
"learning_rate": 1.3527386541471049e-06,
"loss": 86.9536,
"step": 1390
},
{
"epoch": 0.39281251649918164,
"grad_norm": 106.1875,
"learning_rate": 1.3496087636932707e-06,
"loss": 88.1949,
"step": 1395
},
{
"epoch": 0.3942204466658454,
"grad_norm": 103.6875,
"learning_rate": 1.3464788732394366e-06,
"loss": 87.7377,
"step": 1400
},
{
"epoch": 0.3956283768325091,
"grad_norm": 104.0,
"learning_rate": 1.3433489827856024e-06,
"loss": 89.4314,
"step": 1405
},
{
"epoch": 0.39703630699917286,
"grad_norm": 103.3125,
"learning_rate": 1.3402190923317683e-06,
"loss": 87.2316,
"step": 1410
},
{
"epoch": 0.3984442371658366,
"grad_norm": 106.5,
"learning_rate": 1.3370892018779344e-06,
"loss": 87.3214,
"step": 1415
},
{
"epoch": 0.3998521673325003,
"grad_norm": 105.0,
"learning_rate": 1.3339593114241e-06,
"loss": 88.2331,
"step": 1420
},
{
"epoch": 0.401260097499164,
"grad_norm": 100.1875,
"learning_rate": 1.3308294209702659e-06,
"loss": 86.9472,
"step": 1425
},
{
"epoch": 0.40266802766582777,
"grad_norm": 106.3125,
"learning_rate": 1.3276995305164318e-06,
"loss": 87.7424,
"step": 1430
},
{
"epoch": 0.4040759578324915,
"grad_norm": 105.25,
"learning_rate": 1.3245696400625978e-06,
"loss": 87.8725,
"step": 1435
},
{
"epoch": 0.40548388799915525,
"grad_norm": 106.1875,
"learning_rate": 1.3214397496087637e-06,
"loss": 86.9491,
"step": 1440
},
{
"epoch": 0.406891818165819,
"grad_norm": 105.5625,
"learning_rate": 1.3183098591549295e-06,
"loss": 88.0539,
"step": 1445
},
{
"epoch": 0.4082997483324827,
"grad_norm": 106.1875,
"learning_rate": 1.3151799687010954e-06,
"loss": 87.0151,
"step": 1450
},
{
"epoch": 0.40970767849914647,
"grad_norm": 104.1875,
"learning_rate": 1.3120500782472613e-06,
"loss": 87.05,
"step": 1455
},
{
"epoch": 0.41111560866581015,
"grad_norm": 107.375,
"learning_rate": 1.3089201877934273e-06,
"loss": 88.9645,
"step": 1460
},
{
"epoch": 0.4125235388324739,
"grad_norm": 104.375,
"learning_rate": 1.3057902973395932e-06,
"loss": 86.4326,
"step": 1465
},
{
"epoch": 0.41393146899913763,
"grad_norm": 106.3125,
"learning_rate": 1.3026604068857589e-06,
"loss": 87.6248,
"step": 1470
},
{
"epoch": 0.41533939916580137,
"grad_norm": 102.0,
"learning_rate": 1.2995305164319247e-06,
"loss": 87.4086,
"step": 1475
},
{
"epoch": 0.4167473293324651,
"grad_norm": 104.375,
"learning_rate": 1.2964006259780906e-06,
"loss": 87.9904,
"step": 1480
},
{
"epoch": 0.41815525949912885,
"grad_norm": 103.5625,
"learning_rate": 1.2932707355242566e-06,
"loss": 88.0592,
"step": 1485
},
{
"epoch": 0.4195631896657926,
"grad_norm": 105.5625,
"learning_rate": 1.2901408450704225e-06,
"loss": 87.4781,
"step": 1490
},
{
"epoch": 0.42097111983245633,
"grad_norm": 104.6875,
"learning_rate": 1.2870109546165884e-06,
"loss": 86.5436,
"step": 1495
},
{
"epoch": 0.42237904999912007,
"grad_norm": 106.75,
"learning_rate": 1.2838810641627542e-06,
"loss": 85.7463,
"step": 1500
},
{
"epoch": 0.42237904999912007,
"eval_loss": 2.723001718521118,
"eval_runtime": 173.3623,
"eval_samples_per_second": 1104.023,
"eval_steps_per_second": 34.506,
"step": 1500
},
{
"epoch": 0.42378698016578376,
"grad_norm": 103.0,
"learning_rate": 1.28075117370892e-06,
"loss": 85.6684,
"step": 1505
},
{
"epoch": 0.4251949103324475,
"grad_norm": 107.6875,
"learning_rate": 1.2776212832550862e-06,
"loss": 87.5719,
"step": 1510
},
{
"epoch": 0.42660284049911124,
"grad_norm": 105.0625,
"learning_rate": 1.274491392801252e-06,
"loss": 87.0592,
"step": 1515
},
{
"epoch": 0.428010770665775,
"grad_norm": 104.1875,
"learning_rate": 1.2713615023474179e-06,
"loss": 86.5884,
"step": 1520
},
{
"epoch": 0.4294187008324387,
"grad_norm": 107.4375,
"learning_rate": 1.2682316118935835e-06,
"loss": 85.2697,
"step": 1525
},
{
"epoch": 0.43082663099910246,
"grad_norm": 106.125,
"learning_rate": 1.2651017214397494e-06,
"loss": 85.8189,
"step": 1530
},
{
"epoch": 0.4322345611657662,
"grad_norm": 107.6875,
"learning_rate": 1.2619718309859155e-06,
"loss": 87.4702,
"step": 1535
},
{
"epoch": 0.43364249133242994,
"grad_norm": 105.3125,
"learning_rate": 1.2588419405320813e-06,
"loss": 87.5384,
"step": 1540
},
{
"epoch": 0.4350504214990936,
"grad_norm": 106.8125,
"learning_rate": 1.2557120500782472e-06,
"loss": 87.188,
"step": 1545
},
{
"epoch": 0.43645835166575736,
"grad_norm": 104.3125,
"learning_rate": 1.252582159624413e-06,
"loss": 86.1398,
"step": 1550
},
{
"epoch": 0.4378662818324211,
"grad_norm": 101.5,
"learning_rate": 1.249452269170579e-06,
"loss": 84.2644,
"step": 1555
},
{
"epoch": 0.43927421199908484,
"grad_norm": 109.1875,
"learning_rate": 1.246322378716745e-06,
"loss": 87.5508,
"step": 1560
},
{
"epoch": 0.4406821421657486,
"grad_norm": 106.5,
"learning_rate": 1.2431924882629109e-06,
"loss": 86.3704,
"step": 1565
},
{
"epoch": 0.4420900723324123,
"grad_norm": 103.5625,
"learning_rate": 1.2400625978090767e-06,
"loss": 85.5718,
"step": 1570
},
{
"epoch": 0.44349800249907606,
"grad_norm": 108.375,
"learning_rate": 1.2369327073552424e-06,
"loss": 86.0767,
"step": 1575
},
{
"epoch": 0.4449059326657398,
"grad_norm": 105.6875,
"learning_rate": 1.2338028169014082e-06,
"loss": 86.0886,
"step": 1580
},
{
"epoch": 0.4463138628324035,
"grad_norm": 105.9375,
"learning_rate": 1.2306729264475743e-06,
"loss": 86.8305,
"step": 1585
},
{
"epoch": 0.4477217929990672,
"grad_norm": 105.125,
"learning_rate": 1.2275430359937402e-06,
"loss": 86.3498,
"step": 1590
},
{
"epoch": 0.44912972316573097,
"grad_norm": 102.1875,
"learning_rate": 1.224413145539906e-06,
"loss": 87.4621,
"step": 1595
},
{
"epoch": 0.4505376533323947,
"grad_norm": 105.3125,
"learning_rate": 1.2212832550860719e-06,
"loss": 86.2455,
"step": 1600
},
{
"epoch": 0.45194558349905845,
"grad_norm": 104.9375,
"learning_rate": 1.2181533646322377e-06,
"loss": 86.032,
"step": 1605
},
{
"epoch": 0.4533535136657222,
"grad_norm": 107.6875,
"learning_rate": 1.2150234741784038e-06,
"loss": 85.0281,
"step": 1610
},
{
"epoch": 0.4547614438323859,
"grad_norm": 104.1875,
"learning_rate": 1.2118935837245697e-06,
"loss": 85.6405,
"step": 1615
},
{
"epoch": 0.45616937399904967,
"grad_norm": 107.625,
"learning_rate": 1.2087636932707355e-06,
"loss": 85.8417,
"step": 1620
},
{
"epoch": 0.4575773041657134,
"grad_norm": 108.1875,
"learning_rate": 1.2056338028169014e-06,
"loss": 85.5851,
"step": 1625
},
{
"epoch": 0.4589852343323771,
"grad_norm": 105.125,
"learning_rate": 1.202503912363067e-06,
"loss": 86.3523,
"step": 1630
},
{
"epoch": 0.46039316449904083,
"grad_norm": 104.0,
"learning_rate": 1.1993740219092331e-06,
"loss": 86.3561,
"step": 1635
},
{
"epoch": 0.46180109466570457,
"grad_norm": 105.0625,
"learning_rate": 1.196244131455399e-06,
"loss": 86.4649,
"step": 1640
},
{
"epoch": 0.4632090248323683,
"grad_norm": 102.375,
"learning_rate": 1.1931142410015648e-06,
"loss": 85.7339,
"step": 1645
},
{
"epoch": 0.46461695499903205,
"grad_norm": 105.9375,
"learning_rate": 1.1899843505477307e-06,
"loss": 85.7039,
"step": 1650
},
{
"epoch": 0.4660248851656958,
"grad_norm": 104.375,
"learning_rate": 1.1868544600938966e-06,
"loss": 86.5029,
"step": 1655
},
{
"epoch": 0.46743281533235953,
"grad_norm": 102.5,
"learning_rate": 1.1837245696400626e-06,
"loss": 85.348,
"step": 1660
},
{
"epoch": 0.46884074549902327,
"grad_norm": 106.875,
"learning_rate": 1.1805946791862285e-06,
"loss": 86.3522,
"step": 1665
},
{
"epoch": 0.47024867566568695,
"grad_norm": 104.5625,
"learning_rate": 1.1774647887323944e-06,
"loss": 87.0737,
"step": 1670
},
{
"epoch": 0.4716566058323507,
"grad_norm": 105.75,
"learning_rate": 1.1743348982785602e-06,
"loss": 85.9809,
"step": 1675
},
{
"epoch": 0.47306453599901443,
"grad_norm": 103.9375,
"learning_rate": 1.171205007824726e-06,
"loss": 86.8843,
"step": 1680
},
{
"epoch": 0.4744724661656782,
"grad_norm": 109.9375,
"learning_rate": 1.168075117370892e-06,
"loss": 85.7657,
"step": 1685
},
{
"epoch": 0.4758803963323419,
"grad_norm": 104.4375,
"learning_rate": 1.1649452269170578e-06,
"loss": 85.6955,
"step": 1690
},
{
"epoch": 0.47728832649900566,
"grad_norm": 104.25,
"learning_rate": 1.1618153364632237e-06,
"loss": 86.2856,
"step": 1695
},
{
"epoch": 0.4786962566656694,
"grad_norm": 107.625,
"learning_rate": 1.1586854460093895e-06,
"loss": 85.6085,
"step": 1700
},
{
"epoch": 0.48010418683233314,
"grad_norm": 106.8125,
"learning_rate": 1.1555555555555554e-06,
"loss": 83.9402,
"step": 1705
},
{
"epoch": 0.4815121169989969,
"grad_norm": 106.625,
"learning_rate": 1.1524256651017215e-06,
"loss": 85.7792,
"step": 1710
},
{
"epoch": 0.48292004716566056,
"grad_norm": 108.9375,
"learning_rate": 1.1492957746478873e-06,
"loss": 86.3912,
"step": 1715
},
{
"epoch": 0.4843279773323243,
"grad_norm": 106.75,
"learning_rate": 1.1461658841940532e-06,
"loss": 85.8531,
"step": 1720
},
{
"epoch": 0.48573590749898804,
"grad_norm": 106.875,
"learning_rate": 1.143035993740219e-06,
"loss": 85.4033,
"step": 1725
},
{
"epoch": 0.4871438376656518,
"grad_norm": 105.4375,
"learning_rate": 1.1399061032863851e-06,
"loss": 85.488,
"step": 1730
},
{
"epoch": 0.4885517678323155,
"grad_norm": 102.875,
"learning_rate": 1.136776212832551e-06,
"loss": 84.5739,
"step": 1735
},
{
"epoch": 0.48995969799897926,
"grad_norm": 109.0,
"learning_rate": 1.1336463223787166e-06,
"loss": 85.1109,
"step": 1740
},
{
"epoch": 0.491367628165643,
"grad_norm": 106.1875,
"learning_rate": 1.1305164319248825e-06,
"loss": 85.3278,
"step": 1745
},
{
"epoch": 0.49277555833230674,
"grad_norm": 107.1875,
"learning_rate": 1.1273865414710483e-06,
"loss": 86.7529,
"step": 1750
},
{
"epoch": 0.4941834884989704,
"grad_norm": 105.0,
"learning_rate": 1.1242566510172144e-06,
"loss": 86.1095,
"step": 1755
},
{
"epoch": 0.49559141866563416,
"grad_norm": 103.5,
"learning_rate": 1.1211267605633803e-06,
"loss": 84.7933,
"step": 1760
},
{
"epoch": 0.4969993488322979,
"grad_norm": 108.4375,
"learning_rate": 1.1179968701095461e-06,
"loss": 84.1568,
"step": 1765
},
{
"epoch": 0.49840727899896164,
"grad_norm": 103.5625,
"learning_rate": 1.114866979655712e-06,
"loss": 84.3743,
"step": 1770
},
{
"epoch": 0.4998152091656254,
"grad_norm": 101.625,
"learning_rate": 1.1117370892018779e-06,
"loss": 84.1492,
"step": 1775
},
{
"epoch": 0.5012231393322891,
"grad_norm": 106.4375,
"learning_rate": 1.108607198748044e-06,
"loss": 85.6935,
"step": 1780
},
{
"epoch": 0.5026310694989529,
"grad_norm": 102.8125,
"learning_rate": 1.1054773082942098e-06,
"loss": 86.224,
"step": 1785
},
{
"epoch": 0.5040389996656166,
"grad_norm": 104.1875,
"learning_rate": 1.1023474178403754e-06,
"loss": 84.8126,
"step": 1790
},
{
"epoch": 0.5054469298322803,
"grad_norm": 105.875,
"learning_rate": 1.0992175273865413e-06,
"loss": 86.0512,
"step": 1795
},
{
"epoch": 0.5068548599989441,
"grad_norm": 105.4375,
"learning_rate": 1.0960876369327072e-06,
"loss": 84.977,
"step": 1800
},
{
"epoch": 0.5082627901656078,
"grad_norm": 101.4375,
"learning_rate": 1.0929577464788732e-06,
"loss": 85.1623,
"step": 1805
},
{
"epoch": 0.5096707203322716,
"grad_norm": 104.125,
"learning_rate": 1.089827856025039e-06,
"loss": 85.0361,
"step": 1810
},
{
"epoch": 0.5110786504989353,
"grad_norm": 105.0,
"learning_rate": 1.086697965571205e-06,
"loss": 84.6887,
"step": 1815
},
{
"epoch": 0.5124865806655989,
"grad_norm": 105.0,
"learning_rate": 1.0835680751173708e-06,
"loss": 84.9917,
"step": 1820
},
{
"epoch": 0.5138945108322627,
"grad_norm": 104.8125,
"learning_rate": 1.0804381846635367e-06,
"loss": 84.4424,
"step": 1825
},
{
"epoch": 0.5153024409989264,
"grad_norm": 106.25,
"learning_rate": 1.0773082942097028e-06,
"loss": 84.7556,
"step": 1830
},
{
"epoch": 0.5167103711655902,
"grad_norm": 105.9375,
"learning_rate": 1.0741784037558686e-06,
"loss": 84.1018,
"step": 1835
},
{
"epoch": 0.5181183013322539,
"grad_norm": 107.25,
"learning_rate": 1.0710485133020345e-06,
"loss": 84.7397,
"step": 1840
},
{
"epoch": 0.5195262314989176,
"grad_norm": 104.0,
"learning_rate": 1.0679186228482001e-06,
"loss": 83.6932,
"step": 1845
},
{
"epoch": 0.5209341616655814,
"grad_norm": 105.8125,
"learning_rate": 1.064788732394366e-06,
"loss": 84.8758,
"step": 1850
},
{
"epoch": 0.5223420918322451,
"grad_norm": 102.5,
"learning_rate": 1.061658841940532e-06,
"loss": 83.5707,
"step": 1855
},
{
"epoch": 0.5237500219989089,
"grad_norm": 105.5625,
"learning_rate": 1.058528951486698e-06,
"loss": 84.6335,
"step": 1860
},
{
"epoch": 0.5251579521655726,
"grad_norm": 105.125,
"learning_rate": 1.0553990610328638e-06,
"loss": 84.691,
"step": 1865
},
{
"epoch": 0.5265658823322363,
"grad_norm": 105.0,
"learning_rate": 1.0522691705790297e-06,
"loss": 84.6201,
"step": 1870
},
{
"epoch": 0.5279738124989001,
"grad_norm": 104.25,
"learning_rate": 1.0491392801251955e-06,
"loss": 83.5126,
"step": 1875
},
{
"epoch": 0.5293817426655638,
"grad_norm": 104.6875,
"learning_rate": 1.0460093896713616e-06,
"loss": 84.5516,
"step": 1880
},
{
"epoch": 0.5307896728322276,
"grad_norm": 107.875,
"learning_rate": 1.0428794992175274e-06,
"loss": 82.609,
"step": 1885
},
{
"epoch": 0.5321976029988913,
"grad_norm": 106.25,
"learning_rate": 1.0397496087636933e-06,
"loss": 83.3716,
"step": 1890
},
{
"epoch": 0.533605533165555,
"grad_norm": 105.0,
"learning_rate": 1.036619718309859e-06,
"loss": 83.7494,
"step": 1895
},
{
"epoch": 0.5350134633322188,
"grad_norm": 103.5,
"learning_rate": 1.0334898278560248e-06,
"loss": 86.2409,
"step": 1900
},
{
"epoch": 0.5364213934988824,
"grad_norm": 106.875,
"learning_rate": 1.0303599374021909e-06,
"loss": 83.7314,
"step": 1905
},
{
"epoch": 0.5378293236655461,
"grad_norm": 101.5625,
"learning_rate": 1.0272300469483568e-06,
"loss": 82.9069,
"step": 1910
},
{
"epoch": 0.5392372538322099,
"grad_norm": 103.1875,
"learning_rate": 1.0241001564945226e-06,
"loss": 84.7595,
"step": 1915
},
{
"epoch": 0.5406451839988736,
"grad_norm": 105.875,
"learning_rate": 1.0209702660406885e-06,
"loss": 84.6069,
"step": 1920
},
{
"epoch": 0.5420531141655374,
"grad_norm": 103.1875,
"learning_rate": 1.0178403755868543e-06,
"loss": 84.9307,
"step": 1925
},
{
"epoch": 0.5434610443322011,
"grad_norm": 104.9375,
"learning_rate": 1.0147104851330204e-06,
"loss": 84.499,
"step": 1930
},
{
"epoch": 0.5448689744988648,
"grad_norm": 103.5625,
"learning_rate": 1.0115805946791863e-06,
"loss": 83.4419,
"step": 1935
},
{
"epoch": 0.5462769046655286,
"grad_norm": 106.875,
"learning_rate": 1.0084507042253521e-06,
"loss": 81.8547,
"step": 1940
},
{
"epoch": 0.5476848348321923,
"grad_norm": 102.5,
"learning_rate": 1.005320813771518e-06,
"loss": 84.1197,
"step": 1945
},
{
"epoch": 0.5490927649988561,
"grad_norm": 106.875,
"learning_rate": 1.0021909233176836e-06,
"loss": 84.6961,
"step": 1950
},
{
"epoch": 0.5505006951655198,
"grad_norm": 102.1875,
"learning_rate": 9.990610328638497e-07,
"loss": 84.4034,
"step": 1955
},
{
"epoch": 0.5519086253321835,
"grad_norm": 102.4375,
"learning_rate": 9.959311424100156e-07,
"loss": 83.8276,
"step": 1960
},
{
"epoch": 0.5533165554988473,
"grad_norm": 107.875,
"learning_rate": 9.928012519561814e-07,
"loss": 83.8557,
"step": 1965
},
{
"epoch": 0.554724485665511,
"grad_norm": 105.625,
"learning_rate": 9.896713615023475e-07,
"loss": 84.7172,
"step": 1970
},
{
"epoch": 0.5561324158321748,
"grad_norm": 107.1875,
"learning_rate": 9.865414710485132e-07,
"loss": 83.6505,
"step": 1975
},
{
"epoch": 0.5575403459988385,
"grad_norm": 104.4375,
"learning_rate": 9.83411580594679e-07,
"loss": 84.1419,
"step": 1980
},
{
"epoch": 0.5589482761655022,
"grad_norm": 107.875,
"learning_rate": 9.80281690140845e-07,
"loss": 83.3483,
"step": 1985
},
{
"epoch": 0.5603562063321659,
"grad_norm": 109.5625,
"learning_rate": 9.77151799687011e-07,
"loss": 83.2388,
"step": 1990
},
{
"epoch": 0.5617641364988296,
"grad_norm": 106.375,
"learning_rate": 9.740219092331768e-07,
"loss": 83.3274,
"step": 1995
},
{
"epoch": 0.5631720666654934,
"grad_norm": 104.375,
"learning_rate": 9.708920187793427e-07,
"loss": 84.6276,
"step": 2000
},
{
"epoch": 0.5631720666654934,
"eval_loss": 2.6152572631835938,
"eval_runtime": 173.0871,
"eval_samples_per_second": 1105.778,
"eval_steps_per_second": 34.561,
"step": 2000
},
{
"epoch": 0.5645799968321571,
"grad_norm": 105.6875,
"learning_rate": 9.677621283255085e-07,
"loss": 83.7494,
"step": 2005
},
{
"epoch": 0.5659879269988208,
"grad_norm": 107.4375,
"learning_rate": 9.646322378716744e-07,
"loss": 84.0542,
"step": 2010
},
{
"epoch": 0.5673958571654846,
"grad_norm": 104.0625,
"learning_rate": 9.615023474178403e-07,
"loss": 84.0271,
"step": 2015
},
{
"epoch": 0.5688037873321483,
"grad_norm": 107.25,
"learning_rate": 9.583724569640063e-07,
"loss": 85.3889,
"step": 2020
},
{
"epoch": 0.570211717498812,
"grad_norm": 110.375,
"learning_rate": 9.552425665101722e-07,
"loss": 82.7557,
"step": 2025
},
{
"epoch": 0.5716196476654758,
"grad_norm": 105.125,
"learning_rate": 9.52112676056338e-07,
"loss": 83.84,
"step": 2030
},
{
"epoch": 0.5730275778321395,
"grad_norm": 104.0625,
"learning_rate": 9.489827856025039e-07,
"loss": 84.4417,
"step": 2035
},
{
"epoch": 0.5744355079988033,
"grad_norm": 108.0,
"learning_rate": 9.458528951486698e-07,
"loss": 82.6491,
"step": 2040
},
{
"epoch": 0.575843438165467,
"grad_norm": 105.625,
"learning_rate": 9.427230046948356e-07,
"loss": 85.0275,
"step": 2045
},
{
"epoch": 0.5772513683321308,
"grad_norm": 105.4375,
"learning_rate": 9.395931142410015e-07,
"loss": 82.4665,
"step": 2050
},
{
"epoch": 0.5786592984987945,
"grad_norm": 107.375,
"learning_rate": 9.364632237871674e-07,
"loss": 83.1962,
"step": 2055
},
{
"epoch": 0.5800672286654582,
"grad_norm": 100.625,
"learning_rate": 9.333333333333333e-07,
"loss": 83.5944,
"step": 2060
},
{
"epoch": 0.581475158832122,
"grad_norm": 106.6875,
"learning_rate": 9.302034428794992e-07,
"loss": 84.8146,
"step": 2065
},
{
"epoch": 0.5828830889987857,
"grad_norm": 105.5,
"learning_rate": 9.27073552425665e-07,
"loss": 82.4536,
"step": 2070
},
{
"epoch": 0.5842910191654493,
"grad_norm": 105.8125,
"learning_rate": 9.239436619718309e-07,
"loss": 82.6324,
"step": 2075
},
{
"epoch": 0.5856989493321131,
"grad_norm": 107.25,
"learning_rate": 9.208137715179968e-07,
"loss": 83.1704,
"step": 2080
},
{
"epoch": 0.5871068794987768,
"grad_norm": 107.625,
"learning_rate": 9.176838810641627e-07,
"loss": 82.6887,
"step": 2085
},
{
"epoch": 0.5885148096654406,
"grad_norm": 102.125,
"learning_rate": 9.145539906103286e-07,
"loss": 82.3661,
"step": 2090
},
{
"epoch": 0.5899227398321043,
"grad_norm": 104.3125,
"learning_rate": 9.114241001564945e-07,
"loss": 82.481,
"step": 2095
},
{
"epoch": 0.591330669998768,
"grad_norm": 105.875,
"learning_rate": 9.082942097026603e-07,
"loss": 82.2819,
"step": 2100
},
{
"epoch": 0.5927386001654318,
"grad_norm": 108.1875,
"learning_rate": 9.051643192488263e-07,
"loss": 83.5644,
"step": 2105
},
{
"epoch": 0.5941465303320955,
"grad_norm": 105.0625,
"learning_rate": 9.020344287949921e-07,
"loss": 83.2546,
"step": 2110
},
{
"epoch": 0.5955544604987593,
"grad_norm": 106.125,
"learning_rate": 8.98904538341158e-07,
"loss": 83.6469,
"step": 2115
},
{
"epoch": 0.596962390665423,
"grad_norm": 106.4375,
"learning_rate": 8.95774647887324e-07,
"loss": 83.8329,
"step": 2120
},
{
"epoch": 0.5983703208320867,
"grad_norm": 104.0625,
"learning_rate": 8.926447574334897e-07,
"loss": 81.423,
"step": 2125
},
{
"epoch": 0.5997782509987505,
"grad_norm": 103.3125,
"learning_rate": 8.895148669796557e-07,
"loss": 82.5989,
"step": 2130
},
{
"epoch": 0.6011861811654142,
"grad_norm": 103.75,
"learning_rate": 8.863849765258216e-07,
"loss": 82.3526,
"step": 2135
},
{
"epoch": 0.602594111332078,
"grad_norm": 109.5625,
"learning_rate": 8.832550860719874e-07,
"loss": 81.5921,
"step": 2140
},
{
"epoch": 0.6040020414987417,
"grad_norm": 105.75,
"learning_rate": 8.801251956181534e-07,
"loss": 81.5713,
"step": 2145
},
{
"epoch": 0.6054099716654054,
"grad_norm": 104.8125,
"learning_rate": 8.769953051643191e-07,
"loss": 82.5239,
"step": 2150
},
{
"epoch": 0.6068179018320692,
"grad_norm": 103.3125,
"learning_rate": 8.738654147104851e-07,
"loss": 81.41,
"step": 2155
},
{
"epoch": 0.6082258319987328,
"grad_norm": 106.75,
"learning_rate": 8.70735524256651e-07,
"loss": 83.2237,
"step": 2160
},
{
"epoch": 0.6096337621653966,
"grad_norm": 102.4375,
"learning_rate": 8.676056338028168e-07,
"loss": 82.1952,
"step": 2165
},
{
"epoch": 0.6110416923320603,
"grad_norm": 104.5,
"learning_rate": 8.644757433489828e-07,
"loss": 81.5301,
"step": 2170
},
{
"epoch": 0.612449622498724,
"grad_norm": 102.5,
"learning_rate": 8.613458528951486e-07,
"loss": 83.0988,
"step": 2175
},
{
"epoch": 0.6138575526653878,
"grad_norm": 106.3125,
"learning_rate": 8.582159624413145e-07,
"loss": 83.5443,
"step": 2180
},
{
"epoch": 0.6152654828320515,
"grad_norm": 104.4375,
"learning_rate": 8.550860719874804e-07,
"loss": 81.3843,
"step": 2185
},
{
"epoch": 0.6166734129987153,
"grad_norm": 103.3125,
"learning_rate": 8.519561815336462e-07,
"loss": 82.8493,
"step": 2190
},
{
"epoch": 0.618081343165379,
"grad_norm": 106.75,
"learning_rate": 8.488262910798122e-07,
"loss": 82.6346,
"step": 2195
},
{
"epoch": 0.6194892733320427,
"grad_norm": 105.4375,
"learning_rate": 8.456964006259781e-07,
"loss": 82.4296,
"step": 2200
},
{
"epoch": 0.6208972034987065,
"grad_norm": 107.9375,
"learning_rate": 8.425665101721439e-07,
"loss": 82.7624,
"step": 2205
},
{
"epoch": 0.6223051336653702,
"grad_norm": 103.9375,
"learning_rate": 8.394366197183098e-07,
"loss": 82.7663,
"step": 2210
},
{
"epoch": 0.623713063832034,
"grad_norm": 104.8125,
"learning_rate": 8.363067292644757e-07,
"loss": 83.0542,
"step": 2215
},
{
"epoch": 0.6251209939986977,
"grad_norm": 105.5625,
"learning_rate": 8.331768388106416e-07,
"loss": 82.92,
"step": 2220
},
{
"epoch": 0.6265289241653614,
"grad_norm": 104.3125,
"learning_rate": 8.300469483568075e-07,
"loss": 81.9954,
"step": 2225
},
{
"epoch": 0.6279368543320252,
"grad_norm": 104.8125,
"learning_rate": 8.269170579029733e-07,
"loss": 82.2349,
"step": 2230
},
{
"epoch": 0.6293447844986889,
"grad_norm": 104.3125,
"learning_rate": 8.237871674491392e-07,
"loss": 81.3002,
"step": 2235
},
{
"epoch": 0.6307527146653527,
"grad_norm": 104.125,
"learning_rate": 8.206572769953052e-07,
"loss": 82.5326,
"step": 2240
},
{
"epoch": 0.6321606448320163,
"grad_norm": 104.625,
"learning_rate": 8.17527386541471e-07,
"loss": 80.9955,
"step": 2245
},
{
"epoch": 0.63356857499868,
"grad_norm": 105.5625,
"learning_rate": 8.143974960876369e-07,
"loss": 82.0781,
"step": 2250
},
{
"epoch": 0.6349765051653438,
"grad_norm": 106.0625,
"learning_rate": 8.112676056338028e-07,
"loss": 81.8139,
"step": 2255
},
{
"epoch": 0.6363844353320075,
"grad_norm": 103.0625,
"learning_rate": 8.081377151799686e-07,
"loss": 81.4989,
"step": 2260
},
{
"epoch": 0.6377923654986712,
"grad_norm": 103.625,
"learning_rate": 8.050078247261346e-07,
"loss": 83.1769,
"step": 2265
},
{
"epoch": 0.639200295665335,
"grad_norm": 107.875,
"learning_rate": 8.018779342723004e-07,
"loss": 81.519,
"step": 2270
},
{
"epoch": 0.6406082258319987,
"grad_norm": 106.75,
"learning_rate": 7.987480438184663e-07,
"loss": 81.4644,
"step": 2275
},
{
"epoch": 0.6420161559986625,
"grad_norm": 106.3125,
"learning_rate": 7.956181533646323e-07,
"loss": 82.4883,
"step": 2280
},
{
"epoch": 0.6434240861653262,
"grad_norm": 105.4375,
"learning_rate": 7.92488262910798e-07,
"loss": 81.4137,
"step": 2285
},
{
"epoch": 0.6448320163319899,
"grad_norm": 106.875,
"learning_rate": 7.89358372456964e-07,
"loss": 82.0379,
"step": 2290
},
{
"epoch": 0.6462399464986537,
"grad_norm": 105.5,
"learning_rate": 7.862284820031299e-07,
"loss": 80.6811,
"step": 2295
},
{
"epoch": 0.6476478766653174,
"grad_norm": 105.3125,
"learning_rate": 7.830985915492957e-07,
"loss": 81.5782,
"step": 2300
},
{
"epoch": 0.6490558068319812,
"grad_norm": 102.125,
"learning_rate": 7.799687010954617e-07,
"loss": 79.9575,
"step": 2305
},
{
"epoch": 0.6504637369986449,
"grad_norm": 103.125,
"learning_rate": 7.768388106416274e-07,
"loss": 80.942,
"step": 2310
},
{
"epoch": 0.6518716671653086,
"grad_norm": 104.8125,
"learning_rate": 7.737089201877934e-07,
"loss": 82.0593,
"step": 2315
},
{
"epoch": 0.6532795973319724,
"grad_norm": 104.0,
"learning_rate": 7.705790297339593e-07,
"loss": 82.9627,
"step": 2320
},
{
"epoch": 0.654687527498636,
"grad_norm": 108.4375,
"learning_rate": 7.674491392801251e-07,
"loss": 81.7538,
"step": 2325
},
{
"epoch": 0.6560954576652998,
"grad_norm": 102.0625,
"learning_rate": 7.643192488262911e-07,
"loss": 80.4473,
"step": 2330
},
{
"epoch": 0.6575033878319635,
"grad_norm": 105.25,
"learning_rate": 7.611893583724569e-07,
"loss": 82.7576,
"step": 2335
},
{
"epoch": 0.6589113179986272,
"grad_norm": 108.0625,
"learning_rate": 7.580594679186228e-07,
"loss": 80.7854,
"step": 2340
},
{
"epoch": 0.660319248165291,
"grad_norm": 105.5,
"learning_rate": 7.549295774647887e-07,
"loss": 80.6502,
"step": 2345
},
{
"epoch": 0.6617271783319547,
"grad_norm": 103.25,
"learning_rate": 7.517996870109545e-07,
"loss": 82.1516,
"step": 2350
},
{
"epoch": 0.6631351084986185,
"grad_norm": 103.75,
"learning_rate": 7.486697965571205e-07,
"loss": 82.5402,
"step": 2355
},
{
"epoch": 0.6645430386652822,
"grad_norm": 105.0,
"learning_rate": 7.455399061032864e-07,
"loss": 80.486,
"step": 2360
},
{
"epoch": 0.6659509688319459,
"grad_norm": 107.875,
"learning_rate": 7.424100156494522e-07,
"loss": 81.896,
"step": 2365
},
{
"epoch": 0.6673588989986097,
"grad_norm": 103.5,
"learning_rate": 7.392801251956181e-07,
"loss": 80.4128,
"step": 2370
},
{
"epoch": 0.6687668291652734,
"grad_norm": 101.8125,
"learning_rate": 7.361502347417841e-07,
"loss": 81.8544,
"step": 2375
},
{
"epoch": 0.6701747593319372,
"grad_norm": 104.875,
"learning_rate": 7.330203442879499e-07,
"loss": 81.6146,
"step": 2380
},
{
"epoch": 0.6715826894986009,
"grad_norm": 105.625,
"learning_rate": 7.298904538341158e-07,
"loss": 81.1391,
"step": 2385
},
{
"epoch": 0.6729906196652646,
"grad_norm": 109.25,
"learning_rate": 7.267605633802816e-07,
"loss": 82.8396,
"step": 2390
},
{
"epoch": 0.6743985498319284,
"grad_norm": 102.625,
"learning_rate": 7.236306729264475e-07,
"loss": 82.0588,
"step": 2395
},
{
"epoch": 0.6758064799985921,
"grad_norm": 105.125,
"learning_rate": 7.205007824726135e-07,
"loss": 80.4687,
"step": 2400
},
{
"epoch": 0.6772144101652559,
"grad_norm": 104.375,
"learning_rate": 7.173708920187793e-07,
"loss": 80.4795,
"step": 2405
},
{
"epoch": 0.6786223403319195,
"grad_norm": 104.8125,
"learning_rate": 7.142410015649452e-07,
"loss": 81.0931,
"step": 2410
},
{
"epoch": 0.6800302704985832,
"grad_norm": 107.25,
"learning_rate": 7.111111111111111e-07,
"loss": 81.6648,
"step": 2415
},
{
"epoch": 0.681438200665247,
"grad_norm": 102.5625,
"learning_rate": 7.079812206572769e-07,
"loss": 81.7432,
"step": 2420
},
{
"epoch": 0.6828461308319107,
"grad_norm": 104.6875,
"learning_rate": 7.048513302034429e-07,
"loss": 81.0647,
"step": 2425
},
{
"epoch": 0.6842540609985744,
"grad_norm": 103.375,
"learning_rate": 7.017214397496087e-07,
"loss": 81.8188,
"step": 2430
},
{
"epoch": 0.6856619911652382,
"grad_norm": 101.8125,
"learning_rate": 6.985915492957746e-07,
"loss": 80.2828,
"step": 2435
},
{
"epoch": 0.6870699213319019,
"grad_norm": 101.625,
"learning_rate": 6.954616588419406e-07,
"loss": 79.2917,
"step": 2440
},
{
"epoch": 0.6884778514985657,
"grad_norm": 101.4375,
"learning_rate": 6.923317683881063e-07,
"loss": 81.5069,
"step": 2445
},
{
"epoch": 0.6898857816652294,
"grad_norm": 106.0625,
"learning_rate": 6.892018779342723e-07,
"loss": 80.9566,
"step": 2450
},
{
"epoch": 0.6912937118318931,
"grad_norm": 103.625,
"learning_rate": 6.860719874804382e-07,
"loss": 80.8435,
"step": 2455
},
{
"epoch": 0.6927016419985569,
"grad_norm": 103.375,
"learning_rate": 6.82942097026604e-07,
"loss": 82.4846,
"step": 2460
},
{
"epoch": 0.6941095721652206,
"grad_norm": 103.4375,
"learning_rate": 6.7981220657277e-07,
"loss": 81.9275,
"step": 2465
},
{
"epoch": 0.6955175023318844,
"grad_norm": 101.1875,
"learning_rate": 6.766823161189357e-07,
"loss": 80.6729,
"step": 2470
},
{
"epoch": 0.6969254324985481,
"grad_norm": 103.3125,
"learning_rate": 6.735524256651017e-07,
"loss": 82.3246,
"step": 2475
},
{
"epoch": 0.6983333626652118,
"grad_norm": 107.1875,
"learning_rate": 6.704225352112676e-07,
"loss": 82.1143,
"step": 2480
},
{
"epoch": 0.6997412928318756,
"grad_norm": 107.6875,
"learning_rate": 6.672926447574334e-07,
"loss": 80.1889,
"step": 2485
},
{
"epoch": 0.7011492229985393,
"grad_norm": 104.5625,
"learning_rate": 6.641627543035994e-07,
"loss": 81.7307,
"step": 2490
},
{
"epoch": 0.702557153165203,
"grad_norm": 102.625,
"learning_rate": 6.610328638497652e-07,
"loss": 81.0958,
"step": 2495
},
{
"epoch": 0.7039650833318667,
"grad_norm": 101.625,
"learning_rate": 6.579029733959311e-07,
"loss": 80.9724,
"step": 2500
},
{
"epoch": 0.7039650833318667,
"eval_loss": 2.5142035484313965,
"eval_runtime": 171.8658,
"eval_samples_per_second": 1113.636,
"eval_steps_per_second": 34.806,
"step": 2500
},
{
"epoch": 0.7053730134985304,
"grad_norm": 105.0625,
"learning_rate": 6.54773082942097e-07,
"loss": 80.2974,
"step": 2505
},
{
"epoch": 0.7067809436651942,
"grad_norm": 103.0,
"learning_rate": 6.516431924882628e-07,
"loss": 81.7355,
"step": 2510
},
{
"epoch": 0.7081888738318579,
"grad_norm": 102.0625,
"learning_rate": 6.485133020344288e-07,
"loss": 80.0072,
"step": 2515
},
{
"epoch": 0.7095968039985217,
"grad_norm": 103.0,
"learning_rate": 6.453834115805947e-07,
"loss": 80.7088,
"step": 2520
},
{
"epoch": 0.7110047341651854,
"grad_norm": 104.375,
"learning_rate": 6.422535211267605e-07,
"loss": 80.8086,
"step": 2525
},
{
"epoch": 0.7124126643318491,
"grad_norm": 106.3125,
"learning_rate": 6.391236306729264e-07,
"loss": 79.6987,
"step": 2530
},
{
"epoch": 0.7138205944985129,
"grad_norm": 104.375,
"learning_rate": 6.359937402190924e-07,
"loss": 81.249,
"step": 2535
},
{
"epoch": 0.7152285246651766,
"grad_norm": 103.25,
"learning_rate": 6.328638497652582e-07,
"loss": 80.7754,
"step": 2540
},
{
"epoch": 0.7166364548318404,
"grad_norm": 103.1875,
"learning_rate": 6.297339593114241e-07,
"loss": 80.2149,
"step": 2545
},
{
"epoch": 0.7180443849985041,
"grad_norm": 105.875,
"learning_rate": 6.266040688575899e-07,
"loss": 80.4064,
"step": 2550
},
{
"epoch": 0.7194523151651678,
"grad_norm": 104.4375,
"learning_rate": 6.234741784037558e-07,
"loss": 80.101,
"step": 2555
},
{
"epoch": 0.7208602453318316,
"grad_norm": 102.0,
"learning_rate": 6.203442879499218e-07,
"loss": 80.2786,
"step": 2560
},
{
"epoch": 0.7222681754984953,
"grad_norm": 103.6875,
"learning_rate": 6.172143974960876e-07,
"loss": 79.634,
"step": 2565
},
{
"epoch": 0.723676105665159,
"grad_norm": 106.1875,
"learning_rate": 6.140845070422535e-07,
"loss": 80.0352,
"step": 2570
},
{
"epoch": 0.7250840358318228,
"grad_norm": 105.375,
"learning_rate": 6.109546165884194e-07,
"loss": 80.5197,
"step": 2575
},
{
"epoch": 0.7264919659984864,
"grad_norm": 102.125,
"learning_rate": 6.078247261345852e-07,
"loss": 81.0426,
"step": 2580
},
{
"epoch": 0.7278998961651502,
"grad_norm": 103.375,
"learning_rate": 6.046948356807512e-07,
"loss": 79.8113,
"step": 2585
},
{
"epoch": 0.7293078263318139,
"grad_norm": 108.0,
"learning_rate": 6.01564945226917e-07,
"loss": 80.2865,
"step": 2590
},
{
"epoch": 0.7307157564984776,
"grad_norm": 106.0,
"learning_rate": 5.984350547730829e-07,
"loss": 79.9696,
"step": 2595
},
{
"epoch": 0.7321236866651414,
"grad_norm": 106.25,
"learning_rate": 5.953051643192489e-07,
"loss": 79.5844,
"step": 2600
},
{
"epoch": 0.7335316168318051,
"grad_norm": 103.8125,
"learning_rate": 5.921752738654146e-07,
"loss": 80.3369,
"step": 2605
},
{
"epoch": 0.7349395469984689,
"grad_norm": 107.6875,
"learning_rate": 5.890453834115806e-07,
"loss": 81.3312,
"step": 2610
},
{
"epoch": 0.7363474771651326,
"grad_norm": 107.1875,
"learning_rate": 5.859154929577465e-07,
"loss": 80.8727,
"step": 2615
},
{
"epoch": 0.7377554073317963,
"grad_norm": 105.5625,
"learning_rate": 5.827856025039123e-07,
"loss": 80.7422,
"step": 2620
},
{
"epoch": 0.7391633374984601,
"grad_norm": 105.1875,
"learning_rate": 5.796557120500783e-07,
"loss": 79.7938,
"step": 2625
},
{
"epoch": 0.7405712676651238,
"grad_norm": 103.875,
"learning_rate": 5.76525821596244e-07,
"loss": 80.3593,
"step": 2630
},
{
"epoch": 0.7419791978317876,
"grad_norm": 107.375,
"learning_rate": 5.7339593114241e-07,
"loss": 80.1003,
"step": 2635
},
{
"epoch": 0.7433871279984513,
"grad_norm": 103.0625,
"learning_rate": 5.702660406885759e-07,
"loss": 78.9579,
"step": 2640
},
{
"epoch": 0.744795058165115,
"grad_norm": 104.8125,
"learning_rate": 5.671361502347417e-07,
"loss": 81.0954,
"step": 2645
},
{
"epoch": 0.7462029883317788,
"grad_norm": 105.4375,
"learning_rate": 5.640062597809077e-07,
"loss": 78.6543,
"step": 2650
},
{
"epoch": 0.7476109184984425,
"grad_norm": 103.0625,
"learning_rate": 5.608763693270734e-07,
"loss": 78.3176,
"step": 2655
},
{
"epoch": 0.7490188486651063,
"grad_norm": 104.5,
"learning_rate": 5.577464788732394e-07,
"loss": 79.9896,
"step": 2660
},
{
"epoch": 0.7504267788317699,
"grad_norm": 105.6875,
"learning_rate": 5.546165884194053e-07,
"loss": 80.574,
"step": 2665
},
{
"epoch": 0.7518347089984336,
"grad_norm": 106.0,
"learning_rate": 5.514866979655712e-07,
"loss": 78.7153,
"step": 2670
},
{
"epoch": 0.7532426391650974,
"grad_norm": 107.4375,
"learning_rate": 5.483568075117371e-07,
"loss": 80.4845,
"step": 2675
},
{
"epoch": 0.7546505693317611,
"grad_norm": 102.8125,
"learning_rate": 5.452269170579029e-07,
"loss": 78.3971,
"step": 2680
},
{
"epoch": 0.7560584994984249,
"grad_norm": 103.5,
"learning_rate": 5.420970266040688e-07,
"loss": 79.7906,
"step": 2685
},
{
"epoch": 0.7574664296650886,
"grad_norm": 105.0,
"learning_rate": 5.389671361502347e-07,
"loss": 78.2005,
"step": 2690
},
{
"epoch": 0.7588743598317523,
"grad_norm": 106.3125,
"learning_rate": 5.358372456964007e-07,
"loss": 79.2892,
"step": 2695
},
{
"epoch": 0.7602822899984161,
"grad_norm": 106.9375,
"learning_rate": 5.327073552425665e-07,
"loss": 78.7832,
"step": 2700
},
{
"epoch": 0.7616902201650798,
"grad_norm": 104.8125,
"learning_rate": 5.295774647887324e-07,
"loss": 79.459,
"step": 2705
},
{
"epoch": 0.7630981503317436,
"grad_norm": 107.25,
"learning_rate": 5.264475743348982e-07,
"loss": 79.9508,
"step": 2710
},
{
"epoch": 0.7645060804984073,
"grad_norm": 105.5,
"learning_rate": 5.233176838810641e-07,
"loss": 80.7908,
"step": 2715
},
{
"epoch": 0.765914010665071,
"grad_norm": 104.9375,
"learning_rate": 5.201877934272301e-07,
"loss": 77.9633,
"step": 2720
},
{
"epoch": 0.7673219408317348,
"grad_norm": 109.0625,
"learning_rate": 5.170579029733959e-07,
"loss": 79.519,
"step": 2725
},
{
"epoch": 0.7687298709983985,
"grad_norm": 101.75,
"learning_rate": 5.139280125195618e-07,
"loss": 77.4878,
"step": 2730
},
{
"epoch": 0.7701378011650623,
"grad_norm": 105.0625,
"learning_rate": 5.107981220657277e-07,
"loss": 80.2978,
"step": 2735
},
{
"epoch": 0.771545731331726,
"grad_norm": 105.1875,
"learning_rate": 5.076682316118935e-07,
"loss": 78.4916,
"step": 2740
},
{
"epoch": 0.7729536614983897,
"grad_norm": 104.25,
"learning_rate": 5.045383411580595e-07,
"loss": 79.4729,
"step": 2745
},
{
"epoch": 0.7743615916650534,
"grad_norm": 103.25,
"learning_rate": 5.014084507042253e-07,
"loss": 80.6059,
"step": 2750
},
{
"epoch": 0.7757695218317171,
"grad_norm": 104.25,
"learning_rate": 4.982785602503912e-07,
"loss": 78.5902,
"step": 2755
},
{
"epoch": 0.7771774519983808,
"grad_norm": 105.0625,
"learning_rate": 4.951486697965572e-07,
"loss": 79.6788,
"step": 2760
},
{
"epoch": 0.7785853821650446,
"grad_norm": 106.5,
"learning_rate": 4.920187793427229e-07,
"loss": 79.9395,
"step": 2765
},
{
"epoch": 0.7799933123317083,
"grad_norm": 103.125,
"learning_rate": 4.888888888888889e-07,
"loss": 80.7752,
"step": 2770
},
{
"epoch": 0.7814012424983721,
"grad_norm": 105.3125,
"learning_rate": 4.857589984350548e-07,
"loss": 81.2435,
"step": 2775
},
{
"epoch": 0.7828091726650358,
"grad_norm": 106.3125,
"learning_rate": 4.826291079812206e-07,
"loss": 79.5725,
"step": 2780
},
{
"epoch": 0.7842171028316995,
"grad_norm": 103.8125,
"learning_rate": 4.794992175273866e-07,
"loss": 79.1346,
"step": 2785
},
{
"epoch": 0.7856250329983633,
"grad_norm": 104.375,
"learning_rate": 4.7636932707355244e-07,
"loss": 80.1,
"step": 2790
},
{
"epoch": 0.787032963165027,
"grad_norm": 105.0625,
"learning_rate": 4.7323943661971825e-07,
"loss": 80.1356,
"step": 2795
},
{
"epoch": 0.7884408933316908,
"grad_norm": 107.3125,
"learning_rate": 4.7010954616588416e-07,
"loss": 79.9081,
"step": 2800
},
{
"epoch": 0.7898488234983545,
"grad_norm": 102.3125,
"learning_rate": 4.669796557120501e-07,
"loss": 80.3247,
"step": 2805
},
{
"epoch": 0.7912567536650182,
"grad_norm": 103.75,
"learning_rate": 4.6384976525821594e-07,
"loss": 77.7689,
"step": 2810
},
{
"epoch": 0.792664683831682,
"grad_norm": 100.8125,
"learning_rate": 4.6071987480438185e-07,
"loss": 78.127,
"step": 2815
},
{
"epoch": 0.7940726139983457,
"grad_norm": 104.5625,
"learning_rate": 4.5758998435054766e-07,
"loss": 80.0082,
"step": 2820
},
{
"epoch": 0.7954805441650095,
"grad_norm": 105.125,
"learning_rate": 4.544600938967136e-07,
"loss": 79.2828,
"step": 2825
},
{
"epoch": 0.7968884743316732,
"grad_norm": 104.0625,
"learning_rate": 4.513302034428795e-07,
"loss": 79.3832,
"step": 2830
},
{
"epoch": 0.7982964044983368,
"grad_norm": 104.5625,
"learning_rate": 4.4820031298904535e-07,
"loss": 79.5533,
"step": 2835
},
{
"epoch": 0.7997043346650006,
"grad_norm": 102.3125,
"learning_rate": 4.4507042253521126e-07,
"loss": 78.6774,
"step": 2840
},
{
"epoch": 0.8011122648316643,
"grad_norm": 104.875,
"learning_rate": 4.419405320813771e-07,
"loss": 80.5638,
"step": 2845
},
{
"epoch": 0.802520194998328,
"grad_norm": 102.0625,
"learning_rate": 4.38810641627543e-07,
"loss": 78.2395,
"step": 2850
},
{
"epoch": 0.8039281251649918,
"grad_norm": 107.125,
"learning_rate": 4.356807511737089e-07,
"loss": 79.105,
"step": 2855
},
{
"epoch": 0.8053360553316555,
"grad_norm": 102.875,
"learning_rate": 4.325508607198748e-07,
"loss": 78.7725,
"step": 2860
},
{
"epoch": 0.8067439854983193,
"grad_norm": 105.625,
"learning_rate": 4.294209702660407e-07,
"loss": 80.5747,
"step": 2865
},
{
"epoch": 0.808151915664983,
"grad_norm": 100.6875,
"learning_rate": 4.262910798122066e-07,
"loss": 79.2395,
"step": 2870
},
{
"epoch": 0.8095598458316468,
"grad_norm": 104.5625,
"learning_rate": 4.231611893583724e-07,
"loss": 78.6901,
"step": 2875
},
{
"epoch": 0.8109677759983105,
"grad_norm": 107.5625,
"learning_rate": 4.200312989045383e-07,
"loss": 78.2269,
"step": 2880
},
{
"epoch": 0.8123757061649742,
"grad_norm": 105.6875,
"learning_rate": 4.169014084507042e-07,
"loss": 78.163,
"step": 2885
},
{
"epoch": 0.813783636331638,
"grad_norm": 104.75,
"learning_rate": 4.137715179968701e-07,
"loss": 79.7651,
"step": 2890
},
{
"epoch": 0.8151915664983017,
"grad_norm": 104.5,
"learning_rate": 4.10641627543036e-07,
"loss": 78.9615,
"step": 2895
},
{
"epoch": 0.8165994966649655,
"grad_norm": 104.0,
"learning_rate": 4.075117370892018e-07,
"loss": 79.6789,
"step": 2900
},
{
"epoch": 0.8180074268316292,
"grad_norm": 103.25,
"learning_rate": 4.043818466353677e-07,
"loss": 77.2641,
"step": 2905
},
{
"epoch": 0.8194153569982929,
"grad_norm": 107.25,
"learning_rate": 4.0125195618153364e-07,
"loss": 77.7458,
"step": 2910
},
{
"epoch": 0.8208232871649567,
"grad_norm": 101.1875,
"learning_rate": 3.981220657276995e-07,
"loss": 78.3837,
"step": 2915
},
{
"epoch": 0.8222312173316203,
"grad_norm": 103.5,
"learning_rate": 3.949921752738654e-07,
"loss": 78.8562,
"step": 2920
},
{
"epoch": 0.823639147498284,
"grad_norm": 106.0625,
"learning_rate": 3.9186228482003133e-07,
"loss": 78.7467,
"step": 2925
},
{
"epoch": 0.8250470776649478,
"grad_norm": 102.875,
"learning_rate": 3.8873239436619713e-07,
"loss": 78.072,
"step": 2930
},
{
"epoch": 0.8264550078316115,
"grad_norm": 105.5,
"learning_rate": 3.8560250391236305e-07,
"loss": 78.5434,
"step": 2935
},
{
"epoch": 0.8278629379982753,
"grad_norm": 104.5,
"learning_rate": 3.824726134585289e-07,
"loss": 77.0472,
"step": 2940
},
{
"epoch": 0.829270868164939,
"grad_norm": 105.3125,
"learning_rate": 3.793427230046948e-07,
"loss": 77.9887,
"step": 2945
},
{
"epoch": 0.8306787983316027,
"grad_norm": 103.125,
"learning_rate": 3.7621283255086074e-07,
"loss": 79.2707,
"step": 2950
},
{
"epoch": 0.8320867284982665,
"grad_norm": 104.0,
"learning_rate": 3.7308294209702655e-07,
"loss": 77.7834,
"step": 2955
},
{
"epoch": 0.8334946586649302,
"grad_norm": 100.4375,
"learning_rate": 3.6995305164319246e-07,
"loss": 79.0078,
"step": 2960
},
{
"epoch": 0.834902588831594,
"grad_norm": 104.5625,
"learning_rate": 3.668231611893584e-07,
"loss": 79.1376,
"step": 2965
},
{
"epoch": 0.8363105189982577,
"grad_norm": 104.5,
"learning_rate": 3.6369327073552424e-07,
"loss": 79.726,
"step": 2970
},
{
"epoch": 0.8377184491649214,
"grad_norm": 100.1875,
"learning_rate": 3.6056338028169015e-07,
"loss": 78.9119,
"step": 2975
},
{
"epoch": 0.8391263793315852,
"grad_norm": 101.875,
"learning_rate": 3.5743348982785596e-07,
"loss": 78.8435,
"step": 2980
},
{
"epoch": 0.8405343094982489,
"grad_norm": 104.0,
"learning_rate": 3.5430359937402187e-07,
"loss": 80.1266,
"step": 2985
},
{
"epoch": 0.8419422396649127,
"grad_norm": 107.0625,
"learning_rate": 3.511737089201878e-07,
"loss": 78.4947,
"step": 2990
},
{
"epoch": 0.8433501698315764,
"grad_norm": 101.25,
"learning_rate": 3.4804381846635365e-07,
"loss": 78.0414,
"step": 2995
},
{
"epoch": 0.8447580999982401,
"grad_norm": 104.4375,
"learning_rate": 3.4491392801251956e-07,
"loss": 78.6488,
"step": 3000
},
{
"epoch": 0.8447580999982401,
"eval_loss": 2.4493813514709473,
"eval_runtime": 171.8577,
"eval_samples_per_second": 1113.689,
"eval_steps_per_second": 34.808,
"step": 3000
},
{
"epoch": 0.8461660301649038,
"grad_norm": 107.375,
"learning_rate": 3.417840375586855e-07,
"loss": 79.4929,
"step": 3005
},
{
"epoch": 0.8475739603315675,
"grad_norm": 100.75,
"learning_rate": 3.386541471048513e-07,
"loss": 78.5898,
"step": 3010
},
{
"epoch": 0.8489818904982313,
"grad_norm": 104.0,
"learning_rate": 3.355242566510172e-07,
"loss": 79.1758,
"step": 3015
},
{
"epoch": 0.850389820664895,
"grad_norm": 103.25,
"learning_rate": 3.3239436619718306e-07,
"loss": 78.1413,
"step": 3020
},
{
"epoch": 0.8517977508315587,
"grad_norm": 102.875,
"learning_rate": 3.2926447574334897e-07,
"loss": 78.1522,
"step": 3025
},
{
"epoch": 0.8532056809982225,
"grad_norm": 103.25,
"learning_rate": 3.261345852895149e-07,
"loss": 77.6261,
"step": 3030
},
{
"epoch": 0.8546136111648862,
"grad_norm": 103.3125,
"learning_rate": 3.230046948356807e-07,
"loss": 78.7022,
"step": 3035
},
{
"epoch": 0.85602154133155,
"grad_norm": 101.5,
"learning_rate": 3.198748043818466e-07,
"loss": 77.1196,
"step": 3040
},
{
"epoch": 0.8574294714982137,
"grad_norm": 106.3125,
"learning_rate": 3.167449139280125e-07,
"loss": 79.1569,
"step": 3045
},
{
"epoch": 0.8588374016648774,
"grad_norm": 108.5,
"learning_rate": 3.136150234741784e-07,
"loss": 77.5236,
"step": 3050
},
{
"epoch": 0.8602453318315412,
"grad_norm": 103.3125,
"learning_rate": 3.104851330203443e-07,
"loss": 78.7436,
"step": 3055
},
{
"epoch": 0.8616532619982049,
"grad_norm": 103.875,
"learning_rate": 3.073552425665101e-07,
"loss": 76.0241,
"step": 3060
},
{
"epoch": 0.8630611921648687,
"grad_norm": 103.375,
"learning_rate": 3.04225352112676e-07,
"loss": 77.1607,
"step": 3065
},
{
"epoch": 0.8644691223315324,
"grad_norm": 106.6875,
"learning_rate": 3.0109546165884194e-07,
"loss": 78.4861,
"step": 3070
},
{
"epoch": 0.8658770524981961,
"grad_norm": 102.875,
"learning_rate": 2.979655712050078e-07,
"loss": 76.9453,
"step": 3075
},
{
"epoch": 0.8672849826648599,
"grad_norm": 104.6875,
"learning_rate": 2.948356807511737e-07,
"loss": 78.0346,
"step": 3080
},
{
"epoch": 0.8686929128315236,
"grad_norm": 104.9375,
"learning_rate": 2.917057902973396e-07,
"loss": 78.9506,
"step": 3085
},
{
"epoch": 0.8701008429981872,
"grad_norm": 108.0625,
"learning_rate": 2.8857589984350543e-07,
"loss": 76.9891,
"step": 3090
},
{
"epoch": 0.871508773164851,
"grad_norm": 101.1875,
"learning_rate": 2.8544600938967135e-07,
"loss": 78.0246,
"step": 3095
},
{
"epoch": 0.8729167033315147,
"grad_norm": 106.3125,
"learning_rate": 2.823161189358372e-07,
"loss": 76.2545,
"step": 3100
},
{
"epoch": 0.8743246334981785,
"grad_norm": 104.75,
"learning_rate": 2.791862284820031e-07,
"loss": 76.3179,
"step": 3105
},
{
"epoch": 0.8757325636648422,
"grad_norm": 105.4375,
"learning_rate": 2.7605633802816904e-07,
"loss": 78.4135,
"step": 3110
},
{
"epoch": 0.8771404938315059,
"grad_norm": 104.25,
"learning_rate": 2.7292644757433484e-07,
"loss": 78.5533,
"step": 3115
},
{
"epoch": 0.8785484239981697,
"grad_norm": 104.5625,
"learning_rate": 2.6979655712050076e-07,
"loss": 77.3735,
"step": 3120
},
{
"epoch": 0.8799563541648334,
"grad_norm": 104.0,
"learning_rate": 2.6666666666666667e-07,
"loss": 77.2561,
"step": 3125
},
{
"epoch": 0.8813642843314972,
"grad_norm": 103.375,
"learning_rate": 2.6353677621283253e-07,
"loss": 76.7062,
"step": 3130
},
{
"epoch": 0.8827722144981609,
"grad_norm": 104.8125,
"learning_rate": 2.6040688575899845e-07,
"loss": 76.776,
"step": 3135
},
{
"epoch": 0.8841801446648246,
"grad_norm": 102.75,
"learning_rate": 2.572769953051643e-07,
"loss": 78.8089,
"step": 3140
},
{
"epoch": 0.8855880748314884,
"grad_norm": 103.8125,
"learning_rate": 2.5414710485133017e-07,
"loss": 77.2237,
"step": 3145
},
{
"epoch": 0.8869960049981521,
"grad_norm": 102.4375,
"learning_rate": 2.510172143974961e-07,
"loss": 77.3127,
"step": 3150
},
{
"epoch": 0.8884039351648159,
"grad_norm": 107.375,
"learning_rate": 2.4788732394366194e-07,
"loss": 77.6217,
"step": 3155
},
{
"epoch": 0.8898118653314796,
"grad_norm": 101.6875,
"learning_rate": 2.4475743348982786e-07,
"loss": 77.8592,
"step": 3160
},
{
"epoch": 0.8912197954981433,
"grad_norm": 102.375,
"learning_rate": 2.416275430359937e-07,
"loss": 78.3207,
"step": 3165
},
{
"epoch": 0.892627725664807,
"grad_norm": 106.375,
"learning_rate": 2.3849765258215963e-07,
"loss": 79.2276,
"step": 3170
},
{
"epoch": 0.8940356558314707,
"grad_norm": 104.1875,
"learning_rate": 2.353677621283255e-07,
"loss": 76.6556,
"step": 3175
},
{
"epoch": 0.8954435859981345,
"grad_norm": 104.4375,
"learning_rate": 2.3223787167449138e-07,
"loss": 77.2663,
"step": 3180
},
{
"epoch": 0.8968515161647982,
"grad_norm": 104.6875,
"learning_rate": 2.2910798122065727e-07,
"loss": 77.5001,
"step": 3185
},
{
"epoch": 0.8982594463314619,
"grad_norm": 106.125,
"learning_rate": 2.2597809076682313e-07,
"loss": 78.1235,
"step": 3190
},
{
"epoch": 0.8996673764981257,
"grad_norm": 103.4375,
"learning_rate": 2.2284820031298905e-07,
"loss": 76.6002,
"step": 3195
},
{
"epoch": 0.9010753066647894,
"grad_norm": 102.8125,
"learning_rate": 2.1971830985915493e-07,
"loss": 76.1001,
"step": 3200
},
{
"epoch": 0.9024832368314532,
"grad_norm": 103.875,
"learning_rate": 2.165884194053208e-07,
"loss": 76.9923,
"step": 3205
},
{
"epoch": 0.9038911669981169,
"grad_norm": 101.75,
"learning_rate": 2.1345852895148668e-07,
"loss": 76.7819,
"step": 3210
},
{
"epoch": 0.9052990971647806,
"grad_norm": 105.375,
"learning_rate": 2.1032863849765257e-07,
"loss": 78.2517,
"step": 3215
},
{
"epoch": 0.9067070273314444,
"grad_norm": 106.0,
"learning_rate": 2.0719874804381846e-07,
"loss": 76.2478,
"step": 3220
},
{
"epoch": 0.9081149574981081,
"grad_norm": 106.5625,
"learning_rate": 2.0406885758998434e-07,
"loss": 78.1819,
"step": 3225
},
{
"epoch": 0.9095228876647719,
"grad_norm": 100.125,
"learning_rate": 2.009389671361502e-07,
"loss": 76.3853,
"step": 3230
},
{
"epoch": 0.9109308178314356,
"grad_norm": 104.125,
"learning_rate": 1.9780907668231612e-07,
"loss": 77.7426,
"step": 3235
},
{
"epoch": 0.9123387479980993,
"grad_norm": 107.3125,
"learning_rate": 1.94679186228482e-07,
"loss": 76.3708,
"step": 3240
},
{
"epoch": 0.9137466781647631,
"grad_norm": 102.125,
"learning_rate": 1.9154929577464787e-07,
"loss": 77.9827,
"step": 3245
},
{
"epoch": 0.9151546083314268,
"grad_norm": 107.625,
"learning_rate": 1.8841940532081376e-07,
"loss": 75.9863,
"step": 3250
},
{
"epoch": 0.9165625384980904,
"grad_norm": 106.5625,
"learning_rate": 1.8528951486697964e-07,
"loss": 76.5452,
"step": 3255
},
{
"epoch": 0.9179704686647542,
"grad_norm": 104.4375,
"learning_rate": 1.8215962441314553e-07,
"loss": 75.8746,
"step": 3260
},
{
"epoch": 0.9193783988314179,
"grad_norm": 103.5,
"learning_rate": 1.7902973395931142e-07,
"loss": 76.9438,
"step": 3265
},
{
"epoch": 0.9207863289980817,
"grad_norm": 105.9375,
"learning_rate": 1.7589984350547728e-07,
"loss": 78.6087,
"step": 3270
},
{
"epoch": 0.9221942591647454,
"grad_norm": 103.75,
"learning_rate": 1.727699530516432e-07,
"loss": 77.5682,
"step": 3275
},
{
"epoch": 0.9236021893314091,
"grad_norm": 100.875,
"learning_rate": 1.6964006259780908e-07,
"loss": 77.3968,
"step": 3280
},
{
"epoch": 0.9250101194980729,
"grad_norm": 101.6875,
"learning_rate": 1.6651017214397494e-07,
"loss": 78.751,
"step": 3285
},
{
"epoch": 0.9264180496647366,
"grad_norm": 102.4375,
"learning_rate": 1.6338028169014083e-07,
"loss": 77.4331,
"step": 3290
},
{
"epoch": 0.9278259798314004,
"grad_norm": 100.0,
"learning_rate": 1.6025039123630672e-07,
"loss": 76.5302,
"step": 3295
},
{
"epoch": 0.9292339099980641,
"grad_norm": 104.375,
"learning_rate": 1.571205007824726e-07,
"loss": 77.845,
"step": 3300
},
{
"epoch": 0.9306418401647278,
"grad_norm": 100.875,
"learning_rate": 1.539906103286385e-07,
"loss": 76.8356,
"step": 3305
},
{
"epoch": 0.9320497703313916,
"grad_norm": 105.0625,
"learning_rate": 1.5086071987480435e-07,
"loss": 77.4631,
"step": 3310
},
{
"epoch": 0.9334577004980553,
"grad_norm": 103.4375,
"learning_rate": 1.4773082942097027e-07,
"loss": 77.3199,
"step": 3315
},
{
"epoch": 0.9348656306647191,
"grad_norm": 101.375,
"learning_rate": 1.4460093896713616e-07,
"loss": 77.5975,
"step": 3320
},
{
"epoch": 0.9362735608313828,
"grad_norm": 103.5,
"learning_rate": 1.4147104851330202e-07,
"loss": 75.7769,
"step": 3325
},
{
"epoch": 0.9376814909980465,
"grad_norm": 100.1875,
"learning_rate": 1.383411580594679e-07,
"loss": 78.1456,
"step": 3330
},
{
"epoch": 0.9390894211647103,
"grad_norm": 101.9375,
"learning_rate": 1.352112676056338e-07,
"loss": 76.1149,
"step": 3335
},
{
"epoch": 0.9404973513313739,
"grad_norm": 102.8125,
"learning_rate": 1.3208137715179968e-07,
"loss": 77.1507,
"step": 3340
},
{
"epoch": 0.9419052814980376,
"grad_norm": 107.0,
"learning_rate": 1.2895148669796557e-07,
"loss": 77.3375,
"step": 3345
},
{
"epoch": 0.9433132116647014,
"grad_norm": 105.8125,
"learning_rate": 1.2582159624413143e-07,
"loss": 75.3474,
"step": 3350
},
{
"epoch": 0.9447211418313651,
"grad_norm": 106.625,
"learning_rate": 1.2269170579029734e-07,
"loss": 77.822,
"step": 3355
},
{
"epoch": 0.9461290719980289,
"grad_norm": 104.5,
"learning_rate": 1.195618153364632e-07,
"loss": 76.8075,
"step": 3360
},
{
"epoch": 0.9475370021646926,
"grad_norm": 105.75,
"learning_rate": 1.164319248826291e-07,
"loss": 76.809,
"step": 3365
},
{
"epoch": 0.9489449323313564,
"grad_norm": 102.9375,
"learning_rate": 1.1330203442879499e-07,
"loss": 76.7047,
"step": 3370
},
{
"epoch": 0.9503528624980201,
"grad_norm": 106.8125,
"learning_rate": 1.1017214397496087e-07,
"loss": 77.414,
"step": 3375
},
{
"epoch": 0.9517607926646838,
"grad_norm": 103.125,
"learning_rate": 1.0704225352112675e-07,
"loss": 76.7584,
"step": 3380
},
{
"epoch": 0.9531687228313476,
"grad_norm": 105.25,
"learning_rate": 1.0391236306729264e-07,
"loss": 77.223,
"step": 3385
},
{
"epoch": 0.9545766529980113,
"grad_norm": 106.75,
"learning_rate": 1.0078247261345853e-07,
"loss": 76.3612,
"step": 3390
},
{
"epoch": 0.955984583164675,
"grad_norm": 106.0625,
"learning_rate": 9.76525821596244e-08,
"loss": 76.3587,
"step": 3395
},
{
"epoch": 0.9573925133313388,
"grad_norm": 102.9375,
"learning_rate": 9.452269170579029e-08,
"loss": 77.3654,
"step": 3400
},
{
"epoch": 0.9588004434980025,
"grad_norm": 102.75,
"learning_rate": 9.139280125195618e-08,
"loss": 76.7485,
"step": 3405
},
{
"epoch": 0.9602083736646663,
"grad_norm": 104.875,
"learning_rate": 8.826291079812207e-08,
"loss": 75.8477,
"step": 3410
},
{
"epoch": 0.96161630383133,
"grad_norm": 103.8125,
"learning_rate": 8.513302034428794e-08,
"loss": 77.4901,
"step": 3415
},
{
"epoch": 0.9630242339979938,
"grad_norm": 106.9375,
"learning_rate": 8.200312989045383e-08,
"loss": 75.7378,
"step": 3420
},
{
"epoch": 0.9644321641646574,
"grad_norm": 104.375,
"learning_rate": 7.887323943661972e-08,
"loss": 76.1005,
"step": 3425
},
{
"epoch": 0.9658400943313211,
"grad_norm": 97.9375,
"learning_rate": 7.57433489827856e-08,
"loss": 75.469,
"step": 3430
},
{
"epoch": 0.9672480244979849,
"grad_norm": 104.0,
"learning_rate": 7.261345852895148e-08,
"loss": 76.6004,
"step": 3435
},
{
"epoch": 0.9686559546646486,
"grad_norm": 102.8125,
"learning_rate": 6.948356807511737e-08,
"loss": 74.9522,
"step": 3440
},
{
"epoch": 0.9700638848313123,
"grad_norm": 103.5,
"learning_rate": 6.635367762128325e-08,
"loss": 77.0921,
"step": 3445
},
{
"epoch": 0.9714718149979761,
"grad_norm": 106.625,
"learning_rate": 6.322378716744914e-08,
"loss": 77.7678,
"step": 3450
},
{
"epoch": 0.9728797451646398,
"grad_norm": 103.0,
"learning_rate": 6.009389671361502e-08,
"loss": 76.3409,
"step": 3455
},
{
"epoch": 0.9742876753313036,
"grad_norm": 104.625,
"learning_rate": 5.6964006259780904e-08,
"loss": 77.9755,
"step": 3460
},
{
"epoch": 0.9756956054979673,
"grad_norm": 105.625,
"learning_rate": 5.3834115805946785e-08,
"loss": 77.3649,
"step": 3465
},
{
"epoch": 0.977103535664631,
"grad_norm": 101.8125,
"learning_rate": 5.070422535211267e-08,
"loss": 75.9296,
"step": 3470
},
{
"epoch": 0.9785114658312948,
"grad_norm": 102.125,
"learning_rate": 4.7574334898278553e-08,
"loss": 75.9884,
"step": 3475
},
{
"epoch": 0.9799193959979585,
"grad_norm": 108.1875,
"learning_rate": 4.444444444444444e-08,
"loss": 76.8274,
"step": 3480
},
{
"epoch": 0.9813273261646223,
"grad_norm": 99.5,
"learning_rate": 4.131455399061032e-08,
"loss": 76.25,
"step": 3485
},
{
"epoch": 0.982735256331286,
"grad_norm": 106.1875,
"learning_rate": 3.818466353677621e-08,
"loss": 77.0828,
"step": 3490
},
{
"epoch": 0.9841431864979497,
"grad_norm": 103.5625,
"learning_rate": 3.505477308294209e-08,
"loss": 77.2998,
"step": 3495
},
{
"epoch": 0.9855511166646135,
"grad_norm": 101.0625,
"learning_rate": 3.192488262910798e-08,
"loss": 76.5932,
"step": 3500
},
{
"epoch": 0.9855511166646135,
"eval_loss": 2.395787239074707,
"eval_runtime": 173.5088,
"eval_samples_per_second": 1103.091,
"eval_steps_per_second": 34.477,
"step": 3500
}
],
"logging_steps": 5,
"max_steps": 3551,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5163252974760755e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}