web_rag_ablation2_48_LR_1e-5 / trainer_state.json
Rubywong123's picture
Upload folder using huggingface_hub
e78d5b8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9994000299985002,
"eval_steps": 500,
"global_step": 4444,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00044997750112494374,
"grad_norm": 4.529098245180452,
"learning_rate": 2.2471910112359554e-08,
"loss": 2.198,
"step": 1
},
{
"epoch": 0.0022498875056247186,
"grad_norm": 4.089353425911041,
"learning_rate": 1.1235955056179776e-07,
"loss": 2.1345,
"step": 5
},
{
"epoch": 0.004499775011249437,
"grad_norm": 3.871692712739612,
"learning_rate": 2.247191011235955e-07,
"loss": 2.1219,
"step": 10
},
{
"epoch": 0.006749662516874156,
"grad_norm": 3.48717161147707,
"learning_rate": 3.3707865168539325e-07,
"loss": 2.1269,
"step": 15
},
{
"epoch": 0.008999550022498875,
"grad_norm": 2.757793504494978,
"learning_rate": 4.49438202247191e-07,
"loss": 1.9716,
"step": 20
},
{
"epoch": 0.011249437528123594,
"grad_norm": 1.888825489292899,
"learning_rate": 5.617977528089888e-07,
"loss": 1.8304,
"step": 25
},
{
"epoch": 0.013499325033748313,
"grad_norm": 1.775898351624132,
"learning_rate": 6.741573033707865e-07,
"loss": 1.5734,
"step": 30
},
{
"epoch": 0.01574921253937303,
"grad_norm": 1.3767992538309424,
"learning_rate": 7.865168539325843e-07,
"loss": 1.3449,
"step": 35
},
{
"epoch": 0.01799910004499775,
"grad_norm": 0.640753938304731,
"learning_rate": 8.98876404494382e-07,
"loss": 1.033,
"step": 40
},
{
"epoch": 0.020248987550622467,
"grad_norm": 0.5235162453528913,
"learning_rate": 1.01123595505618e-06,
"loss": 0.8949,
"step": 45
},
{
"epoch": 0.02249887505624719,
"grad_norm": 0.5107836396937878,
"learning_rate": 1.1235955056179777e-06,
"loss": 0.8114,
"step": 50
},
{
"epoch": 0.024748762561871907,
"grad_norm": 0.4706639289305382,
"learning_rate": 1.2359550561797752e-06,
"loss": 0.7441,
"step": 55
},
{
"epoch": 0.026998650067496625,
"grad_norm": 0.40589783510545635,
"learning_rate": 1.348314606741573e-06,
"loss": 0.69,
"step": 60
},
{
"epoch": 0.029248537573121344,
"grad_norm": 0.35422872028585745,
"learning_rate": 1.4606741573033708e-06,
"loss": 0.6444,
"step": 65
},
{
"epoch": 0.03149842507874606,
"grad_norm": 0.29411265606808407,
"learning_rate": 1.5730337078651686e-06,
"loss": 0.6228,
"step": 70
},
{
"epoch": 0.03374831258437078,
"grad_norm": 0.26693653586148297,
"learning_rate": 1.6853932584269663e-06,
"loss": 0.5582,
"step": 75
},
{
"epoch": 0.0359982000899955,
"grad_norm": 0.2681533501424792,
"learning_rate": 1.797752808988764e-06,
"loss": 0.556,
"step": 80
},
{
"epoch": 0.03824808759562022,
"grad_norm": 0.25571380427813706,
"learning_rate": 1.910112359550562e-06,
"loss": 0.5195,
"step": 85
},
{
"epoch": 0.040497975101244935,
"grad_norm": 0.2711200571248154,
"learning_rate": 2.02247191011236e-06,
"loss": 0.5143,
"step": 90
},
{
"epoch": 0.042747862606869656,
"grad_norm": 0.24950121635592964,
"learning_rate": 2.1348314606741574e-06,
"loss": 0.494,
"step": 95
},
{
"epoch": 0.04499775011249438,
"grad_norm": 0.27150452317695684,
"learning_rate": 2.2471910112359554e-06,
"loss": 0.4789,
"step": 100
},
{
"epoch": 0.04724763761811909,
"grad_norm": 0.28474082113260213,
"learning_rate": 2.359550561797753e-06,
"loss": 0.4491,
"step": 105
},
{
"epoch": 0.049497525123743814,
"grad_norm": 0.3015594402280247,
"learning_rate": 2.4719101123595505e-06,
"loss": 0.4403,
"step": 110
},
{
"epoch": 0.05174741262936853,
"grad_norm": 0.30222867324963326,
"learning_rate": 2.584269662921349e-06,
"loss": 0.4484,
"step": 115
},
{
"epoch": 0.05399730013499325,
"grad_norm": 0.273012018841968,
"learning_rate": 2.696629213483146e-06,
"loss": 0.4261,
"step": 120
},
{
"epoch": 0.05624718764061797,
"grad_norm": 0.26965584672677667,
"learning_rate": 2.8089887640449444e-06,
"loss": 0.4322,
"step": 125
},
{
"epoch": 0.05849707514624269,
"grad_norm": 0.26746073049580904,
"learning_rate": 2.9213483146067416e-06,
"loss": 0.4639,
"step": 130
},
{
"epoch": 0.06074696265186741,
"grad_norm": 0.23633644420426006,
"learning_rate": 3.03370786516854e-06,
"loss": 0.4599,
"step": 135
},
{
"epoch": 0.06299685015749212,
"grad_norm": 0.23444605189207685,
"learning_rate": 3.146067415730337e-06,
"loss": 0.3872,
"step": 140
},
{
"epoch": 0.06524673766311684,
"grad_norm": 0.2498458749808252,
"learning_rate": 3.258426966292135e-06,
"loss": 0.4033,
"step": 145
},
{
"epoch": 0.06749662516874157,
"grad_norm": 0.20527616283239267,
"learning_rate": 3.3707865168539327e-06,
"loss": 0.4029,
"step": 150
},
{
"epoch": 0.06974651267436628,
"grad_norm": 0.1929264923191392,
"learning_rate": 3.4831460674157306e-06,
"loss": 0.3972,
"step": 155
},
{
"epoch": 0.071996400179991,
"grad_norm": 0.2053416646605216,
"learning_rate": 3.595505617977528e-06,
"loss": 0.4048,
"step": 160
},
{
"epoch": 0.07424628768561573,
"grad_norm": 0.21703322812039422,
"learning_rate": 3.707865168539326e-06,
"loss": 0.4141,
"step": 165
},
{
"epoch": 0.07649617519124044,
"grad_norm": 0.1891382535207117,
"learning_rate": 3.820224719101124e-06,
"loss": 0.4136,
"step": 170
},
{
"epoch": 0.07874606269686515,
"grad_norm": 0.147554774836894,
"learning_rate": 3.932584269662922e-06,
"loss": 0.375,
"step": 175
},
{
"epoch": 0.08099595020248987,
"grad_norm": 0.17905729415958477,
"learning_rate": 4.04494382022472e-06,
"loss": 0.4017,
"step": 180
},
{
"epoch": 0.0832458377081146,
"grad_norm": 0.1505640721404514,
"learning_rate": 4.157303370786518e-06,
"loss": 0.3805,
"step": 185
},
{
"epoch": 0.08549572521373931,
"grad_norm": 0.1583375307295741,
"learning_rate": 4.269662921348315e-06,
"loss": 0.3782,
"step": 190
},
{
"epoch": 0.08774561271936403,
"grad_norm": 0.18866183228882213,
"learning_rate": 4.382022471910113e-06,
"loss": 0.4021,
"step": 195
},
{
"epoch": 0.08999550022498876,
"grad_norm": 0.15247202386740694,
"learning_rate": 4.494382022471911e-06,
"loss": 0.3601,
"step": 200
},
{
"epoch": 0.09224538773061347,
"grad_norm": 0.15750805465065473,
"learning_rate": 4.606741573033709e-06,
"loss": 0.3642,
"step": 205
},
{
"epoch": 0.09449527523623819,
"grad_norm": 0.18964645355569257,
"learning_rate": 4.719101123595506e-06,
"loss": 0.381,
"step": 210
},
{
"epoch": 0.0967451627418629,
"grad_norm": 0.21688318162459963,
"learning_rate": 4.831460674157304e-06,
"loss": 0.3936,
"step": 215
},
{
"epoch": 0.09899505024748763,
"grad_norm": 0.1626408315890611,
"learning_rate": 4.943820224719101e-06,
"loss": 0.3836,
"step": 220
},
{
"epoch": 0.10124493775311234,
"grad_norm": 0.15324258338477442,
"learning_rate": 5.0561797752809e-06,
"loss": 0.3772,
"step": 225
},
{
"epoch": 0.10349482525873706,
"grad_norm": 0.14183725887256435,
"learning_rate": 5.168539325842698e-06,
"loss": 0.372,
"step": 230
},
{
"epoch": 0.10574471276436179,
"grad_norm": 0.16218789948449905,
"learning_rate": 5.280898876404494e-06,
"loss": 0.3571,
"step": 235
},
{
"epoch": 0.1079946002699865,
"grad_norm": 0.1542954767008243,
"learning_rate": 5.393258426966292e-06,
"loss": 0.3773,
"step": 240
},
{
"epoch": 0.11024448777561122,
"grad_norm": 0.1480650113941815,
"learning_rate": 5.50561797752809e-06,
"loss": 0.3679,
"step": 245
},
{
"epoch": 0.11249437528123594,
"grad_norm": 0.14762694498356746,
"learning_rate": 5.617977528089889e-06,
"loss": 0.3766,
"step": 250
},
{
"epoch": 0.11474426278686066,
"grad_norm": 0.13168710830003866,
"learning_rate": 5.730337078651685e-06,
"loss": 0.3563,
"step": 255
},
{
"epoch": 0.11699415029248537,
"grad_norm": 0.12994674295453762,
"learning_rate": 5.842696629213483e-06,
"loss": 0.3835,
"step": 260
},
{
"epoch": 0.11924403779811009,
"grad_norm": 0.15322453794019458,
"learning_rate": 5.955056179775281e-06,
"loss": 0.3523,
"step": 265
},
{
"epoch": 0.12149392530373482,
"grad_norm": 0.14048350395524586,
"learning_rate": 6.06741573033708e-06,
"loss": 0.3667,
"step": 270
},
{
"epoch": 0.12374381280935953,
"grad_norm": 0.12596877364547304,
"learning_rate": 6.179775280898876e-06,
"loss": 0.3675,
"step": 275
},
{
"epoch": 0.12599370031498425,
"grad_norm": 0.14354283317022593,
"learning_rate": 6.292134831460674e-06,
"loss": 0.4005,
"step": 280
},
{
"epoch": 0.12824358782060896,
"grad_norm": 0.11630361105168144,
"learning_rate": 6.404494382022472e-06,
"loss": 0.3747,
"step": 285
},
{
"epoch": 0.13049347532623368,
"grad_norm": 0.13228040085041334,
"learning_rate": 6.51685393258427e-06,
"loss": 0.4077,
"step": 290
},
{
"epoch": 0.13274336283185842,
"grad_norm": 0.14738366545686615,
"learning_rate": 6.629213483146067e-06,
"loss": 0.3998,
"step": 295
},
{
"epoch": 0.13499325033748313,
"grad_norm": 0.12791384531800185,
"learning_rate": 6.741573033707865e-06,
"loss": 0.3964,
"step": 300
},
{
"epoch": 0.13724313784310785,
"grad_norm": 0.14292143339244684,
"learning_rate": 6.853932584269663e-06,
"loss": 0.3665,
"step": 305
},
{
"epoch": 0.13949302534873256,
"grad_norm": 0.13259028515993362,
"learning_rate": 6.966292134831461e-06,
"loss": 0.3648,
"step": 310
},
{
"epoch": 0.14174291285435728,
"grad_norm": 0.1292871978326336,
"learning_rate": 7.078651685393258e-06,
"loss": 0.3729,
"step": 315
},
{
"epoch": 0.143992800359982,
"grad_norm": 0.11157572176169035,
"learning_rate": 7.191011235955056e-06,
"loss": 0.3746,
"step": 320
},
{
"epoch": 0.1462426878656067,
"grad_norm": 0.13354661592196232,
"learning_rate": 7.303370786516854e-06,
"loss": 0.3808,
"step": 325
},
{
"epoch": 0.14849257537123145,
"grad_norm": 0.1306015247259671,
"learning_rate": 7.415730337078652e-06,
"loss": 0.3581,
"step": 330
},
{
"epoch": 0.15074246287685616,
"grad_norm": 0.14070851299418752,
"learning_rate": 7.5280898876404495e-06,
"loss": 0.3687,
"step": 335
},
{
"epoch": 0.15299235038248088,
"grad_norm": 0.12379734686286958,
"learning_rate": 7.640449438202247e-06,
"loss": 0.3663,
"step": 340
},
{
"epoch": 0.1552422378881056,
"grad_norm": 0.10751947723272416,
"learning_rate": 7.752808988764046e-06,
"loss": 0.3852,
"step": 345
},
{
"epoch": 0.1574921253937303,
"grad_norm": 0.1540237761883997,
"learning_rate": 7.865168539325843e-06,
"loss": 0.3717,
"step": 350
},
{
"epoch": 0.15974201289935502,
"grad_norm": 0.11180120270561197,
"learning_rate": 7.97752808988764e-06,
"loss": 0.3489,
"step": 355
},
{
"epoch": 0.16199190040497974,
"grad_norm": 0.12094951392614195,
"learning_rate": 8.08988764044944e-06,
"loss": 0.3732,
"step": 360
},
{
"epoch": 0.16424178791060448,
"grad_norm": 0.09738867753978302,
"learning_rate": 8.202247191011237e-06,
"loss": 0.377,
"step": 365
},
{
"epoch": 0.1664916754162292,
"grad_norm": 0.19496877057584272,
"learning_rate": 8.314606741573035e-06,
"loss": 0.3804,
"step": 370
},
{
"epoch": 0.1687415629218539,
"grad_norm": 0.13099392604853372,
"learning_rate": 8.426966292134832e-06,
"loss": 0.3853,
"step": 375
},
{
"epoch": 0.17099145042747863,
"grad_norm": 0.0929654538179481,
"learning_rate": 8.53932584269663e-06,
"loss": 0.3532,
"step": 380
},
{
"epoch": 0.17324133793310334,
"grad_norm": 0.12166228073135968,
"learning_rate": 8.651685393258428e-06,
"loss": 0.3629,
"step": 385
},
{
"epoch": 0.17549122543872805,
"grad_norm": 0.10261562058829776,
"learning_rate": 8.764044943820226e-06,
"loss": 0.3662,
"step": 390
},
{
"epoch": 0.17774111294435277,
"grad_norm": 0.11229652632991165,
"learning_rate": 8.876404494382023e-06,
"loss": 0.366,
"step": 395
},
{
"epoch": 0.1799910004499775,
"grad_norm": 0.11438088125544643,
"learning_rate": 8.988764044943822e-06,
"loss": 0.3766,
"step": 400
},
{
"epoch": 0.18224088795560223,
"grad_norm": 0.11100159715996023,
"learning_rate": 9.101123595505619e-06,
"loss": 0.3149,
"step": 405
},
{
"epoch": 0.18449077546122694,
"grad_norm": 0.09215304432977561,
"learning_rate": 9.213483146067417e-06,
"loss": 0.3505,
"step": 410
},
{
"epoch": 0.18674066296685166,
"grad_norm": 0.11027407366862503,
"learning_rate": 9.325842696629213e-06,
"loss": 0.3239,
"step": 415
},
{
"epoch": 0.18899055047247637,
"grad_norm": 0.10591097871093506,
"learning_rate": 9.438202247191012e-06,
"loss": 0.3763,
"step": 420
},
{
"epoch": 0.19124043797810109,
"grad_norm": 0.12014034899424982,
"learning_rate": 9.55056179775281e-06,
"loss": 0.387,
"step": 425
},
{
"epoch": 0.1934903254837258,
"grad_norm": 0.09470189013776839,
"learning_rate": 9.662921348314608e-06,
"loss": 0.379,
"step": 430
},
{
"epoch": 0.19574021298935054,
"grad_norm": 0.09501092189291689,
"learning_rate": 9.775280898876405e-06,
"loss": 0.3642,
"step": 435
},
{
"epoch": 0.19799010049497526,
"grad_norm": 0.11021236791362751,
"learning_rate": 9.887640449438202e-06,
"loss": 0.353,
"step": 440
},
{
"epoch": 0.20023998800059997,
"grad_norm": 0.10184537613196046,
"learning_rate": 1e-05,
"loss": 0.365,
"step": 445
},
{
"epoch": 0.2024898755062247,
"grad_norm": 0.08309643827704467,
"learning_rate": 9.999961427623602e-06,
"loss": 0.3575,
"step": 450
},
{
"epoch": 0.2047397630118494,
"grad_norm": 0.10037722707892065,
"learning_rate": 9.999845711089533e-06,
"loss": 0.3471,
"step": 455
},
{
"epoch": 0.20698965051747412,
"grad_norm": 0.11715533210340626,
"learning_rate": 9.999652852183184e-06,
"loss": 0.3714,
"step": 460
},
{
"epoch": 0.20923953802309886,
"grad_norm": 0.13060134335041548,
"learning_rate": 9.99938285388016e-06,
"loss": 0.3635,
"step": 465
},
{
"epoch": 0.21148942552872357,
"grad_norm": 0.08745200641347264,
"learning_rate": 9.999035720346254e-06,
"loss": 0.3571,
"step": 470
},
{
"epoch": 0.2137393130343483,
"grad_norm": 0.08966989980740278,
"learning_rate": 9.998611456937373e-06,
"loss": 0.3639,
"step": 475
},
{
"epoch": 0.215989200539973,
"grad_norm": 0.09956344072812717,
"learning_rate": 9.998110070199454e-06,
"loss": 0.3665,
"step": 480
},
{
"epoch": 0.21823908804559772,
"grad_norm": 0.09656500042174225,
"learning_rate": 9.997531567868367e-06,
"loss": 0.3726,
"step": 485
},
{
"epoch": 0.22048897555122243,
"grad_norm": 0.11853509152316483,
"learning_rate": 9.996875958869803e-06,
"loss": 0.3518,
"step": 490
},
{
"epoch": 0.22273886305684715,
"grad_norm": 0.10059970891055807,
"learning_rate": 9.996143253319113e-06,
"loss": 0.3624,
"step": 495
},
{
"epoch": 0.2249887505624719,
"grad_norm": 0.10793939615766127,
"learning_rate": 9.995333462521178e-06,
"loss": 0.3654,
"step": 500
},
{
"epoch": 0.2272386380680966,
"grad_norm": 0.10450237535025396,
"learning_rate": 9.99444659897022e-06,
"loss": 0.3663,
"step": 505
},
{
"epoch": 0.22948852557372132,
"grad_norm": 0.09806526685924745,
"learning_rate": 9.993482676349612e-06,
"loss": 0.342,
"step": 510
},
{
"epoch": 0.23173841307934603,
"grad_norm": 0.0964872964969597,
"learning_rate": 9.992441709531671e-06,
"loss": 0.3705,
"step": 515
},
{
"epoch": 0.23398830058497075,
"grad_norm": 0.09412389153440821,
"learning_rate": 9.991323714577421e-06,
"loss": 0.3541,
"step": 520
},
{
"epoch": 0.23623818809059546,
"grad_norm": 0.08953556655443609,
"learning_rate": 9.99012870873635e-06,
"loss": 0.3521,
"step": 525
},
{
"epoch": 0.23848807559622018,
"grad_norm": 0.08695345930804899,
"learning_rate": 9.988856710446143e-06,
"loss": 0.3505,
"step": 530
},
{
"epoch": 0.24073796310184492,
"grad_norm": 0.09543515501943514,
"learning_rate": 9.987507739332401e-06,
"loss": 0.3766,
"step": 535
},
{
"epoch": 0.24298785060746964,
"grad_norm": 0.08926010106293578,
"learning_rate": 9.986081816208333e-06,
"loss": 0.329,
"step": 540
},
{
"epoch": 0.24523773811309435,
"grad_norm": 0.08598056512962657,
"learning_rate": 9.984578963074436e-06,
"loss": 0.3617,
"step": 545
},
{
"epoch": 0.24748762561871906,
"grad_norm": 0.08952811194064599,
"learning_rate": 9.982999203118153e-06,
"loss": 0.3383,
"step": 550
},
{
"epoch": 0.24973751312434378,
"grad_norm": 0.12658781787185433,
"learning_rate": 9.981342560713528e-06,
"loss": 0.3238,
"step": 555
},
{
"epoch": 0.2519874006299685,
"grad_norm": 0.09353150867243243,
"learning_rate": 9.979609061420812e-06,
"loss": 0.3545,
"step": 560
},
{
"epoch": 0.2542372881355932,
"grad_norm": 0.09177651257435882,
"learning_rate": 9.977798731986079e-06,
"loss": 0.3502,
"step": 565
},
{
"epoch": 0.2564871756412179,
"grad_norm": 0.09932686671141468,
"learning_rate": 9.975911600340814e-06,
"loss": 0.3468,
"step": 570
},
{
"epoch": 0.25873706314684264,
"grad_norm": 0.08192603238892632,
"learning_rate": 9.973947695601477e-06,
"loss": 0.3324,
"step": 575
},
{
"epoch": 0.26098695065246735,
"grad_norm": 0.07493334234921131,
"learning_rate": 9.971907048069058e-06,
"loss": 0.3795,
"step": 580
},
{
"epoch": 0.26323683815809207,
"grad_norm": 0.08138918761115761,
"learning_rate": 9.969789689228606e-06,
"loss": 0.3385,
"step": 585
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.08838566706763232,
"learning_rate": 9.967595651748745e-06,
"loss": 0.369,
"step": 590
},
{
"epoch": 0.26773661316934155,
"grad_norm": 0.08784958596018687,
"learning_rate": 9.965324969481172e-06,
"loss": 0.3169,
"step": 595
},
{
"epoch": 0.26998650067496627,
"grad_norm": 0.0896168468240925,
"learning_rate": 9.962977677460132e-06,
"loss": 0.3572,
"step": 600
},
{
"epoch": 0.272236388180591,
"grad_norm": 0.0946662419061461,
"learning_rate": 9.960553811901879e-06,
"loss": 0.385,
"step": 605
},
{
"epoch": 0.2744862756862157,
"grad_norm": 0.12115400639084788,
"learning_rate": 9.95805341020411e-06,
"loss": 0.3595,
"step": 610
},
{
"epoch": 0.2767361631918404,
"grad_norm": 0.0997799833296398,
"learning_rate": 9.955476510945401e-06,
"loss": 0.3317,
"step": 615
},
{
"epoch": 0.2789860506974651,
"grad_norm": 0.0996130660835657,
"learning_rate": 9.952823153884606e-06,
"loss": 0.3449,
"step": 620
},
{
"epoch": 0.28123593820308984,
"grad_norm": 0.07835665128694007,
"learning_rate": 9.950093379960238e-06,
"loss": 0.3397,
"step": 625
},
{
"epoch": 0.28348582570871456,
"grad_norm": 0.09391607163130151,
"learning_rate": 9.947287231289844e-06,
"loss": 0.3776,
"step": 630
},
{
"epoch": 0.28573571321433927,
"grad_norm": 0.09176019196191011,
"learning_rate": 9.944404751169353e-06,
"loss": 0.3722,
"step": 635
},
{
"epoch": 0.287985600719964,
"grad_norm": 0.08406446635852077,
"learning_rate": 9.941445984072408e-06,
"loss": 0.338,
"step": 640
},
{
"epoch": 0.2902354882255887,
"grad_norm": 0.08291987107240674,
"learning_rate": 9.938410975649681e-06,
"loss": 0.3742,
"step": 645
},
{
"epoch": 0.2924853757312134,
"grad_norm": 0.08633205314263653,
"learning_rate": 9.935299772728166e-06,
"loss": 0.3611,
"step": 650
},
{
"epoch": 0.2947352632368382,
"grad_norm": 0.07331502093091126,
"learning_rate": 9.93211242331046e-06,
"loss": 0.3344,
"step": 655
},
{
"epoch": 0.2969851507424629,
"grad_norm": 0.08385804833550349,
"learning_rate": 9.92884897657402e-06,
"loss": 0.3557,
"step": 660
},
{
"epoch": 0.2992350382480876,
"grad_norm": 0.07435080817645906,
"learning_rate": 9.925509482870403e-06,
"loss": 0.3405,
"step": 665
},
{
"epoch": 0.30148492575371233,
"grad_norm": 0.08168530188324026,
"learning_rate": 9.922093993724492e-06,
"loss": 0.3426,
"step": 670
},
{
"epoch": 0.30373481325933704,
"grad_norm": 0.08035820133808234,
"learning_rate": 9.918602561833702e-06,
"loss": 0.3604,
"step": 675
},
{
"epoch": 0.30598470076496176,
"grad_norm": 0.08271219072869937,
"learning_rate": 9.91503524106716e-06,
"loss": 0.348,
"step": 680
},
{
"epoch": 0.3082345882705865,
"grad_norm": 0.08921123968472987,
"learning_rate": 9.911392086464886e-06,
"loss": 0.3441,
"step": 685
},
{
"epoch": 0.3104844757762112,
"grad_norm": 0.0839985353132867,
"learning_rate": 9.907673154236929e-06,
"loss": 0.3574,
"step": 690
},
{
"epoch": 0.3127343632818359,
"grad_norm": 0.10386954688768853,
"learning_rate": 9.903878501762511e-06,
"loss": 0.3286,
"step": 695
},
{
"epoch": 0.3149842507874606,
"grad_norm": 0.08778681814263677,
"learning_rate": 9.900008187589138e-06,
"loss": 0.3268,
"step": 700
},
{
"epoch": 0.31723413829308533,
"grad_norm": 0.09027807015137441,
"learning_rate": 9.896062271431697e-06,
"loss": 0.3392,
"step": 705
},
{
"epoch": 0.31948402579871005,
"grad_norm": 0.09567803807106381,
"learning_rate": 9.89204081417153e-06,
"loss": 0.3539,
"step": 710
},
{
"epoch": 0.32173391330433476,
"grad_norm": 0.08574167684815145,
"learning_rate": 9.887943877855505e-06,
"loss": 0.3377,
"step": 715
},
{
"epoch": 0.3239838008099595,
"grad_norm": 0.09260863383057749,
"learning_rate": 9.883771525695052e-06,
"loss": 0.3449,
"step": 720
},
{
"epoch": 0.32623368831558425,
"grad_norm": 0.08495447140601177,
"learning_rate": 9.879523822065181e-06,
"loss": 0.3219,
"step": 725
},
{
"epoch": 0.32848357582120896,
"grad_norm": 0.07533141152453762,
"learning_rate": 9.875200832503505e-06,
"loss": 0.3568,
"step": 730
},
{
"epoch": 0.3307334633268337,
"grad_norm": 0.12247315370054979,
"learning_rate": 9.870802623709215e-06,
"loss": 0.3596,
"step": 735
},
{
"epoch": 0.3329833508324584,
"grad_norm": 0.08738003894579985,
"learning_rate": 9.866329263542055e-06,
"loss": 0.3638,
"step": 740
},
{
"epoch": 0.3352332383380831,
"grad_norm": 0.08338816245916761,
"learning_rate": 9.861780821021282e-06,
"loss": 0.3561,
"step": 745
},
{
"epoch": 0.3374831258437078,
"grad_norm": 0.08236575366096931,
"learning_rate": 9.857157366324587e-06,
"loss": 0.3332,
"step": 750
},
{
"epoch": 0.33973301334933254,
"grad_norm": 0.06944440484574142,
"learning_rate": 9.852458970787027e-06,
"loss": 0.357,
"step": 755
},
{
"epoch": 0.34198290085495725,
"grad_norm": 0.07253573063652108,
"learning_rate": 9.847685706899913e-06,
"loss": 0.3245,
"step": 760
},
{
"epoch": 0.34423278836058196,
"grad_norm": 0.07122505571988245,
"learning_rate": 9.842837648309698e-06,
"loss": 0.3528,
"step": 765
},
{
"epoch": 0.3464826758662067,
"grad_norm": 0.07767571294888054,
"learning_rate": 9.837914869816835e-06,
"loss": 0.3395,
"step": 770
},
{
"epoch": 0.3487325633718314,
"grad_norm": 0.07346709829835463,
"learning_rate": 9.832917447374637e-06,
"loss": 0.3648,
"step": 775
},
{
"epoch": 0.3509824508774561,
"grad_norm": 0.08947827115200468,
"learning_rate": 9.827845458088082e-06,
"loss": 0.3521,
"step": 780
},
{
"epoch": 0.3532323383830808,
"grad_norm": 0.06534661518603589,
"learning_rate": 9.822698980212643e-06,
"loss": 0.3366,
"step": 785
},
{
"epoch": 0.35548222588870554,
"grad_norm": 0.08012548677368805,
"learning_rate": 9.817478093153074e-06,
"loss": 0.3752,
"step": 790
},
{
"epoch": 0.3577321133943303,
"grad_norm": 0.08284074693974608,
"learning_rate": 9.812182877462182e-06,
"loss": 0.3337,
"step": 795
},
{
"epoch": 0.359982000899955,
"grad_norm": 0.0962415375604297,
"learning_rate": 9.806813414839588e-06,
"loss": 0.3489,
"step": 800
},
{
"epoch": 0.36223188840557974,
"grad_norm": 0.08299125800356132,
"learning_rate": 9.801369788130468e-06,
"loss": 0.3466,
"step": 805
},
{
"epoch": 0.36448177591120445,
"grad_norm": 0.07196876922608039,
"learning_rate": 9.795852081324266e-06,
"loss": 0.3424,
"step": 810
},
{
"epoch": 0.36673166341682917,
"grad_norm": 0.07251364573785335,
"learning_rate": 9.79026037955341e-06,
"loss": 0.3578,
"step": 815
},
{
"epoch": 0.3689815509224539,
"grad_norm": 0.07779557103393991,
"learning_rate": 9.784594769091989e-06,
"loss": 0.3616,
"step": 820
},
{
"epoch": 0.3712314384280786,
"grad_norm": 0.07434071832631806,
"learning_rate": 9.778855337354426e-06,
"loss": 0.3572,
"step": 825
},
{
"epoch": 0.3734813259337033,
"grad_norm": 0.0761276852235193,
"learning_rate": 9.77304217289413e-06,
"loss": 0.3147,
"step": 830
},
{
"epoch": 0.375731213439328,
"grad_norm": 0.08043122828466166,
"learning_rate": 9.76715536540213e-06,
"loss": 0.377,
"step": 835
},
{
"epoch": 0.37798110094495274,
"grad_norm": 0.07418765173136689,
"learning_rate": 9.761195005705685e-06,
"loss": 0.3198,
"step": 840
},
{
"epoch": 0.38023098845057746,
"grad_norm": 0.08536316659010101,
"learning_rate": 9.755161185766891e-06,
"loss": 0.3324,
"step": 845
},
{
"epoch": 0.38248087595620217,
"grad_norm": 0.07947600210593922,
"learning_rate": 9.74905399868126e-06,
"loss": 0.3618,
"step": 850
},
{
"epoch": 0.3847307634618269,
"grad_norm": 0.0744113590241544,
"learning_rate": 9.742873538676274e-06,
"loss": 0.3402,
"step": 855
},
{
"epoch": 0.3869806509674516,
"grad_norm": 0.06709744864423575,
"learning_rate": 9.73661990110995e-06,
"loss": 0.3337,
"step": 860
},
{
"epoch": 0.38923053847307637,
"grad_norm": 0.07721094678627155,
"learning_rate": 9.73029318246935e-06,
"loss": 0.3473,
"step": 865
},
{
"epoch": 0.3914804259787011,
"grad_norm": 0.07436794628188735,
"learning_rate": 9.723893480369106e-06,
"loss": 0.3227,
"step": 870
},
{
"epoch": 0.3937303134843258,
"grad_norm": 0.08184087425329187,
"learning_rate": 9.717420893549902e-06,
"loss": 0.3271,
"step": 875
},
{
"epoch": 0.3959802009899505,
"grad_norm": 0.07323707936362174,
"learning_rate": 9.71087552187696e-06,
"loss": 0.3353,
"step": 880
},
{
"epoch": 0.39823008849557523,
"grad_norm": 0.07407984516514123,
"learning_rate": 9.7042574663385e-06,
"loss": 0.3405,
"step": 885
},
{
"epoch": 0.40047997600119994,
"grad_norm": 0.06705082859053621,
"learning_rate": 9.697566829044172e-06,
"loss": 0.3335,
"step": 890
},
{
"epoch": 0.40272986350682466,
"grad_norm": 0.06417105200135667,
"learning_rate": 9.690803713223485e-06,
"loss": 0.3632,
"step": 895
},
{
"epoch": 0.4049797510124494,
"grad_norm": 0.07661580482483403,
"learning_rate": 9.68396822322422e-06,
"loss": 0.341,
"step": 900
},
{
"epoch": 0.4072296385180741,
"grad_norm": 0.07783982481846635,
"learning_rate": 9.677060464510817e-06,
"loss": 0.3422,
"step": 905
},
{
"epoch": 0.4094795260236988,
"grad_norm": 0.07934781483289755,
"learning_rate": 9.670080543662742e-06,
"loss": 0.344,
"step": 910
},
{
"epoch": 0.4117294135293235,
"grad_norm": 0.07206722738626223,
"learning_rate": 9.663028568372845e-06,
"loss": 0.3563,
"step": 915
},
{
"epoch": 0.41397930103494823,
"grad_norm": 0.06767347411319052,
"learning_rate": 9.655904647445711e-06,
"loss": 0.3231,
"step": 920
},
{
"epoch": 0.41622918854057295,
"grad_norm": 0.07180782228261029,
"learning_rate": 9.64870889079596e-06,
"loss": 0.3287,
"step": 925
},
{
"epoch": 0.4184790760461977,
"grad_norm": 0.07242610923174227,
"learning_rate": 9.641441409446563e-06,
"loss": 0.3487,
"step": 930
},
{
"epoch": 0.42072896355182243,
"grad_norm": 0.06832390188318747,
"learning_rate": 9.634102315527136e-06,
"loss": 0.325,
"step": 935
},
{
"epoch": 0.42297885105744715,
"grad_norm": 0.07856703769371849,
"learning_rate": 9.626691722272193e-06,
"loss": 0.3458,
"step": 940
},
{
"epoch": 0.42522873856307186,
"grad_norm": 0.0663937348509602,
"learning_rate": 9.61920974401941e-06,
"loss": 0.3513,
"step": 945
},
{
"epoch": 0.4274786260686966,
"grad_norm": 0.07114607462059036,
"learning_rate": 9.611656496207861e-06,
"loss": 0.3474,
"step": 950
},
{
"epoch": 0.4297285135743213,
"grad_norm": 0.07603014864007235,
"learning_rate": 9.604032095376234e-06,
"loss": 0.3362,
"step": 955
},
{
"epoch": 0.431978401079946,
"grad_norm": 0.0734531353849079,
"learning_rate": 9.596336659161031e-06,
"loss": 0.3445,
"step": 960
},
{
"epoch": 0.4342282885855707,
"grad_norm": 0.061596458285852376,
"learning_rate": 9.588570306294759e-06,
"loss": 0.3453,
"step": 965
},
{
"epoch": 0.43647817609119544,
"grad_norm": 0.05885162798568731,
"learning_rate": 9.58073315660409e-06,
"loss": 0.3439,
"step": 970
},
{
"epoch": 0.43872806359682015,
"grad_norm": 0.07082727968014366,
"learning_rate": 9.57282533100802e-06,
"loss": 0.3395,
"step": 975
},
{
"epoch": 0.44097795110244487,
"grad_norm": 0.07316435404238263,
"learning_rate": 9.564846951515997e-06,
"loss": 0.3304,
"step": 980
},
{
"epoch": 0.4432278386080696,
"grad_norm": 0.07444841963108913,
"learning_rate": 9.55679814122605e-06,
"loss": 0.3298,
"step": 985
},
{
"epoch": 0.4454777261136943,
"grad_norm": 0.07294271191699972,
"learning_rate": 9.548679024322866e-06,
"loss": 0.3463,
"step": 990
},
{
"epoch": 0.447727613619319,
"grad_norm": 0.07031942249727262,
"learning_rate": 9.540489726075907e-06,
"loss": 0.3486,
"step": 995
},
{
"epoch": 0.4499775011249438,
"grad_norm": 0.07151326035389519,
"learning_rate": 9.532230372837446e-06,
"loss": 0.3537,
"step": 1000
},
{
"epoch": 0.4522273886305685,
"grad_norm": 0.0671028535664748,
"learning_rate": 9.523901092040634e-06,
"loss": 0.3455,
"step": 1005
},
{
"epoch": 0.4544772761361932,
"grad_norm": 0.07197014184781744,
"learning_rate": 9.51550201219754e-06,
"loss": 0.3432,
"step": 1010
},
{
"epoch": 0.4567271636418179,
"grad_norm": 0.07169196920459484,
"learning_rate": 9.507033262897142e-06,
"loss": 0.31,
"step": 1015
},
{
"epoch": 0.45897705114744264,
"grad_norm": 0.07109226686317548,
"learning_rate": 9.498494974803362e-06,
"loss": 0.3663,
"step": 1020
},
{
"epoch": 0.46122693865306735,
"grad_norm": 0.05804652011529642,
"learning_rate": 9.489887279653023e-06,
"loss": 0.3194,
"step": 1025
},
{
"epoch": 0.46347682615869207,
"grad_norm": 0.0700778438901929,
"learning_rate": 9.481210310253826e-06,
"loss": 0.3167,
"step": 1030
},
{
"epoch": 0.4657267136643168,
"grad_norm": 0.06244080013341172,
"learning_rate": 9.472464200482303e-06,
"loss": 0.3127,
"step": 1035
},
{
"epoch": 0.4679766011699415,
"grad_norm": 0.06903401204251029,
"learning_rate": 9.463649085281752e-06,
"loss": 0.3259,
"step": 1040
},
{
"epoch": 0.4702264886755662,
"grad_norm": 0.07317408098224049,
"learning_rate": 9.454765100660144e-06,
"loss": 0.3446,
"step": 1045
},
{
"epoch": 0.4724763761811909,
"grad_norm": 0.06487603568640564,
"learning_rate": 9.445812383688046e-06,
"loss": 0.3418,
"step": 1050
},
{
"epoch": 0.47472626368681564,
"grad_norm": 0.06587470603877191,
"learning_rate": 9.43679107249648e-06,
"loss": 0.3473,
"step": 1055
},
{
"epoch": 0.47697615119244036,
"grad_norm": 0.07107259617908306,
"learning_rate": 9.427701306274812e-06,
"loss": 0.337,
"step": 1060
},
{
"epoch": 0.47922603869806507,
"grad_norm": 0.06697594936792645,
"learning_rate": 9.418543225268598e-06,
"loss": 0.3429,
"step": 1065
},
{
"epoch": 0.48147592620368984,
"grad_norm": 0.0682858638376316,
"learning_rate": 9.40931697077741e-06,
"loss": 0.3358,
"step": 1070
},
{
"epoch": 0.48372581370931456,
"grad_norm": 0.07619891304792806,
"learning_rate": 9.400022685152683e-06,
"loss": 0.3333,
"step": 1075
},
{
"epoch": 0.48597570121493927,
"grad_norm": 0.07522989171574869,
"learning_rate": 9.390660511795481e-06,
"loss": 0.3587,
"step": 1080
},
{
"epoch": 0.488225588720564,
"grad_norm": 0.07244707737339262,
"learning_rate": 9.381230595154319e-06,
"loss": 0.3386,
"step": 1085
},
{
"epoch": 0.4904754762261887,
"grad_norm": 0.0747628006572659,
"learning_rate": 9.371733080722911e-06,
"loss": 0.3457,
"step": 1090
},
{
"epoch": 0.4927253637318134,
"grad_norm": 0.0687511407497147,
"learning_rate": 9.362168115037942e-06,
"loss": 0.3433,
"step": 1095
},
{
"epoch": 0.49497525123743813,
"grad_norm": 0.07512269519367433,
"learning_rate": 9.352535845676791e-06,
"loss": 0.3219,
"step": 1100
},
{
"epoch": 0.49722513874306284,
"grad_norm": 0.07246031317089945,
"learning_rate": 9.342836421255268e-06,
"loss": 0.322,
"step": 1105
},
{
"epoch": 0.49947502624868756,
"grad_norm": 0.07594536131369899,
"learning_rate": 9.333069991425313e-06,
"loss": 0.3589,
"step": 1110
},
{
"epoch": 0.5017249137543123,
"grad_norm": 0.06689469633356987,
"learning_rate": 9.323236706872685e-06,
"loss": 0.357,
"step": 1115
},
{
"epoch": 0.503974801259937,
"grad_norm": 0.06930147639704634,
"learning_rate": 9.31333671931465e-06,
"loss": 0.3263,
"step": 1120
},
{
"epoch": 0.5062246887655617,
"grad_norm": 0.06133351525533005,
"learning_rate": 9.303370181497623e-06,
"loss": 0.3422,
"step": 1125
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.07604072880215484,
"learning_rate": 9.293337247194827e-06,
"loss": 0.359,
"step": 1130
},
{
"epoch": 0.5107244637768111,
"grad_norm": 0.06955762934734898,
"learning_rate": 9.283238071203907e-06,
"loss": 0.3439,
"step": 1135
},
{
"epoch": 0.5129743512824358,
"grad_norm": 0.0803346877614296,
"learning_rate": 9.27307280934455e-06,
"loss": 0.3471,
"step": 1140
},
{
"epoch": 0.5152242387880606,
"grad_norm": 0.06622678263367843,
"learning_rate": 9.26284161845608e-06,
"loss": 0.3427,
"step": 1145
},
{
"epoch": 0.5174741262936853,
"grad_norm": 0.0709676185753263,
"learning_rate": 9.252544656395033e-06,
"loss": 0.3363,
"step": 1150
},
{
"epoch": 0.51972401379931,
"grad_norm": 0.08282162237795766,
"learning_rate": 9.242182082032729e-06,
"loss": 0.341,
"step": 1155
},
{
"epoch": 0.5219739013049347,
"grad_norm": 0.06233679346455434,
"learning_rate": 9.231754055252817e-06,
"loss": 0.3308,
"step": 1160
},
{
"epoch": 0.5242237888105594,
"grad_norm": 0.05949370637515577,
"learning_rate": 9.221260736948803e-06,
"loss": 0.3254,
"step": 1165
},
{
"epoch": 0.5264736763161841,
"grad_norm": 0.06102505633204194,
"learning_rate": 9.21070228902158e-06,
"loss": 0.327,
"step": 1170
},
{
"epoch": 0.528723563821809,
"grad_norm": 0.07096293590033853,
"learning_rate": 9.200078874376917e-06,
"loss": 0.3309,
"step": 1175
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.06374179754335971,
"learning_rate": 9.189390656922955e-06,
"loss": 0.3579,
"step": 1180
},
{
"epoch": 0.5332233388330584,
"grad_norm": 0.09643830344296066,
"learning_rate": 9.17863780156767e-06,
"loss": 0.3466,
"step": 1185
},
{
"epoch": 0.5354732263386831,
"grad_norm": 0.0652384061049577,
"learning_rate": 9.167820474216337e-06,
"loss": 0.3523,
"step": 1190
},
{
"epoch": 0.5377231138443078,
"grad_norm": 0.06430574295906281,
"learning_rate": 9.156938841768965e-06,
"loss": 0.3722,
"step": 1195
},
{
"epoch": 0.5399730013499325,
"grad_norm": 0.07648802804062793,
"learning_rate": 9.145993072117724e-06,
"loss": 0.321,
"step": 1200
},
{
"epoch": 0.5422228888555573,
"grad_norm": 0.06775418329662553,
"learning_rate": 9.134983334144352e-06,
"loss": 0.3549,
"step": 1205
},
{
"epoch": 0.544472776361182,
"grad_norm": 0.076334857238285,
"learning_rate": 9.123909797717551e-06,
"loss": 0.335,
"step": 1210
},
{
"epoch": 0.5467226638668067,
"grad_norm": 0.06576432515389055,
"learning_rate": 9.112772633690368e-06,
"loss": 0.3239,
"step": 1215
},
{
"epoch": 0.5489725513724314,
"grad_norm": 0.06872638373228167,
"learning_rate": 9.101572013897555e-06,
"loss": 0.3141,
"step": 1220
},
{
"epoch": 0.5512224388780561,
"grad_norm": 0.06158733598122966,
"learning_rate": 9.090308111152924e-06,
"loss": 0.3221,
"step": 1225
},
{
"epoch": 0.5534723263836808,
"grad_norm": 0.08097819934773681,
"learning_rate": 9.07898109924667e-06,
"loss": 0.3151,
"step": 1230
},
{
"epoch": 0.5557222138893055,
"grad_norm": 0.06764912622152554,
"learning_rate": 9.067591152942701e-06,
"loss": 0.3332,
"step": 1235
},
{
"epoch": 0.5579721013949303,
"grad_norm": 0.07314176615388208,
"learning_rate": 9.056138447975936e-06,
"loss": 0.3415,
"step": 1240
},
{
"epoch": 0.560221988900555,
"grad_norm": 0.0717387579544613,
"learning_rate": 9.044623161049594e-06,
"loss": 0.3386,
"step": 1245
},
{
"epoch": 0.5624718764061797,
"grad_norm": 0.07552097065323739,
"learning_rate": 9.033045469832467e-06,
"loss": 0.3569,
"step": 1250
},
{
"epoch": 0.5647217639118044,
"grad_norm": 0.06915693480180615,
"learning_rate": 9.02140555295618e-06,
"loss": 0.3222,
"step": 1255
},
{
"epoch": 0.5669716514174291,
"grad_norm": 0.07769020322155092,
"learning_rate": 9.009703590012434e-06,
"loss": 0.3185,
"step": 1260
},
{
"epoch": 0.5692215389230538,
"grad_norm": 0.07598860570344396,
"learning_rate": 8.997939761550239e-06,
"loss": 0.3522,
"step": 1265
},
{
"epoch": 0.5714714264286785,
"grad_norm": 0.07073748495565614,
"learning_rate": 8.986114249073122e-06,
"loss": 0.3169,
"step": 1270
},
{
"epoch": 0.5737213139343033,
"grad_norm": 0.06866551274687982,
"learning_rate": 8.97422723503633e-06,
"loss": 0.3304,
"step": 1275
},
{
"epoch": 0.575971201439928,
"grad_norm": 0.07075202015965712,
"learning_rate": 8.962278902844016e-06,
"loss": 0.3309,
"step": 1280
},
{
"epoch": 0.5782210889455527,
"grad_norm": 0.07165184953921011,
"learning_rate": 8.950269436846405e-06,
"loss": 0.331,
"step": 1285
},
{
"epoch": 0.5804709764511774,
"grad_norm": 0.06433134595791733,
"learning_rate": 8.938199022336956e-06,
"loss": 0.328,
"step": 1290
},
{
"epoch": 0.5827208639568021,
"grad_norm": 0.07003765990675229,
"learning_rate": 8.926067845549495e-06,
"loss": 0.3297,
"step": 1295
},
{
"epoch": 0.5849707514624268,
"grad_norm": 0.06653035126789796,
"learning_rate": 8.913876093655351e-06,
"loss": 0.335,
"step": 1300
},
{
"epoch": 0.5872206389680515,
"grad_norm": 0.06847091877632593,
"learning_rate": 8.90162395476046e-06,
"loss": 0.3279,
"step": 1305
},
{
"epoch": 0.5894705264736764,
"grad_norm": 0.06903452581161729,
"learning_rate": 8.889311617902468e-06,
"loss": 0.3229,
"step": 1310
},
{
"epoch": 0.5917204139793011,
"grad_norm": 0.09341128215879058,
"learning_rate": 8.876939273047813e-06,
"loss": 0.299,
"step": 1315
},
{
"epoch": 0.5939703014849258,
"grad_norm": 0.07278846497904187,
"learning_rate": 8.86450711108879e-06,
"loss": 0.3226,
"step": 1320
},
{
"epoch": 0.5962201889905505,
"grad_norm": 0.06262188507904164,
"learning_rate": 8.85201532384061e-06,
"loss": 0.3133,
"step": 1325
},
{
"epoch": 0.5984700764961752,
"grad_norm": 0.06880573696178596,
"learning_rate": 8.839464104038445e-06,
"loss": 0.2962,
"step": 1330
},
{
"epoch": 0.6007199640017999,
"grad_norm": 0.06779234501270573,
"learning_rate": 8.826853645334441e-06,
"loss": 0.3124,
"step": 1335
},
{
"epoch": 0.6029698515074247,
"grad_norm": 0.06187208772150342,
"learning_rate": 8.814184142294744e-06,
"loss": 0.315,
"step": 1340
},
{
"epoch": 0.6052197390130494,
"grad_norm": 0.07896028281919827,
"learning_rate": 8.80145579039649e-06,
"loss": 0.3432,
"step": 1345
},
{
"epoch": 0.6074696265186741,
"grad_norm": 0.0751755215796296,
"learning_rate": 8.78866878602479e-06,
"loss": 0.3239,
"step": 1350
},
{
"epoch": 0.6097195140242988,
"grad_norm": 0.07156093608605772,
"learning_rate": 8.775823326469703e-06,
"loss": 0.337,
"step": 1355
},
{
"epoch": 0.6119694015299235,
"grad_norm": 0.0727876886960586,
"learning_rate": 8.76291960992319e-06,
"loss": 0.3737,
"step": 1360
},
{
"epoch": 0.6142192890355482,
"grad_norm": 0.07239709772207241,
"learning_rate": 8.749957835476053e-06,
"loss": 0.333,
"step": 1365
},
{
"epoch": 0.616469176541173,
"grad_norm": 0.07320801736790428,
"learning_rate": 8.736938203114872e-06,
"loss": 0.344,
"step": 1370
},
{
"epoch": 0.6187190640467977,
"grad_norm": 0.06549770224319154,
"learning_rate": 8.72386091371891e-06,
"loss": 0.3006,
"step": 1375
},
{
"epoch": 0.6209689515524224,
"grad_norm": 0.07264319355187582,
"learning_rate": 8.710726169057018e-06,
"loss": 0.3173,
"step": 1380
},
{
"epoch": 0.6232188390580471,
"grad_norm": 0.07992193946978773,
"learning_rate": 8.697534171784523e-06,
"loss": 0.3467,
"step": 1385
},
{
"epoch": 0.6254687265636718,
"grad_norm": 0.06707456122943496,
"learning_rate": 8.684285125440099e-06,
"loss": 0.3297,
"step": 1390
},
{
"epoch": 0.6277186140692965,
"grad_norm": 0.06483948438605809,
"learning_rate": 8.670979234442624e-06,
"loss": 0.3349,
"step": 1395
},
{
"epoch": 0.6299685015749212,
"grad_norm": 0.0705394359218232,
"learning_rate": 8.657616704088037e-06,
"loss": 0.33,
"step": 1400
},
{
"epoch": 0.632218389080546,
"grad_norm": 0.07985592647370479,
"learning_rate": 8.644197740546153e-06,
"loss": 0.3605,
"step": 1405
},
{
"epoch": 0.6344682765861707,
"grad_norm": 0.08150029983078208,
"learning_rate": 8.630722550857503e-06,
"loss": 0.3363,
"step": 1410
},
{
"epoch": 0.6367181640917954,
"grad_norm": 0.07286616055279489,
"learning_rate": 8.617191342930118e-06,
"loss": 0.3441,
"step": 1415
},
{
"epoch": 0.6389680515974201,
"grad_norm": 0.06834780355739174,
"learning_rate": 8.603604325536338e-06,
"loss": 0.3298,
"step": 1420
},
{
"epoch": 0.6412179391030448,
"grad_norm": 0.06360740971285378,
"learning_rate": 8.589961708309582e-06,
"loss": 0.308,
"step": 1425
},
{
"epoch": 0.6434678266086695,
"grad_norm": 0.06387011586281786,
"learning_rate": 8.576263701741115e-06,
"loss": 0.3102,
"step": 1430
},
{
"epoch": 0.6457177141142942,
"grad_norm": 0.059287022702283844,
"learning_rate": 8.562510517176807e-06,
"loss": 0.333,
"step": 1435
},
{
"epoch": 0.647967601619919,
"grad_norm": 0.07076357545448068,
"learning_rate": 8.54870236681386e-06,
"loss": 0.3376,
"step": 1440
},
{
"epoch": 0.6502174891255437,
"grad_norm": 0.08080237359735847,
"learning_rate": 8.534839463697541e-06,
"loss": 0.344,
"step": 1445
},
{
"epoch": 0.6524673766311685,
"grad_norm": 0.07301903865415799,
"learning_rate": 8.520922021717903e-06,
"loss": 0.3236,
"step": 1450
},
{
"epoch": 0.6547172641367932,
"grad_norm": 0.06408247558471158,
"learning_rate": 8.506950255606466e-06,
"loss": 0.3119,
"step": 1455
},
{
"epoch": 0.6569671516424179,
"grad_norm": 0.07142979546900464,
"learning_rate": 8.492924380932919e-06,
"loss": 0.3235,
"step": 1460
},
{
"epoch": 0.6592170391480426,
"grad_norm": 0.06623934823982494,
"learning_rate": 8.478844614101792e-06,
"loss": 0.3127,
"step": 1465
},
{
"epoch": 0.6614669266536674,
"grad_norm": 0.064051288527217,
"learning_rate": 8.464711172349105e-06,
"loss": 0.3408,
"step": 1470
},
{
"epoch": 0.6637168141592921,
"grad_norm": 0.07268808159369747,
"learning_rate": 8.450524273739036e-06,
"loss": 0.3406,
"step": 1475
},
{
"epoch": 0.6659667016649168,
"grad_norm": 0.06547939077675495,
"learning_rate": 8.436284137160544e-06,
"loss": 0.3404,
"step": 1480
},
{
"epoch": 0.6682165891705415,
"grad_norm": 0.11515221274329139,
"learning_rate": 8.421990982323988e-06,
"loss": 0.3342,
"step": 1485
},
{
"epoch": 0.6704664766761662,
"grad_norm": 0.08213057865060075,
"learning_rate": 8.407645029757752e-06,
"loss": 0.3631,
"step": 1490
},
{
"epoch": 0.6727163641817909,
"grad_norm": 0.07445316775297253,
"learning_rate": 8.393246500804825e-06,
"loss": 0.362,
"step": 1495
},
{
"epoch": 0.6749662516874156,
"grad_norm": 0.0715773585848479,
"learning_rate": 8.3787956176194e-06,
"loss": 0.3377,
"step": 1500
},
{
"epoch": 0.6772161391930404,
"grad_norm": 0.07245667708706742,
"learning_rate": 8.36429260316344e-06,
"loss": 0.2967,
"step": 1505
},
{
"epoch": 0.6794660266986651,
"grad_norm": 0.07191851967760118,
"learning_rate": 8.349737681203234e-06,
"loss": 0.3447,
"step": 1510
},
{
"epoch": 0.6817159142042898,
"grad_norm": 0.06475853768493092,
"learning_rate": 8.335131076305958e-06,
"loss": 0.3339,
"step": 1515
},
{
"epoch": 0.6839658017099145,
"grad_norm": 0.07263158877410257,
"learning_rate": 8.320473013836197e-06,
"loss": 0.3074,
"step": 1520
},
{
"epoch": 0.6862156892155392,
"grad_norm": 0.05942603663221257,
"learning_rate": 8.305763719952467e-06,
"loss": 0.2997,
"step": 1525
},
{
"epoch": 0.6884655767211639,
"grad_norm": 0.07938168227761808,
"learning_rate": 8.29100342160374e-06,
"loss": 0.3122,
"step": 1530
},
{
"epoch": 0.6907154642267886,
"grad_norm": 0.07999223664242092,
"learning_rate": 8.27619234652593e-06,
"loss": 0.3138,
"step": 1535
},
{
"epoch": 0.6929653517324134,
"grad_norm": 0.06996270561203156,
"learning_rate": 8.261330723238381e-06,
"loss": 0.3321,
"step": 1540
},
{
"epoch": 0.6952152392380381,
"grad_norm": 0.09647113986832291,
"learning_rate": 8.246418781040345e-06,
"loss": 0.3269,
"step": 1545
},
{
"epoch": 0.6974651267436628,
"grad_norm": 0.07974144611519904,
"learning_rate": 8.231456750007436e-06,
"loss": 0.309,
"step": 1550
},
{
"epoch": 0.6997150142492875,
"grad_norm": 0.06944041746000827,
"learning_rate": 8.216444860988098e-06,
"loss": 0.3347,
"step": 1555
},
{
"epoch": 0.7019649017549122,
"grad_norm": 0.06697853007490644,
"learning_rate": 8.20138334560002e-06,
"loss": 0.3432,
"step": 1560
},
{
"epoch": 0.7042147892605369,
"grad_norm": 0.09163411149931353,
"learning_rate": 8.18627243622658e-06,
"loss": 0.3294,
"step": 1565
},
{
"epoch": 0.7064646767661616,
"grad_norm": 0.06745466757701833,
"learning_rate": 8.171112366013252e-06,
"loss": 0.3382,
"step": 1570
},
{
"epoch": 0.7087145642717864,
"grad_norm": 0.06524545139947452,
"learning_rate": 8.155903368864008e-06,
"loss": 0.2894,
"step": 1575
},
{
"epoch": 0.7109644517774111,
"grad_norm": 0.07357575023935092,
"learning_rate": 8.140645679437713e-06,
"loss": 0.345,
"step": 1580
},
{
"epoch": 0.7132143392830359,
"grad_norm": 0.07032356069075725,
"learning_rate": 8.125339533144507e-06,
"loss": 0.3497,
"step": 1585
},
{
"epoch": 0.7154642267886606,
"grad_norm": 0.07305825316899144,
"learning_rate": 8.109985166142161e-06,
"loss": 0.3223,
"step": 1590
},
{
"epoch": 0.7177141142942853,
"grad_norm": 0.07026921859976491,
"learning_rate": 8.09458281533244e-06,
"loss": 0.3271,
"step": 1595
},
{
"epoch": 0.71996400179991,
"grad_norm": 0.0783084169696169,
"learning_rate": 8.079132718357465e-06,
"loss": 0.311,
"step": 1600
},
{
"epoch": 0.7222138893055348,
"grad_norm": 0.06977970059586212,
"learning_rate": 8.063635113596006e-06,
"loss": 0.3114,
"step": 1605
},
{
"epoch": 0.7244637768111595,
"grad_norm": 0.06695382649927473,
"learning_rate": 8.048090240159849e-06,
"loss": 0.3186,
"step": 1610
},
{
"epoch": 0.7267136643167842,
"grad_norm": 0.07382767142740718,
"learning_rate": 8.032498337890073e-06,
"loss": 0.3115,
"step": 1615
},
{
"epoch": 0.7289635518224089,
"grad_norm": 0.0847147954522355,
"learning_rate": 8.01685964735337e-06,
"loss": 0.3313,
"step": 1620
},
{
"epoch": 0.7312134393280336,
"grad_norm": 0.08710412831256738,
"learning_rate": 8.00117440983832e-06,
"loss": 0.3129,
"step": 1625
},
{
"epoch": 0.7334633268336583,
"grad_norm": 0.07163605298015002,
"learning_rate": 7.985442867351682e-06,
"loss": 0.3197,
"step": 1630
},
{
"epoch": 0.735713214339283,
"grad_norm": 0.08693479896494097,
"learning_rate": 7.969665262614642e-06,
"loss": 0.3584,
"step": 1635
},
{
"epoch": 0.7379631018449078,
"grad_norm": 0.07181692085074703,
"learning_rate": 7.953841839059086e-06,
"loss": 0.3024,
"step": 1640
},
{
"epoch": 0.7402129893505325,
"grad_norm": 0.06762138099885763,
"learning_rate": 7.937972840823836e-06,
"loss": 0.3393,
"step": 1645
},
{
"epoch": 0.7424628768561572,
"grad_norm": 0.06579548855422006,
"learning_rate": 7.922058512750876e-06,
"loss": 0.3415,
"step": 1650
},
{
"epoch": 0.7447127643617819,
"grad_norm": 0.06780731913871438,
"learning_rate": 7.90609910038159e-06,
"loss": 0.326,
"step": 1655
},
{
"epoch": 0.7469626518674066,
"grad_norm": 0.07345224322730477,
"learning_rate": 7.890094849952964e-06,
"loss": 0.3579,
"step": 1660
},
{
"epoch": 0.7492125393730313,
"grad_norm": 0.07643898702300285,
"learning_rate": 7.874046008393783e-06,
"loss": 0.3215,
"step": 1665
},
{
"epoch": 0.751462426878656,
"grad_norm": 0.08111641352223722,
"learning_rate": 7.857952823320833e-06,
"loss": 0.3396,
"step": 1670
},
{
"epoch": 0.7537123143842808,
"grad_norm": 0.06433102937848656,
"learning_rate": 7.84181554303507e-06,
"loss": 0.3229,
"step": 1675
},
{
"epoch": 0.7559622018899055,
"grad_norm": 0.07066161687549372,
"learning_rate": 7.825634416517793e-06,
"loss": 0.3168,
"step": 1680
},
{
"epoch": 0.7582120893955302,
"grad_norm": 0.07761037252783486,
"learning_rate": 7.809409693426803e-06,
"loss": 0.345,
"step": 1685
},
{
"epoch": 0.7604619769011549,
"grad_norm": 0.07514558565636438,
"learning_rate": 7.793141624092551e-06,
"loss": 0.3423,
"step": 1690
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.08138341842898199,
"learning_rate": 7.776830459514275e-06,
"loss": 0.3153,
"step": 1695
},
{
"epoch": 0.7649617519124043,
"grad_norm": 0.07657999183778645,
"learning_rate": 7.760476451356123e-06,
"loss": 0.3568,
"step": 1700
},
{
"epoch": 0.7672116394180291,
"grad_norm": 0.08932610854441203,
"learning_rate": 7.744079851943286e-06,
"loss": 0.3045,
"step": 1705
},
{
"epoch": 0.7694615269236538,
"grad_norm": 0.07788581856311123,
"learning_rate": 7.727640914258076e-06,
"loss": 0.322,
"step": 1710
},
{
"epoch": 0.7717114144292785,
"grad_norm": 0.07234842557571529,
"learning_rate": 7.711159891936059e-06,
"loss": 0.301,
"step": 1715
},
{
"epoch": 0.7739613019349032,
"grad_norm": 0.08103908033954604,
"learning_rate": 7.694637039262109e-06,
"loss": 0.2934,
"step": 1720
},
{
"epoch": 0.776211189440528,
"grad_norm": 0.08033609793206774,
"learning_rate": 7.678072611166503e-06,
"loss": 0.3281,
"step": 1725
},
{
"epoch": 0.7784610769461527,
"grad_norm": 0.08682473308042656,
"learning_rate": 7.661466863220982e-06,
"loss": 0.3377,
"step": 1730
},
{
"epoch": 0.7807109644517775,
"grad_norm": 0.07975724023981283,
"learning_rate": 7.644820051634813e-06,
"loss": 0.3312,
"step": 1735
},
{
"epoch": 0.7829608519574022,
"grad_norm": 0.08318610850328363,
"learning_rate": 7.628132433250828e-06,
"loss": 0.318,
"step": 1740
},
{
"epoch": 0.7852107394630269,
"grad_norm": 0.07470444210188223,
"learning_rate": 7.611404265541464e-06,
"loss": 0.3166,
"step": 1745
},
{
"epoch": 0.7874606269686516,
"grad_norm": 0.07633984339680623,
"learning_rate": 7.594635806604797e-06,
"loss": 0.3068,
"step": 1750
},
{
"epoch": 0.7897105144742763,
"grad_norm": 0.08519611137288997,
"learning_rate": 7.57782731516055e-06,
"loss": 0.3465,
"step": 1755
},
{
"epoch": 0.791960401979901,
"grad_norm": 0.07125104461336126,
"learning_rate": 7.560979050546103e-06,
"loss": 0.311,
"step": 1760
},
{
"epoch": 0.7942102894855257,
"grad_norm": 0.08460045098046377,
"learning_rate": 7.544091272712501e-06,
"loss": 0.3036,
"step": 1765
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.07731671038628908,
"learning_rate": 7.527164242220434e-06,
"loss": 0.3214,
"step": 1770
},
{
"epoch": 0.7987100644967752,
"grad_norm": 0.07618452283812552,
"learning_rate": 7.510198220236217e-06,
"loss": 0.3412,
"step": 1775
},
{
"epoch": 0.8009599520023999,
"grad_norm": 0.08122249298530079,
"learning_rate": 7.493193468527764e-06,
"loss": 0.3129,
"step": 1780
},
{
"epoch": 0.8032098395080246,
"grad_norm": 0.08390625774458342,
"learning_rate": 7.476150249460549e-06,
"loss": 0.3168,
"step": 1785
},
{
"epoch": 0.8054597270136493,
"grad_norm": 0.07518471851900174,
"learning_rate": 7.4590688259935554e-06,
"loss": 0.331,
"step": 1790
},
{
"epoch": 0.807709614519274,
"grad_norm": 0.07627971467235234,
"learning_rate": 7.441949461675223e-06,
"loss": 0.3471,
"step": 1795
},
{
"epoch": 0.8099595020248987,
"grad_norm": 0.08879967466572108,
"learning_rate": 7.424792420639377e-06,
"loss": 0.323,
"step": 1800
},
{
"epoch": 0.8122093895305235,
"grad_norm": 0.0858174200658171,
"learning_rate": 7.407597967601155e-06,
"loss": 0.3284,
"step": 1805
},
{
"epoch": 0.8144592770361482,
"grad_norm": 0.08665127583082709,
"learning_rate": 7.390366367852923e-06,
"loss": 0.3217,
"step": 1810
},
{
"epoch": 0.8167091645417729,
"grad_norm": 0.08001080258785544,
"learning_rate": 7.3730978872601825e-06,
"loss": 0.3248,
"step": 1815
},
{
"epoch": 0.8189590520473976,
"grad_norm": 0.07815794847284734,
"learning_rate": 7.355792792257463e-06,
"loss": 0.3124,
"step": 1820
},
{
"epoch": 0.8212089395530223,
"grad_norm": 0.0869139056537896,
"learning_rate": 7.338451349844225e-06,
"loss": 0.323,
"step": 1825
},
{
"epoch": 0.823458827058647,
"grad_norm": 0.09766019302119812,
"learning_rate": 7.3210738275807225e-06,
"loss": 0.3332,
"step": 1830
},
{
"epoch": 0.8257087145642718,
"grad_norm": 0.08508749834617443,
"learning_rate": 7.303660493583889e-06,
"loss": 0.3285,
"step": 1835
},
{
"epoch": 0.8279586020698965,
"grad_norm": 0.10673197384722342,
"learning_rate": 7.286211616523193e-06,
"loss": 0.3169,
"step": 1840
},
{
"epoch": 0.8302084895755212,
"grad_norm": 0.11681882774169298,
"learning_rate": 7.268727465616497e-06,
"loss": 0.331,
"step": 1845
},
{
"epoch": 0.8324583770811459,
"grad_norm": 0.08970145688216963,
"learning_rate": 7.251208310625899e-06,
"loss": 0.3262,
"step": 1850
},
{
"epoch": 0.8347082645867706,
"grad_norm": 0.08677453595649923,
"learning_rate": 7.2336544218535776e-06,
"loss": 0.2968,
"step": 1855
},
{
"epoch": 0.8369581520923954,
"grad_norm": 0.08463356362517462,
"learning_rate": 7.216066070137614e-06,
"loss": 0.3408,
"step": 1860
},
{
"epoch": 0.8392080395980202,
"grad_norm": 0.10768608728008885,
"learning_rate": 7.198443526847816e-06,
"loss": 0.3222,
"step": 1865
},
{
"epoch": 0.8414579271036449,
"grad_norm": 0.08293925088501428,
"learning_rate": 7.180787063881534e-06,
"loss": 0.3225,
"step": 1870
},
{
"epoch": 0.8437078146092696,
"grad_norm": 0.09753175069029144,
"learning_rate": 7.163096953659462e-06,
"loss": 0.3249,
"step": 1875
},
{
"epoch": 0.8459577021148943,
"grad_norm": 0.10750990409191725,
"learning_rate": 7.145373469121435e-06,
"loss": 0.3248,
"step": 1880
},
{
"epoch": 0.848207589620519,
"grad_norm": 0.0713111477001828,
"learning_rate": 7.1276168837222215e-06,
"loss": 0.3262,
"step": 1885
},
{
"epoch": 0.8504574771261437,
"grad_norm": 0.08520099737279731,
"learning_rate": 7.109827471427299e-06,
"loss": 0.3248,
"step": 1890
},
{
"epoch": 0.8527073646317684,
"grad_norm": 0.1007558956965131,
"learning_rate": 7.092005506708629e-06,
"loss": 0.3063,
"step": 1895
},
{
"epoch": 0.8549572521373932,
"grad_norm": 0.10076509216745107,
"learning_rate": 7.074151264540425e-06,
"loss": 0.3394,
"step": 1900
},
{
"epoch": 0.8572071396430179,
"grad_norm": 0.1128171772187796,
"learning_rate": 7.056265020394908e-06,
"loss": 0.3353,
"step": 1905
},
{
"epoch": 0.8594570271486426,
"grad_norm": 0.07826929688060387,
"learning_rate": 7.038347050238052e-06,
"loss": 0.3313,
"step": 1910
},
{
"epoch": 0.8617069146542673,
"grad_norm": 0.09477827641455178,
"learning_rate": 7.020397630525336e-06,
"loss": 0.3094,
"step": 1915
},
{
"epoch": 0.863956802159892,
"grad_norm": 0.07996338853084985,
"learning_rate": 7.002417038197466e-06,
"loss": 0.3365,
"step": 1920
},
{
"epoch": 0.8662066896655167,
"grad_norm": 0.08681950662638242,
"learning_rate": 6.984405550676113e-06,
"loss": 0.2858,
"step": 1925
},
{
"epoch": 0.8684565771711414,
"grad_norm": 0.10147028587889259,
"learning_rate": 6.966363445859629e-06,
"loss": 0.3307,
"step": 1930
},
{
"epoch": 0.8707064646767662,
"grad_norm": 0.09778557000247115,
"learning_rate": 6.948291002118757e-06,
"loss": 0.3346,
"step": 1935
},
{
"epoch": 0.8729563521823909,
"grad_norm": 0.08335343107919917,
"learning_rate": 6.930188498292334e-06,
"loss": 0.3102,
"step": 1940
},
{
"epoch": 0.8752062396880156,
"grad_norm": 0.127528307390263,
"learning_rate": 6.912056213683001e-06,
"loss": 0.2772,
"step": 1945
},
{
"epoch": 0.8774561271936403,
"grad_norm": 0.08449830219805671,
"learning_rate": 6.893894428052881e-06,
"loss": 0.3331,
"step": 1950
},
{
"epoch": 0.879706014699265,
"grad_norm": 0.10290918076564952,
"learning_rate": 6.875703421619263e-06,
"loss": 0.3162,
"step": 1955
},
{
"epoch": 0.8819559022048897,
"grad_norm": 0.09666086595549915,
"learning_rate": 6.85748347505029e-06,
"loss": 0.3393,
"step": 1960
},
{
"epoch": 0.8842057897105144,
"grad_norm": 0.09126192537758601,
"learning_rate": 6.839234869460614e-06,
"loss": 0.3313,
"step": 1965
},
{
"epoch": 0.8864556772161392,
"grad_norm": 0.09213126718219308,
"learning_rate": 6.820957886407068e-06,
"loss": 0.3298,
"step": 1970
},
{
"epoch": 0.8887055647217639,
"grad_norm": 0.0893744576312266,
"learning_rate": 6.802652807884322e-06,
"loss": 0.3258,
"step": 1975
},
{
"epoch": 0.8909554522273886,
"grad_norm": 0.10520537204979115,
"learning_rate": 6.784319916320528e-06,
"loss": 0.3152,
"step": 1980
},
{
"epoch": 0.8932053397330133,
"grad_norm": 0.09224246726284402,
"learning_rate": 6.765959494572959e-06,
"loss": 0.3176,
"step": 1985
},
{
"epoch": 0.895455227238638,
"grad_norm": 0.09179694827419689,
"learning_rate": 6.74757182592366e-06,
"loss": 0.34,
"step": 1990
},
{
"epoch": 0.8977051147442627,
"grad_norm": 0.10131034789212955,
"learning_rate": 6.7291571940750575e-06,
"loss": 0.3171,
"step": 1995
},
{
"epoch": 0.8999550022498876,
"grad_norm": 0.11052424709399664,
"learning_rate": 6.710715883145599e-06,
"loss": 0.3084,
"step": 2000
},
{
"epoch": 0.9022048897555123,
"grad_norm": 0.09523315367515199,
"learning_rate": 6.692248177665357e-06,
"loss": 0.3127,
"step": 2005
},
{
"epoch": 0.904454777261137,
"grad_norm": 0.09774145840636202,
"learning_rate": 6.673754362571646e-06,
"loss": 0.2866,
"step": 2010
},
{
"epoch": 0.9067046647667617,
"grad_norm": 0.1231628868544864,
"learning_rate": 6.6552347232046255e-06,
"loss": 0.2926,
"step": 2015
},
{
"epoch": 0.9089545522723864,
"grad_norm": 0.09563379874509359,
"learning_rate": 6.636689545302898e-06,
"loss": 0.3128,
"step": 2020
},
{
"epoch": 0.9112044397780111,
"grad_norm": 0.07820421786999905,
"learning_rate": 6.6181191149990905e-06,
"loss": 0.321,
"step": 2025
},
{
"epoch": 0.9134543272836358,
"grad_norm": 0.10476028051810904,
"learning_rate": 6.599523718815461e-06,
"loss": 0.2836,
"step": 2030
},
{
"epoch": 0.9157042147892606,
"grad_norm": 0.11389284533738375,
"learning_rate": 6.580903643659453e-06,
"loss": 0.2934,
"step": 2035
},
{
"epoch": 0.9179541022948853,
"grad_norm": 0.10996849745288242,
"learning_rate": 6.5622591768192875e-06,
"loss": 0.3243,
"step": 2040
},
{
"epoch": 0.92020398980051,
"grad_norm": 0.09512165946660596,
"learning_rate": 6.5435906059595215e-06,
"loss": 0.3081,
"step": 2045
},
{
"epoch": 0.9224538773061347,
"grad_norm": 0.10421356775522515,
"learning_rate": 6.524898219116612e-06,
"loss": 0.2682,
"step": 2050
},
{
"epoch": 0.9247037648117594,
"grad_norm": 0.10201698883401172,
"learning_rate": 6.5061823046944694e-06,
"loss": 0.2909,
"step": 2055
},
{
"epoch": 0.9269536523173841,
"grad_norm": 0.10974937304411288,
"learning_rate": 6.4874431514600146e-06,
"loss": 0.3072,
"step": 2060
},
{
"epoch": 0.9292035398230089,
"grad_norm": 0.09276233118456312,
"learning_rate": 6.468681048538715e-06,
"loss": 0.2989,
"step": 2065
},
{
"epoch": 0.9314534273286336,
"grad_norm": 0.11862538493837348,
"learning_rate": 6.44989628541013e-06,
"loss": 0.3372,
"step": 2070
},
{
"epoch": 0.9337033148342583,
"grad_norm": 0.10451521274212297,
"learning_rate": 6.431089151903439e-06,
"loss": 0.3188,
"step": 2075
},
{
"epoch": 0.935953202339883,
"grad_norm": 0.11422644044073009,
"learning_rate": 6.412259938192978e-06,
"loss": 0.307,
"step": 2080
},
{
"epoch": 0.9382030898455077,
"grad_norm": 0.14091820208432657,
"learning_rate": 6.393408934793752e-06,
"loss": 0.3546,
"step": 2085
},
{
"epoch": 0.9404529773511324,
"grad_norm": 0.11829750564224563,
"learning_rate": 6.374536432556963e-06,
"loss": 0.3267,
"step": 2090
},
{
"epoch": 0.9427028648567571,
"grad_norm": 0.11528106197624186,
"learning_rate": 6.355642722665512e-06,
"loss": 0.3203,
"step": 2095
},
{
"epoch": 0.9449527523623819,
"grad_norm": 0.09372673822212164,
"learning_rate": 6.336728096629517e-06,
"loss": 0.3151,
"step": 2100
},
{
"epoch": 0.9472026398680066,
"grad_norm": 0.10779896033185006,
"learning_rate": 6.317792846281805e-06,
"loss": 0.3052,
"step": 2105
},
{
"epoch": 0.9494525273736313,
"grad_norm": 0.09672862996353586,
"learning_rate": 6.298837263773423e-06,
"loss": 0.3033,
"step": 2110
},
{
"epoch": 0.951702414879256,
"grad_norm": 0.10872396340925997,
"learning_rate": 6.2798616415691095e-06,
"loss": 0.3002,
"step": 2115
},
{
"epoch": 0.9539523023848807,
"grad_norm": 0.11829489090483326,
"learning_rate": 6.260866272442807e-06,
"loss": 0.2929,
"step": 2120
},
{
"epoch": 0.9562021898905054,
"grad_norm": 0.11145672561455416,
"learning_rate": 6.2418514494731245e-06,
"loss": 0.2808,
"step": 2125
},
{
"epoch": 0.9584520773961301,
"grad_norm": 0.1056896163271936,
"learning_rate": 6.222817466038824e-06,
"loss": 0.2841,
"step": 2130
},
{
"epoch": 0.9607019649017549,
"grad_norm": 0.10666373036314321,
"learning_rate": 6.2037646158142975e-06,
"loss": 0.3005,
"step": 2135
},
{
"epoch": 0.9629518524073797,
"grad_norm": 0.10697096904271322,
"learning_rate": 6.184693192765028e-06,
"loss": 0.2894,
"step": 2140
},
{
"epoch": 0.9652017399130044,
"grad_norm": 0.17157045181184577,
"learning_rate": 6.165603491143057e-06,
"loss": 0.3298,
"step": 2145
},
{
"epoch": 0.9674516274186291,
"grad_norm": 0.1005745666451797,
"learning_rate": 6.146495805482451e-06,
"loss": 0.3196,
"step": 2150
},
{
"epoch": 0.9697015149242538,
"grad_norm": 0.139307317568223,
"learning_rate": 6.127370430594745e-06,
"loss": 0.2993,
"step": 2155
},
{
"epoch": 0.9719514024298785,
"grad_norm": 0.11791582586234053,
"learning_rate": 6.108227661564401e-06,
"loss": 0.3083,
"step": 2160
},
{
"epoch": 0.9742012899355033,
"grad_norm": 0.11233522118086736,
"learning_rate": 6.089067793744258e-06,
"loss": 0.3137,
"step": 2165
},
{
"epoch": 0.976451177441128,
"grad_norm": 0.12524898605746265,
"learning_rate": 6.069891122750971e-06,
"loss": 0.2825,
"step": 2170
},
{
"epoch": 0.9787010649467527,
"grad_norm": 0.09825541745527079,
"learning_rate": 6.050697944460444e-06,
"loss": 0.3146,
"step": 2175
},
{
"epoch": 0.9809509524523774,
"grad_norm": 0.11637412785681134,
"learning_rate": 6.0314885550032796e-06,
"loss": 0.2935,
"step": 2180
},
{
"epoch": 0.9832008399580021,
"grad_norm": 0.10398981333232891,
"learning_rate": 6.012263250760199e-06,
"loss": 0.28,
"step": 2185
},
{
"epoch": 0.9854507274636268,
"grad_norm": 0.1347409630178848,
"learning_rate": 5.993022328357466e-06,
"loss": 0.2899,
"step": 2190
},
{
"epoch": 0.9877006149692515,
"grad_norm": 0.136591408837683,
"learning_rate": 5.973766084662324e-06,
"loss": 0.2729,
"step": 2195
},
{
"epoch": 0.9899505024748763,
"grad_norm": 0.1032954692332516,
"learning_rate": 5.954494816778408e-06,
"loss": 0.3106,
"step": 2200
},
{
"epoch": 0.992200389980501,
"grad_norm": 0.12420490530861028,
"learning_rate": 5.935208822041152e-06,
"loss": 0.2699,
"step": 2205
},
{
"epoch": 0.9944502774861257,
"grad_norm": 0.10146757951487546,
"learning_rate": 5.915908398013217e-06,
"loss": 0.266,
"step": 2210
},
{
"epoch": 0.9967001649917504,
"grad_norm": 0.10690509046474422,
"learning_rate": 5.896593842479893e-06,
"loss": 0.2916,
"step": 2215
},
{
"epoch": 0.9989500524973751,
"grad_norm": 0.2098417588495756,
"learning_rate": 5.8772654534445e-06,
"loss": 0.3104,
"step": 2220
},
{
"epoch": 1.0,
"eval_loss": 0.27543845772743225,
"eval_runtime": 55.028,
"eval_samples_per_second": 19.59,
"eval_steps_per_second": 4.907,
"step": 2223
},
{
"epoch": 1.00089995500225,
"grad_norm": 0.11100179968154768,
"learning_rate": 5.857923529123799e-06,
"loss": 0.2341,
"step": 2225
},
{
"epoch": 1.0031498425078746,
"grad_norm": 0.14616860643517418,
"learning_rate": 5.838568367943383e-06,
"loss": 0.2679,
"step": 2230
},
{
"epoch": 1.0053997300134994,
"grad_norm": 0.11313230544533252,
"learning_rate": 5.819200268533076e-06,
"loss": 0.2873,
"step": 2235
},
{
"epoch": 1.007649617519124,
"grad_norm": 0.1210465260044826,
"learning_rate": 5.7998195297223285e-06,
"loss": 0.2677,
"step": 2240
},
{
"epoch": 1.0098995050247488,
"grad_norm": 0.11722674843174795,
"learning_rate": 5.7804264505356e-06,
"loss": 0.2548,
"step": 2245
},
{
"epoch": 1.0121493925303735,
"grad_norm": 0.12390544554268877,
"learning_rate": 5.76102133018775e-06,
"loss": 0.2942,
"step": 2250
},
{
"epoch": 1.0143992800359982,
"grad_norm": 0.14215352813872506,
"learning_rate": 5.741604468079421e-06,
"loss": 0.3095,
"step": 2255
},
{
"epoch": 1.016649167541623,
"grad_norm": 0.13309421360381032,
"learning_rate": 5.72217616379242e-06,
"loss": 0.2794,
"step": 2260
},
{
"epoch": 1.0188990550472476,
"grad_norm": 0.1409784002692586,
"learning_rate": 5.702736717085093e-06,
"loss": 0.2998,
"step": 2265
},
{
"epoch": 1.0211489425528724,
"grad_norm": 0.12978570417210325,
"learning_rate": 5.6832864278876984e-06,
"loss": 0.2829,
"step": 2270
},
{
"epoch": 1.023398830058497,
"grad_norm": 0.10750959417123264,
"learning_rate": 5.663825596297794e-06,
"loss": 0.2902,
"step": 2275
},
{
"epoch": 1.0256487175641218,
"grad_norm": 0.137940819760974,
"learning_rate": 5.644354522575581e-06,
"loss": 0.2806,
"step": 2280
},
{
"epoch": 1.0278986050697465,
"grad_norm": 0.14563829553392096,
"learning_rate": 5.624873507139297e-06,
"loss": 0.277,
"step": 2285
},
{
"epoch": 1.0301484925753712,
"grad_norm": 0.12377796525725795,
"learning_rate": 5.605382850560565e-06,
"loss": 0.2943,
"step": 2290
},
{
"epoch": 1.032398380080996,
"grad_norm": 0.16984305955909604,
"learning_rate": 5.585882853559762e-06,
"loss": 0.2889,
"step": 2295
},
{
"epoch": 1.0346482675866207,
"grad_norm": 0.1281002826955631,
"learning_rate": 5.566373817001377e-06,
"loss": 0.293,
"step": 2300
},
{
"epoch": 1.0368981550922454,
"grad_norm": 0.15524678076001608,
"learning_rate": 5.546856041889374e-06,
"loss": 0.2605,
"step": 2305
},
{
"epoch": 1.03914804259787,
"grad_norm": 0.14215571774039212,
"learning_rate": 5.527329829362534e-06,
"loss": 0.2786,
"step": 2310
},
{
"epoch": 1.0413979301034948,
"grad_norm": 0.1447748028005779,
"learning_rate": 5.5077954806898284e-06,
"loss": 0.2688,
"step": 2315
},
{
"epoch": 1.0436478176091195,
"grad_norm": 0.14426858307924748,
"learning_rate": 5.488253297265757e-06,
"loss": 0.2777,
"step": 2320
},
{
"epoch": 1.0458977051147442,
"grad_norm": 0.1272869099382178,
"learning_rate": 5.468703580605703e-06,
"loss": 0.2997,
"step": 2325
},
{
"epoch": 1.048147592620369,
"grad_norm": 0.133865100418296,
"learning_rate": 5.4491466323412745e-06,
"loss": 0.2839,
"step": 2330
},
{
"epoch": 1.0503974801259937,
"grad_norm": 0.12437130432718715,
"learning_rate": 5.429582754215664e-06,
"loss": 0.2843,
"step": 2335
},
{
"epoch": 1.0526473676316184,
"grad_norm": 0.1419352738893503,
"learning_rate": 5.410012248078975e-06,
"loss": 0.2677,
"step": 2340
},
{
"epoch": 1.054897255137243,
"grad_norm": 0.1639413029064359,
"learning_rate": 5.390435415883583e-06,
"loss": 0.2805,
"step": 2345
},
{
"epoch": 1.0571471426428678,
"grad_norm": 0.14750894149267404,
"learning_rate": 5.370852559679461e-06,
"loss": 0.2718,
"step": 2350
},
{
"epoch": 1.0593970301484925,
"grad_norm": 0.1418143669594509,
"learning_rate": 5.351263981609532e-06,
"loss": 0.2374,
"step": 2355
},
{
"epoch": 1.0616469176541172,
"grad_norm": 0.1467085192211227,
"learning_rate": 5.331669983904996e-06,
"loss": 0.278,
"step": 2360
},
{
"epoch": 1.063896805159742,
"grad_norm": 0.1686191463372291,
"learning_rate": 5.312070868880678e-06,
"loss": 0.2818,
"step": 2365
},
{
"epoch": 1.0661466926653667,
"grad_norm": 0.1844876464618337,
"learning_rate": 5.29246693893035e-06,
"loss": 0.2971,
"step": 2370
},
{
"epoch": 1.0683965801709914,
"grad_norm": 0.12521919673631507,
"learning_rate": 5.272858496522084e-06,
"loss": 0.2737,
"step": 2375
},
{
"epoch": 1.070646467676616,
"grad_norm": 0.15034047715143825,
"learning_rate": 5.253245844193564e-06,
"loss": 0.2858,
"step": 2380
},
{
"epoch": 1.0728963551822408,
"grad_norm": 0.1264075738033277,
"learning_rate": 5.233629284547435e-06,
"loss": 0.2564,
"step": 2385
},
{
"epoch": 1.0751462426878655,
"grad_norm": 0.1940932983786269,
"learning_rate": 5.214009120246623e-06,
"loss": 0.2722,
"step": 2390
},
{
"epoch": 1.0773961301934902,
"grad_norm": 0.16922904631843647,
"learning_rate": 5.1943856540096795e-06,
"loss": 0.2912,
"step": 2395
},
{
"epoch": 1.079646017699115,
"grad_norm": 0.23716139744779294,
"learning_rate": 5.174759188606087e-06,
"loss": 0.2885,
"step": 2400
},
{
"epoch": 1.0818959052047397,
"grad_norm": 0.11509288529342813,
"learning_rate": 5.155130026851616e-06,
"loss": 0.2575,
"step": 2405
},
{
"epoch": 1.0841457927103644,
"grad_norm": 0.17727493415132747,
"learning_rate": 5.135498471603629e-06,
"loss": 0.2639,
"step": 2410
},
{
"epoch": 1.0863956802159893,
"grad_norm": 0.13930766876349623,
"learning_rate": 5.1158648257564235e-06,
"loss": 0.2606,
"step": 2415
},
{
"epoch": 1.0886455677216138,
"grad_norm": 0.12454839412933186,
"learning_rate": 5.0962293922365495e-06,
"loss": 0.256,
"step": 2420
},
{
"epoch": 1.0908954552272387,
"grad_norm": 0.18809390149779476,
"learning_rate": 5.076592473998141e-06,
"loss": 0.2646,
"step": 2425
},
{
"epoch": 1.0931453427328635,
"grad_norm": 0.1508834503375353,
"learning_rate": 5.056954374018236e-06,
"loss": 0.2764,
"step": 2430
},
{
"epoch": 1.0953952302384882,
"grad_norm": 0.1491889266816844,
"learning_rate": 5.037315395292111e-06,
"loss": 0.2691,
"step": 2435
},
{
"epoch": 1.0976451177441129,
"grad_norm": 0.15633034297704468,
"learning_rate": 5.017675840828597e-06,
"loss": 0.2657,
"step": 2440
},
{
"epoch": 1.0998950052497376,
"grad_norm": 0.15782519717103635,
"learning_rate": 4.998036013645409e-06,
"loss": 0.2561,
"step": 2445
},
{
"epoch": 1.1021448927553623,
"grad_norm": 0.19449808917352213,
"learning_rate": 4.97839621676447e-06,
"loss": 0.2571,
"step": 2450
},
{
"epoch": 1.104394780260987,
"grad_norm": 0.16872055966750726,
"learning_rate": 4.958756753207234e-06,
"loss": 0.2459,
"step": 2455
},
{
"epoch": 1.1066446677666117,
"grad_norm": 0.17373438335912267,
"learning_rate": 4.939117925990013e-06,
"loss": 0.2805,
"step": 2460
},
{
"epoch": 1.1088945552722365,
"grad_norm": 0.15151992334964703,
"learning_rate": 4.919480038119302e-06,
"loss": 0.251,
"step": 2465
},
{
"epoch": 1.1111444427778612,
"grad_norm": 0.15554454267536397,
"learning_rate": 4.899843392587104e-06,
"loss": 0.2533,
"step": 2470
},
{
"epoch": 1.113394330283486,
"grad_norm": 0.16595294302301358,
"learning_rate": 4.880208292366247e-06,
"loss": 0.2864,
"step": 2475
},
{
"epoch": 1.1156442177891106,
"grad_norm": 0.15038201249362013,
"learning_rate": 4.860575040405726e-06,
"loss": 0.2744,
"step": 2480
},
{
"epoch": 1.1178941052947353,
"grad_norm": 0.16630054816008968,
"learning_rate": 4.840943939626012e-06,
"loss": 0.2362,
"step": 2485
},
{
"epoch": 1.12014399280036,
"grad_norm": 0.18269629542973387,
"learning_rate": 4.821315292914392e-06,
"loss": 0.2786,
"step": 2490
},
{
"epoch": 1.1223938803059847,
"grad_norm": 0.16417528180865418,
"learning_rate": 4.801689403120282e-06,
"loss": 0.2506,
"step": 2495
},
{
"epoch": 1.1246437678116095,
"grad_norm": 0.12251195240813534,
"learning_rate": 4.782066573050567e-06,
"loss": 0.2693,
"step": 2500
},
{
"epoch": 1.1268936553172342,
"grad_norm": 0.19913321021658195,
"learning_rate": 4.7624471054649216e-06,
"loss": 0.26,
"step": 2505
},
{
"epoch": 1.129143542822859,
"grad_norm": 0.16359478594452095,
"learning_rate": 4.742831303071143e-06,
"loss": 0.2507,
"step": 2510
},
{
"epoch": 1.1313934303284836,
"grad_norm": 0.20741074237045662,
"learning_rate": 4.723219468520474e-06,
"loss": 0.2678,
"step": 2515
},
{
"epoch": 1.1336433178341083,
"grad_norm": 0.16956816625653676,
"learning_rate": 4.703611904402939e-06,
"loss": 0.2795,
"step": 2520
},
{
"epoch": 1.135893205339733,
"grad_norm": 0.1818340434409631,
"learning_rate": 4.684008913242679e-06,
"loss": 0.2586,
"step": 2525
},
{
"epoch": 1.1381430928453578,
"grad_norm": 0.17749209313732456,
"learning_rate": 4.664410797493275e-06,
"loss": 0.2708,
"step": 2530
},
{
"epoch": 1.1403929803509825,
"grad_norm": 0.167827444506409,
"learning_rate": 4.644817859533083e-06,
"loss": 0.2717,
"step": 2535
},
{
"epoch": 1.1426428678566072,
"grad_norm": 0.17149191797141825,
"learning_rate": 4.625230401660578e-06,
"loss": 0.2444,
"step": 2540
},
{
"epoch": 1.144892755362232,
"grad_norm": 0.19053262323498327,
"learning_rate": 4.605648726089674e-06,
"loss": 0.2546,
"step": 2545
},
{
"epoch": 1.1471426428678566,
"grad_norm": 0.17029611567515032,
"learning_rate": 4.58607313494508e-06,
"loss": 0.2515,
"step": 2550
},
{
"epoch": 1.1493925303734813,
"grad_norm": 0.20535330778256622,
"learning_rate": 4.566503930257624e-06,
"loss": 0.2687,
"step": 2555
},
{
"epoch": 1.151642417879106,
"grad_norm": 0.17888453950166083,
"learning_rate": 4.546941413959595e-06,
"loss": 0.2582,
"step": 2560
},
{
"epoch": 1.1538923053847308,
"grad_norm": 0.17098481716726255,
"learning_rate": 4.5273858878800895e-06,
"loss": 0.2633,
"step": 2565
},
{
"epoch": 1.1561421928903555,
"grad_norm": 0.22394541422414396,
"learning_rate": 4.507837653740355e-06,
"loss": 0.2657,
"step": 2570
},
{
"epoch": 1.1583920803959802,
"grad_norm": 0.16148745686481833,
"learning_rate": 4.4882970131491286e-06,
"loss": 0.2469,
"step": 2575
},
{
"epoch": 1.160641967901605,
"grad_norm": 0.21762812124764483,
"learning_rate": 4.468764267597986e-06,
"loss": 0.2815,
"step": 2580
},
{
"epoch": 1.1628918554072296,
"grad_norm": 0.2041647572323139,
"learning_rate": 4.449239718456696e-06,
"loss": 0.253,
"step": 2585
},
{
"epoch": 1.1651417429128543,
"grad_norm": 0.1508182234886033,
"learning_rate": 4.429723666968559e-06,
"loss": 0.2532,
"step": 2590
},
{
"epoch": 1.167391630418479,
"grad_norm": 0.22173731592066487,
"learning_rate": 4.410216414245771e-06,
"loss": 0.2597,
"step": 2595
},
{
"epoch": 1.1696415179241038,
"grad_norm": 0.15334607029538722,
"learning_rate": 4.390718261264768e-06,
"loss": 0.2429,
"step": 2600
},
{
"epoch": 1.1718914054297285,
"grad_norm": 0.17386719805484463,
"learning_rate": 4.371229508861588e-06,
"loss": 0.2718,
"step": 2605
},
{
"epoch": 1.1741412929353532,
"grad_norm": 0.255145373819277,
"learning_rate": 4.351750457727229e-06,
"loss": 0.2544,
"step": 2610
},
{
"epoch": 1.176391180440978,
"grad_norm": 0.19091868423027997,
"learning_rate": 4.332281408403011e-06,
"loss": 0.26,
"step": 2615
},
{
"epoch": 1.1786410679466026,
"grad_norm": 0.17031635023758315,
"learning_rate": 4.312822661275929e-06,
"loss": 0.2478,
"step": 2620
},
{
"epoch": 1.1808909554522273,
"grad_norm": 0.18810141305157912,
"learning_rate": 4.293374516574031e-06,
"loss": 0.2593,
"step": 2625
},
{
"epoch": 1.183140842957852,
"grad_norm": 0.20489249951929697,
"learning_rate": 4.273937274361782e-06,
"loss": 0.2226,
"step": 2630
},
{
"epoch": 1.1853907304634768,
"grad_norm": 0.18589998495363094,
"learning_rate": 4.254511234535432e-06,
"loss": 0.2313,
"step": 2635
},
{
"epoch": 1.1876406179691015,
"grad_norm": 0.1974695166475231,
"learning_rate": 4.235096696818385e-06,
"loss": 0.2782,
"step": 2640
},
{
"epoch": 1.1898905054747262,
"grad_norm": 0.15560807641673985,
"learning_rate": 4.215693960756586e-06,
"loss": 0.2461,
"step": 2645
},
{
"epoch": 1.192140392980351,
"grad_norm": 0.14168460680781833,
"learning_rate": 4.1963033257138904e-06,
"loss": 0.2323,
"step": 2650
},
{
"epoch": 1.1943902804859756,
"grad_norm": 0.19193101382035213,
"learning_rate": 4.176925090867449e-06,
"loss": 0.252,
"step": 2655
},
{
"epoch": 1.1966401679916003,
"grad_norm": 0.19059681316908272,
"learning_rate": 4.157559555203086e-06,
"loss": 0.2237,
"step": 2660
},
{
"epoch": 1.198890055497225,
"grad_norm": 0.18365584045782385,
"learning_rate": 4.138207017510696e-06,
"loss": 0.2498,
"step": 2665
},
{
"epoch": 1.2011399430028498,
"grad_norm": 0.17126185601849214,
"learning_rate": 4.118867776379624e-06,
"loss": 0.2121,
"step": 2670
},
{
"epoch": 1.2033898305084745,
"grad_norm": 0.23530086737062514,
"learning_rate": 4.099542130194069e-06,
"loss": 0.2369,
"step": 2675
},
{
"epoch": 1.2056397180140994,
"grad_norm": 0.1759441387313428,
"learning_rate": 4.0802303771284685e-06,
"loss": 0.2171,
"step": 2680
},
{
"epoch": 1.207889605519724,
"grad_norm": 0.19878924933956027,
"learning_rate": 4.060932815142904e-06,
"loss": 0.2631,
"step": 2685
},
{
"epoch": 1.2101394930253488,
"grad_norm": 0.1984620336427276,
"learning_rate": 4.041649741978508e-06,
"loss": 0.2408,
"step": 2690
},
{
"epoch": 1.2123893805309733,
"grad_norm": 0.16406158411947314,
"learning_rate": 4.022381455152863e-06,
"loss": 0.2204,
"step": 2695
},
{
"epoch": 1.2146392680365983,
"grad_norm": 0.18585134324802086,
"learning_rate": 4.003128251955412e-06,
"loss": 0.2254,
"step": 2700
},
{
"epoch": 1.2168891555422228,
"grad_norm": 0.2028470417783533,
"learning_rate": 3.983890429442876e-06,
"loss": 0.2174,
"step": 2705
},
{
"epoch": 1.2191390430478477,
"grad_norm": 0.18306124060212872,
"learning_rate": 3.964668284434666e-06,
"loss": 0.2281,
"step": 2710
},
{
"epoch": 1.2213889305534724,
"grad_norm": 0.2261111639681813,
"learning_rate": 3.945462113508312e-06,
"loss": 0.2183,
"step": 2715
},
{
"epoch": 1.2236388180590971,
"grad_norm": 0.21171892163095699,
"learning_rate": 3.92627221299487e-06,
"loss": 0.2249,
"step": 2720
},
{
"epoch": 1.2258887055647218,
"grad_norm": 0.18554866614076224,
"learning_rate": 3.907098878974367e-06,
"loss": 0.2356,
"step": 2725
},
{
"epoch": 1.2281385930703466,
"grad_norm": 0.17740452807380613,
"learning_rate": 3.887942407271228e-06,
"loss": 0.2213,
"step": 2730
},
{
"epoch": 1.2303884805759713,
"grad_norm": 0.19628544094095077,
"learning_rate": 3.868803093449709e-06,
"loss": 0.2256,
"step": 2735
},
{
"epoch": 1.232638368081596,
"grad_norm": 0.1906710395370276,
"learning_rate": 3.8496812328093335e-06,
"loss": 0.2431,
"step": 2740
},
{
"epoch": 1.2348882555872207,
"grad_norm": 0.21739799246928065,
"learning_rate": 3.8305771203803434e-06,
"loss": 0.2053,
"step": 2745
},
{
"epoch": 1.2371381430928454,
"grad_norm": 0.20933189544262915,
"learning_rate": 3.8114910509191483e-06,
"loss": 0.2372,
"step": 2750
},
{
"epoch": 1.2393880305984701,
"grad_norm": 0.16805864711967494,
"learning_rate": 3.7924233189037697e-06,
"loss": 0.2421,
"step": 2755
},
{
"epoch": 1.2416379181040949,
"grad_norm": 0.23407049517629622,
"learning_rate": 3.773374218529298e-06,
"loss": 0.2289,
"step": 2760
},
{
"epoch": 1.2438878056097196,
"grad_norm": 0.20043628906146582,
"learning_rate": 3.7543440437033656e-06,
"loss": 0.2197,
"step": 2765
},
{
"epoch": 1.2461376931153443,
"grad_norm": 0.1811301883423287,
"learning_rate": 3.7353330880415963e-06,
"loss": 0.2118,
"step": 2770
},
{
"epoch": 1.248387580620969,
"grad_norm": 0.20413664615759625,
"learning_rate": 3.7163416448630886e-06,
"loss": 0.2103,
"step": 2775
},
{
"epoch": 1.2506374681265937,
"grad_norm": 0.2053294418375065,
"learning_rate": 3.6973700071858764e-06,
"loss": 0.2265,
"step": 2780
},
{
"epoch": 1.2528873556322184,
"grad_norm": 0.17855437216730508,
"learning_rate": 3.6784184677224204e-06,
"loss": 0.2082,
"step": 2785
},
{
"epoch": 1.2551372431378431,
"grad_norm": 0.21204933584524724,
"learning_rate": 3.659487318875087e-06,
"loss": 0.2368,
"step": 2790
},
{
"epoch": 1.2573871306434679,
"grad_norm": 0.244934854739885,
"learning_rate": 3.6405768527316376e-06,
"loss": 0.2236,
"step": 2795
},
{
"epoch": 1.2596370181490926,
"grad_norm": 0.20352719384257717,
"learning_rate": 3.6216873610607155e-06,
"loss": 0.2127,
"step": 2800
},
{
"epoch": 1.2618869056547173,
"grad_norm": 0.21525625357885447,
"learning_rate": 3.602819135307355e-06,
"loss": 0.2026,
"step": 2805
},
{
"epoch": 1.264136793160342,
"grad_norm": 0.24886200931475094,
"learning_rate": 3.58397246658848e-06,
"loss": 0.2049,
"step": 2810
},
{
"epoch": 1.2663866806659667,
"grad_norm": 0.22213048059657176,
"learning_rate": 3.5651476456884103e-06,
"loss": 0.2149,
"step": 2815
},
{
"epoch": 1.2686365681715914,
"grad_norm": 0.24474792019196667,
"learning_rate": 3.5463449630543744e-06,
"loss": 0.2176,
"step": 2820
},
{
"epoch": 1.2708864556772161,
"grad_norm": 0.21959268792414904,
"learning_rate": 3.527564708792035e-06,
"loss": 0.2319,
"step": 2825
},
{
"epoch": 1.2731363431828409,
"grad_norm": 0.21285142665025264,
"learning_rate": 3.508807172661006e-06,
"loss": 0.2278,
"step": 2830
},
{
"epoch": 1.2753862306884656,
"grad_norm": 0.24872484432655345,
"learning_rate": 3.490072644070386e-06,
"loss": 0.2367,
"step": 2835
},
{
"epoch": 1.2776361181940903,
"grad_norm": 0.2446892197957464,
"learning_rate": 3.47136141207429e-06,
"loss": 0.2147,
"step": 2840
},
{
"epoch": 1.279886005699715,
"grad_norm": 0.3593552477933211,
"learning_rate": 3.452673765367389e-06,
"loss": 0.2471,
"step": 2845
},
{
"epoch": 1.2821358932053397,
"grad_norm": 0.18760658096432373,
"learning_rate": 3.4340099922804627e-06,
"loss": 0.2185,
"step": 2850
},
{
"epoch": 1.2843857807109644,
"grad_norm": 0.1746094898464911,
"learning_rate": 3.4153703807759432e-06,
"loss": 0.1939,
"step": 2855
},
{
"epoch": 1.2866356682165891,
"grad_norm": 0.2386232051443061,
"learning_rate": 3.3967552184434753e-06,
"loss": 0.2182,
"step": 2860
},
{
"epoch": 1.2888855557222139,
"grad_norm": 0.2147456869413775,
"learning_rate": 3.378164792495475e-06,
"loss": 0.2232,
"step": 2865
},
{
"epoch": 1.2911354432278386,
"grad_norm": 0.21939888824914258,
"learning_rate": 3.3595993897627098e-06,
"loss": 0.2059,
"step": 2870
},
{
"epoch": 1.2933853307334633,
"grad_norm": 0.20007621997926173,
"learning_rate": 3.3410592966898565e-06,
"loss": 0.2025,
"step": 2875
},
{
"epoch": 1.295635218239088,
"grad_norm": 0.22959303011889556,
"learning_rate": 3.3225447993310983e-06,
"loss": 0.2004,
"step": 2880
},
{
"epoch": 1.2978851057447127,
"grad_norm": 0.23309801112874845,
"learning_rate": 3.3040561833456964e-06,
"loss": 0.1914,
"step": 2885
},
{
"epoch": 1.3001349932503374,
"grad_norm": 0.22848735574436602,
"learning_rate": 3.2855937339935933e-06,
"loss": 0.1844,
"step": 2890
},
{
"epoch": 1.3023848807559621,
"grad_norm": 0.20570875834144497,
"learning_rate": 3.2671577361310087e-06,
"loss": 0.2132,
"step": 2895
},
{
"epoch": 1.3046347682615869,
"grad_norm": 0.24826968315533732,
"learning_rate": 3.2487484742060427e-06,
"loss": 0.2111,
"step": 2900
},
{
"epoch": 1.3068846557672116,
"grad_norm": 0.2410762961266627,
"learning_rate": 3.2303662322542835e-06,
"loss": 0.1948,
"step": 2905
},
{
"epoch": 1.3091345432728363,
"grad_norm": 0.23347593077480983,
"learning_rate": 3.212011293894436e-06,
"loss": 0.2008,
"step": 2910
},
{
"epoch": 1.311384430778461,
"grad_norm": 0.21360507603920142,
"learning_rate": 3.1936839423239376e-06,
"loss": 0.2042,
"step": 2915
},
{
"epoch": 1.3136343182840857,
"grad_norm": 0.21960761516089436,
"learning_rate": 3.1753844603145894e-06,
"loss": 0.2391,
"step": 2920
},
{
"epoch": 1.3158842057897104,
"grad_norm": 0.20203229318870164,
"learning_rate": 3.1571131302081916e-06,
"loss": 0.1876,
"step": 2925
},
{
"epoch": 1.3181340932953352,
"grad_norm": 0.24191918555495237,
"learning_rate": 3.138870233912197e-06,
"loss": 0.1962,
"step": 2930
},
{
"epoch": 1.32038398080096,
"grad_norm": 0.20921020378628946,
"learning_rate": 3.1206560528953467e-06,
"loss": 0.2058,
"step": 2935
},
{
"epoch": 1.3226338683065846,
"grad_norm": 0.20869954401470014,
"learning_rate": 3.102470868183344e-06,
"loss": 0.2064,
"step": 2940
},
{
"epoch": 1.3248837558122095,
"grad_norm": 0.2249649340119077,
"learning_rate": 3.084314960354501e-06,
"loss": 0.2046,
"step": 2945
},
{
"epoch": 1.327133643317834,
"grad_norm": 0.25182875069609073,
"learning_rate": 3.066188609535421e-06,
"loss": 0.2037,
"step": 2950
},
{
"epoch": 1.329383530823459,
"grad_norm": 0.21107658603026272,
"learning_rate": 3.0480920953966786e-06,
"loss": 0.2094,
"step": 2955
},
{
"epoch": 1.3316334183290834,
"grad_norm": 0.22913188913268076,
"learning_rate": 3.0300256971484943e-06,
"loss": 0.2162,
"step": 2960
},
{
"epoch": 1.3338833058347084,
"grad_norm": 0.2539850632246194,
"learning_rate": 3.0119896935364305e-06,
"loss": 0.1941,
"step": 2965
},
{
"epoch": 1.3361331933403329,
"grad_norm": 0.23858971814994895,
"learning_rate": 2.993984362837098e-06,
"loss": 0.1839,
"step": 2970
},
{
"epoch": 1.3383830808459578,
"grad_norm": 0.2721885562055672,
"learning_rate": 2.9760099828538545e-06,
"loss": 0.211,
"step": 2975
},
{
"epoch": 1.3406329683515823,
"grad_norm": 0.2482373958173057,
"learning_rate": 2.9580668309125203e-06,
"loss": 0.1998,
"step": 2980
},
{
"epoch": 1.3428828558572072,
"grad_norm": 0.2306376622557913,
"learning_rate": 2.940155183857096e-06,
"loss": 0.2196,
"step": 2985
},
{
"epoch": 1.3451327433628317,
"grad_norm": 0.26262612593731016,
"learning_rate": 2.922275318045502e-06,
"loss": 0.1882,
"step": 2990
},
{
"epoch": 1.3473826308684567,
"grad_norm": 0.20346419756531464,
"learning_rate": 2.9044275093453034e-06,
"loss": 0.193,
"step": 2995
},
{
"epoch": 1.3496325183740812,
"grad_norm": 0.22993902000452152,
"learning_rate": 2.8866120331294567e-06,
"loss": 0.1736,
"step": 3000
},
{
"epoch": 1.351882405879706,
"grad_norm": 0.27935117403868454,
"learning_rate": 2.8688291642720656e-06,
"loss": 0.1904,
"step": 3005
},
{
"epoch": 1.3541322933853308,
"grad_norm": 0.27365571988160076,
"learning_rate": 2.8510791771441327e-06,
"loss": 0.1853,
"step": 3010
},
{
"epoch": 1.3563821808909555,
"grad_norm": 0.23083646098925237,
"learning_rate": 2.8333623456093313e-06,
"loss": 0.1968,
"step": 3015
},
{
"epoch": 1.3586320683965802,
"grad_norm": 0.2704413392632432,
"learning_rate": 2.815678943019784e-06,
"loss": 0.1927,
"step": 3020
},
{
"epoch": 1.360881955902205,
"grad_norm": 0.24524940743516008,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.2051,
"step": 3025
},
{
"epoch": 1.3631318434078297,
"grad_norm": 0.24409905829083706,
"learning_rate": 2.7804135155018307e-06,
"loss": 0.164,
"step": 3030
},
{
"epoch": 1.3653817309134544,
"grad_norm": 0.23814624701122666,
"learning_rate": 2.762832034681965e-06,
"loss": 0.1777,
"step": 3035
},
{
"epoch": 1.367631618419079,
"grad_norm": 0.24368144586742516,
"learning_rate": 2.7452850710160305e-06,
"loss": 0.1946,
"step": 3040
},
{
"epoch": 1.3698815059247038,
"grad_norm": 0.24657044578855591,
"learning_rate": 2.727772895235262e-06,
"loss": 0.2024,
"step": 3045
},
{
"epoch": 1.3721313934303285,
"grad_norm": 0.20668089980394588,
"learning_rate": 2.710295777534154e-06,
"loss": 0.1853,
"step": 3050
},
{
"epoch": 1.3743812809359532,
"grad_norm": 0.22601090907078772,
"learning_rate": 2.692853987566291e-06,
"loss": 0.1764,
"step": 3055
},
{
"epoch": 1.376631168441578,
"grad_norm": 0.2872809956397954,
"learning_rate": 2.675447794440188e-06,
"loss": 0.1609,
"step": 3060
},
{
"epoch": 1.3788810559472027,
"grad_norm": 0.2411498866283728,
"learning_rate": 2.658077466715138e-06,
"loss": 0.1813,
"step": 3065
},
{
"epoch": 1.3811309434528274,
"grad_norm": 0.19284001830686515,
"learning_rate": 2.6407432723970694e-06,
"loss": 0.1751,
"step": 3070
},
{
"epoch": 1.383380830958452,
"grad_norm": 0.23962546665483148,
"learning_rate": 2.6234454789344067e-06,
"loss": 0.164,
"step": 3075
},
{
"epoch": 1.3856307184640768,
"grad_norm": 0.25424115274026465,
"learning_rate": 2.6061843532139563e-06,
"loss": 0.1816,
"step": 3080
},
{
"epoch": 1.3878806059697015,
"grad_norm": 0.25313051025088457,
"learning_rate": 2.5889601615567657e-06,
"loss": 0.1813,
"step": 3085
},
{
"epoch": 1.3901304934753262,
"grad_norm": 0.2378148779779353,
"learning_rate": 2.5717731697140425e-06,
"loss": 0.1822,
"step": 3090
},
{
"epoch": 1.392380380980951,
"grad_norm": 0.24347808422412195,
"learning_rate": 2.554623642863031e-06,
"loss": 0.165,
"step": 3095
},
{
"epoch": 1.3946302684865757,
"grad_norm": 0.26898645682575706,
"learning_rate": 2.5375118456029345e-06,
"loss": 0.1834,
"step": 3100
},
{
"epoch": 1.3968801559922004,
"grad_norm": 0.20720842966338204,
"learning_rate": 2.520438041950827e-06,
"loss": 0.1638,
"step": 3105
},
{
"epoch": 1.399130043497825,
"grad_norm": 0.17953407898094143,
"learning_rate": 2.503402495337579e-06,
"loss": 0.1547,
"step": 3110
},
{
"epoch": 1.4013799310034498,
"grad_norm": 0.27527731902583097,
"learning_rate": 2.4864054686037993e-06,
"loss": 0.1771,
"step": 3115
},
{
"epoch": 1.4036298185090745,
"grad_norm": 0.2441103318991431,
"learning_rate": 2.469447223995772e-06,
"loss": 0.1866,
"step": 3120
},
{
"epoch": 1.4058797060146992,
"grad_norm": 0.22146555295977546,
"learning_rate": 2.452528023161414e-06,
"loss": 0.1658,
"step": 3125
},
{
"epoch": 1.408129593520324,
"grad_norm": 0.24036023292266545,
"learning_rate": 2.4356481271462396e-06,
"loss": 0.1951,
"step": 3130
},
{
"epoch": 1.4103794810259487,
"grad_norm": 0.2346343428333368,
"learning_rate": 2.4188077963893276e-06,
"loss": 0.1724,
"step": 3135
},
{
"epoch": 1.4126293685315734,
"grad_norm": 0.28341682436165366,
"learning_rate": 2.4020072907193123e-06,
"loss": 0.1786,
"step": 3140
},
{
"epoch": 1.414879256037198,
"grad_norm": 0.25526003142103276,
"learning_rate": 2.3852468693503635e-06,
"loss": 0.166,
"step": 3145
},
{
"epoch": 1.4171291435428228,
"grad_norm": 0.21169312639212728,
"learning_rate": 2.3685267908781934e-06,
"loss": 0.1651,
"step": 3150
},
{
"epoch": 1.4193790310484475,
"grad_norm": 0.2311680518850515,
"learning_rate": 2.3518473132760668e-06,
"loss": 0.1943,
"step": 3155
},
{
"epoch": 1.4216289185540723,
"grad_norm": 0.2466100225354571,
"learning_rate": 2.335208693890819e-06,
"loss": 0.1759,
"step": 3160
},
{
"epoch": 1.423878806059697,
"grad_norm": 0.25320147361029777,
"learning_rate": 2.318611189438884e-06,
"loss": 0.1741,
"step": 3165
},
{
"epoch": 1.4261286935653217,
"grad_norm": 0.20379034599277449,
"learning_rate": 2.30205505600234e-06,
"loss": 0.1515,
"step": 3170
},
{
"epoch": 1.4283785810709464,
"grad_norm": 0.2171396909921854,
"learning_rate": 2.2855405490249498e-06,
"loss": 0.17,
"step": 3175
},
{
"epoch": 1.430628468576571,
"grad_norm": 0.25025327769749556,
"learning_rate": 2.2690679233082237e-06,
"loss": 0.1442,
"step": 3180
},
{
"epoch": 1.4328783560821958,
"grad_norm": 0.20134411154173665,
"learning_rate": 2.2526374330074945e-06,
"loss": 0.172,
"step": 3185
},
{
"epoch": 1.4351282435878205,
"grad_norm": 0.20636498909519851,
"learning_rate": 2.23624933162798e-06,
"loss": 0.1663,
"step": 3190
},
{
"epoch": 1.4373781310934453,
"grad_norm": 0.24524283669244562,
"learning_rate": 2.219903872020885e-06,
"loss": 0.1726,
"step": 3195
},
{
"epoch": 1.43962801859907,
"grad_norm": 0.226380159375995,
"learning_rate": 2.2036013063795024e-06,
"loss": 0.1707,
"step": 3200
},
{
"epoch": 1.4418779061046947,
"grad_norm": 0.23433271109204132,
"learning_rate": 2.1873418862353095e-06,
"loss": 0.1885,
"step": 3205
},
{
"epoch": 1.4441277936103196,
"grad_norm": 0.27219913617478064,
"learning_rate": 2.1711258624540955e-06,
"loss": 0.1627,
"step": 3210
},
{
"epoch": 1.4463776811159441,
"grad_norm": 0.2720555642584958,
"learning_rate": 2.15495348523209e-06,
"loss": 0.1637,
"step": 3215
},
{
"epoch": 1.448627568621569,
"grad_norm": 0.26833619371988116,
"learning_rate": 2.1388250040921007e-06,
"loss": 0.1536,
"step": 3220
},
{
"epoch": 1.4508774561271935,
"grad_norm": 0.2603614911120465,
"learning_rate": 2.1227406678796664e-06,
"loss": 0.1608,
"step": 3225
},
{
"epoch": 1.4531273436328185,
"grad_norm": 0.21162975240993986,
"learning_rate": 2.1067007247592153e-06,
"loss": 0.1649,
"step": 3230
},
{
"epoch": 1.455377231138443,
"grad_norm": 0.240416434380939,
"learning_rate": 2.0907054222102367e-06,
"loss": 0.157,
"step": 3235
},
{
"epoch": 1.457627118644068,
"grad_norm": 0.22619962147371023,
"learning_rate": 2.074755007023461e-06,
"loss": 0.152,
"step": 3240
},
{
"epoch": 1.4598770061496924,
"grad_norm": 0.23122000932974787,
"learning_rate": 2.058849725297061e-06,
"loss": 0.1674,
"step": 3245
},
{
"epoch": 1.4621268936553173,
"grad_norm": 0.20088454332338435,
"learning_rate": 2.042989822432837e-06,
"loss": 0.1426,
"step": 3250
},
{
"epoch": 1.4643767811609418,
"grad_norm": 0.22446161891702227,
"learning_rate": 2.0271755431324456e-06,
"loss": 0.1431,
"step": 3255
},
{
"epoch": 1.4666266686665668,
"grad_norm": 0.25951357745647785,
"learning_rate": 2.011407131393624e-06,
"loss": 0.1521,
"step": 3260
},
{
"epoch": 1.4688765561721913,
"grad_norm": 0.23931332134441274,
"learning_rate": 1.9956848305064156e-06,
"loss": 0.1348,
"step": 3265
},
{
"epoch": 1.4711264436778162,
"grad_norm": 0.2443591943685552,
"learning_rate": 1.9800088830494233e-06,
"loss": 0.1616,
"step": 3270
},
{
"epoch": 1.4733763311834407,
"grad_norm": 0.2583750411808441,
"learning_rate": 1.964379530886066e-06,
"loss": 0.1568,
"step": 3275
},
{
"epoch": 1.4756262186890656,
"grad_norm": 0.23149031179647173,
"learning_rate": 1.948797015160845e-06,
"loss": 0.1628,
"step": 3280
},
{
"epoch": 1.4778761061946903,
"grad_norm": 0.24603772896490778,
"learning_rate": 1.9332615762956252e-06,
"loss": 0.172,
"step": 3285
},
{
"epoch": 1.480125993700315,
"grad_norm": 0.21815434321755453,
"learning_rate": 1.9177734539859246e-06,
"loss": 0.1412,
"step": 3290
},
{
"epoch": 1.4823758812059398,
"grad_norm": 0.22721807904843327,
"learning_rate": 1.9023328871972163e-06,
"loss": 0.1535,
"step": 3295
},
{
"epoch": 1.4846257687115645,
"grad_norm": 0.242196598571302,
"learning_rate": 1.886940114161241e-06,
"loss": 0.1418,
"step": 3300
},
{
"epoch": 1.4868756562171892,
"grad_norm": 0.31031701723743615,
"learning_rate": 1.8715953723723374e-06,
"loss": 0.152,
"step": 3305
},
{
"epoch": 1.489125543722814,
"grad_norm": 0.2768031393228084,
"learning_rate": 1.8562988985837632e-06,
"loss": 0.1333,
"step": 3310
},
{
"epoch": 1.4913754312284386,
"grad_norm": 0.20415150169731586,
"learning_rate": 1.8410509288040557e-06,
"loss": 0.1414,
"step": 3315
},
{
"epoch": 1.4936253187340633,
"grad_norm": 0.26667619653525043,
"learning_rate": 1.8258516982933905e-06,
"loss": 0.1468,
"step": 3320
},
{
"epoch": 1.495875206239688,
"grad_norm": 0.2944474959496459,
"learning_rate": 1.8107014415599416e-06,
"loss": 0.1191,
"step": 3325
},
{
"epoch": 1.4981250937453128,
"grad_norm": 0.2393057029593332,
"learning_rate": 1.7956003923562715e-06,
"loss": 0.1404,
"step": 3330
},
{
"epoch": 1.5003749812509375,
"grad_norm": 0.2714540493615385,
"learning_rate": 1.7805487836757224e-06,
"loss": 0.139,
"step": 3335
},
{
"epoch": 1.5026248687565622,
"grad_norm": 0.24328575957899592,
"learning_rate": 1.7655468477488191e-06,
"loss": 0.1388,
"step": 3340
},
{
"epoch": 1.504874756262187,
"grad_norm": 0.22919700731255085,
"learning_rate": 1.7505948160396901e-06,
"loss": 0.1238,
"step": 3345
},
{
"epoch": 1.5071246437678116,
"grad_norm": 0.2619372461489788,
"learning_rate": 1.7356929192424937e-06,
"loss": 0.1451,
"step": 3350
},
{
"epoch": 1.5093745312734363,
"grad_norm": 0.2086634801533094,
"learning_rate": 1.720841387277858e-06,
"loss": 0.1485,
"step": 3355
},
{
"epoch": 1.511624418779061,
"grad_norm": 0.22980427576347195,
"learning_rate": 1.7060404492893345e-06,
"loss": 0.1474,
"step": 3360
},
{
"epoch": 1.5138743062846858,
"grad_norm": 0.24590985962229212,
"learning_rate": 1.6912903336398677e-06,
"loss": 0.1375,
"step": 3365
},
{
"epoch": 1.5161241937903105,
"grad_norm": 0.25732491354780235,
"learning_rate": 1.6765912679082592e-06,
"loss": 0.1357,
"step": 3370
},
{
"epoch": 1.5183740812959352,
"grad_norm": 0.26500627710086616,
"learning_rate": 1.6619434788856664e-06,
"loss": 0.1419,
"step": 3375
},
{
"epoch": 1.52062396880156,
"grad_norm": 0.2928540052735079,
"learning_rate": 1.647347192572105e-06,
"loss": 0.1307,
"step": 3380
},
{
"epoch": 1.5228738563071846,
"grad_norm": 0.21671093221137389,
"learning_rate": 1.6328026341729547e-06,
"loss": 0.1269,
"step": 3385
},
{
"epoch": 1.5251237438128094,
"grad_norm": 0.20008846223184612,
"learning_rate": 1.618310028095486e-06,
"loss": 0.1229,
"step": 3390
},
{
"epoch": 1.527373631318434,
"grad_norm": 0.27258283595012933,
"learning_rate": 1.6038695979454033e-06,
"loss": 0.1291,
"step": 3395
},
{
"epoch": 1.5296235188240588,
"grad_norm": 0.1659681540695972,
"learning_rate": 1.589481566523388e-06,
"loss": 0.1132,
"step": 3400
},
{
"epoch": 1.5318734063296835,
"grad_norm": 0.29801173110625,
"learning_rate": 1.5751461558216662e-06,
"loss": 0.1557,
"step": 3405
},
{
"epoch": 1.5341232938353082,
"grad_norm": 0.21585855046664817,
"learning_rate": 1.5608635870205813e-06,
"loss": 0.1275,
"step": 3410
},
{
"epoch": 1.536373181340933,
"grad_norm": 0.22691724613303907,
"learning_rate": 1.546634080485181e-06,
"loss": 0.1263,
"step": 3415
},
{
"epoch": 1.5386230688465576,
"grad_norm": 0.3126367672650283,
"learning_rate": 1.5324578557618158e-06,
"loss": 0.1281,
"step": 3420
},
{
"epoch": 1.5408729563521824,
"grad_norm": 0.2605966357941338,
"learning_rate": 1.5183351315747618e-06,
"loss": 0.1334,
"step": 3425
},
{
"epoch": 1.543122843857807,
"grad_norm": 0.21630173206179193,
"learning_rate": 1.5042661258228268e-06,
"loss": 0.1372,
"step": 3430
},
{
"epoch": 1.5453727313634318,
"grad_norm": 0.26829878682557234,
"learning_rate": 1.4902510555760052e-06,
"loss": 0.1257,
"step": 3435
},
{
"epoch": 1.5476226188690565,
"grad_norm": 0.29544397855593446,
"learning_rate": 1.4762901370721266e-06,
"loss": 0.1396,
"step": 3440
},
{
"epoch": 1.5498725063746812,
"grad_norm": 0.24819039772864243,
"learning_rate": 1.4623835857135099e-06,
"loss": 0.1298,
"step": 3445
},
{
"epoch": 1.552122393880306,
"grad_norm": 0.20631029388339692,
"learning_rate": 1.4485316160636491e-06,
"loss": 0.1135,
"step": 3450
},
{
"epoch": 1.5543722813859309,
"grad_norm": 0.2588845660409855,
"learning_rate": 1.434734441843899e-06,
"loss": 0.119,
"step": 3455
},
{
"epoch": 1.5566221688915554,
"grad_norm": 0.27482300851220287,
"learning_rate": 1.420992275930178e-06,
"loss": 0.1228,
"step": 3460
},
{
"epoch": 1.5588720563971803,
"grad_norm": 0.23756828701782703,
"learning_rate": 1.4073053303496837e-06,
"loss": 0.1397,
"step": 3465
},
{
"epoch": 1.5611219439028048,
"grad_norm": 0.2565129570324179,
"learning_rate": 1.3936738162776269e-06,
"loss": 0.1171,
"step": 3470
},
{
"epoch": 1.5633718314084297,
"grad_norm": 0.23747517958547196,
"learning_rate": 1.3800979440339602e-06,
"loss": 0.1112,
"step": 3475
},
{
"epoch": 1.5656217189140542,
"grad_norm": 0.28612053881235616,
"learning_rate": 1.3665779230801452e-06,
"loss": 0.1214,
"step": 3480
},
{
"epoch": 1.5678716064196792,
"grad_norm": 0.21974317229860285,
"learning_rate": 1.353113962015919e-06,
"loss": 0.1133,
"step": 3485
},
{
"epoch": 1.5701214939253036,
"grad_norm": 0.2586485593990932,
"learning_rate": 1.3397062685760715e-06,
"loss": 0.131,
"step": 3490
},
{
"epoch": 1.5723713814309286,
"grad_norm": 0.24428264502478964,
"learning_rate": 1.326355049627238e-06,
"loss": 0.1239,
"step": 3495
},
{
"epoch": 1.574621268936553,
"grad_norm": 0.2440528536513943,
"learning_rate": 1.31306051116472e-06,
"loss": 0.1156,
"step": 3500
},
{
"epoch": 1.576871156442178,
"grad_norm": 0.25949845916894754,
"learning_rate": 1.299822858309292e-06,
"loss": 0.118,
"step": 3505
},
{
"epoch": 1.5791210439478025,
"grad_norm": 0.27496698683134035,
"learning_rate": 1.2866422953040458e-06,
"loss": 0.1364,
"step": 3510
},
{
"epoch": 1.5813709314534274,
"grad_norm": 0.22644679203329376,
"learning_rate": 1.273519025511236e-06,
"loss": 0.1242,
"step": 3515
},
{
"epoch": 1.583620818959052,
"grad_norm": 0.27460451136628766,
"learning_rate": 1.2604532514091444e-06,
"loss": 0.1179,
"step": 3520
},
{
"epoch": 1.5858707064646769,
"grad_norm": 0.20903077719719648,
"learning_rate": 1.2474451745889516e-06,
"loss": 0.1174,
"step": 3525
},
{
"epoch": 1.5881205939703014,
"grad_norm": 0.24497256736111866,
"learning_rate": 1.2344949957516356e-06,
"loss": 0.1119,
"step": 3530
},
{
"epoch": 1.5903704814759263,
"grad_norm": 0.2539761952183662,
"learning_rate": 1.221602914704862e-06,
"loss": 0.1219,
"step": 3535
},
{
"epoch": 1.5926203689815508,
"grad_norm": 0.23677806854402075,
"learning_rate": 1.2087691303599109e-06,
"loss": 0.1131,
"step": 3540
},
{
"epoch": 1.5948702564871757,
"grad_norm": 0.28943703222233913,
"learning_rate": 1.1959938407286099e-06,
"loss": 0.1265,
"step": 3545
},
{
"epoch": 1.5971201439928002,
"grad_norm": 0.26936033145050353,
"learning_rate": 1.1832772429202716e-06,
"loss": 0.1155,
"step": 3550
},
{
"epoch": 1.5993700314984252,
"grad_norm": 0.24859595390547068,
"learning_rate": 1.1706195331386494e-06,
"loss": 0.1319,
"step": 3555
},
{
"epoch": 1.6016199190040497,
"grad_norm": 0.21137786230401104,
"learning_rate": 1.1580209066789272e-06,
"loss": 0.0959,
"step": 3560
},
{
"epoch": 1.6038698065096746,
"grad_norm": 0.18826307259382147,
"learning_rate": 1.1454815579246874e-06,
"loss": 0.1162,
"step": 3565
},
{
"epoch": 1.606119694015299,
"grad_norm": 0.2059875775498964,
"learning_rate": 1.1330016803449224e-06,
"loss": 0.1079,
"step": 3570
},
{
"epoch": 1.608369581520924,
"grad_norm": 0.28679664201908944,
"learning_rate": 1.1205814664910464e-06,
"loss": 0.1323,
"step": 3575
},
{
"epoch": 1.6106194690265485,
"grad_norm": 0.23898491505271052,
"learning_rate": 1.1082211079939248e-06,
"loss": 0.1,
"step": 3580
},
{
"epoch": 1.6128693565321734,
"grad_norm": 0.2700410969754371,
"learning_rate": 1.0959207955609163e-06,
"loss": 0.107,
"step": 3585
},
{
"epoch": 1.6151192440377982,
"grad_norm": 0.21154102841364958,
"learning_rate": 1.083680718972938e-06,
"loss": 0.1126,
"step": 3590
},
{
"epoch": 1.6173691315434229,
"grad_norm": 0.19582804111079785,
"learning_rate": 1.0715010670815212e-06,
"loss": 0.1111,
"step": 3595
},
{
"epoch": 1.6196190190490476,
"grad_norm": 0.2154658262674778,
"learning_rate": 1.059382027805914e-06,
"loss": 0.1025,
"step": 3600
},
{
"epoch": 1.6218689065546723,
"grad_norm": 0.30677049526532074,
"learning_rate": 1.0473237881301763e-06,
"loss": 0.1201,
"step": 3605
},
{
"epoch": 1.624118794060297,
"grad_norm": 0.23614701148998188,
"learning_rate": 1.0353265341002916e-06,
"loss": 0.104,
"step": 3610
},
{
"epoch": 1.6263686815659217,
"grad_norm": 0.24690793627028748,
"learning_rate": 1.0233904508212955e-06,
"loss": 0.1078,
"step": 3615
},
{
"epoch": 1.6286185690715465,
"grad_norm": 0.31258122069910355,
"learning_rate": 1.0115157224544313e-06,
"loss": 0.1036,
"step": 3620
},
{
"epoch": 1.6308684565771712,
"grad_norm": 0.23164502515145138,
"learning_rate": 9.997025322142934e-07,
"loss": 0.1082,
"step": 3625
},
{
"epoch": 1.6331183440827959,
"grad_norm": 0.24106743301610264,
"learning_rate": 9.87951062366011e-07,
"loss": 0.1197,
"step": 3630
},
{
"epoch": 1.6353682315884206,
"grad_norm": 0.26573732918101894,
"learning_rate": 9.762614942224312e-07,
"loss": 0.1205,
"step": 3635
},
{
"epoch": 1.6376181190940453,
"grad_norm": 0.11482579343049812,
"learning_rate": 9.646340081413225e-07,
"loss": 0.0915,
"step": 3640
},
{
"epoch": 1.63986800659967,
"grad_norm": 0.21075003083613178,
"learning_rate": 9.530687835225916e-07,
"loss": 0.097,
"step": 3645
},
{
"epoch": 1.6421178941052947,
"grad_norm": 0.23753636897967206,
"learning_rate": 9.415659988055215e-07,
"loss": 0.1042,
"step": 3650
},
{
"epoch": 1.6443677816109195,
"grad_norm": 0.24065236956197258,
"learning_rate": 9.30125831466005e-07,
"loss": 0.1021,
"step": 3655
},
{
"epoch": 1.6466176691165442,
"grad_norm": 0.28653116996485667,
"learning_rate": 9.187484580138184e-07,
"loss": 0.1153,
"step": 3660
},
{
"epoch": 1.6488675566221689,
"grad_norm": 0.2563141158700858,
"learning_rate": 9.074340539898962e-07,
"loss": 0.1106,
"step": 3665
},
{
"epoch": 1.6511174441277936,
"grad_norm": 0.29385405444476315,
"learning_rate": 8.961827939636198e-07,
"loss": 0.1087,
"step": 3670
},
{
"epoch": 1.6533673316334183,
"grad_norm": 0.2631541250069433,
"learning_rate": 8.849948515301188e-07,
"loss": 0.0978,
"step": 3675
},
{
"epoch": 1.655617219139043,
"grad_norm": 0.2722585679724958,
"learning_rate": 8.738703993076087e-07,
"loss": 0.109,
"step": 3680
},
{
"epoch": 1.6578671066446677,
"grad_norm": 0.3065358825170482,
"learning_rate": 8.62809608934711e-07,
"loss": 0.1019,
"step": 3685
},
{
"epoch": 1.6601169941502925,
"grad_norm": 0.23763230365231583,
"learning_rate": 8.518126510678138e-07,
"loss": 0.1138,
"step": 3690
},
{
"epoch": 1.6623668816559172,
"grad_norm": 0.2532402386408982,
"learning_rate": 8.408796953784365e-07,
"loss": 0.1102,
"step": 3695
},
{
"epoch": 1.6646167691615419,
"grad_norm": 0.23005454958970656,
"learning_rate": 8.30010910550611e-07,
"loss": 0.1017,
"step": 3700
},
{
"epoch": 1.6668666566671666,
"grad_norm": 0.2194832160899072,
"learning_rate": 8.19206464278281e-07,
"loss": 0.0985,
"step": 3705
},
{
"epoch": 1.6691165441727913,
"grad_norm": 0.32261159442961446,
"learning_rate": 8.084665232627165e-07,
"loss": 0.1115,
"step": 3710
},
{
"epoch": 1.671366431678416,
"grad_norm": 0.28020915769071963,
"learning_rate": 7.977912532099336e-07,
"loss": 0.1072,
"step": 3715
},
{
"epoch": 1.6736163191840407,
"grad_norm": 0.2587579519713862,
"learning_rate": 7.871808188281461e-07,
"loss": 0.0884,
"step": 3720
},
{
"epoch": 1.6758662066896655,
"grad_norm": 0.25937560152984207,
"learning_rate": 7.766353838252227e-07,
"loss": 0.0963,
"step": 3725
},
{
"epoch": 1.6781160941952904,
"grad_norm": 0.2623209006276337,
"learning_rate": 7.661551109061593e-07,
"loss": 0.0945,
"step": 3730
},
{
"epoch": 1.680365981700915,
"grad_norm": 0.2734268883455671,
"learning_rate": 7.557401617705673e-07,
"loss": 0.0962,
"step": 3735
},
{
"epoch": 1.6826158692065398,
"grad_norm": 0.2600567478099387,
"learning_rate": 7.453906971101826e-07,
"loss": 0.0965,
"step": 3740
},
{
"epoch": 1.6848657567121643,
"grad_norm": 0.21549655386834185,
"learning_rate": 7.35106876606384e-07,
"loss": 0.0802,
"step": 3745
},
{
"epoch": 1.6871156442177893,
"grad_norm": 0.23243787528580465,
"learning_rate": 7.248888589277275e-07,
"loss": 0.0979,
"step": 3750
},
{
"epoch": 1.6893655317234137,
"grad_norm": 0.2724000087724297,
"learning_rate": 7.147368017275075e-07,
"loss": 0.0954,
"step": 3755
},
{
"epoch": 1.6916154192290387,
"grad_norm": 0.27067809227580786,
"learning_rate": 7.046508616413078e-07,
"loss": 0.0921,
"step": 3760
},
{
"epoch": 1.6938653067346632,
"grad_norm": 0.2211597100684428,
"learning_rate": 6.946311942846002e-07,
"loss": 0.1051,
"step": 3765
},
{
"epoch": 1.6961151942402881,
"grad_norm": 0.24002929405082607,
"learning_rate": 6.846779542503384e-07,
"loss": 0.0899,
"step": 3770
},
{
"epoch": 1.6983650817459126,
"grad_norm": 0.19994511733272957,
"learning_rate": 6.747912951065722e-07,
"loss": 0.0914,
"step": 3775
},
{
"epoch": 1.7006149692515375,
"grad_norm": 0.3127468963077912,
"learning_rate": 6.649713693940718e-07,
"loss": 0.1032,
"step": 3780
},
{
"epoch": 1.702864856757162,
"grad_norm": 0.23642047104133684,
"learning_rate": 6.552183286239899e-07,
"loss": 0.087,
"step": 3785
},
{
"epoch": 1.705114744262787,
"grad_norm": 0.2068805673647048,
"learning_rate": 6.455323232755095e-07,
"loss": 0.093,
"step": 3790
},
{
"epoch": 1.7073646317684115,
"grad_norm": 0.26466757083784725,
"learning_rate": 6.35913502793527e-07,
"loss": 0.0857,
"step": 3795
},
{
"epoch": 1.7096145192740364,
"grad_norm": 0.20726909159845547,
"learning_rate": 6.263620155863492e-07,
"loss": 0.0863,
"step": 3800
},
{
"epoch": 1.711864406779661,
"grad_norm": 0.21055369695393691,
"learning_rate": 6.168780090233994e-07,
"loss": 0.0916,
"step": 3805
},
{
"epoch": 1.7141142942852858,
"grad_norm": 0.2778245613335268,
"learning_rate": 6.07461629432945e-07,
"loss": 0.0917,
"step": 3810
},
{
"epoch": 1.7163641817909103,
"grad_norm": 0.1999858453456665,
"learning_rate": 5.981130220998444e-07,
"loss": 0.0746,
"step": 3815
},
{
"epoch": 1.7186140692965353,
"grad_norm": 0.19402830775044652,
"learning_rate": 5.888323312632948e-07,
"loss": 0.094,
"step": 3820
},
{
"epoch": 1.7208639568021598,
"grad_norm": 0.26795633448623635,
"learning_rate": 5.796197001146164e-07,
"loss": 0.0884,
"step": 3825
},
{
"epoch": 1.7231138443077847,
"grad_norm": 0.2390770570872304,
"learning_rate": 5.704752707950412e-07,
"loss": 0.0905,
"step": 3830
},
{
"epoch": 1.7253637318134092,
"grad_norm": 0.16942131967267335,
"learning_rate": 5.613991843935179e-07,
"loss": 0.0827,
"step": 3835
},
{
"epoch": 1.7276136193190341,
"grad_norm": 0.2027819269347922,
"learning_rate": 5.523915809445313e-07,
"loss": 0.0832,
"step": 3840
},
{
"epoch": 1.7298635068246586,
"grad_norm": 0.22046848444535852,
"learning_rate": 5.434525994259531e-07,
"loss": 0.0886,
"step": 3845
},
{
"epoch": 1.7321133943302836,
"grad_norm": 0.2595049287219421,
"learning_rate": 5.345823777568859e-07,
"loss": 0.0937,
"step": 3850
},
{
"epoch": 1.734363281835908,
"grad_norm": 0.24929926941084485,
"learning_rate": 5.25781052795541e-07,
"loss": 0.0787,
"step": 3855
},
{
"epoch": 1.736613169341533,
"grad_norm": 0.24888880504119226,
"learning_rate": 5.170487603371266e-07,
"loss": 0.0845,
"step": 3860
},
{
"epoch": 1.7388630568471577,
"grad_norm": 0.2604651193142029,
"learning_rate": 5.083856351117511e-07,
"loss": 0.0786,
"step": 3865
},
{
"epoch": 1.7411129443527824,
"grad_norm": 0.20003829357925593,
"learning_rate": 4.997918107823446e-07,
"loss": 0.08,
"step": 3870
},
{
"epoch": 1.7433628318584071,
"grad_norm": 0.2051972235262297,
"learning_rate": 4.912674199425999e-07,
"loss": 0.0853,
"step": 3875
},
{
"epoch": 1.7456127193640318,
"grad_norm": 0.2549934939375718,
"learning_rate": 4.828125941149197e-07,
"loss": 0.0844,
"step": 3880
},
{
"epoch": 1.7478626068696566,
"grad_norm": 0.3024383705811877,
"learning_rate": 4.7442746374839363e-07,
"loss": 0.0846,
"step": 3885
},
{
"epoch": 1.7501124943752813,
"grad_norm": 0.21907085756014216,
"learning_rate": 4.6611215821678546e-07,
"loss": 0.0839,
"step": 3890
},
{
"epoch": 1.752362381880906,
"grad_norm": 0.2241634174428953,
"learning_rate": 4.578668058165325e-07,
"loss": 0.0758,
"step": 3895
},
{
"epoch": 1.7546122693865307,
"grad_norm": 0.26223122192387566,
"learning_rate": 4.4969153376476726e-07,
"loss": 0.0814,
"step": 3900
},
{
"epoch": 1.7568621568921554,
"grad_norm": 0.3009548280743066,
"learning_rate": 4.415864681973608e-07,
"loss": 0.079,
"step": 3905
},
{
"epoch": 1.7591120443977801,
"grad_norm": 0.2282784959309563,
"learning_rate": 4.335517341669676e-07,
"loss": 0.084,
"step": 3910
},
{
"epoch": 1.7613619319034048,
"grad_norm": 0.2736169743202772,
"learning_rate": 4.255874556411016e-07,
"loss": 0.0845,
"step": 3915
},
{
"epoch": 1.7636118194090296,
"grad_norm": 0.2273826431677496,
"learning_rate": 4.176937555002231e-07,
"loss": 0.0789,
"step": 3920
},
{
"epoch": 1.7658617069146543,
"grad_norm": 0.2562359538975016,
"learning_rate": 4.098707555358411e-07,
"loss": 0.0841,
"step": 3925
},
{
"epoch": 1.768111594420279,
"grad_norm": 0.20875979878240594,
"learning_rate": 4.0211857644863404e-07,
"loss": 0.0868,
"step": 3930
},
{
"epoch": 1.7703614819259037,
"grad_norm": 0.23466550669048516,
"learning_rate": 3.9443733784659324e-07,
"loss": 0.0863,
"step": 3935
},
{
"epoch": 1.7726113694315284,
"grad_norm": 0.22510691667210447,
"learning_rate": 3.8682715824316594e-07,
"loss": 0.0966,
"step": 3940
},
{
"epoch": 1.7748612569371531,
"grad_norm": 0.20906650720115227,
"learning_rate": 3.792881550554373e-07,
"loss": 0.0792,
"step": 3945
},
{
"epoch": 1.7771111444427778,
"grad_norm": 0.27079014394170864,
"learning_rate": 3.7182044460231605e-07,
"loss": 0.0793,
"step": 3950
},
{
"epoch": 1.7793610319484026,
"grad_norm": 0.1857139907781371,
"learning_rate": 3.6442414210273834e-07,
"loss": 0.0798,
"step": 3955
},
{
"epoch": 1.7816109194540273,
"grad_norm": 0.2556479800636284,
"learning_rate": 3.570993616738866e-07,
"loss": 0.0848,
"step": 3960
},
{
"epoch": 1.783860806959652,
"grad_norm": 0.20166968130742072,
"learning_rate": 3.498462163294386e-07,
"loss": 0.0811,
"step": 3965
},
{
"epoch": 1.7861106944652767,
"grad_norm": 0.2536060487731229,
"learning_rate": 3.426648179778147e-07,
"loss": 0.0953,
"step": 3970
},
{
"epoch": 1.7883605819709014,
"grad_norm": 0.2112175288890015,
"learning_rate": 3.355552774204551e-07,
"loss": 0.0762,
"step": 3975
},
{
"epoch": 1.7906104694765261,
"grad_norm": 0.2295537149666403,
"learning_rate": 3.2851770435010864e-07,
"loss": 0.0767,
"step": 3980
},
{
"epoch": 1.7928603569821508,
"grad_norm": 0.2866042831708544,
"learning_rate": 3.215522073491434e-07,
"loss": 0.0822,
"step": 3985
},
{
"epoch": 1.7951102444877756,
"grad_norm": 0.21056995297624528,
"learning_rate": 3.1465889388786697e-07,
"loss": 0.0884,
"step": 3990
},
{
"epoch": 1.7973601319934003,
"grad_norm": 0.2722790864581489,
"learning_rate": 3.0783787032287407e-07,
"loss": 0.0881,
"step": 3995
},
{
"epoch": 1.799610019499025,
"grad_norm": 0.2828340673761126,
"learning_rate": 3.010892418953981e-07,
"loss": 0.0791,
"step": 4000
},
{
"epoch": 1.80185990700465,
"grad_norm": 0.2144007429401686,
"learning_rate": 2.9441311272969343e-07,
"loss": 0.067,
"step": 4005
},
{
"epoch": 1.8041097945102744,
"grad_norm": 0.21922591950990084,
"learning_rate": 2.878095858314278e-07,
"loss": 0.0702,
"step": 4010
},
{
"epoch": 1.8063596820158994,
"grad_norm": 0.20896747701719126,
"learning_rate": 2.812787630860919e-07,
"loss": 0.078,
"step": 4015
},
{
"epoch": 1.8086095695215239,
"grad_norm": 0.21476087685701412,
"learning_rate": 2.7482074525742477e-07,
"loss": 0.0688,
"step": 4020
},
{
"epoch": 1.8108594570271488,
"grad_norm": 0.24301133633323727,
"learning_rate": 2.6843563198586553e-07,
"loss": 0.0804,
"step": 4025
},
{
"epoch": 1.8131093445327733,
"grad_norm": 0.23191122306412676,
"learning_rate": 2.621235217870116e-07,
"loss": 0.0861,
"step": 4030
},
{
"epoch": 1.8153592320383982,
"grad_norm": 0.207067846018882,
"learning_rate": 2.55884512050098e-07,
"loss": 0.0886,
"step": 4035
},
{
"epoch": 1.8176091195440227,
"grad_norm": 0.22298796620779232,
"learning_rate": 2.4971869903649916e-07,
"loss": 0.0841,
"step": 4040
},
{
"epoch": 1.8198590070496476,
"grad_norm": 0.2933340830070678,
"learning_rate": 2.436261778782378e-07,
"loss": 0.0794,
"step": 4045
},
{
"epoch": 1.8221088945552721,
"grad_norm": 0.21972032956327708,
"learning_rate": 2.3760704257652145e-07,
"loss": 0.0774,
"step": 4050
},
{
"epoch": 1.824358782060897,
"grad_norm": 0.2597840708263632,
"learning_rate": 2.3166138600029198e-07,
"loss": 0.0772,
"step": 4055
},
{
"epoch": 1.8266086695665216,
"grad_norm": 0.24945833711183132,
"learning_rate": 2.257892998847916e-07,
"loss": 0.0758,
"step": 4060
},
{
"epoch": 1.8288585570721465,
"grad_norm": 0.2108872276998458,
"learning_rate": 2.1999087483014437e-07,
"loss": 0.0742,
"step": 4065
},
{
"epoch": 1.831108444577771,
"grad_norm": 0.2561087507310594,
"learning_rate": 2.1426620029996516e-07,
"loss": 0.078,
"step": 4070
},
{
"epoch": 1.833358332083396,
"grad_norm": 0.18787526118382977,
"learning_rate": 2.08615364619974e-07,
"loss": 0.0679,
"step": 4075
},
{
"epoch": 1.8356082195890204,
"grad_norm": 0.224194236181041,
"learning_rate": 2.0303845497663566e-07,
"loss": 0.0746,
"step": 4080
},
{
"epoch": 1.8378581070946454,
"grad_norm": 0.2392479281560687,
"learning_rate": 1.9753555741581277e-07,
"loss": 0.0764,
"step": 4085
},
{
"epoch": 1.8401079946002699,
"grad_norm": 0.21912472032156466,
"learning_rate": 1.921067568414403e-07,
"loss": 0.079,
"step": 4090
},
{
"epoch": 1.8423578821058948,
"grad_norm": 0.25670091205707113,
"learning_rate": 1.8675213701421223e-07,
"loss": 0.0835,
"step": 4095
},
{
"epoch": 1.8446077696115193,
"grad_norm": 0.2594177639066407,
"learning_rate": 1.814717805502958e-07,
"loss": 0.0803,
"step": 4100
},
{
"epoch": 1.8468576571171442,
"grad_norm": 0.21469179882031758,
"learning_rate": 1.762657689200481e-07,
"loss": 0.0764,
"step": 4105
},
{
"epoch": 1.8491075446227687,
"grad_norm": 0.2209544552641529,
"learning_rate": 1.7113418244676493e-07,
"loss": 0.0785,
"step": 4110
},
{
"epoch": 1.8513574321283937,
"grad_norm": 0.24145566509686753,
"learning_rate": 1.6607710030544122e-07,
"loss": 0.0719,
"step": 4115
},
{
"epoch": 1.8536073196340181,
"grad_norm": 0.2093424795846333,
"learning_rate": 1.6109460052154802e-07,
"loss": 0.0764,
"step": 4120
},
{
"epoch": 1.855857207139643,
"grad_norm": 0.24494002119656788,
"learning_rate": 1.561867599698258e-07,
"loss": 0.0798,
"step": 4125
},
{
"epoch": 1.8581070946452676,
"grad_norm": 0.22975170782618237,
"learning_rate": 1.5135365437310534e-07,
"loss": 0.0837,
"step": 4130
},
{
"epoch": 1.8603569821508925,
"grad_norm": 0.23509810998937047,
"learning_rate": 1.4659535830113368e-07,
"loss": 0.0784,
"step": 4135
},
{
"epoch": 1.8626068696565172,
"grad_norm": 0.21476403073025796,
"learning_rate": 1.419119451694262e-07,
"loss": 0.0735,
"step": 4140
},
{
"epoch": 1.864856757162142,
"grad_norm": 0.206525508501757,
"learning_rate": 1.3730348723813181e-07,
"loss": 0.0693,
"step": 4145
},
{
"epoch": 1.8671066446677667,
"grad_norm": 0.23677883602034755,
"learning_rate": 1.3277005561092016e-07,
"loss": 0.0765,
"step": 4150
},
{
"epoch": 1.8693565321733914,
"grad_norm": 0.22864240045396528,
"learning_rate": 1.2831172023388349e-07,
"loss": 0.0682,
"step": 4155
},
{
"epoch": 1.871606419679016,
"grad_norm": 0.23988380079630575,
"learning_rate": 1.2392854989445925e-07,
"loss": 0.0792,
"step": 4160
},
{
"epoch": 1.8738563071846408,
"grad_norm": 0.21420842660768485,
"learning_rate": 1.196206122203647e-07,
"loss": 0.0723,
"step": 4165
},
{
"epoch": 1.8761061946902655,
"grad_norm": 0.23599002078153936,
"learning_rate": 1.153879736785568e-07,
"loss": 0.0745,
"step": 4170
},
{
"epoch": 1.8783560821958902,
"grad_norm": 0.19939870757943454,
"learning_rate": 1.112306995742074e-07,
"loss": 0.0764,
"step": 4175
},
{
"epoch": 1.880605969701515,
"grad_norm": 0.3002967324611831,
"learning_rate": 1.0714885404969288e-07,
"loss": 0.0745,
"step": 4180
},
{
"epoch": 1.8828558572071397,
"grad_norm": 0.2534936155963215,
"learning_rate": 1.031425000836056e-07,
"loss": 0.0805,
"step": 4185
},
{
"epoch": 1.8851057447127644,
"grad_norm": 0.2099770583680312,
"learning_rate": 9.921169948978293e-08,
"loss": 0.0696,
"step": 4190
},
{
"epoch": 1.887355632218389,
"grad_norm": 0.21406906185927677,
"learning_rate": 9.535651291635362e-08,
"loss": 0.0695,
"step": 4195
},
{
"epoch": 1.8896055197240138,
"grad_norm": 0.222365141419886,
"learning_rate": 9.157699984480018e-08,
"loss": 0.0747,
"step": 4200
},
{
"epoch": 1.8918554072296385,
"grad_norm": 0.26953866043532604,
"learning_rate": 8.787321858904241e-08,
"loss": 0.0707,
"step": 4205
},
{
"epoch": 1.8941052947352632,
"grad_norm": 0.2625647283056072,
"learning_rate": 8.424522629453924e-08,
"loss": 0.0743,
"step": 4210
},
{
"epoch": 1.896355182240888,
"grad_norm": 0.23612000625927326,
"learning_rate": 8.06930789374033e-08,
"loss": 0.0763,
"step": 4215
},
{
"epoch": 1.8986050697465127,
"grad_norm": 0.20902097825740532,
"learning_rate": 7.721683132354163e-08,
"loss": 0.0744,
"step": 4220
},
{
"epoch": 1.9008549572521374,
"grad_norm": 0.24284466003504482,
"learning_rate": 7.381653708780578e-08,
"loss": 0.0741,
"step": 4225
},
{
"epoch": 1.903104844757762,
"grad_norm": 0.208669560652797,
"learning_rate": 7.049224869316807e-08,
"loss": 0.0711,
"step": 4230
},
{
"epoch": 1.9053547322633868,
"grad_norm": 0.2250655987088142,
"learning_rate": 6.724401742990993e-08,
"loss": 0.0689,
"step": 4235
},
{
"epoch": 1.9076046197690115,
"grad_norm": 0.21694307565283746,
"learning_rate": 6.407189341483044e-08,
"loss": 0.0761,
"step": 4240
},
{
"epoch": 1.9098545072746362,
"grad_norm": 0.19311480533839126,
"learning_rate": 6.097592559047405e-08,
"loss": 0.0743,
"step": 4245
},
{
"epoch": 1.912104394780261,
"grad_norm": 0.18525264406718234,
"learning_rate": 5.795616172437624e-08,
"loss": 0.0725,
"step": 4250
},
{
"epoch": 1.9143542822858857,
"grad_norm": 0.1651176792199085,
"learning_rate": 5.501264840832299e-08,
"loss": 0.0761,
"step": 4255
},
{
"epoch": 1.9166041697915104,
"grad_norm": 0.2260442730154628,
"learning_rate": 5.214543105763692e-08,
"loss": 0.0889,
"step": 4260
},
{
"epoch": 1.918854057297135,
"grad_norm": 0.18715741530378352,
"learning_rate": 4.935455391047228e-08,
"loss": 0.0663,
"step": 4265
},
{
"epoch": 1.9211039448027598,
"grad_norm": 0.27005890292615836,
"learning_rate": 4.664006002713495e-08,
"loss": 0.0728,
"step": 4270
},
{
"epoch": 1.9233538323083845,
"grad_norm": 0.2849847348297522,
"learning_rate": 4.400199128941573e-08,
"loss": 0.077,
"step": 4275
},
{
"epoch": 1.9256037198140095,
"grad_norm": 0.21209255590676854,
"learning_rate": 4.1440388399948686e-08,
"loss": 0.0668,
"step": 4280
},
{
"epoch": 1.927853607319634,
"grad_norm": 0.24917025560084188,
"learning_rate": 3.8955290881576566e-08,
"loss": 0.0731,
"step": 4285
},
{
"epoch": 1.930103494825259,
"grad_norm": 0.18031075827239754,
"learning_rate": 3.654673707674639e-08,
"loss": 0.0755,
"step": 4290
},
{
"epoch": 1.9323533823308834,
"grad_norm": 0.29159777908954887,
"learning_rate": 3.4214764146915936e-08,
"loss": 0.0771,
"step": 4295
},
{
"epoch": 1.9346032698365083,
"grad_norm": 0.2515730214349518,
"learning_rate": 3.195940807198039e-08,
"loss": 0.0718,
"step": 4300
},
{
"epoch": 1.9368531573421328,
"grad_norm": 0.2359716229508164,
"learning_rate": 2.9780703649716637e-08,
"loss": 0.075,
"step": 4305
},
{
"epoch": 1.9391030448477578,
"grad_norm": 0.2279891015303591,
"learning_rate": 2.767868449524813e-08,
"loss": 0.071,
"step": 4310
},
{
"epoch": 1.9413529323533822,
"grad_norm": 0.24505091349172803,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.074,
"step": 4315
},
{
"epoch": 1.9436028198590072,
"grad_norm": 0.2478550376131261,
"learning_rate": 2.370483053382111e-08,
"loss": 0.0771,
"step": 4320
},
{
"epoch": 1.9458527073646317,
"grad_norm": 0.2193182868657674,
"learning_rate": 2.183305703925831e-08,
"loss": 0.0756,
"step": 4325
},
{
"epoch": 1.9481025948702566,
"grad_norm": 0.2746040123293314,
"learning_rate": 2.0038091436337392e-08,
"loss": 0.0684,
"step": 4330
},
{
"epoch": 1.950352482375881,
"grad_norm": 0.27001950154682536,
"learning_rate": 1.8319961419493436e-08,
"loss": 0.0776,
"step": 4335
},
{
"epoch": 1.952602369881506,
"grad_norm": 0.17878344688895553,
"learning_rate": 1.667869349766982e-08,
"loss": 0.0684,
"step": 4340
},
{
"epoch": 1.9548522573871305,
"grad_norm": 0.2242691348688971,
"learning_rate": 1.5114312993908532e-08,
"loss": 0.0775,
"step": 4345
},
{
"epoch": 1.9571021448927555,
"grad_norm": 0.30659381638740507,
"learning_rate": 1.3626844044957733e-08,
"loss": 0.0742,
"step": 4350
},
{
"epoch": 1.95935203239838,
"grad_norm": 0.22927922548010668,
"learning_rate": 1.2216309600903142e-08,
"loss": 0.0758,
"step": 4355
},
{
"epoch": 1.961601919904005,
"grad_norm": 0.24862858256846118,
"learning_rate": 1.088273142481111e-08,
"loss": 0.0716,
"step": 4360
},
{
"epoch": 1.9638518074096294,
"grad_norm": 0.18094046454860382,
"learning_rate": 9.626130092393326e-09,
"loss": 0.0708,
"step": 4365
},
{
"epoch": 1.9661016949152543,
"grad_norm": 0.22402544663253343,
"learning_rate": 8.446524991689298e-09,
"loss": 0.0724,
"step": 4370
},
{
"epoch": 1.9683515824208788,
"grad_norm": 0.1742440920374198,
"learning_rate": 7.343934322767699e-09,
"loss": 0.0776,
"step": 4375
},
{
"epoch": 1.9706014699265038,
"grad_norm": 0.22337791056104178,
"learning_rate": 6.318375097446039e-09,
"loss": 0.0662,
"step": 4380
},
{
"epoch": 1.9728513574321282,
"grad_norm": 0.20821619733680874,
"learning_rate": 5.369863139026432e-09,
"loss": 0.0823,
"step": 4385
},
{
"epoch": 1.9751012449377532,
"grad_norm": 0.28459129908034536,
"learning_rate": 4.498413082053566e-09,
"loss": 0.0847,
"step": 4390
},
{
"epoch": 1.9773511324433777,
"grad_norm": 0.2487528636409247,
"learning_rate": 3.704038372085994e-09,
"loss": 0.0812,
"step": 4395
},
{
"epoch": 1.9796010199490026,
"grad_norm": 0.20877793863085078,
"learning_rate": 2.986751265493526e-09,
"loss": 0.0726,
"step": 4400
},
{
"epoch": 1.981850907454627,
"grad_norm": 0.2843353007301253,
"learning_rate": 2.3465628292623776e-09,
"loss": 0.0696,
"step": 4405
},
{
"epoch": 1.984100794960252,
"grad_norm": 0.20964930087522637,
"learning_rate": 1.7834829408286402e-09,
"loss": 0.0669,
"step": 4410
},
{
"epoch": 1.9863506824658768,
"grad_norm": 0.18510001166373868,
"learning_rate": 1.297520287923404e-09,
"loss": 0.0732,
"step": 4415
},
{
"epoch": 1.9886005699715015,
"grad_norm": 0.1927246478831311,
"learning_rate": 8.886823684417512e-10,
"loss": 0.0798,
"step": 4420
},
{
"epoch": 1.9908504574771262,
"grad_norm": 0.1773427078092757,
"learning_rate": 5.56975490322853e-10,
"loss": 0.0759,
"step": 4425
},
{
"epoch": 1.993100344982751,
"grad_norm": 0.2305382589496537,
"learning_rate": 3.0240477145559997e-10,
"loss": 0.0862,
"step": 4430
},
{
"epoch": 1.9953502324883756,
"grad_norm": 0.28373308100373656,
"learning_rate": 1.24974139599221e-10,
"loss": 0.0769,
"step": 4435
},
{
"epoch": 1.9976001199940003,
"grad_norm": 0.19382710057534525,
"learning_rate": 2.4686332322221286e-11,
"loss": 0.0755,
"step": 4440
},
{
"epoch": 1.9994000299985002,
"eval_loss": 0.08069541305303574,
"eval_runtime": 54.013,
"eval_samples_per_second": 19.958,
"eval_steps_per_second": 4.999,
"step": 4444
},
{
"epoch": 1.9994000299985002,
"step": 4444,
"total_flos": 1.2064109084748546e+19,
"train_loss": 0.2652757017731559,
"train_runtime": 35452.6785,
"train_samples_per_second": 6.017,
"train_steps_per_second": 0.125
}
],
"logging_steps": 5,
"max_steps": 4444,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2064109084748546e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}