{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1911,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005232862375719519,
"grad_norm": 1.6008316731720644,
"learning_rate": 5.208333333333333e-08,
"loss": 1.681,
"step": 1
},
{
"epoch": 0.0026164311878597592,
"grad_norm": 1.5124995634557081,
"learning_rate": 2.604166666666667e-07,
"loss": 1.7583,
"step": 5
},
{
"epoch": 0.0052328623757195184,
"grad_norm": 1.6881183838300193,
"learning_rate": 5.208333333333334e-07,
"loss": 1.7784,
"step": 10
},
{
"epoch": 0.007849293563579277,
"grad_norm": 1.6569959855142122,
"learning_rate": 7.8125e-07,
"loss": 1.7571,
"step": 15
},
{
"epoch": 0.010465724751439037,
"grad_norm": 1.4876244750187468,
"learning_rate": 1.0416666666666667e-06,
"loss": 1.7796,
"step": 20
},
{
"epoch": 0.013082155939298797,
"grad_norm": 1.555761827132201,
"learning_rate": 1.3020833333333335e-06,
"loss": 1.7455,
"step": 25
},
{
"epoch": 0.015698587127158554,
"grad_norm": 1.3344622915583402,
"learning_rate": 1.5625e-06,
"loss": 1.713,
"step": 30
},
{
"epoch": 0.018315018315018316,
"grad_norm": 1.6878431208167544,
"learning_rate": 1.8229166666666666e-06,
"loss": 1.753,
"step": 35
},
{
"epoch": 0.020931449502878074,
"grad_norm": 1.925154137160916,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.7583,
"step": 40
},
{
"epoch": 0.023547880690737835,
"grad_norm": 2.0658602507700143,
"learning_rate": 2.3437500000000002e-06,
"loss": 1.6918,
"step": 45
},
{
"epoch": 0.026164311878597593,
"grad_norm": 2.2843015724617484,
"learning_rate": 2.604166666666667e-06,
"loss": 1.6991,
"step": 50
},
{
"epoch": 0.02878074306645735,
"grad_norm": 2.0076765426852905,
"learning_rate": 2.8645833333333334e-06,
"loss": 1.6578,
"step": 55
},
{
"epoch": 0.03139717425431711,
"grad_norm": 1.5242713335796327,
"learning_rate": 3.125e-06,
"loss": 1.6201,
"step": 60
},
{
"epoch": 0.034013605442176874,
"grad_norm": 1.202884778723732,
"learning_rate": 3.385416666666667e-06,
"loss": 1.6545,
"step": 65
},
{
"epoch": 0.03663003663003663,
"grad_norm": 0.9017919097747348,
"learning_rate": 3.6458333333333333e-06,
"loss": 1.6182,
"step": 70
},
{
"epoch": 0.03924646781789639,
"grad_norm": 0.7155469892761825,
"learning_rate": 3.90625e-06,
"loss": 1.5853,
"step": 75
},
{
"epoch": 0.04186289900575615,
"grad_norm": 0.5537857406833877,
"learning_rate": 4.166666666666667e-06,
"loss": 1.6142,
"step": 80
},
{
"epoch": 0.044479330193615906,
"grad_norm": 0.5008944354387821,
"learning_rate": 4.427083333333334e-06,
"loss": 1.5458,
"step": 85
},
{
"epoch": 0.04709576138147567,
"grad_norm": 0.4217272654697234,
"learning_rate": 4.6875000000000004e-06,
"loss": 1.5173,
"step": 90
},
{
"epoch": 0.04971219256933543,
"grad_norm": 0.39469142742095514,
"learning_rate": 4.947916666666667e-06,
"loss": 1.5501,
"step": 95
},
{
"epoch": 0.052328623757195186,
"grad_norm": 0.34359153398155595,
"learning_rate": 5.208333333333334e-06,
"loss": 1.5213,
"step": 100
},
{
"epoch": 0.054945054945054944,
"grad_norm": 0.3099893379425512,
"learning_rate": 5.468750000000001e-06,
"loss": 1.6151,
"step": 105
},
{
"epoch": 0.0575614861329147,
"grad_norm": 0.28278343285367047,
"learning_rate": 5.729166666666667e-06,
"loss": 1.514,
"step": 110
},
{
"epoch": 0.06017791732077447,
"grad_norm": 0.25881346963420426,
"learning_rate": 5.989583333333334e-06,
"loss": 1.5072,
"step": 115
},
{
"epoch": 0.06279434850863422,
"grad_norm": 0.24915604462449628,
"learning_rate": 6.25e-06,
"loss": 1.5199,
"step": 120
},
{
"epoch": 0.06541077969649398,
"grad_norm": 0.2485517174096053,
"learning_rate": 6.510416666666667e-06,
"loss": 1.5366,
"step": 125
},
{
"epoch": 0.06802721088435375,
"grad_norm": 0.22712811100348562,
"learning_rate": 6.770833333333334e-06,
"loss": 1.504,
"step": 130
},
{
"epoch": 0.0706436420722135,
"grad_norm": 0.21446325271380284,
"learning_rate": 7.031250000000001e-06,
"loss": 1.5327,
"step": 135
},
{
"epoch": 0.07326007326007326,
"grad_norm": 0.19651591348301906,
"learning_rate": 7.291666666666667e-06,
"loss": 1.4778,
"step": 140
},
{
"epoch": 0.07587650444793302,
"grad_norm": 0.21362640404068758,
"learning_rate": 7.552083333333334e-06,
"loss": 1.4924,
"step": 145
},
{
"epoch": 0.07849293563579278,
"grad_norm": 0.2086701893979055,
"learning_rate": 7.8125e-06,
"loss": 1.4934,
"step": 150
},
{
"epoch": 0.08110936682365254,
"grad_norm": 0.19722374493413125,
"learning_rate": 8.072916666666667e-06,
"loss": 1.5194,
"step": 155
},
{
"epoch": 0.0837257980115123,
"grad_norm": 0.197232116513064,
"learning_rate": 8.333333333333334e-06,
"loss": 1.5089,
"step": 160
},
{
"epoch": 0.08634222919937205,
"grad_norm": 0.19736927732072057,
"learning_rate": 8.59375e-06,
"loss": 1.493,
"step": 165
},
{
"epoch": 0.08895866038723181,
"grad_norm": 0.18999653216382684,
"learning_rate": 8.854166666666667e-06,
"loss": 1.5084,
"step": 170
},
{
"epoch": 0.09157509157509157,
"grad_norm": 0.18816442922674095,
"learning_rate": 9.114583333333334e-06,
"loss": 1.4939,
"step": 175
},
{
"epoch": 0.09419152276295134,
"grad_norm": 0.19457985638812048,
"learning_rate": 9.375000000000001e-06,
"loss": 1.5115,
"step": 180
},
{
"epoch": 0.0968079539508111,
"grad_norm": 0.18306860662282393,
"learning_rate": 9.635416666666668e-06,
"loss": 1.51,
"step": 185
},
{
"epoch": 0.09942438513867086,
"grad_norm": 0.1721853071759001,
"learning_rate": 9.895833333333334e-06,
"loss": 1.5121,
"step": 190
},
{
"epoch": 0.10204081632653061,
"grad_norm": 0.18129124345256323,
"learning_rate": 9.999924849924331e-06,
"loss": 1.5169,
"step": 195
},
{
"epoch": 0.10465724751439037,
"grad_norm": 0.17875223053180506,
"learning_rate": 9.999465607642677e-06,
"loss": 1.4897,
"step": 200
},
{
"epoch": 0.10727367870225013,
"grad_norm": 0.1765984203759676,
"learning_rate": 9.998588911421522e-06,
"loss": 1.4531,
"step": 205
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.16783670979039028,
"learning_rate": 9.99729483446475e-06,
"loss": 1.4821,
"step": 210
},
{
"epoch": 0.11250654107796965,
"grad_norm": 0.17395143750443126,
"learning_rate": 9.995583484827415e-06,
"loss": 1.5229,
"step": 215
},
{
"epoch": 0.1151229722658294,
"grad_norm": 0.16605004371887766,
"learning_rate": 9.993455005406717e-06,
"loss": 1.441,
"step": 220
},
{
"epoch": 0.11773940345368916,
"grad_norm": 0.1700314362693217,
"learning_rate": 9.990909573930075e-06,
"loss": 1.4892,
"step": 225
},
{
"epoch": 0.12035583464154893,
"grad_norm": 0.16948161898267894,
"learning_rate": 9.987947402940285e-06,
"loss": 1.5555,
"step": 230
},
{
"epoch": 0.12297226582940869,
"grad_norm": 0.16463177711173213,
"learning_rate": 9.984568739777776e-06,
"loss": 1.4554,
"step": 235
},
{
"epoch": 0.12558869701726844,
"grad_norm": 0.16770654591359932,
"learning_rate": 9.980773866559946e-06,
"loss": 1.4815,
"step": 240
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.16640092407199292,
"learning_rate": 9.976563100157615e-06,
"loss": 1.4868,
"step": 245
},
{
"epoch": 0.13082155939298795,
"grad_norm": 0.16699146902728054,
"learning_rate": 9.971936792168569e-06,
"loss": 1.4735,
"step": 250
},
{
"epoch": 0.13343799058084774,
"grad_norm": 0.1743960884970157,
"learning_rate": 9.966895328888195e-06,
"loss": 1.4793,
"step": 255
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.1702507585575381,
"learning_rate": 9.961439131277223e-06,
"loss": 1.4823,
"step": 260
},
{
"epoch": 0.13867085295656725,
"grad_norm": 0.16235653984108958,
"learning_rate": 9.955568654926582e-06,
"loss": 1.4555,
"step": 265
},
{
"epoch": 0.141287284144427,
"grad_norm": 0.15373757230170634,
"learning_rate": 9.949284390019362e-06,
"loss": 1.4332,
"step": 270
},
{
"epoch": 0.14390371533228677,
"grad_norm": 0.1639742183162807,
"learning_rate": 9.942586861289874e-06,
"loss": 1.497,
"step": 275
},
{
"epoch": 0.14652014652014653,
"grad_norm": 0.1632185984817443,
"learning_rate": 9.935476627979837e-06,
"loss": 1.4556,
"step": 280
},
{
"epoch": 0.14913657770800628,
"grad_norm": 0.16327675062600183,
"learning_rate": 9.927954283791687e-06,
"loss": 1.4614,
"step": 285
},
{
"epoch": 0.15175300889586604,
"grad_norm": 0.17441801496249207,
"learning_rate": 9.920020456838998e-06,
"loss": 1.5062,
"step": 290
},
{
"epoch": 0.1543694400837258,
"grad_norm": 0.16030662489136668,
"learning_rate": 9.911675809594042e-06,
"loss": 1.4913,
"step": 295
},
{
"epoch": 0.15698587127158556,
"grad_norm": 0.1573623101473936,
"learning_rate": 9.902921038832456e-06,
"loss": 1.4614,
"step": 300
},
{
"epoch": 0.15960230245944532,
"grad_norm": 0.15793397911742527,
"learning_rate": 9.893756875575082e-06,
"loss": 1.4465,
"step": 305
},
{
"epoch": 0.16221873364730507,
"grad_norm": 0.16568484828821042,
"learning_rate": 9.884184085026918e-06,
"loss": 1.4444,
"step": 310
},
{
"epoch": 0.16483516483516483,
"grad_norm": 0.17311490884226297,
"learning_rate": 9.874203466513215e-06,
"loss": 1.4701,
"step": 315
},
{
"epoch": 0.1674515960230246,
"grad_norm": 0.1721852603261551,
"learning_rate": 9.863815853412748e-06,
"loss": 1.4582,
"step": 320
},
{
"epoch": 0.17006802721088435,
"grad_norm": 0.164698820484247,
"learning_rate": 9.853022113088223e-06,
"loss": 1.5308,
"step": 325
},
{
"epoch": 0.1726844583987441,
"grad_norm": 0.1586763981745356,
"learning_rate": 9.84182314681385e-06,
"loss": 1.4744,
"step": 330
},
{
"epoch": 0.17530088958660386,
"grad_norm": 0.15689735846007058,
"learning_rate": 9.83021988970009e-06,
"loss": 1.4753,
"step": 335
},
{
"epoch": 0.17791732077446362,
"grad_norm": 0.1575630177309333,
"learning_rate": 9.818213310615575e-06,
"loss": 1.4579,
"step": 340
},
{
"epoch": 0.18053375196232338,
"grad_norm": 0.1631768128880243,
"learning_rate": 9.805804412106197e-06,
"loss": 1.4541,
"step": 345
},
{
"epoch": 0.18315018315018314,
"grad_norm": 0.15686973370987312,
"learning_rate": 9.792994230311419e-06,
"loss": 1.406,
"step": 350
},
{
"epoch": 0.1857666143380429,
"grad_norm": 0.1605407693787364,
"learning_rate": 9.779783834877727e-06,
"loss": 1.4506,
"step": 355
},
{
"epoch": 0.18838304552590268,
"grad_norm": 0.1566921205098481,
"learning_rate": 9.766174328869344e-06,
"loss": 1.4208,
"step": 360
},
{
"epoch": 0.19099947671376244,
"grad_norm": 0.1558094611868576,
"learning_rate": 9.752166848676101e-06,
"loss": 1.4643,
"step": 365
},
{
"epoch": 0.1936159079016222,
"grad_norm": 0.15672567518111116,
"learning_rate": 9.737762563918564e-06,
"loss": 1.4534,
"step": 370
},
{
"epoch": 0.19623233908948196,
"grad_norm": 0.1535562279342849,
"learning_rate": 9.722962677350367e-06,
"loss": 1.4771,
"step": 375
},
{
"epoch": 0.1988487702773417,
"grad_norm": 0.183419499409996,
"learning_rate": 9.707768424757778e-06,
"loss": 1.4415,
"step": 380
},
{
"epoch": 0.20146520146520147,
"grad_norm": 0.16119980470890002,
"learning_rate": 9.692181074856515e-06,
"loss": 1.523,
"step": 385
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.16455686201587852,
"learning_rate": 9.676201929185809e-06,
"loss": 1.4638,
"step": 390
},
{
"epoch": 0.206698063840921,
"grad_norm": 0.15623412201622025,
"learning_rate": 9.659832321999727e-06,
"loss": 1.4955,
"step": 395
},
{
"epoch": 0.20931449502878074,
"grad_norm": 0.15115523058807412,
"learning_rate": 9.643073620155755e-06,
"loss": 1.4509,
"step": 400
},
{
"epoch": 0.2119309262166405,
"grad_norm": 0.16043581067705567,
"learning_rate": 9.625927223000679e-06,
"loss": 1.5064,
"step": 405
},
{
"epoch": 0.21454735740450026,
"grad_norm": 0.16061562830754533,
"learning_rate": 9.608394562253724e-06,
"loss": 1.44,
"step": 410
},
{
"epoch": 0.21716378859236002,
"grad_norm": 0.1610549978873481,
"learning_rate": 9.590477101887016e-06,
"loss": 1.5057,
"step": 415
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.16201649106784555,
"learning_rate": 9.572176338003341e-06,
"loss": 1.4478,
"step": 420
},
{
"epoch": 0.22239665096807953,
"grad_norm": 0.15700846939992966,
"learning_rate": 9.553493798711217e-06,
"loss": 1.4529,
"step": 425
},
{
"epoch": 0.2250130821559393,
"grad_norm": 0.16614805498541974,
"learning_rate": 9.534431043997298e-06,
"loss": 1.4327,
"step": 430
},
{
"epoch": 0.22762951334379905,
"grad_norm": 0.15339862150367098,
"learning_rate": 9.514989665596114e-06,
"loss": 1.468,
"step": 435
},
{
"epoch": 0.2302459445316588,
"grad_norm": 0.16105247258500596,
"learning_rate": 9.495171286857171e-06,
"loss": 1.4999,
"step": 440
},
{
"epoch": 0.23286237571951857,
"grad_norm": 0.14812137703125702,
"learning_rate": 9.47497756260939e-06,
"loss": 1.4194,
"step": 445
},
{
"epoch": 0.23547880690737832,
"grad_norm": 0.15329658500251478,
"learning_rate": 9.454410179022932e-06,
"loss": 1.4579,
"step": 450
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.1615690171559319,
"learning_rate": 9.433470853468409e-06,
"loss": 1.4626,
"step": 455
},
{
"epoch": 0.24071166928309787,
"grad_norm": 0.1545180409071662,
"learning_rate": 9.412161334373477e-06,
"loss": 1.4709,
"step": 460
},
{
"epoch": 0.24332810047095763,
"grad_norm": 0.15399340754206403,
"learning_rate": 9.39048340107685e-06,
"loss": 1.4776,
"step": 465
},
{
"epoch": 0.24594453165881738,
"grad_norm": 0.15962881447905178,
"learning_rate": 9.36843886367972e-06,
"loss": 1.4593,
"step": 470
},
{
"epoch": 0.24856096284667714,
"grad_norm": 0.1630934673570493,
"learning_rate": 9.346029562894616e-06,
"loss": 1.4681,
"step": 475
},
{
"epoch": 0.25117739403453687,
"grad_norm": 0.1648479272141616,
"learning_rate": 9.323257369891702e-06,
"loss": 1.4364,
"step": 480
},
{
"epoch": 0.25379382522239663,
"grad_norm": 0.17086807544238972,
"learning_rate": 9.300124186142542e-06,
"loss": 1.483,
"step": 485
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.1554704282008905,
"learning_rate": 9.276631943261325e-06,
"loss": 1.439,
"step": 490
},
{
"epoch": 0.25902668759811615,
"grad_norm": 0.15622543010651593,
"learning_rate": 9.252782602843565e-06,
"loss": 1.5031,
"step": 495
},
{
"epoch": 0.2616431187859759,
"grad_norm": 0.16771215479185553,
"learning_rate": 9.228578156302327e-06,
"loss": 1.4406,
"step": 500
},
{
"epoch": 0.26425954997383566,
"grad_norm": 0.16484311397184792,
"learning_rate": 9.204020624701932e-06,
"loss": 1.4803,
"step": 505
},
{
"epoch": 0.2668759811616955,
"grad_norm": 0.16013346151268085,
"learning_rate": 9.1791120585892e-06,
"loss": 1.4161,
"step": 510
},
{
"epoch": 0.26949241234955523,
"grad_norm": 0.16165617987217745,
"learning_rate": 9.153854537822235e-06,
"loss": 1.464,
"step": 515
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.15775246590680753,
"learning_rate": 9.12825017139675e-06,
"loss": 1.4856,
"step": 520
},
{
"epoch": 0.27472527472527475,
"grad_norm": 0.15261801905816583,
"learning_rate": 9.102301097269974e-06,
"loss": 1.4921,
"step": 525
},
{
"epoch": 0.2773417059131345,
"grad_norm": 0.17027676618685536,
"learning_rate": 9.076009482182132e-06,
"loss": 1.4594,
"step": 530
},
{
"epoch": 0.27995813710099426,
"grad_norm": 0.15658397006356073,
"learning_rate": 9.049377521475514e-06,
"loss": 1.4449,
"step": 535
},
{
"epoch": 0.282574568288854,
"grad_norm": 0.16413577886883224,
"learning_rate": 9.022407438911177e-06,
"loss": 1.4784,
"step": 540
},
{
"epoch": 0.2851909994767138,
"grad_norm": 0.15420397374950906,
"learning_rate": 8.99510148648325e-06,
"loss": 1.4453,
"step": 545
},
{
"epoch": 0.28780743066457354,
"grad_norm": 0.15750798306390343,
"learning_rate": 8.967461944230908e-06,
"loss": 1.4498,
"step": 550
},
{
"epoch": 0.2904238618524333,
"grad_norm": 0.1631028988305524,
"learning_rate": 8.939491120047974e-06,
"loss": 1.4998,
"step": 555
},
{
"epoch": 0.29304029304029305,
"grad_norm": 0.16042232539081466,
"learning_rate": 8.911191349490215e-06,
"loss": 1.4617,
"step": 560
},
{
"epoch": 0.2956567242281528,
"grad_norm": 0.15786335167092433,
"learning_rate": 8.882564995580329e-06,
"loss": 1.4675,
"step": 565
},
{
"epoch": 0.29827315541601257,
"grad_norm": 0.1509933414399425,
"learning_rate": 8.85361444861063e-06,
"loss": 1.449,
"step": 570
},
{
"epoch": 0.3008895866038723,
"grad_norm": 0.16307057514008638,
"learning_rate": 8.824342125943461e-06,
"loss": 1.4394,
"step": 575
},
{
"epoch": 0.3035060177917321,
"grad_norm": 0.15286343283349654,
"learning_rate": 8.79475047180934e-06,
"loss": 1.4926,
"step": 580
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.15816557747601245,
"learning_rate": 8.764841957102866e-06,
"loss": 1.4991,
"step": 585
},
{
"epoch": 0.3087388801674516,
"grad_norm": 0.15992999802079433,
"learning_rate": 8.734619079176416e-06,
"loss": 1.4967,
"step": 590
},
{
"epoch": 0.31135531135531136,
"grad_norm": 0.15559994076241815,
"learning_rate": 8.704084361631597e-06,
"loss": 1.4602,
"step": 595
},
{
"epoch": 0.3139717425431711,
"grad_norm": 0.16010926275335507,
"learning_rate": 8.673240354108539e-06,
"loss": 1.4233,
"step": 600
},
{
"epoch": 0.3165881737310309,
"grad_norm": 0.16290886252895473,
"learning_rate": 8.642089632072992e-06,
"loss": 1.4039,
"step": 605
},
{
"epoch": 0.31920460491889063,
"grad_norm": 0.1586363120366518,
"learning_rate": 8.61063479660128e-06,
"loss": 1.4828,
"step": 610
},
{
"epoch": 0.3218210361067504,
"grad_norm": 0.16209454857234118,
"learning_rate": 8.578878474163115e-06,
"loss": 1.5014,
"step": 615
},
{
"epoch": 0.32443746729461015,
"grad_norm": 0.15224653908638042,
"learning_rate": 8.546823316402282e-06,
"loss": 1.4809,
"step": 620
},
{
"epoch": 0.3270538984824699,
"grad_norm": 0.159244210541448,
"learning_rate": 8.514471999915229e-06,
"loss": 1.4946,
"step": 625
},
{
"epoch": 0.32967032967032966,
"grad_norm": 0.1521622865645663,
"learning_rate": 8.48182722602757e-06,
"loss": 1.4713,
"step": 630
},
{
"epoch": 0.3322867608581894,
"grad_norm": 0.15381664517913313,
"learning_rate": 8.448891720568535e-06,
"loss": 1.4452,
"step": 635
},
{
"epoch": 0.3349031920460492,
"grad_norm": 0.1582064130471886,
"learning_rate": 8.415668233643346e-06,
"loss": 1.4305,
"step": 640
},
{
"epoch": 0.33751962323390894,
"grad_norm": 0.16426813337977975,
"learning_rate": 8.382159539403605e-06,
"loss": 1.5116,
"step": 645
},
{
"epoch": 0.3401360544217687,
"grad_norm": 0.15908674556867114,
"learning_rate": 8.348368435815636e-06,
"loss": 1.4454,
"step": 650
},
{
"epoch": 0.34275248560962845,
"grad_norm": 0.15658041474741916,
"learning_rate": 8.314297744426865e-06,
"loss": 1.4428,
"step": 655
},
{
"epoch": 0.3453689167974882,
"grad_norm": 0.15747154649657244,
"learning_rate": 8.279950310130218e-06,
"loss": 1.4417,
"step": 660
},
{
"epoch": 0.34798534798534797,
"grad_norm": 0.1589936285906929,
"learning_rate": 8.245329000926574e-06,
"loss": 1.4563,
"step": 665
},
{
"epoch": 0.35060177917320773,
"grad_norm": 0.1613181591302504,
"learning_rate": 8.210436707685286e-06,
"loss": 1.4754,
"step": 670
},
{
"epoch": 0.3532182103610675,
"grad_norm": 0.165332497765771,
"learning_rate": 8.175276343902802e-06,
"loss": 1.4547,
"step": 675
},
{
"epoch": 0.35583464154892724,
"grad_norm": 0.1578563622480374,
"learning_rate": 8.139850845459378e-06,
"loss": 1.4565,
"step": 680
},
{
"epoch": 0.358451072736787,
"grad_norm": 0.16107629875736493,
"learning_rate": 8.104163170373942e-06,
"loss": 1.4473,
"step": 685
},
{
"epoch": 0.36106750392464676,
"grad_norm": 0.16625542644136174,
"learning_rate": 8.068216298557088e-06,
"loss": 1.4833,
"step": 690
},
{
"epoch": 0.3636839351125065,
"grad_norm": 0.15680894783853527,
"learning_rate": 8.032013231562271e-06,
"loss": 1.4802,
"step": 695
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.16181472227400934,
"learning_rate": 7.995556992335168e-06,
"loss": 1.4705,
"step": 700
},
{
"epoch": 0.36891679748822603,
"grad_norm": 0.15686803017929454,
"learning_rate": 7.95885062496126e-06,
"loss": 1.4592,
"step": 705
},
{
"epoch": 0.3715332286760858,
"grad_norm": 0.1579836788382813,
"learning_rate": 7.92189719441166e-06,
"loss": 1.4179,
"step": 710
},
{
"epoch": 0.3741496598639456,
"grad_norm": 0.15713818521977135,
"learning_rate": 7.884699786287188e-06,
"loss": 1.4881,
"step": 715
},
{
"epoch": 0.37676609105180536,
"grad_norm": 0.15909147673646215,
"learning_rate": 7.847261506560716e-06,
"loss": 1.4779,
"step": 720
},
{
"epoch": 0.3793825222396651,
"grad_norm": 0.15891568593086702,
"learning_rate": 7.809585481317824e-06,
"loss": 1.4525,
"step": 725
},
{
"epoch": 0.3819989534275249,
"grad_norm": 0.1555272930916928,
"learning_rate": 7.77167485649578e-06,
"loss": 1.4803,
"step": 730
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.15344838974292666,
"learning_rate": 7.733532797620849e-06,
"loss": 1.4693,
"step": 735
},
{
"epoch": 0.3872318158032444,
"grad_norm": 0.15915416421044593,
"learning_rate": 7.695162489543966e-06,
"loss": 1.5005,
"step": 740
},
{
"epoch": 0.38984824699110415,
"grad_norm": 0.1642521243641894,
"learning_rate": 7.656567136174817e-06,
"loss": 1.4154,
"step": 745
},
{
"epoch": 0.3924646781789639,
"grad_norm": 0.1522074237366889,
"learning_rate": 7.6177499602143e-06,
"loss": 1.4141,
"step": 750
},
{
"epoch": 0.39508110936682367,
"grad_norm": 0.15826765900009956,
"learning_rate": 7.578714202885436e-06,
"loss": 1.491,
"step": 755
},
{
"epoch": 0.3976975405546834,
"grad_norm": 0.16460953142733703,
"learning_rate": 7.53946312366273e-06,
"loss": 1.4553,
"step": 760
},
{
"epoch": 0.4003139717425432,
"grad_norm": 0.16346924185292147,
"learning_rate": 7.500000000000001e-06,
"loss": 1.3906,
"step": 765
},
{
"epoch": 0.40293040293040294,
"grad_norm": 0.16031939514368226,
"learning_rate": 7.460328127056718e-06,
"loss": 1.4978,
"step": 770
},
{
"epoch": 0.4055468341182627,
"grad_norm": 0.1582953066263729,
"learning_rate": 7.420450817422855e-06,
"loss": 1.4618,
"step": 775
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.16134419804701114,
"learning_rate": 7.38037140084229e-06,
"loss": 1.4654,
"step": 780
},
{
"epoch": 0.4107796964939822,
"grad_norm": 0.1590857338863284,
"learning_rate": 7.340093223934775e-06,
"loss": 1.4632,
"step": 785
},
{
"epoch": 0.413396127681842,
"grad_norm": 0.1599207804357529,
"learning_rate": 7.29961964991649e-06,
"loss": 1.4222,
"step": 790
},
{
"epoch": 0.41601255886970173,
"grad_norm": 0.16680739008193485,
"learning_rate": 7.2589540583192165e-06,
"loss": 1.5157,
"step": 795
},
{
"epoch": 0.4186289900575615,
"grad_norm": 0.15668314190198573,
"learning_rate": 7.218099844708152e-06,
"loss": 1.4838,
"step": 800
},
{
"epoch": 0.42124542124542125,
"grad_norm": 0.16035073862413637,
"learning_rate": 7.177060420398376e-06,
"loss": 1.4573,
"step": 805
},
{
"epoch": 0.423861852433281,
"grad_norm": 0.15900953162107145,
"learning_rate": 7.135839212170008e-06,
"loss": 1.4492,
"step": 810
},
{
"epoch": 0.42647828362114076,
"grad_norm": 0.16479547257307764,
"learning_rate": 7.094439661982072e-06,
"loss": 1.4661,
"step": 815
},
{
"epoch": 0.4290947148090005,
"grad_norm": 0.15392631508278395,
"learning_rate": 7.0528652266850935e-06,
"loss": 1.4521,
"step": 820
},
{
"epoch": 0.4317111459968603,
"grad_norm": 0.15684281936206454,
"learning_rate": 7.011119377732459e-06,
"loss": 1.4509,
"step": 825
},
{
"epoch": 0.43432757718472004,
"grad_norm": 0.16360635983799987,
"learning_rate": 6.969205600890539e-06,
"loss": 1.4661,
"step": 830
},
{
"epoch": 0.4369440083725798,
"grad_norm": 0.15446157313570677,
"learning_rate": 6.9271273959476415e-06,
"loss": 1.4724,
"step": 835
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.15684403662844304,
"learning_rate": 6.884888276421766e-06,
"loss": 1.4417,
"step": 840
},
{
"epoch": 0.4421768707482993,
"grad_norm": 0.1623177931087071,
"learning_rate": 6.842491769267241e-06,
"loss": 1.4657,
"step": 845
},
{
"epoch": 0.44479330193615907,
"grad_norm": 0.15191793713718904,
"learning_rate": 6.79994141458021e-06,
"loss": 1.4758,
"step": 850
},
{
"epoch": 0.4474097331240188,
"grad_norm": 0.1580816385183261,
"learning_rate": 6.757240765303047e-06,
"loss": 1.4354,
"step": 855
},
{
"epoch": 0.4500261643118786,
"grad_norm": 0.15469805356817723,
"learning_rate": 6.7143933869276755e-06,
"loss": 1.423,
"step": 860
},
{
"epoch": 0.45264259549973834,
"grad_norm": 0.15867250364888047,
"learning_rate": 6.671402857197864e-06,
"loss": 1.4333,
"step": 865
},
{
"epoch": 0.4552590266875981,
"grad_norm": 0.15450873366986678,
"learning_rate": 6.628272765810468e-06,
"loss": 1.5001,
"step": 870
},
{
"epoch": 0.45787545787545786,
"grad_norm": 0.15904540784415175,
"learning_rate": 6.585006714115709e-06,
"loss": 1.4568,
"step": 875
},
{
"epoch": 0.4604918890633176,
"grad_norm": 0.18783657076424265,
"learning_rate": 6.541608314816451e-06,
"loss": 1.4828,
"step": 880
},
{
"epoch": 0.4631083202511774,
"grad_norm": 0.15633907898033594,
"learning_rate": 6.498081191666549e-06,
"loss": 1.3961,
"step": 885
},
{
"epoch": 0.46572475143903713,
"grad_norm": 0.1562637792140999,
"learning_rate": 6.454428979168257e-06,
"loss": 1.4612,
"step": 890
},
{
"epoch": 0.4683411826268969,
"grad_norm": 0.16274637193792332,
"learning_rate": 6.410655322268758e-06,
"loss": 1.4744,
"step": 895
},
{
"epoch": 0.47095761381475665,
"grad_norm": 0.16139617681241109,
"learning_rate": 6.3667638760558055e-06,
"loss": 1.4937,
"step": 900
},
{
"epoch": 0.4735740450026164,
"grad_norm": 0.16266632807711848,
"learning_rate": 6.3227583054525296e-06,
"loss": 1.4359,
"step": 905
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.15220372261088064,
"learning_rate": 6.2786422849114074e-06,
"loss": 1.4534,
"step": 910
},
{
"epoch": 0.478806907378336,
"grad_norm": 0.1585996226107583,
"learning_rate": 6.2344194981074616e-06,
"loss": 1.4731,
"step": 915
},
{
"epoch": 0.48142333856619574,
"grad_norm": 0.164545533910534,
"learning_rate": 6.190093637630662e-06,
"loss": 1.4888,
"step": 920
},
{
"epoch": 0.4840397697540555,
"grad_norm": 0.16298648791233225,
"learning_rate": 6.145668404677604e-06,
"loss": 1.4198,
"step": 925
},
{
"epoch": 0.48665620094191525,
"grad_norm": 0.16942313057138814,
"learning_rate": 6.101147508742456e-06,
"loss": 1.4351,
"step": 930
},
{
"epoch": 0.489272632129775,
"grad_norm": 0.15711211853308713,
"learning_rate": 6.056534667307212e-06,
"loss": 1.4644,
"step": 935
},
{
"epoch": 0.49188906331763477,
"grad_norm": 0.15626699824988666,
"learning_rate": 6.011833605531295e-06,
"loss": 1.4304,
"step": 940
},
{
"epoch": 0.4945054945054945,
"grad_norm": 0.17131121620072295,
"learning_rate": 5.967048055940503e-06,
"loss": 1.4363,
"step": 945
},
{
"epoch": 0.4971219256933543,
"grad_norm": 0.16411315909203975,
"learning_rate": 5.922181758115333e-06,
"loss": 1.4948,
"step": 950
},
{
"epoch": 0.49973835688121404,
"grad_norm": 0.1652087290533154,
"learning_rate": 5.8772384583787455e-06,
"loss": 1.4749,
"step": 955
},
{
"epoch": 0.5023547880690737,
"grad_norm": 0.1527872810679704,
"learning_rate": 5.832221909483334e-06,
"loss": 1.5299,
"step": 960
},
{
"epoch": 0.5049712192569336,
"grad_norm": 0.16242876178871807,
"learning_rate": 5.787135870297976e-06,
"loss": 1.4302,
"step": 965
},
{
"epoch": 0.5075876504447933,
"grad_norm": 0.15914276616432216,
"learning_rate": 5.741984105493967e-06,
"loss": 1.4855,
"step": 970
},
{
"epoch": 0.5102040816326531,
"grad_norm": 0.15942651171220024,
"learning_rate": 5.696770385230679e-06,
"loss": 1.4426,
"step": 975
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.15807600841433533,
"learning_rate": 5.651498484840737e-06,
"loss": 1.4827,
"step": 980
},
{
"epoch": 0.5154369440083726,
"grad_norm": 0.15731445684573817,
"learning_rate": 5.6061721845148e-06,
"loss": 1.4354,
"step": 985
},
{
"epoch": 0.5180533751962323,
"grad_norm": 0.15827375236560648,
"learning_rate": 5.560795268985899e-06,
"loss": 1.442,
"step": 990
},
{
"epoch": 0.5206698063840921,
"grad_norm": 0.1578476109009046,
"learning_rate": 5.515371527213422e-06,
"loss": 1.4272,
"step": 995
},
{
"epoch": 0.5232862375719518,
"grad_norm": 0.161165877259809,
"learning_rate": 5.469904752066736e-06,
"loss": 1.4602,
"step": 1000
},
{
"epoch": 0.5259026687598116,
"grad_norm": 0.15792406612385534,
"learning_rate": 5.424398740008481e-06,
"loss": 1.4762,
"step": 1005
},
{
"epoch": 0.5285190999476713,
"grad_norm": 0.16143656535191364,
"learning_rate": 5.378857290777566e-06,
"loss": 1.4597,
"step": 1010
},
{
"epoch": 0.5311355311355311,
"grad_norm": 0.1525842123237197,
"learning_rate": 5.333284207071901e-06,
"loss": 1.4559,
"step": 1015
},
{
"epoch": 0.533751962323391,
"grad_norm": 0.1600143521665237,
"learning_rate": 5.287683294230855e-06,
"loss": 1.4993,
"step": 1020
},
{
"epoch": 0.5363683935112507,
"grad_norm": 0.1671409495356182,
"learning_rate": 5.242058359917531e-06,
"loss": 1.4362,
"step": 1025
},
{
"epoch": 0.5389848246991105,
"grad_norm": 0.16601362069808856,
"learning_rate": 5.196413213800812e-06,
"loss": 1.4371,
"step": 1030
},
{
"epoch": 0.5416012558869702,
"grad_norm": 0.15498287472397484,
"learning_rate": 5.150751667237266e-06,
"loss": 1.4294,
"step": 1035
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.15961404392992412,
"learning_rate": 5.1050775329528865e-06,
"loss": 1.4789,
"step": 1040
},
{
"epoch": 0.5468341182626897,
"grad_norm": 0.15949553392591714,
"learning_rate": 5.059394624724749e-06,
"loss": 1.4851,
"step": 1045
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.16116839795222396,
"learning_rate": 5.0137067570625345e-06,
"loss": 1.4458,
"step": 1050
},
{
"epoch": 0.5520669806384092,
"grad_norm": 0.15971613721230996,
"learning_rate": 4.968017744890052e-06,
"loss": 1.4697,
"step": 1055
},
{
"epoch": 0.554683411826269,
"grad_norm": 0.16118914480125368,
"learning_rate": 4.922331403226667e-06,
"loss": 1.4442,
"step": 1060
},
{
"epoch": 0.5572998430141287,
"grad_norm": 0.1620932053747273,
"learning_rate": 4.876651546868759e-06,
"loss": 1.4814,
"step": 1065
},
{
"epoch": 0.5599162742019885,
"grad_norm": 0.15581908215479442,
"learning_rate": 4.830981990071193e-06,
"loss": 1.4428,
"step": 1070
},
{
"epoch": 0.5625327053898482,
"grad_norm": 0.155369479068761,
"learning_rate": 4.785326546228818e-06,
"loss": 1.4835,
"step": 1075
},
{
"epoch": 0.565149136577708,
"grad_norm": 0.16291117247601922,
"learning_rate": 4.739689027558052e-06,
"loss": 1.443,
"step": 1080
},
{
"epoch": 0.5677655677655677,
"grad_norm": 0.15825965604814435,
"learning_rate": 4.694073244778571e-06,
"loss": 1.4788,
"step": 1085
},
{
"epoch": 0.5703819989534276,
"grad_norm": 0.16221449839650126,
"learning_rate": 4.648483006795115e-06,
"loss": 1.4565,
"step": 1090
},
{
"epoch": 0.5729984301412873,
"grad_norm": 0.1631035307892455,
"learning_rate": 4.602922120379432e-06,
"loss": 1.423,
"step": 1095
},
{
"epoch": 0.5756148613291471,
"grad_norm": 0.15523741305430758,
"learning_rate": 4.557394389852427e-06,
"loss": 1.4925,
"step": 1100
},
{
"epoch": 0.5782312925170068,
"grad_norm": 0.16229543459181653,
"learning_rate": 4.5119036167664966e-06,
"loss": 1.4646,
"step": 1105
},
{
"epoch": 0.5808477237048666,
"grad_norm": 0.17203262275951273,
"learning_rate": 4.466453599588103e-06,
"loss": 1.4874,
"step": 1110
},
{
"epoch": 0.5834641548927263,
"grad_norm": 0.1523478817379919,
"learning_rate": 4.421048133380601e-06,
"loss": 1.4031,
"step": 1115
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.15873848031137183,
"learning_rate": 4.375691009487351e-06,
"loss": 1.4796,
"step": 1120
},
{
"epoch": 0.5886970172684458,
"grad_norm": 0.16337788296807224,
"learning_rate": 4.330386015215145e-06,
"loss": 1.5251,
"step": 1125
},
{
"epoch": 0.5913134484563056,
"grad_norm": 0.1602836054740823,
"learning_rate": 4.285136933517971e-06,
"loss": 1.4303,
"step": 1130
},
{
"epoch": 0.5939298796441653,
"grad_norm": 0.15647438547435116,
"learning_rate": 4.239947542681125e-06,
"loss": 1.4039,
"step": 1135
},
{
"epoch": 0.5965463108320251,
"grad_norm": 0.15857408026812378,
"learning_rate": 4.194821616005738e-06,
"loss": 1.3983,
"step": 1140
},
{
"epoch": 0.5991627420198848,
"grad_norm": 0.15775093783775374,
"learning_rate": 4.1497629214937e-06,
"loss": 1.4372,
"step": 1145
},
{
"epoch": 0.6017791732077447,
"grad_norm": 0.18509829862091232,
"learning_rate": 4.104775221533039e-06,
"loss": 1.4806,
"step": 1150
},
{
"epoch": 0.6043956043956044,
"grad_norm": 0.1683414103321442,
"learning_rate": 4.059862272583755e-06,
"loss": 1.4842,
"step": 1155
},
{
"epoch": 0.6070120355834642,
"grad_norm": 0.15725746906302956,
"learning_rate": 4.015027824864158e-06,
"loss": 1.474,
"step": 1160
},
{
"epoch": 0.6096284667713239,
"grad_norm": 0.156823916865726,
"learning_rate": 3.97027562203773e-06,
"loss": 1.4457,
"step": 1165
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.16196818890564343,
"learning_rate": 3.92560940090053e-06,
"loss": 1.4549,
"step": 1170
},
{
"epoch": 0.6148613291470434,
"grad_norm": 0.16431998359127215,
"learning_rate": 3.881032891069169e-06,
"loss": 1.4569,
"step": 1175
},
{
"epoch": 0.6174777603349032,
"grad_norm": 0.1624294435977778,
"learning_rate": 3.836549814669389e-06,
"loss": 1.4707,
"step": 1180
},
{
"epoch": 0.6200941915227629,
"grad_norm": 0.15807576657487513,
"learning_rate": 3.7921638860252674e-06,
"loss": 1.4445,
"step": 1185
},
{
"epoch": 0.6227106227106227,
"grad_norm": 0.15852155364559742,
"learning_rate": 3.747878811349075e-06,
"loss": 1.4828,
"step": 1190
},
{
"epoch": 0.6253270538984824,
"grad_norm": 0.1646770963198663,
"learning_rate": 3.703698288431801e-06,
"loss": 1.4424,
"step": 1195
},
{
"epoch": 0.6279434850863422,
"grad_norm": 0.1603001093766745,
"learning_rate": 3.659626006334395e-06,
"loss": 1.4539,
"step": 1200
},
{
"epoch": 0.6305599162742019,
"grad_norm": 0.15832268935743676,
"learning_rate": 3.615665645079728e-06,
"loss": 1.4581,
"step": 1205
},
{
"epoch": 0.6331763474620618,
"grad_norm": 0.15675764863415026,
"learning_rate": 3.5718208753453166e-06,
"loss": 1.5017,
"step": 1210
},
{
"epoch": 0.6357927786499215,
"grad_norm": 0.1564827555990543,
"learning_rate": 3.5280953581568155e-06,
"loss": 1.4583,
"step": 1215
},
{
"epoch": 0.6384092098377813,
"grad_norm": 0.1564902878878436,
"learning_rate": 3.484492744582325e-06,
"loss": 1.4566,
"step": 1220
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.15668534020828204,
"learning_rate": 3.441016675427532e-06,
"loss": 1.4551,
"step": 1225
},
{
"epoch": 0.6436420722135008,
"grad_norm": 0.15413678756262972,
"learning_rate": 3.397670780931699e-06,
"loss": 1.4326,
"step": 1230
},
{
"epoch": 0.6462585034013606,
"grad_norm": 0.15578687601295582,
"learning_rate": 3.354458680464543e-06,
"loss": 1.4317,
"step": 1235
},
{
"epoch": 0.6488749345892203,
"grad_norm": 0.1638562831165353,
"learning_rate": 3.311383982224017e-06,
"loss": 1.4472,
"step": 1240
},
{
"epoch": 0.6514913657770801,
"grad_norm": 0.16594544182894133,
"learning_rate": 3.268450282935026e-06,
"loss": 1.4729,
"step": 1245
},
{
"epoch": 0.6541077969649398,
"grad_norm": 0.16046123770729828,
"learning_rate": 3.2256611675491096e-06,
"loss": 1.4583,
"step": 1250
},
{
"epoch": 0.6567242281527996,
"grad_norm": 0.16513808379274897,
"learning_rate": 3.183020208945086e-06,
"loss": 1.4484,
"step": 1255
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.16412228072472856,
"learning_rate": 3.1405309676307283e-06,
"loss": 1.4906,
"step": 1260
},
{
"epoch": 0.6619570905285191,
"grad_norm": 0.16590678276919818,
"learning_rate": 3.0981969914454555e-06,
"loss": 1.4856,
"step": 1265
},
{
"epoch": 0.6645735217163788,
"grad_norm": 0.1687948653068824,
"learning_rate": 3.056021815264102e-06,
"loss": 1.4641,
"step": 1270
},
{
"epoch": 0.6671899529042387,
"grad_norm": 0.1627828541693421,
"learning_rate": 3.0140089607017386e-06,
"loss": 1.494,
"step": 1275
},
{
"epoch": 0.6698063840920984,
"grad_norm": 0.15269049435378884,
"learning_rate": 2.972161935819632e-06,
"loss": 1.4347,
"step": 1280
},
{
"epoch": 0.6724228152799582,
"grad_norm": 0.15762990269917085,
"learning_rate": 2.930484234832315e-06,
"loss": 1.4694,
"step": 1285
},
{
"epoch": 0.6750392464678179,
"grad_norm": 0.15508871368297228,
"learning_rate": 2.8889793378158284e-06,
"loss": 1.4402,
"step": 1290
},
{
"epoch": 0.6776556776556777,
"grad_norm": 0.17290914975265276,
"learning_rate": 2.8476507104171273e-06,
"loss": 1.4618,
"step": 1295
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.15581225533545537,
"learning_rate": 2.806501803564708e-06,
"loss": 1.472,
"step": 1300
},
{
"epoch": 0.6828885400313972,
"grad_norm": 0.16127501163040442,
"learning_rate": 2.765536053180447e-06,
"loss": 1.4563,
"step": 1305
},
{
"epoch": 0.6855049712192569,
"grad_norm": 0.16320911225559154,
"learning_rate": 2.724756879892717e-06,
"loss": 1.4605,
"step": 1310
},
{
"epoch": 0.6881214024071167,
"grad_norm": 0.16165957297000277,
"learning_rate": 2.6841676887507505e-06,
"loss": 1.4384,
"step": 1315
},
{
"epoch": 0.6907378335949764,
"grad_norm": 0.15751845797115352,
"learning_rate": 2.643771868940327e-06,
"loss": 1.4584,
"step": 1320
},
{
"epoch": 0.6933542647828362,
"grad_norm": 0.1540695191316521,
"learning_rate": 2.603572793500775e-06,
"loss": 1.4408,
"step": 1325
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.1550335651788221,
"learning_rate": 2.5635738190433252e-06,
"loss": 1.4369,
"step": 1330
},
{
"epoch": 0.6985871271585558,
"grad_norm": 0.16362988015639554,
"learning_rate": 2.523778285470835e-06,
"loss": 1.4822,
"step": 1335
},
{
"epoch": 0.7012035583464155,
"grad_norm": 0.15606895225172598,
"learning_rate": 2.4841895156989047e-06,
"loss": 1.4672,
"step": 1340
},
{
"epoch": 0.7038199895342753,
"grad_norm": 0.1615627045553084,
"learning_rate": 2.444810815378416e-06,
"loss": 1.4105,
"step": 1345
},
{
"epoch": 0.706436420722135,
"grad_norm": 0.16228564404934667,
"learning_rate": 2.4056454726195166e-06,
"loss": 1.4706,
"step": 1350
},
{
"epoch": 0.7090528519099948,
"grad_norm": 0.16011002705477617,
"learning_rate": 2.366696757717054e-06,
"loss": 1.4392,
"step": 1355
},
{
"epoch": 0.7116692830978545,
"grad_norm": 0.15870675982792665,
"learning_rate": 2.327967922877515e-06,
"loss": 1.5238,
"step": 1360
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.15138917306239938,
"learning_rate": 2.28946220194746e-06,
"loss": 1.4617,
"step": 1365
},
{
"epoch": 0.716902145473574,
"grad_norm": 0.16468665101161267,
"learning_rate": 2.2511828101435105e-06,
"loss": 1.473,
"step": 1370
},
{
"epoch": 0.7195185766614338,
"grad_norm": 0.15846712332838853,
"learning_rate": 2.213132943783864e-06,
"loss": 1.4154,
"step": 1375
},
{
"epoch": 0.7221350078492935,
"grad_norm": 0.1586190639294726,
"learning_rate": 2.1753157800214107e-06,
"loss": 1.4482,
"step": 1380
},
{
"epoch": 0.7247514390371533,
"grad_norm": 0.15881895479160194,
"learning_rate": 2.137734476578443e-06,
"loss": 1.4911,
"step": 1385
},
{
"epoch": 0.727367870225013,
"grad_norm": 0.16286201033342562,
"learning_rate": 2.1003921714829823e-06,
"loss": 1.4584,
"step": 1390
},
{
"epoch": 0.7299843014128728,
"grad_norm": 0.15788330405126516,
"learning_rate": 2.063291982806759e-06,
"loss": 1.4102,
"step": 1395
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.15934473574485228,
"learning_rate": 2.0264370084048498e-06,
"loss": 1.4932,
"step": 1400
},
{
"epoch": 0.7352171637885924,
"grad_norm": 0.16436224664001722,
"learning_rate": 1.9898303256570093e-06,
"loss": 1.4216,
"step": 1405
},
{
"epoch": 0.7378335949764521,
"grad_norm": 0.15573035343733022,
"learning_rate": 1.953474991210717e-06,
"loss": 1.4545,
"step": 1410
},
{
"epoch": 0.7404500261643119,
"grad_norm": 0.15560461400003067,
"learning_rate": 1.917374040725935e-06,
"loss": 1.4646,
"step": 1415
},
{
"epoch": 0.7430664573521716,
"grad_norm": 0.1577474998263538,
"learning_rate": 1.8815304886216385e-06,
"loss": 1.4534,
"step": 1420
},
{
"epoch": 0.7456828885400314,
"grad_norm": 0.15854274323074571,
"learning_rate": 1.8459473278241125e-06,
"loss": 1.417,
"step": 1425
},
{
"epoch": 0.7482993197278912,
"grad_norm": 0.15488543431357468,
"learning_rate": 1.8106275295170462e-06,
"loss": 1.4453,
"step": 1430
},
{
"epoch": 0.7509157509157509,
"grad_norm": 0.1486710066323424,
"learning_rate": 1.7755740428934333e-06,
"loss": 1.4146,
"step": 1435
},
{
"epoch": 0.7535321821036107,
"grad_norm": 0.15860342482210135,
"learning_rate": 1.7407897949093184e-06,
"loss": 1.4131,
"step": 1440
},
{
"epoch": 0.7561486132914704,
"grad_norm": 0.1498596914474104,
"learning_rate": 1.7062776900393979e-06,
"loss": 1.4928,
"step": 1445
},
{
"epoch": 0.7587650444793302,
"grad_norm": 0.16559110824107307,
"learning_rate": 1.6720406100344977e-06,
"loss": 1.455,
"step": 1450
},
{
"epoch": 0.7613814756671899,
"grad_norm": 0.15672560186933324,
"learning_rate": 1.6380814136809442e-06,
"loss": 1.4384,
"step": 1455
},
{
"epoch": 0.7639979068550498,
"grad_norm": 0.15929938335939636,
"learning_rate": 1.6044029365618612e-06,
"loss": 1.4048,
"step": 1460
},
{
"epoch": 0.7666143380429095,
"grad_norm": 0.15082068272349816,
"learning_rate": 1.571007990820394e-06,
"loss": 1.4777,
"step": 1465
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.1551606972940531,
"learning_rate": 1.5378993649249053e-06,
"loss": 1.4509,
"step": 1470
},
{
"epoch": 0.771847200418629,
"grad_norm": 0.16555549980007303,
"learning_rate": 1.5050798234361269e-06,
"loss": 1.4896,
"step": 1475
},
{
"epoch": 0.7744636316064888,
"grad_norm": 0.15106932577324084,
"learning_rate": 1.4725521067763298e-06,
"loss": 1.4298,
"step": 1480
},
{
"epoch": 0.7770800627943485,
"grad_norm": 0.16881824215449678,
"learning_rate": 1.4403189310004917e-06,
"loss": 1.4626,
"step": 1485
},
{
"epoch": 0.7796964939822083,
"grad_norm": 0.15714915915750127,
"learning_rate": 1.4083829875695172e-06,
"loss": 1.4336,
"step": 1490
},
{
"epoch": 0.782312925170068,
"grad_norm": 0.15042384246168655,
"learning_rate": 1.376746943125491e-06,
"loss": 1.4531,
"step": 1495
},
{
"epoch": 0.7849293563579278,
"grad_norm": 0.16251411929370363,
"learning_rate": 1.34541343926902e-06,
"loss": 1.4554,
"step": 1500
},
{
"epoch": 0.7875457875457875,
"grad_norm": 0.16135250106988436,
"learning_rate": 1.3143850923386586e-06,
"loss": 1.4649,
"step": 1505
},
{
"epoch": 0.7901622187336473,
"grad_norm": 0.16618023255091524,
"learning_rate": 1.2836644931924469e-06,
"loss": 1.4879,
"step": 1510
},
{
"epoch": 0.792778649921507,
"grad_norm": 0.15813395048358223,
"learning_rate": 1.2532542069915722e-06,
"loss": 1.4397,
"step": 1515
},
{
"epoch": 0.7953950811093669,
"grad_norm": 0.15701143348776167,
"learning_rate": 1.2231567729861809e-06,
"loss": 1.4261,
"step": 1520
},
{
"epoch": 0.7980115122972266,
"grad_norm": 0.1603189121004618,
"learning_rate": 1.1933747043033505e-06,
"loss": 1.4486,
"step": 1525
},
{
"epoch": 0.8006279434850864,
"grad_norm": 0.1593742047063577,
"learning_rate": 1.1639104877372475e-06,
"loss": 1.4691,
"step": 1530
},
{
"epoch": 0.8032443746729461,
"grad_norm": 0.16867515846268982,
"learning_rate": 1.134766583541475e-06,
"loss": 1.4516,
"step": 1535
},
{
"epoch": 0.8058608058608059,
"grad_norm": 0.16003034328500954,
"learning_rate": 1.1059454252236457e-06,
"loss": 1.4664,
"step": 1540
},
{
"epoch": 0.8084772370486656,
"grad_norm": 0.15168555205033307,
"learning_rate": 1.0774494193421842e-06,
"loss": 1.4613,
"step": 1545
},
{
"epoch": 0.8110936682365254,
"grad_norm": 0.15563769398367197,
"learning_rate": 1.0492809453053836e-06,
"loss": 1.4464,
"step": 1550
},
{
"epoch": 0.8137100994243851,
"grad_norm": 0.1685907836889909,
"learning_rate": 1.0214423551727188e-06,
"loss": 1.4501,
"step": 1555
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.1577575034779687,
"learning_rate": 9.939359734584552e-07,
"loss": 1.4204,
"step": 1560
},
{
"epoch": 0.8189429618001046,
"grad_norm": 0.15781696179307608,
"learning_rate": 9.667640969375465e-07,
"loss": 1.4233,
"step": 1565
},
{
"epoch": 0.8215593929879644,
"grad_norm": 0.1631762144058873,
"learning_rate": 9.399289944538664e-07,
"loss": 1.4701,
"step": 1570
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.158857072715717,
"learning_rate": 9.134329067307485e-07,
"loss": 1.4929,
"step": 1575
},
{
"epoch": 0.826792255363684,
"grad_norm": 0.16115217693002612,
"learning_rate": 8.872780461838931e-07,
"loss": 1.4778,
"step": 1580
},
{
"epoch": 0.8294086865515437,
"grad_norm": 0.1569960900935591,
"learning_rate": 8.614665967366276e-07,
"loss": 1.4912,
"step": 1585
},
{
"epoch": 0.8320251177394035,
"grad_norm": 0.16399438070889508,
"learning_rate": 8.360007136375553e-07,
"loss": 1.4597,
"step": 1590
},
{
"epoch": 0.8346415489272632,
"grad_norm": 0.1565164124394842,
"learning_rate": 8.108825232805856e-07,
"loss": 1.4573,
"step": 1595
},
{
"epoch": 0.837257980115123,
"grad_norm": 0.15301717777573082,
"learning_rate": 7.861141230273839e-07,
"loss": 1.432,
"step": 1600
},
{
"epoch": 0.8398744113029827,
"grad_norm": 0.16280701231358558,
"learning_rate": 7.61697581032243e-07,
"loss": 1.4701,
"step": 1605
},
{
"epoch": 0.8424908424908425,
"grad_norm": 0.1542078888615764,
"learning_rate": 7.376349360693952e-07,
"loss": 1.4416,
"step": 1610
},
{
"epoch": 0.8451072736787022,
"grad_norm": 0.15896265508191268,
"learning_rate": 7.139281973627693e-07,
"loss": 1.5054,
"step": 1615
},
{
"epoch": 0.847723704866562,
"grad_norm": 0.15547362210975574,
"learning_rate": 6.905793444182257e-07,
"loss": 1.4047,
"step": 1620
},
{
"epoch": 0.8503401360544217,
"grad_norm": 0.15640953304641333,
"learning_rate": 6.675903268582623e-07,
"loss": 1.4447,
"step": 1625
},
{
"epoch": 0.8529565672422815,
"grad_norm": 0.1648337946012005,
"learning_rate": 6.449630642592336e-07,
"loss": 1.4324,
"step": 1630
},
{
"epoch": 0.8555729984301413,
"grad_norm": 0.16020988675812617,
"learning_rate": 6.22699445991054e-07,
"loss": 1.5053,
"step": 1635
},
{
"epoch": 0.858189429618001,
"grad_norm": 0.16301957953770885,
"learning_rate": 6.008013310594418e-07,
"loss": 1.4225,
"step": 1640
},
{
"epoch": 0.8608058608058609,
"grad_norm": 0.15536283216420355,
"learning_rate": 5.7927054795069e-07,
"loss": 1.4632,
"step": 1645
},
{
"epoch": 0.8634222919937206,
"grad_norm": 0.15402900867791064,
"learning_rate": 5.581088944789953e-07,
"loss": 1.4617,
"step": 1650
},
{
"epoch": 0.8660387231815804,
"grad_norm": 0.15324358115634612,
"learning_rate": 5.373181376363312e-07,
"loss": 1.4933,
"step": 1655
},
{
"epoch": 0.8686551543694401,
"grad_norm": 0.16150842206673804,
"learning_rate": 5.169000134449115e-07,
"loss": 1.4881,
"step": 1660
},
{
"epoch": 0.8712715855572999,
"grad_norm": 0.1563492858079007,
"learning_rate": 4.968562268122285e-07,
"loss": 1.4782,
"step": 1665
},
{
"epoch": 0.8738880167451596,
"grad_norm": 0.15656351967720294,
"learning_rate": 4.771884513886998e-07,
"loss": 1.4926,
"step": 1670
},
{
"epoch": 0.8765044479330194,
"grad_norm": 0.15968896687575132,
"learning_rate": 4.578983294279138e-07,
"loss": 1.4465,
"step": 1675
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.15840577744694784,
"learning_rate": 4.389874716495013e-07,
"loss": 1.471,
"step": 1680
},
{
"epoch": 0.8817373103087389,
"grad_norm": 0.15537710474993283,
"learning_rate": 4.204574571046438e-07,
"loss": 1.4827,
"step": 1685
},
{
"epoch": 0.8843537414965986,
"grad_norm": 0.1596282006005663,
"learning_rate": 4.0230983304422543e-07,
"loss": 1.4873,
"step": 1690
},
{
"epoch": 0.8869701726844584,
"grad_norm": 0.1610797994580487,
"learning_rate": 3.8454611478963235e-07,
"loss": 1.4998,
"step": 1695
},
{
"epoch": 0.8895866038723181,
"grad_norm": 0.16534664335538504,
"learning_rate": 3.671677856062261e-07,
"loss": 1.4371,
"step": 1700
},
{
"epoch": 0.892203035060178,
"grad_norm": 0.1542092085511985,
"learning_rate": 3.501762965794919e-07,
"loss": 1.4787,
"step": 1705
},
{
"epoch": 0.8948194662480377,
"grad_norm": 0.15126499803017035,
"learning_rate": 3.335730664938758e-07,
"loss": 1.4482,
"step": 1710
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.1574221424399337,
"learning_rate": 3.1735948171431e-07,
"loss": 1.4415,
"step": 1715
},
{
"epoch": 0.9000523286237572,
"grad_norm": 0.15973254318652566,
"learning_rate": 3.015368960704584e-07,
"loss": 1.4944,
"step": 1720
},
{
"epoch": 0.902668759811617,
"grad_norm": 0.16225657403102964,
"learning_rate": 2.8610663074366773e-07,
"loss": 1.4666,
"step": 1725
},
{
"epoch": 0.9052851909994767,
"grad_norm": 0.15648772554662277,
"learning_rate": 2.7106997415665527e-07,
"loss": 1.4265,
"step": 1730
},
{
"epoch": 0.9079016221873365,
"grad_norm": 0.15343388907268007,
"learning_rate": 2.564281818659159e-07,
"loss": 1.4214,
"step": 1735
},
{
"epoch": 0.9105180533751962,
"grad_norm": 0.15794655302835334,
"learning_rate": 2.4218247645689306e-07,
"loss": 1.5033,
"step": 1740
},
{
"epoch": 0.913134484563056,
"grad_norm": 0.16018691162962145,
"learning_rate": 2.2833404744188824e-07,
"loss": 1.4402,
"step": 1745
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.15874745650816893,
"learning_rate": 2.1488405116074028e-07,
"loss": 1.4202,
"step": 1750
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.1681304342474622,
"learning_rate": 2.0183361068426778e-07,
"loss": 1.465,
"step": 1755
},
{
"epoch": 0.9209837781266352,
"grad_norm": 0.15372824338493402,
"learning_rate": 1.8918381572049393e-07,
"loss": 1.4512,
"step": 1760
},
{
"epoch": 0.923600209314495,
"grad_norm": 0.16183345451480904,
"learning_rate": 1.7693572252365841e-07,
"loss": 1.4484,
"step": 1765
},
{
"epoch": 0.9262166405023547,
"grad_norm": 0.16140526836471605,
"learning_rate": 1.650903538060189e-07,
"loss": 1.4727,
"step": 1770
},
{
"epoch": 0.9288330716902146,
"grad_norm": 0.15802040978302784,
"learning_rate": 1.536486986524538e-07,
"loss": 1.4756,
"step": 1775
},
{
"epoch": 0.9314495028780743,
"grad_norm": 0.15845763106711214,
"learning_rate": 1.426117124378762e-07,
"loss": 1.4562,
"step": 1780
},
{
"epoch": 0.9340659340659341,
"grad_norm": 0.15827675471278949,
"learning_rate": 1.3198031674745814e-07,
"loss": 1.3972,
"step": 1785
},
{
"epoch": 0.9366823652537938,
"grad_norm": 0.15820634911568737,
"learning_rate": 1.2175539929968117e-07,
"loss": 1.447,
"step": 1790
},
{
"epoch": 0.9392987964416536,
"grad_norm": 0.15746447537785097,
"learning_rate": 1.1193781387220936e-07,
"loss": 1.4424,
"step": 1795
},
{
"epoch": 0.9419152276295133,
"grad_norm": 0.15952729687882758,
"learning_rate": 1.0252838023059985e-07,
"loss": 1.4973,
"step": 1800
},
{
"epoch": 0.9445316588173731,
"grad_norm": 0.16828930147773663,
"learning_rate": 9.352788405985469e-08,
"loss": 1.4413,
"step": 1805
},
{
"epoch": 0.9471480900052328,
"grad_norm": 0.15376650859627036,
"learning_rate": 8.493707689881448e-08,
"loss": 1.4496,
"step": 1810
},
{
"epoch": 0.9497645211930926,
"grad_norm": 0.15522464643971864,
"learning_rate": 7.675667607740356e-08,
"loss": 1.4641,
"step": 1815
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.16498724213623525,
"learning_rate": 6.898736465673739e-08,
"loss": 1.4701,
"step": 1820
},
{
"epoch": 0.9549973835688121,
"grad_norm": 0.16595904586188515,
"learning_rate": 6.162979137208314e-08,
"loss": 1.4684,
"step": 1825
},
{
"epoch": 0.957613814756672,
"grad_norm": 0.15857597277888638,
"learning_rate": 5.468457057869358e-08,
"loss": 1.441,
"step": 1830
},
{
"epoch": 0.9602302459445317,
"grad_norm": 0.1606833298581918,
"learning_rate": 4.815228220050538e-08,
"loss": 1.4476,
"step": 1835
},
{
"epoch": 0.9628466771323915,
"grad_norm": 0.1589246468457863,
"learning_rate": 4.2033471681718895e-08,
"loss": 1.4504,
"step": 1840
},
{
"epoch": 0.9654631083202512,
"grad_norm": 0.15710438180801464,
"learning_rate": 3.632864994125129e-08,
"loss": 1.4322,
"step": 1845
},
{
"epoch": 0.968079539508111,
"grad_norm": 0.1582315990940597,
"learning_rate": 3.103829333007624e-08,
"loss": 1.4641,
"step": 1850
},
{
"epoch": 0.9706959706959707,
"grad_norm": 0.1566511194352204,
"learning_rate": 2.616284359144794e-08,
"loss": 1.4106,
"step": 1855
},
{
"epoch": 0.9733124018838305,
"grad_norm": 0.1628350239191456,
"learning_rate": 2.1702707824017287e-08,
"loss": 1.465,
"step": 1860
},
{
"epoch": 0.9759288330716902,
"grad_norm": 0.15467515622764103,
"learning_rate": 1.7658258447836306e-08,
"loss": 1.4735,
"step": 1865
},
{
"epoch": 0.97854526425955,
"grad_norm": 0.16105669593925756,
"learning_rate": 1.4029833173264673e-08,
"loss": 1.4625,
"step": 1870
},
{
"epoch": 0.9811616954474097,
"grad_norm": 0.16002164928848112,
"learning_rate": 1.0817734972768946e-08,
"loss": 1.4441,
"step": 1875
},
{
"epoch": 0.9837781266352695,
"grad_norm": 0.16816554801748468,
"learning_rate": 8.022232055623913e-09,
"loss": 1.4317,
"step": 1880
},
{
"epoch": 0.9863945578231292,
"grad_norm": 0.1600747418511804,
"learning_rate": 5.643557845518843e-09,
"loss": 1.4828,
"step": 1885
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.15616538247681158,
"learning_rate": 3.6819109610658486e-09,
"loss": 1.4604,
"step": 1890
},
{
"epoch": 0.9916274201988488,
"grad_norm": 0.15776227258618464,
"learning_rate": 2.137455199215377e-09,
"loss": 1.3947,
"step": 1895
},
{
"epoch": 0.9942438513867086,
"grad_norm": 0.15889575650183468,
"learning_rate": 1.0103195215788175e-09,
"loss": 1.4492,
"step": 1900
},
{
"epoch": 0.9968602825745683,
"grad_norm": 0.15556430774291685,
"learning_rate": 3.005980436604494e-10,
"loss": 1.4935,
"step": 1905
},
{
"epoch": 0.9994767137624281,
"grad_norm": 0.15905073281412974,
"learning_rate": 8.350027000392224e-12,
"loss": 1.4804,
"step": 1910
},
{
"epoch": 1.0,
"eval_runtime": 2.2534,
"eval_samples_per_second": 4.438,
"eval_steps_per_second": 1.331,
"step": 1911
},
{
"epoch": 1.0,
"step": 1911,
"total_flos": 4845354067427328.0,
"train_loss": 1.4736063955717222,
"train_runtime": 1883.6956,
"train_samples_per_second": 16.224,
"train_steps_per_second": 1.014
}
],
"logging_steps": 5,
"max_steps": 1911,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4845354067427328.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}