autoprogrammer's picture
Upload sdar_1.7b_trace_sft_math-checkpoint-2148
e499dda verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2148,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006985679357317499,
"grad_norm": 9.417793273925781,
"learning_rate": 6.153846153846155e-07,
"loss": 0.4856,
"step": 5
},
{
"epoch": 0.013971358714634998,
"grad_norm": 7.6536865234375,
"learning_rate": 1.3846153846153848e-06,
"loss": 0.4457,
"step": 10
},
{
"epoch": 0.020957038071952497,
"grad_norm": 6.9425950050354,
"learning_rate": 2.153846153846154e-06,
"loss": 0.4359,
"step": 15
},
{
"epoch": 0.027942717429269997,
"grad_norm": 7.008752822875977,
"learning_rate": 2.9230769230769236e-06,
"loss": 0.4247,
"step": 20
},
{
"epoch": 0.0349283967865875,
"grad_norm": 7.943561553955078,
"learning_rate": 3.692307692307693e-06,
"loss": 0.4144,
"step": 25
},
{
"epoch": 0.041914076143904994,
"grad_norm": 8.960087776184082,
"learning_rate": 4.461538461538462e-06,
"loss": 0.4095,
"step": 30
},
{
"epoch": 0.0488997555012225,
"grad_norm": 9.126522064208984,
"learning_rate": 5.230769230769232e-06,
"loss": 0.4195,
"step": 35
},
{
"epoch": 0.055885434858539994,
"grad_norm": 6.81967306137085,
"learning_rate": 6e-06,
"loss": 0.409,
"step": 40
},
{
"epoch": 0.06287111421585749,
"grad_norm": 9.193132400512695,
"learning_rate": 6.76923076923077e-06,
"loss": 0.4256,
"step": 45
},
{
"epoch": 0.069856793573175,
"grad_norm": 7.224976062774658,
"learning_rate": 7.538461538461539e-06,
"loss": 0.41,
"step": 50
},
{
"epoch": 0.07684247293049248,
"grad_norm": 7.260319232940674,
"learning_rate": 8.307692307692309e-06,
"loss": 0.4534,
"step": 55
},
{
"epoch": 0.08382815228780999,
"grad_norm": 8.081445693969727,
"learning_rate": 9.076923076923078e-06,
"loss": 0.4435,
"step": 60
},
{
"epoch": 0.09081383164512749,
"grad_norm": 8.150150299072266,
"learning_rate": 9.846153846153848e-06,
"loss": 0.4457,
"step": 65
},
{
"epoch": 0.097799511002445,
"grad_norm": 6.927804946899414,
"learning_rate": 9.999909012888162e-06,
"loss": 0.4559,
"step": 70
},
{
"epoch": 0.10478519035976248,
"grad_norm": 7.140408039093018,
"learning_rate": 9.999539383421715e-06,
"loss": 0.4217,
"step": 75
},
{
"epoch": 0.11177086971707999,
"grad_norm": 7.418540000915527,
"learning_rate": 9.998885445909666e-06,
"loss": 0.4857,
"step": 80
},
{
"epoch": 0.11875654907439749,
"grad_norm": 8.253525733947754,
"learning_rate": 9.997947237539373e-06,
"loss": 0.4863,
"step": 85
},
{
"epoch": 0.12574222843171498,
"grad_norm": 6.550653457641602,
"learning_rate": 9.996724811663797e-06,
"loss": 0.4663,
"step": 90
},
{
"epoch": 0.13272790778903248,
"grad_norm": 6.81130313873291,
"learning_rate": 9.99521823779844e-06,
"loss": 0.4684,
"step": 95
},
{
"epoch": 0.13971358714635,
"grad_norm": 7.913576602935791,
"learning_rate": 9.99342760161741e-06,
"loss": 0.5049,
"step": 100
},
{
"epoch": 0.1466992665036675,
"grad_norm": 8.523401260375977,
"learning_rate": 9.991353004948538e-06,
"loss": 0.4635,
"step": 105
},
{
"epoch": 0.15368494586098497,
"grad_norm": 6.084781169891357,
"learning_rate": 9.988994565767604e-06,
"loss": 0.4752,
"step": 110
},
{
"epoch": 0.16067062521830247,
"grad_norm": 5.975230693817139,
"learning_rate": 9.986352418191608e-06,
"loss": 0.4722,
"step": 115
},
{
"epoch": 0.16765630457561997,
"grad_norm": 6.941432476043701,
"learning_rate": 9.983426712471151e-06,
"loss": 0.4991,
"step": 120
},
{
"epoch": 0.17464198393293748,
"grad_norm": 6.190479755401611,
"learning_rate": 9.9802176149819e-06,
"loss": 0.4821,
"step": 125
},
{
"epoch": 0.18162766329025498,
"grad_norm": 6.3899006843566895,
"learning_rate": 9.976725308215109e-06,
"loss": 0.5163,
"step": 130
},
{
"epoch": 0.18861334264757249,
"grad_norm": 5.7873430252075195,
"learning_rate": 9.97294999076726e-06,
"loss": 0.4517,
"step": 135
},
{
"epoch": 0.19559902200489,
"grad_norm": 6.523060321807861,
"learning_rate": 9.968891877328752e-06,
"loss": 0.4632,
"step": 140
},
{
"epoch": 0.20258470136220746,
"grad_norm": 7.010469436645508,
"learning_rate": 9.964551198671709e-06,
"loss": 0.4643,
"step": 145
},
{
"epoch": 0.20957038071952497,
"grad_norm": 6.9780473709106445,
"learning_rate": 9.95992820163684e-06,
"loss": 0.4737,
"step": 150
},
{
"epoch": 0.21655606007684247,
"grad_norm": 6.335629463195801,
"learning_rate": 9.955023149119413e-06,
"loss": 0.4552,
"step": 155
},
{
"epoch": 0.22354173943415998,
"grad_norm": 5.958488464355469,
"learning_rate": 9.949836320054309e-06,
"loss": 0.4553,
"step": 160
},
{
"epoch": 0.23052741879147748,
"grad_norm": 6.195608139038086,
"learning_rate": 9.944368009400145e-06,
"loss": 0.5117,
"step": 165
},
{
"epoch": 0.23751309814879498,
"grad_norm": 6.441509246826172,
"learning_rate": 9.93861852812251e-06,
"loss": 0.4652,
"step": 170
},
{
"epoch": 0.24449877750611246,
"grad_norm": 5.994947910308838,
"learning_rate": 9.932588203176281e-06,
"loss": 0.4832,
"step": 175
},
{
"epoch": 0.25148445686342996,
"grad_norm": 6.672873020172119,
"learning_rate": 9.92627737748703e-06,
"loss": 0.4827,
"step": 180
},
{
"epoch": 0.2584701362207475,
"grad_norm": 6.571279048919678,
"learning_rate": 9.919686409931523e-06,
"loss": 0.5003,
"step": 185
},
{
"epoch": 0.26545581557806497,
"grad_norm": 6.049185276031494,
"learning_rate": 9.912815675317307e-06,
"loss": 0.4905,
"step": 190
},
{
"epoch": 0.27244149493538244,
"grad_norm": 6.473181247711182,
"learning_rate": 9.905665564361403e-06,
"loss": 0.4843,
"step": 195
},
{
"epoch": 0.2794271742927,
"grad_norm": 7.581283092498779,
"learning_rate": 9.898236483668086e-06,
"loss": 0.5097,
"step": 200
},
{
"epoch": 0.28641285365001745,
"grad_norm": 7.333038806915283,
"learning_rate": 9.890528855705753e-06,
"loss": 0.4698,
"step": 205
},
{
"epoch": 0.293398533007335,
"grad_norm": 6.216223239898682,
"learning_rate": 9.882543118782913e-06,
"loss": 0.5034,
"step": 210
},
{
"epoch": 0.30038421236465246,
"grad_norm": 6.861793041229248,
"learning_rate": 9.874279727023253e-06,
"loss": 0.4756,
"step": 215
},
{
"epoch": 0.30736989172196993,
"grad_norm": 6.37476110458374,
"learning_rate": 9.865739150339815e-06,
"loss": 0.5117,
"step": 220
},
{
"epoch": 0.31435557107928747,
"grad_norm": 5.816511154174805,
"learning_rate": 9.856921874408272e-06,
"loss": 0.4925,
"step": 225
},
{
"epoch": 0.32134125043660494,
"grad_norm": 6.3242692947387695,
"learning_rate": 9.847828400639317e-06,
"loss": 0.4679,
"step": 230
},
{
"epoch": 0.3283269297939225,
"grad_norm": 6.17088508605957,
"learning_rate": 9.838459246150138e-06,
"loss": 0.4646,
"step": 235
},
{
"epoch": 0.33531260915123995,
"grad_norm": 7.1427812576293945,
"learning_rate": 9.828814943735019e-06,
"loss": 0.4633,
"step": 240
},
{
"epoch": 0.3422982885085575,
"grad_norm": 6.307351112365723,
"learning_rate": 9.818896041835036e-06,
"loss": 0.4586,
"step": 245
},
{
"epoch": 0.34928396786587496,
"grad_norm": 6.033569812774658,
"learning_rate": 9.808703104506874e-06,
"loss": 0.5144,
"step": 250
},
{
"epoch": 0.35626964722319243,
"grad_norm": 6.253153324127197,
"learning_rate": 9.798236711390754e-06,
"loss": 0.4936,
"step": 255
},
{
"epoch": 0.36325532658050996,
"grad_norm": 6.369323253631592,
"learning_rate": 9.787497457677456e-06,
"loss": 0.5142,
"step": 260
},
{
"epoch": 0.37024100593782744,
"grad_norm": 7.059694766998291,
"learning_rate": 9.776485954074492e-06,
"loss": 0.508,
"step": 265
},
{
"epoch": 0.37722668529514497,
"grad_norm": 5.437386989593506,
"learning_rate": 9.765202826771358e-06,
"loss": 0.4517,
"step": 270
},
{
"epoch": 0.38421236465246245,
"grad_norm": 6.353536605834961,
"learning_rate": 9.753648717403944e-06,
"loss": 0.5177,
"step": 275
},
{
"epoch": 0.39119804400978,
"grad_norm": 6.7363481521606445,
"learning_rate": 9.741824283018022e-06,
"loss": 0.4768,
"step": 280
},
{
"epoch": 0.39818372336709745,
"grad_norm": 6.804571151733398,
"learning_rate": 9.729730196031908e-06,
"loss": 0.5009,
"step": 285
},
{
"epoch": 0.40516940272441493,
"grad_norm": 6.188082695007324,
"learning_rate": 9.717367144198209e-06,
"loss": 0.4975,
"step": 290
},
{
"epoch": 0.41215508208173246,
"grad_norm": 6.352089881896973,
"learning_rate": 9.704735830564707e-06,
"loss": 0.458,
"step": 295
},
{
"epoch": 0.41914076143904994,
"grad_norm": 5.823917388916016,
"learning_rate": 9.6918369734344e-06,
"loss": 0.4863,
"step": 300
},
{
"epoch": 0.42612644079636747,
"grad_norm": 6.301001071929932,
"learning_rate": 9.678671306324627e-06,
"loss": 0.5175,
"step": 305
},
{
"epoch": 0.43311212015368494,
"grad_norm": 6.185693740844727,
"learning_rate": 9.665239577925384e-06,
"loss": 0.4844,
"step": 310
},
{
"epoch": 0.4400977995110024,
"grad_norm": 6.580015182495117,
"learning_rate": 9.65154255205672e-06,
"loss": 0.4907,
"step": 315
},
{
"epoch": 0.44708347886831995,
"grad_norm": 5.962856769561768,
"learning_rate": 9.637581007625328e-06,
"loss": 0.4552,
"step": 320
},
{
"epoch": 0.4540691582256374,
"grad_norm": 6.2723774909973145,
"learning_rate": 9.623355738580226e-06,
"loss": 0.4994,
"step": 325
},
{
"epoch": 0.46105483758295496,
"grad_norm": 6.140941143035889,
"learning_rate": 9.608867553867628e-06,
"loss": 0.4829,
"step": 330
},
{
"epoch": 0.46804051694027243,
"grad_norm": 6.278982639312744,
"learning_rate": 9.594117277384928e-06,
"loss": 0.5118,
"step": 335
},
{
"epoch": 0.47502619629758996,
"grad_norm": 6.221822261810303,
"learning_rate": 9.579105747933858e-06,
"loss": 0.4903,
"step": 340
},
{
"epoch": 0.48201187565490744,
"grad_norm": 6.153563976287842,
"learning_rate": 9.563833819172777e-06,
"loss": 0.4729,
"step": 345
},
{
"epoch": 0.4889975550122249,
"grad_norm": 5.723773956298828,
"learning_rate": 9.548302359568136e-06,
"loss": 0.536,
"step": 350
},
{
"epoch": 0.49598323436954245,
"grad_norm": 5.7412238121032715,
"learning_rate": 9.532512252345086e-06,
"loss": 0.4818,
"step": 355
},
{
"epoch": 0.5029689137268599,
"grad_norm": 5.776915073394775,
"learning_rate": 9.516464395437251e-06,
"loss": 0.4818,
"step": 360
},
{
"epoch": 0.5099545930841775,
"grad_norm": 6.4714741706848145,
"learning_rate": 9.50015970143567e-06,
"loss": 0.486,
"step": 365
},
{
"epoch": 0.516940272441495,
"grad_norm": 5.570414066314697,
"learning_rate": 9.483599097536895e-06,
"loss": 0.4821,
"step": 370
},
{
"epoch": 0.5239259517988124,
"grad_norm": 5.86240816116333,
"learning_rate": 9.466783525490271e-06,
"loss": 0.5014,
"step": 375
},
{
"epoch": 0.5309116311561299,
"grad_norm": 5.690156936645508,
"learning_rate": 9.449713941544375e-06,
"loss": 0.505,
"step": 380
},
{
"epoch": 0.5378973105134475,
"grad_norm": 5.273125648498535,
"learning_rate": 9.432391316392639e-06,
"loss": 0.4738,
"step": 385
},
{
"epoch": 0.5448829898707649,
"grad_norm": 6.2202887535095215,
"learning_rate": 9.414816635118154e-06,
"loss": 0.5023,
"step": 390
},
{
"epoch": 0.5518686692280824,
"grad_norm": 6.270415782928467,
"learning_rate": 9.396990897137647e-06,
"loss": 0.4772,
"step": 395
},
{
"epoch": 0.5588543485854,
"grad_norm": 5.550614356994629,
"learning_rate": 9.378915116144646e-06,
"loss": 0.4556,
"step": 400
},
{
"epoch": 0.5658400279427175,
"grad_norm": 5.853529930114746,
"learning_rate": 9.360590320051844e-06,
"loss": 0.4942,
"step": 405
},
{
"epoch": 0.5728257073000349,
"grad_norm": 5.692981243133545,
"learning_rate": 9.342017550932627e-06,
"loss": 0.4772,
"step": 410
},
{
"epoch": 0.5798113866573524,
"grad_norm": 5.6134185791015625,
"learning_rate": 9.323197864961835e-06,
"loss": 0.4245,
"step": 415
},
{
"epoch": 0.58679706601467,
"grad_norm": 6.195576190948486,
"learning_rate": 9.304132332355685e-06,
"loss": 0.5083,
"step": 420
},
{
"epoch": 0.5937827453719874,
"grad_norm": 6.21861457824707,
"learning_rate": 9.28482203731092e-06,
"loss": 0.5366,
"step": 425
},
{
"epoch": 0.6007684247293049,
"grad_norm": 5.109756946563721,
"learning_rate": 9.265268077943152e-06,
"loss": 0.4774,
"step": 430
},
{
"epoch": 0.6077541040866224,
"grad_norm": 6.137323379516602,
"learning_rate": 9.245471566224416e-06,
"loss": 0.5125,
"step": 435
},
{
"epoch": 0.6147397834439399,
"grad_norm": 5.771416187286377,
"learning_rate": 9.225433627919927e-06,
"loss": 0.5453,
"step": 440
},
{
"epoch": 0.6217254628012574,
"grad_norm": 5.52296781539917,
"learning_rate": 9.20515540252408e-06,
"loss": 0.4663,
"step": 445
},
{
"epoch": 0.6287111421585749,
"grad_norm": 6.380407333374023,
"learning_rate": 9.184638043195628e-06,
"loss": 0.4752,
"step": 450
},
{
"epoch": 0.6356968215158925,
"grad_norm": 5.940698623657227,
"learning_rate": 9.16388271669213e-06,
"loss": 0.4309,
"step": 455
},
{
"epoch": 0.6426825008732099,
"grad_norm": 6.604904651641846,
"learning_rate": 9.142890603303573e-06,
"loss": 0.5043,
"step": 460
},
{
"epoch": 0.6496681802305274,
"grad_norm": 6.345200061798096,
"learning_rate": 9.121662896785285e-06,
"loss": 0.4991,
"step": 465
},
{
"epoch": 0.656653859587845,
"grad_norm": 5.294460773468018,
"learning_rate": 9.100200804290014e-06,
"loss": 0.4788,
"step": 470
},
{
"epoch": 0.6636395389451624,
"grad_norm": 5.905223846435547,
"learning_rate": 9.078505546299317e-06,
"loss": 0.4871,
"step": 475
},
{
"epoch": 0.6706252183024799,
"grad_norm": 4.967826843261719,
"learning_rate": 9.056578356554124e-06,
"loss": 0.5174,
"step": 480
},
{
"epoch": 0.6776108976597974,
"grad_norm": 5.444486618041992,
"learning_rate": 9.034420481984604e-06,
"loss": 0.4889,
"step": 485
},
{
"epoch": 0.684596577017115,
"grad_norm": 5.278167724609375,
"learning_rate": 9.012033182639238e-06,
"loss": 0.4793,
"step": 490
},
{
"epoch": 0.6915822563744324,
"grad_norm": 6.902711391448975,
"learning_rate": 8.989417731613169e-06,
"loss": 0.5259,
"step": 495
},
{
"epoch": 0.6985679357317499,
"grad_norm": 6.086877346038818,
"learning_rate": 8.966575414975813e-06,
"loss": 0.5203,
"step": 500
},
{
"epoch": 0.7055536150890674,
"grad_norm": 6.7947564125061035,
"learning_rate": 8.943507531697715e-06,
"loss": 0.4609,
"step": 505
},
{
"epoch": 0.7125392944463849,
"grad_norm": 6.169280052185059,
"learning_rate": 8.920215393576685e-06,
"loss": 0.4799,
"step": 510
},
{
"epoch": 0.7195249738037024,
"grad_norm": 6.514042377471924,
"learning_rate": 8.8967003251632e-06,
"loss": 0.5,
"step": 515
},
{
"epoch": 0.7265106531610199,
"grad_norm": 7.015414714813232,
"learning_rate": 8.872963663685075e-06,
"loss": 0.5226,
"step": 520
},
{
"epoch": 0.7334963325183375,
"grad_norm": 4.968021869659424,
"learning_rate": 8.849006758971429e-06,
"loss": 0.4697,
"step": 525
},
{
"epoch": 0.7404820118756549,
"grad_norm": 6.74957799911499,
"learning_rate": 8.824830973375917e-06,
"loss": 0.5045,
"step": 530
},
{
"epoch": 0.7474676912329724,
"grad_norm": 5.4804816246032715,
"learning_rate": 8.800437681699264e-06,
"loss": 0.465,
"step": 535
},
{
"epoch": 0.7544533705902899,
"grad_norm": 5.658529281616211,
"learning_rate": 8.775828271111074e-06,
"loss": 0.474,
"step": 540
},
{
"epoch": 0.7614390499476074,
"grad_norm": 5.418574333190918,
"learning_rate": 8.751004141070958e-06,
"loss": 0.4904,
"step": 545
},
{
"epoch": 0.7684247293049249,
"grad_norm": 5.922712802886963,
"learning_rate": 8.72596670324894e-06,
"loss": 0.4721,
"step": 550
},
{
"epoch": 0.7754104086622424,
"grad_norm": 6.200154781341553,
"learning_rate": 8.70071738144519e-06,
"loss": 0.4902,
"step": 555
},
{
"epoch": 0.78239608801956,
"grad_norm": 5.705479145050049,
"learning_rate": 8.67525761150905e-06,
"loss": 0.4726,
"step": 560
},
{
"epoch": 0.7893817673768774,
"grad_norm": 6.879328727722168,
"learning_rate": 8.649588841257383e-06,
"loss": 0.4757,
"step": 565
},
{
"epoch": 0.7963674467341949,
"grad_norm": 7.012612819671631,
"learning_rate": 8.623712530392244e-06,
"loss": 0.4522,
"step": 570
},
{
"epoch": 0.8033531260915124,
"grad_norm": 6.293166160583496,
"learning_rate": 8.597630150417867e-06,
"loss": 0.4651,
"step": 575
},
{
"epoch": 0.8103388054488299,
"grad_norm": 5.768970012664795,
"learning_rate": 8.571343184556985e-06,
"loss": 0.4277,
"step": 580
},
{
"epoch": 0.8173244848061474,
"grad_norm": 6.452169895172119,
"learning_rate": 8.544853127666483e-06,
"loss": 0.5337,
"step": 585
},
{
"epoch": 0.8243101641634649,
"grad_norm": 5.4347615242004395,
"learning_rate": 8.518161486152406e-06,
"loss": 0.468,
"step": 590
},
{
"epoch": 0.8312958435207825,
"grad_norm": 5.982383728027344,
"learning_rate": 8.491269777884264e-06,
"loss": 0.4511,
"step": 595
},
{
"epoch": 0.8382815228780999,
"grad_norm": 5.2124457359313965,
"learning_rate": 8.464179532108742e-06,
"loss": 0.4667,
"step": 600
},
{
"epoch": 0.8452672022354174,
"grad_norm": 5.130034923553467,
"learning_rate": 8.436892289362728e-06,
"loss": 0.4551,
"step": 605
},
{
"epoch": 0.8522528815927349,
"grad_norm": 5.86264705657959,
"learning_rate": 8.409409601385702e-06,
"loss": 0.5026,
"step": 610
},
{
"epoch": 0.8592385609500524,
"grad_norm": 6.521675109863281,
"learning_rate": 8.381733031031503e-06,
"loss": 0.5017,
"step": 615
},
{
"epoch": 0.8662242403073699,
"grad_norm": 5.794342994689941,
"learning_rate": 8.353864152179445e-06,
"loss": 0.4638,
"step": 620
},
{
"epoch": 0.8732099196646874,
"grad_norm": 5.721254348754883,
"learning_rate": 8.325804549644823e-06,
"loss": 0.4652,
"step": 625
},
{
"epoch": 0.8801955990220048,
"grad_norm": 5.914518356323242,
"learning_rate": 8.297555819088787e-06,
"loss": 0.4613,
"step": 630
},
{
"epoch": 0.8871812783793224,
"grad_norm": 5.939746856689453,
"learning_rate": 8.269119566927599e-06,
"loss": 0.4599,
"step": 635
},
{
"epoch": 0.8941669577366399,
"grad_norm": 5.655862808227539,
"learning_rate": 8.240497410241286e-06,
"loss": 0.4602,
"step": 640
},
{
"epoch": 0.9011526370939574,
"grad_norm": 6.510814666748047,
"learning_rate": 8.211690976681678e-06,
"loss": 0.4582,
"step": 645
},
{
"epoch": 0.9081383164512749,
"grad_norm": 5.543778419494629,
"learning_rate": 8.18270190437985e-06,
"loss": 0.4777,
"step": 650
},
{
"epoch": 0.9151239958085924,
"grad_norm": 5.059428691864014,
"learning_rate": 8.153531841852969e-06,
"loss": 0.4756,
"step": 655
},
{
"epoch": 0.9221096751659099,
"grad_norm": 5.040985584259033,
"learning_rate": 8.12418244791054e-06,
"loss": 0.4742,
"step": 660
},
{
"epoch": 0.9290953545232273,
"grad_norm": 5.490272521972656,
"learning_rate": 8.094655391560086e-06,
"loss": 0.4614,
"step": 665
},
{
"epoch": 0.9360810338805449,
"grad_norm": 5.865604877471924,
"learning_rate": 8.064952351912227e-06,
"loss": 0.4671,
"step": 670
},
{
"epoch": 0.9430667132378624,
"grad_norm": 5.491858005523682,
"learning_rate": 8.035075018085204e-06,
"loss": 0.4624,
"step": 675
},
{
"epoch": 0.9500523925951799,
"grad_norm": 5.839197158813477,
"learning_rate": 8.005025089108812e-06,
"loss": 0.4811,
"step": 680
},
{
"epoch": 0.9570380719524973,
"grad_norm": 6.362710952758789,
"learning_rate": 7.974804273827791e-06,
"loss": 0.4829,
"step": 685
},
{
"epoch": 0.9640237513098149,
"grad_norm": 5.933167457580566,
"learning_rate": 7.944414290804647e-06,
"loss": 0.4895,
"step": 690
},
{
"epoch": 0.9710094306671324,
"grad_norm": 6.338855743408203,
"learning_rate": 7.913856868221922e-06,
"loss": 0.4729,
"step": 695
},
{
"epoch": 0.9779951100244498,
"grad_norm": 5.022581577301025,
"learning_rate": 7.883133743783918e-06,
"loss": 0.4772,
"step": 700
},
{
"epoch": 0.9849807893817674,
"grad_norm": 5.898752212524414,
"learning_rate": 7.852246664617878e-06,
"loss": 0.457,
"step": 705
},
{
"epoch": 0.9919664687390849,
"grad_norm": 4.9067864418029785,
"learning_rate": 7.821197387174638e-06,
"loss": 0.4577,
"step": 710
},
{
"epoch": 0.9989521480964024,
"grad_norm": 4.425784111022949,
"learning_rate": 7.78998767712873e-06,
"loss": 0.4683,
"step": 715
},
{
"epoch": 1.005588543485854,
"grad_norm": 3.8555009365081787,
"learning_rate": 7.758619309277988e-06,
"loss": 0.2635,
"step": 720
},
{
"epoch": 1.0125742228431716,
"grad_norm": 3.948883056640625,
"learning_rate": 7.72709406744262e-06,
"loss": 0.2383,
"step": 725
},
{
"epoch": 1.019559902200489,
"grad_norm": 6.131858825683594,
"learning_rate": 7.695413744363753e-06,
"loss": 0.2488,
"step": 730
},
{
"epoch": 1.0265455815578064,
"grad_norm": 6.018847942352295,
"learning_rate": 7.663580141601504e-06,
"loss": 0.2176,
"step": 735
},
{
"epoch": 1.033531260915124,
"grad_norm": 6.508975982666016,
"learning_rate": 7.631595069432515e-06,
"loss": 0.2193,
"step": 740
},
{
"epoch": 1.0405169402724415,
"grad_norm": 7.317805767059326,
"learning_rate": 7.599460346747024e-06,
"loss": 0.2054,
"step": 745
},
{
"epoch": 1.047502619629759,
"grad_norm": 5.333045482635498,
"learning_rate": 7.567177800945413e-06,
"loss": 0.2133,
"step": 750
},
{
"epoch": 1.0544882989870765,
"grad_norm": 4.751222610473633,
"learning_rate": 7.534749267834309e-06,
"loss": 0.2313,
"step": 755
},
{
"epoch": 1.061473978344394,
"grad_norm": 4.327038288116455,
"learning_rate": 7.5021765915221656e-06,
"loss": 0.1994,
"step": 760
},
{
"epoch": 1.0684596577017116,
"grad_norm": 5.050411224365234,
"learning_rate": 7.469461624314416e-06,
"loss": 0.2121,
"step": 765
},
{
"epoch": 1.075445337059029,
"grad_norm": 5.225903511047363,
"learning_rate": 7.436606226608122e-06,
"loss": 0.2147,
"step": 770
},
{
"epoch": 1.0824310164163464,
"grad_norm": 5.33516788482666,
"learning_rate": 7.403612266786188e-06,
"loss": 0.21,
"step": 775
},
{
"epoch": 1.089416695773664,
"grad_norm": 5.235130786895752,
"learning_rate": 7.370481621111106e-06,
"loss": 0.2034,
"step": 780
},
{
"epoch": 1.0964023751309815,
"grad_norm": 5.345615386962891,
"learning_rate": 7.337216173618269e-06,
"loss": 0.1926,
"step": 785
},
{
"epoch": 1.103388054488299,
"grad_norm": 5.1708221435546875,
"learning_rate": 7.303817816008814e-06,
"loss": 0.2309,
"step": 790
},
{
"epoch": 1.1103737338456166,
"grad_norm": 5.851431846618652,
"learning_rate": 7.2702884475420645e-06,
"loss": 0.2179,
"step": 795
},
{
"epoch": 1.117359413202934,
"grad_norm": 5.048578262329102,
"learning_rate": 7.236629974927517e-06,
"loss": 0.2269,
"step": 800
},
{
"epoch": 1.1243450925602514,
"grad_norm": 5.8538641929626465,
"learning_rate": 7.202844312216415e-06,
"loss": 0.2353,
"step": 805
},
{
"epoch": 1.131330771917569,
"grad_norm": 4.558470726013184,
"learning_rate": 7.168933380692899e-06,
"loss": 0.1886,
"step": 810
},
{
"epoch": 1.1383164512748865,
"grad_norm": 5.602941513061523,
"learning_rate": 7.134899108764754e-06,
"loss": 0.2056,
"step": 815
},
{
"epoch": 1.145302130632204,
"grad_norm": 5.849012851715088,
"learning_rate": 7.1007434318537424e-06,
"loss": 0.2279,
"step": 820
},
{
"epoch": 1.1522878099895215,
"grad_norm": 4.99567985534668,
"learning_rate": 7.06646829228555e-06,
"loss": 0.2233,
"step": 825
},
{
"epoch": 1.159273489346839,
"grad_norm": 4.784215450286865,
"learning_rate": 7.0320756391793256e-06,
"loss": 0.209,
"step": 830
},
{
"epoch": 1.1662591687041566,
"grad_norm": 5.286474227905273,
"learning_rate": 6.9975674283368385e-06,
"loss": 0.2066,
"step": 835
},
{
"epoch": 1.1732448480614739,
"grad_norm": 4.656322956085205,
"learning_rate": 6.962945622131269e-06,
"loss": 0.2251,
"step": 840
},
{
"epoch": 1.1802305274187914,
"grad_norm": 5.176768779754639,
"learning_rate": 6.9282121893956026e-06,
"loss": 0.2373,
"step": 845
},
{
"epoch": 1.187216206776109,
"grad_norm": 5.336601257324219,
"learning_rate": 6.893369105310673e-06,
"loss": 0.2128,
"step": 850
},
{
"epoch": 1.1942018861334265,
"grad_norm": 5.231692314147949,
"learning_rate": 6.858418351292845e-06,
"loss": 0.2069,
"step": 855
},
{
"epoch": 1.201187565490744,
"grad_norm": 5.922867298126221,
"learning_rate": 6.823361914881331e-06,
"loss": 0.2006,
"step": 860
},
{
"epoch": 1.2081732448480615,
"grad_norm": 5.954738140106201,
"learning_rate": 6.788201789625166e-06,
"loss": 0.2189,
"step": 865
},
{
"epoch": 1.215158924205379,
"grad_norm": 5.425834655761719,
"learning_rate": 6.7529399749698465e-06,
"loss": 0.1976,
"step": 870
},
{
"epoch": 1.2221446035626964,
"grad_norm": 4.986149311065674,
"learning_rate": 6.717578476143621e-06,
"loss": 0.1959,
"step": 875
},
{
"epoch": 1.229130282920014,
"grad_norm": 6.26975679397583,
"learning_rate": 6.682119304043464e-06,
"loss": 0.2197,
"step": 880
},
{
"epoch": 1.2361159622773314,
"grad_norm": 5.916511058807373,
"learning_rate": 6.6465644751207225e-06,
"loss": 0.2146,
"step": 885
},
{
"epoch": 1.243101641634649,
"grad_norm": 4.7209601402282715,
"learning_rate": 6.610916011266445e-06,
"loss": 0.2143,
"step": 890
},
{
"epoch": 1.2500873209919665,
"grad_norm": 5.150550842285156,
"learning_rate": 6.575175939696401e-06,
"loss": 0.206,
"step": 895
},
{
"epoch": 1.257073000349284,
"grad_norm": 4.861652374267578,
"learning_rate": 6.539346292835804e-06,
"loss": 0.2198,
"step": 900
},
{
"epoch": 1.2640586797066016,
"grad_norm": 4.239111423492432,
"learning_rate": 6.503429108203734e-06,
"loss": 0.204,
"step": 905
},
{
"epoch": 1.2710443590639189,
"grad_norm": 5.413881301879883,
"learning_rate": 6.467426428297262e-06,
"loss": 0.2263,
"step": 910
},
{
"epoch": 1.2780300384212364,
"grad_norm": 5.223084449768066,
"learning_rate": 6.431340300475314e-06,
"loss": 0.201,
"step": 915
},
{
"epoch": 1.285015717778554,
"grad_norm": 4.555695056915283,
"learning_rate": 6.395172776842229e-06,
"loss": 0.2117,
"step": 920
},
{
"epoch": 1.2920013971358715,
"grad_norm": 5.600462913513184,
"learning_rate": 6.358925914131071e-06,
"loss": 0.1974,
"step": 925
},
{
"epoch": 1.298987076493189,
"grad_norm": 7.300204753875732,
"learning_rate": 6.322601773586669e-06,
"loss": 0.2158,
"step": 930
},
{
"epoch": 1.3059727558505065,
"grad_norm": 4.54530143737793,
"learning_rate": 6.2862024208484e-06,
"loss": 0.2356,
"step": 935
},
{
"epoch": 1.312958435207824,
"grad_norm": 6.054986000061035,
"learning_rate": 6.249729925832716e-06,
"loss": 0.2008,
"step": 940
},
{
"epoch": 1.3199441145651414,
"grad_norm": 5.145763874053955,
"learning_rate": 6.213186362615444e-06,
"loss": 0.2086,
"step": 945
},
{
"epoch": 1.326929793922459,
"grad_norm": 4.843288421630859,
"learning_rate": 6.176573809313836e-06,
"loss": 0.1962,
"step": 950
},
{
"epoch": 1.3339154732797764,
"grad_norm": 5.428574085235596,
"learning_rate": 6.139894347968389e-06,
"loss": 0.2361,
"step": 955
},
{
"epoch": 1.340901152637094,
"grad_norm": 5.883815765380859,
"learning_rate": 6.103150064424454e-06,
"loss": 0.1931,
"step": 960
},
{
"epoch": 1.3478868319944115,
"grad_norm": 4.711780071258545,
"learning_rate": 6.066343048213611e-06,
"loss": 0.2122,
"step": 965
},
{
"epoch": 1.354872511351729,
"grad_norm": 5.004215240478516,
"learning_rate": 6.029475392434855e-06,
"loss": 0.2247,
"step": 970
},
{
"epoch": 1.3618581907090466,
"grad_norm": 5.238073825836182,
"learning_rate": 5.9925491936355575e-06,
"loss": 0.2228,
"step": 975
},
{
"epoch": 1.3688438700663639,
"grad_norm": 5.151479721069336,
"learning_rate": 5.955566551692247e-06,
"loss": 0.2103,
"step": 980
},
{
"epoch": 1.3758295494236814,
"grad_norm": 5.709561824798584,
"learning_rate": 5.918529569691202e-06,
"loss": 0.2069,
"step": 985
},
{
"epoch": 1.382815228780999,
"grad_norm": 5.741881370544434,
"learning_rate": 5.88144035380884e-06,
"loss": 0.2225,
"step": 990
},
{
"epoch": 1.3898009081383165,
"grad_norm": 5.143665790557861,
"learning_rate": 5.8443010131919574e-06,
"loss": 0.2411,
"step": 995
},
{
"epoch": 1.396786587495634,
"grad_norm": 6.438896179199219,
"learning_rate": 5.807113659837792e-06,
"loss": 0.2324,
"step": 1000
},
{
"epoch": 1.4037722668529513,
"grad_norm": 4.634160041809082,
"learning_rate": 5.769880408473907e-06,
"loss": 0.2005,
"step": 1005
},
{
"epoch": 1.410757946210269,
"grad_norm": 4.902849197387695,
"learning_rate": 5.732603376437944e-06,
"loss": 0.2174,
"step": 1010
},
{
"epoch": 1.4177436255675864,
"grad_norm": 5.148767948150635,
"learning_rate": 5.695284683557216e-06,
"loss": 0.237,
"step": 1015
},
{
"epoch": 1.424729304924904,
"grad_norm": 5.740781784057617,
"learning_rate": 5.6579264520281515e-06,
"loss": 0.2197,
"step": 1020
},
{
"epoch": 1.4317149842822214,
"grad_norm": 5.8418097496032715,
"learning_rate": 5.620530806295621e-06,
"loss": 0.2185,
"step": 1025
},
{
"epoch": 1.438700663639539,
"grad_norm": 5.2934250831604,
"learning_rate": 5.583099872932123e-06,
"loss": 0.2092,
"step": 1030
},
{
"epoch": 1.4456863429968565,
"grad_norm": 4.844241619110107,
"learning_rate": 5.545635780516848e-06,
"loss": 0.2084,
"step": 1035
},
{
"epoch": 1.4526720223541738,
"grad_norm": 3.787388324737549,
"learning_rate": 5.508140659514644e-06,
"loss": 0.2007,
"step": 1040
},
{
"epoch": 1.4596577017114916,
"grad_norm": 5.787258148193359,
"learning_rate": 5.470616642154853e-06,
"loss": 0.2263,
"step": 1045
},
{
"epoch": 1.4666433810688089,
"grad_norm": 4.828206539154053,
"learning_rate": 5.433065862310059e-06,
"loss": 0.2041,
"step": 1050
},
{
"epoch": 1.4736290604261264,
"grad_norm": 5.000522613525391,
"learning_rate": 5.39549045537475e-06,
"loss": 0.2125,
"step": 1055
},
{
"epoch": 1.480614739783444,
"grad_norm": 4.484281063079834,
"learning_rate": 5.3578925581438765e-06,
"loss": 0.1907,
"step": 1060
},
{
"epoch": 1.4876004191407615,
"grad_norm": 5.4914164543151855,
"learning_rate": 5.32027430869134e-06,
"loss": 0.2468,
"step": 1065
},
{
"epoch": 1.494586098498079,
"grad_norm": 5.7337188720703125,
"learning_rate": 5.28263784624841e-06,
"loss": 0.2048,
"step": 1070
},
{
"epoch": 1.5015717778553963,
"grad_norm": 5.739035606384277,
"learning_rate": 5.244985311082073e-06,
"loss": 0.2097,
"step": 1075
},
{
"epoch": 1.508557457212714,
"grad_norm": 5.063661575317383,
"learning_rate": 5.207318844373315e-06,
"loss": 0.1988,
"step": 1080
},
{
"epoch": 1.5155431365700314,
"grad_norm": 4.986030578613281,
"learning_rate": 5.1696405880953715e-06,
"loss": 0.2037,
"step": 1085
},
{
"epoch": 1.522528815927349,
"grad_norm": 5.575538158416748,
"learning_rate": 5.13195268489191e-06,
"loss": 0.2026,
"step": 1090
},
{
"epoch": 1.5295144952846664,
"grad_norm": 6.1051530838012695,
"learning_rate": 5.094257277955187e-06,
"loss": 0.2067,
"step": 1095
},
{
"epoch": 1.536500174641984,
"grad_norm": 7.7973761558532715,
"learning_rate": 5.056556510904178e-06,
"loss": 0.2305,
"step": 1100
},
{
"epoch": 1.5434858539993015,
"grad_norm": 4.967608451843262,
"learning_rate": 5.0188525276626675e-06,
"loss": 0.1946,
"step": 1105
},
{
"epoch": 1.5504715333566188,
"grad_norm": 4.415831089019775,
"learning_rate": 4.981147472337333e-06,
"loss": 0.1934,
"step": 1110
},
{
"epoch": 1.5574572127139366,
"grad_norm": 6.092747211456299,
"learning_rate": 4.943443489095822e-06,
"loss": 0.261,
"step": 1115
},
{
"epoch": 1.5644428920712539,
"grad_norm": 5.598076820373535,
"learning_rate": 4.905742722044813e-06,
"loss": 0.1967,
"step": 1120
},
{
"epoch": 1.5714285714285714,
"grad_norm": 4.891939640045166,
"learning_rate": 4.868047315108091e-06,
"loss": 0.2134,
"step": 1125
},
{
"epoch": 1.578414250785889,
"grad_norm": 5.4631667137146,
"learning_rate": 4.83035941190463e-06,
"loss": 0.2164,
"step": 1130
},
{
"epoch": 1.5853999301432065,
"grad_norm": 4.748926162719727,
"learning_rate": 4.792681155626687e-06,
"loss": 0.2095,
"step": 1135
},
{
"epoch": 1.592385609500524,
"grad_norm": 5.863819122314453,
"learning_rate": 4.75501468891793e-06,
"loss": 0.2115,
"step": 1140
},
{
"epoch": 1.5993712888578413,
"grad_norm": 5.605953216552734,
"learning_rate": 4.717362153751591e-06,
"loss": 0.1883,
"step": 1145
},
{
"epoch": 1.606356968215159,
"grad_norm": 5.130843639373779,
"learning_rate": 4.679725691308662e-06,
"loss": 0.2082,
"step": 1150
},
{
"epoch": 1.6133426475724764,
"grad_norm": 5.240429401397705,
"learning_rate": 4.642107441856125e-06,
"loss": 0.2219,
"step": 1155
},
{
"epoch": 1.620328326929794,
"grad_norm": 4.621257305145264,
"learning_rate": 4.604509544625252e-06,
"loss": 0.1798,
"step": 1160
},
{
"epoch": 1.6273140062871114,
"grad_norm": 4.74717903137207,
"learning_rate": 4.566934137689943e-06,
"loss": 0.1898,
"step": 1165
},
{
"epoch": 1.634299685644429,
"grad_norm": 4.941923141479492,
"learning_rate": 4.529383357845148e-06,
"loss": 0.1949,
"step": 1170
},
{
"epoch": 1.6412853650017465,
"grad_norm": 5.3948516845703125,
"learning_rate": 4.491859340485356e-06,
"loss": 0.1925,
"step": 1175
},
{
"epoch": 1.6482710443590638,
"grad_norm": 5.2328667640686035,
"learning_rate": 4.454364219483153e-06,
"loss": 0.2067,
"step": 1180
},
{
"epoch": 1.6552567237163816,
"grad_norm": 5.516434669494629,
"learning_rate": 4.416900127067879e-06,
"loss": 0.2076,
"step": 1185
},
{
"epoch": 1.6622424030736989,
"grad_norm": 5.7635040283203125,
"learning_rate": 4.3794691937043804e-06,
"loss": 0.2152,
"step": 1190
},
{
"epoch": 1.6692280824310164,
"grad_norm": 4.992877006530762,
"learning_rate": 4.342073547971849e-06,
"loss": 0.2091,
"step": 1195
},
{
"epoch": 1.676213761788334,
"grad_norm": 5.6958136558532715,
"learning_rate": 4.304715316442785e-06,
"loss": 0.2059,
"step": 1200
},
{
"epoch": 1.6831994411456515,
"grad_norm": 6.601348400115967,
"learning_rate": 4.267396623562057e-06,
"loss": 0.2227,
"step": 1205
},
{
"epoch": 1.690185120502969,
"grad_norm": 4.910021781921387,
"learning_rate": 4.230119591526095e-06,
"loss": 0.2157,
"step": 1210
},
{
"epoch": 1.6971707998602863,
"grad_norm": 5.821145534515381,
"learning_rate": 4.19288634016221e-06,
"loss": 0.2197,
"step": 1215
},
{
"epoch": 1.704156479217604,
"grad_norm": 5.628259658813477,
"learning_rate": 4.155698986808045e-06,
"loss": 0.2159,
"step": 1220
},
{
"epoch": 1.7111421585749214,
"grad_norm": 5.30466890335083,
"learning_rate": 4.118559646191164e-06,
"loss": 0.1895,
"step": 1225
},
{
"epoch": 1.7181278379322389,
"grad_norm": 4.880356311798096,
"learning_rate": 4.081470430308799e-06,
"loss": 0.2022,
"step": 1230
},
{
"epoch": 1.7251135172895564,
"grad_norm": 5.982080936431885,
"learning_rate": 4.044433448307753e-06,
"loss": 0.2135,
"step": 1235
},
{
"epoch": 1.732099196646874,
"grad_norm": 5.4156341552734375,
"learning_rate": 4.007450806364443e-06,
"loss": 0.1854,
"step": 1240
},
{
"epoch": 1.7390848760041915,
"grad_norm": 4.503352165222168,
"learning_rate": 3.970524607565146e-06,
"loss": 0.1988,
"step": 1245
},
{
"epoch": 1.7460705553615088,
"grad_norm": 4.712128162384033,
"learning_rate": 3.93365695178639e-06,
"loss": 0.213,
"step": 1250
},
{
"epoch": 1.7530562347188265,
"grad_norm": 6.144698143005371,
"learning_rate": 3.896849935575548e-06,
"loss": 0.2213,
"step": 1255
},
{
"epoch": 1.7600419140761439,
"grad_norm": 5.0675482749938965,
"learning_rate": 3.860105652031612e-06,
"loss": 0.1937,
"step": 1260
},
{
"epoch": 1.7670275934334614,
"grad_norm": 5.1114020347595215,
"learning_rate": 3.823426190686166e-06,
"loss": 0.2159,
"step": 1265
},
{
"epoch": 1.774013272790779,
"grad_norm": 5.182307720184326,
"learning_rate": 3.7868136373845577e-06,
"loss": 0.2074,
"step": 1270
},
{
"epoch": 1.7809989521480964,
"grad_norm": 5.307186126708984,
"learning_rate": 3.7502700741672864e-06,
"loss": 0.2072,
"step": 1275
},
{
"epoch": 1.787984631505414,
"grad_norm": 5.354731559753418,
"learning_rate": 3.7137975791516025e-06,
"loss": 0.2029,
"step": 1280
},
{
"epoch": 1.7949703108627313,
"grad_norm": 5.206900119781494,
"learning_rate": 3.6773982264133324e-06,
"loss": 0.1993,
"step": 1285
},
{
"epoch": 1.801955990220049,
"grad_norm": 5.040802955627441,
"learning_rate": 3.641074085868932e-06,
"loss": 0.1863,
"step": 1290
},
{
"epoch": 1.8089416695773664,
"grad_norm": 4.957560062408447,
"learning_rate": 3.604827223157773e-06,
"loss": 0.2037,
"step": 1295
},
{
"epoch": 1.8159273489346839,
"grad_norm": 5.18066930770874,
"learning_rate": 3.5686596995246868e-06,
"loss": 0.1939,
"step": 1300
},
{
"epoch": 1.8229130282920014,
"grad_norm": 6.234875202178955,
"learning_rate": 3.5325735717027386e-06,
"loss": 0.2052,
"step": 1305
},
{
"epoch": 1.829898707649319,
"grad_norm": 5.431389808654785,
"learning_rate": 3.496570891796267e-06,
"loss": 0.1935,
"step": 1310
},
{
"epoch": 1.8368843870066365,
"grad_norm": 5.478362083435059,
"learning_rate": 3.4606537071641966e-06,
"loss": 0.207,
"step": 1315
},
{
"epoch": 1.8438700663639538,
"grad_norm": 5.117818832397461,
"learning_rate": 3.424824060303601e-06,
"loss": 0.2017,
"step": 1320
},
{
"epoch": 1.8508557457212715,
"grad_norm": 4.163613319396973,
"learning_rate": 3.389083988733556e-06,
"loss": 0.1818,
"step": 1325
},
{
"epoch": 1.8578414250785888,
"grad_norm": 4.954441070556641,
"learning_rate": 3.3534355248792787e-06,
"loss": 0.1909,
"step": 1330
},
{
"epoch": 1.8648271044359064,
"grad_norm": 5.696218490600586,
"learning_rate": 3.3178806959565378e-06,
"loss": 0.2004,
"step": 1335
},
{
"epoch": 1.871812783793224,
"grad_norm": 5.73183536529541,
"learning_rate": 3.282421523856381e-06,
"loss": 0.1965,
"step": 1340
},
{
"epoch": 1.8787984631505414,
"grad_norm": 5.084606647491455,
"learning_rate": 3.247060025030156e-06,
"loss": 0.2062,
"step": 1345
},
{
"epoch": 1.885784142507859,
"grad_norm": 6.37736177444458,
"learning_rate": 3.2117982103748358e-06,
"loss": 0.2164,
"step": 1350
},
{
"epoch": 1.8927698218651763,
"grad_norm": 4.8795013427734375,
"learning_rate": 3.1766380851186695e-06,
"loss": 0.2003,
"step": 1355
},
{
"epoch": 1.899755501222494,
"grad_norm": 5.402559757232666,
"learning_rate": 3.141581648707155e-06,
"loss": 0.188,
"step": 1360
},
{
"epoch": 1.9067411805798113,
"grad_norm": 5.172591209411621,
"learning_rate": 3.106630894689328e-06,
"loss": 0.1847,
"step": 1365
},
{
"epoch": 1.9137268599371289,
"grad_norm": 4.233926773071289,
"learning_rate": 3.0717878106043987e-06,
"loss": 0.187,
"step": 1370
},
{
"epoch": 1.9207125392944464,
"grad_norm": 4.548036575317383,
"learning_rate": 3.0370543778687315e-06,
"loss": 0.2091,
"step": 1375
},
{
"epoch": 1.927698218651764,
"grad_norm": 6.768104076385498,
"learning_rate": 3.002432571663162e-06,
"loss": 0.1916,
"step": 1380
},
{
"epoch": 1.9346838980090815,
"grad_norm": 4.954831600189209,
"learning_rate": 2.9679243608206752e-06,
"loss": 0.1896,
"step": 1385
},
{
"epoch": 1.9416695773663988,
"grad_norm": 5.701454162597656,
"learning_rate": 2.933531707714451e-06,
"loss": 0.2246,
"step": 1390
},
{
"epoch": 1.9486552567237165,
"grad_norm": 4.5505475997924805,
"learning_rate": 2.8992565681462592e-06,
"loss": 0.1776,
"step": 1395
},
{
"epoch": 1.9556409360810338,
"grad_norm": 3.9627044200897217,
"learning_rate": 2.86510089123525e-06,
"loss": 0.1774,
"step": 1400
},
{
"epoch": 1.9626266154383514,
"grad_norm": 5.475313186645508,
"learning_rate": 2.831066619307104e-06,
"loss": 0.2232,
"step": 1405
},
{
"epoch": 1.969612294795669,
"grad_norm": 4.611178874969482,
"learning_rate": 2.797155687783587e-06,
"loss": 0.1724,
"step": 1410
},
{
"epoch": 1.9765979741529862,
"grad_norm": 4.791353225708008,
"learning_rate": 2.7633700250724837e-06,
"loss": 0.2039,
"step": 1415
},
{
"epoch": 1.983583653510304,
"grad_norm": 4.962776184082031,
"learning_rate": 2.7297115524579364e-06,
"loss": 0.1999,
"step": 1420
},
{
"epoch": 1.9905693328676213,
"grad_norm": 5.41670560836792,
"learning_rate": 2.6961821839911873e-06,
"loss": 0.2224,
"step": 1425
},
{
"epoch": 1.997555012224939,
"grad_norm": 4.636608600616455,
"learning_rate": 2.662783826381734e-06,
"loss": 0.1939,
"step": 1430
},
{
"epoch": 2.0041914076143903,
"grad_norm": 2.7747793197631836,
"learning_rate": 2.6295183788888945e-06,
"loss": 0.1141,
"step": 1435
},
{
"epoch": 2.011177086971708,
"grad_norm": 3.0276601314544678,
"learning_rate": 2.5963877332138133e-06,
"loss": 0.0733,
"step": 1440
},
{
"epoch": 2.0181627663290254,
"grad_norm": 2.265000581741333,
"learning_rate": 2.563393773391879e-06,
"loss": 0.0695,
"step": 1445
},
{
"epoch": 2.025148445686343,
"grad_norm": 2.5616984367370605,
"learning_rate": 2.530538375685586e-06,
"loss": 0.076,
"step": 1450
},
{
"epoch": 2.0321341250436604,
"grad_norm": 3.750467300415039,
"learning_rate": 2.4978234084778357e-06,
"loss": 0.0676,
"step": 1455
},
{
"epoch": 2.039119804400978,
"grad_norm": 4.017278671264648,
"learning_rate": 2.4652507321656927e-06,
"loss": 0.0733,
"step": 1460
},
{
"epoch": 2.0461054837582955,
"grad_norm": 3.2065634727478027,
"learning_rate": 2.4328221990545887e-06,
"loss": 0.066,
"step": 1465
},
{
"epoch": 2.053091163115613,
"grad_norm": 2.6953470706939697,
"learning_rate": 2.4005396532529786e-06,
"loss": 0.0628,
"step": 1470
},
{
"epoch": 2.0600768424729305,
"grad_norm": 2.3816425800323486,
"learning_rate": 2.3684049305674858e-06,
"loss": 0.062,
"step": 1475
},
{
"epoch": 2.067062521830248,
"grad_norm": 3.783911943435669,
"learning_rate": 2.3364198583984977e-06,
"loss": 0.0767,
"step": 1480
},
{
"epoch": 2.0740482011875656,
"grad_norm": 3.010451316833496,
"learning_rate": 2.304586255636247e-06,
"loss": 0.0588,
"step": 1485
},
{
"epoch": 2.081033880544883,
"grad_norm": 4.173311710357666,
"learning_rate": 2.2729059325573805e-06,
"loss": 0.0651,
"step": 1490
},
{
"epoch": 2.0880195599022007,
"grad_norm": 3.4096760749816895,
"learning_rate": 2.2413806907220125e-06,
"loss": 0.0727,
"step": 1495
},
{
"epoch": 2.095005239259518,
"grad_norm": 2.6200010776519775,
"learning_rate": 2.210012322871272e-06,
"loss": 0.0685,
"step": 1500
},
{
"epoch": 2.1019909186168353,
"grad_norm": 3.109384059906006,
"learning_rate": 2.1788026128253637e-06,
"loss": 0.059,
"step": 1505
},
{
"epoch": 2.108976597974153,
"grad_norm": 3.6358861923217773,
"learning_rate": 2.1477533353821226e-06,
"loss": 0.0584,
"step": 1510
},
{
"epoch": 2.1159622773314704,
"grad_norm": 4.2906293869018555,
"learning_rate": 2.116866256216083e-06,
"loss": 0.0602,
"step": 1515
},
{
"epoch": 2.122947956688788,
"grad_norm": 4.5148491859436035,
"learning_rate": 2.086143131778079e-06,
"loss": 0.0732,
"step": 1520
},
{
"epoch": 2.1299336360461054,
"grad_norm": 3.4261441230773926,
"learning_rate": 2.055585709195356e-06,
"loss": 0.0672,
"step": 1525
},
{
"epoch": 2.136919315403423,
"grad_norm": 2.8189690113067627,
"learning_rate": 2.0251957261722116e-06,
"loss": 0.0554,
"step": 1530
},
{
"epoch": 2.1439049947607405,
"grad_norm": 3.4528841972351074,
"learning_rate": 1.9949749108911886e-06,
"loss": 0.0693,
"step": 1535
},
{
"epoch": 2.150890674118058,
"grad_norm": 5.297241687774658,
"learning_rate": 1.9649249819147976e-06,
"loss": 0.0623,
"step": 1540
},
{
"epoch": 2.1578763534753755,
"grad_norm": 3.1499645709991455,
"learning_rate": 1.9350476480877735e-06,
"loss": 0.0646,
"step": 1545
},
{
"epoch": 2.164862032832693,
"grad_norm": 3.9576337337493896,
"learning_rate": 1.9053446084399153e-06,
"loss": 0.0675,
"step": 1550
},
{
"epoch": 2.1718477121900106,
"grad_norm": 4.909806251525879,
"learning_rate": 1.8758175520894622e-06,
"loss": 0.0725,
"step": 1555
},
{
"epoch": 2.178833391547328,
"grad_norm": 4.15227746963501,
"learning_rate": 1.8464681581470328e-06,
"loss": 0.0632,
"step": 1560
},
{
"epoch": 2.1858190709046457,
"grad_norm": 3.7336344718933105,
"learning_rate": 1.8172980956201502e-06,
"loss": 0.0681,
"step": 1565
},
{
"epoch": 2.192804750261963,
"grad_norm": 3.011342763900757,
"learning_rate": 1.7883090233183238e-06,
"loss": 0.0611,
"step": 1570
},
{
"epoch": 2.1997904296192803,
"grad_norm": 3.7468390464782715,
"learning_rate": 1.7595025897587154e-06,
"loss": 0.0584,
"step": 1575
},
{
"epoch": 2.206776108976598,
"grad_norm": 3.4422414302825928,
"learning_rate": 1.7308804330724021e-06,
"loss": 0.0794,
"step": 1580
},
{
"epoch": 2.2137617883339153,
"grad_norm": 3.476590156555176,
"learning_rate": 1.7024441809112158e-06,
"loss": 0.06,
"step": 1585
},
{
"epoch": 2.220747467691233,
"grad_norm": 3.7604615688323975,
"learning_rate": 1.674195450355179e-06,
"loss": 0.0693,
"step": 1590
},
{
"epoch": 2.2277331470485504,
"grad_norm": 3.4398772716522217,
"learning_rate": 1.6461358478205552e-06,
"loss": 0.0582,
"step": 1595
},
{
"epoch": 2.234718826405868,
"grad_norm": 3.7523179054260254,
"learning_rate": 1.6182669689684982e-06,
"loss": 0.0708,
"step": 1600
},
{
"epoch": 2.2417045057631855,
"grad_norm": 3.6123757362365723,
"learning_rate": 1.5905903986142983e-06,
"loss": 0.0626,
"step": 1605
},
{
"epoch": 2.248690185120503,
"grad_norm": 3.515021562576294,
"learning_rate": 1.5631077106372728e-06,
"loss": 0.0626,
"step": 1610
},
{
"epoch": 2.2556758644778205,
"grad_norm": 3.9685404300689697,
"learning_rate": 1.5358204678912597e-06,
"loss": 0.0735,
"step": 1615
},
{
"epoch": 2.262661543835138,
"grad_norm": 4.189309597015381,
"learning_rate": 1.5087302221157386e-06,
"loss": 0.0706,
"step": 1620
},
{
"epoch": 2.2696472231924556,
"grad_norm": 2.535946846008301,
"learning_rate": 1.481838513847596e-06,
"loss": 0.0579,
"step": 1625
},
{
"epoch": 2.276632902549773,
"grad_norm": 3.503710985183716,
"learning_rate": 1.4551468723335182e-06,
"loss": 0.0579,
"step": 1630
},
{
"epoch": 2.28361858190709,
"grad_norm": 5.277285575866699,
"learning_rate": 1.4286568154430187e-06,
"loss": 0.0749,
"step": 1635
},
{
"epoch": 2.290604261264408,
"grad_norm": 3.7260444164276123,
"learning_rate": 1.4023698495821347e-06,
"loss": 0.0618,
"step": 1640
},
{
"epoch": 2.2975899406217253,
"grad_norm": 3.1638755798339844,
"learning_rate": 1.3762874696077561e-06,
"loss": 0.0741,
"step": 1645
},
{
"epoch": 2.304575619979043,
"grad_norm": 4.0308613777160645,
"learning_rate": 1.3504111587426178e-06,
"loss": 0.0666,
"step": 1650
},
{
"epoch": 2.3115612993363603,
"grad_norm": 3.7784061431884766,
"learning_rate": 1.3247423884909511e-06,
"loss": 0.0737,
"step": 1655
},
{
"epoch": 2.318546978693678,
"grad_norm": 3.080986976623535,
"learning_rate": 1.2992826185548106e-06,
"loss": 0.0627,
"step": 1660
},
{
"epoch": 2.3255326580509954,
"grad_norm": 3.805724859237671,
"learning_rate": 1.2740332967510604e-06,
"loss": 0.0741,
"step": 1665
},
{
"epoch": 2.332518337408313,
"grad_norm": 3.431828022003174,
"learning_rate": 1.248995858929043e-06,
"loss": 0.0584,
"step": 1670
},
{
"epoch": 2.3395040167656305,
"grad_norm": 3.6512904167175293,
"learning_rate": 1.2241717288889272e-06,
"loss": 0.0692,
"step": 1675
},
{
"epoch": 2.3464896961229478,
"grad_norm": 3.4175989627838135,
"learning_rate": 1.1995623183007372e-06,
"loss": 0.0621,
"step": 1680
},
{
"epoch": 2.3534753754802655,
"grad_norm": 3.0239272117614746,
"learning_rate": 1.1751690266240834e-06,
"loss": 0.0648,
"step": 1685
},
{
"epoch": 2.360461054837583,
"grad_norm": 3.577535629272461,
"learning_rate": 1.1509932410285723e-06,
"loss": 0.0679,
"step": 1690
},
{
"epoch": 2.3674467341949006,
"grad_norm": 3.299124002456665,
"learning_rate": 1.1270363363149273e-06,
"loss": 0.0699,
"step": 1695
},
{
"epoch": 2.374432413552218,
"grad_norm": 4.893401622772217,
"learning_rate": 1.1032996748368018e-06,
"loss": 0.0603,
"step": 1700
},
{
"epoch": 2.381418092909535,
"grad_norm": 3.660778284072876,
"learning_rate": 1.0797846064233154e-06,
"loss": 0.0814,
"step": 1705
},
{
"epoch": 2.388403772266853,
"grad_norm": 3.0096163749694824,
"learning_rate": 1.0564924683022865e-06,
"loss": 0.0611,
"step": 1710
},
{
"epoch": 2.3953894516241703,
"grad_norm": 3.9784798622131348,
"learning_rate": 1.0334245850241892e-06,
"loss": 0.0693,
"step": 1715
},
{
"epoch": 2.402375130981488,
"grad_norm": 4.206497669219971,
"learning_rate": 1.0105822683868333e-06,
"loss": 0.0695,
"step": 1720
},
{
"epoch": 2.4093608103388053,
"grad_norm": 3.21100115776062,
"learning_rate": 9.879668173607644e-07,
"loss": 0.0572,
"step": 1725
},
{
"epoch": 2.416346489696123,
"grad_norm": 4.883619785308838,
"learning_rate": 9.655795180153965e-07,
"loss": 0.0678,
"step": 1730
},
{
"epoch": 2.4233321690534404,
"grad_norm": 3.869938373565674,
"learning_rate": 9.434216434458749e-07,
"loss": 0.0602,
"step": 1735
},
{
"epoch": 2.430317848410758,
"grad_norm": 3.6375582218170166,
"learning_rate": 9.214944537006848e-07,
"loss": 0.0492,
"step": 1740
},
{
"epoch": 2.4373035277680755,
"grad_norm": 3.4665791988372803,
"learning_rate": 8.997991957099861e-07,
"loss": 0.0676,
"step": 1745
},
{
"epoch": 2.4442892071253928,
"grad_norm": 2.881965398788452,
"learning_rate": 8.78337103214717e-07,
"loss": 0.0615,
"step": 1750
},
{
"epoch": 2.4512748864827105,
"grad_norm": 3.227743148803711,
"learning_rate": 8.571093966964272e-07,
"loss": 0.0605,
"step": 1755
},
{
"epoch": 2.458260565840028,
"grad_norm": 3.0547025203704834,
"learning_rate": 8.361172833078724e-07,
"loss": 0.062,
"step": 1760
},
{
"epoch": 2.4652462451973456,
"grad_norm": 2.9871292114257812,
"learning_rate": 8.153619568043725e-07,
"loss": 0.0634,
"step": 1765
},
{
"epoch": 2.472231924554663,
"grad_norm": 3.806741952896118,
"learning_rate": 7.948445974759228e-07,
"loss": 0.075,
"step": 1770
},
{
"epoch": 2.47921760391198,
"grad_norm": 3.380242347717285,
"learning_rate": 7.745663720800739e-07,
"loss": 0.0564,
"step": 1775
},
{
"epoch": 2.486203283269298,
"grad_norm": 3.2576658725738525,
"learning_rate": 7.545284337755848e-07,
"loss": 0.0637,
"step": 1780
},
{
"epoch": 2.4931889626266153,
"grad_norm": 4.28541374206543,
"learning_rate": 7.347319220568478e-07,
"loss": 0.0692,
"step": 1785
},
{
"epoch": 2.500174641983933,
"grad_norm": 3.7046804428100586,
"learning_rate": 7.151779626890798e-07,
"loss": 0.0649,
"step": 1790
},
{
"epoch": 2.5071603213412503,
"grad_norm": 3.2380423545837402,
"learning_rate": 6.95867667644316e-07,
"loss": 0.0653,
"step": 1795
},
{
"epoch": 2.514146000698568,
"grad_norm": 3.8117942810058594,
"learning_rate": 6.76802135038167e-07,
"loss": 0.0686,
"step": 1800
},
{
"epoch": 2.5211316800558854,
"grad_norm": 3.508927583694458,
"learning_rate": 6.579824490673742e-07,
"loss": 0.0615,
"step": 1805
},
{
"epoch": 2.528117359413203,
"grad_norm": 3.836540699005127,
"learning_rate": 6.394096799481575e-07,
"loss": 0.0527,
"step": 1810
},
{
"epoch": 2.5351030387705205,
"grad_norm": 3.654384136199951,
"learning_rate": 6.210848838553541e-07,
"loss": 0.0615,
"step": 1815
},
{
"epoch": 2.5420887181278378,
"grad_norm": 4.480383396148682,
"learning_rate": 6.030091028623542e-07,
"loss": 0.0599,
"step": 1820
},
{
"epoch": 2.5490743974851555,
"grad_norm": 3.871952772140503,
"learning_rate": 5.851833648818467e-07,
"loss": 0.053,
"step": 1825
},
{
"epoch": 2.556060076842473,
"grad_norm": 4.014284610748291,
"learning_rate": 5.67608683607363e-07,
"loss": 0.0637,
"step": 1830
},
{
"epoch": 2.5630457561997906,
"grad_norm": 3.3244235515594482,
"learning_rate": 5.502860584556275e-07,
"loss": 0.0645,
"step": 1835
},
{
"epoch": 2.570031435557108,
"grad_norm": 3.213613271713257,
"learning_rate": 5.332164745097301e-07,
"loss": 0.0588,
"step": 1840
},
{
"epoch": 2.577017114914425,
"grad_norm": 3.7526209354400635,
"learning_rate": 5.164009024631067e-07,
"loss": 0.0593,
"step": 1845
},
{
"epoch": 2.584002794271743,
"grad_norm": 4.042670249938965,
"learning_rate": 4.998402985643319e-07,
"loss": 0.0648,
"step": 1850
},
{
"epoch": 2.5909884736290603,
"grad_norm": 3.8450632095336914,
"learning_rate": 4.83535604562751e-07,
"loss": 0.0621,
"step": 1855
},
{
"epoch": 2.597974152986378,
"grad_norm": 4.661925315856934,
"learning_rate": 4.6748774765491666e-07,
"loss": 0.0559,
"step": 1860
},
{
"epoch": 2.6049598323436953,
"grad_norm": 4.002378940582275,
"learning_rate": 4.516976404318657e-07,
"loss": 0.0658,
"step": 1865
},
{
"epoch": 2.611945511701013,
"grad_norm": 3.1322593688964844,
"learning_rate": 4.3616618082722384e-07,
"loss": 0.052,
"step": 1870
},
{
"epoch": 2.6189311910583304,
"grad_norm": 4.374446392059326,
"learning_rate": 4.2089425206614356e-07,
"loss": 0.0581,
"step": 1875
},
{
"epoch": 2.625916870415648,
"grad_norm": 4.130561828613281,
"learning_rate": 4.0588272261507245e-07,
"loss": 0.0714,
"step": 1880
},
{
"epoch": 2.6329025497729655,
"grad_norm": 3.0091750621795654,
"learning_rate": 3.911324461323729e-07,
"loss": 0.068,
"step": 1885
},
{
"epoch": 2.6398882291302828,
"grad_norm": 3.022900342941284,
"learning_rate": 3.766442614197746e-07,
"loss": 0.0613,
"step": 1890
},
{
"epoch": 2.6468739084876005,
"grad_norm": 3.2522053718566895,
"learning_rate": 3.6241899237467347e-07,
"loss": 0.0676,
"step": 1895
},
{
"epoch": 2.653859587844918,
"grad_norm": 2.8870651721954346,
"learning_rate": 3.4845744794327896e-07,
"loss": 0.0592,
"step": 1900
},
{
"epoch": 2.6608452672022356,
"grad_norm": 3.692023992538452,
"learning_rate": 3.347604220746176e-07,
"loss": 0.0634,
"step": 1905
},
{
"epoch": 2.667830946559553,
"grad_norm": 3.13959002494812,
"learning_rate": 3.213286936753729e-07,
"loss": 0.0573,
"step": 1910
},
{
"epoch": 2.67481662591687,
"grad_norm": 3.258509635925293,
"learning_rate": 3.081630265656016e-07,
"loss": 0.0771,
"step": 1915
},
{
"epoch": 2.681802305274188,
"grad_norm": 3.533224582672119,
"learning_rate": 2.9526416943529314e-07,
"loss": 0.062,
"step": 1920
},
{
"epoch": 2.6887879846315053,
"grad_norm": 3.2382559776306152,
"learning_rate": 2.8263285580179325e-07,
"loss": 0.0576,
"step": 1925
},
{
"epoch": 2.695773663988823,
"grad_norm": 3.944854497909546,
"learning_rate": 2.7026980396809235e-07,
"loss": 0.059,
"step": 1930
},
{
"epoch": 2.7027593433461403,
"grad_norm": 2.705212116241455,
"learning_rate": 2.581757169819793e-07,
"loss": 0.0505,
"step": 1935
},
{
"epoch": 2.709745022703458,
"grad_norm": 4.2093610763549805,
"learning_rate": 2.46351282596059e-07,
"loss": 0.0568,
"step": 1940
},
{
"epoch": 2.7167307020607754,
"grad_norm": 3.779445171356201,
"learning_rate": 2.347971732286425e-07,
"loss": 0.0614,
"step": 1945
},
{
"epoch": 2.723716381418093,
"grad_norm": 3.2959094047546387,
"learning_rate": 2.2351404592550918e-07,
"loss": 0.0555,
"step": 1950
},
{
"epoch": 2.7307020607754104,
"grad_norm": 3.7622032165527344,
"learning_rate": 2.1250254232254518e-07,
"loss": 0.0487,
"step": 1955
},
{
"epoch": 2.7376877401327278,
"grad_norm": 2.774178981781006,
"learning_rate": 2.01763288609248e-07,
"loss": 0.0594,
"step": 1960
},
{
"epoch": 2.7446734194900455,
"grad_norm": 3.630791664123535,
"learning_rate": 1.9129689549312602e-07,
"loss": 0.0517,
"step": 1965
},
{
"epoch": 2.751659098847363,
"grad_norm": 3.184044122695923,
"learning_rate": 1.8110395816496595e-07,
"loss": 0.05,
"step": 1970
},
{
"epoch": 2.7586447782046806,
"grad_norm": 3.227968692779541,
"learning_rate": 1.711850562649825e-07,
"loss": 0.0626,
"step": 1975
},
{
"epoch": 2.765630457561998,
"grad_norm": 4.061306953430176,
"learning_rate": 1.6154075384986202e-07,
"loss": 0.0594,
"step": 1980
},
{
"epoch": 2.772616136919315,
"grad_norm": 3.1833701133728027,
"learning_rate": 1.5217159936068314e-07,
"loss": 0.06,
"step": 1985
},
{
"epoch": 2.779601816276633,
"grad_norm": 3.1657423973083496,
"learning_rate": 1.4307812559172796e-07,
"loss": 0.0656,
"step": 1990
},
{
"epoch": 2.7865874956339503,
"grad_norm": 3.4851529598236084,
"learning_rate": 1.342608496601866e-07,
"loss": 0.0591,
"step": 1995
},
{
"epoch": 2.793573174991268,
"grad_norm": 4.401630878448486,
"learning_rate": 1.257202729767487e-07,
"loss": 0.0659,
"step": 2000
},
{
"epoch": 2.8005588543485853,
"grad_norm": 2.809574842453003,
"learning_rate": 1.1745688121708843e-07,
"loss": 0.0546,
"step": 2005
},
{
"epoch": 2.8075445337059026,
"grad_norm": 4.462152004241943,
"learning_rate": 1.094711442942481e-07,
"loss": 0.0626,
"step": 2010
},
{
"epoch": 2.8145302130632204,
"grad_norm": 3.0803418159484863,
"learning_rate": 1.0176351633191583e-07,
"loss": 0.0594,
"step": 2015
},
{
"epoch": 2.821515892420538,
"grad_norm": 3.0584282875061035,
"learning_rate": 9.433443563859667e-08,
"loss": 0.0544,
"step": 2020
},
{
"epoch": 2.8285015717778554,
"grad_norm": 3.364630937576294,
"learning_rate": 8.718432468269312e-08,
"loss": 0.053,
"step": 2025
},
{
"epoch": 2.8354872511351727,
"grad_norm": 3.150378465652466,
"learning_rate": 8.03135900684776e-08,
"loss": 0.0485,
"step": 2030
},
{
"epoch": 2.8424729304924905,
"grad_norm": 3.073679208755493,
"learning_rate": 7.372262251297002e-08,
"loss": 0.0481,
"step": 2035
},
{
"epoch": 2.849458609849808,
"grad_norm": 3.3093550205230713,
"learning_rate": 6.741179682371979e-08,
"loss": 0.048,
"step": 2040
},
{
"epoch": 2.8564442892071256,
"grad_norm": 4.013802528381348,
"learning_rate": 6.138147187749133e-08,
"loss": 0.0654,
"step": 2045
},
{
"epoch": 2.863429968564443,
"grad_norm": 4.480405807495117,
"learning_rate": 5.563199059985591e-08,
"loss": 0.0632,
"step": 2050
},
{
"epoch": 2.87041564792176,
"grad_norm": 4.415154457092285,
"learning_rate": 5.0163679945691155e-08,
"loss": 0.0652,
"step": 2055
},
{
"epoch": 2.877401327279078,
"grad_norm": 3.2409656047821045,
"learning_rate": 4.497685088058701e-08,
"loss": 0.0566,
"step": 2060
},
{
"epoch": 2.8843870066363952,
"grad_norm": 2.8262383937835693,
"learning_rate": 4.0071798363162105e-08,
"loss": 0.0535,
"step": 2065
},
{
"epoch": 2.891372685993713,
"grad_norm": 4.663197994232178,
"learning_rate": 3.5448801328293316e-08,
"loss": 0.0642,
"step": 2070
},
{
"epoch": 2.8983583653510303,
"grad_norm": 3.1666250228881836,
"learning_rate": 3.110812267124841e-08,
"loss": 0.0587,
"step": 2075
},
{
"epoch": 2.9053440447083476,
"grad_norm": 3.907501220703125,
"learning_rate": 2.7050009232741927e-08,
"loss": 0.0581,
"step": 2080
},
{
"epoch": 2.9123297240656654,
"grad_norm": 3.8790831565856934,
"learning_rate": 2.327469178489139e-08,
"loss": 0.0584,
"step": 2085
},
{
"epoch": 2.919315403422983,
"grad_norm": 4.372128486633301,
"learning_rate": 1.97823850181017e-08,
"loss": 0.0588,
"step": 2090
},
{
"epoch": 2.9263010827803004,
"grad_norm": 5.066431045532227,
"learning_rate": 1.657328752884879e-08,
"loss": 0.0658,
"step": 2095
},
{
"epoch": 2.9332867621376177,
"grad_norm": 4.35849142074585,
"learning_rate": 1.3647581808393096e-08,
"loss": 0.0621,
"step": 2100
},
{
"epoch": 2.9402724414949355,
"grad_norm": 3.5087194442749023,
"learning_rate": 1.1005434232396196e-08,
"loss": 0.0543,
"step": 2105
},
{
"epoch": 2.947258120852253,
"grad_norm": 3.806771993637085,
"learning_rate": 8.646995051461715e-09,
"loss": 0.0563,
"step": 2110
},
{
"epoch": 2.9542438002095706,
"grad_norm": 3.2938411235809326,
"learning_rate": 6.57239838259216e-09,
"loss": 0.0574,
"step": 2115
},
{
"epoch": 2.961229479566888,
"grad_norm": 2.854013681411743,
"learning_rate": 4.7817622015611244e-09,
"loss": 0.052,
"step": 2120
},
{
"epoch": 2.968215158924205,
"grad_norm": 3.962702512741089,
"learning_rate": 3.2751883362036606e-09,
"loss": 0.0491,
"step": 2125
},
{
"epoch": 2.975200838281523,
"grad_norm": 3.7732391357421875,
"learning_rate": 2.0527624606264674e-09,
"loss": 0.0548,
"step": 2130
},
{
"epoch": 2.9821865176388402,
"grad_norm": 3.932664155960083,
"learning_rate": 1.1145540903362285e-09,
"loss": 0.0618,
"step": 2135
},
{
"epoch": 2.989172196996158,
"grad_norm": 3.8951032161712646,
"learning_rate": 4.606165782855554e-10,
"loss": 0.0604,
"step": 2140
},
{
"epoch": 2.9961578763534753,
"grad_norm": 2.8378732204437256,
"learning_rate": 9.09871118387473e-11,
"loss": 0.0609,
"step": 2145
}
],
"logging_steps": 5,
"max_steps": 2148,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.9054767696141353e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}