{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1432, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006985679357317499, "grad_norm": 9.417793273925781, "learning_rate": 6.153846153846155e-07, "loss": 0.4856, "step": 5 }, { "epoch": 0.013971358714634998, "grad_norm": 7.6536865234375, "learning_rate": 1.3846153846153848e-06, "loss": 0.4457, "step": 10 }, { "epoch": 0.020957038071952497, "grad_norm": 6.9425950050354, "learning_rate": 2.153846153846154e-06, "loss": 0.4359, "step": 15 }, { "epoch": 0.027942717429269997, "grad_norm": 7.008752822875977, "learning_rate": 2.9230769230769236e-06, "loss": 0.4247, "step": 20 }, { "epoch": 0.0349283967865875, "grad_norm": 7.943561553955078, "learning_rate": 3.692307692307693e-06, "loss": 0.4144, "step": 25 }, { "epoch": 0.041914076143904994, "grad_norm": 8.960087776184082, "learning_rate": 4.461538461538462e-06, "loss": 0.4095, "step": 30 }, { "epoch": 0.0488997555012225, "grad_norm": 9.126522064208984, "learning_rate": 5.230769230769232e-06, "loss": 0.4195, "step": 35 }, { "epoch": 0.055885434858539994, "grad_norm": 6.81967306137085, "learning_rate": 6e-06, "loss": 0.409, "step": 40 }, { "epoch": 0.06287111421585749, "grad_norm": 9.193132400512695, "learning_rate": 6.76923076923077e-06, "loss": 0.4256, "step": 45 }, { "epoch": 0.069856793573175, "grad_norm": 7.224976062774658, "learning_rate": 7.538461538461539e-06, "loss": 0.41, "step": 50 }, { "epoch": 0.07684247293049248, "grad_norm": 7.260319232940674, "learning_rate": 8.307692307692309e-06, "loss": 0.4534, "step": 55 }, { "epoch": 0.08382815228780999, "grad_norm": 8.081445693969727, "learning_rate": 9.076923076923078e-06, "loss": 0.4435, "step": 60 }, { "epoch": 0.09081383164512749, "grad_norm": 8.150150299072266, "learning_rate": 9.846153846153848e-06, "loss": 0.4457, "step": 65 }, { "epoch": 0.097799511002445, "grad_norm": 6.927804946899414, "learning_rate": 9.999909012888162e-06, "loss": 0.4559, "step": 70 }, { "epoch": 0.10478519035976248, "grad_norm": 7.140408039093018, "learning_rate": 9.999539383421715e-06, "loss": 0.4217, "step": 75 }, { "epoch": 0.11177086971707999, "grad_norm": 7.418540000915527, "learning_rate": 9.998885445909666e-06, "loss": 0.4857, "step": 80 }, { "epoch": 0.11875654907439749, "grad_norm": 8.253525733947754, "learning_rate": 9.997947237539373e-06, "loss": 0.4863, "step": 85 }, { "epoch": 0.12574222843171498, "grad_norm": 6.550653457641602, "learning_rate": 9.996724811663797e-06, "loss": 0.4663, "step": 90 }, { "epoch": 0.13272790778903248, "grad_norm": 6.81130313873291, "learning_rate": 9.99521823779844e-06, "loss": 0.4684, "step": 95 }, { "epoch": 0.13971358714635, "grad_norm": 7.913576602935791, "learning_rate": 9.99342760161741e-06, "loss": 0.5049, "step": 100 }, { "epoch": 0.1466992665036675, "grad_norm": 8.523401260375977, "learning_rate": 9.991353004948538e-06, "loss": 0.4635, "step": 105 }, { "epoch": 0.15368494586098497, "grad_norm": 6.084781169891357, "learning_rate": 9.988994565767604e-06, "loss": 0.4752, "step": 110 }, { "epoch": 0.16067062521830247, "grad_norm": 5.975230693817139, "learning_rate": 9.986352418191608e-06, "loss": 0.4722, "step": 115 }, { "epoch": 0.16765630457561997, "grad_norm": 6.941432476043701, "learning_rate": 9.983426712471151e-06, "loss": 0.4991, "step": 120 }, { "epoch": 0.17464198393293748, "grad_norm": 6.190479755401611, "learning_rate": 9.9802176149819e-06, "loss": 0.4821, "step": 125 }, { "epoch": 0.18162766329025498, "grad_norm": 6.3899006843566895, "learning_rate": 9.976725308215109e-06, "loss": 0.5163, "step": 130 }, { "epoch": 0.18861334264757249, "grad_norm": 5.7873430252075195, "learning_rate": 9.97294999076726e-06, "loss": 0.4517, "step": 135 }, { "epoch": 0.19559902200489, "grad_norm": 6.523060321807861, "learning_rate": 9.968891877328752e-06, "loss": 0.4632, "step": 140 }, { "epoch": 0.20258470136220746, "grad_norm": 7.010469436645508, "learning_rate": 9.964551198671709e-06, "loss": 0.4643, "step": 145 }, { "epoch": 0.20957038071952497, "grad_norm": 6.9780473709106445, "learning_rate": 9.95992820163684e-06, "loss": 0.4737, "step": 150 }, { "epoch": 0.21655606007684247, "grad_norm": 6.335629463195801, "learning_rate": 9.955023149119413e-06, "loss": 0.4552, "step": 155 }, { "epoch": 0.22354173943415998, "grad_norm": 5.958488464355469, "learning_rate": 9.949836320054309e-06, "loss": 0.4553, "step": 160 }, { "epoch": 0.23052741879147748, "grad_norm": 6.195608139038086, "learning_rate": 9.944368009400145e-06, "loss": 0.5117, "step": 165 }, { "epoch": 0.23751309814879498, "grad_norm": 6.441509246826172, "learning_rate": 9.93861852812251e-06, "loss": 0.4652, "step": 170 }, { "epoch": 0.24449877750611246, "grad_norm": 5.994947910308838, "learning_rate": 9.932588203176281e-06, "loss": 0.4832, "step": 175 }, { "epoch": 0.25148445686342996, "grad_norm": 6.672873020172119, "learning_rate": 9.92627737748703e-06, "loss": 0.4827, "step": 180 }, { "epoch": 0.2584701362207475, "grad_norm": 6.571279048919678, "learning_rate": 9.919686409931523e-06, "loss": 0.5003, "step": 185 }, { "epoch": 0.26545581557806497, "grad_norm": 6.049185276031494, "learning_rate": 9.912815675317307e-06, "loss": 0.4905, "step": 190 }, { "epoch": 0.27244149493538244, "grad_norm": 6.473181247711182, "learning_rate": 9.905665564361403e-06, "loss": 0.4843, "step": 195 }, { "epoch": 0.2794271742927, "grad_norm": 7.581283092498779, "learning_rate": 9.898236483668086e-06, "loss": 0.5097, "step": 200 }, { "epoch": 0.28641285365001745, "grad_norm": 7.333038806915283, "learning_rate": 9.890528855705753e-06, "loss": 0.4698, "step": 205 }, { "epoch": 0.293398533007335, "grad_norm": 6.216223239898682, "learning_rate": 9.882543118782913e-06, "loss": 0.5034, "step": 210 }, { "epoch": 0.30038421236465246, "grad_norm": 6.861793041229248, "learning_rate": 9.874279727023253e-06, "loss": 0.4756, "step": 215 }, { "epoch": 0.30736989172196993, "grad_norm": 6.37476110458374, "learning_rate": 9.865739150339815e-06, "loss": 0.5117, "step": 220 }, { "epoch": 0.31435557107928747, "grad_norm": 5.816511154174805, "learning_rate": 9.856921874408272e-06, "loss": 0.4925, "step": 225 }, { "epoch": 0.32134125043660494, "grad_norm": 6.3242692947387695, "learning_rate": 9.847828400639317e-06, "loss": 0.4679, "step": 230 }, { "epoch": 0.3283269297939225, "grad_norm": 6.17088508605957, "learning_rate": 9.838459246150138e-06, "loss": 0.4646, "step": 235 }, { "epoch": 0.33531260915123995, "grad_norm": 7.1427812576293945, "learning_rate": 9.828814943735019e-06, "loss": 0.4633, "step": 240 }, { "epoch": 0.3422982885085575, "grad_norm": 6.307351112365723, "learning_rate": 9.818896041835036e-06, "loss": 0.4586, "step": 245 }, { "epoch": 0.34928396786587496, "grad_norm": 6.033569812774658, "learning_rate": 9.808703104506874e-06, "loss": 0.5144, "step": 250 }, { "epoch": 0.35626964722319243, "grad_norm": 6.253153324127197, "learning_rate": 9.798236711390754e-06, "loss": 0.4936, "step": 255 }, { "epoch": 0.36325532658050996, "grad_norm": 6.369323253631592, "learning_rate": 9.787497457677456e-06, "loss": 0.5142, "step": 260 }, { "epoch": 0.37024100593782744, "grad_norm": 7.059694766998291, "learning_rate": 9.776485954074492e-06, "loss": 0.508, "step": 265 }, { "epoch": 0.37722668529514497, "grad_norm": 5.437386989593506, "learning_rate": 9.765202826771358e-06, "loss": 0.4517, "step": 270 }, { "epoch": 0.38421236465246245, "grad_norm": 6.353536605834961, "learning_rate": 9.753648717403944e-06, "loss": 0.5177, "step": 275 }, { "epoch": 0.39119804400978, "grad_norm": 6.7363481521606445, "learning_rate": 9.741824283018022e-06, "loss": 0.4768, "step": 280 }, { "epoch": 0.39818372336709745, "grad_norm": 6.804571151733398, "learning_rate": 9.729730196031908e-06, "loss": 0.5009, "step": 285 }, { "epoch": 0.40516940272441493, "grad_norm": 6.188082695007324, "learning_rate": 9.717367144198209e-06, "loss": 0.4975, "step": 290 }, { "epoch": 0.41215508208173246, "grad_norm": 6.352089881896973, "learning_rate": 9.704735830564707e-06, "loss": 0.458, "step": 295 }, { "epoch": 0.41914076143904994, "grad_norm": 5.823917388916016, "learning_rate": 9.6918369734344e-06, "loss": 0.4863, "step": 300 }, { "epoch": 0.42612644079636747, "grad_norm": 6.301001071929932, "learning_rate": 9.678671306324627e-06, "loss": 0.5175, "step": 305 }, { "epoch": 0.43311212015368494, "grad_norm": 6.185693740844727, "learning_rate": 9.665239577925384e-06, "loss": 0.4844, "step": 310 }, { "epoch": 0.4400977995110024, "grad_norm": 6.580015182495117, "learning_rate": 9.65154255205672e-06, "loss": 0.4907, "step": 315 }, { "epoch": 0.44708347886831995, "grad_norm": 5.962856769561768, "learning_rate": 9.637581007625328e-06, "loss": 0.4552, "step": 320 }, { "epoch": 0.4540691582256374, "grad_norm": 6.2723774909973145, "learning_rate": 9.623355738580226e-06, "loss": 0.4994, "step": 325 }, { "epoch": 0.46105483758295496, "grad_norm": 6.140941143035889, "learning_rate": 9.608867553867628e-06, "loss": 0.4829, "step": 330 }, { "epoch": 0.46804051694027243, "grad_norm": 6.278982639312744, "learning_rate": 9.594117277384928e-06, "loss": 0.5118, "step": 335 }, { "epoch": 0.47502619629758996, "grad_norm": 6.221822261810303, "learning_rate": 9.579105747933858e-06, "loss": 0.4903, "step": 340 }, { "epoch": 0.48201187565490744, "grad_norm": 6.153563976287842, "learning_rate": 9.563833819172777e-06, "loss": 0.4729, "step": 345 }, { "epoch": 0.4889975550122249, "grad_norm": 5.723773956298828, "learning_rate": 9.548302359568136e-06, "loss": 0.536, "step": 350 }, { "epoch": 0.49598323436954245, "grad_norm": 5.7412238121032715, "learning_rate": 9.532512252345086e-06, "loss": 0.4818, "step": 355 }, { "epoch": 0.5029689137268599, "grad_norm": 5.776915073394775, "learning_rate": 9.516464395437251e-06, "loss": 0.4818, "step": 360 }, { "epoch": 0.5099545930841775, "grad_norm": 6.4714741706848145, "learning_rate": 9.50015970143567e-06, "loss": 0.486, "step": 365 }, { "epoch": 0.516940272441495, "grad_norm": 5.570414066314697, "learning_rate": 9.483599097536895e-06, "loss": 0.4821, "step": 370 }, { "epoch": 0.5239259517988124, "grad_norm": 5.86240816116333, "learning_rate": 9.466783525490271e-06, "loss": 0.5014, "step": 375 }, { "epoch": 0.5309116311561299, "grad_norm": 5.690156936645508, "learning_rate": 9.449713941544375e-06, "loss": 0.505, "step": 380 }, { "epoch": 0.5378973105134475, "grad_norm": 5.273125648498535, "learning_rate": 9.432391316392639e-06, "loss": 0.4738, "step": 385 }, { "epoch": 0.5448829898707649, "grad_norm": 6.2202887535095215, "learning_rate": 9.414816635118154e-06, "loss": 0.5023, "step": 390 }, { "epoch": 0.5518686692280824, "grad_norm": 6.270415782928467, "learning_rate": 9.396990897137647e-06, "loss": 0.4772, "step": 395 }, { "epoch": 0.5588543485854, "grad_norm": 5.550614356994629, "learning_rate": 9.378915116144646e-06, "loss": 0.4556, "step": 400 }, { "epoch": 0.5658400279427175, "grad_norm": 5.853529930114746, "learning_rate": 9.360590320051844e-06, "loss": 0.4942, "step": 405 }, { "epoch": 0.5728257073000349, "grad_norm": 5.692981243133545, "learning_rate": 9.342017550932627e-06, "loss": 0.4772, "step": 410 }, { "epoch": 0.5798113866573524, "grad_norm": 5.6134185791015625, "learning_rate": 9.323197864961835e-06, "loss": 0.4245, "step": 415 }, { "epoch": 0.58679706601467, "grad_norm": 6.195576190948486, "learning_rate": 9.304132332355685e-06, "loss": 0.5083, "step": 420 }, { "epoch": 0.5937827453719874, "grad_norm": 6.21861457824707, "learning_rate": 9.28482203731092e-06, "loss": 0.5366, "step": 425 }, { "epoch": 0.6007684247293049, "grad_norm": 5.109756946563721, "learning_rate": 9.265268077943152e-06, "loss": 0.4774, "step": 430 }, { "epoch": 0.6077541040866224, "grad_norm": 6.137323379516602, "learning_rate": 9.245471566224416e-06, "loss": 0.5125, "step": 435 }, { "epoch": 0.6147397834439399, "grad_norm": 5.771416187286377, "learning_rate": 9.225433627919927e-06, "loss": 0.5453, "step": 440 }, { "epoch": 0.6217254628012574, "grad_norm": 5.52296781539917, "learning_rate": 9.20515540252408e-06, "loss": 0.4663, "step": 445 }, { "epoch": 0.6287111421585749, "grad_norm": 6.380407333374023, "learning_rate": 9.184638043195628e-06, "loss": 0.4752, "step": 450 }, { "epoch": 0.6356968215158925, "grad_norm": 5.940698623657227, "learning_rate": 9.16388271669213e-06, "loss": 0.4309, "step": 455 }, { "epoch": 0.6426825008732099, "grad_norm": 6.604904651641846, "learning_rate": 9.142890603303573e-06, "loss": 0.5043, "step": 460 }, { "epoch": 0.6496681802305274, "grad_norm": 6.345200061798096, "learning_rate": 9.121662896785285e-06, "loss": 0.4991, "step": 465 }, { "epoch": 0.656653859587845, "grad_norm": 5.294460773468018, "learning_rate": 9.100200804290014e-06, "loss": 0.4788, "step": 470 }, { "epoch": 0.6636395389451624, "grad_norm": 5.905223846435547, "learning_rate": 9.078505546299317e-06, "loss": 0.4871, "step": 475 }, { "epoch": 0.6706252183024799, "grad_norm": 4.967826843261719, "learning_rate": 9.056578356554124e-06, "loss": 0.5174, "step": 480 }, { "epoch": 0.6776108976597974, "grad_norm": 5.444486618041992, "learning_rate": 9.034420481984604e-06, "loss": 0.4889, "step": 485 }, { "epoch": 0.684596577017115, "grad_norm": 5.278167724609375, "learning_rate": 9.012033182639238e-06, "loss": 0.4793, "step": 490 }, { "epoch": 0.6915822563744324, "grad_norm": 6.902711391448975, "learning_rate": 8.989417731613169e-06, "loss": 0.5259, "step": 495 }, { "epoch": 0.6985679357317499, "grad_norm": 6.086877346038818, "learning_rate": 8.966575414975813e-06, "loss": 0.5203, "step": 500 }, { "epoch": 0.7055536150890674, "grad_norm": 6.7947564125061035, "learning_rate": 8.943507531697715e-06, "loss": 0.4609, "step": 505 }, { "epoch": 0.7125392944463849, "grad_norm": 6.169280052185059, "learning_rate": 8.920215393576685e-06, "loss": 0.4799, "step": 510 }, { "epoch": 0.7195249738037024, "grad_norm": 6.514042377471924, "learning_rate": 8.8967003251632e-06, "loss": 0.5, "step": 515 }, { "epoch": 0.7265106531610199, "grad_norm": 7.015414714813232, "learning_rate": 8.872963663685075e-06, "loss": 0.5226, "step": 520 }, { "epoch": 0.7334963325183375, "grad_norm": 4.968021869659424, "learning_rate": 8.849006758971429e-06, "loss": 0.4697, "step": 525 }, { "epoch": 0.7404820118756549, "grad_norm": 6.74957799911499, "learning_rate": 8.824830973375917e-06, "loss": 0.5045, "step": 530 }, { "epoch": 0.7474676912329724, "grad_norm": 5.4804816246032715, "learning_rate": 8.800437681699264e-06, "loss": 0.465, "step": 535 }, { "epoch": 0.7544533705902899, "grad_norm": 5.658529281616211, "learning_rate": 8.775828271111074e-06, "loss": 0.474, "step": 540 }, { "epoch": 0.7614390499476074, "grad_norm": 5.418574333190918, "learning_rate": 8.751004141070958e-06, "loss": 0.4904, "step": 545 }, { "epoch": 0.7684247293049249, "grad_norm": 5.922712802886963, "learning_rate": 8.72596670324894e-06, "loss": 0.4721, "step": 550 }, { "epoch": 0.7754104086622424, "grad_norm": 6.200154781341553, "learning_rate": 8.70071738144519e-06, "loss": 0.4902, "step": 555 }, { "epoch": 0.78239608801956, "grad_norm": 5.705479145050049, "learning_rate": 8.67525761150905e-06, "loss": 0.4726, "step": 560 }, { "epoch": 0.7893817673768774, "grad_norm": 6.879328727722168, "learning_rate": 8.649588841257383e-06, "loss": 0.4757, "step": 565 }, { "epoch": 0.7963674467341949, "grad_norm": 7.012612819671631, "learning_rate": 8.623712530392244e-06, "loss": 0.4522, "step": 570 }, { "epoch": 0.8033531260915124, "grad_norm": 6.293166160583496, "learning_rate": 8.597630150417867e-06, "loss": 0.4651, "step": 575 }, { "epoch": 0.8103388054488299, "grad_norm": 5.768970012664795, "learning_rate": 8.571343184556985e-06, "loss": 0.4277, "step": 580 }, { "epoch": 0.8173244848061474, "grad_norm": 6.452169895172119, "learning_rate": 8.544853127666483e-06, "loss": 0.5337, "step": 585 }, { "epoch": 0.8243101641634649, "grad_norm": 5.4347615242004395, "learning_rate": 8.518161486152406e-06, "loss": 0.468, "step": 590 }, { "epoch": 0.8312958435207825, "grad_norm": 5.982383728027344, "learning_rate": 8.491269777884264e-06, "loss": 0.4511, "step": 595 }, { "epoch": 0.8382815228780999, "grad_norm": 5.2124457359313965, "learning_rate": 8.464179532108742e-06, "loss": 0.4667, "step": 600 }, { "epoch": 0.8452672022354174, "grad_norm": 5.130034923553467, "learning_rate": 8.436892289362728e-06, "loss": 0.4551, "step": 605 }, { "epoch": 0.8522528815927349, "grad_norm": 5.86264705657959, "learning_rate": 8.409409601385702e-06, "loss": 0.5026, "step": 610 }, { "epoch": 0.8592385609500524, "grad_norm": 6.521675109863281, "learning_rate": 8.381733031031503e-06, "loss": 0.5017, "step": 615 }, { "epoch": 0.8662242403073699, "grad_norm": 5.794342994689941, "learning_rate": 8.353864152179445e-06, "loss": 0.4638, "step": 620 }, { "epoch": 0.8732099196646874, "grad_norm": 5.721254348754883, "learning_rate": 8.325804549644823e-06, "loss": 0.4652, "step": 625 }, { "epoch": 0.8801955990220048, "grad_norm": 5.914518356323242, "learning_rate": 8.297555819088787e-06, "loss": 0.4613, "step": 630 }, { "epoch": 0.8871812783793224, "grad_norm": 5.939746856689453, "learning_rate": 8.269119566927599e-06, "loss": 0.4599, "step": 635 }, { "epoch": 0.8941669577366399, "grad_norm": 5.655862808227539, "learning_rate": 8.240497410241286e-06, "loss": 0.4602, "step": 640 }, { "epoch": 0.9011526370939574, "grad_norm": 6.510814666748047, "learning_rate": 8.211690976681678e-06, "loss": 0.4582, "step": 645 }, { "epoch": 0.9081383164512749, "grad_norm": 5.543778419494629, "learning_rate": 8.18270190437985e-06, "loss": 0.4777, "step": 650 }, { "epoch": 0.9151239958085924, "grad_norm": 5.059428691864014, "learning_rate": 8.153531841852969e-06, "loss": 0.4756, "step": 655 }, { "epoch": 0.9221096751659099, "grad_norm": 5.040985584259033, "learning_rate": 8.12418244791054e-06, "loss": 0.4742, "step": 660 }, { "epoch": 0.9290953545232273, "grad_norm": 5.490272521972656, "learning_rate": 8.094655391560086e-06, "loss": 0.4614, "step": 665 }, { "epoch": 0.9360810338805449, "grad_norm": 5.865604877471924, "learning_rate": 8.064952351912227e-06, "loss": 0.4671, "step": 670 }, { "epoch": 0.9430667132378624, "grad_norm": 5.491858005523682, "learning_rate": 8.035075018085204e-06, "loss": 0.4624, "step": 675 }, { "epoch": 0.9500523925951799, "grad_norm": 5.839197158813477, "learning_rate": 8.005025089108812e-06, "loss": 0.4811, "step": 680 }, { "epoch": 0.9570380719524973, "grad_norm": 6.362710952758789, "learning_rate": 7.974804273827791e-06, "loss": 0.4829, "step": 685 }, { "epoch": 0.9640237513098149, "grad_norm": 5.933167457580566, "learning_rate": 7.944414290804647e-06, "loss": 0.4895, "step": 690 }, { "epoch": 0.9710094306671324, "grad_norm": 6.338855743408203, "learning_rate": 7.913856868221922e-06, "loss": 0.4729, "step": 695 }, { "epoch": 0.9779951100244498, "grad_norm": 5.022581577301025, "learning_rate": 7.883133743783918e-06, "loss": 0.4772, "step": 700 }, { "epoch": 0.9849807893817674, "grad_norm": 5.898752212524414, "learning_rate": 7.852246664617878e-06, "loss": 0.457, "step": 705 }, { "epoch": 0.9919664687390849, "grad_norm": 4.9067864418029785, "learning_rate": 7.821197387174638e-06, "loss": 0.4577, "step": 710 }, { "epoch": 0.9989521480964024, "grad_norm": 4.425784111022949, "learning_rate": 7.78998767712873e-06, "loss": 0.4683, "step": 715 }, { "epoch": 1.005588543485854, "grad_norm": 3.8555009365081787, "learning_rate": 7.758619309277988e-06, "loss": 0.2635, "step": 720 }, { "epoch": 1.0125742228431716, "grad_norm": 3.948883056640625, "learning_rate": 7.72709406744262e-06, "loss": 0.2383, "step": 725 }, { "epoch": 1.019559902200489, "grad_norm": 6.131858825683594, "learning_rate": 7.695413744363753e-06, "loss": 0.2488, "step": 730 }, { "epoch": 1.0265455815578064, "grad_norm": 6.018847942352295, "learning_rate": 7.663580141601504e-06, "loss": 0.2176, "step": 735 }, { "epoch": 1.033531260915124, "grad_norm": 6.508975982666016, "learning_rate": 7.631595069432515e-06, "loss": 0.2193, "step": 740 }, { "epoch": 1.0405169402724415, "grad_norm": 7.317805767059326, "learning_rate": 7.599460346747024e-06, "loss": 0.2054, "step": 745 }, { "epoch": 1.047502619629759, "grad_norm": 5.333045482635498, "learning_rate": 7.567177800945413e-06, "loss": 0.2133, "step": 750 }, { "epoch": 1.0544882989870765, "grad_norm": 4.751222610473633, "learning_rate": 7.534749267834309e-06, "loss": 0.2313, "step": 755 }, { "epoch": 1.061473978344394, "grad_norm": 4.327038288116455, "learning_rate": 7.5021765915221656e-06, "loss": 0.1994, "step": 760 }, { "epoch": 1.0684596577017116, "grad_norm": 5.050411224365234, "learning_rate": 7.469461624314416e-06, "loss": 0.2121, "step": 765 }, { "epoch": 1.075445337059029, "grad_norm": 5.225903511047363, "learning_rate": 7.436606226608122e-06, "loss": 0.2147, "step": 770 }, { "epoch": 1.0824310164163464, "grad_norm": 5.33516788482666, "learning_rate": 7.403612266786188e-06, "loss": 0.21, "step": 775 }, { "epoch": 1.089416695773664, "grad_norm": 5.235130786895752, "learning_rate": 7.370481621111106e-06, "loss": 0.2034, "step": 780 }, { "epoch": 1.0964023751309815, "grad_norm": 5.345615386962891, "learning_rate": 7.337216173618269e-06, "loss": 0.1926, "step": 785 }, { "epoch": 1.103388054488299, "grad_norm": 5.1708221435546875, "learning_rate": 7.303817816008814e-06, "loss": 0.2309, "step": 790 }, { "epoch": 1.1103737338456166, "grad_norm": 5.851431846618652, "learning_rate": 7.2702884475420645e-06, "loss": 0.2179, "step": 795 }, { "epoch": 1.117359413202934, "grad_norm": 5.048578262329102, "learning_rate": 7.236629974927517e-06, "loss": 0.2269, "step": 800 }, { "epoch": 1.1243450925602514, "grad_norm": 5.8538641929626465, "learning_rate": 7.202844312216415e-06, "loss": 0.2353, "step": 805 }, { "epoch": 1.131330771917569, "grad_norm": 4.558470726013184, "learning_rate": 7.168933380692899e-06, "loss": 0.1886, "step": 810 }, { "epoch": 1.1383164512748865, "grad_norm": 5.602941513061523, "learning_rate": 7.134899108764754e-06, "loss": 0.2056, "step": 815 }, { "epoch": 1.145302130632204, "grad_norm": 5.849012851715088, "learning_rate": 7.1007434318537424e-06, "loss": 0.2279, "step": 820 }, { "epoch": 1.1522878099895215, "grad_norm": 4.99567985534668, "learning_rate": 7.06646829228555e-06, "loss": 0.2233, "step": 825 }, { "epoch": 1.159273489346839, "grad_norm": 4.784215450286865, "learning_rate": 7.0320756391793256e-06, "loss": 0.209, "step": 830 }, { "epoch": 1.1662591687041566, "grad_norm": 5.286474227905273, "learning_rate": 6.9975674283368385e-06, "loss": 0.2066, "step": 835 }, { "epoch": 1.1732448480614739, "grad_norm": 4.656322956085205, "learning_rate": 6.962945622131269e-06, "loss": 0.2251, "step": 840 }, { "epoch": 1.1802305274187914, "grad_norm": 5.176768779754639, "learning_rate": 6.9282121893956026e-06, "loss": 0.2373, "step": 845 }, { "epoch": 1.187216206776109, "grad_norm": 5.336601257324219, "learning_rate": 6.893369105310673e-06, "loss": 0.2128, "step": 850 }, { "epoch": 1.1942018861334265, "grad_norm": 5.231692314147949, "learning_rate": 6.858418351292845e-06, "loss": 0.2069, "step": 855 }, { "epoch": 1.201187565490744, "grad_norm": 5.922867298126221, "learning_rate": 6.823361914881331e-06, "loss": 0.2006, "step": 860 }, { "epoch": 1.2081732448480615, "grad_norm": 5.954738140106201, "learning_rate": 6.788201789625166e-06, "loss": 0.2189, "step": 865 }, { "epoch": 1.215158924205379, "grad_norm": 5.425834655761719, "learning_rate": 6.7529399749698465e-06, "loss": 0.1976, "step": 870 }, { "epoch": 1.2221446035626964, "grad_norm": 4.986149311065674, "learning_rate": 6.717578476143621e-06, "loss": 0.1959, "step": 875 }, { "epoch": 1.229130282920014, "grad_norm": 6.26975679397583, "learning_rate": 6.682119304043464e-06, "loss": 0.2197, "step": 880 }, { "epoch": 1.2361159622773314, "grad_norm": 5.916511058807373, "learning_rate": 6.6465644751207225e-06, "loss": 0.2146, "step": 885 }, { "epoch": 1.243101641634649, "grad_norm": 4.7209601402282715, "learning_rate": 6.610916011266445e-06, "loss": 0.2143, "step": 890 }, { "epoch": 1.2500873209919665, "grad_norm": 5.150550842285156, "learning_rate": 6.575175939696401e-06, "loss": 0.206, "step": 895 }, { "epoch": 1.257073000349284, "grad_norm": 4.861652374267578, "learning_rate": 6.539346292835804e-06, "loss": 0.2198, "step": 900 }, { "epoch": 1.2640586797066016, "grad_norm": 4.239111423492432, "learning_rate": 6.503429108203734e-06, "loss": 0.204, "step": 905 }, { "epoch": 1.2710443590639189, "grad_norm": 5.413881301879883, "learning_rate": 6.467426428297262e-06, "loss": 0.2263, "step": 910 }, { "epoch": 1.2780300384212364, "grad_norm": 5.223084449768066, "learning_rate": 6.431340300475314e-06, "loss": 0.201, "step": 915 }, { "epoch": 1.285015717778554, "grad_norm": 4.555695056915283, "learning_rate": 6.395172776842229e-06, "loss": 0.2117, "step": 920 }, { "epoch": 1.2920013971358715, "grad_norm": 5.600462913513184, "learning_rate": 6.358925914131071e-06, "loss": 0.1974, "step": 925 }, { "epoch": 1.298987076493189, "grad_norm": 7.300204753875732, "learning_rate": 6.322601773586669e-06, "loss": 0.2158, "step": 930 }, { "epoch": 1.3059727558505065, "grad_norm": 4.54530143737793, "learning_rate": 6.2862024208484e-06, "loss": 0.2356, "step": 935 }, { "epoch": 1.312958435207824, "grad_norm": 6.054986000061035, "learning_rate": 6.249729925832716e-06, "loss": 0.2008, "step": 940 }, { "epoch": 1.3199441145651414, "grad_norm": 5.145763874053955, "learning_rate": 6.213186362615444e-06, "loss": 0.2086, "step": 945 }, { "epoch": 1.326929793922459, "grad_norm": 4.843288421630859, "learning_rate": 6.176573809313836e-06, "loss": 0.1962, "step": 950 }, { "epoch": 1.3339154732797764, "grad_norm": 5.428574085235596, "learning_rate": 6.139894347968389e-06, "loss": 0.2361, "step": 955 }, { "epoch": 1.340901152637094, "grad_norm": 5.883815765380859, "learning_rate": 6.103150064424454e-06, "loss": 0.1931, "step": 960 }, { "epoch": 1.3478868319944115, "grad_norm": 4.711780071258545, "learning_rate": 6.066343048213611e-06, "loss": 0.2122, "step": 965 }, { "epoch": 1.354872511351729, "grad_norm": 5.004215240478516, "learning_rate": 6.029475392434855e-06, "loss": 0.2247, "step": 970 }, { "epoch": 1.3618581907090466, "grad_norm": 5.238073825836182, "learning_rate": 5.9925491936355575e-06, "loss": 0.2228, "step": 975 }, { "epoch": 1.3688438700663639, "grad_norm": 5.151479721069336, "learning_rate": 5.955566551692247e-06, "loss": 0.2103, "step": 980 }, { "epoch": 1.3758295494236814, "grad_norm": 5.709561824798584, "learning_rate": 5.918529569691202e-06, "loss": 0.2069, "step": 985 }, { "epoch": 1.382815228780999, "grad_norm": 5.741881370544434, "learning_rate": 5.88144035380884e-06, "loss": 0.2225, "step": 990 }, { "epoch": 1.3898009081383165, "grad_norm": 5.143665790557861, "learning_rate": 5.8443010131919574e-06, "loss": 0.2411, "step": 995 }, { "epoch": 1.396786587495634, "grad_norm": 6.438896179199219, "learning_rate": 5.807113659837792e-06, "loss": 0.2324, "step": 1000 }, { "epoch": 1.4037722668529513, "grad_norm": 4.634160041809082, "learning_rate": 5.769880408473907e-06, "loss": 0.2005, "step": 1005 }, { "epoch": 1.410757946210269, "grad_norm": 4.902849197387695, "learning_rate": 5.732603376437944e-06, "loss": 0.2174, "step": 1010 }, { "epoch": 1.4177436255675864, "grad_norm": 5.148767948150635, "learning_rate": 5.695284683557216e-06, "loss": 0.237, "step": 1015 }, { "epoch": 1.424729304924904, "grad_norm": 5.740781784057617, "learning_rate": 5.6579264520281515e-06, "loss": 0.2197, "step": 1020 }, { "epoch": 1.4317149842822214, "grad_norm": 5.8418097496032715, "learning_rate": 5.620530806295621e-06, "loss": 0.2185, "step": 1025 }, { "epoch": 1.438700663639539, "grad_norm": 5.2934250831604, "learning_rate": 5.583099872932123e-06, "loss": 0.2092, "step": 1030 }, { "epoch": 1.4456863429968565, "grad_norm": 4.844241619110107, "learning_rate": 5.545635780516848e-06, "loss": 0.2084, "step": 1035 }, { "epoch": 1.4526720223541738, "grad_norm": 3.787388324737549, "learning_rate": 5.508140659514644e-06, "loss": 0.2007, "step": 1040 }, { "epoch": 1.4596577017114916, "grad_norm": 5.787258148193359, "learning_rate": 5.470616642154853e-06, "loss": 0.2263, "step": 1045 }, { "epoch": 1.4666433810688089, "grad_norm": 4.828206539154053, "learning_rate": 5.433065862310059e-06, "loss": 0.2041, "step": 1050 }, { "epoch": 1.4736290604261264, "grad_norm": 5.000522613525391, "learning_rate": 5.39549045537475e-06, "loss": 0.2125, "step": 1055 }, { "epoch": 1.480614739783444, "grad_norm": 4.484281063079834, "learning_rate": 5.3578925581438765e-06, "loss": 0.1907, "step": 1060 }, { "epoch": 1.4876004191407615, "grad_norm": 5.4914164543151855, "learning_rate": 5.32027430869134e-06, "loss": 0.2468, "step": 1065 }, { "epoch": 1.494586098498079, "grad_norm": 5.7337188720703125, "learning_rate": 5.28263784624841e-06, "loss": 0.2048, "step": 1070 }, { "epoch": 1.5015717778553963, "grad_norm": 5.739035606384277, "learning_rate": 5.244985311082073e-06, "loss": 0.2097, "step": 1075 }, { "epoch": 1.508557457212714, "grad_norm": 5.063661575317383, "learning_rate": 5.207318844373315e-06, "loss": 0.1988, "step": 1080 }, { "epoch": 1.5155431365700314, "grad_norm": 4.986030578613281, "learning_rate": 5.1696405880953715e-06, "loss": 0.2037, "step": 1085 }, { "epoch": 1.522528815927349, "grad_norm": 5.575538158416748, "learning_rate": 5.13195268489191e-06, "loss": 0.2026, "step": 1090 }, { "epoch": 1.5295144952846664, "grad_norm": 6.1051530838012695, "learning_rate": 5.094257277955187e-06, "loss": 0.2067, "step": 1095 }, { "epoch": 1.536500174641984, "grad_norm": 7.7973761558532715, "learning_rate": 5.056556510904178e-06, "loss": 0.2305, "step": 1100 }, { "epoch": 1.5434858539993015, "grad_norm": 4.967608451843262, "learning_rate": 5.0188525276626675e-06, "loss": 0.1946, "step": 1105 }, { "epoch": 1.5504715333566188, "grad_norm": 4.415831089019775, "learning_rate": 4.981147472337333e-06, "loss": 0.1934, "step": 1110 }, { "epoch": 1.5574572127139366, "grad_norm": 6.092747211456299, "learning_rate": 4.943443489095822e-06, "loss": 0.261, "step": 1115 }, { "epoch": 1.5644428920712539, "grad_norm": 5.598076820373535, "learning_rate": 4.905742722044813e-06, "loss": 0.1967, "step": 1120 }, { "epoch": 1.5714285714285714, "grad_norm": 4.891939640045166, "learning_rate": 4.868047315108091e-06, "loss": 0.2134, "step": 1125 }, { "epoch": 1.578414250785889, "grad_norm": 5.4631667137146, "learning_rate": 4.83035941190463e-06, "loss": 0.2164, "step": 1130 }, { "epoch": 1.5853999301432065, "grad_norm": 4.748926162719727, "learning_rate": 4.792681155626687e-06, "loss": 0.2095, "step": 1135 }, { "epoch": 1.592385609500524, "grad_norm": 5.863819122314453, "learning_rate": 4.75501468891793e-06, "loss": 0.2115, "step": 1140 }, { "epoch": 1.5993712888578413, "grad_norm": 5.605953216552734, "learning_rate": 4.717362153751591e-06, "loss": 0.1883, "step": 1145 }, { "epoch": 1.606356968215159, "grad_norm": 5.130843639373779, "learning_rate": 4.679725691308662e-06, "loss": 0.2082, "step": 1150 }, { "epoch": 1.6133426475724764, "grad_norm": 5.240429401397705, "learning_rate": 4.642107441856125e-06, "loss": 0.2219, "step": 1155 }, { "epoch": 1.620328326929794, "grad_norm": 4.621257305145264, "learning_rate": 4.604509544625252e-06, "loss": 0.1798, "step": 1160 }, { "epoch": 1.6273140062871114, "grad_norm": 4.74717903137207, "learning_rate": 4.566934137689943e-06, "loss": 0.1898, "step": 1165 }, { "epoch": 1.634299685644429, "grad_norm": 4.941923141479492, "learning_rate": 4.529383357845148e-06, "loss": 0.1949, "step": 1170 }, { "epoch": 1.6412853650017465, "grad_norm": 5.3948516845703125, "learning_rate": 4.491859340485356e-06, "loss": 0.1925, "step": 1175 }, { "epoch": 1.6482710443590638, "grad_norm": 5.2328667640686035, "learning_rate": 4.454364219483153e-06, "loss": 0.2067, "step": 1180 }, { "epoch": 1.6552567237163816, "grad_norm": 5.516434669494629, "learning_rate": 4.416900127067879e-06, "loss": 0.2076, "step": 1185 }, { "epoch": 1.6622424030736989, "grad_norm": 5.7635040283203125, "learning_rate": 4.3794691937043804e-06, "loss": 0.2152, "step": 1190 }, { "epoch": 1.6692280824310164, "grad_norm": 4.992877006530762, "learning_rate": 4.342073547971849e-06, "loss": 0.2091, "step": 1195 }, { "epoch": 1.676213761788334, "grad_norm": 5.6958136558532715, "learning_rate": 4.304715316442785e-06, "loss": 0.2059, "step": 1200 }, { "epoch": 1.6831994411456515, "grad_norm": 6.601348400115967, "learning_rate": 4.267396623562057e-06, "loss": 0.2227, "step": 1205 }, { "epoch": 1.690185120502969, "grad_norm": 4.910021781921387, "learning_rate": 4.230119591526095e-06, "loss": 0.2157, "step": 1210 }, { "epoch": 1.6971707998602863, "grad_norm": 5.821145534515381, "learning_rate": 4.19288634016221e-06, "loss": 0.2197, "step": 1215 }, { "epoch": 1.704156479217604, "grad_norm": 5.628259658813477, "learning_rate": 4.155698986808045e-06, "loss": 0.2159, "step": 1220 }, { "epoch": 1.7111421585749214, "grad_norm": 5.30466890335083, "learning_rate": 4.118559646191164e-06, "loss": 0.1895, "step": 1225 }, { "epoch": 1.7181278379322389, "grad_norm": 4.880356311798096, "learning_rate": 4.081470430308799e-06, "loss": 0.2022, "step": 1230 }, { "epoch": 1.7251135172895564, "grad_norm": 5.982080936431885, "learning_rate": 4.044433448307753e-06, "loss": 0.2135, "step": 1235 }, { "epoch": 1.732099196646874, "grad_norm": 5.4156341552734375, "learning_rate": 4.007450806364443e-06, "loss": 0.1854, "step": 1240 }, { "epoch": 1.7390848760041915, "grad_norm": 4.503352165222168, "learning_rate": 3.970524607565146e-06, "loss": 0.1988, "step": 1245 }, { "epoch": 1.7460705553615088, "grad_norm": 4.712128162384033, "learning_rate": 3.93365695178639e-06, "loss": 0.213, "step": 1250 }, { "epoch": 1.7530562347188265, "grad_norm": 6.144698143005371, "learning_rate": 3.896849935575548e-06, "loss": 0.2213, "step": 1255 }, { "epoch": 1.7600419140761439, "grad_norm": 5.0675482749938965, "learning_rate": 3.860105652031612e-06, "loss": 0.1937, "step": 1260 }, { "epoch": 1.7670275934334614, "grad_norm": 5.1114020347595215, "learning_rate": 3.823426190686166e-06, "loss": 0.2159, "step": 1265 }, { "epoch": 1.774013272790779, "grad_norm": 5.182307720184326, "learning_rate": 3.7868136373845577e-06, "loss": 0.2074, "step": 1270 }, { "epoch": 1.7809989521480964, "grad_norm": 5.307186126708984, "learning_rate": 3.7502700741672864e-06, "loss": 0.2072, "step": 1275 }, { "epoch": 1.787984631505414, "grad_norm": 5.354731559753418, "learning_rate": 3.7137975791516025e-06, "loss": 0.2029, "step": 1280 }, { "epoch": 1.7949703108627313, "grad_norm": 5.206900119781494, "learning_rate": 3.6773982264133324e-06, "loss": 0.1993, "step": 1285 }, { "epoch": 1.801955990220049, "grad_norm": 5.040802955627441, "learning_rate": 3.641074085868932e-06, "loss": 0.1863, "step": 1290 }, { "epoch": 1.8089416695773664, "grad_norm": 4.957560062408447, "learning_rate": 3.604827223157773e-06, "loss": 0.2037, "step": 1295 }, { "epoch": 1.8159273489346839, "grad_norm": 5.18066930770874, "learning_rate": 3.5686596995246868e-06, "loss": 0.1939, "step": 1300 }, { "epoch": 1.8229130282920014, "grad_norm": 6.234875202178955, "learning_rate": 3.5325735717027386e-06, "loss": 0.2052, "step": 1305 }, { "epoch": 1.829898707649319, "grad_norm": 5.431389808654785, "learning_rate": 3.496570891796267e-06, "loss": 0.1935, "step": 1310 }, { "epoch": 1.8368843870066365, "grad_norm": 5.478362083435059, "learning_rate": 3.4606537071641966e-06, "loss": 0.207, "step": 1315 }, { "epoch": 1.8438700663639538, "grad_norm": 5.117818832397461, "learning_rate": 3.424824060303601e-06, "loss": 0.2017, "step": 1320 }, { "epoch": 1.8508557457212715, "grad_norm": 4.163613319396973, "learning_rate": 3.389083988733556e-06, "loss": 0.1818, "step": 1325 }, { "epoch": 1.8578414250785888, "grad_norm": 4.954441070556641, "learning_rate": 3.3534355248792787e-06, "loss": 0.1909, "step": 1330 }, { "epoch": 1.8648271044359064, "grad_norm": 5.696218490600586, "learning_rate": 3.3178806959565378e-06, "loss": 0.2004, "step": 1335 }, { "epoch": 1.871812783793224, "grad_norm": 5.73183536529541, "learning_rate": 3.282421523856381e-06, "loss": 0.1965, "step": 1340 }, { "epoch": 1.8787984631505414, "grad_norm": 5.084606647491455, "learning_rate": 3.247060025030156e-06, "loss": 0.2062, "step": 1345 }, { "epoch": 1.885784142507859, "grad_norm": 6.37736177444458, "learning_rate": 3.2117982103748358e-06, "loss": 0.2164, "step": 1350 }, { "epoch": 1.8927698218651763, "grad_norm": 4.8795013427734375, "learning_rate": 3.1766380851186695e-06, "loss": 0.2003, "step": 1355 }, { "epoch": 1.899755501222494, "grad_norm": 5.402559757232666, "learning_rate": 3.141581648707155e-06, "loss": 0.188, "step": 1360 }, { "epoch": 1.9067411805798113, "grad_norm": 5.172591209411621, "learning_rate": 3.106630894689328e-06, "loss": 0.1847, "step": 1365 }, { "epoch": 1.9137268599371289, "grad_norm": 4.233926773071289, "learning_rate": 3.0717878106043987e-06, "loss": 0.187, "step": 1370 }, { "epoch": 1.9207125392944464, "grad_norm": 4.548036575317383, "learning_rate": 3.0370543778687315e-06, "loss": 0.2091, "step": 1375 }, { "epoch": 1.927698218651764, "grad_norm": 6.768104076385498, "learning_rate": 3.002432571663162e-06, "loss": 0.1916, "step": 1380 }, { "epoch": 1.9346838980090815, "grad_norm": 4.954831600189209, "learning_rate": 2.9679243608206752e-06, "loss": 0.1896, "step": 1385 }, { "epoch": 1.9416695773663988, "grad_norm": 5.701454162597656, "learning_rate": 2.933531707714451e-06, "loss": 0.2246, "step": 1390 }, { "epoch": 1.9486552567237165, "grad_norm": 4.5505475997924805, "learning_rate": 2.8992565681462592e-06, "loss": 0.1776, "step": 1395 }, { "epoch": 1.9556409360810338, "grad_norm": 3.9627044200897217, "learning_rate": 2.86510089123525e-06, "loss": 0.1774, "step": 1400 }, { "epoch": 1.9626266154383514, "grad_norm": 5.475313186645508, "learning_rate": 2.831066619307104e-06, "loss": 0.2232, "step": 1405 }, { "epoch": 1.969612294795669, "grad_norm": 4.611178874969482, "learning_rate": 2.797155687783587e-06, "loss": 0.1724, "step": 1410 }, { "epoch": 1.9765979741529862, "grad_norm": 4.791353225708008, "learning_rate": 2.7633700250724837e-06, "loss": 0.2039, "step": 1415 }, { "epoch": 1.983583653510304, "grad_norm": 4.962776184082031, "learning_rate": 2.7297115524579364e-06, "loss": 0.1999, "step": 1420 }, { "epoch": 1.9905693328676213, "grad_norm": 5.41670560836792, "learning_rate": 2.6961821839911873e-06, "loss": 0.2224, "step": 1425 }, { "epoch": 1.997555012224939, "grad_norm": 4.636608600616455, "learning_rate": 2.662783826381734e-06, "loss": 0.1939, "step": 1430 } ], "logging_steps": 5, "max_steps": 2148, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9369845131879383e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }