{ "best_global_step": 48000, "best_metric": 4.58038854598999, "best_model_checkpoint": "./model_save/checkpoint-48000", "epoch": 1.0, "eval_steps": 1000, "global_step": 48833, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010238977740462392, "grad_norm": 3.237281322479248, "learning_rate": 1.2e-06, "loss": 11.7826, "step": 5 }, { "epoch": 0.00020477955480924785, "grad_norm": 3.256563425064087, "learning_rate": 2.6999999999999996e-06, "loss": 11.7333, "step": 10 }, { "epoch": 0.00030716933221387174, "grad_norm": 3.1677451133728027, "learning_rate": 4.2e-06, "loss": 11.6291, "step": 15 }, { "epoch": 0.0004095591096184957, "grad_norm": 3.29661226272583, "learning_rate": 5.7e-06, "loss": 11.476, "step": 20 }, { "epoch": 0.0005119488870231196, "grad_norm": 3.558603048324585, "learning_rate": 7.2e-06, "loss": 11.2652, "step": 25 }, { "epoch": 0.0006143386644277435, "grad_norm": 2.8637077808380127, "learning_rate": 8.7e-06, "loss": 11.0138, "step": 30 }, { "epoch": 0.0007167284418323675, "grad_norm": 2.0960562229156494, "learning_rate": 1.02e-05, "loss": 10.7718, "step": 35 }, { "epoch": 0.0008191182192369914, "grad_norm": 1.7101972103118896, "learning_rate": 1.17e-05, "loss": 10.5829, "step": 40 }, { "epoch": 0.0009215079966416153, "grad_norm": 1.4555027484893799, "learning_rate": 1.3199999999999997e-05, "loss": 10.3681, "step": 45 }, { "epoch": 0.0010238977740462392, "grad_norm": 1.3247417211532593, "learning_rate": 1.47e-05, "loss": 10.2536, "step": 50 }, { "epoch": 0.0011262875514508632, "grad_norm": 1.1643482446670532, "learning_rate": 1.6199999999999997e-05, "loss": 10.1509, "step": 55 }, { "epoch": 0.001228677328855487, "grad_norm": 1.153939127922058, "learning_rate": 1.7699999999999997e-05, "loss": 10.0431, "step": 60 }, { "epoch": 0.001331067106260111, "grad_norm": 1.1378567218780518, "learning_rate": 1.92e-05, "loss": 9.9499, "step": 65 }, { "epoch": 0.001433456883664735, "grad_norm": 1.1022170782089233, "learning_rate": 2.07e-05, "loss": 9.8486, "step": 70 }, { "epoch": 0.0015358466610693588, "grad_norm": 1.0503519773483276, "learning_rate": 2.2199999999999998e-05, "loss": 9.7697, "step": 75 }, { "epoch": 0.0016382364384739828, "grad_norm": 1.0037927627563477, "learning_rate": 2.3699999999999997e-05, "loss": 9.6688, "step": 80 }, { "epoch": 0.0017406262158786068, "grad_norm": 0.9526827335357666, "learning_rate": 2.52e-05, "loss": 9.5799, "step": 85 }, { "epoch": 0.0018430159932832306, "grad_norm": 0.8503618836402893, "learning_rate": 2.6699999999999995e-05, "loss": 9.5157, "step": 90 }, { "epoch": 0.0019454057706878546, "grad_norm": 0.8890340328216553, "learning_rate": 2.8199999999999998e-05, "loss": 9.4274, "step": 95 }, { "epoch": 0.0020477955480924783, "grad_norm": 0.8748540878295898, "learning_rate": 2.97e-05, "loss": 9.3157, "step": 100 }, { "epoch": 0.0021501853254971026, "grad_norm": 0.8950842022895813, "learning_rate": 3.119999999999999e-05, "loss": 9.2445, "step": 105 }, { "epoch": 0.0022525751029017264, "grad_norm": 0.8296220898628235, "learning_rate": 3.2699999999999995e-05, "loss": 9.1356, "step": 110 }, { "epoch": 0.00235496488030635, "grad_norm": 0.8047326803207397, "learning_rate": 3.42e-05, "loss": 9.0434, "step": 115 }, { "epoch": 0.002457354657710974, "grad_norm": 0.8104546070098877, "learning_rate": 3.5699999999999994e-05, "loss": 8.9745, "step": 120 }, { "epoch": 0.002559744435115598, "grad_norm": 0.8774133324623108, "learning_rate": 3.7199999999999996e-05, "loss": 8.9045, "step": 125 }, { "epoch": 0.002662134212520222, "grad_norm": 0.6810584664344788, "learning_rate": 3.87e-05, "loss": 8.8046, "step": 130 }, { "epoch": 0.0027645239899248457, "grad_norm": 0.6244851350784302, "learning_rate": 4.02e-05, "loss": 8.7529, "step": 135 }, { "epoch": 0.00286691376732947, "grad_norm": 0.7336015701293945, "learning_rate": 4.17e-05, "loss": 8.6713, "step": 140 }, { "epoch": 0.0029693035447340937, "grad_norm": 0.7508506774902344, "learning_rate": 4.319999999999999e-05, "loss": 8.6402, "step": 145 }, { "epoch": 0.0030716933221387175, "grad_norm": 0.7344472408294678, "learning_rate": 4.4699999999999996e-05, "loss": 8.5754, "step": 150 }, { "epoch": 0.0031740830995433417, "grad_norm": 0.5286904573440552, "learning_rate": 4.62e-05, "loss": 8.5107, "step": 155 }, { "epoch": 0.0032764728769479655, "grad_norm": 0.5070658326148987, "learning_rate": 4.7699999999999994e-05, "loss": 8.4627, "step": 160 }, { "epoch": 0.0033788626543525893, "grad_norm": 0.5503280162811279, "learning_rate": 4.9199999999999997e-05, "loss": 8.4416, "step": 165 }, { "epoch": 0.0034812524317572135, "grad_norm": 0.5057103633880615, "learning_rate": 5.07e-05, "loss": 8.4661, "step": 170 }, { "epoch": 0.0035836422091618373, "grad_norm": 0.5336150527000427, "learning_rate": 5.2199999999999995e-05, "loss": 8.3401, "step": 175 }, { "epoch": 0.003686031986566461, "grad_norm": 0.4095563590526581, "learning_rate": 5.369999999999999e-05, "loss": 8.2784, "step": 180 }, { "epoch": 0.0037884217639710853, "grad_norm": 0.37456509470939636, "learning_rate": 5.519999999999999e-05, "loss": 8.2367, "step": 185 }, { "epoch": 0.003890811541375709, "grad_norm": 0.4807591736316681, "learning_rate": 5.6699999999999996e-05, "loss": 8.2496, "step": 190 }, { "epoch": 0.003993201318780333, "grad_norm": 0.4516820013523102, "learning_rate": 5.82e-05, "loss": 8.1633, "step": 195 }, { "epoch": 0.004095591096184957, "grad_norm": 0.40326714515686035, "learning_rate": 5.97e-05, "loss": 8.1903, "step": 200 }, { "epoch": 0.0041979808735895805, "grad_norm": 0.4950360357761383, "learning_rate": 6.12e-05, "loss": 8.1316, "step": 205 }, { "epoch": 0.004300370650994205, "grad_norm": 0.6782643795013428, "learning_rate": 6.269999999999999e-05, "loss": 8.1855, "step": 210 }, { "epoch": 0.004402760428398829, "grad_norm": 0.5185160636901855, "learning_rate": 6.419999999999999e-05, "loss": 8.0867, "step": 215 }, { "epoch": 0.004505150205803453, "grad_norm": 0.5909910202026367, "learning_rate": 6.57e-05, "loss": 8.098, "step": 220 }, { "epoch": 0.0046075399832080765, "grad_norm": 0.5086753368377686, "learning_rate": 6.72e-05, "loss": 8.0583, "step": 225 }, { "epoch": 0.0047099297606127, "grad_norm": 0.5272602438926697, "learning_rate": 6.87e-05, "loss": 8.0315, "step": 230 }, { "epoch": 0.004812319538017324, "grad_norm": 0.9392020106315613, "learning_rate": 7.02e-05, "loss": 8.0517, "step": 235 }, { "epoch": 0.004914709315421948, "grad_norm": 0.696977436542511, "learning_rate": 7.17e-05, "loss": 8.0084, "step": 240 }, { "epoch": 0.0050170990928265725, "grad_norm": 0.6232326030731201, "learning_rate": 7.319999999999999e-05, "loss": 7.9619, "step": 245 }, { "epoch": 0.005119488870231196, "grad_norm": 1.3301762342453003, "learning_rate": 7.47e-05, "loss": 7.9713, "step": 250 }, { "epoch": 0.00522187864763582, "grad_norm": 3.402907609939575, "learning_rate": 7.62e-05, "loss": 8.0243, "step": 255 }, { "epoch": 0.005324268425040444, "grad_norm": 0.739121675491333, "learning_rate": 7.769999999999999e-05, "loss": 7.888, "step": 260 }, { "epoch": 0.005426658202445068, "grad_norm": 0.6031880974769592, "learning_rate": 7.92e-05, "loss": 7.8527, "step": 265 }, { "epoch": 0.0055290479798496914, "grad_norm": 0.7053850889205933, "learning_rate": 8.07e-05, "loss": 7.8751, "step": 270 }, { "epoch": 0.005631437757254316, "grad_norm": 0.564606249332428, "learning_rate": 8.22e-05, "loss": 7.7992, "step": 275 }, { "epoch": 0.00573382753465894, "grad_norm": 0.5167285203933716, "learning_rate": 8.37e-05, "loss": 7.7863, "step": 280 }, { "epoch": 0.005836217312063564, "grad_norm": 0.816970944404602, "learning_rate": 8.519999999999998e-05, "loss": 7.7767, "step": 285 }, { "epoch": 0.0059386070894681875, "grad_norm": 0.7622373700141907, "learning_rate": 8.669999999999998e-05, "loss": 7.7498, "step": 290 }, { "epoch": 0.006040996866872811, "grad_norm": 0.7365726828575134, "learning_rate": 8.819999999999999e-05, "loss": 7.769, "step": 295 }, { "epoch": 0.006143386644277435, "grad_norm": 0.5777131915092468, "learning_rate": 8.969999999999998e-05, "loss": 7.7428, "step": 300 }, { "epoch": 0.00624577642168206, "grad_norm": 0.7264725565910339, "learning_rate": 9.12e-05, "loss": 7.7056, "step": 305 }, { "epoch": 0.0063481661990866835, "grad_norm": 0.5708840489387512, "learning_rate": 9.269999999999999e-05, "loss": 7.661, "step": 310 }, { "epoch": 0.006450555976491307, "grad_norm": 0.6288994550704956, "learning_rate": 9.419999999999999e-05, "loss": 7.6555, "step": 315 }, { "epoch": 0.006552945753895931, "grad_norm": 1.0663909912109375, "learning_rate": 9.57e-05, "loss": 7.6393, "step": 320 }, { "epoch": 0.006655335531300555, "grad_norm": 0.7720368504524231, "learning_rate": 9.719999999999999e-05, "loss": 7.6684, "step": 325 }, { "epoch": 0.006757725308705179, "grad_norm": 0.983554482460022, "learning_rate": 9.87e-05, "loss": 7.633, "step": 330 }, { "epoch": 0.006860115086109802, "grad_norm": 0.7408352494239807, "learning_rate": 0.0001002, "loss": 7.6358, "step": 335 }, { "epoch": 0.006962504863514427, "grad_norm": 0.7378109097480774, "learning_rate": 0.00010169999999999999, "loss": 7.5782, "step": 340 }, { "epoch": 0.007064894640919051, "grad_norm": 0.6890093684196472, "learning_rate": 0.00010319999999999999, "loss": 7.5093, "step": 345 }, { "epoch": 0.007167284418323675, "grad_norm": 0.974371612071991, "learning_rate": 0.00010469999999999998, "loss": 7.4945, "step": 350 }, { "epoch": 0.007269674195728298, "grad_norm": 0.6680528521537781, "learning_rate": 0.00010619999999999998, "loss": 7.4819, "step": 355 }, { "epoch": 0.007372063973132922, "grad_norm": 1.2959836721420288, "learning_rate": 0.00010769999999999999, "loss": 7.5094, "step": 360 }, { "epoch": 0.007474453750537546, "grad_norm": 0.6721152663230896, "learning_rate": 0.00010919999999999998, "loss": 7.4758, "step": 365 }, { "epoch": 0.007576843527942171, "grad_norm": 0.6479594111442566, "learning_rate": 0.0001107, "loss": 7.4569, "step": 370 }, { "epoch": 0.0076792333053467944, "grad_norm": 1.128847599029541, "learning_rate": 0.00011219999999999999, "loss": 7.4654, "step": 375 }, { "epoch": 0.007781623082751418, "grad_norm": 0.9047718048095703, "learning_rate": 0.00011369999999999999, "loss": 7.4445, "step": 380 }, { "epoch": 0.007884012860156043, "grad_norm": 0.6440275311470032, "learning_rate": 0.0001152, "loss": 7.3781, "step": 385 }, { "epoch": 0.007986402637560666, "grad_norm": 0.9747110605239868, "learning_rate": 0.00011669999999999999, "loss": 7.378, "step": 390 }, { "epoch": 0.00808879241496529, "grad_norm": 0.8239765167236328, "learning_rate": 0.0001182, "loss": 7.3363, "step": 395 }, { "epoch": 0.008191182192369913, "grad_norm": 1.1421988010406494, "learning_rate": 0.0001197, "loss": 7.3996, "step": 400 }, { "epoch": 0.008293571969774538, "grad_norm": 0.9112151861190796, "learning_rate": 0.00012119999999999999, "loss": 7.4102, "step": 405 }, { "epoch": 0.008395961747179161, "grad_norm": 0.6386898756027222, "learning_rate": 0.00012269999999999997, "loss": 7.3811, "step": 410 }, { "epoch": 0.008498351524583786, "grad_norm": 0.7662557363510132, "learning_rate": 0.00012419999999999998, "loss": 7.3087, "step": 415 }, { "epoch": 0.00860074130198841, "grad_norm": 0.8661086559295654, "learning_rate": 0.0001257, "loss": 7.2616, "step": 420 }, { "epoch": 0.008703131079393033, "grad_norm": 0.7727162837982178, "learning_rate": 0.00012719999999999997, "loss": 7.2938, "step": 425 }, { "epoch": 0.008805520856797658, "grad_norm": 0.6405845284461975, "learning_rate": 0.00012869999999999998, "loss": 7.2836, "step": 430 }, { "epoch": 0.00890791063420228, "grad_norm": 1.2078088521957397, "learning_rate": 0.0001302, "loss": 7.2712, "step": 435 }, { "epoch": 0.009010300411606905, "grad_norm": 0.9001152515411377, "learning_rate": 0.00013169999999999998, "loss": 7.249, "step": 440 }, { "epoch": 0.009112690189011528, "grad_norm": 0.8163766264915466, "learning_rate": 0.00013319999999999999, "loss": 7.2852, "step": 445 }, { "epoch": 0.009215079966416153, "grad_norm": 0.9464946389198303, "learning_rate": 0.0001347, "loss": 7.1945, "step": 450 }, { "epoch": 0.009317469743820778, "grad_norm": 0.8073117733001709, "learning_rate": 0.0001362, "loss": 7.1885, "step": 455 }, { "epoch": 0.0094198595212254, "grad_norm": 0.7544310092926025, "learning_rate": 0.00013769999999999999, "loss": 7.219, "step": 460 }, { "epoch": 0.009522249298630025, "grad_norm": 1.2347111701965332, "learning_rate": 0.0001392, "loss": 7.1837, "step": 465 }, { "epoch": 0.009624639076034648, "grad_norm": 0.7268940210342407, "learning_rate": 0.00014069999999999998, "loss": 7.1897, "step": 470 }, { "epoch": 0.009727028853439273, "grad_norm": 0.9710267186164856, "learning_rate": 0.0001422, "loss": 7.1928, "step": 475 }, { "epoch": 0.009829418630843896, "grad_norm": 0.8327569365501404, "learning_rate": 0.00014369999999999997, "loss": 7.2138, "step": 480 }, { "epoch": 0.00993180840824852, "grad_norm": 0.9703325033187866, "learning_rate": 0.00014519999999999998, "loss": 7.1411, "step": 485 }, { "epoch": 0.010034198185653145, "grad_norm": 1.184667706489563, "learning_rate": 0.0001467, "loss": 7.1319, "step": 490 }, { "epoch": 0.010136587963057768, "grad_norm": 0.8813581466674805, "learning_rate": 0.0001482, "loss": 7.1491, "step": 495 }, { "epoch": 0.010238977740462393, "grad_norm": 0.7553069591522217, "learning_rate": 0.00014969999999999998, "loss": 7.1247, "step": 500 }, { "epoch": 0.010341367517867016, "grad_norm": 0.8524662256240845, "learning_rate": 0.0001512, "loss": 7.0651, "step": 505 }, { "epoch": 0.01044375729527164, "grad_norm": 0.9542851448059082, "learning_rate": 0.0001527, "loss": 7.0811, "step": 510 }, { "epoch": 0.010546147072676265, "grad_norm": 0.839388906955719, "learning_rate": 0.00015419999999999998, "loss": 7.1088, "step": 515 }, { "epoch": 0.010648536850080888, "grad_norm": 0.7219004034996033, "learning_rate": 0.0001557, "loss": 7.073, "step": 520 }, { "epoch": 0.010750926627485512, "grad_norm": 1.1495189666748047, "learning_rate": 0.0001572, "loss": 7.0617, "step": 525 }, { "epoch": 0.010853316404890135, "grad_norm": 0.9143120646476746, "learning_rate": 0.00015869999999999998, "loss": 7.095, "step": 530 }, { "epoch": 0.01095570618229476, "grad_norm": 0.6772273182868958, "learning_rate": 0.0001602, "loss": 7.0889, "step": 535 }, { "epoch": 0.011058095959699383, "grad_norm": 0.6970393657684326, "learning_rate": 0.0001617, "loss": 7.0662, "step": 540 }, { "epoch": 0.011160485737104008, "grad_norm": 0.7849879264831543, "learning_rate": 0.0001632, "loss": 7.0654, "step": 545 }, { "epoch": 0.011262875514508632, "grad_norm": 0.6196251511573792, "learning_rate": 0.0001647, "loss": 7.0607, "step": 550 }, { "epoch": 0.011365265291913255, "grad_norm": 0.9614342451095581, "learning_rate": 0.0001662, "loss": 7.0476, "step": 555 }, { "epoch": 0.01146765506931788, "grad_norm": 0.8006017208099365, "learning_rate": 0.0001677, "loss": 6.9906, "step": 560 }, { "epoch": 0.011570044846722503, "grad_norm": 0.7031441926956177, "learning_rate": 0.00016919999999999997, "loss": 6.9459, "step": 565 }, { "epoch": 0.011672434624127127, "grad_norm": 0.770129919052124, "learning_rate": 0.00017069999999999998, "loss": 6.997, "step": 570 }, { "epoch": 0.01177482440153175, "grad_norm": 0.7633779644966125, "learning_rate": 0.00017219999999999998, "loss": 7.0089, "step": 575 }, { "epoch": 0.011877214178936375, "grad_norm": 1.0391132831573486, "learning_rate": 0.00017369999999999997, "loss": 6.9777, "step": 580 }, { "epoch": 0.011979603956341, "grad_norm": 0.8871012330055237, "learning_rate": 0.00017519999999999998, "loss": 6.9827, "step": 585 }, { "epoch": 0.012081993733745622, "grad_norm": 0.6953468918800354, "learning_rate": 0.00017669999999999999, "loss": 6.9325, "step": 590 }, { "epoch": 0.012184383511150247, "grad_norm": 0.7784765362739563, "learning_rate": 0.00017819999999999997, "loss": 6.9506, "step": 595 }, { "epoch": 0.01228677328855487, "grad_norm": 0.8684126734733582, "learning_rate": 0.00017969999999999998, "loss": 6.9306, "step": 600 }, { "epoch": 0.012389163065959495, "grad_norm": 0.9275881052017212, "learning_rate": 0.00018119999999999999, "loss": 6.9174, "step": 605 }, { "epoch": 0.01249155284336412, "grad_norm": 0.6531779766082764, "learning_rate": 0.00018269999999999997, "loss": 6.9123, "step": 610 }, { "epoch": 0.012593942620768742, "grad_norm": 0.758029043674469, "learning_rate": 0.00018419999999999998, "loss": 6.9776, "step": 615 }, { "epoch": 0.012696332398173367, "grad_norm": 0.7189378142356873, "learning_rate": 0.0001857, "loss": 6.9247, "step": 620 }, { "epoch": 0.01279872217557799, "grad_norm": 0.8125119209289551, "learning_rate": 0.0001872, "loss": 6.8839, "step": 625 }, { "epoch": 0.012901111952982615, "grad_norm": 0.8374565243721008, "learning_rate": 0.00018869999999999998, "loss": 6.9138, "step": 630 }, { "epoch": 0.013003501730387237, "grad_norm": 0.7942729592323303, "learning_rate": 0.0001902, "loss": 6.866, "step": 635 }, { "epoch": 0.013105891507791862, "grad_norm": 0.9105733633041382, "learning_rate": 0.0001917, "loss": 6.876, "step": 640 }, { "epoch": 0.013208281285196487, "grad_norm": 0.873391330242157, "learning_rate": 0.00019319999999999998, "loss": 6.8388, "step": 645 }, { "epoch": 0.01331067106260111, "grad_norm": 0.7067049145698547, "learning_rate": 0.0001947, "loss": 6.8088, "step": 650 }, { "epoch": 0.013413060840005734, "grad_norm": 1.2663843631744385, "learning_rate": 0.0001962, "loss": 6.8106, "step": 655 }, { "epoch": 0.013515450617410357, "grad_norm": 1.0296598672866821, "learning_rate": 0.00019769999999999998, "loss": 6.8277, "step": 660 }, { "epoch": 0.013617840394814982, "grad_norm": 0.810079038143158, "learning_rate": 0.0001992, "loss": 6.8173, "step": 665 }, { "epoch": 0.013720230172219605, "grad_norm": 0.8228099942207336, "learning_rate": 0.0002007, "loss": 6.8861, "step": 670 }, { "epoch": 0.01382261994962423, "grad_norm": 0.7651252746582031, "learning_rate": 0.0002022, "loss": 6.8165, "step": 675 }, { "epoch": 0.013925009727028854, "grad_norm": 0.8029287457466125, "learning_rate": 0.0002037, "loss": 6.7644, "step": 680 }, { "epoch": 0.014027399504433477, "grad_norm": 0.852442741394043, "learning_rate": 0.0002052, "loss": 6.8568, "step": 685 }, { "epoch": 0.014129789281838102, "grad_norm": 0.8873968124389648, "learning_rate": 0.00020669999999999996, "loss": 6.7989, "step": 690 }, { "epoch": 0.014232179059242725, "grad_norm": 0.7122694849967957, "learning_rate": 0.00020819999999999996, "loss": 6.7896, "step": 695 }, { "epoch": 0.01433456883664735, "grad_norm": 0.668544590473175, "learning_rate": 0.00020969999999999997, "loss": 6.7068, "step": 700 }, { "epoch": 0.014436958614051972, "grad_norm": 1.1267056465148926, "learning_rate": 0.00021119999999999996, "loss": 6.7627, "step": 705 }, { "epoch": 0.014539348391456597, "grad_norm": 0.9785383939743042, "learning_rate": 0.00021269999999999997, "loss": 6.7088, "step": 710 }, { "epoch": 0.014641738168861222, "grad_norm": 0.6889818906784058, "learning_rate": 0.00021419999999999998, "loss": 6.7014, "step": 715 }, { "epoch": 0.014744127946265844, "grad_norm": 0.5881128907203674, "learning_rate": 0.00021569999999999998, "loss": 6.7047, "step": 720 }, { "epoch": 0.014846517723670469, "grad_norm": 0.49074748158454895, "learning_rate": 0.00021719999999999997, "loss": 6.7351, "step": 725 }, { "epoch": 0.014948907501075092, "grad_norm": 1.000606656074524, "learning_rate": 0.00021869999999999998, "loss": 6.6648, "step": 730 }, { "epoch": 0.015051297278479717, "grad_norm": 1.2626793384552002, "learning_rate": 0.00022019999999999999, "loss": 6.7403, "step": 735 }, { "epoch": 0.015153687055884341, "grad_norm": 1.0611534118652344, "learning_rate": 0.00022169999999999997, "loss": 6.6834, "step": 740 }, { "epoch": 0.015256076833288964, "grad_norm": 1.1611676216125488, "learning_rate": 0.00022319999999999998, "loss": 6.7988, "step": 745 }, { "epoch": 0.015358466610693589, "grad_norm": 0.6415315270423889, "learning_rate": 0.0002247, "loss": 6.6566, "step": 750 }, { "epoch": 0.015460856388098212, "grad_norm": 0.6816340684890747, "learning_rate": 0.00022619999999999997, "loss": 6.652, "step": 755 }, { "epoch": 0.015563246165502836, "grad_norm": 0.8404628038406372, "learning_rate": 0.00022769999999999998, "loss": 6.6516, "step": 760 }, { "epoch": 0.01566563594290746, "grad_norm": 1.1827441453933716, "learning_rate": 0.0002292, "loss": 6.6355, "step": 765 }, { "epoch": 0.015768025720312086, "grad_norm": 0.5615525841712952, "learning_rate": 0.0002307, "loss": 6.6248, "step": 770 }, { "epoch": 0.015870415497716707, "grad_norm": 0.7266453504562378, "learning_rate": 0.00023219999999999998, "loss": 6.6207, "step": 775 }, { "epoch": 0.01597280527512133, "grad_norm": 0.6322405338287354, "learning_rate": 0.0002337, "loss": 6.6276, "step": 780 }, { "epoch": 0.016075195052525956, "grad_norm": 0.80306077003479, "learning_rate": 0.0002352, "loss": 6.6492, "step": 785 }, { "epoch": 0.01617758482993058, "grad_norm": 0.6725737452507019, "learning_rate": 0.00023669999999999998, "loss": 6.6179, "step": 790 }, { "epoch": 0.016279974607335202, "grad_norm": 0.7415054440498352, "learning_rate": 0.0002382, "loss": 6.6516, "step": 795 }, { "epoch": 0.016382364384739827, "grad_norm": 0.5289933681488037, "learning_rate": 0.0002397, "loss": 6.584, "step": 800 }, { "epoch": 0.01648475416214445, "grad_norm": 0.6528019905090332, "learning_rate": 0.00024119999999999998, "loss": 6.6597, "step": 805 }, { "epoch": 0.016587143939549076, "grad_norm": 0.573921799659729, "learning_rate": 0.0002427, "loss": 6.581, "step": 810 }, { "epoch": 0.0166895337169537, "grad_norm": 0.7144637703895569, "learning_rate": 0.00024419999999999997, "loss": 6.5338, "step": 815 }, { "epoch": 0.016791923494358322, "grad_norm": 0.6318907141685486, "learning_rate": 0.00024569999999999995, "loss": 6.5926, "step": 820 }, { "epoch": 0.016894313271762947, "grad_norm": 0.665605902671814, "learning_rate": 0.0002472, "loss": 6.5585, "step": 825 }, { "epoch": 0.01699670304916757, "grad_norm": 0.5565012097358704, "learning_rate": 0.0002487, "loss": 6.5379, "step": 830 }, { "epoch": 0.017099092826572196, "grad_norm": 0.6355167031288147, "learning_rate": 0.00025019999999999996, "loss": 6.5098, "step": 835 }, { "epoch": 0.01720148260397682, "grad_norm": 0.6501747369766235, "learning_rate": 0.0002517, "loss": 6.5521, "step": 840 }, { "epoch": 0.01730387238138144, "grad_norm": 0.5432541370391846, "learning_rate": 0.0002532, "loss": 6.5465, "step": 845 }, { "epoch": 0.017406262158786066, "grad_norm": 0.561551570892334, "learning_rate": 0.00025469999999999996, "loss": 6.5426, "step": 850 }, { "epoch": 0.01750865193619069, "grad_norm": 0.865897536277771, "learning_rate": 0.0002562, "loss": 6.5195, "step": 855 }, { "epoch": 0.017611041713595316, "grad_norm": 0.7287527918815613, "learning_rate": 0.0002577, "loss": 6.5411, "step": 860 }, { "epoch": 0.017713431490999937, "grad_norm": 0.6758579015731812, "learning_rate": 0.00025919999999999996, "loss": 6.5528, "step": 865 }, { "epoch": 0.01781582126840456, "grad_norm": 0.7459971904754639, "learning_rate": 0.0002607, "loss": 6.5109, "step": 870 }, { "epoch": 0.017918211045809186, "grad_norm": 0.5554007291793823, "learning_rate": 0.0002622, "loss": 6.4838, "step": 875 }, { "epoch": 0.01802060082321381, "grad_norm": 0.7232329845428467, "learning_rate": 0.00026369999999999996, "loss": 6.5336, "step": 880 }, { "epoch": 0.018122990600618435, "grad_norm": 0.48811808228492737, "learning_rate": 0.0002652, "loss": 6.4909, "step": 885 }, { "epoch": 0.018225380378023057, "grad_norm": 0.673035204410553, "learning_rate": 0.0002667, "loss": 6.4889, "step": 890 }, { "epoch": 0.01832777015542768, "grad_norm": 0.5305101275444031, "learning_rate": 0.00026819999999999996, "loss": 6.4806, "step": 895 }, { "epoch": 0.018430159932832306, "grad_norm": 0.5400475263595581, "learning_rate": 0.0002697, "loss": 6.4376, "step": 900 }, { "epoch": 0.01853254971023693, "grad_norm": 0.5476040244102478, "learning_rate": 0.0002712, "loss": 6.4375, "step": 905 }, { "epoch": 0.018634939487641555, "grad_norm": 0.4433288276195526, "learning_rate": 0.00027269999999999996, "loss": 6.4463, "step": 910 }, { "epoch": 0.018737329265046176, "grad_norm": 0.640889048576355, "learning_rate": 0.0002742, "loss": 6.4159, "step": 915 }, { "epoch": 0.0188397190424508, "grad_norm": 0.5932295322418213, "learning_rate": 0.0002757, "loss": 6.4376, "step": 920 }, { "epoch": 0.018942108819855426, "grad_norm": 0.542849600315094, "learning_rate": 0.0002772, "loss": 6.4334, "step": 925 }, { "epoch": 0.01904449859726005, "grad_norm": 0.7035185694694519, "learning_rate": 0.0002787, "loss": 6.3826, "step": 930 }, { "epoch": 0.019146888374664675, "grad_norm": 0.5947180986404419, "learning_rate": 0.0002802, "loss": 6.4046, "step": 935 }, { "epoch": 0.019249278152069296, "grad_norm": 0.45393678545951843, "learning_rate": 0.00028169999999999996, "loss": 6.4006, "step": 940 }, { "epoch": 0.01935166792947392, "grad_norm": 0.5216274261474609, "learning_rate": 0.00028319999999999994, "loss": 6.4666, "step": 945 }, { "epoch": 0.019454057706878546, "grad_norm": 0.6625182032585144, "learning_rate": 0.0002847, "loss": 6.3587, "step": 950 }, { "epoch": 0.01955644748428317, "grad_norm": 0.6020975708961487, "learning_rate": 0.00028619999999999996, "loss": 6.3703, "step": 955 }, { "epoch": 0.01965883726168779, "grad_norm": 0.6037085056304932, "learning_rate": 0.00028769999999999995, "loss": 6.4085, "step": 960 }, { "epoch": 0.019761227039092416, "grad_norm": 0.5771493911743164, "learning_rate": 0.0002892, "loss": 6.5006, "step": 965 }, { "epoch": 0.01986361681649704, "grad_norm": 0.6366599798202515, "learning_rate": 0.00029069999999999996, "loss": 6.3859, "step": 970 }, { "epoch": 0.019966006593901665, "grad_norm": 0.48622769117355347, "learning_rate": 0.00029219999999999995, "loss": 6.3734, "step": 975 }, { "epoch": 0.02006839637130629, "grad_norm": 0.5495675206184387, "learning_rate": 0.0002937, "loss": 6.3456, "step": 980 }, { "epoch": 0.02017078614871091, "grad_norm": 0.48445188999176025, "learning_rate": 0.00029519999999999997, "loss": 6.3319, "step": 985 }, { "epoch": 0.020273175926115536, "grad_norm": 0.5041749477386475, "learning_rate": 0.00029669999999999995, "loss": 6.2705, "step": 990 }, { "epoch": 0.02037556570352016, "grad_norm": 0.7327763438224792, "learning_rate": 0.0002982, "loss": 6.3685, "step": 995 }, { "epoch": 0.020477955480924785, "grad_norm": 0.5152395367622375, "learning_rate": 0.00029969999999999997, "loss": 6.2687, "step": 1000 }, { "epoch": 0.020477955480924785, "eval_loss": 6.336370944976807, "eval_runtime": 128.3744, "eval_samples_per_second": 4.534, "eval_steps_per_second": 0.569, "step": 1000 }, { "epoch": 0.02058034525832941, "grad_norm": 0.6076420545578003, "learning_rate": 0.0003, "loss": 6.2989, "step": 1005 }, { "epoch": 0.02068273503573403, "grad_norm": 0.47126850485801697, "learning_rate": 0.0003, "loss": 6.2849, "step": 1010 }, { "epoch": 0.020785124813138656, "grad_norm": 0.5502199530601501, "learning_rate": 0.0003, "loss": 6.2745, "step": 1015 }, { "epoch": 0.02088751459054328, "grad_norm": 0.42955151200294495, "learning_rate": 0.0003, "loss": 6.2641, "step": 1020 }, { "epoch": 0.020989904367947905, "grad_norm": 0.48553207516670227, "learning_rate": 0.0003, "loss": 6.2998, "step": 1025 }, { "epoch": 0.02109229414535253, "grad_norm": 0.5554366707801819, "learning_rate": 0.0003, "loss": 6.1981, "step": 1030 }, { "epoch": 0.02119468392275715, "grad_norm": 0.41872456669807434, "learning_rate": 0.0003, "loss": 6.2481, "step": 1035 }, { "epoch": 0.021297073700161775, "grad_norm": 0.4277290403842926, "learning_rate": 0.0003, "loss": 6.2365, "step": 1040 }, { "epoch": 0.0213994634775664, "grad_norm": 0.543452799320221, "learning_rate": 0.0003, "loss": 6.2455, "step": 1045 }, { "epoch": 0.021501853254971025, "grad_norm": 0.5797136425971985, "learning_rate": 0.0003, "loss": 6.2531, "step": 1050 }, { "epoch": 0.021604243032375646, "grad_norm": 0.5384876132011414, "learning_rate": 0.0003, "loss": 6.2193, "step": 1055 }, { "epoch": 0.02170663280978027, "grad_norm": 0.7414798140525818, "learning_rate": 0.0003, "loss": 6.1985, "step": 1060 }, { "epoch": 0.021809022587184895, "grad_norm": 0.5251097083091736, "learning_rate": 0.0003, "loss": 6.1753, "step": 1065 }, { "epoch": 0.02191141236458952, "grad_norm": 0.4314242899417877, "learning_rate": 0.0003, "loss": 6.1949, "step": 1070 }, { "epoch": 0.022013802141994145, "grad_norm": 0.3870464861392975, "learning_rate": 0.0003, "loss": 6.1631, "step": 1075 }, { "epoch": 0.022116191919398766, "grad_norm": 0.5488783121109009, "learning_rate": 0.0003, "loss": 6.2236, "step": 1080 }, { "epoch": 0.02221858169680339, "grad_norm": 0.4516567587852478, "learning_rate": 0.0003, "loss": 6.1836, "step": 1085 }, { "epoch": 0.022320971474208015, "grad_norm": 0.44085627794265747, "learning_rate": 0.0003, "loss": 6.1345, "step": 1090 }, { "epoch": 0.02242336125161264, "grad_norm": 0.6064696907997131, "learning_rate": 0.0003, "loss": 6.1789, "step": 1095 }, { "epoch": 0.022525751029017264, "grad_norm": 0.4074086844921112, "learning_rate": 0.0003, "loss": 6.0935, "step": 1100 }, { "epoch": 0.022628140806421886, "grad_norm": 0.4455675482749939, "learning_rate": 0.0003, "loss": 6.163, "step": 1105 }, { "epoch": 0.02273053058382651, "grad_norm": 0.4718489944934845, "learning_rate": 0.0003, "loss": 6.139, "step": 1110 }, { "epoch": 0.022832920361231135, "grad_norm": 0.45494744181632996, "learning_rate": 0.0003, "loss": 6.1668, "step": 1115 }, { "epoch": 0.02293531013863576, "grad_norm": 0.5436424612998962, "learning_rate": 0.0003, "loss": 6.145, "step": 1120 }, { "epoch": 0.023037699916040384, "grad_norm": 0.42465776205062866, "learning_rate": 0.0003, "loss": 6.0842, "step": 1125 }, { "epoch": 0.023140089693445005, "grad_norm": 0.5106245279312134, "learning_rate": 0.0003, "loss": 6.076, "step": 1130 }, { "epoch": 0.02324247947084963, "grad_norm": 0.41347572207450867, "learning_rate": 0.0003, "loss": 6.0926, "step": 1135 }, { "epoch": 0.023344869248254255, "grad_norm": 0.48794978857040405, "learning_rate": 0.0003, "loss": 6.1093, "step": 1140 }, { "epoch": 0.02344725902565888, "grad_norm": 0.47482869029045105, "learning_rate": 0.0003, "loss": 6.087, "step": 1145 }, { "epoch": 0.0235496488030635, "grad_norm": 0.5310367941856384, "learning_rate": 0.0003, "loss": 6.0847, "step": 1150 }, { "epoch": 0.023652038580468125, "grad_norm": 0.4245409071445465, "learning_rate": 0.0003, "loss": 6.0588, "step": 1155 }, { "epoch": 0.02375442835787275, "grad_norm": 0.40255552530288696, "learning_rate": 0.0003, "loss": 6.0829, "step": 1160 }, { "epoch": 0.023856818135277374, "grad_norm": 0.4049079120159149, "learning_rate": 0.0003, "loss": 6.099, "step": 1165 }, { "epoch": 0.023959207912682, "grad_norm": 0.44209587574005127, "learning_rate": 0.0003, "loss": 6.0418, "step": 1170 }, { "epoch": 0.02406159769008662, "grad_norm": 0.4518246650695801, "learning_rate": 0.0003, "loss": 6.0572, "step": 1175 }, { "epoch": 0.024163987467491245, "grad_norm": 0.5068168640136719, "learning_rate": 0.0003, "loss": 5.985, "step": 1180 }, { "epoch": 0.02426637724489587, "grad_norm": 0.7139155268669128, "learning_rate": 0.0003, "loss": 6.0778, "step": 1185 }, { "epoch": 0.024368767022300494, "grad_norm": 0.5174667239189148, "learning_rate": 0.0003, "loss": 6.0872, "step": 1190 }, { "epoch": 0.02447115679970512, "grad_norm": 0.5178516507148743, "learning_rate": 0.0003, "loss": 6.0561, "step": 1195 }, { "epoch": 0.02457354657710974, "grad_norm": 0.4526061415672302, "learning_rate": 0.0003, "loss": 6.0428, "step": 1200 }, { "epoch": 0.024675936354514365, "grad_norm": 0.4358402490615845, "learning_rate": 0.0003, "loss": 6.0126, "step": 1205 }, { "epoch": 0.02477832613191899, "grad_norm": 0.4212406575679779, "learning_rate": 0.0003, "loss": 6.0066, "step": 1210 }, { "epoch": 0.024880715909323614, "grad_norm": 0.4510822296142578, "learning_rate": 0.0003, "loss": 5.9947, "step": 1215 }, { "epoch": 0.02498310568672824, "grad_norm": 0.4986138939857483, "learning_rate": 0.0003, "loss": 6.0421, "step": 1220 }, { "epoch": 0.02508549546413286, "grad_norm": 0.41960692405700684, "learning_rate": 0.0003, "loss": 5.9841, "step": 1225 }, { "epoch": 0.025187885241537485, "grad_norm": 0.3849349319934845, "learning_rate": 0.0003, "loss": 6.0143, "step": 1230 }, { "epoch": 0.02529027501894211, "grad_norm": 0.3687571883201599, "learning_rate": 0.0003, "loss": 5.9718, "step": 1235 }, { "epoch": 0.025392664796346734, "grad_norm": 0.40778475999832153, "learning_rate": 0.0003, "loss": 5.982, "step": 1240 }, { "epoch": 0.025495054573751355, "grad_norm": 0.4962833821773529, "learning_rate": 0.0003, "loss": 5.9226, "step": 1245 }, { "epoch": 0.02559744435115598, "grad_norm": 0.5068868398666382, "learning_rate": 0.0003, "loss": 5.976, "step": 1250 }, { "epoch": 0.025699834128560604, "grad_norm": 0.4068223237991333, "learning_rate": 0.0003, "loss": 5.954, "step": 1255 }, { "epoch": 0.02580222390596523, "grad_norm": 0.4254078269004822, "learning_rate": 0.0003, "loss": 5.95, "step": 1260 }, { "epoch": 0.025904613683369854, "grad_norm": 0.4287364184856415, "learning_rate": 0.0003, "loss": 5.91, "step": 1265 }, { "epoch": 0.026007003460774475, "grad_norm": 0.561913251876831, "learning_rate": 0.0003, "loss": 5.9931, "step": 1270 }, { "epoch": 0.0261093932381791, "grad_norm": 0.44675496220588684, "learning_rate": 0.0003, "loss": 5.9254, "step": 1275 }, { "epoch": 0.026211783015583724, "grad_norm": 0.3729199171066284, "learning_rate": 0.0003, "loss": 5.9828, "step": 1280 }, { "epoch": 0.02631417279298835, "grad_norm": 0.3478130102157593, "learning_rate": 0.0003, "loss": 5.917, "step": 1285 }, { "epoch": 0.026416562570392974, "grad_norm": 0.38890165090560913, "learning_rate": 0.0003, "loss": 5.933, "step": 1290 }, { "epoch": 0.026518952347797595, "grad_norm": 0.41554898023605347, "learning_rate": 0.0003, "loss": 5.9307, "step": 1295 }, { "epoch": 0.02662134212520222, "grad_norm": 0.42908337712287903, "learning_rate": 0.0003, "loss": 5.9184, "step": 1300 }, { "epoch": 0.026723731902606844, "grad_norm": 0.38750404119491577, "learning_rate": 0.0003, "loss": 5.9222, "step": 1305 }, { "epoch": 0.02682612168001147, "grad_norm": 0.4352513253688812, "learning_rate": 0.0003, "loss": 5.8937, "step": 1310 }, { "epoch": 0.026928511457416093, "grad_norm": 0.3794362545013428, "learning_rate": 0.0003, "loss": 5.8896, "step": 1315 }, { "epoch": 0.027030901234820714, "grad_norm": 0.4965048134326935, "learning_rate": 0.0003, "loss": 5.9118, "step": 1320 }, { "epoch": 0.02713329101222534, "grad_norm": 0.4754084050655365, "learning_rate": 0.0003, "loss": 5.9377, "step": 1325 }, { "epoch": 0.027235680789629964, "grad_norm": 0.48557162284851074, "learning_rate": 0.0003, "loss": 5.8814, "step": 1330 }, { "epoch": 0.02733807056703459, "grad_norm": 0.4073465168476105, "learning_rate": 0.0003, "loss": 5.878, "step": 1335 }, { "epoch": 0.02744046034443921, "grad_norm": 0.40075191855430603, "learning_rate": 0.0003, "loss": 5.8505, "step": 1340 }, { "epoch": 0.027542850121843834, "grad_norm": 0.35601678490638733, "learning_rate": 0.0003, "loss": 5.9291, "step": 1345 }, { "epoch": 0.02764523989924846, "grad_norm": 0.5753795504570007, "learning_rate": 0.0003, "loss": 5.8586, "step": 1350 }, { "epoch": 0.027747629676653084, "grad_norm": 0.3592877984046936, "learning_rate": 0.0003, "loss": 5.8507, "step": 1355 }, { "epoch": 0.027850019454057708, "grad_norm": 0.3982386291027069, "learning_rate": 0.0003, "loss": 5.8679, "step": 1360 }, { "epoch": 0.02795240923146233, "grad_norm": 0.4653341770172119, "learning_rate": 0.0003, "loss": 5.8429, "step": 1365 }, { "epoch": 0.028054799008866954, "grad_norm": 0.42373934388160706, "learning_rate": 0.0003, "loss": 5.8409, "step": 1370 }, { "epoch": 0.02815718878627158, "grad_norm": 0.4872317910194397, "learning_rate": 0.0003, "loss": 5.8394, "step": 1375 }, { "epoch": 0.028259578563676203, "grad_norm": 0.4176357388496399, "learning_rate": 0.0003, "loss": 5.869, "step": 1380 }, { "epoch": 0.028361968341080828, "grad_norm": 0.4286108911037445, "learning_rate": 0.0003, "loss": 5.8369, "step": 1385 }, { "epoch": 0.02846435811848545, "grad_norm": 0.39009803533554077, "learning_rate": 0.0003, "loss": 5.8566, "step": 1390 }, { "epoch": 0.028566747895890074, "grad_norm": 0.3626231253147125, "learning_rate": 0.0003, "loss": 5.8769, "step": 1395 }, { "epoch": 0.0286691376732947, "grad_norm": 0.39475512504577637, "learning_rate": 0.0003, "loss": 5.8343, "step": 1400 }, { "epoch": 0.028771527450699323, "grad_norm": 0.35049155354499817, "learning_rate": 0.0003, "loss": 5.8078, "step": 1405 }, { "epoch": 0.028873917228103944, "grad_norm": 0.4400930404663086, "learning_rate": 0.0003, "loss": 5.8022, "step": 1410 }, { "epoch": 0.02897630700550857, "grad_norm": 0.4743267595767975, "learning_rate": 0.0003, "loss": 5.8385, "step": 1415 }, { "epoch": 0.029078696782913194, "grad_norm": 0.3859167695045471, "learning_rate": 0.0003, "loss": 5.8517, "step": 1420 }, { "epoch": 0.02918108656031782, "grad_norm": 0.37883540987968445, "learning_rate": 0.0003, "loss": 5.8716, "step": 1425 }, { "epoch": 0.029283476337722443, "grad_norm": 0.383993923664093, "learning_rate": 0.0003, "loss": 5.7866, "step": 1430 }, { "epoch": 0.029385866115127064, "grad_norm": 0.42862668633461, "learning_rate": 0.0003, "loss": 5.8276, "step": 1435 }, { "epoch": 0.02948825589253169, "grad_norm": 0.36276373267173767, "learning_rate": 0.0003, "loss": 5.8164, "step": 1440 }, { "epoch": 0.029590645669936314, "grad_norm": 0.5727818012237549, "learning_rate": 0.0003, "loss": 5.8051, "step": 1445 }, { "epoch": 0.029693035447340938, "grad_norm": 0.35043594241142273, "learning_rate": 0.0003, "loss": 5.8202, "step": 1450 }, { "epoch": 0.029795425224745563, "grad_norm": 0.44809040427207947, "learning_rate": 0.0003, "loss": 5.8222, "step": 1455 }, { "epoch": 0.029897815002150184, "grad_norm": 0.4120275378227234, "learning_rate": 0.0003, "loss": 5.794, "step": 1460 }, { "epoch": 0.03000020477955481, "grad_norm": 0.3673813045024872, "learning_rate": 0.0003, "loss": 5.8181, "step": 1465 }, { "epoch": 0.030102594556959433, "grad_norm": 0.3413775563240051, "learning_rate": 0.0003, "loss": 5.7701, "step": 1470 }, { "epoch": 0.030204984334364058, "grad_norm": 0.35870635509490967, "learning_rate": 0.0003, "loss": 5.7869, "step": 1475 }, { "epoch": 0.030307374111768683, "grad_norm": 0.3878447711467743, "learning_rate": 0.0003, "loss": 5.7742, "step": 1480 }, { "epoch": 0.030409763889173304, "grad_norm": 0.4131092429161072, "learning_rate": 0.0003, "loss": 5.7552, "step": 1485 }, { "epoch": 0.03051215366657793, "grad_norm": 0.34513989090919495, "learning_rate": 0.0003, "loss": 5.7792, "step": 1490 }, { "epoch": 0.030614543443982553, "grad_norm": 0.38045361638069153, "learning_rate": 0.0003, "loss": 5.7677, "step": 1495 }, { "epoch": 0.030716933221387178, "grad_norm": 0.4234094023704529, "learning_rate": 0.0003, "loss": 5.7968, "step": 1500 }, { "epoch": 0.0308193229987918, "grad_norm": 0.3792508840560913, "learning_rate": 0.0003, "loss": 5.7779, "step": 1505 }, { "epoch": 0.030921712776196424, "grad_norm": 0.3751790225505829, "learning_rate": 0.0003, "loss": 5.8122, "step": 1510 }, { "epoch": 0.03102410255360105, "grad_norm": 0.38870561122894287, "learning_rate": 0.0003, "loss": 5.7776, "step": 1515 }, { "epoch": 0.031126492331005673, "grad_norm": 0.3328951299190521, "learning_rate": 0.0003, "loss": 5.7336, "step": 1520 }, { "epoch": 0.031228882108410298, "grad_norm": 0.8058025240898132, "learning_rate": 0.0003, "loss": 5.6826, "step": 1525 }, { "epoch": 0.03133127188581492, "grad_norm": 0.37183621525764465, "learning_rate": 0.0003, "loss": 5.7525, "step": 1530 }, { "epoch": 0.03143366166321954, "grad_norm": 0.33894869685173035, "learning_rate": 0.0003, "loss": 5.7739, "step": 1535 }, { "epoch": 0.03153605144062417, "grad_norm": 0.3959846794605255, "learning_rate": 0.0003, "loss": 5.7527, "step": 1540 }, { "epoch": 0.03163844121802879, "grad_norm": 0.41048547625541687, "learning_rate": 0.0003, "loss": 5.7598, "step": 1545 }, { "epoch": 0.031740830995433414, "grad_norm": 0.37840402126312256, "learning_rate": 0.0003, "loss": 5.7207, "step": 1550 }, { "epoch": 0.03184322077283804, "grad_norm": 0.37814024090766907, "learning_rate": 0.0003, "loss": 5.7232, "step": 1555 }, { "epoch": 0.03194561055024266, "grad_norm": 0.3821817636489868, "learning_rate": 0.0003, "loss": 5.7615, "step": 1560 }, { "epoch": 0.032048000327647284, "grad_norm": 0.42602258920669556, "learning_rate": 0.0003, "loss": 5.7825, "step": 1565 }, { "epoch": 0.03215039010505191, "grad_norm": 0.4071274995803833, "learning_rate": 0.0003, "loss": 5.7122, "step": 1570 }, { "epoch": 0.032252779882456534, "grad_norm": 0.3515342175960541, "learning_rate": 0.0003, "loss": 5.7415, "step": 1575 }, { "epoch": 0.03235516965986116, "grad_norm": 0.40821370482444763, "learning_rate": 0.0003, "loss": 5.6702, "step": 1580 }, { "epoch": 0.03245755943726578, "grad_norm": 0.4025024175643921, "learning_rate": 0.0003, "loss": 5.7376, "step": 1585 }, { "epoch": 0.032559949214670404, "grad_norm": 0.3244132995605469, "learning_rate": 0.0003, "loss": 5.7276, "step": 1590 }, { "epoch": 0.03266233899207503, "grad_norm": 0.3303980231285095, "learning_rate": 0.0003, "loss": 5.7456, "step": 1595 }, { "epoch": 0.032764728769479654, "grad_norm": 0.38295868039131165, "learning_rate": 0.0003, "loss": 5.7252, "step": 1600 }, { "epoch": 0.03286711854688428, "grad_norm": 0.3676038086414337, "learning_rate": 0.0003, "loss": 5.7262, "step": 1605 }, { "epoch": 0.0329695083242889, "grad_norm": 0.3270364999771118, "learning_rate": 0.0003, "loss": 5.6958, "step": 1610 }, { "epoch": 0.033071898101693524, "grad_norm": 0.38710907101631165, "learning_rate": 0.0003, "loss": 5.6863, "step": 1615 }, { "epoch": 0.03317428787909815, "grad_norm": 0.48394158482551575, "learning_rate": 0.0003, "loss": 5.7162, "step": 1620 }, { "epoch": 0.03327667765650277, "grad_norm": 0.3814449906349182, "learning_rate": 0.0003, "loss": 5.7125, "step": 1625 }, { "epoch": 0.0333790674339074, "grad_norm": 0.3561025857925415, "learning_rate": 0.0003, "loss": 5.7087, "step": 1630 }, { "epoch": 0.03348145721131202, "grad_norm": 0.3981541693210602, "learning_rate": 0.0003, "loss": 5.6742, "step": 1635 }, { "epoch": 0.033583846988716644, "grad_norm": 0.36963599920272827, "learning_rate": 0.0003, "loss": 5.6722, "step": 1640 }, { "epoch": 0.03368623676612127, "grad_norm": 0.3874456286430359, "learning_rate": 0.0003, "loss": 5.6887, "step": 1645 }, { "epoch": 0.03378862654352589, "grad_norm": 0.31456896662712097, "learning_rate": 0.0003, "loss": 5.6685, "step": 1650 }, { "epoch": 0.03389101632093052, "grad_norm": 0.3377586901187897, "learning_rate": 0.0003, "loss": 5.6969, "step": 1655 }, { "epoch": 0.03399340609833514, "grad_norm": 0.3572098910808563, "learning_rate": 0.0003, "loss": 5.676, "step": 1660 }, { "epoch": 0.034095795875739764, "grad_norm": 0.3694843351840973, "learning_rate": 0.0003, "loss": 5.6547, "step": 1665 }, { "epoch": 0.03419818565314439, "grad_norm": 0.4291844666004181, "learning_rate": 0.0003, "loss": 5.7127, "step": 1670 }, { "epoch": 0.03430057543054901, "grad_norm": 0.36142176389694214, "learning_rate": 0.0003, "loss": 5.7063, "step": 1675 }, { "epoch": 0.03440296520795364, "grad_norm": 0.3345441520214081, "learning_rate": 0.0003, "loss": 5.7012, "step": 1680 }, { "epoch": 0.03450535498535826, "grad_norm": 0.33553993701934814, "learning_rate": 0.0003, "loss": 5.6681, "step": 1685 }, { "epoch": 0.03460774476276288, "grad_norm": 0.3819684386253357, "learning_rate": 0.0003, "loss": 5.6353, "step": 1690 }, { "epoch": 0.03471013454016751, "grad_norm": 0.35187289118766785, "learning_rate": 0.0003, "loss": 5.6603, "step": 1695 }, { "epoch": 0.03481252431757213, "grad_norm": 0.3862445652484894, "learning_rate": 0.0003, "loss": 5.7001, "step": 1700 }, { "epoch": 0.03491491409497676, "grad_norm": 0.3471886217594147, "learning_rate": 0.0003, "loss": 5.6303, "step": 1705 }, { "epoch": 0.03501730387238138, "grad_norm": 0.330762654542923, "learning_rate": 0.0003, "loss": 5.7048, "step": 1710 }, { "epoch": 0.035119693649786, "grad_norm": 0.3590639531612396, "learning_rate": 0.0003, "loss": 5.6629, "step": 1715 }, { "epoch": 0.03522208342719063, "grad_norm": 0.4507967531681061, "learning_rate": 0.0003, "loss": 5.6756, "step": 1720 }, { "epoch": 0.03532447320459525, "grad_norm": 0.3698206841945648, "learning_rate": 0.0003, "loss": 5.727, "step": 1725 }, { "epoch": 0.035426862981999874, "grad_norm": 0.35770583152770996, "learning_rate": 0.0003, "loss": 5.5815, "step": 1730 }, { "epoch": 0.0355292527594045, "grad_norm": 0.33955705165863037, "learning_rate": 0.0003, "loss": 5.6646, "step": 1735 }, { "epoch": 0.03563164253680912, "grad_norm": 0.3695562481880188, "learning_rate": 0.0003, "loss": 5.6772, "step": 1740 }, { "epoch": 0.03573403231421375, "grad_norm": 0.3273179829120636, "learning_rate": 0.0003, "loss": 5.6608, "step": 1745 }, { "epoch": 0.03583642209161837, "grad_norm": 0.34312453866004944, "learning_rate": 0.0003, "loss": 5.6768, "step": 1750 }, { "epoch": 0.035938811869022994, "grad_norm": 0.34229740500450134, "learning_rate": 0.0003, "loss": 5.6108, "step": 1755 }, { "epoch": 0.03604120164642762, "grad_norm": 0.3227626383304596, "learning_rate": 0.0003, "loss": 5.6101, "step": 1760 }, { "epoch": 0.03614359142383224, "grad_norm": 0.3419533669948578, "learning_rate": 0.0003, "loss": 5.6117, "step": 1765 }, { "epoch": 0.03624598120123687, "grad_norm": 0.40896594524383545, "learning_rate": 0.0003, "loss": 5.6037, "step": 1770 }, { "epoch": 0.03634837097864149, "grad_norm": 0.32976123690605164, "learning_rate": 0.0003, "loss": 5.6104, "step": 1775 }, { "epoch": 0.03645076075604611, "grad_norm": 0.31900909543037415, "learning_rate": 0.0003, "loss": 5.6692, "step": 1780 }, { "epoch": 0.03655315053345074, "grad_norm": 0.30529239773750305, "learning_rate": 0.0003, "loss": 5.5969, "step": 1785 }, { "epoch": 0.03665554031085536, "grad_norm": 0.32662200927734375, "learning_rate": 0.0003, "loss": 5.6239, "step": 1790 }, { "epoch": 0.03675793008825999, "grad_norm": 0.35408344864845276, "learning_rate": 0.0003, "loss": 5.6779, "step": 1795 }, { "epoch": 0.03686031986566461, "grad_norm": 0.3842693269252777, "learning_rate": 0.0003, "loss": 5.6231, "step": 1800 }, { "epoch": 0.03696270964306923, "grad_norm": 0.33722659945487976, "learning_rate": 0.0003, "loss": 5.6586, "step": 1805 }, { "epoch": 0.03706509942047386, "grad_norm": 0.3564646542072296, "learning_rate": 0.0003, "loss": 5.5967, "step": 1810 }, { "epoch": 0.03716748919787848, "grad_norm": 0.32668283581733704, "learning_rate": 0.0003, "loss": 5.5891, "step": 1815 }, { "epoch": 0.03726987897528311, "grad_norm": 0.3343525230884552, "learning_rate": 0.0003, "loss": 5.6138, "step": 1820 }, { "epoch": 0.03737226875268773, "grad_norm": 0.4201762080192566, "learning_rate": 0.0003, "loss": 5.598, "step": 1825 }, { "epoch": 0.03747465853009235, "grad_norm": 0.5659616589546204, "learning_rate": 0.0003, "loss": 5.7533, "step": 1830 }, { "epoch": 0.03757704830749698, "grad_norm": 0.38928595185279846, "learning_rate": 0.0003, "loss": 5.6555, "step": 1835 }, { "epoch": 0.0376794380849016, "grad_norm": 0.45403486490249634, "learning_rate": 0.0003, "loss": 5.7067, "step": 1840 }, { "epoch": 0.03778182786230623, "grad_norm": 0.3449617624282837, "learning_rate": 0.0003, "loss": 5.6221, "step": 1845 }, { "epoch": 0.03788421763971085, "grad_norm": 0.353818416595459, "learning_rate": 0.0003, "loss": 5.614, "step": 1850 }, { "epoch": 0.03798660741711547, "grad_norm": 0.35148918628692627, "learning_rate": 0.0003, "loss": 5.6021, "step": 1855 }, { "epoch": 0.0380889971945201, "grad_norm": 0.2895936369895935, "learning_rate": 0.0003, "loss": 5.6375, "step": 1860 }, { "epoch": 0.03819138697192472, "grad_norm": 0.31613689661026, "learning_rate": 0.0003, "loss": 5.5729, "step": 1865 }, { "epoch": 0.03829377674932935, "grad_norm": 0.7037827372550964, "learning_rate": 0.0003, "loss": 5.5409, "step": 1870 }, { "epoch": 0.03839616652673397, "grad_norm": 0.40238696336746216, "learning_rate": 0.0003, "loss": 5.5955, "step": 1875 }, { "epoch": 0.03849855630413859, "grad_norm": 0.30799344182014465, "learning_rate": 0.0003, "loss": 5.6085, "step": 1880 }, { "epoch": 0.03860094608154322, "grad_norm": 0.3469482660293579, "learning_rate": 0.0003, "loss": 5.608, "step": 1885 }, { "epoch": 0.03870333585894784, "grad_norm": 0.3633992075920105, "learning_rate": 0.0003, "loss": 5.6317, "step": 1890 }, { "epoch": 0.03880572563635247, "grad_norm": 0.3335460126399994, "learning_rate": 0.0003, "loss": 5.577, "step": 1895 }, { "epoch": 0.03890811541375709, "grad_norm": 0.3882322907447815, "learning_rate": 0.0003, "loss": 5.6301, "step": 1900 }, { "epoch": 0.03901050519116171, "grad_norm": 0.33104488253593445, "learning_rate": 0.0003, "loss": 5.5612, "step": 1905 }, { "epoch": 0.03911289496856634, "grad_norm": 0.32825130224227905, "learning_rate": 0.0003, "loss": 5.5834, "step": 1910 }, { "epoch": 0.03921528474597096, "grad_norm": 0.3569926619529724, "learning_rate": 0.0003, "loss": 5.5913, "step": 1915 }, { "epoch": 0.03931767452337558, "grad_norm": 0.3393506109714508, "learning_rate": 0.0003, "loss": 5.6085, "step": 1920 }, { "epoch": 0.03942006430078021, "grad_norm": 0.3391747772693634, "learning_rate": 0.0003, "loss": 5.5552, "step": 1925 }, { "epoch": 0.03952245407818483, "grad_norm": 0.374737024307251, "learning_rate": 0.0003, "loss": 5.5973, "step": 1930 }, { "epoch": 0.03962484385558946, "grad_norm": 0.32326409220695496, "learning_rate": 0.0003, "loss": 5.5686, "step": 1935 }, { "epoch": 0.03972723363299408, "grad_norm": 0.33155369758605957, "learning_rate": 0.0003, "loss": 5.586, "step": 1940 }, { "epoch": 0.0398296234103987, "grad_norm": 0.30039525032043457, "learning_rate": 0.0003, "loss": 5.5708, "step": 1945 }, { "epoch": 0.03993201318780333, "grad_norm": 0.31259462237358093, "learning_rate": 0.0003, "loss": 5.5693, "step": 1950 }, { "epoch": 0.04003440296520795, "grad_norm": 0.32965245842933655, "learning_rate": 0.0003, "loss": 5.5784, "step": 1955 }, { "epoch": 0.04013679274261258, "grad_norm": 0.36182472109794617, "learning_rate": 0.0003, "loss": 5.5073, "step": 1960 }, { "epoch": 0.0402391825200172, "grad_norm": 0.3083086311817169, "learning_rate": 0.0003, "loss": 5.5432, "step": 1965 }, { "epoch": 0.04034157229742182, "grad_norm": 0.3532242178916931, "learning_rate": 0.0003, "loss": 5.5841, "step": 1970 }, { "epoch": 0.04044396207482645, "grad_norm": 0.3572655916213989, "learning_rate": 0.0003, "loss": 5.5665, "step": 1975 }, { "epoch": 0.04054635185223107, "grad_norm": 0.3412257432937622, "learning_rate": 0.0003, "loss": 5.5591, "step": 1980 }, { "epoch": 0.0406487416296357, "grad_norm": 0.3201288878917694, "learning_rate": 0.0003, "loss": 5.606, "step": 1985 }, { "epoch": 0.04075113140704032, "grad_norm": 0.38728052377700806, "learning_rate": 0.0003, "loss": 5.5509, "step": 1990 }, { "epoch": 0.04085352118444494, "grad_norm": 0.38100388646125793, "learning_rate": 0.0003, "loss": 5.6295, "step": 1995 }, { "epoch": 0.04095591096184957, "grad_norm": 0.34643521904945374, "learning_rate": 0.0003, "loss": 5.5531, "step": 2000 }, { "epoch": 0.04095591096184957, "eval_loss": 5.583315849304199, "eval_runtime": 17.2581, "eval_samples_per_second": 33.723, "eval_steps_per_second": 4.23, "step": 2000 }, { "epoch": 0.04105830073925419, "grad_norm": 0.3605150282382965, "learning_rate": 0.0003, "loss": 5.5599, "step": 2005 }, { "epoch": 0.04116069051665882, "grad_norm": 0.3214218318462372, "learning_rate": 0.0003, "loss": 5.5758, "step": 2010 }, { "epoch": 0.04126308029406344, "grad_norm": 0.33031126856803894, "learning_rate": 0.0003, "loss": 5.5487, "step": 2015 }, { "epoch": 0.04136547007146806, "grad_norm": 0.3428272604942322, "learning_rate": 0.0003, "loss": 5.5345, "step": 2020 }, { "epoch": 0.04146785984887269, "grad_norm": 0.32269689440727234, "learning_rate": 0.0003, "loss": 5.5702, "step": 2025 }, { "epoch": 0.04157024962627731, "grad_norm": 0.38060262799263, "learning_rate": 0.0003, "loss": 5.5379, "step": 2030 }, { "epoch": 0.04167263940368194, "grad_norm": 0.32364749908447266, "learning_rate": 0.0003, "loss": 5.5284, "step": 2035 }, { "epoch": 0.04177502918108656, "grad_norm": 0.3137708008289337, "learning_rate": 0.0003, "loss": 5.5021, "step": 2040 }, { "epoch": 0.04187741895849118, "grad_norm": 0.3555919826030731, "learning_rate": 0.0003, "loss": 5.5261, "step": 2045 }, { "epoch": 0.04197980873589581, "grad_norm": 0.3532278537750244, "learning_rate": 0.0003, "loss": 5.5472, "step": 2050 }, { "epoch": 0.04208219851330043, "grad_norm": 0.331758052110672, "learning_rate": 0.0003, "loss": 5.4677, "step": 2055 }, { "epoch": 0.04218458829070506, "grad_norm": 0.49990275502204895, "learning_rate": 0.0003, "loss": 5.5588, "step": 2060 }, { "epoch": 0.04228697806810968, "grad_norm": 0.30251890420913696, "learning_rate": 0.0003, "loss": 5.5241, "step": 2065 }, { "epoch": 0.0423893678455143, "grad_norm": 0.36536428332328796, "learning_rate": 0.0003, "loss": 5.5187, "step": 2070 }, { "epoch": 0.04249175762291893, "grad_norm": 0.29050710797309875, "learning_rate": 0.0003, "loss": 5.5183, "step": 2075 }, { "epoch": 0.04259414740032355, "grad_norm": 4.046236991882324, "learning_rate": 0.0003, "loss": 5.5574, "step": 2080 }, { "epoch": 0.04269653717772818, "grad_norm": 0.39397743344306946, "learning_rate": 0.0003, "loss": 5.576, "step": 2085 }, { "epoch": 0.0427989269551328, "grad_norm": 0.4010278880596161, "learning_rate": 0.0003, "loss": 5.513, "step": 2090 }, { "epoch": 0.04290131673253742, "grad_norm": 0.368369460105896, "learning_rate": 0.0003, "loss": 5.4895, "step": 2095 }, { "epoch": 0.04300370650994205, "grad_norm": 0.3162172734737396, "learning_rate": 0.0003, "loss": 5.6343, "step": 2100 }, { "epoch": 0.04310609628734667, "grad_norm": 0.32602453231811523, "learning_rate": 0.0003, "loss": 5.5658, "step": 2105 }, { "epoch": 0.04320848606475129, "grad_norm": 0.29531750082969666, "learning_rate": 0.0003, "loss": 5.5271, "step": 2110 }, { "epoch": 0.04331087584215592, "grad_norm": 0.31045177578926086, "learning_rate": 0.0003, "loss": 5.5363, "step": 2115 }, { "epoch": 0.04341326561956054, "grad_norm": 0.29968997836112976, "learning_rate": 0.0003, "loss": 5.4967, "step": 2120 }, { "epoch": 0.04351565539696517, "grad_norm": 0.32277289032936096, "learning_rate": 0.0003, "loss": 5.3919, "step": 2125 }, { "epoch": 0.04361804517436979, "grad_norm": 0.3237505555152893, "learning_rate": 0.0003, "loss": 5.5099, "step": 2130 }, { "epoch": 0.04372043495177441, "grad_norm": 0.3156304657459259, "learning_rate": 0.0003, "loss": 5.4859, "step": 2135 }, { "epoch": 0.04382282472917904, "grad_norm": 0.2915082573890686, "learning_rate": 0.0003, "loss": 5.5746, "step": 2140 }, { "epoch": 0.04392521450658366, "grad_norm": 0.3208426237106323, "learning_rate": 0.0003, "loss": 5.4987, "step": 2145 }, { "epoch": 0.04402760428398829, "grad_norm": 0.2956843674182892, "learning_rate": 0.0003, "loss": 5.5098, "step": 2150 }, { "epoch": 0.04412999406139291, "grad_norm": 0.294599711894989, "learning_rate": 0.0003, "loss": 5.5153, "step": 2155 }, { "epoch": 0.04423238383879753, "grad_norm": 0.3562420904636383, "learning_rate": 0.0003, "loss": 5.4741, "step": 2160 }, { "epoch": 0.04433477361620216, "grad_norm": 0.3201698362827301, "learning_rate": 0.0003, "loss": 5.532, "step": 2165 }, { "epoch": 0.04443716339360678, "grad_norm": 0.304730087518692, "learning_rate": 0.0003, "loss": 5.5303, "step": 2170 }, { "epoch": 0.04453955317101141, "grad_norm": 0.332582026720047, "learning_rate": 0.0003, "loss": 5.4733, "step": 2175 }, { "epoch": 0.04464194294841603, "grad_norm": 0.30200523138046265, "learning_rate": 0.0003, "loss": 5.4378, "step": 2180 }, { "epoch": 0.04474433272582065, "grad_norm": 0.3588789999485016, "learning_rate": 0.0003, "loss": 5.4876, "step": 2185 }, { "epoch": 0.04484672250322528, "grad_norm": 0.3039074242115021, "learning_rate": 0.0003, "loss": 5.4944, "step": 2190 }, { "epoch": 0.0449491122806299, "grad_norm": 0.48547083139419556, "learning_rate": 0.0003, "loss": 5.5256, "step": 2195 }, { "epoch": 0.04505150205803453, "grad_norm": 0.32456061244010925, "learning_rate": 0.0003, "loss": 5.4931, "step": 2200 }, { "epoch": 0.04515389183543915, "grad_norm": 0.36695829033851624, "learning_rate": 0.0003, "loss": 5.4575, "step": 2205 }, { "epoch": 0.04525628161284377, "grad_norm": 0.365884929895401, "learning_rate": 0.0003, "loss": 5.4911, "step": 2210 }, { "epoch": 0.0453586713902484, "grad_norm": 0.3172907829284668, "learning_rate": 0.0003, "loss": 5.5056, "step": 2215 }, { "epoch": 0.04546106116765302, "grad_norm": 0.31234583258628845, "learning_rate": 0.0003, "loss": 5.477, "step": 2220 }, { "epoch": 0.04556345094505765, "grad_norm": 0.3623846769332886, "learning_rate": 0.0003, "loss": 5.4839, "step": 2225 }, { "epoch": 0.04566584072246227, "grad_norm": 0.29917284846305847, "learning_rate": 0.0003, "loss": 5.533, "step": 2230 }, { "epoch": 0.04576823049986689, "grad_norm": 0.30479612946510315, "learning_rate": 0.0003, "loss": 5.467, "step": 2235 }, { "epoch": 0.04587062027727152, "grad_norm": 0.29524606466293335, "learning_rate": 0.0003, "loss": 5.4925, "step": 2240 }, { "epoch": 0.04597301005467614, "grad_norm": 0.37530916929244995, "learning_rate": 0.0003, "loss": 5.5117, "step": 2245 }, { "epoch": 0.04607539983208077, "grad_norm": 0.33626043796539307, "learning_rate": 0.0003, "loss": 5.4686, "step": 2250 }, { "epoch": 0.04617778960948539, "grad_norm": 0.30177098512649536, "learning_rate": 0.0003, "loss": 5.462, "step": 2255 }, { "epoch": 0.04628017938689001, "grad_norm": 0.30366501212120056, "learning_rate": 0.0003, "loss": 5.4903, "step": 2260 }, { "epoch": 0.04638256916429464, "grad_norm": 0.281887412071228, "learning_rate": 0.0003, "loss": 5.4703, "step": 2265 }, { "epoch": 0.04648495894169926, "grad_norm": 0.2937527298927307, "learning_rate": 0.0003, "loss": 5.4657, "step": 2270 }, { "epoch": 0.04658734871910388, "grad_norm": 0.34095871448516846, "learning_rate": 0.0003, "loss": 5.43, "step": 2275 }, { "epoch": 0.04668973849650851, "grad_norm": 0.29006507992744446, "learning_rate": 0.0003, "loss": 5.4479, "step": 2280 }, { "epoch": 0.04679212827391313, "grad_norm": 0.29680919647216797, "learning_rate": 0.0003, "loss": 5.4708, "step": 2285 }, { "epoch": 0.04689451805131776, "grad_norm": 0.3066211938858032, "learning_rate": 0.0003, "loss": 5.4698, "step": 2290 }, { "epoch": 0.04699690782872238, "grad_norm": 0.3496037423610687, "learning_rate": 0.0003, "loss": 5.4626, "step": 2295 }, { "epoch": 0.047099297606127, "grad_norm": 0.30650416016578674, "learning_rate": 0.0003, "loss": 5.503, "step": 2300 }, { "epoch": 0.04720168738353163, "grad_norm": 0.3337506353855133, "learning_rate": 0.0003, "loss": 5.4569, "step": 2305 }, { "epoch": 0.04730407716093625, "grad_norm": 0.3257908523082733, "learning_rate": 0.0003, "loss": 5.4877, "step": 2310 }, { "epoch": 0.04740646693834088, "grad_norm": 0.32778066396713257, "learning_rate": 0.0003, "loss": 5.4402, "step": 2315 }, { "epoch": 0.0475088567157455, "grad_norm": 0.3712349534034729, "learning_rate": 0.0003, "loss": 5.4845, "step": 2320 }, { "epoch": 0.04761124649315012, "grad_norm": 0.29005157947540283, "learning_rate": 0.0003, "loss": 5.4534, "step": 2325 }, { "epoch": 0.04771363627055475, "grad_norm": 0.2965439558029175, "learning_rate": 0.0003, "loss": 5.4702, "step": 2330 }, { "epoch": 0.04781602604795937, "grad_norm": 0.3257777988910675, "learning_rate": 0.0003, "loss": 5.4405, "step": 2335 }, { "epoch": 0.047918415825364, "grad_norm": 0.30828627943992615, "learning_rate": 0.0003, "loss": 5.5202, "step": 2340 }, { "epoch": 0.04802080560276862, "grad_norm": 0.3007051944732666, "learning_rate": 0.0003, "loss": 5.4896, "step": 2345 }, { "epoch": 0.04812319538017324, "grad_norm": 0.31143495440483093, "learning_rate": 0.0003, "loss": 5.5087, "step": 2350 }, { "epoch": 0.04822558515757787, "grad_norm": 0.3348305821418762, "learning_rate": 0.0003, "loss": 5.4619, "step": 2355 }, { "epoch": 0.04832797493498249, "grad_norm": 0.3939598798751831, "learning_rate": 0.0003, "loss": 5.4527, "step": 2360 }, { "epoch": 0.04843036471238712, "grad_norm": 0.418344110250473, "learning_rate": 0.0003, "loss": 5.457, "step": 2365 }, { "epoch": 0.04853275448979174, "grad_norm": 0.3089907765388489, "learning_rate": 0.0003, "loss": 5.4268, "step": 2370 }, { "epoch": 0.04863514426719636, "grad_norm": 0.30724790692329407, "learning_rate": 0.0003, "loss": 5.497, "step": 2375 }, { "epoch": 0.04873753404460099, "grad_norm": 0.2833951711654663, "learning_rate": 0.0003, "loss": 5.436, "step": 2380 }, { "epoch": 0.04883992382200561, "grad_norm": 0.26925790309906006, "learning_rate": 0.0003, "loss": 5.4077, "step": 2385 }, { "epoch": 0.04894231359941024, "grad_norm": 0.2891678810119629, "learning_rate": 0.0003, "loss": 5.4053, "step": 2390 }, { "epoch": 0.04904470337681486, "grad_norm": 0.37123313546180725, "learning_rate": 0.0003, "loss": 5.4378, "step": 2395 }, { "epoch": 0.04914709315421948, "grad_norm": 0.3456077575683594, "learning_rate": 0.0003, "loss": 5.4581, "step": 2400 }, { "epoch": 0.04924948293162411, "grad_norm": 0.3430868983268738, "learning_rate": 0.0003, "loss": 5.4521, "step": 2405 }, { "epoch": 0.04935187270902873, "grad_norm": 0.3325310945510864, "learning_rate": 0.0003, "loss": 5.4949, "step": 2410 }, { "epoch": 0.04945426248643336, "grad_norm": 0.6735697388648987, "learning_rate": 0.0003, "loss": 5.4431, "step": 2415 }, { "epoch": 0.04955665226383798, "grad_norm": 0.32990604639053345, "learning_rate": 0.0003, "loss": 5.4302, "step": 2420 }, { "epoch": 0.0496590420412426, "grad_norm": 0.8016563653945923, "learning_rate": 0.0003, "loss": 5.4676, "step": 2425 }, { "epoch": 0.04976143181864723, "grad_norm": 0.32512250542640686, "learning_rate": 0.0003, "loss": 5.4798, "step": 2430 }, { "epoch": 0.04986382159605185, "grad_norm": 0.31798192858695984, "learning_rate": 0.0003, "loss": 5.4696, "step": 2435 }, { "epoch": 0.04996621137345648, "grad_norm": 0.3064839541912079, "learning_rate": 0.0003, "loss": 5.4655, "step": 2440 }, { "epoch": 0.0500686011508611, "grad_norm": 0.3080918490886688, "learning_rate": 0.0003, "loss": 5.4392, "step": 2445 }, { "epoch": 0.05017099092826572, "grad_norm": 0.27605608105659485, "learning_rate": 0.0003, "loss": 5.4385, "step": 2450 }, { "epoch": 0.05027338070567035, "grad_norm": 0.2982933521270752, "learning_rate": 0.0003, "loss": 5.4325, "step": 2455 }, { "epoch": 0.05037577048307497, "grad_norm": 0.3271021544933319, "learning_rate": 0.0003, "loss": 5.4436, "step": 2460 }, { "epoch": 0.05047816026047959, "grad_norm": 0.3043646216392517, "learning_rate": 0.0003, "loss": 5.4242, "step": 2465 }, { "epoch": 0.05058055003788422, "grad_norm": 0.3109362721443176, "learning_rate": 0.0003, "loss": 5.4851, "step": 2470 }, { "epoch": 0.05068293981528884, "grad_norm": 0.2791377604007721, "learning_rate": 0.0003, "loss": 5.4403, "step": 2475 }, { "epoch": 0.05078532959269347, "grad_norm": 0.3052929639816284, "learning_rate": 0.0003, "loss": 5.4143, "step": 2480 }, { "epoch": 0.05088771937009809, "grad_norm": 0.2574477195739746, "learning_rate": 0.0003, "loss": 5.4014, "step": 2485 }, { "epoch": 0.05099010914750271, "grad_norm": 0.2741445302963257, "learning_rate": 0.0003, "loss": 5.4129, "step": 2490 }, { "epoch": 0.05109249892490734, "grad_norm": 0.32297956943511963, "learning_rate": 0.0003, "loss": 5.4256, "step": 2495 }, { "epoch": 0.05119488870231196, "grad_norm": 0.32383501529693604, "learning_rate": 0.0003, "loss": 5.4153, "step": 2500 }, { "epoch": 0.05129727847971659, "grad_norm": 0.33168911933898926, "learning_rate": 0.0003, "loss": 5.4351, "step": 2505 }, { "epoch": 0.05139966825712121, "grad_norm": 0.2909999191761017, "learning_rate": 0.0003, "loss": 5.4195, "step": 2510 }, { "epoch": 0.05150205803452583, "grad_norm": 0.29537051916122437, "learning_rate": 0.0003, "loss": 5.4515, "step": 2515 }, { "epoch": 0.05160444781193046, "grad_norm": 0.29898619651794434, "learning_rate": 0.0003, "loss": 5.3557, "step": 2520 }, { "epoch": 0.05170683758933508, "grad_norm": 0.2907998859882355, "learning_rate": 0.0003, "loss": 5.4229, "step": 2525 }, { "epoch": 0.05180922736673971, "grad_norm": 0.3177140951156616, "learning_rate": 0.0003, "loss": 5.4054, "step": 2530 }, { "epoch": 0.05191161714414433, "grad_norm": 0.28961414098739624, "learning_rate": 0.0003, "loss": 5.417, "step": 2535 }, { "epoch": 0.05201400692154895, "grad_norm": 0.3086441159248352, "learning_rate": 0.0003, "loss": 5.4408, "step": 2540 }, { "epoch": 0.05211639669895358, "grad_norm": 0.2974289655685425, "learning_rate": 0.0003, "loss": 5.4342, "step": 2545 }, { "epoch": 0.0522187864763582, "grad_norm": 0.3043321967124939, "learning_rate": 0.0003, "loss": 5.3939, "step": 2550 }, { "epoch": 0.05232117625376283, "grad_norm": 0.313244104385376, "learning_rate": 0.0003, "loss": 5.4296, "step": 2555 }, { "epoch": 0.05242356603116745, "grad_norm": 0.3215901851654053, "learning_rate": 0.0003, "loss": 5.3516, "step": 2560 }, { "epoch": 0.05252595580857207, "grad_norm": 0.287676602602005, "learning_rate": 0.0003, "loss": 5.3864, "step": 2565 }, { "epoch": 0.0526283455859767, "grad_norm": 0.29060837626457214, "learning_rate": 0.0003, "loss": 5.399, "step": 2570 }, { "epoch": 0.05273073536338132, "grad_norm": 0.31551116704940796, "learning_rate": 0.0003, "loss": 5.436, "step": 2575 }, { "epoch": 0.05283312514078595, "grad_norm": 0.3053077161312103, "learning_rate": 0.0003, "loss": 5.408, "step": 2580 }, { "epoch": 0.05293551491819057, "grad_norm": 0.2839934825897217, "learning_rate": 0.0003, "loss": 5.4055, "step": 2585 }, { "epoch": 0.05303790469559519, "grad_norm": 0.2997831106185913, "learning_rate": 0.0003, "loss": 5.4258, "step": 2590 }, { "epoch": 0.05314029447299982, "grad_norm": 0.27672094106674194, "learning_rate": 0.0003, "loss": 5.4314, "step": 2595 }, { "epoch": 0.05324268425040444, "grad_norm": 0.2853901982307434, "learning_rate": 0.0003, "loss": 5.4281, "step": 2600 }, { "epoch": 0.05334507402780907, "grad_norm": 0.432605117559433, "learning_rate": 0.0003, "loss": 5.2178, "step": 2605 }, { "epoch": 0.05344746380521369, "grad_norm": 0.28586146235466003, "learning_rate": 0.0003, "loss": 5.4123, "step": 2610 }, { "epoch": 0.05354985358261831, "grad_norm": 0.2829761803150177, "learning_rate": 0.0003, "loss": 5.4273, "step": 2615 }, { "epoch": 0.05365224336002294, "grad_norm": 0.33027493953704834, "learning_rate": 0.0003, "loss": 5.3922, "step": 2620 }, { "epoch": 0.05375463313742756, "grad_norm": 0.28707289695739746, "learning_rate": 0.0003, "loss": 5.396, "step": 2625 }, { "epoch": 0.05385702291483219, "grad_norm": 0.35139745473861694, "learning_rate": 0.0003, "loss": 5.405, "step": 2630 }, { "epoch": 0.05395941269223681, "grad_norm": 0.27695679664611816, "learning_rate": 0.0003, "loss": 5.4257, "step": 2635 }, { "epoch": 0.05406180246964143, "grad_norm": 0.30533459782600403, "learning_rate": 0.0003, "loss": 5.3721, "step": 2640 }, { "epoch": 0.05416419224704606, "grad_norm": 0.3125362694263458, "learning_rate": 0.0003, "loss": 5.4178, "step": 2645 }, { "epoch": 0.05426658202445068, "grad_norm": 0.26875168085098267, "learning_rate": 0.0003, "loss": 5.4108, "step": 2650 }, { "epoch": 0.0543689718018553, "grad_norm": 0.2825389802455902, "learning_rate": 0.0003, "loss": 5.4012, "step": 2655 }, { "epoch": 0.05447136157925993, "grad_norm": 0.29759857058525085, "learning_rate": 0.0003, "loss": 5.3857, "step": 2660 }, { "epoch": 0.05457375135666455, "grad_norm": 0.2873668670654297, "learning_rate": 0.0003, "loss": 5.3778, "step": 2665 }, { "epoch": 0.05467614113406918, "grad_norm": 0.30925366282463074, "learning_rate": 0.0003, "loss": 5.4058, "step": 2670 }, { "epoch": 0.0547785309114738, "grad_norm": 0.31262725591659546, "learning_rate": 0.0003, "loss": 5.3799, "step": 2675 }, { "epoch": 0.05488092068887842, "grad_norm": 0.3050954341888428, "learning_rate": 0.0003, "loss": 5.3993, "step": 2680 }, { "epoch": 0.05498331046628305, "grad_norm": 0.28599846363067627, "learning_rate": 0.0003, "loss": 5.4062, "step": 2685 }, { "epoch": 0.05508570024368767, "grad_norm": 0.3238472044467926, "learning_rate": 0.0003, "loss": 5.3648, "step": 2690 }, { "epoch": 0.0551880900210923, "grad_norm": 0.26982608437538147, "learning_rate": 0.0003, "loss": 5.3961, "step": 2695 }, { "epoch": 0.05529047979849692, "grad_norm": 0.2549554705619812, "learning_rate": 0.0003, "loss": 5.4047, "step": 2700 }, { "epoch": 0.05539286957590154, "grad_norm": 0.26843875646591187, "learning_rate": 0.0003, "loss": 5.4177, "step": 2705 }, { "epoch": 0.05549525935330617, "grad_norm": 0.3723680377006531, "learning_rate": 0.0003, "loss": 5.349, "step": 2710 }, { "epoch": 0.05559764913071079, "grad_norm": 0.2782374620437622, "learning_rate": 0.0003, "loss": 5.3586, "step": 2715 }, { "epoch": 0.055700038908115417, "grad_norm": 0.3722502589225769, "learning_rate": 0.0003, "loss": 5.3817, "step": 2720 }, { "epoch": 0.05580242868552004, "grad_norm": 0.3401469886302948, "learning_rate": 0.0003, "loss": 5.331, "step": 2725 }, { "epoch": 0.05590481846292466, "grad_norm": 0.33351531624794006, "learning_rate": 0.0003, "loss": 5.3896, "step": 2730 }, { "epoch": 0.05600720824032929, "grad_norm": 0.2746967077255249, "learning_rate": 0.0003, "loss": 5.3825, "step": 2735 }, { "epoch": 0.05610959801773391, "grad_norm": 0.3015638291835785, "learning_rate": 0.0003, "loss": 5.3828, "step": 2740 }, { "epoch": 0.056211987795138536, "grad_norm": 0.27876561880111694, "learning_rate": 0.0003, "loss": 5.4345, "step": 2745 }, { "epoch": 0.05631437757254316, "grad_norm": 0.31784892082214355, "learning_rate": 0.0003, "loss": 5.3858, "step": 2750 }, { "epoch": 0.05641676734994778, "grad_norm": 0.2963598072528839, "learning_rate": 0.0003, "loss": 5.3802, "step": 2755 }, { "epoch": 0.05651915712735241, "grad_norm": 0.31250786781311035, "learning_rate": 0.0003, "loss": 5.355, "step": 2760 }, { "epoch": 0.05662154690475703, "grad_norm": 0.327053964138031, "learning_rate": 0.0003, "loss": 5.4345, "step": 2765 }, { "epoch": 0.056723936682161656, "grad_norm": 0.2861989736557007, "learning_rate": 0.0003, "loss": 5.3666, "step": 2770 }, { "epoch": 0.05682632645956628, "grad_norm": 0.3195374011993408, "learning_rate": 0.0003, "loss": 5.4485, "step": 2775 }, { "epoch": 0.0569287162369709, "grad_norm": 0.30048274993896484, "learning_rate": 0.0003, "loss": 5.3655, "step": 2780 }, { "epoch": 0.05703110601437553, "grad_norm": 0.3026953339576721, "learning_rate": 0.0003, "loss": 5.3903, "step": 2785 }, { "epoch": 0.05713349579178015, "grad_norm": 0.2979716956615448, "learning_rate": 0.0003, "loss": 5.3727, "step": 2790 }, { "epoch": 0.057235885569184776, "grad_norm": 0.29879677295684814, "learning_rate": 0.0003, "loss": 5.3949, "step": 2795 }, { "epoch": 0.0573382753465894, "grad_norm": 0.2776120901107788, "learning_rate": 0.0003, "loss": 5.3619, "step": 2800 }, { "epoch": 0.05744066512399402, "grad_norm": 0.29277926683425903, "learning_rate": 0.0003, "loss": 5.3101, "step": 2805 }, { "epoch": 0.057543054901398646, "grad_norm": 0.35536375641822815, "learning_rate": 0.0003, "loss": 5.2366, "step": 2810 }, { "epoch": 0.05764544467880327, "grad_norm": 0.3723018765449524, "learning_rate": 0.0003, "loss": 5.3171, "step": 2815 }, { "epoch": 0.05774783445620789, "grad_norm": 0.28674349188804626, "learning_rate": 0.0003, "loss": 5.3853, "step": 2820 }, { "epoch": 0.05785022423361252, "grad_norm": 0.26566022634506226, "learning_rate": 0.0003, "loss": 5.3303, "step": 2825 }, { "epoch": 0.05795261401101714, "grad_norm": 0.30490708351135254, "learning_rate": 0.0003, "loss": 5.3849, "step": 2830 }, { "epoch": 0.058055003788421766, "grad_norm": 0.33971863985061646, "learning_rate": 0.0003, "loss": 5.3618, "step": 2835 }, { "epoch": 0.05815739356582639, "grad_norm": 0.2888649106025696, "learning_rate": 0.0003, "loss": 5.3214, "step": 2840 }, { "epoch": 0.05825978334323101, "grad_norm": 0.27611035108566284, "learning_rate": 0.0003, "loss": 5.3986, "step": 2845 }, { "epoch": 0.05836217312063564, "grad_norm": 0.2928638160228729, "learning_rate": 0.0003, "loss": 5.3831, "step": 2850 }, { "epoch": 0.05846456289804026, "grad_norm": 0.26954448223114014, "learning_rate": 0.0003, "loss": 5.3531, "step": 2855 }, { "epoch": 0.058566952675444886, "grad_norm": 0.27161797881126404, "learning_rate": 0.0003, "loss": 5.3587, "step": 2860 }, { "epoch": 0.05866934245284951, "grad_norm": 0.27939358353614807, "learning_rate": 0.0003, "loss": 5.3553, "step": 2865 }, { "epoch": 0.05877173223025413, "grad_norm": 0.2974206507205963, "learning_rate": 0.0003, "loss": 5.3483, "step": 2870 }, { "epoch": 0.058874122007658757, "grad_norm": 0.27986735105514526, "learning_rate": 0.0003, "loss": 5.3578, "step": 2875 }, { "epoch": 0.05897651178506338, "grad_norm": 0.303239643573761, "learning_rate": 0.0003, "loss": 5.3732, "step": 2880 }, { "epoch": 0.059078901562468006, "grad_norm": 0.2981972396373749, "learning_rate": 0.0003, "loss": 5.3519, "step": 2885 }, { "epoch": 0.05918129133987263, "grad_norm": 0.3639703094959259, "learning_rate": 0.0003, "loss": 5.3359, "step": 2890 }, { "epoch": 0.05928368111727725, "grad_norm": 0.29545578360557556, "learning_rate": 0.0003, "loss": 5.3495, "step": 2895 }, { "epoch": 0.059386070894681876, "grad_norm": 0.30039510130882263, "learning_rate": 0.0003, "loss": 5.3114, "step": 2900 }, { "epoch": 0.0594884606720865, "grad_norm": 0.4698617458343506, "learning_rate": 0.0003, "loss": 5.3936, "step": 2905 }, { "epoch": 0.059590850449491126, "grad_norm": 0.4134523868560791, "learning_rate": 0.0003, "loss": 5.4452, "step": 2910 }, { "epoch": 0.05969324022689575, "grad_norm": 0.3588584065437317, "learning_rate": 0.0003, "loss": 5.3283, "step": 2915 }, { "epoch": 0.05979563000430037, "grad_norm": 0.33248433470726013, "learning_rate": 0.0003, "loss": 5.3824, "step": 2920 }, { "epoch": 0.059898019781704996, "grad_norm": 0.2713482677936554, "learning_rate": 0.0003, "loss": 5.3401, "step": 2925 }, { "epoch": 0.06000040955910962, "grad_norm": 0.30715176463127136, "learning_rate": 0.0003, "loss": 5.3221, "step": 2930 }, { "epoch": 0.060102799336514245, "grad_norm": 0.3150102198123932, "learning_rate": 0.0003, "loss": 5.3757, "step": 2935 }, { "epoch": 0.06020518911391887, "grad_norm": 0.2869734466075897, "learning_rate": 0.0003, "loss": 5.3552, "step": 2940 }, { "epoch": 0.06030757889132349, "grad_norm": 0.2754596769809723, "learning_rate": 0.0003, "loss": 5.3078, "step": 2945 }, { "epoch": 0.060409968668728116, "grad_norm": 0.2720445692539215, "learning_rate": 0.0003, "loss": 5.341, "step": 2950 }, { "epoch": 0.06051235844613274, "grad_norm": 0.256799578666687, "learning_rate": 0.0003, "loss": 5.3398, "step": 2955 }, { "epoch": 0.060614748223537365, "grad_norm": 0.25818538665771484, "learning_rate": 0.0003, "loss": 5.3507, "step": 2960 }, { "epoch": 0.060717138000941986, "grad_norm": 0.27038925886154175, "learning_rate": 0.0003, "loss": 5.3145, "step": 2965 }, { "epoch": 0.06081952777834661, "grad_norm": 0.30888015031814575, "learning_rate": 0.0003, "loss": 5.299, "step": 2970 }, { "epoch": 0.060921917555751236, "grad_norm": 0.27706509828567505, "learning_rate": 0.0003, "loss": 5.316, "step": 2975 }, { "epoch": 0.06102430733315586, "grad_norm": 0.3341308832168579, "learning_rate": 0.0003, "loss": 5.3509, "step": 2980 }, { "epoch": 0.061126697110560485, "grad_norm": 0.258544385433197, "learning_rate": 0.0003, "loss": 5.3369, "step": 2985 }, { "epoch": 0.061229086887965106, "grad_norm": 0.28303077816963196, "learning_rate": 0.0003, "loss": 5.3017, "step": 2990 }, { "epoch": 0.06133147666536973, "grad_norm": 0.26469340920448303, "learning_rate": 0.0003, "loss": 5.2852, "step": 2995 }, { "epoch": 0.061433866442774356, "grad_norm": 0.42151668667793274, "learning_rate": 0.0003, "loss": 5.3421, "step": 3000 }, { "epoch": 0.061433866442774356, "eval_loss": 5.355992317199707, "eval_runtime": 17.2748, "eval_samples_per_second": 33.691, "eval_steps_per_second": 4.226, "step": 3000 }, { "epoch": 0.06153625622017898, "grad_norm": 0.28036433458328247, "learning_rate": 0.0003, "loss": 5.3528, "step": 3005 }, { "epoch": 0.0616386459975836, "grad_norm": 0.3133857548236847, "learning_rate": 0.0003, "loss": 5.3184, "step": 3010 }, { "epoch": 0.061741035774988226, "grad_norm": 0.25309228897094727, "learning_rate": 0.0003, "loss": 5.3449, "step": 3015 }, { "epoch": 0.06184342555239285, "grad_norm": 0.28472191095352173, "learning_rate": 0.0003, "loss": 5.3376, "step": 3020 }, { "epoch": 0.061945815329797475, "grad_norm": 0.28121504187583923, "learning_rate": 0.0003, "loss": 5.3259, "step": 3025 }, { "epoch": 0.0620482051072021, "grad_norm": 0.2599506378173828, "learning_rate": 0.0003, "loss": 5.3605, "step": 3030 }, { "epoch": 0.06215059488460672, "grad_norm": 0.2690852880477905, "learning_rate": 0.0003, "loss": 5.312, "step": 3035 }, { "epoch": 0.062252984662011346, "grad_norm": 0.26789405941963196, "learning_rate": 0.0003, "loss": 5.3435, "step": 3040 }, { "epoch": 0.06235537443941597, "grad_norm": 0.26487815380096436, "learning_rate": 0.0003, "loss": 5.327, "step": 3045 }, { "epoch": 0.062457764216820595, "grad_norm": 0.4305364787578583, "learning_rate": 0.0003, "loss": 5.2532, "step": 3050 }, { "epoch": 0.06256015399422522, "grad_norm": 0.3147276043891907, "learning_rate": 0.0003, "loss": 5.3129, "step": 3055 }, { "epoch": 0.06266254377162984, "grad_norm": 0.2905116677284241, "learning_rate": 0.0003, "loss": 5.3561, "step": 3060 }, { "epoch": 0.06276493354903447, "grad_norm": 0.27454766631126404, "learning_rate": 0.0003, "loss": 5.3661, "step": 3065 }, { "epoch": 0.06286732332643909, "grad_norm": 0.29055505990982056, "learning_rate": 0.0003, "loss": 5.3424, "step": 3070 }, { "epoch": 0.06296971310384371, "grad_norm": 0.2825135886669159, "learning_rate": 0.0003, "loss": 5.2881, "step": 3075 }, { "epoch": 0.06307210288124834, "grad_norm": 0.2770971953868866, "learning_rate": 0.0003, "loss": 5.3529, "step": 3080 }, { "epoch": 0.06317449265865296, "grad_norm": 0.2654070556163788, "learning_rate": 0.0003, "loss": 5.2791, "step": 3085 }, { "epoch": 0.06327688243605759, "grad_norm": 0.25536680221557617, "learning_rate": 0.0003, "loss": 5.2641, "step": 3090 }, { "epoch": 0.0633792722134622, "grad_norm": 0.26588839292526245, "learning_rate": 0.0003, "loss": 5.3355, "step": 3095 }, { "epoch": 0.06348166199086683, "grad_norm": 0.2800893783569336, "learning_rate": 0.0003, "loss": 5.3052, "step": 3100 }, { "epoch": 0.06358405176827145, "grad_norm": 0.3270575702190399, "learning_rate": 0.0003, "loss": 5.3568, "step": 3105 }, { "epoch": 0.06368644154567608, "grad_norm": 0.28551337122917175, "learning_rate": 0.0003, "loss": 5.2801, "step": 3110 }, { "epoch": 0.0637888313230807, "grad_norm": 0.2678089141845703, "learning_rate": 0.0003, "loss": 5.3226, "step": 3115 }, { "epoch": 0.06389122110048533, "grad_norm": 1.0088434219360352, "learning_rate": 0.0003, "loss": 5.3141, "step": 3120 }, { "epoch": 0.06399361087788995, "grad_norm": 0.3256169855594635, "learning_rate": 0.0003, "loss": 5.3208, "step": 3125 }, { "epoch": 0.06409600065529457, "grad_norm": 0.2671166956424713, "learning_rate": 0.0003, "loss": 5.3377, "step": 3130 }, { "epoch": 0.0641983904326992, "grad_norm": 0.24780243635177612, "learning_rate": 0.0003, "loss": 5.3314, "step": 3135 }, { "epoch": 0.06430078021010383, "grad_norm": 0.30526068806648254, "learning_rate": 0.0003, "loss": 5.3694, "step": 3140 }, { "epoch": 0.06440316998750845, "grad_norm": 0.3097380995750427, "learning_rate": 0.0003, "loss": 5.3049, "step": 3145 }, { "epoch": 0.06450555976491307, "grad_norm": 0.2833743691444397, "learning_rate": 0.0003, "loss": 5.316, "step": 3150 }, { "epoch": 0.06460794954231769, "grad_norm": 0.26801565289497375, "learning_rate": 0.0003, "loss": 5.3528, "step": 3155 }, { "epoch": 0.06471033931972232, "grad_norm": 0.33694198727607727, "learning_rate": 0.0003, "loss": 5.3172, "step": 3160 }, { "epoch": 0.06481272909712694, "grad_norm": 0.36817753314971924, "learning_rate": 0.0003, "loss": 5.294, "step": 3165 }, { "epoch": 0.06491511887453157, "grad_norm": 0.28540757298469543, "learning_rate": 0.0003, "loss": 5.3004, "step": 3170 }, { "epoch": 0.06501750865193619, "grad_norm": 0.39156943559646606, "learning_rate": 0.0003, "loss": 5.2591, "step": 3175 }, { "epoch": 0.06511989842934081, "grad_norm": 0.27260908484458923, "learning_rate": 0.0003, "loss": 5.2961, "step": 3180 }, { "epoch": 0.06522228820674544, "grad_norm": 0.35306307673454285, "learning_rate": 0.0003, "loss": 5.3259, "step": 3185 }, { "epoch": 0.06532467798415006, "grad_norm": 0.27560725808143616, "learning_rate": 0.0003, "loss": 5.3031, "step": 3190 }, { "epoch": 0.06542706776155469, "grad_norm": 0.30916744470596313, "learning_rate": 0.0003, "loss": 5.3213, "step": 3195 }, { "epoch": 0.06552945753895931, "grad_norm": 0.26047390699386597, "learning_rate": 0.0003, "loss": 5.3198, "step": 3200 }, { "epoch": 0.06563184731636393, "grad_norm": 0.2895469069480896, "learning_rate": 0.0003, "loss": 5.3226, "step": 3205 }, { "epoch": 0.06573423709376856, "grad_norm": 0.27080896496772766, "learning_rate": 0.0003, "loss": 5.2866, "step": 3210 }, { "epoch": 0.06583662687117318, "grad_norm": 0.2811758816242218, "learning_rate": 0.0003, "loss": 5.3196, "step": 3215 }, { "epoch": 0.0659390166485778, "grad_norm": 0.29708757996559143, "learning_rate": 0.0003, "loss": 5.2298, "step": 3220 }, { "epoch": 0.06604140642598243, "grad_norm": 0.29997655749320984, "learning_rate": 0.0003, "loss": 5.3038, "step": 3225 }, { "epoch": 0.06614379620338705, "grad_norm": 0.3130645751953125, "learning_rate": 0.0003, "loss": 5.2443, "step": 3230 }, { "epoch": 0.06624618598079168, "grad_norm": 0.3080459535121918, "learning_rate": 0.0003, "loss": 5.3079, "step": 3235 }, { "epoch": 0.0663485757581963, "grad_norm": 0.3791588842868805, "learning_rate": 0.0003, "loss": 5.2863, "step": 3240 }, { "epoch": 0.06645096553560093, "grad_norm": 0.3161385953426361, "learning_rate": 0.0003, "loss": 5.3139, "step": 3245 }, { "epoch": 0.06655335531300555, "grad_norm": 0.2740941345691681, "learning_rate": 0.0003, "loss": 5.3195, "step": 3250 }, { "epoch": 0.06665574509041017, "grad_norm": 0.26943519711494446, "learning_rate": 0.0003, "loss": 5.2867, "step": 3255 }, { "epoch": 0.0667581348678148, "grad_norm": 0.2833729088306427, "learning_rate": 0.0003, "loss": 5.2506, "step": 3260 }, { "epoch": 0.06686052464521942, "grad_norm": 0.3066137135028839, "learning_rate": 0.0003, "loss": 5.2694, "step": 3265 }, { "epoch": 0.06696291442262405, "grad_norm": 0.3070327043533325, "learning_rate": 0.0003, "loss": 5.2966, "step": 3270 }, { "epoch": 0.06706530420002867, "grad_norm": 0.27142801880836487, "learning_rate": 0.0003, "loss": 5.2903, "step": 3275 }, { "epoch": 0.06716769397743329, "grad_norm": 0.26440364122390747, "learning_rate": 0.0003, "loss": 5.296, "step": 3280 }, { "epoch": 0.06727008375483792, "grad_norm": 0.27129313349723816, "learning_rate": 0.0003, "loss": 5.2924, "step": 3285 }, { "epoch": 0.06737247353224254, "grad_norm": 0.38910025358200073, "learning_rate": 0.0003, "loss": 5.339, "step": 3290 }, { "epoch": 0.06747486330964717, "grad_norm": 0.30716171860694885, "learning_rate": 0.0003, "loss": 5.2944, "step": 3295 }, { "epoch": 0.06757725308705179, "grad_norm": 0.3139477074146271, "learning_rate": 0.0003, "loss": 5.3103, "step": 3300 }, { "epoch": 0.06767964286445641, "grad_norm": 0.2722444534301758, "learning_rate": 0.0003, "loss": 5.2683, "step": 3305 }, { "epoch": 0.06778203264186104, "grad_norm": 0.2985815107822418, "learning_rate": 0.0003, "loss": 5.3044, "step": 3310 }, { "epoch": 0.06788442241926566, "grad_norm": 0.2560136318206787, "learning_rate": 0.0003, "loss": 5.3083, "step": 3315 }, { "epoch": 0.06798681219667028, "grad_norm": 0.25087544322013855, "learning_rate": 0.0003, "loss": 5.2957, "step": 3320 }, { "epoch": 0.0680892019740749, "grad_norm": 0.26948386430740356, "learning_rate": 0.0003, "loss": 5.2761, "step": 3325 }, { "epoch": 0.06819159175147953, "grad_norm": 0.2510646879673004, "learning_rate": 0.0003, "loss": 5.2919, "step": 3330 }, { "epoch": 0.06829398152888416, "grad_norm": 0.26186272501945496, "learning_rate": 0.0003, "loss": 5.3016, "step": 3335 }, { "epoch": 0.06839637130628878, "grad_norm": 0.3214239776134491, "learning_rate": 0.0003, "loss": 5.3502, "step": 3340 }, { "epoch": 0.0684987610836934, "grad_norm": 0.2626754939556122, "learning_rate": 0.0003, "loss": 5.2684, "step": 3345 }, { "epoch": 0.06860115086109803, "grad_norm": 0.2504951059818268, "learning_rate": 0.0003, "loss": 5.2904, "step": 3350 }, { "epoch": 0.06870354063850265, "grad_norm": 0.3028484284877777, "learning_rate": 0.0003, "loss": 5.3044, "step": 3355 }, { "epoch": 0.06880593041590728, "grad_norm": 0.2650691270828247, "learning_rate": 0.0003, "loss": 5.2882, "step": 3360 }, { "epoch": 0.0689083201933119, "grad_norm": 0.289113312959671, "learning_rate": 0.0003, "loss": 5.2831, "step": 3365 }, { "epoch": 0.06901070997071652, "grad_norm": 0.2897968888282776, "learning_rate": 0.0003, "loss": 5.2817, "step": 3370 }, { "epoch": 0.06911309974812115, "grad_norm": 0.27607211470603943, "learning_rate": 0.0003, "loss": 5.2752, "step": 3375 }, { "epoch": 0.06921548952552577, "grad_norm": 0.292235791683197, "learning_rate": 0.0003, "loss": 5.2336, "step": 3380 }, { "epoch": 0.0693178793029304, "grad_norm": 0.26884227991104126, "learning_rate": 0.0003, "loss": 5.2912, "step": 3385 }, { "epoch": 0.06942026908033502, "grad_norm": 0.2577894926071167, "learning_rate": 0.0003, "loss": 5.2823, "step": 3390 }, { "epoch": 0.06952265885773964, "grad_norm": 0.2647188603878021, "learning_rate": 0.0003, "loss": 5.2007, "step": 3395 }, { "epoch": 0.06962504863514427, "grad_norm": 0.2736581563949585, "learning_rate": 0.0003, "loss": 5.3041, "step": 3400 }, { "epoch": 0.06972743841254889, "grad_norm": 0.32651710510253906, "learning_rate": 0.0003, "loss": 5.3108, "step": 3405 }, { "epoch": 0.06982982818995352, "grad_norm": 0.2615613639354706, "learning_rate": 0.0003, "loss": 5.2654, "step": 3410 }, { "epoch": 0.06993221796735814, "grad_norm": 0.25817254185676575, "learning_rate": 0.0003, "loss": 5.2547, "step": 3415 }, { "epoch": 0.07003460774476276, "grad_norm": 0.2617282271385193, "learning_rate": 0.0003, "loss": 5.2543, "step": 3420 }, { "epoch": 0.07013699752216739, "grad_norm": 0.2925521433353424, "learning_rate": 0.0003, "loss": 5.2743, "step": 3425 }, { "epoch": 0.070239387299572, "grad_norm": 0.28042152523994446, "learning_rate": 0.0003, "loss": 5.2751, "step": 3430 }, { "epoch": 0.07034177707697664, "grad_norm": 0.30079346895217896, "learning_rate": 0.0003, "loss": 5.258, "step": 3435 }, { "epoch": 0.07044416685438126, "grad_norm": 0.2699873447418213, "learning_rate": 0.0003, "loss": 5.3195, "step": 3440 }, { "epoch": 0.07054655663178588, "grad_norm": 0.2844352126121521, "learning_rate": 0.0003, "loss": 5.2584, "step": 3445 }, { "epoch": 0.0706489464091905, "grad_norm": 0.42051392793655396, "learning_rate": 0.0003, "loss": 5.3133, "step": 3450 }, { "epoch": 0.07075133618659513, "grad_norm": 0.2783925533294678, "learning_rate": 0.0003, "loss": 5.2723, "step": 3455 }, { "epoch": 0.07085372596399975, "grad_norm": 0.3405534327030182, "learning_rate": 0.0003, "loss": 5.2908, "step": 3460 }, { "epoch": 0.07095611574140438, "grad_norm": 0.28797343373298645, "learning_rate": 0.0003, "loss": 5.2353, "step": 3465 }, { "epoch": 0.071058505518809, "grad_norm": 0.28846192359924316, "learning_rate": 0.0003, "loss": 5.2495, "step": 3470 }, { "epoch": 0.07116089529621362, "grad_norm": 0.25400757789611816, "learning_rate": 0.0003, "loss": 5.2741, "step": 3475 }, { "epoch": 0.07126328507361825, "grad_norm": 0.24486589431762695, "learning_rate": 0.0003, "loss": 5.2918, "step": 3480 }, { "epoch": 0.07136567485102287, "grad_norm": 0.3392948508262634, "learning_rate": 0.0003, "loss": 5.2564, "step": 3485 }, { "epoch": 0.0714680646284275, "grad_norm": 0.27733278274536133, "learning_rate": 0.0003, "loss": 5.2967, "step": 3490 }, { "epoch": 0.07157045440583212, "grad_norm": 0.28385838866233826, "learning_rate": 0.0003, "loss": 5.284, "step": 3495 }, { "epoch": 0.07167284418323674, "grad_norm": 0.3224107325077057, "learning_rate": 0.0003, "loss": 5.1633, "step": 3500 }, { "epoch": 0.07177523396064137, "grad_norm": 0.2476559430360794, "learning_rate": 0.0003, "loss": 5.2906, "step": 3505 }, { "epoch": 0.07187762373804599, "grad_norm": 0.2646051347255707, "learning_rate": 0.0003, "loss": 5.2459, "step": 3510 }, { "epoch": 0.07198001351545062, "grad_norm": 0.2576812207698822, "learning_rate": 0.0003, "loss": 5.3111, "step": 3515 }, { "epoch": 0.07208240329285524, "grad_norm": 0.260553240776062, "learning_rate": 0.0003, "loss": 5.2321, "step": 3520 }, { "epoch": 0.07218479307025986, "grad_norm": 0.261356383562088, "learning_rate": 0.0003, "loss": 5.2137, "step": 3525 }, { "epoch": 0.07228718284766449, "grad_norm": 0.24664048850536346, "learning_rate": 0.0003, "loss": 5.2492, "step": 3530 }, { "epoch": 0.0723895726250691, "grad_norm": 0.26263877749443054, "learning_rate": 0.0003, "loss": 5.229, "step": 3535 }, { "epoch": 0.07249196240247374, "grad_norm": 0.28026822209358215, "learning_rate": 0.0003, "loss": 5.2291, "step": 3540 }, { "epoch": 0.07259435217987836, "grad_norm": 0.2834453582763672, "learning_rate": 0.0003, "loss": 5.231, "step": 3545 }, { "epoch": 0.07269674195728298, "grad_norm": 0.28786909580230713, "learning_rate": 0.0003, "loss": 5.2743, "step": 3550 }, { "epoch": 0.0727991317346876, "grad_norm": 0.260613352060318, "learning_rate": 0.0003, "loss": 5.2542, "step": 3555 }, { "epoch": 0.07290152151209223, "grad_norm": 0.32520389556884766, "learning_rate": 0.0003, "loss": 5.2166, "step": 3560 }, { "epoch": 0.07300391128949686, "grad_norm": 0.2567857503890991, "learning_rate": 0.0003, "loss": 5.3022, "step": 3565 }, { "epoch": 0.07310630106690148, "grad_norm": 0.2632225453853607, "learning_rate": 0.0003, "loss": 5.2454, "step": 3570 }, { "epoch": 0.0732086908443061, "grad_norm": 0.26637014746665955, "learning_rate": 0.0003, "loss": 5.2041, "step": 3575 }, { "epoch": 0.07331108062171073, "grad_norm": 0.28755322098731995, "learning_rate": 0.0003, "loss": 5.2866, "step": 3580 }, { "epoch": 0.07341347039911535, "grad_norm": 0.32612115144729614, "learning_rate": 0.0003, "loss": 5.3024, "step": 3585 }, { "epoch": 0.07351586017651998, "grad_norm": 0.30339866876602173, "learning_rate": 0.0003, "loss": 5.2562, "step": 3590 }, { "epoch": 0.0736182499539246, "grad_norm": 0.2615393400192261, "learning_rate": 0.0003, "loss": 5.2801, "step": 3595 }, { "epoch": 0.07372063973132922, "grad_norm": 0.28131192922592163, "learning_rate": 0.0003, "loss": 5.2344, "step": 3600 }, { "epoch": 0.07382302950873385, "grad_norm": 0.2984217703342438, "learning_rate": 0.0003, "loss": 5.2558, "step": 3605 }, { "epoch": 0.07392541928613847, "grad_norm": 0.3154425323009491, "learning_rate": 0.0003, "loss": 5.2796, "step": 3610 }, { "epoch": 0.0740278090635431, "grad_norm": 0.27574992179870605, "learning_rate": 0.0003, "loss": 5.2575, "step": 3615 }, { "epoch": 0.07413019884094772, "grad_norm": 0.2771117687225342, "learning_rate": 0.0003, "loss": 5.2447, "step": 3620 }, { "epoch": 0.07423258861835234, "grad_norm": 0.2700023949146271, "learning_rate": 0.0003, "loss": 5.2851, "step": 3625 }, { "epoch": 0.07433497839575696, "grad_norm": 0.24821822345256805, "learning_rate": 0.0003, "loss": 5.2321, "step": 3630 }, { "epoch": 0.07443736817316159, "grad_norm": 0.2916540801525116, "learning_rate": 0.0003, "loss": 5.2355, "step": 3635 }, { "epoch": 0.07453975795056622, "grad_norm": 0.24802926182746887, "learning_rate": 0.0003, "loss": 5.2403, "step": 3640 }, { "epoch": 0.07464214772797084, "grad_norm": 0.24772369861602783, "learning_rate": 0.0003, "loss": 5.2455, "step": 3645 }, { "epoch": 0.07474453750537546, "grad_norm": 0.25520002841949463, "learning_rate": 0.0003, "loss": 5.2628, "step": 3650 }, { "epoch": 0.07484692728278008, "grad_norm": 0.2472926676273346, "learning_rate": 0.0003, "loss": 5.2709, "step": 3655 }, { "epoch": 0.0749493170601847, "grad_norm": 0.2765505015850067, "learning_rate": 0.0003, "loss": 5.2843, "step": 3660 }, { "epoch": 0.07505170683758934, "grad_norm": 0.26966553926467896, "learning_rate": 0.0003, "loss": 5.2948, "step": 3665 }, { "epoch": 0.07515409661499396, "grad_norm": 0.2616786062717438, "learning_rate": 0.0003, "loss": 5.2205, "step": 3670 }, { "epoch": 0.07525648639239858, "grad_norm": 0.27883464097976685, "learning_rate": 0.0003, "loss": 5.2227, "step": 3675 }, { "epoch": 0.0753588761698032, "grad_norm": 0.2898852825164795, "learning_rate": 0.0003, "loss": 5.2056, "step": 3680 }, { "epoch": 0.07546126594720783, "grad_norm": 0.2799520194530487, "learning_rate": 0.0003, "loss": 5.2248, "step": 3685 }, { "epoch": 0.07556365572461246, "grad_norm": 0.24998842179775238, "learning_rate": 0.0003, "loss": 5.2606, "step": 3690 }, { "epoch": 0.07566604550201708, "grad_norm": 0.32577699422836304, "learning_rate": 0.0003, "loss": 5.2199, "step": 3695 }, { "epoch": 0.0757684352794217, "grad_norm": 0.3090455234050751, "learning_rate": 0.0003, "loss": 5.2008, "step": 3700 }, { "epoch": 0.07587082505682632, "grad_norm": 0.29049402475357056, "learning_rate": 0.0003, "loss": 5.2657, "step": 3705 }, { "epoch": 0.07597321483423095, "grad_norm": 0.2764868140220642, "learning_rate": 0.0003, "loss": 5.2243, "step": 3710 }, { "epoch": 0.07607560461163558, "grad_norm": 0.2757784426212311, "learning_rate": 0.0003, "loss": 5.2708, "step": 3715 }, { "epoch": 0.0761779943890402, "grad_norm": 0.2754042446613312, "learning_rate": 0.0003, "loss": 5.2574, "step": 3720 }, { "epoch": 0.07628038416644482, "grad_norm": 0.2590422034263611, "learning_rate": 0.0003, "loss": 5.2617, "step": 3725 }, { "epoch": 0.07638277394384944, "grad_norm": 0.2684026062488556, "learning_rate": 0.0003, "loss": 5.201, "step": 3730 }, { "epoch": 0.07648516372125407, "grad_norm": 0.28633198142051697, "learning_rate": 0.0003, "loss": 5.2652, "step": 3735 }, { "epoch": 0.0765875534986587, "grad_norm": 0.2651969790458679, "learning_rate": 0.0003, "loss": 5.2194, "step": 3740 }, { "epoch": 0.07668994327606332, "grad_norm": 0.25093379616737366, "learning_rate": 0.0003, "loss": 5.2727, "step": 3745 }, { "epoch": 0.07679233305346794, "grad_norm": 0.2667597234249115, "learning_rate": 0.0003, "loss": 5.2158, "step": 3750 }, { "epoch": 0.07689472283087256, "grad_norm": 0.26323240995407104, "learning_rate": 0.0003, "loss": 5.2185, "step": 3755 }, { "epoch": 0.07699711260827719, "grad_norm": 0.24883873760700226, "learning_rate": 0.0003, "loss": 5.2437, "step": 3760 }, { "epoch": 0.07709950238568182, "grad_norm": 0.24001909792423248, "learning_rate": 0.0003, "loss": 5.2574, "step": 3765 }, { "epoch": 0.07720189216308644, "grad_norm": 0.2598496377468109, "learning_rate": 0.0003, "loss": 5.1732, "step": 3770 }, { "epoch": 0.07730428194049106, "grad_norm": 0.26076823472976685, "learning_rate": 0.0003, "loss": 5.2255, "step": 3775 }, { "epoch": 0.07740667171789568, "grad_norm": 0.24912242591381073, "learning_rate": 0.0003, "loss": 5.2146, "step": 3780 }, { "epoch": 0.0775090614953003, "grad_norm": 0.2977564036846161, "learning_rate": 0.0003, "loss": 5.1693, "step": 3785 }, { "epoch": 0.07761145127270494, "grad_norm": 0.27076271176338196, "learning_rate": 0.0003, "loss": 5.2399, "step": 3790 }, { "epoch": 0.07771384105010956, "grad_norm": 0.2570875883102417, "learning_rate": 0.0003, "loss": 5.2905, "step": 3795 }, { "epoch": 0.07781623082751418, "grad_norm": 0.25177836418151855, "learning_rate": 0.0003, "loss": 5.2525, "step": 3800 }, { "epoch": 0.0779186206049188, "grad_norm": 0.32862451672554016, "learning_rate": 0.0003, "loss": 5.1975, "step": 3805 }, { "epoch": 0.07802101038232342, "grad_norm": 0.274938702583313, "learning_rate": 0.0003, "loss": 5.2303, "step": 3810 }, { "epoch": 0.07812340015972805, "grad_norm": 0.27677834033966064, "learning_rate": 0.0003, "loss": 5.2273, "step": 3815 }, { "epoch": 0.07822578993713268, "grad_norm": 0.2708391845226288, "learning_rate": 0.0003, "loss": 5.242, "step": 3820 }, { "epoch": 0.0783281797145373, "grad_norm": 0.25845611095428467, "learning_rate": 0.0003, "loss": 5.2319, "step": 3825 }, { "epoch": 0.07843056949194192, "grad_norm": 0.2609349489212036, "learning_rate": 0.0003, "loss": 5.2428, "step": 3830 }, { "epoch": 0.07853295926934654, "grad_norm": 0.2555552124977112, "learning_rate": 0.0003, "loss": 5.2292, "step": 3835 }, { "epoch": 0.07863534904675117, "grad_norm": 0.2581983506679535, "learning_rate": 0.0003, "loss": 5.1964, "step": 3840 }, { "epoch": 0.0787377388241558, "grad_norm": 0.3047882318496704, "learning_rate": 0.0003, "loss": 5.3112, "step": 3845 }, { "epoch": 0.07884012860156042, "grad_norm": 0.7512697577476501, "learning_rate": 0.0003, "loss": 5.2443, "step": 3850 }, { "epoch": 0.07894251837896504, "grad_norm": 0.321704238653183, "learning_rate": 0.0003, "loss": 5.2639, "step": 3855 }, { "epoch": 0.07904490815636966, "grad_norm": 0.27286747097969055, "learning_rate": 0.0003, "loss": 5.1557, "step": 3860 }, { "epoch": 0.07914729793377429, "grad_norm": 0.2746713161468506, "learning_rate": 0.0003, "loss": 5.2666, "step": 3865 }, { "epoch": 0.07924968771117892, "grad_norm": 0.3512607216835022, "learning_rate": 0.0003, "loss": 5.2191, "step": 3870 }, { "epoch": 0.07935207748858354, "grad_norm": 0.2750149965286255, "learning_rate": 0.0003, "loss": 5.2028, "step": 3875 }, { "epoch": 0.07945446726598816, "grad_norm": 0.2558034658432007, "learning_rate": 0.0003, "loss": 5.2566, "step": 3880 }, { "epoch": 0.07955685704339278, "grad_norm": 0.2888765037059784, "learning_rate": 0.0003, "loss": 5.2306, "step": 3885 }, { "epoch": 0.0796592468207974, "grad_norm": 0.3252604305744171, "learning_rate": 0.0003, "loss": 5.173, "step": 3890 }, { "epoch": 0.07976163659820204, "grad_norm": 0.2762053906917572, "learning_rate": 0.0003, "loss": 5.1917, "step": 3895 }, { "epoch": 0.07986402637560666, "grad_norm": 0.270160973072052, "learning_rate": 0.0003, "loss": 5.2439, "step": 3900 }, { "epoch": 0.07996641615301128, "grad_norm": 0.2815668284893036, "learning_rate": 0.0003, "loss": 5.2011, "step": 3905 }, { "epoch": 0.0800688059304159, "grad_norm": 0.24999023973941803, "learning_rate": 0.0003, "loss": 5.2585, "step": 3910 }, { "epoch": 0.08017119570782053, "grad_norm": 0.2633795142173767, "learning_rate": 0.0003, "loss": 5.1888, "step": 3915 }, { "epoch": 0.08027358548522516, "grad_norm": 0.25494393706321716, "learning_rate": 0.0003, "loss": 5.2547, "step": 3920 }, { "epoch": 0.08037597526262978, "grad_norm": 0.2790360450744629, "learning_rate": 0.0003, "loss": 5.1955, "step": 3925 }, { "epoch": 0.0804783650400344, "grad_norm": 0.2529444396495819, "learning_rate": 0.0003, "loss": 5.2178, "step": 3930 }, { "epoch": 0.08058075481743902, "grad_norm": 0.2555560767650604, "learning_rate": 0.0003, "loss": 5.2064, "step": 3935 }, { "epoch": 0.08068314459484364, "grad_norm": 0.2914392352104187, "learning_rate": 0.0003, "loss": 5.2048, "step": 3940 }, { "epoch": 0.08078553437224828, "grad_norm": 0.2684366703033447, "learning_rate": 0.0003, "loss": 5.1977, "step": 3945 }, { "epoch": 0.0808879241496529, "grad_norm": 0.2719501256942749, "learning_rate": 0.0003, "loss": 5.1895, "step": 3950 }, { "epoch": 0.08099031392705752, "grad_norm": 0.27822640538215637, "learning_rate": 0.0003, "loss": 5.1856, "step": 3955 }, { "epoch": 0.08109270370446214, "grad_norm": 0.30609002709388733, "learning_rate": 0.0003, "loss": 5.1872, "step": 3960 }, { "epoch": 0.08119509348186676, "grad_norm": 0.26783978939056396, "learning_rate": 0.0003, "loss": 5.2106, "step": 3965 }, { "epoch": 0.0812974832592714, "grad_norm": 0.2516503930091858, "learning_rate": 0.0003, "loss": 5.3007, "step": 3970 }, { "epoch": 0.08139987303667602, "grad_norm": 0.26788803935050964, "learning_rate": 0.0003, "loss": 5.2178, "step": 3975 }, { "epoch": 0.08150226281408064, "grad_norm": 0.25448089838027954, "learning_rate": 0.0003, "loss": 5.2223, "step": 3980 }, { "epoch": 0.08160465259148526, "grad_norm": 0.26501232385635376, "learning_rate": 0.0003, "loss": 5.2898, "step": 3985 }, { "epoch": 0.08170704236888988, "grad_norm": 0.26919659972190857, "learning_rate": 0.0003, "loss": 5.1992, "step": 3990 }, { "epoch": 0.08180943214629452, "grad_norm": 0.29884156584739685, "learning_rate": 0.0003, "loss": 5.19, "step": 3995 }, { "epoch": 0.08191182192369914, "grad_norm": 0.26546797156333923, "learning_rate": 0.0003, "loss": 5.207, "step": 4000 }, { "epoch": 0.08191182192369914, "eval_loss": 5.231751441955566, "eval_runtime": 17.2576, "eval_samples_per_second": 33.724, "eval_steps_per_second": 4.23, "step": 4000 }, { "epoch": 0.08201421170110376, "grad_norm": 0.28061461448669434, "learning_rate": 0.0003, "loss": 5.1794, "step": 4005 }, { "epoch": 0.08211660147850838, "grad_norm": 0.3089412748813629, "learning_rate": 0.0003, "loss": 5.2058, "step": 4010 }, { "epoch": 0.082218991255913, "grad_norm": 0.24918992817401886, "learning_rate": 0.0003, "loss": 5.1594, "step": 4015 }, { "epoch": 0.08232138103331764, "grad_norm": 0.2777044475078583, "learning_rate": 0.0003, "loss": 5.209, "step": 4020 }, { "epoch": 0.08242377081072226, "grad_norm": 0.262815922498703, "learning_rate": 0.0003, "loss": 5.2152, "step": 4025 }, { "epoch": 0.08252616058812688, "grad_norm": 0.29473477602005005, "learning_rate": 0.0003, "loss": 5.1525, "step": 4030 }, { "epoch": 0.0826285503655315, "grad_norm": 0.2584522068500519, "learning_rate": 0.0003, "loss": 5.2217, "step": 4035 }, { "epoch": 0.08273094014293612, "grad_norm": 0.22683992981910706, "learning_rate": 0.0003, "loss": 5.2038, "step": 4040 }, { "epoch": 0.08283332992034076, "grad_norm": 0.23628821969032288, "learning_rate": 0.0003, "loss": 5.2474, "step": 4045 }, { "epoch": 0.08293571969774538, "grad_norm": 0.2681495249271393, "learning_rate": 0.0003, "loss": 5.2409, "step": 4050 }, { "epoch": 0.08303810947515, "grad_norm": 0.29070958495140076, "learning_rate": 0.0003, "loss": 5.1777, "step": 4055 }, { "epoch": 0.08314049925255462, "grad_norm": 0.3845941126346588, "learning_rate": 0.0003, "loss": 5.2047, "step": 4060 }, { "epoch": 0.08324288902995924, "grad_norm": 0.2862512171268463, "learning_rate": 0.0003, "loss": 5.1744, "step": 4065 }, { "epoch": 0.08334527880736388, "grad_norm": 0.2699524760246277, "learning_rate": 0.0003, "loss": 5.1751, "step": 4070 }, { "epoch": 0.0834476685847685, "grad_norm": 0.277503103017807, "learning_rate": 0.0003, "loss": 5.2415, "step": 4075 }, { "epoch": 0.08355005836217312, "grad_norm": 0.26220452785491943, "learning_rate": 0.0003, "loss": 5.2355, "step": 4080 }, { "epoch": 0.08365244813957774, "grad_norm": 0.25356751680374146, "learning_rate": 0.0003, "loss": 5.2204, "step": 4085 }, { "epoch": 0.08375483791698236, "grad_norm": 0.24540142714977264, "learning_rate": 0.0003, "loss": 5.181, "step": 4090 }, { "epoch": 0.083857227694387, "grad_norm": 0.24948079884052277, "learning_rate": 0.0003, "loss": 5.1546, "step": 4095 }, { "epoch": 0.08395961747179162, "grad_norm": 0.2514345645904541, "learning_rate": 0.0003, "loss": 5.2463, "step": 4100 }, { "epoch": 0.08406200724919624, "grad_norm": 0.22873541712760925, "learning_rate": 0.0003, "loss": 5.1944, "step": 4105 }, { "epoch": 0.08416439702660086, "grad_norm": 0.29980912804603577, "learning_rate": 0.0003, "loss": 5.2013, "step": 4110 }, { "epoch": 0.08426678680400548, "grad_norm": 0.37752649188041687, "learning_rate": 0.0003, "loss": 5.1881, "step": 4115 }, { "epoch": 0.08436917658141012, "grad_norm": 0.27054810523986816, "learning_rate": 0.0003, "loss": 5.1772, "step": 4120 }, { "epoch": 0.08447156635881474, "grad_norm": 0.2550160586833954, "learning_rate": 0.0003, "loss": 5.2239, "step": 4125 }, { "epoch": 0.08457395613621936, "grad_norm": 0.2638941705226898, "learning_rate": 0.0003, "loss": 5.221, "step": 4130 }, { "epoch": 0.08467634591362398, "grad_norm": 0.2580711245536804, "learning_rate": 0.0003, "loss": 5.1863, "step": 4135 }, { "epoch": 0.0847787356910286, "grad_norm": 0.26525193452835083, "learning_rate": 0.0003, "loss": 5.1725, "step": 4140 }, { "epoch": 0.08488112546843324, "grad_norm": 0.2745465040206909, "learning_rate": 0.0003, "loss": 5.2119, "step": 4145 }, { "epoch": 0.08498351524583786, "grad_norm": 0.2355852574110031, "learning_rate": 0.0003, "loss": 5.1977, "step": 4150 }, { "epoch": 0.08508590502324248, "grad_norm": 0.2682251036167145, "learning_rate": 0.0003, "loss": 5.2275, "step": 4155 }, { "epoch": 0.0851882948006471, "grad_norm": 0.26811787486076355, "learning_rate": 0.0003, "loss": 5.1731, "step": 4160 }, { "epoch": 0.08529068457805172, "grad_norm": 0.26039308309555054, "learning_rate": 0.0003, "loss": 5.1438, "step": 4165 }, { "epoch": 0.08539307435545636, "grad_norm": 0.31183162331581116, "learning_rate": 0.0003, "loss": 5.2215, "step": 4170 }, { "epoch": 0.08549546413286098, "grad_norm": 0.30162638425827026, "learning_rate": 0.0003, "loss": 5.1634, "step": 4175 }, { "epoch": 0.0855978539102656, "grad_norm": 0.27771133184432983, "learning_rate": 0.0003, "loss": 5.1333, "step": 4180 }, { "epoch": 0.08570024368767022, "grad_norm": 0.30736786127090454, "learning_rate": 0.0003, "loss": 5.2121, "step": 4185 }, { "epoch": 0.08580263346507484, "grad_norm": 0.29298821091651917, "learning_rate": 0.0003, "loss": 5.224, "step": 4190 }, { "epoch": 0.08590502324247946, "grad_norm": 0.2820582091808319, "learning_rate": 0.0003, "loss": 5.1978, "step": 4195 }, { "epoch": 0.0860074130198841, "grad_norm": 0.26031070947647095, "learning_rate": 0.0003, "loss": 5.2011, "step": 4200 }, { "epoch": 0.08610980279728872, "grad_norm": 0.321780264377594, "learning_rate": 0.0003, "loss": 5.1574, "step": 4205 }, { "epoch": 0.08621219257469334, "grad_norm": 0.31359395384788513, "learning_rate": 0.0003, "loss": 5.1827, "step": 4210 }, { "epoch": 0.08631458235209796, "grad_norm": 0.7416604161262512, "learning_rate": 0.0003, "loss": 5.0659, "step": 4215 }, { "epoch": 0.08641697212950258, "grad_norm": 0.2704704701900482, "learning_rate": 0.0003, "loss": 5.1829, "step": 4220 }, { "epoch": 0.08651936190690722, "grad_norm": 0.2521301209926605, "learning_rate": 0.0003, "loss": 5.2136, "step": 4225 }, { "epoch": 0.08662175168431184, "grad_norm": 0.25025126338005066, "learning_rate": 0.0003, "loss": 5.1878, "step": 4230 }, { "epoch": 0.08672414146171646, "grad_norm": 0.3045847415924072, "learning_rate": 0.0003, "loss": 5.1402, "step": 4235 }, { "epoch": 0.08682653123912108, "grad_norm": 0.5664308071136475, "learning_rate": 0.0003, "loss": 5.1917, "step": 4240 }, { "epoch": 0.0869289210165257, "grad_norm": 0.2421114146709442, "learning_rate": 0.0003, "loss": 5.1617, "step": 4245 }, { "epoch": 0.08703131079393034, "grad_norm": 0.26926347613334656, "learning_rate": 0.0003, "loss": 5.2445, "step": 4250 }, { "epoch": 0.08713370057133496, "grad_norm": 0.2814469337463379, "learning_rate": 0.0003, "loss": 5.2284, "step": 4255 }, { "epoch": 0.08723609034873958, "grad_norm": 0.2786811590194702, "learning_rate": 0.0003, "loss": 5.2273, "step": 4260 }, { "epoch": 0.0873384801261442, "grad_norm": 0.30829358100891113, "learning_rate": 0.0003, "loss": 5.1942, "step": 4265 }, { "epoch": 0.08744086990354882, "grad_norm": 0.2992086708545685, "learning_rate": 0.0003, "loss": 5.1899, "step": 4270 }, { "epoch": 0.08754325968095346, "grad_norm": 0.285158634185791, "learning_rate": 0.0003, "loss": 5.218, "step": 4275 }, { "epoch": 0.08764564945835808, "grad_norm": 0.2520645558834076, "learning_rate": 0.0003, "loss": 5.2074, "step": 4280 }, { "epoch": 0.0877480392357627, "grad_norm": 0.23772001266479492, "learning_rate": 0.0003, "loss": 5.21, "step": 4285 }, { "epoch": 0.08785042901316732, "grad_norm": 0.2620167136192322, "learning_rate": 0.0003, "loss": 5.1579, "step": 4290 }, { "epoch": 0.08795281879057194, "grad_norm": 0.24005870521068573, "learning_rate": 0.0003, "loss": 5.1406, "step": 4295 }, { "epoch": 0.08805520856797658, "grad_norm": 0.27313220500946045, "learning_rate": 0.0003, "loss": 5.2065, "step": 4300 }, { "epoch": 0.0881575983453812, "grad_norm": 0.2567211091518402, "learning_rate": 0.0003, "loss": 5.1661, "step": 4305 }, { "epoch": 0.08825998812278582, "grad_norm": 0.2909543216228485, "learning_rate": 0.0003, "loss": 5.1768, "step": 4310 }, { "epoch": 0.08836237790019044, "grad_norm": 0.2807057201862335, "learning_rate": 0.0003, "loss": 5.2037, "step": 4315 }, { "epoch": 0.08846476767759506, "grad_norm": 0.24894212186336517, "learning_rate": 0.0003, "loss": 5.1616, "step": 4320 }, { "epoch": 0.0885671574549997, "grad_norm": 0.2790505290031433, "learning_rate": 0.0003, "loss": 5.1889, "step": 4325 }, { "epoch": 0.08866954723240432, "grad_norm": 0.24801431596279144, "learning_rate": 0.0003, "loss": 5.1444, "step": 4330 }, { "epoch": 0.08877193700980894, "grad_norm": 0.2559351921081543, "learning_rate": 0.0003, "loss": 5.1971, "step": 4335 }, { "epoch": 0.08887432678721356, "grad_norm": 0.2676590383052826, "learning_rate": 0.0003, "loss": 5.1643, "step": 4340 }, { "epoch": 0.08897671656461818, "grad_norm": 0.26642993092536926, "learning_rate": 0.0003, "loss": 5.1413, "step": 4345 }, { "epoch": 0.08907910634202282, "grad_norm": 0.386616587638855, "learning_rate": 0.0003, "loss": 5.1662, "step": 4350 }, { "epoch": 0.08918149611942744, "grad_norm": 0.26433825492858887, "learning_rate": 0.0003, "loss": 5.1705, "step": 4355 }, { "epoch": 0.08928388589683206, "grad_norm": 0.2568153142929077, "learning_rate": 0.0003, "loss": 5.1989, "step": 4360 }, { "epoch": 0.08938627567423668, "grad_norm": 0.24500320851802826, "learning_rate": 0.0003, "loss": 5.1974, "step": 4365 }, { "epoch": 0.0894886654516413, "grad_norm": 0.2564222514629364, "learning_rate": 0.0003, "loss": 5.2068, "step": 4370 }, { "epoch": 0.08959105522904594, "grad_norm": 0.3021552562713623, "learning_rate": 0.0003, "loss": 5.166, "step": 4375 }, { "epoch": 0.08969344500645056, "grad_norm": 0.509596586227417, "learning_rate": 0.0003, "loss": 5.2846, "step": 4380 }, { "epoch": 0.08979583478385518, "grad_norm": 0.30619922280311584, "learning_rate": 0.0003, "loss": 5.1868, "step": 4385 }, { "epoch": 0.0898982245612598, "grad_norm": 0.3148152530193329, "learning_rate": 0.0003, "loss": 5.1385, "step": 4390 }, { "epoch": 0.09000061433866442, "grad_norm": 0.29068124294281006, "learning_rate": 0.0003, "loss": 5.1808, "step": 4395 }, { "epoch": 0.09010300411606906, "grad_norm": 0.25925976037979126, "learning_rate": 0.0003, "loss": 5.188, "step": 4400 }, { "epoch": 0.09020539389347368, "grad_norm": 0.2427985966205597, "learning_rate": 0.0003, "loss": 5.1415, "step": 4405 }, { "epoch": 0.0903077836708783, "grad_norm": 0.25115054845809937, "learning_rate": 0.0003, "loss": 5.1682, "step": 4410 }, { "epoch": 0.09041017344828292, "grad_norm": 0.33867621421813965, "learning_rate": 0.0003, "loss": 5.1595, "step": 4415 }, { "epoch": 0.09051256322568754, "grad_norm": 0.27806276082992554, "learning_rate": 0.0003, "loss": 5.1811, "step": 4420 }, { "epoch": 0.09061495300309218, "grad_norm": 0.2543964684009552, "learning_rate": 0.0003, "loss": 5.174, "step": 4425 }, { "epoch": 0.0907173427804968, "grad_norm": 0.26773425936698914, "learning_rate": 0.0003, "loss": 5.1857, "step": 4430 }, { "epoch": 0.09081973255790142, "grad_norm": 0.257776141166687, "learning_rate": 0.0003, "loss": 5.157, "step": 4435 }, { "epoch": 0.09092212233530604, "grad_norm": 0.25122201442718506, "learning_rate": 0.0003, "loss": 5.2171, "step": 4440 }, { "epoch": 0.09102451211271066, "grad_norm": 0.2553665041923523, "learning_rate": 0.0003, "loss": 5.1882, "step": 4445 }, { "epoch": 0.0911269018901153, "grad_norm": 0.25656768679618835, "learning_rate": 0.0003, "loss": 5.154, "step": 4450 }, { "epoch": 0.09122929166751992, "grad_norm": 0.24263906478881836, "learning_rate": 0.0003, "loss": 5.1328, "step": 4455 }, { "epoch": 0.09133168144492454, "grad_norm": 0.270973801612854, "learning_rate": 0.0003, "loss": 5.2115, "step": 4460 }, { "epoch": 0.09143407122232916, "grad_norm": 0.26029303669929504, "learning_rate": 0.0003, "loss": 5.0833, "step": 4465 }, { "epoch": 0.09153646099973378, "grad_norm": 0.260354220867157, "learning_rate": 0.0003, "loss": 5.1212, "step": 4470 }, { "epoch": 0.09163885077713842, "grad_norm": 0.2506909966468811, "learning_rate": 0.0003, "loss": 5.1503, "step": 4475 }, { "epoch": 0.09174124055454304, "grad_norm": 0.2777837812900543, "learning_rate": 0.0003, "loss": 5.1238, "step": 4480 }, { "epoch": 0.09184363033194766, "grad_norm": 0.300954133272171, "learning_rate": 0.0003, "loss": 5.1712, "step": 4485 }, { "epoch": 0.09194602010935228, "grad_norm": 0.2487037479877472, "learning_rate": 0.0003, "loss": 5.1237, "step": 4490 }, { "epoch": 0.0920484098867569, "grad_norm": 0.2557465732097626, "learning_rate": 0.0003, "loss": 5.1414, "step": 4495 }, { "epoch": 0.09215079966416154, "grad_norm": 0.27927854657173157, "learning_rate": 0.0003, "loss": 5.1912, "step": 4500 }, { "epoch": 0.09225318944156616, "grad_norm": 0.2784734070301056, "learning_rate": 0.0003, "loss": 5.1419, "step": 4505 }, { "epoch": 0.09235557921897078, "grad_norm": 0.2585296928882599, "learning_rate": 0.0003, "loss": 5.1454, "step": 4510 }, { "epoch": 0.0924579689963754, "grad_norm": 0.24738222360610962, "learning_rate": 0.0003, "loss": 5.1355, "step": 4515 }, { "epoch": 0.09256035877378002, "grad_norm": 0.2655542492866516, "learning_rate": 0.0003, "loss": 5.1845, "step": 4520 }, { "epoch": 0.09266274855118466, "grad_norm": 0.24730607867240906, "learning_rate": 0.0003, "loss": 5.1778, "step": 4525 }, { "epoch": 0.09276513832858928, "grad_norm": 0.24619928002357483, "learning_rate": 0.0003, "loss": 5.1195, "step": 4530 }, { "epoch": 0.0928675281059939, "grad_norm": 0.23726853728294373, "learning_rate": 0.0003, "loss": 5.1595, "step": 4535 }, { "epoch": 0.09296991788339852, "grad_norm": 0.24438048899173737, "learning_rate": 0.0003, "loss": 5.2443, "step": 4540 }, { "epoch": 0.09307230766080314, "grad_norm": 0.26595500111579895, "learning_rate": 0.0003, "loss": 5.1405, "step": 4545 }, { "epoch": 0.09317469743820776, "grad_norm": 0.2615746855735779, "learning_rate": 0.0003, "loss": 5.1826, "step": 4550 }, { "epoch": 0.0932770872156124, "grad_norm": 0.282305508852005, "learning_rate": 0.0003, "loss": 5.1594, "step": 4555 }, { "epoch": 0.09337947699301702, "grad_norm": 0.24737706780433655, "learning_rate": 0.0003, "loss": 5.1662, "step": 4560 }, { "epoch": 0.09348186677042164, "grad_norm": 0.2563910186290741, "learning_rate": 0.0003, "loss": 5.1875, "step": 4565 }, { "epoch": 0.09358425654782626, "grad_norm": 0.26837265491485596, "learning_rate": 0.0003, "loss": 5.1543, "step": 4570 }, { "epoch": 0.09368664632523088, "grad_norm": 0.26752883195877075, "learning_rate": 0.0003, "loss": 5.198, "step": 4575 }, { "epoch": 0.09378903610263552, "grad_norm": 0.33696436882019043, "learning_rate": 0.0003, "loss": 5.1321, "step": 4580 }, { "epoch": 0.09389142588004014, "grad_norm": 0.27908802032470703, "learning_rate": 0.0003, "loss": 5.1815, "step": 4585 }, { "epoch": 0.09399381565744476, "grad_norm": 0.2778615951538086, "learning_rate": 0.0003, "loss": 5.156, "step": 4590 }, { "epoch": 0.09409620543484938, "grad_norm": 0.27270758152008057, "learning_rate": 0.0003, "loss": 5.1562, "step": 4595 }, { "epoch": 0.094198595212254, "grad_norm": 0.2752048969268799, "learning_rate": 0.0003, "loss": 5.152, "step": 4600 }, { "epoch": 0.09430098498965864, "grad_norm": 0.2512056827545166, "learning_rate": 0.0003, "loss": 5.1777, "step": 4605 }, { "epoch": 0.09440337476706326, "grad_norm": 0.28345462679862976, "learning_rate": 0.0003, "loss": 5.1545, "step": 4610 }, { "epoch": 0.09450576454446788, "grad_norm": 0.26428791880607605, "learning_rate": 0.0003, "loss": 5.162, "step": 4615 }, { "epoch": 0.0946081543218725, "grad_norm": 0.26608046889305115, "learning_rate": 0.0003, "loss": 5.1649, "step": 4620 }, { "epoch": 0.09471054409927712, "grad_norm": 0.2517186403274536, "learning_rate": 0.0003, "loss": 5.1542, "step": 4625 }, { "epoch": 0.09481293387668176, "grad_norm": 0.27337828278541565, "learning_rate": 0.0003, "loss": 5.1467, "step": 4630 }, { "epoch": 0.09491532365408638, "grad_norm": 0.26084741950035095, "learning_rate": 0.0003, "loss": 5.201, "step": 4635 }, { "epoch": 0.095017713431491, "grad_norm": 0.26555636525154114, "learning_rate": 0.0003, "loss": 5.1691, "step": 4640 }, { "epoch": 0.09512010320889562, "grad_norm": 0.25768858194351196, "learning_rate": 0.0003, "loss": 5.1592, "step": 4645 }, { "epoch": 0.09522249298630024, "grad_norm": 0.2555982172489166, "learning_rate": 0.0003, "loss": 5.153, "step": 4650 }, { "epoch": 0.09532488276370488, "grad_norm": 0.26049119234085083, "learning_rate": 0.0003, "loss": 5.0963, "step": 4655 }, { "epoch": 0.0954272725411095, "grad_norm": 0.266277015209198, "learning_rate": 0.0003, "loss": 5.1567, "step": 4660 }, { "epoch": 0.09552966231851412, "grad_norm": 0.2679003179073334, "learning_rate": 0.0003, "loss": 5.1182, "step": 4665 }, { "epoch": 0.09563205209591874, "grad_norm": 0.23862691223621368, "learning_rate": 0.0003, "loss": 5.1466, "step": 4670 }, { "epoch": 0.09573444187332336, "grad_norm": 0.24389131367206573, "learning_rate": 0.0003, "loss": 5.1213, "step": 4675 }, { "epoch": 0.095836831650728, "grad_norm": 0.24601227045059204, "learning_rate": 0.0003, "loss": 5.1277, "step": 4680 }, { "epoch": 0.09593922142813262, "grad_norm": 0.2755354344844818, "learning_rate": 0.0003, "loss": 5.0533, "step": 4685 }, { "epoch": 0.09604161120553724, "grad_norm": 0.3208519518375397, "learning_rate": 0.0003, "loss": 5.0945, "step": 4690 }, { "epoch": 0.09614400098294186, "grad_norm": 0.23996461927890778, "learning_rate": 0.0003, "loss": 5.2057, "step": 4695 }, { "epoch": 0.09624639076034648, "grad_norm": 0.2781566083431244, "learning_rate": 0.0003, "loss": 5.1222, "step": 4700 }, { "epoch": 0.09634878053775112, "grad_norm": 0.243170365691185, "learning_rate": 0.0003, "loss": 5.1748, "step": 4705 }, { "epoch": 0.09645117031515574, "grad_norm": 0.2466488778591156, "learning_rate": 0.0003, "loss": 5.1225, "step": 4710 }, { "epoch": 0.09655356009256036, "grad_norm": 0.2508876621723175, "learning_rate": 0.0003, "loss": 5.1061, "step": 4715 }, { "epoch": 0.09665594986996498, "grad_norm": 0.27872124314308167, "learning_rate": 0.0003, "loss": 5.1281, "step": 4720 }, { "epoch": 0.0967583396473696, "grad_norm": 0.2469724863767624, "learning_rate": 0.0003, "loss": 5.1669, "step": 4725 }, { "epoch": 0.09686072942477424, "grad_norm": 0.2937515377998352, "learning_rate": 0.0003, "loss": 5.1424, "step": 4730 }, { "epoch": 0.09696311920217886, "grad_norm": 0.23722520470619202, "learning_rate": 0.0003, "loss": 5.1555, "step": 4735 }, { "epoch": 0.09706550897958348, "grad_norm": 0.24938583374023438, "learning_rate": 0.0003, "loss": 5.1483, "step": 4740 }, { "epoch": 0.0971678987569881, "grad_norm": 0.23783664405345917, "learning_rate": 0.0003, "loss": 5.1784, "step": 4745 }, { "epoch": 0.09727028853439272, "grad_norm": 0.24777854979038239, "learning_rate": 0.0003, "loss": 5.1539, "step": 4750 }, { "epoch": 0.09737267831179736, "grad_norm": 0.2501198947429657, "learning_rate": 0.0003, "loss": 5.1269, "step": 4755 }, { "epoch": 0.09747506808920198, "grad_norm": 0.2623465359210968, "learning_rate": 0.0003, "loss": 5.1248, "step": 4760 }, { "epoch": 0.0975774578666066, "grad_norm": 0.2732715904712677, "learning_rate": 0.0003, "loss": 5.1493, "step": 4765 }, { "epoch": 0.09767984764401122, "grad_norm": 0.22563378512859344, "learning_rate": 0.0003, "loss": 5.1442, "step": 4770 }, { "epoch": 0.09778223742141584, "grad_norm": 0.2790745198726654, "learning_rate": 0.0003, "loss": 5.1472, "step": 4775 }, { "epoch": 0.09788462719882048, "grad_norm": 0.2794435918331146, "learning_rate": 0.0003, "loss": 5.1209, "step": 4780 }, { "epoch": 0.0979870169762251, "grad_norm": 0.26980283856391907, "learning_rate": 0.0003, "loss": 5.1331, "step": 4785 }, { "epoch": 0.09808940675362972, "grad_norm": 0.3303547203540802, "learning_rate": 0.0003, "loss": 5.0662, "step": 4790 }, { "epoch": 0.09819179653103434, "grad_norm": 0.24784614145755768, "learning_rate": 0.0003, "loss": 5.1229, "step": 4795 }, { "epoch": 0.09829418630843896, "grad_norm": 0.25418931245803833, "learning_rate": 0.0003, "loss": 5.1011, "step": 4800 }, { "epoch": 0.0983965760858436, "grad_norm": 0.27420976758003235, "learning_rate": 0.0003, "loss": 5.1764, "step": 4805 }, { "epoch": 0.09849896586324822, "grad_norm": 0.2599352300167084, "learning_rate": 0.0003, "loss": 5.1369, "step": 4810 }, { "epoch": 0.09860135564065284, "grad_norm": 0.26636365056037903, "learning_rate": 0.0003, "loss": 5.1434, "step": 4815 }, { "epoch": 0.09870374541805746, "grad_norm": 0.2413569837808609, "learning_rate": 0.0003, "loss": 5.2023, "step": 4820 }, { "epoch": 0.09880613519546208, "grad_norm": 0.3400757312774658, "learning_rate": 0.0003, "loss": 5.1625, "step": 4825 }, { "epoch": 0.09890852497286672, "grad_norm": 0.26301613450050354, "learning_rate": 0.0003, "loss": 5.12, "step": 4830 }, { "epoch": 0.09901091475027134, "grad_norm": 0.2481679618358612, "learning_rate": 0.0003, "loss": 5.1281, "step": 4835 }, { "epoch": 0.09911330452767596, "grad_norm": 0.2625468075275421, "learning_rate": 0.0003, "loss": 5.1418, "step": 4840 }, { "epoch": 0.09921569430508058, "grad_norm": 0.2667504549026489, "learning_rate": 0.0003, "loss": 5.1343, "step": 4845 }, { "epoch": 0.0993180840824852, "grad_norm": 0.244068443775177, "learning_rate": 0.0003, "loss": 5.1611, "step": 4850 }, { "epoch": 0.09942047385988984, "grad_norm": 0.27124324440956116, "learning_rate": 0.0003, "loss": 5.1413, "step": 4855 }, { "epoch": 0.09952286363729446, "grad_norm": 0.2569420635700226, "learning_rate": 0.0003, "loss": 5.1242, "step": 4860 }, { "epoch": 0.09962525341469908, "grad_norm": 0.2730373442173004, "learning_rate": 0.0003, "loss": 5.1154, "step": 4865 }, { "epoch": 0.0997276431921037, "grad_norm": 0.2586059868335724, "learning_rate": 0.0003, "loss": 5.1066, "step": 4870 }, { "epoch": 0.09983003296950832, "grad_norm": 0.24679683148860931, "learning_rate": 0.0003, "loss": 5.1419, "step": 4875 }, { "epoch": 0.09993242274691296, "grad_norm": 0.24525699019432068, "learning_rate": 0.0003, "loss": 5.1382, "step": 4880 }, { "epoch": 0.10003481252431758, "grad_norm": 0.2788681089878082, "learning_rate": 0.0003, "loss": 5.1226, "step": 4885 }, { "epoch": 0.1001372023017222, "grad_norm": 0.2719569504261017, "learning_rate": 0.0003, "loss": 5.1617, "step": 4890 }, { "epoch": 0.10023959207912682, "grad_norm": 0.2520180642604828, "learning_rate": 0.0003, "loss": 5.201, "step": 4895 }, { "epoch": 0.10034198185653144, "grad_norm": 0.2797073423862457, "learning_rate": 0.0003, "loss": 5.1073, "step": 4900 }, { "epoch": 0.10044437163393606, "grad_norm": 0.23679588735103607, "learning_rate": 0.0003, "loss": 5.0595, "step": 4905 }, { "epoch": 0.1005467614113407, "grad_norm": 0.2508576512336731, "learning_rate": 0.0003, "loss": 5.1936, "step": 4910 }, { "epoch": 0.10064915118874532, "grad_norm": 0.24073272943496704, "learning_rate": 0.0003, "loss": 5.1966, "step": 4915 }, { "epoch": 0.10075154096614994, "grad_norm": 0.30414456129074097, "learning_rate": 0.0003, "loss": 5.1499, "step": 4920 }, { "epoch": 0.10085393074355456, "grad_norm": 0.26562175154685974, "learning_rate": 0.0003, "loss": 5.1271, "step": 4925 }, { "epoch": 0.10095632052095918, "grad_norm": 0.28138473629951477, "learning_rate": 0.0003, "loss": 5.1332, "step": 4930 }, { "epoch": 0.10105871029836382, "grad_norm": 0.2860601246356964, "learning_rate": 0.0003, "loss": 5.1772, "step": 4935 }, { "epoch": 0.10116110007576844, "grad_norm": 0.35466572642326355, "learning_rate": 0.0003, "loss": 5.1402, "step": 4940 }, { "epoch": 0.10126348985317306, "grad_norm": 0.3055334687232971, "learning_rate": 0.0003, "loss": 5.109, "step": 4945 }, { "epoch": 0.10136587963057768, "grad_norm": 0.2625313401222229, "learning_rate": 0.0003, "loss": 5.1465, "step": 4950 }, { "epoch": 0.1014682694079823, "grad_norm": 0.3070855438709259, "learning_rate": 0.0003, "loss": 5.1775, "step": 4955 }, { "epoch": 0.10157065918538694, "grad_norm": 0.252029687166214, "learning_rate": 0.0003, "loss": 5.1638, "step": 4960 }, { "epoch": 0.10167304896279156, "grad_norm": 0.2720160186290741, "learning_rate": 0.0003, "loss": 5.1241, "step": 4965 }, { "epoch": 0.10177543874019618, "grad_norm": 0.29157376289367676, "learning_rate": 0.0003, "loss": 5.1356, "step": 4970 }, { "epoch": 0.1018778285176008, "grad_norm": 0.2681252062320709, "learning_rate": 0.0003, "loss": 5.1856, "step": 4975 }, { "epoch": 0.10198021829500542, "grad_norm": 0.25204727053642273, "learning_rate": 0.0003, "loss": 5.094, "step": 4980 }, { "epoch": 0.10208260807241006, "grad_norm": 0.2691825032234192, "learning_rate": 0.0003, "loss": 5.105, "step": 4985 }, { "epoch": 0.10218499784981468, "grad_norm": 0.27777227759361267, "learning_rate": 0.0003, "loss": 5.1761, "step": 4990 }, { "epoch": 0.1022873876272193, "grad_norm": 0.28451526165008545, "learning_rate": 0.0003, "loss": 5.1084, "step": 4995 }, { "epoch": 0.10238977740462392, "grad_norm": 0.25006020069122314, "learning_rate": 0.0003, "loss": 5.1251, "step": 5000 }, { "epoch": 0.10238977740462392, "eval_loss": 5.143056869506836, "eval_runtime": 17.2601, "eval_samples_per_second": 33.719, "eval_steps_per_second": 4.229, "step": 5000 }, { "epoch": 0.10249216718202854, "grad_norm": 0.23722681403160095, "learning_rate": 0.0003, "loss": 5.1111, "step": 5005 }, { "epoch": 0.10259455695943318, "grad_norm": 0.25522348284721375, "learning_rate": 0.0003, "loss": 5.0909, "step": 5010 }, { "epoch": 0.1026969467368378, "grad_norm": 0.27011430263519287, "learning_rate": 0.0003, "loss": 5.1469, "step": 5015 }, { "epoch": 0.10279933651424242, "grad_norm": 0.28230318427085876, "learning_rate": 0.0003, "loss": 5.1279, "step": 5020 }, { "epoch": 0.10290172629164704, "grad_norm": 0.2487412542104721, "learning_rate": 0.0003, "loss": 5.1361, "step": 5025 }, { "epoch": 0.10300411606905166, "grad_norm": 0.24795986711978912, "learning_rate": 0.0003, "loss": 5.1209, "step": 5030 }, { "epoch": 0.1031065058464563, "grad_norm": 0.2390604168176651, "learning_rate": 0.0003, "loss": 5.1062, "step": 5035 }, { "epoch": 0.10320889562386092, "grad_norm": 0.28911134600639343, "learning_rate": 0.0003, "loss": 5.112, "step": 5040 }, { "epoch": 0.10331128540126554, "grad_norm": 0.27236247062683105, "learning_rate": 0.0003, "loss": 5.0818, "step": 5045 }, { "epoch": 0.10341367517867016, "grad_norm": 0.28922855854034424, "learning_rate": 0.0003, "loss": 5.1046, "step": 5050 }, { "epoch": 0.10351606495607478, "grad_norm": 0.31324613094329834, "learning_rate": 0.0003, "loss": 5.1404, "step": 5055 }, { "epoch": 0.10361845473347941, "grad_norm": 0.26908165216445923, "learning_rate": 0.0003, "loss": 5.0967, "step": 5060 }, { "epoch": 0.10372084451088404, "grad_norm": 0.2754386365413666, "learning_rate": 0.0003, "loss": 5.1459, "step": 5065 }, { "epoch": 0.10382323428828866, "grad_norm": 0.29012352228164673, "learning_rate": 0.0003, "loss": 5.1556, "step": 5070 }, { "epoch": 0.10392562406569328, "grad_norm": 0.31764522194862366, "learning_rate": 0.0003, "loss": 5.1194, "step": 5075 }, { "epoch": 0.1040280138430979, "grad_norm": 0.27008572220802307, "learning_rate": 0.0003, "loss": 5.1184, "step": 5080 }, { "epoch": 0.10413040362050253, "grad_norm": 0.25800609588623047, "learning_rate": 0.0003, "loss": 5.0866, "step": 5085 }, { "epoch": 0.10423279339790716, "grad_norm": 0.6253742575645447, "learning_rate": 0.0003, "loss": 5.1341, "step": 5090 }, { "epoch": 0.10433518317531178, "grad_norm": 0.26275634765625, "learning_rate": 0.0003, "loss": 5.1114, "step": 5095 }, { "epoch": 0.1044375729527164, "grad_norm": 0.25894594192504883, "learning_rate": 0.0003, "loss": 5.0714, "step": 5100 }, { "epoch": 0.10453996273012102, "grad_norm": 0.2545381784439087, "learning_rate": 0.0003, "loss": 5.1005, "step": 5105 }, { "epoch": 0.10464235250752565, "grad_norm": 0.2790254056453705, "learning_rate": 0.0003, "loss": 5.1025, "step": 5110 }, { "epoch": 0.10474474228493028, "grad_norm": 0.25671303272247314, "learning_rate": 0.0003, "loss": 5.1659, "step": 5115 }, { "epoch": 0.1048471320623349, "grad_norm": 0.2371091991662979, "learning_rate": 0.0003, "loss": 5.1035, "step": 5120 }, { "epoch": 0.10494952183973952, "grad_norm": 0.2600834369659424, "learning_rate": 0.0003, "loss": 5.0701, "step": 5125 }, { "epoch": 0.10505191161714414, "grad_norm": 0.3032551109790802, "learning_rate": 0.0003, "loss": 5.094, "step": 5130 }, { "epoch": 0.10515430139454877, "grad_norm": 0.24528567492961884, "learning_rate": 0.0003, "loss": 5.1415, "step": 5135 }, { "epoch": 0.1052566911719534, "grad_norm": 0.291016161441803, "learning_rate": 0.0003, "loss": 5.0553, "step": 5140 }, { "epoch": 0.10535908094935802, "grad_norm": 0.23490361869335175, "learning_rate": 0.0003, "loss": 5.1263, "step": 5145 }, { "epoch": 0.10546147072676264, "grad_norm": 0.42114129662513733, "learning_rate": 0.0003, "loss": 5.1286, "step": 5150 }, { "epoch": 0.10556386050416726, "grad_norm": 0.25405389070510864, "learning_rate": 0.0003, "loss": 5.1484, "step": 5155 }, { "epoch": 0.1056662502815719, "grad_norm": 0.29397106170654297, "learning_rate": 0.0003, "loss": 5.1388, "step": 5160 }, { "epoch": 0.10576864005897652, "grad_norm": 0.2647407054901123, "learning_rate": 0.0003, "loss": 5.1146, "step": 5165 }, { "epoch": 0.10587102983638114, "grad_norm": 0.2777150273323059, "learning_rate": 0.0003, "loss": 5.1521, "step": 5170 }, { "epoch": 0.10597341961378576, "grad_norm": 0.25496071577072144, "learning_rate": 0.0003, "loss": 5.1233, "step": 5175 }, { "epoch": 0.10607580939119038, "grad_norm": 0.2745848298072815, "learning_rate": 0.0003, "loss": 5.1353, "step": 5180 }, { "epoch": 0.10617819916859501, "grad_norm": 0.2660798132419586, "learning_rate": 0.0003, "loss": 5.133, "step": 5185 }, { "epoch": 0.10628058894599964, "grad_norm": 0.2540626525878906, "learning_rate": 0.0003, "loss": 5.0843, "step": 5190 }, { "epoch": 0.10638297872340426, "grad_norm": 0.24836969375610352, "learning_rate": 0.0003, "loss": 5.1156, "step": 5195 }, { "epoch": 0.10648536850080888, "grad_norm": 0.28216132521629333, "learning_rate": 0.0003, "loss": 5.1307, "step": 5200 }, { "epoch": 0.1065877582782135, "grad_norm": 0.23361997306346893, "learning_rate": 0.0003, "loss": 5.128, "step": 5205 }, { "epoch": 0.10669014805561813, "grad_norm": 0.2421647608280182, "learning_rate": 0.0003, "loss": 5.1002, "step": 5210 }, { "epoch": 0.10679253783302275, "grad_norm": 0.24135902523994446, "learning_rate": 0.0003, "loss": 5.0917, "step": 5215 }, { "epoch": 0.10689492761042738, "grad_norm": 0.26153501868247986, "learning_rate": 0.0003, "loss": 5.0929, "step": 5220 }, { "epoch": 0.106997317387832, "grad_norm": 0.2569580078125, "learning_rate": 0.0003, "loss": 5.1, "step": 5225 }, { "epoch": 0.10709970716523662, "grad_norm": 0.34273838996887207, "learning_rate": 0.0003, "loss": 5.1024, "step": 5230 }, { "epoch": 0.10720209694264125, "grad_norm": 0.25896260142326355, "learning_rate": 0.0003, "loss": 5.116, "step": 5235 }, { "epoch": 0.10730448672004587, "grad_norm": 0.25686803460121155, "learning_rate": 0.0003, "loss": 5.117, "step": 5240 }, { "epoch": 0.1074068764974505, "grad_norm": 0.2619083821773529, "learning_rate": 0.0003, "loss": 5.0904, "step": 5245 }, { "epoch": 0.10750926627485512, "grad_norm": 0.26761579513549805, "learning_rate": 0.0003, "loss": 5.1233, "step": 5250 }, { "epoch": 0.10761165605225974, "grad_norm": 0.2648669481277466, "learning_rate": 0.0003, "loss": 5.1176, "step": 5255 }, { "epoch": 0.10771404582966437, "grad_norm": 0.2606008052825928, "learning_rate": 0.0003, "loss": 5.1285, "step": 5260 }, { "epoch": 0.107816435607069, "grad_norm": 0.2694513499736786, "learning_rate": 0.0003, "loss": 5.0945, "step": 5265 }, { "epoch": 0.10791882538447362, "grad_norm": 0.267244815826416, "learning_rate": 0.0003, "loss": 5.1243, "step": 5270 }, { "epoch": 0.10802121516187824, "grad_norm": 0.24808046221733093, "learning_rate": 0.0003, "loss": 5.0959, "step": 5275 }, { "epoch": 0.10812360493928286, "grad_norm": 0.3454941213130951, "learning_rate": 0.0003, "loss": 5.083, "step": 5280 }, { "epoch": 0.10822599471668748, "grad_norm": 0.2596804201602936, "learning_rate": 0.0003, "loss": 5.1347, "step": 5285 }, { "epoch": 0.10832838449409211, "grad_norm": 0.2772788107395172, "learning_rate": 0.0003, "loss": 5.1316, "step": 5290 }, { "epoch": 0.10843077427149674, "grad_norm": 0.31930217146873474, "learning_rate": 0.0003, "loss": 5.0711, "step": 5295 }, { "epoch": 0.10853316404890136, "grad_norm": 0.2737719416618347, "learning_rate": 0.0003, "loss": 5.1095, "step": 5300 }, { "epoch": 0.10863555382630598, "grad_norm": 0.2516084313392639, "learning_rate": 0.0003, "loss": 5.0975, "step": 5305 }, { "epoch": 0.1087379436037106, "grad_norm": 0.24778488278388977, "learning_rate": 0.0003, "loss": 5.0538, "step": 5310 }, { "epoch": 0.10884033338111523, "grad_norm": 0.33298465609550476, "learning_rate": 0.0003, "loss": 5.0902, "step": 5315 }, { "epoch": 0.10894272315851986, "grad_norm": 0.30207422375679016, "learning_rate": 0.0003, "loss": 5.0882, "step": 5320 }, { "epoch": 0.10904511293592448, "grad_norm": 0.26138100028038025, "learning_rate": 0.0003, "loss": 5.0834, "step": 5325 }, { "epoch": 0.1091475027133291, "grad_norm": 0.24217452108860016, "learning_rate": 0.0003, "loss": 5.1091, "step": 5330 }, { "epoch": 0.10924989249073372, "grad_norm": 0.2715062201023102, "learning_rate": 0.0003, "loss": 5.0695, "step": 5335 }, { "epoch": 0.10935228226813835, "grad_norm": 0.24975372850894928, "learning_rate": 0.0003, "loss": 5.1061, "step": 5340 }, { "epoch": 0.10945467204554298, "grad_norm": 0.24967104196548462, "learning_rate": 0.0003, "loss": 5.1191, "step": 5345 }, { "epoch": 0.1095570618229476, "grad_norm": 0.23948703706264496, "learning_rate": 0.0003, "loss": 5.1065, "step": 5350 }, { "epoch": 0.10965945160035222, "grad_norm": 0.2402109056711197, "learning_rate": 0.0003, "loss": 5.0674, "step": 5355 }, { "epoch": 0.10976184137775684, "grad_norm": 0.252261221408844, "learning_rate": 0.0003, "loss": 5.0853, "step": 5360 }, { "epoch": 0.10986423115516147, "grad_norm": 0.2358742654323578, "learning_rate": 0.0003, "loss": 5.1215, "step": 5365 }, { "epoch": 0.1099666209325661, "grad_norm": 0.2711077332496643, "learning_rate": 0.0003, "loss": 5.1049, "step": 5370 }, { "epoch": 0.11006901070997072, "grad_norm": 0.2335502803325653, "learning_rate": 0.0003, "loss": 5.0961, "step": 5375 }, { "epoch": 0.11017140048737534, "grad_norm": 0.28193357586860657, "learning_rate": 0.0003, "loss": 4.9784, "step": 5380 }, { "epoch": 0.11027379026477996, "grad_norm": 0.27377572655677795, "learning_rate": 0.0003, "loss": 5.1219, "step": 5385 }, { "epoch": 0.1103761800421846, "grad_norm": 0.25776010751724243, "learning_rate": 0.0003, "loss": 5.1087, "step": 5390 }, { "epoch": 0.11047856981958921, "grad_norm": 0.23744498193264008, "learning_rate": 0.0003, "loss": 5.1089, "step": 5395 }, { "epoch": 0.11058095959699384, "grad_norm": 0.24810472130775452, "learning_rate": 0.0003, "loss": 5.1097, "step": 5400 }, { "epoch": 0.11068334937439846, "grad_norm": 0.2660318613052368, "learning_rate": 0.0003, "loss": 5.0666, "step": 5405 }, { "epoch": 0.11078573915180308, "grad_norm": 0.2684977650642395, "learning_rate": 0.0003, "loss": 5.1226, "step": 5410 }, { "epoch": 0.11088812892920771, "grad_norm": 0.23833255469799042, "learning_rate": 0.0003, "loss": 5.059, "step": 5415 }, { "epoch": 0.11099051870661233, "grad_norm": 0.22599267959594727, "learning_rate": 0.0003, "loss": 5.0964, "step": 5420 }, { "epoch": 0.11109290848401696, "grad_norm": 0.2854502499103546, "learning_rate": 0.0003, "loss": 5.0965, "step": 5425 }, { "epoch": 0.11119529826142158, "grad_norm": 0.25768956542015076, "learning_rate": 0.0003, "loss": 5.0795, "step": 5430 }, { "epoch": 0.1112976880388262, "grad_norm": 0.25362953543663025, "learning_rate": 0.0003, "loss": 5.1051, "step": 5435 }, { "epoch": 0.11140007781623083, "grad_norm": 0.25006183981895447, "learning_rate": 0.0003, "loss": 5.1367, "step": 5440 }, { "epoch": 0.11150246759363545, "grad_norm": 0.26781198382377625, "learning_rate": 0.0003, "loss": 5.1088, "step": 5445 }, { "epoch": 0.11160485737104008, "grad_norm": 0.24142861366271973, "learning_rate": 0.0003, "loss": 5.0972, "step": 5450 }, { "epoch": 0.1117072471484447, "grad_norm": 0.5353167057037354, "learning_rate": 0.0003, "loss": 5.0888, "step": 5455 }, { "epoch": 0.11180963692584932, "grad_norm": 0.2552117705345154, "learning_rate": 0.0003, "loss": 5.1282, "step": 5460 }, { "epoch": 0.11191202670325395, "grad_norm": 0.2614104747772217, "learning_rate": 0.0003, "loss": 5.0819, "step": 5465 }, { "epoch": 0.11201441648065857, "grad_norm": 0.2724562883377075, "learning_rate": 0.0003, "loss": 5.0925, "step": 5470 }, { "epoch": 0.1121168062580632, "grad_norm": 0.25823545455932617, "learning_rate": 0.0003, "loss": 5.0824, "step": 5475 }, { "epoch": 0.11221919603546782, "grad_norm": 0.2416844666004181, "learning_rate": 0.0003, "loss": 5.086, "step": 5480 }, { "epoch": 0.11232158581287244, "grad_norm": 0.24802665412425995, "learning_rate": 0.0003, "loss": 5.116, "step": 5485 }, { "epoch": 0.11242397559027707, "grad_norm": 0.26273369789123535, "learning_rate": 0.0003, "loss": 5.0873, "step": 5490 }, { "epoch": 0.1125263653676817, "grad_norm": 0.25046268105506897, "learning_rate": 0.0003, "loss": 5.0717, "step": 5495 }, { "epoch": 0.11262875514508632, "grad_norm": 0.2463729828596115, "learning_rate": 0.0003, "loss": 5.0657, "step": 5500 }, { "epoch": 0.11273114492249094, "grad_norm": 0.2476728856563568, "learning_rate": 0.0003, "loss": 5.1009, "step": 5505 }, { "epoch": 0.11283353469989556, "grad_norm": 0.23306916654109955, "learning_rate": 0.0003, "loss": 5.1019, "step": 5510 }, { "epoch": 0.11293592447730019, "grad_norm": 0.2420760691165924, "learning_rate": 0.0003, "loss": 5.1188, "step": 5515 }, { "epoch": 0.11303831425470481, "grad_norm": 0.2833799421787262, "learning_rate": 0.0003, "loss": 5.1152, "step": 5520 }, { "epoch": 0.11314070403210943, "grad_norm": 0.27480828762054443, "learning_rate": 0.0003, "loss": 5.0933, "step": 5525 }, { "epoch": 0.11324309380951406, "grad_norm": 0.32995694875717163, "learning_rate": 0.0003, "loss": 5.0861, "step": 5530 }, { "epoch": 0.11334548358691868, "grad_norm": 0.2790719270706177, "learning_rate": 0.0003, "loss": 5.0864, "step": 5535 }, { "epoch": 0.11344787336432331, "grad_norm": 0.260960191488266, "learning_rate": 0.0003, "loss": 5.0609, "step": 5540 }, { "epoch": 0.11355026314172793, "grad_norm": 0.24379654228687286, "learning_rate": 0.0003, "loss": 5.1043, "step": 5545 }, { "epoch": 0.11365265291913255, "grad_norm": 0.5207554697990417, "learning_rate": 0.0003, "loss": 5.1387, "step": 5550 }, { "epoch": 0.11375504269653718, "grad_norm": 0.24510209262371063, "learning_rate": 0.0003, "loss": 5.0969, "step": 5555 }, { "epoch": 0.1138574324739418, "grad_norm": 0.2376794070005417, "learning_rate": 0.0003, "loss": 5.1169, "step": 5560 }, { "epoch": 0.11395982225134643, "grad_norm": 0.27716517448425293, "learning_rate": 0.0003, "loss": 5.1212, "step": 5565 }, { "epoch": 0.11406221202875105, "grad_norm": 0.2802460193634033, "learning_rate": 0.0003, "loss": 5.0595, "step": 5570 }, { "epoch": 0.11416460180615567, "grad_norm": 0.2466181516647339, "learning_rate": 0.0003, "loss": 5.1075, "step": 5575 }, { "epoch": 0.1142669915835603, "grad_norm": 0.23184147477149963, "learning_rate": 0.0003, "loss": 5.0972, "step": 5580 }, { "epoch": 0.11436938136096492, "grad_norm": 0.25597503781318665, "learning_rate": 0.0003, "loss": 5.0806, "step": 5585 }, { "epoch": 0.11447177113836955, "grad_norm": 0.2580608129501343, "learning_rate": 0.0003, "loss": 5.0831, "step": 5590 }, { "epoch": 0.11457416091577417, "grad_norm": 0.26163652539253235, "learning_rate": 0.0003, "loss": 5.085, "step": 5595 }, { "epoch": 0.1146765506931788, "grad_norm": 0.25621700286865234, "learning_rate": 0.0003, "loss": 5.0736, "step": 5600 }, { "epoch": 0.11477894047058342, "grad_norm": 0.25294628739356995, "learning_rate": 0.0003, "loss": 5.0473, "step": 5605 }, { "epoch": 0.11488133024798804, "grad_norm": 0.23033612966537476, "learning_rate": 0.0003, "loss": 5.1719, "step": 5610 }, { "epoch": 0.11498372002539267, "grad_norm": 1.2823646068572998, "learning_rate": 0.0003, "loss": 5.1058, "step": 5615 }, { "epoch": 0.11508610980279729, "grad_norm": 0.2869229018688202, "learning_rate": 0.0003, "loss": 5.1134, "step": 5620 }, { "epoch": 0.11518849958020191, "grad_norm": 0.2845302224159241, "learning_rate": 0.0003, "loss": 5.099, "step": 5625 }, { "epoch": 0.11529088935760654, "grad_norm": 0.28526628017425537, "learning_rate": 0.0003, "loss": 5.0788, "step": 5630 }, { "epoch": 0.11539327913501116, "grad_norm": 0.2521724998950958, "learning_rate": 0.0003, "loss": 5.0675, "step": 5635 }, { "epoch": 0.11549566891241578, "grad_norm": 0.24957355856895447, "learning_rate": 0.0003, "loss": 5.1247, "step": 5640 }, { "epoch": 0.11559805868982041, "grad_norm": 0.3723267912864685, "learning_rate": 0.0003, "loss": 5.0761, "step": 5645 }, { "epoch": 0.11570044846722503, "grad_norm": 0.2532084286212921, "learning_rate": 0.0003, "loss": 5.0858, "step": 5650 }, { "epoch": 0.11580283824462966, "grad_norm": 0.23800311982631683, "learning_rate": 0.0003, "loss": 5.0817, "step": 5655 }, { "epoch": 0.11590522802203428, "grad_norm": 0.27352309226989746, "learning_rate": 0.0003, "loss": 5.045, "step": 5660 }, { "epoch": 0.1160076177994389, "grad_norm": 0.2616291046142578, "learning_rate": 0.0003, "loss": 5.0713, "step": 5665 }, { "epoch": 0.11611000757684353, "grad_norm": 0.24883075058460236, "learning_rate": 0.0003, "loss": 5.0622, "step": 5670 }, { "epoch": 0.11621239735424815, "grad_norm": 0.30054086446762085, "learning_rate": 0.0003, "loss": 5.0688, "step": 5675 }, { "epoch": 0.11631478713165277, "grad_norm": 0.2814779281616211, "learning_rate": 0.0003, "loss": 5.0949, "step": 5680 }, { "epoch": 0.1164171769090574, "grad_norm": 0.23773646354675293, "learning_rate": 0.0003, "loss": 5.0514, "step": 5685 }, { "epoch": 0.11651956668646202, "grad_norm": 0.2319362759590149, "learning_rate": 0.0003, "loss": 5.0607, "step": 5690 }, { "epoch": 0.11662195646386665, "grad_norm": 0.28966546058654785, "learning_rate": 0.0003, "loss": 5.0548, "step": 5695 }, { "epoch": 0.11672434624127127, "grad_norm": 0.26125651597976685, "learning_rate": 0.0003, "loss": 5.072, "step": 5700 }, { "epoch": 0.1168267360186759, "grad_norm": 0.2489207535982132, "learning_rate": 0.0003, "loss": 5.091, "step": 5705 }, { "epoch": 0.11692912579608052, "grad_norm": 0.2681567966938019, "learning_rate": 0.0003, "loss": 5.0954, "step": 5710 }, { "epoch": 0.11703151557348514, "grad_norm": 0.3284648060798645, "learning_rate": 0.0003, "loss": 5.0812, "step": 5715 }, { "epoch": 0.11713390535088977, "grad_norm": 0.803409993648529, "learning_rate": 0.0003, "loss": 5.0538, "step": 5720 }, { "epoch": 0.1172362951282944, "grad_norm": 0.26291313767433167, "learning_rate": 0.0003, "loss": 5.0987, "step": 5725 }, { "epoch": 0.11733868490569901, "grad_norm": 0.2675195634365082, "learning_rate": 0.0003, "loss": 5.0876, "step": 5730 }, { "epoch": 0.11744107468310364, "grad_norm": 0.267531156539917, "learning_rate": 0.0003, "loss": 5.0532, "step": 5735 }, { "epoch": 0.11754346446050826, "grad_norm": 0.23051545023918152, "learning_rate": 0.0003, "loss": 5.0877, "step": 5740 }, { "epoch": 0.11764585423791289, "grad_norm": 0.24739649891853333, "learning_rate": 0.0003, "loss": 5.0688, "step": 5745 }, { "epoch": 0.11774824401531751, "grad_norm": 0.23418428003787994, "learning_rate": 0.0003, "loss": 5.1165, "step": 5750 }, { "epoch": 0.11785063379272213, "grad_norm": 0.24748854339122772, "learning_rate": 0.0003, "loss": 5.0548, "step": 5755 }, { "epoch": 0.11795302357012676, "grad_norm": 0.24577189981937408, "learning_rate": 0.0003, "loss": 5.0686, "step": 5760 }, { "epoch": 0.11805541334753138, "grad_norm": 0.2512575089931488, "learning_rate": 0.0003, "loss": 5.0668, "step": 5765 }, { "epoch": 0.11815780312493601, "grad_norm": 0.2299077957868576, "learning_rate": 0.0003, "loss": 5.1238, "step": 5770 }, { "epoch": 0.11826019290234063, "grad_norm": 0.24888236820697784, "learning_rate": 0.0003, "loss": 5.0951, "step": 5775 }, { "epoch": 0.11836258267974525, "grad_norm": 0.24673014879226685, "learning_rate": 0.0003, "loss": 5.0171, "step": 5780 }, { "epoch": 0.11846497245714988, "grad_norm": 0.24857375025749207, "learning_rate": 0.0003, "loss": 5.0761, "step": 5785 }, { "epoch": 0.1185673622345545, "grad_norm": 0.28456902503967285, "learning_rate": 0.0003, "loss": 5.0392, "step": 5790 }, { "epoch": 0.11866975201195913, "grad_norm": 0.24286267161369324, "learning_rate": 0.0003, "loss": 5.0783, "step": 5795 }, { "epoch": 0.11877214178936375, "grad_norm": 0.30398255586624146, "learning_rate": 0.0003, "loss": 5.0754, "step": 5800 }, { "epoch": 0.11887453156676837, "grad_norm": 0.2428668886423111, "learning_rate": 0.0003, "loss": 5.073, "step": 5805 }, { "epoch": 0.118976921344173, "grad_norm": 0.2462380826473236, "learning_rate": 0.0003, "loss": 5.0761, "step": 5810 }, { "epoch": 0.11907931112157762, "grad_norm": 0.2458244115114212, "learning_rate": 0.0003, "loss": 5.094, "step": 5815 }, { "epoch": 0.11918170089898225, "grad_norm": 0.23394592106342316, "learning_rate": 0.0003, "loss": 5.0877, "step": 5820 }, { "epoch": 0.11928409067638687, "grad_norm": 2.4448344707489014, "learning_rate": 0.0003, "loss": 5.1191, "step": 5825 }, { "epoch": 0.1193864804537915, "grad_norm": 0.285965234041214, "learning_rate": 0.0003, "loss": 5.0547, "step": 5830 }, { "epoch": 0.11948887023119611, "grad_norm": 0.3072868585586548, "learning_rate": 0.0003, "loss": 5.0573, "step": 5835 }, { "epoch": 0.11959126000860074, "grad_norm": 0.23148755729198456, "learning_rate": 0.0003, "loss": 5.1279, "step": 5840 }, { "epoch": 0.11969364978600537, "grad_norm": 0.2666797339916229, "learning_rate": 0.0003, "loss": 5.0577, "step": 5845 }, { "epoch": 0.11979603956340999, "grad_norm": 0.2828826308250427, "learning_rate": 0.0003, "loss": 5.0758, "step": 5850 }, { "epoch": 0.11989842934081461, "grad_norm": 0.26379990577697754, "learning_rate": 0.0003, "loss": 5.0427, "step": 5855 }, { "epoch": 0.12000081911821923, "grad_norm": 0.2764631509780884, "learning_rate": 0.0003, "loss": 5.0696, "step": 5860 }, { "epoch": 0.12010320889562386, "grad_norm": 0.358840674161911, "learning_rate": 0.0003, "loss": 5.0659, "step": 5865 }, { "epoch": 0.12020559867302849, "grad_norm": 0.2416507452726364, "learning_rate": 0.0003, "loss": 5.1139, "step": 5870 }, { "epoch": 0.12030798845043311, "grad_norm": 0.268307089805603, "learning_rate": 0.0003, "loss": 5.083, "step": 5875 }, { "epoch": 0.12041037822783773, "grad_norm": 0.2705295979976654, "learning_rate": 0.0003, "loss": 5.0614, "step": 5880 }, { "epoch": 0.12051276800524235, "grad_norm": 0.2383246272802353, "learning_rate": 0.0003, "loss": 5.0237, "step": 5885 }, { "epoch": 0.12061515778264698, "grad_norm": 0.24550370872020721, "learning_rate": 0.0003, "loss": 5.043, "step": 5890 }, { "epoch": 0.12071754756005161, "grad_norm": 0.30784371495246887, "learning_rate": 0.0003, "loss": 5.0565, "step": 5895 }, { "epoch": 0.12081993733745623, "grad_norm": 0.3065190315246582, "learning_rate": 0.0003, "loss": 5.0672, "step": 5900 }, { "epoch": 0.12092232711486085, "grad_norm": 0.2554434537887573, "learning_rate": 0.0003, "loss": 5.0605, "step": 5905 }, { "epoch": 0.12102471689226547, "grad_norm": 0.3871344327926636, "learning_rate": 0.0003, "loss": 5.0641, "step": 5910 }, { "epoch": 0.1211271066696701, "grad_norm": 0.2830437123775482, "learning_rate": 0.0003, "loss": 5.0544, "step": 5915 }, { "epoch": 0.12122949644707473, "grad_norm": 0.24620437622070312, "learning_rate": 0.0003, "loss": 5.0572, "step": 5920 }, { "epoch": 0.12133188622447935, "grad_norm": 0.225809246301651, "learning_rate": 0.0003, "loss": 5.033, "step": 5925 }, { "epoch": 0.12143427600188397, "grad_norm": 0.4193023145198822, "learning_rate": 0.0003, "loss": 5.0471, "step": 5930 }, { "epoch": 0.1215366657792886, "grad_norm": 0.2928660809993744, "learning_rate": 0.0003, "loss": 5.0839, "step": 5935 }, { "epoch": 0.12163905555669322, "grad_norm": 0.2889542877674103, "learning_rate": 0.0003, "loss": 5.0712, "step": 5940 }, { "epoch": 0.12174144533409785, "grad_norm": 0.25429001450538635, "learning_rate": 0.0003, "loss": 5.0725, "step": 5945 }, { "epoch": 0.12184383511150247, "grad_norm": 0.2578778564929962, "learning_rate": 0.0003, "loss": 5.087, "step": 5950 }, { "epoch": 0.12194622488890709, "grad_norm": 0.2804707884788513, "learning_rate": 0.0003, "loss": 5.0361, "step": 5955 }, { "epoch": 0.12204861466631171, "grad_norm": 0.2534593939781189, "learning_rate": 0.0003, "loss": 5.0565, "step": 5960 }, { "epoch": 0.12215100444371634, "grad_norm": 0.26346203684806824, "learning_rate": 0.0003, "loss": 5.0886, "step": 5965 }, { "epoch": 0.12225339422112097, "grad_norm": 0.2457103580236435, "learning_rate": 0.0003, "loss": 5.0454, "step": 5970 }, { "epoch": 0.12235578399852559, "grad_norm": 0.28149500489234924, "learning_rate": 0.0003, "loss": 5.083, "step": 5975 }, { "epoch": 0.12245817377593021, "grad_norm": 0.24643804132938385, "learning_rate": 0.0003, "loss": 5.0571, "step": 5980 }, { "epoch": 0.12256056355333483, "grad_norm": 0.24789750576019287, "learning_rate": 0.0003, "loss": 5.0151, "step": 5985 }, { "epoch": 0.12266295333073945, "grad_norm": 0.24370989203453064, "learning_rate": 0.0003, "loss": 5.0348, "step": 5990 }, { "epoch": 0.12276534310814408, "grad_norm": 0.26576387882232666, "learning_rate": 0.0003, "loss": 5.0047, "step": 5995 }, { "epoch": 0.12286773288554871, "grad_norm": 0.28600040078163147, "learning_rate": 0.0003, "loss": 5.0614, "step": 6000 }, { "epoch": 0.12286773288554871, "eval_loss": 5.087298393249512, "eval_runtime": 26.3694, "eval_samples_per_second": 22.071, "eval_steps_per_second": 2.768, "step": 6000 }, { "epoch": 0.12297012266295333, "grad_norm": 0.2305298149585724, "learning_rate": 0.0003, "loss": 5.0592, "step": 6005 }, { "epoch": 0.12307251244035795, "grad_norm": 0.25346919894218445, "learning_rate": 0.0003, "loss": 5.0096, "step": 6010 }, { "epoch": 0.12317490221776257, "grad_norm": 0.26450151205062866, "learning_rate": 0.0003, "loss": 5.0459, "step": 6015 }, { "epoch": 0.1232772919951672, "grad_norm": 0.24275422096252441, "learning_rate": 0.0003, "loss": 5.0712, "step": 6020 }, { "epoch": 0.12337968177257183, "grad_norm": 0.25413811206817627, "learning_rate": 0.0003, "loss": 5.0723, "step": 6025 }, { "epoch": 0.12348207154997645, "grad_norm": 0.25593388080596924, "learning_rate": 0.0003, "loss": 5.0804, "step": 6030 }, { "epoch": 0.12358446132738107, "grad_norm": 0.2864149510860443, "learning_rate": 0.0003, "loss": 5.0175, "step": 6035 }, { "epoch": 0.1236868511047857, "grad_norm": 0.2595575451850891, "learning_rate": 0.0003, "loss": 5.0374, "step": 6040 }, { "epoch": 0.12378924088219032, "grad_norm": 0.23770998418331146, "learning_rate": 0.0003, "loss": 5.0571, "step": 6045 }, { "epoch": 0.12389163065959495, "grad_norm": 0.2449321448802948, "learning_rate": 0.0003, "loss": 5.0374, "step": 6050 }, { "epoch": 0.12399402043699957, "grad_norm": 0.26707953214645386, "learning_rate": 0.0003, "loss": 5.0781, "step": 6055 }, { "epoch": 0.1240964102144042, "grad_norm": 0.22687005996704102, "learning_rate": 0.0003, "loss": 5.0732, "step": 6060 }, { "epoch": 0.12419879999180881, "grad_norm": 0.22488383948802948, "learning_rate": 0.0003, "loss": 5.0528, "step": 6065 }, { "epoch": 0.12430118976921344, "grad_norm": 0.2513805031776428, "learning_rate": 0.0003, "loss": 5.0521, "step": 6070 }, { "epoch": 0.12440357954661807, "grad_norm": 0.23436325788497925, "learning_rate": 0.0003, "loss": 5.048, "step": 6075 }, { "epoch": 0.12450596932402269, "grad_norm": 0.24925151467323303, "learning_rate": 0.0003, "loss": 5.0982, "step": 6080 }, { "epoch": 0.12460835910142731, "grad_norm": 0.28738659620285034, "learning_rate": 0.0003, "loss": 5.043, "step": 6085 }, { "epoch": 0.12471074887883193, "grad_norm": 0.2321338951587677, "learning_rate": 0.0003, "loss": 5.0053, "step": 6090 }, { "epoch": 0.12481313865623656, "grad_norm": 0.24323894083499908, "learning_rate": 0.0003, "loss": 5.021, "step": 6095 }, { "epoch": 0.12491552843364119, "grad_norm": 0.23402823507785797, "learning_rate": 0.0003, "loss": 5.0816, "step": 6100 }, { "epoch": 0.1250179182110458, "grad_norm": 0.26620617508888245, "learning_rate": 0.0003, "loss": 5.0764, "step": 6105 }, { "epoch": 0.12512030798845045, "grad_norm": 0.4839159846305847, "learning_rate": 0.0003, "loss": 5.0616, "step": 6110 }, { "epoch": 0.12522269776585507, "grad_norm": 0.3115023970603943, "learning_rate": 0.0003, "loss": 5.0959, "step": 6115 }, { "epoch": 0.1253250875432597, "grad_norm": 0.24713709950447083, "learning_rate": 0.0003, "loss": 5.0652, "step": 6120 }, { "epoch": 0.1254274773206643, "grad_norm": 0.2884705066680908, "learning_rate": 0.0003, "loss": 5.0471, "step": 6125 }, { "epoch": 0.12552986709806893, "grad_norm": 0.29362165927886963, "learning_rate": 0.0003, "loss": 5.1331, "step": 6130 }, { "epoch": 0.12563225687547355, "grad_norm": 0.2612384557723999, "learning_rate": 0.0003, "loss": 5.0427, "step": 6135 }, { "epoch": 0.12573464665287817, "grad_norm": 0.27509012818336487, "learning_rate": 0.0003, "loss": 5.0732, "step": 6140 }, { "epoch": 0.1258370364302828, "grad_norm": 0.29035264253616333, "learning_rate": 0.0003, "loss": 5.0591, "step": 6145 }, { "epoch": 0.12593942620768742, "grad_norm": 0.27949240803718567, "learning_rate": 0.0003, "loss": 5.0833, "step": 6150 }, { "epoch": 0.12604181598509204, "grad_norm": 0.2851013243198395, "learning_rate": 0.0003, "loss": 5.0396, "step": 6155 }, { "epoch": 0.1261442057624967, "grad_norm": 0.2831968367099762, "learning_rate": 0.0003, "loss": 5.0329, "step": 6160 }, { "epoch": 0.1262465955399013, "grad_norm": 0.2665991187095642, "learning_rate": 0.0003, "loss": 5.1208, "step": 6165 }, { "epoch": 0.12634898531730593, "grad_norm": 0.24800831079483032, "learning_rate": 0.0003, "loss": 5.041, "step": 6170 }, { "epoch": 0.12645137509471055, "grad_norm": 0.23127220571041107, "learning_rate": 0.0003, "loss": 5.0198, "step": 6175 }, { "epoch": 0.12655376487211517, "grad_norm": 0.28454628586769104, "learning_rate": 0.0003, "loss": 5.0206, "step": 6180 }, { "epoch": 0.1266561546495198, "grad_norm": 0.237688809633255, "learning_rate": 0.0003, "loss": 5.0353, "step": 6185 }, { "epoch": 0.1267585444269244, "grad_norm": 0.2456773966550827, "learning_rate": 0.0003, "loss": 5.0516, "step": 6190 }, { "epoch": 0.12686093420432903, "grad_norm": 0.256892591714859, "learning_rate": 0.0003, "loss": 5.0707, "step": 6195 }, { "epoch": 0.12696332398173366, "grad_norm": 0.23865951597690582, "learning_rate": 0.0003, "loss": 5.0908, "step": 6200 }, { "epoch": 0.12706571375913828, "grad_norm": 0.2427162379026413, "learning_rate": 0.0003, "loss": 5.0418, "step": 6205 }, { "epoch": 0.1271681035365429, "grad_norm": 0.2523399591445923, "learning_rate": 0.0003, "loss": 5.0393, "step": 6210 }, { "epoch": 0.12727049331394755, "grad_norm": 0.2777412235736847, "learning_rate": 0.0003, "loss": 5.0497, "step": 6215 }, { "epoch": 0.12737288309135217, "grad_norm": 0.26289626955986023, "learning_rate": 0.0003, "loss": 5.0905, "step": 6220 }, { "epoch": 0.1274752728687568, "grad_norm": 0.2896578013896942, "learning_rate": 0.0003, "loss": 5.1153, "step": 6225 }, { "epoch": 0.1275776626461614, "grad_norm": 0.26890280842781067, "learning_rate": 0.0003, "loss": 4.965, "step": 6230 }, { "epoch": 0.12768005242356603, "grad_norm": 0.24488884210586548, "learning_rate": 0.0003, "loss": 5.0857, "step": 6235 }, { "epoch": 0.12778244220097065, "grad_norm": 0.3304186761379242, "learning_rate": 0.0003, "loss": 5.0644, "step": 6240 }, { "epoch": 0.12788483197837527, "grad_norm": 0.2628065347671509, "learning_rate": 0.0003, "loss": 5.0786, "step": 6245 }, { "epoch": 0.1279872217557799, "grad_norm": 1.1583470106124878, "learning_rate": 0.0003, "loss": 5.0756, "step": 6250 }, { "epoch": 0.12808961153318452, "grad_norm": 0.26013973355293274, "learning_rate": 0.0003, "loss": 5.074, "step": 6255 }, { "epoch": 0.12819200131058914, "grad_norm": 0.24703949689865112, "learning_rate": 0.0003, "loss": 5.037, "step": 6260 }, { "epoch": 0.1282943910879938, "grad_norm": 0.30810126662254333, "learning_rate": 0.0003, "loss": 5.028, "step": 6265 }, { "epoch": 0.1283967808653984, "grad_norm": 0.34699803590774536, "learning_rate": 0.0003, "loss": 5.0482, "step": 6270 }, { "epoch": 0.12849917064280303, "grad_norm": 0.2741762399673462, "learning_rate": 0.0003, "loss": 5.0463, "step": 6275 }, { "epoch": 0.12860156042020765, "grad_norm": 0.2544890344142914, "learning_rate": 0.0003, "loss": 5.0228, "step": 6280 }, { "epoch": 0.12870395019761227, "grad_norm": 0.23670463263988495, "learning_rate": 0.0003, "loss": 5.0601, "step": 6285 }, { "epoch": 0.1288063399750169, "grad_norm": 0.23813705146312714, "learning_rate": 0.0003, "loss": 5.029, "step": 6290 }, { "epoch": 0.1289087297524215, "grad_norm": 0.23131109774112701, "learning_rate": 0.0003, "loss": 5.0519, "step": 6295 }, { "epoch": 0.12901111952982613, "grad_norm": 0.24065515398979187, "learning_rate": 0.0003, "loss": 5.0494, "step": 6300 }, { "epoch": 0.12911350930723076, "grad_norm": 0.23854351043701172, "learning_rate": 0.0003, "loss": 5.0359, "step": 6305 }, { "epoch": 0.12921589908463538, "grad_norm": 0.2923884391784668, "learning_rate": 0.0003, "loss": 4.9889, "step": 6310 }, { "epoch": 0.12931828886204003, "grad_norm": 0.2597198486328125, "learning_rate": 0.0003, "loss": 5.0757, "step": 6315 }, { "epoch": 0.12942067863944465, "grad_norm": 0.3665262460708618, "learning_rate": 0.0003, "loss": 5.0151, "step": 6320 }, { "epoch": 0.12952306841684927, "grad_norm": 0.24946120381355286, "learning_rate": 0.0003, "loss": 5.0674, "step": 6325 }, { "epoch": 0.1296254581942539, "grad_norm": 0.25710663199424744, "learning_rate": 0.0003, "loss": 5.0506, "step": 6330 }, { "epoch": 0.1297278479716585, "grad_norm": 0.25336015224456787, "learning_rate": 0.0003, "loss": 5.0538, "step": 6335 }, { "epoch": 0.12983023774906313, "grad_norm": 0.26922979950904846, "learning_rate": 0.0003, "loss": 5.006, "step": 6340 }, { "epoch": 0.12993262752646775, "grad_norm": 0.24607473611831665, "learning_rate": 0.0003, "loss": 5.0584, "step": 6345 }, { "epoch": 0.13003501730387237, "grad_norm": 0.28263700008392334, "learning_rate": 0.0003, "loss": 5.0866, "step": 6350 }, { "epoch": 0.130137407081277, "grad_norm": 0.24423837661743164, "learning_rate": 0.0003, "loss": 5.0601, "step": 6355 }, { "epoch": 0.13023979685868162, "grad_norm": 0.24322588741779327, "learning_rate": 0.0003, "loss": 5.0181, "step": 6360 }, { "epoch": 0.13034218663608627, "grad_norm": 0.2543542981147766, "learning_rate": 0.0003, "loss": 5.0332, "step": 6365 }, { "epoch": 0.1304445764134909, "grad_norm": 0.25317901372909546, "learning_rate": 0.0003, "loss": 5.0258, "step": 6370 }, { "epoch": 0.1305469661908955, "grad_norm": 0.262848436832428, "learning_rate": 0.0003, "loss": 5.0362, "step": 6375 }, { "epoch": 0.13064935596830013, "grad_norm": 0.26350048184394836, "learning_rate": 0.0003, "loss": 4.9923, "step": 6380 }, { "epoch": 0.13075174574570475, "grad_norm": 0.23977550864219666, "learning_rate": 0.0003, "loss": 5.0658, "step": 6385 }, { "epoch": 0.13085413552310937, "grad_norm": 0.2451464980840683, "learning_rate": 0.0003, "loss": 5.0401, "step": 6390 }, { "epoch": 0.130956525300514, "grad_norm": 0.24401719868183136, "learning_rate": 0.0003, "loss": 5.0332, "step": 6395 }, { "epoch": 0.13105891507791861, "grad_norm": 0.25680965185165405, "learning_rate": 0.0003, "loss": 4.9413, "step": 6400 }, { "epoch": 0.13116130485532324, "grad_norm": 0.2555413544178009, "learning_rate": 0.0003, "loss": 5.023, "step": 6405 }, { "epoch": 0.13126369463272786, "grad_norm": 0.23671738803386688, "learning_rate": 0.0003, "loss": 5.041, "step": 6410 }, { "epoch": 0.1313660844101325, "grad_norm": 0.25377485156059265, "learning_rate": 0.0003, "loss": 5.0157, "step": 6415 }, { "epoch": 0.13146847418753713, "grad_norm": 0.28589197993278503, "learning_rate": 0.0003, "loss": 4.9992, "step": 6420 }, { "epoch": 0.13157086396494175, "grad_norm": 0.3326317369937897, "learning_rate": 0.0003, "loss": 5.0644, "step": 6425 }, { "epoch": 0.13167325374234637, "grad_norm": 0.2430926114320755, "learning_rate": 0.0003, "loss": 5.0062, "step": 6430 }, { "epoch": 0.131775643519751, "grad_norm": 0.24135854840278625, "learning_rate": 0.0003, "loss": 5.0695, "step": 6435 }, { "epoch": 0.1318780332971556, "grad_norm": 0.2510199546813965, "learning_rate": 0.0003, "loss": 5.0286, "step": 6440 }, { "epoch": 0.13198042307456023, "grad_norm": 0.25724342465400696, "learning_rate": 0.0003, "loss": 5.0512, "step": 6445 }, { "epoch": 0.13208281285196485, "grad_norm": 0.2570081055164337, "learning_rate": 0.0003, "loss": 5.0157, "step": 6450 }, { "epoch": 0.13218520262936947, "grad_norm": 0.3087227940559387, "learning_rate": 0.0003, "loss": 5.053, "step": 6455 }, { "epoch": 0.1322875924067741, "grad_norm": 0.2415827363729477, "learning_rate": 0.0003, "loss": 5.0362, "step": 6460 }, { "epoch": 0.13238998218417874, "grad_norm": 1.4355356693267822, "learning_rate": 0.0003, "loss": 5.0126, "step": 6465 }, { "epoch": 0.13249237196158337, "grad_norm": 0.2502768635749817, "learning_rate": 0.0003, "loss": 5.0455, "step": 6470 }, { "epoch": 0.132594761738988, "grad_norm": 0.2415204793214798, "learning_rate": 0.0003, "loss": 5.0338, "step": 6475 }, { "epoch": 0.1326971515163926, "grad_norm": 0.2749311327934265, "learning_rate": 0.0003, "loss": 4.9887, "step": 6480 }, { "epoch": 0.13279954129379723, "grad_norm": 0.2325315922498703, "learning_rate": 0.0003, "loss": 5.0488, "step": 6485 }, { "epoch": 0.13290193107120185, "grad_norm": 0.23839184641838074, "learning_rate": 0.0003, "loss": 5.0283, "step": 6490 }, { "epoch": 0.13300432084860647, "grad_norm": 0.27810049057006836, "learning_rate": 0.0003, "loss": 5.0375, "step": 6495 }, { "epoch": 0.1331067106260111, "grad_norm": 0.24558596312999725, "learning_rate": 0.0003, "loss": 5.0604, "step": 6500 }, { "epoch": 0.13320910040341571, "grad_norm": 0.2379176914691925, "learning_rate": 0.0003, "loss": 5.0146, "step": 6505 }, { "epoch": 0.13331149018082034, "grad_norm": 0.24951337277889252, "learning_rate": 0.0003, "loss": 5.0139, "step": 6510 }, { "epoch": 0.13341387995822498, "grad_norm": 0.2671154737472534, "learning_rate": 0.0003, "loss": 5.0479, "step": 6515 }, { "epoch": 0.1335162697356296, "grad_norm": 0.27417638897895813, "learning_rate": 0.0003, "loss": 5.0236, "step": 6520 }, { "epoch": 0.13361865951303423, "grad_norm": 0.2691974937915802, "learning_rate": 0.0003, "loss": 5.0675, "step": 6525 }, { "epoch": 0.13372104929043885, "grad_norm": 0.2938447892665863, "learning_rate": 0.0003, "loss": 5.0354, "step": 6530 }, { "epoch": 0.13382343906784347, "grad_norm": 0.260647714138031, "learning_rate": 0.0003, "loss": 5.1266, "step": 6535 }, { "epoch": 0.1339258288452481, "grad_norm": 0.2850337028503418, "learning_rate": 0.0003, "loss": 5.036, "step": 6540 }, { "epoch": 0.1340282186226527, "grad_norm": 0.256000280380249, "learning_rate": 0.0003, "loss": 5.044, "step": 6545 }, { "epoch": 0.13413060840005733, "grad_norm": 0.24956929683685303, "learning_rate": 0.0003, "loss": 5.0733, "step": 6550 }, { "epoch": 0.13423299817746195, "grad_norm": 0.24049870669841766, "learning_rate": 0.0003, "loss": 5.0482, "step": 6555 }, { "epoch": 0.13433538795486658, "grad_norm": 0.257454514503479, "learning_rate": 0.0003, "loss": 5.0521, "step": 6560 }, { "epoch": 0.1344377777322712, "grad_norm": 0.26156890392303467, "learning_rate": 0.0003, "loss": 5.0397, "step": 6565 }, { "epoch": 0.13454016750967585, "grad_norm": 0.2320571094751358, "learning_rate": 0.0003, "loss": 5.0159, "step": 6570 }, { "epoch": 0.13464255728708047, "grad_norm": 0.24003830552101135, "learning_rate": 0.0003, "loss": 5.003, "step": 6575 }, { "epoch": 0.1347449470644851, "grad_norm": 0.25751402974128723, "learning_rate": 0.0003, "loss": 5.0338, "step": 6580 }, { "epoch": 0.1348473368418897, "grad_norm": 0.2699262499809265, "learning_rate": 0.0003, "loss": 5.0267, "step": 6585 }, { "epoch": 0.13494972661929433, "grad_norm": 0.2557588517665863, "learning_rate": 0.0003, "loss": 5.0298, "step": 6590 }, { "epoch": 0.13505211639669895, "grad_norm": 0.24363237619400024, "learning_rate": 0.0003, "loss": 5.0677, "step": 6595 }, { "epoch": 0.13515450617410357, "grad_norm": 0.2713419497013092, "learning_rate": 0.0003, "loss": 5.0305, "step": 6600 }, { "epoch": 0.1352568959515082, "grad_norm": 0.24032361805438995, "learning_rate": 0.0003, "loss": 5.0273, "step": 6605 }, { "epoch": 0.13535928572891281, "grad_norm": 0.2483200877904892, "learning_rate": 0.0003, "loss": 4.9871, "step": 6610 }, { "epoch": 0.13546167550631744, "grad_norm": 0.260397732257843, "learning_rate": 0.0003, "loss": 5.031, "step": 6615 }, { "epoch": 0.13556406528372208, "grad_norm": 0.26650041341781616, "learning_rate": 0.0003, "loss": 5.0102, "step": 6620 }, { "epoch": 0.1356664550611267, "grad_norm": 0.23946025967597961, "learning_rate": 0.0003, "loss": 5.0234, "step": 6625 }, { "epoch": 0.13576884483853133, "grad_norm": 0.2972355782985687, "learning_rate": 0.0003, "loss": 5.0263, "step": 6630 }, { "epoch": 0.13587123461593595, "grad_norm": 0.2401089370250702, "learning_rate": 0.0003, "loss": 5.0458, "step": 6635 }, { "epoch": 0.13597362439334057, "grad_norm": 0.24230457842350006, "learning_rate": 0.0003, "loss": 4.9748, "step": 6640 }, { "epoch": 0.1360760141707452, "grad_norm": 0.24211932718753815, "learning_rate": 0.0003, "loss": 5.0433, "step": 6645 }, { "epoch": 0.1361784039481498, "grad_norm": 0.25131285190582275, "learning_rate": 0.0003, "loss": 5.0782, "step": 6650 }, { "epoch": 0.13628079372555443, "grad_norm": 0.28355124592781067, "learning_rate": 0.0003, "loss": 5.0281, "step": 6655 }, { "epoch": 0.13638318350295905, "grad_norm": 0.2314610481262207, "learning_rate": 0.0003, "loss": 5.0297, "step": 6660 }, { "epoch": 0.13648557328036368, "grad_norm": 0.26233312487602234, "learning_rate": 0.0003, "loss": 5.0942, "step": 6665 }, { "epoch": 0.13658796305776832, "grad_norm": 0.24610739946365356, "learning_rate": 0.0003, "loss": 5.0764, "step": 6670 }, { "epoch": 0.13669035283517295, "grad_norm": 0.28637972474098206, "learning_rate": 0.0003, "loss": 5.0386, "step": 6675 }, { "epoch": 0.13679274261257757, "grad_norm": 0.2488769143819809, "learning_rate": 0.0003, "loss": 5.0485, "step": 6680 }, { "epoch": 0.1368951323899822, "grad_norm": 0.2477581650018692, "learning_rate": 0.0003, "loss": 5.038, "step": 6685 }, { "epoch": 0.1369975221673868, "grad_norm": 0.25378379225730896, "learning_rate": 0.0003, "loss": 5.0652, "step": 6690 }, { "epoch": 0.13709991194479143, "grad_norm": 0.25242990255355835, "learning_rate": 0.0003, "loss": 4.9996, "step": 6695 }, { "epoch": 0.13720230172219605, "grad_norm": 0.26495328545570374, "learning_rate": 0.0003, "loss": 5.0127, "step": 6700 }, { "epoch": 0.13730469149960067, "grad_norm": 0.24489976465702057, "learning_rate": 0.0003, "loss": 5.0007, "step": 6705 }, { "epoch": 0.1374070812770053, "grad_norm": 0.2487313151359558, "learning_rate": 0.0003, "loss": 5.0113, "step": 6710 }, { "epoch": 0.13750947105440992, "grad_norm": 0.23390714824199677, "learning_rate": 0.0003, "loss": 5.0327, "step": 6715 }, { "epoch": 0.13761186083181456, "grad_norm": 0.2524976134300232, "learning_rate": 0.0003, "loss": 4.9991, "step": 6720 }, { "epoch": 0.13771425060921919, "grad_norm": 0.25584086775779724, "learning_rate": 0.0003, "loss": 5.0311, "step": 6725 }, { "epoch": 0.1378166403866238, "grad_norm": 0.25065338611602783, "learning_rate": 0.0003, "loss": 5.0097, "step": 6730 }, { "epoch": 0.13791903016402843, "grad_norm": 0.22397209703922272, "learning_rate": 0.0003, "loss": 4.9872, "step": 6735 }, { "epoch": 0.13802141994143305, "grad_norm": 0.233913391828537, "learning_rate": 0.0003, "loss": 5.0608, "step": 6740 }, { "epoch": 0.13812380971883767, "grad_norm": 0.2484273463487625, "learning_rate": 0.0003, "loss": 4.9967, "step": 6745 }, { "epoch": 0.1382261994962423, "grad_norm": 0.2793211340904236, "learning_rate": 0.0003, "loss": 4.9444, "step": 6750 }, { "epoch": 0.1383285892736469, "grad_norm": 0.25882551074028015, "learning_rate": 0.0003, "loss": 5.0358, "step": 6755 }, { "epoch": 0.13843097905105153, "grad_norm": 0.2440735548734665, "learning_rate": 0.0003, "loss": 5.03, "step": 6760 }, { "epoch": 0.13853336882845615, "grad_norm": 0.25386354327201843, "learning_rate": 0.0003, "loss": 5.0468, "step": 6765 }, { "epoch": 0.1386357586058608, "grad_norm": 0.24036642909049988, "learning_rate": 0.0003, "loss": 5.0044, "step": 6770 }, { "epoch": 0.13873814838326542, "grad_norm": 0.255259245634079, "learning_rate": 0.0003, "loss": 5.0651, "step": 6775 }, { "epoch": 0.13884053816067005, "grad_norm": 0.3121148347854614, "learning_rate": 0.0003, "loss": 4.9908, "step": 6780 }, { "epoch": 0.13894292793807467, "grad_norm": 0.24176989495754242, "learning_rate": 0.0003, "loss": 4.9985, "step": 6785 }, { "epoch": 0.1390453177154793, "grad_norm": 0.23679430782794952, "learning_rate": 0.0003, "loss": 5.0802, "step": 6790 }, { "epoch": 0.1391477074928839, "grad_norm": 0.24225348234176636, "learning_rate": 0.0003, "loss": 5.0759, "step": 6795 }, { "epoch": 0.13925009727028853, "grad_norm": 0.23297856748104095, "learning_rate": 0.0003, "loss": 4.9883, "step": 6800 }, { "epoch": 0.13935248704769315, "grad_norm": 0.26487743854522705, "learning_rate": 0.0003, "loss": 5.0093, "step": 6805 }, { "epoch": 0.13945487682509777, "grad_norm": 0.2507880628108978, "learning_rate": 0.0003, "loss": 5.0326, "step": 6810 }, { "epoch": 0.1395572666025024, "grad_norm": 0.23700927197933197, "learning_rate": 0.0003, "loss": 5.0265, "step": 6815 }, { "epoch": 0.13965965637990704, "grad_norm": 0.24869215488433838, "learning_rate": 0.0003, "loss": 5.0231, "step": 6820 }, { "epoch": 0.13976204615731166, "grad_norm": 0.3493526577949524, "learning_rate": 0.0003, "loss": 5.0427, "step": 6825 }, { "epoch": 0.13986443593471629, "grad_norm": 0.24178656935691833, "learning_rate": 0.0003, "loss": 5.0023, "step": 6830 }, { "epoch": 0.1399668257121209, "grad_norm": 0.26609402894973755, "learning_rate": 0.0003, "loss": 4.9957, "step": 6835 }, { "epoch": 0.14006921548952553, "grad_norm": 0.24732144176959991, "learning_rate": 0.0003, "loss": 5.0224, "step": 6840 }, { "epoch": 0.14017160526693015, "grad_norm": 0.23194563388824463, "learning_rate": 0.0003, "loss": 4.9699, "step": 6845 }, { "epoch": 0.14027399504433477, "grad_norm": 0.23709094524383545, "learning_rate": 0.0003, "loss": 5.0184, "step": 6850 }, { "epoch": 0.1403763848217394, "grad_norm": 0.22520099580287933, "learning_rate": 0.0003, "loss": 5.0448, "step": 6855 }, { "epoch": 0.140478774599144, "grad_norm": 0.25094732642173767, "learning_rate": 0.0003, "loss": 5.0009, "step": 6860 }, { "epoch": 0.14058116437654863, "grad_norm": 0.25067663192749023, "learning_rate": 0.0003, "loss": 5.0565, "step": 6865 }, { "epoch": 0.14068355415395328, "grad_norm": 0.22902968525886536, "learning_rate": 0.0003, "loss": 4.9944, "step": 6870 }, { "epoch": 0.1407859439313579, "grad_norm": 0.294519305229187, "learning_rate": 0.0003, "loss": 5.0484, "step": 6875 }, { "epoch": 0.14088833370876253, "grad_norm": 0.2572459876537323, "learning_rate": 0.0003, "loss": 5.0189, "step": 6880 }, { "epoch": 0.14099072348616715, "grad_norm": 0.271657794713974, "learning_rate": 0.0003, "loss": 4.9645, "step": 6885 }, { "epoch": 0.14109311326357177, "grad_norm": 0.25202035903930664, "learning_rate": 0.0003, "loss": 5.0502, "step": 6890 }, { "epoch": 0.1411955030409764, "grad_norm": 0.2642892301082611, "learning_rate": 0.0003, "loss": 5.0016, "step": 6895 }, { "epoch": 0.141297892818381, "grad_norm": 0.26839500665664673, "learning_rate": 0.0003, "loss": 4.9992, "step": 6900 }, { "epoch": 0.14140028259578563, "grad_norm": 0.2610272467136383, "learning_rate": 0.0003, "loss": 5.0438, "step": 6905 }, { "epoch": 0.14150267237319025, "grad_norm": 0.22692175209522247, "learning_rate": 0.0003, "loss": 4.9622, "step": 6910 }, { "epoch": 0.14160506215059487, "grad_norm": 0.22383558750152588, "learning_rate": 0.0003, "loss": 5.0094, "step": 6915 }, { "epoch": 0.1417074519279995, "grad_norm": 0.24731644988059998, "learning_rate": 0.0003, "loss": 5.0331, "step": 6920 }, { "epoch": 0.14180984170540414, "grad_norm": 0.2571280896663666, "learning_rate": 0.0003, "loss": 5.0036, "step": 6925 }, { "epoch": 0.14191223148280876, "grad_norm": 0.23507055640220642, "learning_rate": 0.0003, "loss": 5.0071, "step": 6930 }, { "epoch": 0.1420146212602134, "grad_norm": 0.41560256481170654, "learning_rate": 0.0003, "loss": 4.9902, "step": 6935 }, { "epoch": 0.142117011037618, "grad_norm": 0.2307703197002411, "learning_rate": 0.0003, "loss": 4.9913, "step": 6940 }, { "epoch": 0.14221940081502263, "grad_norm": 0.2455960363149643, "learning_rate": 0.0003, "loss": 5.0613, "step": 6945 }, { "epoch": 0.14232179059242725, "grad_norm": 0.2564148008823395, "learning_rate": 0.0003, "loss": 5.0081, "step": 6950 }, { "epoch": 0.14242418036983187, "grad_norm": 0.3252364993095398, "learning_rate": 0.0003, "loss": 5.0755, "step": 6955 }, { "epoch": 0.1425265701472365, "grad_norm": 0.2593346834182739, "learning_rate": 0.0003, "loss": 5.0672, "step": 6960 }, { "epoch": 0.1426289599246411, "grad_norm": 0.27190688252449036, "learning_rate": 0.0003, "loss": 4.9919, "step": 6965 }, { "epoch": 0.14273134970204573, "grad_norm": 0.3587278425693512, "learning_rate": 0.0003, "loss": 5.0529, "step": 6970 }, { "epoch": 0.14283373947945038, "grad_norm": 0.2731194794178009, "learning_rate": 0.0003, "loss": 5.0135, "step": 6975 }, { "epoch": 0.142936129256855, "grad_norm": 0.35815128684043884, "learning_rate": 0.0003, "loss": 5.0411, "step": 6980 }, { "epoch": 0.14303851903425963, "grad_norm": 0.2469884753227234, "learning_rate": 0.0003, "loss": 4.9931, "step": 6985 }, { "epoch": 0.14314090881166425, "grad_norm": 0.24399548768997192, "learning_rate": 0.0003, "loss": 4.9633, "step": 6990 }, { "epoch": 0.14324329858906887, "grad_norm": 0.2295876443386078, "learning_rate": 0.0003, "loss": 4.9609, "step": 6995 }, { "epoch": 0.1433456883664735, "grad_norm": 0.24459685385227203, "learning_rate": 0.0003, "loss": 5.0562, "step": 7000 }, { "epoch": 0.1433456883664735, "eval_loss": 5.037739276885986, "eval_runtime": 17.2414, "eval_samples_per_second": 33.756, "eval_steps_per_second": 4.234, "step": 7000 }, { "epoch": 0.1434480781438781, "grad_norm": 0.24191655218601227, "learning_rate": 0.0003, "loss": 5.012, "step": 7005 }, { "epoch": 0.14355046792128273, "grad_norm": 0.3429293930530548, "learning_rate": 0.0003, "loss": 4.9455, "step": 7010 }, { "epoch": 0.14365285769868735, "grad_norm": 0.2853206396102905, "learning_rate": 0.0003, "loss": 5.0329, "step": 7015 }, { "epoch": 0.14375524747609197, "grad_norm": 0.23877167701721191, "learning_rate": 0.0003, "loss": 4.9954, "step": 7020 }, { "epoch": 0.14385763725349662, "grad_norm": 0.2498999685049057, "learning_rate": 0.0003, "loss": 5.0355, "step": 7025 }, { "epoch": 0.14396002703090124, "grad_norm": 0.23083969950675964, "learning_rate": 0.0003, "loss": 5.0247, "step": 7030 }, { "epoch": 0.14406241680830587, "grad_norm": 0.267577201128006, "learning_rate": 0.0003, "loss": 4.9991, "step": 7035 }, { "epoch": 0.1441648065857105, "grad_norm": 0.2475428432226181, "learning_rate": 0.0003, "loss": 5.0372, "step": 7040 }, { "epoch": 0.1442671963631151, "grad_norm": 0.2713354527950287, "learning_rate": 0.0003, "loss": 4.9865, "step": 7045 }, { "epoch": 0.14436958614051973, "grad_norm": 0.30416226387023926, "learning_rate": 0.0003, "loss": 4.9769, "step": 7050 }, { "epoch": 0.14447197591792435, "grad_norm": 0.26562967896461487, "learning_rate": 0.0003, "loss": 5.0089, "step": 7055 }, { "epoch": 0.14457436569532897, "grad_norm": 0.230251744389534, "learning_rate": 0.0003, "loss": 5.0115, "step": 7060 }, { "epoch": 0.1446767554727336, "grad_norm": 0.30862435698509216, "learning_rate": 0.0003, "loss": 5.029, "step": 7065 }, { "epoch": 0.1447791452501382, "grad_norm": 0.2607644498348236, "learning_rate": 0.0003, "loss": 5.0915, "step": 7070 }, { "epoch": 0.14488153502754286, "grad_norm": 0.24924525618553162, "learning_rate": 0.0003, "loss": 5.0379, "step": 7075 }, { "epoch": 0.14498392480494748, "grad_norm": 0.23883071541786194, "learning_rate": 0.0003, "loss": 5.0237, "step": 7080 }, { "epoch": 0.1450863145823521, "grad_norm": 0.23020990192890167, "learning_rate": 0.0003, "loss": 5.006, "step": 7085 }, { "epoch": 0.14518870435975673, "grad_norm": 0.25107336044311523, "learning_rate": 0.0003, "loss": 4.9507, "step": 7090 }, { "epoch": 0.14529109413716135, "grad_norm": 0.245454341173172, "learning_rate": 0.0003, "loss": 4.9903, "step": 7095 }, { "epoch": 0.14539348391456597, "grad_norm": 0.3312162160873413, "learning_rate": 0.0003, "loss": 5.0121, "step": 7100 }, { "epoch": 0.1454958736919706, "grad_norm": 0.2554459869861603, "learning_rate": 0.0003, "loss": 4.9821, "step": 7105 }, { "epoch": 0.1455982634693752, "grad_norm": 0.2653450667858124, "learning_rate": 0.0003, "loss": 5.0476, "step": 7110 }, { "epoch": 0.14570065324677983, "grad_norm": 0.2704363763332367, "learning_rate": 0.0003, "loss": 5.0029, "step": 7115 }, { "epoch": 0.14580304302418445, "grad_norm": 0.23047906160354614, "learning_rate": 0.0003, "loss": 5.013, "step": 7120 }, { "epoch": 0.1459054328015891, "grad_norm": 0.23883530497550964, "learning_rate": 0.0003, "loss": 4.9518, "step": 7125 }, { "epoch": 0.14600782257899372, "grad_norm": 0.28106656670570374, "learning_rate": 0.0003, "loss": 4.9999, "step": 7130 }, { "epoch": 0.14611021235639834, "grad_norm": 0.25772610306739807, "learning_rate": 0.0003, "loss": 5.0161, "step": 7135 }, { "epoch": 0.14621260213380297, "grad_norm": 0.22792254388332367, "learning_rate": 0.0003, "loss": 5.0558, "step": 7140 }, { "epoch": 0.1463149919112076, "grad_norm": 0.24308115243911743, "learning_rate": 0.0003, "loss": 5.0187, "step": 7145 }, { "epoch": 0.1464173816886122, "grad_norm": 0.23834307491779327, "learning_rate": 0.0003, "loss": 4.9911, "step": 7150 }, { "epoch": 0.14651977146601683, "grad_norm": 0.2690189778804779, "learning_rate": 0.0003, "loss": 4.9441, "step": 7155 }, { "epoch": 0.14662216124342145, "grad_norm": 0.22987601161003113, "learning_rate": 0.0003, "loss": 5.0239, "step": 7160 }, { "epoch": 0.14672455102082607, "grad_norm": 0.22981633245944977, "learning_rate": 0.0003, "loss": 5.033, "step": 7165 }, { "epoch": 0.1468269407982307, "grad_norm": 0.2626011371612549, "learning_rate": 0.0003, "loss": 5.0435, "step": 7170 }, { "epoch": 0.14692933057563534, "grad_norm": 0.26072123646736145, "learning_rate": 0.0003, "loss": 5.0708, "step": 7175 }, { "epoch": 0.14703172035303996, "grad_norm": 0.2979556918144226, "learning_rate": 0.0003, "loss": 5.0462, "step": 7180 }, { "epoch": 0.14713411013044458, "grad_norm": 0.2395155280828476, "learning_rate": 0.0003, "loss": 4.9404, "step": 7185 }, { "epoch": 0.1472364999078492, "grad_norm": 0.2690223455429077, "learning_rate": 0.0003, "loss": 5.0182, "step": 7190 }, { "epoch": 0.14733888968525383, "grad_norm": 0.2435748279094696, "learning_rate": 0.0003, "loss": 4.9982, "step": 7195 }, { "epoch": 0.14744127946265845, "grad_norm": 0.25203970074653625, "learning_rate": 0.0003, "loss": 4.9533, "step": 7200 }, { "epoch": 0.14754366924006307, "grad_norm": 0.2593334913253784, "learning_rate": 0.0003, "loss": 5.0437, "step": 7205 }, { "epoch": 0.1476460590174677, "grad_norm": 0.22257547080516815, "learning_rate": 0.0003, "loss": 5.0518, "step": 7210 }, { "epoch": 0.1477484487948723, "grad_norm": 0.24585096538066864, "learning_rate": 0.0003, "loss": 4.9979, "step": 7215 }, { "epoch": 0.14785083857227693, "grad_norm": 0.26046517491340637, "learning_rate": 0.0003, "loss": 5.0929, "step": 7220 }, { "epoch": 0.14795322834968158, "grad_norm": 0.26173135638237, "learning_rate": 0.0003, "loss": 5.0058, "step": 7225 }, { "epoch": 0.1480556181270862, "grad_norm": 0.2430068701505661, "learning_rate": 0.0003, "loss": 5.0021, "step": 7230 }, { "epoch": 0.14815800790449082, "grad_norm": 0.23499150574207306, "learning_rate": 0.0003, "loss": 4.9641, "step": 7235 }, { "epoch": 0.14826039768189544, "grad_norm": 0.24758745729923248, "learning_rate": 0.0003, "loss": 5.0309, "step": 7240 }, { "epoch": 0.14836278745930007, "grad_norm": 0.24381928145885468, "learning_rate": 0.0003, "loss": 5.0278, "step": 7245 }, { "epoch": 0.1484651772367047, "grad_norm": 0.27376264333724976, "learning_rate": 0.0003, "loss": 4.9957, "step": 7250 }, { "epoch": 0.1485675670141093, "grad_norm": 0.26902592182159424, "learning_rate": 0.0003, "loss": 5.0226, "step": 7255 }, { "epoch": 0.14866995679151393, "grad_norm": 0.2667185366153717, "learning_rate": 0.0003, "loss": 5.0353, "step": 7260 }, { "epoch": 0.14877234656891855, "grad_norm": 0.25455304980278015, "learning_rate": 0.0003, "loss": 5.0368, "step": 7265 }, { "epoch": 0.14887473634632317, "grad_norm": 0.26712462306022644, "learning_rate": 0.0003, "loss": 4.9974, "step": 7270 }, { "epoch": 0.1489771261237278, "grad_norm": 0.2454889863729477, "learning_rate": 0.0003, "loss": 5.0114, "step": 7275 }, { "epoch": 0.14907951590113244, "grad_norm": 0.22794292867183685, "learning_rate": 0.0003, "loss": 4.996, "step": 7280 }, { "epoch": 0.14918190567853706, "grad_norm": 0.26641157269477844, "learning_rate": 0.0003, "loss": 5.0429, "step": 7285 }, { "epoch": 0.14928429545594168, "grad_norm": 0.24585548043251038, "learning_rate": 0.0003, "loss": 5.0082, "step": 7290 }, { "epoch": 0.1493866852333463, "grad_norm": 0.2460235208272934, "learning_rate": 0.0003, "loss": 5.013, "step": 7295 }, { "epoch": 0.14948907501075093, "grad_norm": 0.24668200314044952, "learning_rate": 0.0003, "loss": 5.0051, "step": 7300 }, { "epoch": 0.14959146478815555, "grad_norm": 0.2402060478925705, "learning_rate": 0.0003, "loss": 4.9951, "step": 7305 }, { "epoch": 0.14969385456556017, "grad_norm": 0.27260613441467285, "learning_rate": 0.0003, "loss": 5.0488, "step": 7310 }, { "epoch": 0.1497962443429648, "grad_norm": 0.23753002285957336, "learning_rate": 0.0003, "loss": 4.9958, "step": 7315 }, { "epoch": 0.1498986341203694, "grad_norm": 0.24035422503948212, "learning_rate": 0.0003, "loss": 5.0024, "step": 7320 }, { "epoch": 0.15000102389777403, "grad_norm": 0.25172603130340576, "learning_rate": 0.0003, "loss": 4.9639, "step": 7325 }, { "epoch": 0.15010341367517868, "grad_norm": 0.2577626407146454, "learning_rate": 0.0003, "loss": 4.9987, "step": 7330 }, { "epoch": 0.1502058034525833, "grad_norm": 0.3757743239402771, "learning_rate": 0.0003, "loss": 4.9497, "step": 7335 }, { "epoch": 0.15030819322998792, "grad_norm": 0.26163744926452637, "learning_rate": 0.0003, "loss": 5.0107, "step": 7340 }, { "epoch": 0.15041058300739255, "grad_norm": 0.2646372318267822, "learning_rate": 0.0003, "loss": 4.9945, "step": 7345 }, { "epoch": 0.15051297278479717, "grad_norm": 0.2732544243335724, "learning_rate": 0.0003, "loss": 4.9914, "step": 7350 }, { "epoch": 0.1506153625622018, "grad_norm": 0.23641030490398407, "learning_rate": 0.0003, "loss": 4.9703, "step": 7355 }, { "epoch": 0.1507177523396064, "grad_norm": 0.24585741758346558, "learning_rate": 0.0003, "loss": 5.0141, "step": 7360 }, { "epoch": 0.15082014211701103, "grad_norm": 0.25379955768585205, "learning_rate": 0.0003, "loss": 5.056, "step": 7365 }, { "epoch": 0.15092253189441565, "grad_norm": 0.2581508755683899, "learning_rate": 0.0003, "loss": 5.0128, "step": 7370 }, { "epoch": 0.15102492167182027, "grad_norm": 0.24063491821289062, "learning_rate": 0.0003, "loss": 5.0242, "step": 7375 }, { "epoch": 0.15112731144922492, "grad_norm": 0.26179948449134827, "learning_rate": 0.0003, "loss": 5.0013, "step": 7380 }, { "epoch": 0.15122970122662954, "grad_norm": 0.3701592981815338, "learning_rate": 0.0003, "loss": 5.0073, "step": 7385 }, { "epoch": 0.15133209100403416, "grad_norm": 0.27745503187179565, "learning_rate": 0.0003, "loss": 5.0051, "step": 7390 }, { "epoch": 0.15143448078143878, "grad_norm": 0.2534632086753845, "learning_rate": 0.0003, "loss": 4.9536, "step": 7395 }, { "epoch": 0.1515368705588434, "grad_norm": 0.2302110195159912, "learning_rate": 0.0003, "loss": 5.0014, "step": 7400 }, { "epoch": 0.15163926033624803, "grad_norm": 0.2508939504623413, "learning_rate": 0.0003, "loss": 5.0082, "step": 7405 }, { "epoch": 0.15174165011365265, "grad_norm": 0.256247878074646, "learning_rate": 0.0003, "loss": 4.9864, "step": 7410 }, { "epoch": 0.15184403989105727, "grad_norm": 0.2760912775993347, "learning_rate": 0.0003, "loss": 4.9779, "step": 7415 }, { "epoch": 0.1519464296684619, "grad_norm": 0.25915077328681946, "learning_rate": 0.0003, "loss": 4.9908, "step": 7420 }, { "epoch": 0.1520488194458665, "grad_norm": 0.41883066296577454, "learning_rate": 0.0003, "loss": 5.0017, "step": 7425 }, { "epoch": 0.15215120922327116, "grad_norm": 0.32636791467666626, "learning_rate": 0.0003, "loss": 4.9843, "step": 7430 }, { "epoch": 0.15225359900067578, "grad_norm": 0.24279625713825226, "learning_rate": 0.0003, "loss": 5.0171, "step": 7435 }, { "epoch": 0.1523559887780804, "grad_norm": 0.2643495202064514, "learning_rate": 0.0003, "loss": 5.0126, "step": 7440 }, { "epoch": 0.15245837855548502, "grad_norm": 0.23208200931549072, "learning_rate": 0.0003, "loss": 5.0198, "step": 7445 }, { "epoch": 0.15256076833288965, "grad_norm": 0.2417708933353424, "learning_rate": 0.0003, "loss": 4.9749, "step": 7450 }, { "epoch": 0.15266315811029427, "grad_norm": 0.25086939334869385, "learning_rate": 0.0003, "loss": 5.0015, "step": 7455 }, { "epoch": 0.1527655478876989, "grad_norm": 0.26524269580841064, "learning_rate": 0.0003, "loss": 4.9887, "step": 7460 }, { "epoch": 0.1528679376651035, "grad_norm": 0.45219752192497253, "learning_rate": 0.0003, "loss": 5.0129, "step": 7465 }, { "epoch": 0.15297032744250813, "grad_norm": 0.2360047549009323, "learning_rate": 0.0003, "loss": 5.0366, "step": 7470 }, { "epoch": 0.15307271721991275, "grad_norm": 0.23776350915431976, "learning_rate": 0.0003, "loss": 4.9592, "step": 7475 }, { "epoch": 0.1531751069973174, "grad_norm": 0.23889972269535065, "learning_rate": 0.0003, "loss": 5.0274, "step": 7480 }, { "epoch": 0.15327749677472202, "grad_norm": 0.2513630986213684, "learning_rate": 0.0003, "loss": 4.9837, "step": 7485 }, { "epoch": 0.15337988655212664, "grad_norm": 0.2382023185491562, "learning_rate": 0.0003, "loss": 5.0247, "step": 7490 }, { "epoch": 0.15348227632953126, "grad_norm": 0.23475874960422516, "learning_rate": 0.0003, "loss": 5.0181, "step": 7495 }, { "epoch": 0.15358466610693589, "grad_norm": 0.2395312786102295, "learning_rate": 0.0003, "loss": 5.0178, "step": 7500 }, { "epoch": 0.1536870558843405, "grad_norm": 0.2524147629737854, "learning_rate": 0.0003, "loss": 4.9548, "step": 7505 }, { "epoch": 0.15378944566174513, "grad_norm": 0.28112271428108215, "learning_rate": 0.0003, "loss": 5.0397, "step": 7510 }, { "epoch": 0.15389183543914975, "grad_norm": 0.23278845846652985, "learning_rate": 0.0003, "loss": 5.0145, "step": 7515 }, { "epoch": 0.15399422521655437, "grad_norm": 0.27111172676086426, "learning_rate": 0.0003, "loss": 4.9722, "step": 7520 }, { "epoch": 0.154096614993959, "grad_norm": 0.2546147406101227, "learning_rate": 0.0003, "loss": 4.9905, "step": 7525 }, { "epoch": 0.15419900477136364, "grad_norm": 0.25940370559692383, "learning_rate": 0.0003, "loss": 4.9608, "step": 7530 }, { "epoch": 0.15430139454876826, "grad_norm": 0.23555074632167816, "learning_rate": 0.0003, "loss": 4.9884, "step": 7535 }, { "epoch": 0.15440378432617288, "grad_norm": 0.24311836063861847, "learning_rate": 0.0003, "loss": 4.9787, "step": 7540 }, { "epoch": 0.1545061741035775, "grad_norm": 0.24595271050930023, "learning_rate": 0.0003, "loss": 5.0135, "step": 7545 }, { "epoch": 0.15460856388098212, "grad_norm": 0.2362990379333496, "learning_rate": 0.0003, "loss": 4.9453, "step": 7550 }, { "epoch": 0.15471095365838675, "grad_norm": 0.24405013024806976, "learning_rate": 0.0003, "loss": 5.0109, "step": 7555 }, { "epoch": 0.15481334343579137, "grad_norm": 0.2482171505689621, "learning_rate": 0.0003, "loss": 5.0172, "step": 7560 }, { "epoch": 0.154915733213196, "grad_norm": 0.245453879237175, "learning_rate": 0.0003, "loss": 5.003, "step": 7565 }, { "epoch": 0.1550181229906006, "grad_norm": 0.32339438796043396, "learning_rate": 0.0003, "loss": 4.9981, "step": 7570 }, { "epoch": 0.15512051276800523, "grad_norm": 0.2471870481967926, "learning_rate": 0.0003, "loss": 5.0272, "step": 7575 }, { "epoch": 0.15522290254540988, "grad_norm": 0.25140976905822754, "learning_rate": 0.0003, "loss": 4.9293, "step": 7580 }, { "epoch": 0.1553252923228145, "grad_norm": 0.2848415970802307, "learning_rate": 0.0003, "loss": 4.9867, "step": 7585 }, { "epoch": 0.15542768210021912, "grad_norm": 0.2239174246788025, "learning_rate": 0.0003, "loss": 4.9804, "step": 7590 }, { "epoch": 0.15553007187762374, "grad_norm": 0.2289915233850479, "learning_rate": 0.0003, "loss": 4.9677, "step": 7595 }, { "epoch": 0.15563246165502836, "grad_norm": 0.25757190585136414, "learning_rate": 0.0003, "loss": 4.9976, "step": 7600 }, { "epoch": 0.15573485143243299, "grad_norm": 0.2640559673309326, "learning_rate": 0.0003, "loss": 4.9737, "step": 7605 }, { "epoch": 0.1558372412098376, "grad_norm": 0.2947389483451843, "learning_rate": 0.0003, "loss": 4.9666, "step": 7610 }, { "epoch": 0.15593963098724223, "grad_norm": 0.2592456638813019, "learning_rate": 0.0003, "loss": 4.9817, "step": 7615 }, { "epoch": 0.15604202076464685, "grad_norm": 0.25153815746307373, "learning_rate": 0.0003, "loss": 4.9565, "step": 7620 }, { "epoch": 0.15614441054205147, "grad_norm": 0.24976173043251038, "learning_rate": 0.0003, "loss": 4.988, "step": 7625 }, { "epoch": 0.1562468003194561, "grad_norm": 0.2521374821662903, "learning_rate": 0.0003, "loss": 4.963, "step": 7630 }, { "epoch": 0.15634919009686074, "grad_norm": 0.24638235569000244, "learning_rate": 0.0003, "loss": 4.9822, "step": 7635 }, { "epoch": 0.15645157987426536, "grad_norm": 0.2727237045764923, "learning_rate": 0.0003, "loss": 5.0165, "step": 7640 }, { "epoch": 0.15655396965166998, "grad_norm": 0.24292902648448944, "learning_rate": 0.0003, "loss": 4.9786, "step": 7645 }, { "epoch": 0.1566563594290746, "grad_norm": 0.29140084981918335, "learning_rate": 0.0003, "loss": 4.9874, "step": 7650 }, { "epoch": 0.15675874920647923, "grad_norm": 0.23380549252033234, "learning_rate": 0.0003, "loss": 5.0012, "step": 7655 }, { "epoch": 0.15686113898388385, "grad_norm": 0.24549448490142822, "learning_rate": 0.0003, "loss": 5.0244, "step": 7660 }, { "epoch": 0.15696352876128847, "grad_norm": 0.24343416094779968, "learning_rate": 0.0003, "loss": 5.0487, "step": 7665 }, { "epoch": 0.1570659185386931, "grad_norm": 0.252719521522522, "learning_rate": 0.0003, "loss": 4.9932, "step": 7670 }, { "epoch": 0.1571683083160977, "grad_norm": 0.245554581284523, "learning_rate": 0.0003, "loss": 5.0099, "step": 7675 }, { "epoch": 0.15727069809350233, "grad_norm": 0.24932564795017242, "learning_rate": 0.0003, "loss": 5.0463, "step": 7680 }, { "epoch": 0.15737308787090698, "grad_norm": 0.29451170563697815, "learning_rate": 0.0003, "loss": 5.005, "step": 7685 }, { "epoch": 0.1574754776483116, "grad_norm": 0.23839223384857178, "learning_rate": 0.0003, "loss": 4.977, "step": 7690 }, { "epoch": 0.15757786742571622, "grad_norm": 0.24138836562633514, "learning_rate": 0.0003, "loss": 4.9934, "step": 7695 }, { "epoch": 0.15768025720312084, "grad_norm": 0.36683306097984314, "learning_rate": 0.0003, "loss": 5.024, "step": 7700 }, { "epoch": 0.15778264698052546, "grad_norm": 0.26034170389175415, "learning_rate": 0.0003, "loss": 5.0189, "step": 7705 }, { "epoch": 0.1578850367579301, "grad_norm": 0.23770155012607574, "learning_rate": 0.0003, "loss": 4.9818, "step": 7710 }, { "epoch": 0.1579874265353347, "grad_norm": 0.25007253885269165, "learning_rate": 0.0003, "loss": 5.0418, "step": 7715 }, { "epoch": 0.15808981631273933, "grad_norm": 0.25594475865364075, "learning_rate": 0.0003, "loss": 4.9985, "step": 7720 }, { "epoch": 0.15819220609014395, "grad_norm": 0.26413026452064514, "learning_rate": 0.0003, "loss": 5.019, "step": 7725 }, { "epoch": 0.15829459586754857, "grad_norm": 0.25864240527153015, "learning_rate": 0.0003, "loss": 4.9674, "step": 7730 }, { "epoch": 0.15839698564495322, "grad_norm": 0.24335207045078278, "learning_rate": 0.0003, "loss": 5.0241, "step": 7735 }, { "epoch": 0.15849937542235784, "grad_norm": 0.2253551185131073, "learning_rate": 0.0003, "loss": 5.0276, "step": 7740 }, { "epoch": 0.15860176519976246, "grad_norm": 0.2327730655670166, "learning_rate": 0.0003, "loss": 4.9949, "step": 7745 }, { "epoch": 0.15870415497716708, "grad_norm": 0.26976391673088074, "learning_rate": 0.0003, "loss": 5.0058, "step": 7750 }, { "epoch": 0.1588065447545717, "grad_norm": 0.24418939650058746, "learning_rate": 0.0003, "loss": 5.0076, "step": 7755 }, { "epoch": 0.15890893453197633, "grad_norm": 0.3299313485622406, "learning_rate": 0.0003, "loss": 4.8469, "step": 7760 }, { "epoch": 0.15901132430938095, "grad_norm": 0.2607487142086029, "learning_rate": 0.0003, "loss": 4.9981, "step": 7765 }, { "epoch": 0.15911371408678557, "grad_norm": 0.29214009642601013, "learning_rate": 0.0003, "loss": 4.9962, "step": 7770 }, { "epoch": 0.1592161038641902, "grad_norm": 0.2634567320346832, "learning_rate": 0.0003, "loss": 4.9739, "step": 7775 }, { "epoch": 0.1593184936415948, "grad_norm": 0.2418191134929657, "learning_rate": 0.0003, "loss": 5.0173, "step": 7780 }, { "epoch": 0.15942088341899946, "grad_norm": 0.24370892345905304, "learning_rate": 0.0003, "loss": 4.9085, "step": 7785 }, { "epoch": 0.15952327319640408, "grad_norm": 0.2694243788719177, "learning_rate": 0.0003, "loss": 4.9661, "step": 7790 }, { "epoch": 0.1596256629738087, "grad_norm": 0.309671014547348, "learning_rate": 0.0003, "loss": 4.9758, "step": 7795 }, { "epoch": 0.15972805275121332, "grad_norm": 0.28414806723594666, "learning_rate": 0.0003, "loss": 5.0043, "step": 7800 }, { "epoch": 0.15983044252861794, "grad_norm": 0.23653919994831085, "learning_rate": 0.0003, "loss": 4.9264, "step": 7805 }, { "epoch": 0.15993283230602257, "grad_norm": 0.5142612457275391, "learning_rate": 0.0003, "loss": 4.9463, "step": 7810 }, { "epoch": 0.1600352220834272, "grad_norm": 0.2367003709077835, "learning_rate": 0.0003, "loss": 5.0234, "step": 7815 }, { "epoch": 0.1601376118608318, "grad_norm": 0.2358447164297104, "learning_rate": 0.0003, "loss": 4.9898, "step": 7820 }, { "epoch": 0.16024000163823643, "grad_norm": 0.22502097487449646, "learning_rate": 0.0003, "loss": 4.973, "step": 7825 }, { "epoch": 0.16034239141564105, "grad_norm": 0.25816860795021057, "learning_rate": 0.0003, "loss": 4.9557, "step": 7830 }, { "epoch": 0.1604447811930457, "grad_norm": 0.2492964267730713, "learning_rate": 0.0003, "loss": 4.9745, "step": 7835 }, { "epoch": 0.16054717097045032, "grad_norm": 0.25804537534713745, "learning_rate": 0.0003, "loss": 4.9777, "step": 7840 }, { "epoch": 0.16064956074785494, "grad_norm": 0.3088281750679016, "learning_rate": 0.0003, "loss": 4.9348, "step": 7845 }, { "epoch": 0.16075195052525956, "grad_norm": 0.2266150712966919, "learning_rate": 0.0003, "loss": 4.981, "step": 7850 }, { "epoch": 0.16085434030266418, "grad_norm": 0.23817439377307892, "learning_rate": 0.0003, "loss": 4.9638, "step": 7855 }, { "epoch": 0.1609567300800688, "grad_norm": 0.2314368039369583, "learning_rate": 0.0003, "loss": 4.9474, "step": 7860 }, { "epoch": 0.16105911985747343, "grad_norm": 0.22157548367977142, "learning_rate": 0.0003, "loss": 4.9844, "step": 7865 }, { "epoch": 0.16116150963487805, "grad_norm": 0.2731040418148041, "learning_rate": 0.0003, "loss": 4.9799, "step": 7870 }, { "epoch": 0.16126389941228267, "grad_norm": 0.2672834098339081, "learning_rate": 0.0003, "loss": 4.9617, "step": 7875 }, { "epoch": 0.1613662891896873, "grad_norm": 0.24145518243312836, "learning_rate": 0.0003, "loss": 4.9869, "step": 7880 }, { "epoch": 0.16146867896709194, "grad_norm": 0.33095917105674744, "learning_rate": 0.0003, "loss": 4.9788, "step": 7885 }, { "epoch": 0.16157106874449656, "grad_norm": 0.23778679966926575, "learning_rate": 0.0003, "loss": 4.9911, "step": 7890 }, { "epoch": 0.16167345852190118, "grad_norm": 0.2597041130065918, "learning_rate": 0.0003, "loss": 4.9643, "step": 7895 }, { "epoch": 0.1617758482993058, "grad_norm": 0.32607340812683105, "learning_rate": 0.0003, "loss": 5.0795, "step": 7900 }, { "epoch": 0.16187823807671042, "grad_norm": 0.29756247997283936, "learning_rate": 0.0003, "loss": 5.0274, "step": 7905 }, { "epoch": 0.16198062785411504, "grad_norm": 0.2715311646461487, "learning_rate": 0.0003, "loss": 4.9335, "step": 7910 }, { "epoch": 0.16208301763151967, "grad_norm": 0.27370917797088623, "learning_rate": 0.0003, "loss": 4.9757, "step": 7915 }, { "epoch": 0.1621854074089243, "grad_norm": 0.2792314887046814, "learning_rate": 0.0003, "loss": 4.9968, "step": 7920 }, { "epoch": 0.1622877971863289, "grad_norm": 0.23655901849269867, "learning_rate": 0.0003, "loss": 5.0076, "step": 7925 }, { "epoch": 0.16239018696373353, "grad_norm": 0.315487802028656, "learning_rate": 0.0003, "loss": 4.9748, "step": 7930 }, { "epoch": 0.16249257674113818, "grad_norm": 0.2521093189716339, "learning_rate": 0.0003, "loss": 4.9814, "step": 7935 }, { "epoch": 0.1625949665185428, "grad_norm": 0.2644865810871124, "learning_rate": 0.0003, "loss": 4.9929, "step": 7940 }, { "epoch": 0.16269735629594742, "grad_norm": 0.2822543680667877, "learning_rate": 0.0003, "loss": 4.9796, "step": 7945 }, { "epoch": 0.16279974607335204, "grad_norm": 0.2337653636932373, "learning_rate": 0.0003, "loss": 4.9272, "step": 7950 }, { "epoch": 0.16290213585075666, "grad_norm": 0.2425181120634079, "learning_rate": 0.0003, "loss": 5.0105, "step": 7955 }, { "epoch": 0.16300452562816128, "grad_norm": 0.24469076097011566, "learning_rate": 0.0003, "loss": 4.9679, "step": 7960 }, { "epoch": 0.1631069154055659, "grad_norm": 0.2601161301136017, "learning_rate": 0.0003, "loss": 4.8934, "step": 7965 }, { "epoch": 0.16320930518297053, "grad_norm": 0.3003526031970978, "learning_rate": 0.0003, "loss": 4.9514, "step": 7970 }, { "epoch": 0.16331169496037515, "grad_norm": 0.2524234652519226, "learning_rate": 0.0003, "loss": 4.9787, "step": 7975 }, { "epoch": 0.16341408473777977, "grad_norm": 0.2413032203912735, "learning_rate": 0.0003, "loss": 5.0099, "step": 7980 }, { "epoch": 0.1635164745151844, "grad_norm": 0.23179535567760468, "learning_rate": 0.0003, "loss": 5.0039, "step": 7985 }, { "epoch": 0.16361886429258904, "grad_norm": 0.24087181687355042, "learning_rate": 0.0003, "loss": 4.9662, "step": 7990 }, { "epoch": 0.16372125406999366, "grad_norm": 0.23221777379512787, "learning_rate": 0.0003, "loss": 4.9369, "step": 7995 }, { "epoch": 0.16382364384739828, "grad_norm": 0.21853451430797577, "learning_rate": 0.0003, "loss": 4.8567, "step": 8000 }, { "epoch": 0.16382364384739828, "eval_loss": 5.000431060791016, "eval_runtime": 17.2744, "eval_samples_per_second": 33.692, "eval_steps_per_second": 4.226, "step": 8000 }, { "epoch": 0.1639260336248029, "grad_norm": 0.23773954808712006, "learning_rate": 0.0003, "loss": 4.9538, "step": 8005 }, { "epoch": 0.16402842340220752, "grad_norm": 0.21447262167930603, "learning_rate": 0.0003, "loss": 4.9967, "step": 8010 }, { "epoch": 0.16413081317961215, "grad_norm": 0.23721234500408173, "learning_rate": 0.0003, "loss": 4.9341, "step": 8015 }, { "epoch": 0.16423320295701677, "grad_norm": 0.24699951708316803, "learning_rate": 0.0003, "loss": 4.9569, "step": 8020 }, { "epoch": 0.1643355927344214, "grad_norm": 0.2360193431377411, "learning_rate": 0.0003, "loss": 4.9537, "step": 8025 }, { "epoch": 0.164437982511826, "grad_norm": 0.24413712322711945, "learning_rate": 0.0003, "loss": 5.0083, "step": 8030 }, { "epoch": 0.16454037228923063, "grad_norm": 0.2634340822696686, "learning_rate": 0.0003, "loss": 5.0273, "step": 8035 }, { "epoch": 0.16464276206663528, "grad_norm": 0.2583164870738983, "learning_rate": 0.0003, "loss": 4.9409, "step": 8040 }, { "epoch": 0.1647451518440399, "grad_norm": 0.23808503150939941, "learning_rate": 0.0003, "loss": 4.9729, "step": 8045 }, { "epoch": 0.16484754162144452, "grad_norm": 0.2662254869937897, "learning_rate": 0.0003, "loss": 4.9806, "step": 8050 }, { "epoch": 0.16494993139884914, "grad_norm": 0.25939038395881653, "learning_rate": 0.0003, "loss": 4.9699, "step": 8055 }, { "epoch": 0.16505232117625376, "grad_norm": 0.23665112257003784, "learning_rate": 0.0003, "loss": 4.9592, "step": 8060 }, { "epoch": 0.16515471095365838, "grad_norm": 0.2312067747116089, "learning_rate": 0.0003, "loss": 4.9986, "step": 8065 }, { "epoch": 0.165257100731063, "grad_norm": 0.22978922724723816, "learning_rate": 0.0003, "loss": 4.9237, "step": 8070 }, { "epoch": 0.16535949050846763, "grad_norm": 0.22837577760219574, "learning_rate": 0.0003, "loss": 4.9564, "step": 8075 }, { "epoch": 0.16546188028587225, "grad_norm": 0.2294594943523407, "learning_rate": 0.0003, "loss": 4.9235, "step": 8080 }, { "epoch": 0.16556427006327687, "grad_norm": 0.25017088651657104, "learning_rate": 0.0003, "loss": 4.9978, "step": 8085 }, { "epoch": 0.16566665984068152, "grad_norm": 0.23632113635540009, "learning_rate": 0.0003, "loss": 4.9601, "step": 8090 }, { "epoch": 0.16576904961808614, "grad_norm": 0.24469229578971863, "learning_rate": 0.0003, "loss": 4.9547, "step": 8095 }, { "epoch": 0.16587143939549076, "grad_norm": 0.23987941443920135, "learning_rate": 0.0003, "loss": 4.9769, "step": 8100 }, { "epoch": 0.16597382917289538, "grad_norm": 0.2555577754974365, "learning_rate": 0.0003, "loss": 4.99, "step": 8105 }, { "epoch": 0.1660762189503, "grad_norm": 0.22785189747810364, "learning_rate": 0.0003, "loss": 4.9571, "step": 8110 }, { "epoch": 0.16617860872770462, "grad_norm": 0.22487638890743256, "learning_rate": 0.0003, "loss": 4.9281, "step": 8115 }, { "epoch": 0.16628099850510925, "grad_norm": 0.2585775554180145, "learning_rate": 0.0003, "loss": 4.9635, "step": 8120 }, { "epoch": 0.16638338828251387, "grad_norm": 0.24215564131736755, "learning_rate": 0.0003, "loss": 4.9886, "step": 8125 }, { "epoch": 0.1664857780599185, "grad_norm": 0.2455897480249405, "learning_rate": 0.0003, "loss": 5.0244, "step": 8130 }, { "epoch": 0.1665881678373231, "grad_norm": 0.2518976330757141, "learning_rate": 0.0003, "loss": 4.9591, "step": 8135 }, { "epoch": 0.16669055761472776, "grad_norm": 0.2537906765937805, "learning_rate": 0.0003, "loss": 5.0466, "step": 8140 }, { "epoch": 0.16679294739213238, "grad_norm": 0.24115991592407227, "learning_rate": 0.0003, "loss": 4.9882, "step": 8145 }, { "epoch": 0.166895337169537, "grad_norm": 0.243105947971344, "learning_rate": 0.0003, "loss": 4.9919, "step": 8150 }, { "epoch": 0.16699772694694162, "grad_norm": 0.24019856750965118, "learning_rate": 0.0003, "loss": 5.0229, "step": 8155 }, { "epoch": 0.16710011672434624, "grad_norm": 0.238954558968544, "learning_rate": 0.0003, "loss": 4.9798, "step": 8160 }, { "epoch": 0.16720250650175086, "grad_norm": 0.2641277611255646, "learning_rate": 0.0003, "loss": 4.9897, "step": 8165 }, { "epoch": 0.16730489627915549, "grad_norm": 0.24453116953372955, "learning_rate": 0.0003, "loss": 4.9824, "step": 8170 }, { "epoch": 0.1674072860565601, "grad_norm": 0.34915784001350403, "learning_rate": 0.0003, "loss": 4.928, "step": 8175 }, { "epoch": 0.16750967583396473, "grad_norm": 0.22643156349658966, "learning_rate": 0.0003, "loss": 4.985, "step": 8180 }, { "epoch": 0.16761206561136935, "grad_norm": 0.23364658653736115, "learning_rate": 0.0003, "loss": 4.9712, "step": 8185 }, { "epoch": 0.167714455388774, "grad_norm": 0.26778602600097656, "learning_rate": 0.0003, "loss": 4.9234, "step": 8190 }, { "epoch": 0.16781684516617862, "grad_norm": 0.2409345656633377, "learning_rate": 0.0003, "loss": 4.9569, "step": 8195 }, { "epoch": 0.16791923494358324, "grad_norm": 0.26513752341270447, "learning_rate": 0.0003, "loss": 4.9827, "step": 8200 }, { "epoch": 0.16802162472098786, "grad_norm": 0.24178951978683472, "learning_rate": 0.0003, "loss": 4.988, "step": 8205 }, { "epoch": 0.16812401449839248, "grad_norm": 0.26739561557769775, "learning_rate": 0.0003, "loss": 4.965, "step": 8210 }, { "epoch": 0.1682264042757971, "grad_norm": 0.2487669438123703, "learning_rate": 0.0003, "loss": 4.9987, "step": 8215 }, { "epoch": 0.16832879405320172, "grad_norm": 0.24459333717823029, "learning_rate": 0.0003, "loss": 4.9298, "step": 8220 }, { "epoch": 0.16843118383060635, "grad_norm": 0.24657560884952545, "learning_rate": 0.0003, "loss": 4.9931, "step": 8225 }, { "epoch": 0.16853357360801097, "grad_norm": 0.2773890495300293, "learning_rate": 0.0003, "loss": 4.9824, "step": 8230 }, { "epoch": 0.1686359633854156, "grad_norm": 0.24502816796302795, "learning_rate": 0.0003, "loss": 4.94, "step": 8235 }, { "epoch": 0.16873835316282024, "grad_norm": 0.25656858086586, "learning_rate": 0.0003, "loss": 4.973, "step": 8240 }, { "epoch": 0.16884074294022486, "grad_norm": 0.2743070125579834, "learning_rate": 0.0003, "loss": 4.9453, "step": 8245 }, { "epoch": 0.16894313271762948, "grad_norm": 0.232648566365242, "learning_rate": 0.0003, "loss": 4.9835, "step": 8250 }, { "epoch": 0.1690455224950341, "grad_norm": 0.2495628297328949, "learning_rate": 0.0003, "loss": 4.9835, "step": 8255 }, { "epoch": 0.16914791227243872, "grad_norm": 0.24052630364894867, "learning_rate": 0.0003, "loss": 4.9369, "step": 8260 }, { "epoch": 0.16925030204984334, "grad_norm": 0.23074476420879364, "learning_rate": 0.0003, "loss": 4.9835, "step": 8265 }, { "epoch": 0.16935269182724796, "grad_norm": 0.21705974638462067, "learning_rate": 0.0003, "loss": 4.9878, "step": 8270 }, { "epoch": 0.16945508160465259, "grad_norm": 0.25947219133377075, "learning_rate": 0.0003, "loss": 4.9692, "step": 8275 }, { "epoch": 0.1695574713820572, "grad_norm": 0.23706823587417603, "learning_rate": 0.0003, "loss": 4.9927, "step": 8280 }, { "epoch": 0.16965986115946183, "grad_norm": 0.23366497457027435, "learning_rate": 0.0003, "loss": 4.9778, "step": 8285 }, { "epoch": 0.16976225093686648, "grad_norm": 0.23043937981128693, "learning_rate": 0.0003, "loss": 4.9375, "step": 8290 }, { "epoch": 0.1698646407142711, "grad_norm": 0.22917410731315613, "learning_rate": 0.0003, "loss": 4.9518, "step": 8295 }, { "epoch": 0.16996703049167572, "grad_norm": 0.23943403363227844, "learning_rate": 0.0003, "loss": 4.9337, "step": 8300 }, { "epoch": 0.17006942026908034, "grad_norm": 0.24286477267742157, "learning_rate": 0.0003, "loss": 5.0025, "step": 8305 }, { "epoch": 0.17017181004648496, "grad_norm": 0.23036329448223114, "learning_rate": 0.0003, "loss": 4.9614, "step": 8310 }, { "epoch": 0.17027419982388958, "grad_norm": 0.2531217038631439, "learning_rate": 0.0003, "loss": 4.9789, "step": 8315 }, { "epoch": 0.1703765896012942, "grad_norm": 0.2279614508152008, "learning_rate": 0.0003, "loss": 4.9902, "step": 8320 }, { "epoch": 0.17047897937869883, "grad_norm": 0.2787529528141022, "learning_rate": 0.0003, "loss": 4.9625, "step": 8325 }, { "epoch": 0.17058136915610345, "grad_norm": 0.23746182024478912, "learning_rate": 0.0003, "loss": 4.9784, "step": 8330 }, { "epoch": 0.17068375893350807, "grad_norm": 0.24724778532981873, "learning_rate": 0.0003, "loss": 4.9431, "step": 8335 }, { "epoch": 0.17078614871091272, "grad_norm": 0.23509597778320312, "learning_rate": 0.0003, "loss": 4.9764, "step": 8340 }, { "epoch": 0.17088853848831734, "grad_norm": 0.23609371483325958, "learning_rate": 0.0003, "loss": 4.9762, "step": 8345 }, { "epoch": 0.17099092826572196, "grad_norm": 0.23974719643592834, "learning_rate": 0.0003, "loss": 4.9617, "step": 8350 }, { "epoch": 0.17109331804312658, "grad_norm": 0.22124601900577545, "learning_rate": 0.0003, "loss": 4.9675, "step": 8355 }, { "epoch": 0.1711957078205312, "grad_norm": 0.2362033724784851, "learning_rate": 0.0003, "loss": 4.999, "step": 8360 }, { "epoch": 0.17129809759793582, "grad_norm": 0.2428901493549347, "learning_rate": 0.0003, "loss": 5.0014, "step": 8365 }, { "epoch": 0.17140048737534044, "grad_norm": 0.24708890914916992, "learning_rate": 0.0003, "loss": 4.9538, "step": 8370 }, { "epoch": 0.17150287715274506, "grad_norm": 0.24944816529750824, "learning_rate": 0.0003, "loss": 4.9225, "step": 8375 }, { "epoch": 0.17160526693014969, "grad_norm": 0.2560562789440155, "learning_rate": 0.0003, "loss": 4.9858, "step": 8380 }, { "epoch": 0.1717076567075543, "grad_norm": 0.26690977811813354, "learning_rate": 0.0003, "loss": 4.9503, "step": 8385 }, { "epoch": 0.17181004648495893, "grad_norm": 0.24184927344322205, "learning_rate": 0.0003, "loss": 4.9364, "step": 8390 }, { "epoch": 0.17191243626236358, "grad_norm": 0.24029432237148285, "learning_rate": 0.0003, "loss": 4.9838, "step": 8395 }, { "epoch": 0.1720148260397682, "grad_norm": 0.2464255541563034, "learning_rate": 0.0003, "loss": 4.979, "step": 8400 }, { "epoch": 0.17211721581717282, "grad_norm": 0.22707878053188324, "learning_rate": 0.0003, "loss": 4.9349, "step": 8405 }, { "epoch": 0.17221960559457744, "grad_norm": 0.23702223598957062, "learning_rate": 0.0003, "loss": 4.9087, "step": 8410 }, { "epoch": 0.17232199537198206, "grad_norm": 0.2341252565383911, "learning_rate": 0.0003, "loss": 4.9247, "step": 8415 }, { "epoch": 0.17242438514938668, "grad_norm": 0.2504211366176605, "learning_rate": 0.0003, "loss": 4.9337, "step": 8420 }, { "epoch": 0.1725267749267913, "grad_norm": 0.25000959634780884, "learning_rate": 0.0003, "loss": 4.9552, "step": 8425 }, { "epoch": 0.17262916470419593, "grad_norm": 0.2468976229429245, "learning_rate": 0.0003, "loss": 4.9799, "step": 8430 }, { "epoch": 0.17273155448160055, "grad_norm": 0.2432153970003128, "learning_rate": 0.0003, "loss": 4.9608, "step": 8435 }, { "epoch": 0.17283394425900517, "grad_norm": 0.24488693475723267, "learning_rate": 0.0003, "loss": 4.9715, "step": 8440 }, { "epoch": 0.17293633403640982, "grad_norm": 0.26678532361984253, "learning_rate": 0.0003, "loss": 4.9634, "step": 8445 }, { "epoch": 0.17303872381381444, "grad_norm": 0.3612540066242218, "learning_rate": 0.0003, "loss": 4.9252, "step": 8450 }, { "epoch": 0.17314111359121906, "grad_norm": 0.258616179227829, "learning_rate": 0.0003, "loss": 4.951, "step": 8455 }, { "epoch": 0.17324350336862368, "grad_norm": 0.2403879016637802, "learning_rate": 0.0003, "loss": 4.9513, "step": 8460 }, { "epoch": 0.1733458931460283, "grad_norm": 0.2637866139411926, "learning_rate": 0.0003, "loss": 4.9427, "step": 8465 }, { "epoch": 0.17344828292343292, "grad_norm": 0.4013460576534271, "learning_rate": 0.0003, "loss": 4.9076, "step": 8470 }, { "epoch": 0.17355067270083754, "grad_norm": 0.2604331970214844, "learning_rate": 0.0003, "loss": 4.968, "step": 8475 }, { "epoch": 0.17365306247824217, "grad_norm": 0.2690982222557068, "learning_rate": 0.0003, "loss": 4.9317, "step": 8480 }, { "epoch": 0.1737554522556468, "grad_norm": 0.23837333917617798, "learning_rate": 0.0003, "loss": 5.0024, "step": 8485 }, { "epoch": 0.1738578420330514, "grad_norm": 0.24582943320274353, "learning_rate": 0.0003, "loss": 4.9811, "step": 8490 }, { "epoch": 0.17396023181045606, "grad_norm": 0.2452264428138733, "learning_rate": 0.0003, "loss": 4.961, "step": 8495 }, { "epoch": 0.17406262158786068, "grad_norm": 0.28215664625167847, "learning_rate": 0.0003, "loss": 5.0268, "step": 8500 }, { "epoch": 0.1741650113652653, "grad_norm": 0.2856769859790802, "learning_rate": 0.0003, "loss": 4.9394, "step": 8505 }, { "epoch": 0.17426740114266992, "grad_norm": 1.280678391456604, "learning_rate": 0.0003, "loss": 4.9656, "step": 8510 }, { "epoch": 0.17436979092007454, "grad_norm": 0.30562543869018555, "learning_rate": 0.0003, "loss": 5.0055, "step": 8515 }, { "epoch": 0.17447218069747916, "grad_norm": 0.29330888390541077, "learning_rate": 0.0003, "loss": 4.9697, "step": 8520 }, { "epoch": 0.17457457047488378, "grad_norm": 0.2480739951133728, "learning_rate": 0.0003, "loss": 4.9607, "step": 8525 }, { "epoch": 0.1746769602522884, "grad_norm": 0.24799346923828125, "learning_rate": 0.0003, "loss": 5.0038, "step": 8530 }, { "epoch": 0.17477935002969303, "grad_norm": 0.2737793028354645, "learning_rate": 0.0003, "loss": 4.9091, "step": 8535 }, { "epoch": 0.17488173980709765, "grad_norm": 0.25553327798843384, "learning_rate": 0.0003, "loss": 5.0308, "step": 8540 }, { "epoch": 0.1749841295845023, "grad_norm": 0.23692303895950317, "learning_rate": 0.0003, "loss": 4.9652, "step": 8545 }, { "epoch": 0.17508651936190692, "grad_norm": 0.30589890480041504, "learning_rate": 0.0003, "loss": 4.9889, "step": 8550 }, { "epoch": 0.17518890913931154, "grad_norm": 0.243758425116539, "learning_rate": 0.0003, "loss": 5.0105, "step": 8555 }, { "epoch": 0.17529129891671616, "grad_norm": 0.22573265433311462, "learning_rate": 0.0003, "loss": 4.9811, "step": 8560 }, { "epoch": 0.17539368869412078, "grad_norm": 0.23498474061489105, "learning_rate": 0.0003, "loss": 5.009, "step": 8565 }, { "epoch": 0.1754960784715254, "grad_norm": 0.2366688996553421, "learning_rate": 0.0003, "loss": 4.952, "step": 8570 }, { "epoch": 0.17559846824893002, "grad_norm": 0.2420990914106369, "learning_rate": 0.0003, "loss": 4.9078, "step": 8575 }, { "epoch": 0.17570085802633464, "grad_norm": 0.25633060932159424, "learning_rate": 0.0003, "loss": 4.9264, "step": 8580 }, { "epoch": 0.17580324780373927, "grad_norm": 0.630459189414978, "learning_rate": 0.0003, "loss": 4.987, "step": 8585 }, { "epoch": 0.1759056375811439, "grad_norm": 0.2781355381011963, "learning_rate": 0.0003, "loss": 4.9808, "step": 8590 }, { "epoch": 0.17600802735854854, "grad_norm": 0.2405020296573639, "learning_rate": 0.0003, "loss": 4.9342, "step": 8595 }, { "epoch": 0.17611041713595316, "grad_norm": 0.2748335003852844, "learning_rate": 0.0003, "loss": 4.99, "step": 8600 }, { "epoch": 0.17621280691335778, "grad_norm": 0.2357555776834488, "learning_rate": 0.0003, "loss": 4.9803, "step": 8605 }, { "epoch": 0.1763151966907624, "grad_norm": 0.2591405212879181, "learning_rate": 0.0003, "loss": 4.9526, "step": 8610 }, { "epoch": 0.17641758646816702, "grad_norm": 0.2714153826236725, "learning_rate": 0.0003, "loss": 4.9511, "step": 8615 }, { "epoch": 0.17651997624557164, "grad_norm": 0.2358967512845993, "learning_rate": 0.0003, "loss": 4.9982, "step": 8620 }, { "epoch": 0.17662236602297626, "grad_norm": 0.25634580850601196, "learning_rate": 0.0003, "loss": 4.9297, "step": 8625 }, { "epoch": 0.17672475580038088, "grad_norm": 0.24031628668308258, "learning_rate": 0.0003, "loss": 4.9554, "step": 8630 }, { "epoch": 0.1768271455777855, "grad_norm": 0.22725112736225128, "learning_rate": 0.0003, "loss": 4.9684, "step": 8635 }, { "epoch": 0.17692953535519013, "grad_norm": 0.23442846536636353, "learning_rate": 0.0003, "loss": 4.9531, "step": 8640 }, { "epoch": 0.17703192513259478, "grad_norm": 0.23823332786560059, "learning_rate": 0.0003, "loss": 4.9727, "step": 8645 }, { "epoch": 0.1771343149099994, "grad_norm": 0.2390361726284027, "learning_rate": 0.0003, "loss": 4.9173, "step": 8650 }, { "epoch": 0.17723670468740402, "grad_norm": 0.25551703572273254, "learning_rate": 0.0003, "loss": 5.0018, "step": 8655 }, { "epoch": 0.17733909446480864, "grad_norm": 0.2616801857948303, "learning_rate": 0.0003, "loss": 4.916, "step": 8660 }, { "epoch": 0.17744148424221326, "grad_norm": 0.2571796178817749, "learning_rate": 0.0003, "loss": 4.9279, "step": 8665 }, { "epoch": 0.17754387401961788, "grad_norm": 0.231767475605011, "learning_rate": 0.0003, "loss": 4.9712, "step": 8670 }, { "epoch": 0.1776462637970225, "grad_norm": 0.27216964960098267, "learning_rate": 0.0003, "loss": 4.9273, "step": 8675 }, { "epoch": 0.17774865357442712, "grad_norm": 0.22776266932487488, "learning_rate": 0.0003, "loss": 4.9075, "step": 8680 }, { "epoch": 0.17785104335183174, "grad_norm": 0.27528294920921326, "learning_rate": 0.0003, "loss": 4.8684, "step": 8685 }, { "epoch": 0.17795343312923637, "grad_norm": 0.24167364835739136, "learning_rate": 0.0003, "loss": 4.9285, "step": 8690 }, { "epoch": 0.17805582290664101, "grad_norm": 0.28756558895111084, "learning_rate": 0.0003, "loss": 4.9294, "step": 8695 }, { "epoch": 0.17815821268404564, "grad_norm": 0.27697524428367615, "learning_rate": 0.0003, "loss": 4.9078, "step": 8700 }, { "epoch": 0.17826060246145026, "grad_norm": 0.4069802165031433, "learning_rate": 0.0003, "loss": 4.9637, "step": 8705 }, { "epoch": 0.17836299223885488, "grad_norm": 0.25043371319770813, "learning_rate": 0.0003, "loss": 4.9357, "step": 8710 }, { "epoch": 0.1784653820162595, "grad_norm": 0.2558344304561615, "learning_rate": 0.0003, "loss": 5.004, "step": 8715 }, { "epoch": 0.17856777179366412, "grad_norm": 0.26395145058631897, "learning_rate": 0.0003, "loss": 4.9534, "step": 8720 }, { "epoch": 0.17867016157106874, "grad_norm": 0.23396216332912445, "learning_rate": 0.0003, "loss": 4.9541, "step": 8725 }, { "epoch": 0.17877255134847336, "grad_norm": 0.22810359299182892, "learning_rate": 0.0003, "loss": 4.9797, "step": 8730 }, { "epoch": 0.17887494112587798, "grad_norm": 0.24070174992084503, "learning_rate": 0.0003, "loss": 4.9558, "step": 8735 }, { "epoch": 0.1789773309032826, "grad_norm": 0.2460024207830429, "learning_rate": 0.0003, "loss": 4.9305, "step": 8740 }, { "epoch": 0.17907972068068723, "grad_norm": 0.2506413459777832, "learning_rate": 0.0003, "loss": 4.9893, "step": 8745 }, { "epoch": 0.17918211045809188, "grad_norm": 0.2459816187620163, "learning_rate": 0.0003, "loss": 5.0012, "step": 8750 }, { "epoch": 0.1792845002354965, "grad_norm": 0.2408568561077118, "learning_rate": 0.0003, "loss": 4.9595, "step": 8755 }, { "epoch": 0.17938689001290112, "grad_norm": 0.23039047420024872, "learning_rate": 0.0003, "loss": 4.976, "step": 8760 }, { "epoch": 0.17948927979030574, "grad_norm": 0.23503048717975616, "learning_rate": 0.0003, "loss": 4.9216, "step": 8765 }, { "epoch": 0.17959166956771036, "grad_norm": 0.23038825392723083, "learning_rate": 0.0003, "loss": 4.9127, "step": 8770 }, { "epoch": 0.17969405934511498, "grad_norm": 0.22313566505908966, "learning_rate": 0.0003, "loss": 4.9082, "step": 8775 }, { "epoch": 0.1797964491225196, "grad_norm": 0.23721888661384583, "learning_rate": 0.0003, "loss": 4.9445, "step": 8780 }, { "epoch": 0.17989883889992422, "grad_norm": 0.2366582602262497, "learning_rate": 0.0003, "loss": 4.9145, "step": 8785 }, { "epoch": 0.18000122867732885, "grad_norm": 0.24238401651382446, "learning_rate": 0.0003, "loss": 4.9726, "step": 8790 }, { "epoch": 0.18010361845473347, "grad_norm": 0.27836301922798157, "learning_rate": 0.0003, "loss": 4.9378, "step": 8795 }, { "epoch": 0.18020600823213812, "grad_norm": 0.3087904453277588, "learning_rate": 0.0003, "loss": 4.947, "step": 8800 }, { "epoch": 0.18030839800954274, "grad_norm": 0.2912931740283966, "learning_rate": 0.0003, "loss": 4.9846, "step": 8805 }, { "epoch": 0.18041078778694736, "grad_norm": 0.2829636037349701, "learning_rate": 0.0003, "loss": 4.9376, "step": 8810 }, { "epoch": 0.18051317756435198, "grad_norm": 0.24942532181739807, "learning_rate": 0.0003, "loss": 4.9389, "step": 8815 }, { "epoch": 0.1806155673417566, "grad_norm": 0.24345135688781738, "learning_rate": 0.0003, "loss": 4.9813, "step": 8820 }, { "epoch": 0.18071795711916122, "grad_norm": 0.24752293527126312, "learning_rate": 0.0003, "loss": 4.9714, "step": 8825 }, { "epoch": 0.18082034689656584, "grad_norm": 0.22899101674556732, "learning_rate": 0.0003, "loss": 4.9175, "step": 8830 }, { "epoch": 0.18092273667397046, "grad_norm": 0.23931820690631866, "learning_rate": 0.0003, "loss": 4.9516, "step": 8835 }, { "epoch": 0.18102512645137508, "grad_norm": 0.2398419827222824, "learning_rate": 0.0003, "loss": 4.9473, "step": 8840 }, { "epoch": 0.1811275162287797, "grad_norm": 0.31384849548339844, "learning_rate": 0.0003, "loss": 4.9467, "step": 8845 }, { "epoch": 0.18122990600618435, "grad_norm": 0.2399289309978485, "learning_rate": 0.0003, "loss": 4.9555, "step": 8850 }, { "epoch": 0.18133229578358898, "grad_norm": 0.23261117935180664, "learning_rate": 0.0003, "loss": 4.9576, "step": 8855 }, { "epoch": 0.1814346855609936, "grad_norm": 0.23390693962574005, "learning_rate": 0.0003, "loss": 4.95, "step": 8860 }, { "epoch": 0.18153707533839822, "grad_norm": 0.2741168439388275, "learning_rate": 0.0003, "loss": 4.9439, "step": 8865 }, { "epoch": 0.18163946511580284, "grad_norm": 0.2826792895793915, "learning_rate": 0.0003, "loss": 4.9285, "step": 8870 }, { "epoch": 0.18174185489320746, "grad_norm": 0.24303020536899567, "learning_rate": 0.0003, "loss": 4.9537, "step": 8875 }, { "epoch": 0.18184424467061208, "grad_norm": 0.23413772881031036, "learning_rate": 0.0003, "loss": 4.9463, "step": 8880 }, { "epoch": 0.1819466344480167, "grad_norm": 0.25310632586479187, "learning_rate": 0.0003, "loss": 4.9643, "step": 8885 }, { "epoch": 0.18204902422542132, "grad_norm": 0.25220412015914917, "learning_rate": 0.0003, "loss": 4.9889, "step": 8890 }, { "epoch": 0.18215141400282595, "grad_norm": 0.23816744983196259, "learning_rate": 0.0003, "loss": 4.9433, "step": 8895 }, { "epoch": 0.1822538037802306, "grad_norm": 0.24224130809307098, "learning_rate": 0.0003, "loss": 5.0065, "step": 8900 }, { "epoch": 0.18235619355763522, "grad_norm": 0.23748779296875, "learning_rate": 0.0003, "loss": 4.9892, "step": 8905 }, { "epoch": 0.18245858333503984, "grad_norm": 0.23334605991840363, "learning_rate": 0.0003, "loss": 4.9289, "step": 8910 }, { "epoch": 0.18256097311244446, "grad_norm": 0.2519617974758148, "learning_rate": 0.0003, "loss": 4.8981, "step": 8915 }, { "epoch": 0.18266336288984908, "grad_norm": 0.2552023231983185, "learning_rate": 0.0003, "loss": 4.974, "step": 8920 }, { "epoch": 0.1827657526672537, "grad_norm": 0.24951639771461487, "learning_rate": 0.0003, "loss": 4.9376, "step": 8925 }, { "epoch": 0.18286814244465832, "grad_norm": 0.38250601291656494, "learning_rate": 0.0003, "loss": 4.9658, "step": 8930 }, { "epoch": 0.18297053222206294, "grad_norm": 0.26618579030036926, "learning_rate": 0.0003, "loss": 4.9351, "step": 8935 }, { "epoch": 0.18307292199946756, "grad_norm": 0.25149083137512207, "learning_rate": 0.0003, "loss": 4.9782, "step": 8940 }, { "epoch": 0.18317531177687219, "grad_norm": 0.25862792134284973, "learning_rate": 0.0003, "loss": 4.9518, "step": 8945 }, { "epoch": 0.18327770155427683, "grad_norm": 0.23274236917495728, "learning_rate": 0.0003, "loss": 4.9542, "step": 8950 }, { "epoch": 0.18338009133168146, "grad_norm": 0.23445458710193634, "learning_rate": 0.0003, "loss": 4.9482, "step": 8955 }, { "epoch": 0.18348248110908608, "grad_norm": 0.2242894023656845, "learning_rate": 0.0003, "loss": 4.9599, "step": 8960 }, { "epoch": 0.1835848708864907, "grad_norm": 0.243010476231575, "learning_rate": 0.0003, "loss": 4.9959, "step": 8965 }, { "epoch": 0.18368726066389532, "grad_norm": 0.2701684534549713, "learning_rate": 0.0003, "loss": 4.9371, "step": 8970 }, { "epoch": 0.18378965044129994, "grad_norm": 0.2610296308994293, "learning_rate": 0.0003, "loss": 4.9253, "step": 8975 }, { "epoch": 0.18389204021870456, "grad_norm": 0.24804575741291046, "learning_rate": 0.0003, "loss": 4.8889, "step": 8980 }, { "epoch": 0.18399442999610918, "grad_norm": 0.23091092705726624, "learning_rate": 0.0003, "loss": 4.9377, "step": 8985 }, { "epoch": 0.1840968197735138, "grad_norm": 0.2444520741701126, "learning_rate": 0.0003, "loss": 4.9067, "step": 8990 }, { "epoch": 0.18419920955091842, "grad_norm": 0.2413340061903, "learning_rate": 0.0003, "loss": 4.9861, "step": 8995 }, { "epoch": 0.18430159932832307, "grad_norm": 0.23155519366264343, "learning_rate": 0.0003, "loss": 4.9702, "step": 9000 }, { "epoch": 0.18430159932832307, "eval_loss": 4.967161178588867, "eval_runtime": 17.2382, "eval_samples_per_second": 33.762, "eval_steps_per_second": 4.235, "step": 9000 }, { "epoch": 0.1844039891057277, "grad_norm": 0.26229947805404663, "learning_rate": 0.0003, "loss": 4.9275, "step": 9005 }, { "epoch": 0.18450637888313232, "grad_norm": 0.2596692144870758, "learning_rate": 0.0003, "loss": 4.9474, "step": 9010 }, { "epoch": 0.18460876866053694, "grad_norm": 0.24281714856624603, "learning_rate": 0.0003, "loss": 4.9183, "step": 9015 }, { "epoch": 0.18471115843794156, "grad_norm": 0.26011863350868225, "learning_rate": 0.0003, "loss": 4.9363, "step": 9020 }, { "epoch": 0.18481354821534618, "grad_norm": 0.24929960072040558, "learning_rate": 0.0003, "loss": 4.9551, "step": 9025 }, { "epoch": 0.1849159379927508, "grad_norm": 0.25172069668769836, "learning_rate": 0.0003, "loss": 4.9257, "step": 9030 }, { "epoch": 0.18501832777015542, "grad_norm": 0.2448473423719406, "learning_rate": 0.0003, "loss": 4.9615, "step": 9035 }, { "epoch": 0.18512071754756004, "grad_norm": 0.24837471544742584, "learning_rate": 0.0003, "loss": 4.9719, "step": 9040 }, { "epoch": 0.18522310732496466, "grad_norm": 0.23794622719287872, "learning_rate": 0.0003, "loss": 4.947, "step": 9045 }, { "epoch": 0.1853254971023693, "grad_norm": 0.33640262484550476, "learning_rate": 0.0003, "loss": 4.9217, "step": 9050 }, { "epoch": 0.18542788687977393, "grad_norm": 0.2828204929828644, "learning_rate": 0.0003, "loss": 4.9556, "step": 9055 }, { "epoch": 0.18553027665717856, "grad_norm": 0.2490418553352356, "learning_rate": 0.0003, "loss": 4.9401, "step": 9060 }, { "epoch": 0.18563266643458318, "grad_norm": 0.24733878672122955, "learning_rate": 0.0003, "loss": 4.9404, "step": 9065 }, { "epoch": 0.1857350562119878, "grad_norm": 0.2425927221775055, "learning_rate": 0.0003, "loss": 4.9148, "step": 9070 }, { "epoch": 0.18583744598939242, "grad_norm": 0.2517198920249939, "learning_rate": 0.0003, "loss": 4.9049, "step": 9075 }, { "epoch": 0.18593983576679704, "grad_norm": 0.2480369359254837, "learning_rate": 0.0003, "loss": 4.9222, "step": 9080 }, { "epoch": 0.18604222554420166, "grad_norm": 0.2834649980068207, "learning_rate": 0.0003, "loss": 4.9167, "step": 9085 }, { "epoch": 0.18614461532160628, "grad_norm": 0.21334508061408997, "learning_rate": 0.0003, "loss": 4.9144, "step": 9090 }, { "epoch": 0.1862470050990109, "grad_norm": 0.24133138358592987, "learning_rate": 0.0003, "loss": 4.9551, "step": 9095 }, { "epoch": 0.18634939487641553, "grad_norm": 0.2861619293689728, "learning_rate": 0.0003, "loss": 4.9004, "step": 9100 }, { "epoch": 0.18645178465382017, "grad_norm": 0.25369203090667725, "learning_rate": 0.0003, "loss": 4.9408, "step": 9105 }, { "epoch": 0.1865541744312248, "grad_norm": 0.24333781003952026, "learning_rate": 0.0003, "loss": 4.915, "step": 9110 }, { "epoch": 0.18665656420862942, "grad_norm": 0.23844999074935913, "learning_rate": 0.0003, "loss": 4.9096, "step": 9115 }, { "epoch": 0.18675895398603404, "grad_norm": 0.2520522177219391, "learning_rate": 0.0003, "loss": 4.9019, "step": 9120 }, { "epoch": 0.18686134376343866, "grad_norm": 0.25806185603141785, "learning_rate": 0.0003, "loss": 4.9489, "step": 9125 }, { "epoch": 0.18696373354084328, "grad_norm": 0.2639296352863312, "learning_rate": 0.0003, "loss": 4.9078, "step": 9130 }, { "epoch": 0.1870661233182479, "grad_norm": 0.25021064281463623, "learning_rate": 0.0003, "loss": 4.947, "step": 9135 }, { "epoch": 0.18716851309565252, "grad_norm": 0.25453871488571167, "learning_rate": 0.0003, "loss": 4.8956, "step": 9140 }, { "epoch": 0.18727090287305714, "grad_norm": 0.2519582211971283, "learning_rate": 0.0003, "loss": 4.9217, "step": 9145 }, { "epoch": 0.18737329265046176, "grad_norm": 0.2347276508808136, "learning_rate": 0.0003, "loss": 4.9446, "step": 9150 }, { "epoch": 0.1874756824278664, "grad_norm": 0.22869284451007843, "learning_rate": 0.0003, "loss": 4.9314, "step": 9155 }, { "epoch": 0.18757807220527103, "grad_norm": 0.23401141166687012, "learning_rate": 0.0003, "loss": 4.9272, "step": 9160 }, { "epoch": 0.18768046198267566, "grad_norm": 0.26685070991516113, "learning_rate": 0.0003, "loss": 4.9796, "step": 9165 }, { "epoch": 0.18778285176008028, "grad_norm": 0.2689502239227295, "learning_rate": 0.0003, "loss": 4.9418, "step": 9170 }, { "epoch": 0.1878852415374849, "grad_norm": 0.40141159296035767, "learning_rate": 0.0003, "loss": 4.9608, "step": 9175 }, { "epoch": 0.18798763131488952, "grad_norm": 0.24301853775978088, "learning_rate": 0.0003, "loss": 4.9604, "step": 9180 }, { "epoch": 0.18809002109229414, "grad_norm": 0.24619057774543762, "learning_rate": 0.0003, "loss": 4.9413, "step": 9185 }, { "epoch": 0.18819241086969876, "grad_norm": 0.23454906046390533, "learning_rate": 0.0003, "loss": 4.9371, "step": 9190 }, { "epoch": 0.18829480064710338, "grad_norm": 0.26107001304626465, "learning_rate": 0.0003, "loss": 4.9023, "step": 9195 }, { "epoch": 0.188397190424508, "grad_norm": 0.2492724508047104, "learning_rate": 0.0003, "loss": 4.9876, "step": 9200 }, { "epoch": 0.18849958020191265, "grad_norm": 0.27773427963256836, "learning_rate": 0.0003, "loss": 4.9478, "step": 9205 }, { "epoch": 0.18860196997931727, "grad_norm": 0.2248711884021759, "learning_rate": 0.0003, "loss": 4.9573, "step": 9210 }, { "epoch": 0.1887043597567219, "grad_norm": 0.2660972476005554, "learning_rate": 0.0003, "loss": 4.9328, "step": 9215 }, { "epoch": 0.18880674953412652, "grad_norm": 0.23380662500858307, "learning_rate": 0.0003, "loss": 4.9396, "step": 9220 }, { "epoch": 0.18890913931153114, "grad_norm": 0.242654487490654, "learning_rate": 0.0003, "loss": 4.896, "step": 9225 }, { "epoch": 0.18901152908893576, "grad_norm": 0.23970633745193481, "learning_rate": 0.0003, "loss": 4.9227, "step": 9230 }, { "epoch": 0.18911391886634038, "grad_norm": 0.30415284633636475, "learning_rate": 0.0003, "loss": 4.9366, "step": 9235 }, { "epoch": 0.189216308643745, "grad_norm": 0.2458249032497406, "learning_rate": 0.0003, "loss": 4.9759, "step": 9240 }, { "epoch": 0.18931869842114962, "grad_norm": 0.2422608584165573, "learning_rate": 0.0003, "loss": 4.9593, "step": 9245 }, { "epoch": 0.18942108819855424, "grad_norm": 0.25077205896377563, "learning_rate": 0.0003, "loss": 4.9483, "step": 9250 }, { "epoch": 0.1895234779759589, "grad_norm": 0.2309390902519226, "learning_rate": 0.0003, "loss": 4.9209, "step": 9255 }, { "epoch": 0.18962586775336351, "grad_norm": 0.2438856065273285, "learning_rate": 0.0003, "loss": 4.9279, "step": 9260 }, { "epoch": 0.18972825753076814, "grad_norm": 0.2440236508846283, "learning_rate": 0.0003, "loss": 4.9087, "step": 9265 }, { "epoch": 0.18983064730817276, "grad_norm": 0.2421654611825943, "learning_rate": 0.0003, "loss": 4.9221, "step": 9270 }, { "epoch": 0.18993303708557738, "grad_norm": 0.251799076795578, "learning_rate": 0.0003, "loss": 4.9246, "step": 9275 }, { "epoch": 0.190035426862982, "grad_norm": 0.24180428683757782, "learning_rate": 0.0003, "loss": 4.9738, "step": 9280 }, { "epoch": 0.19013781664038662, "grad_norm": 0.26821961998939514, "learning_rate": 0.0003, "loss": 4.925, "step": 9285 }, { "epoch": 0.19024020641779124, "grad_norm": 0.26987001299858093, "learning_rate": 0.0003, "loss": 4.9381, "step": 9290 }, { "epoch": 0.19034259619519586, "grad_norm": 0.23224502801895142, "learning_rate": 0.0003, "loss": 4.9938, "step": 9295 }, { "epoch": 0.19044498597260048, "grad_norm": 0.2341042160987854, "learning_rate": 0.0003, "loss": 4.9247, "step": 9300 }, { "epoch": 0.19054737575000513, "grad_norm": 0.2788233757019043, "learning_rate": 0.0003, "loss": 4.9378, "step": 9305 }, { "epoch": 0.19064976552740975, "grad_norm": 0.2468472719192505, "learning_rate": 0.0003, "loss": 4.9314, "step": 9310 }, { "epoch": 0.19075215530481437, "grad_norm": 0.2537922263145447, "learning_rate": 0.0003, "loss": 4.9645, "step": 9315 }, { "epoch": 0.190854545082219, "grad_norm": 0.22504645586013794, "learning_rate": 0.0003, "loss": 4.9711, "step": 9320 }, { "epoch": 0.19095693485962362, "grad_norm": 0.43650156259536743, "learning_rate": 0.0003, "loss": 4.8517, "step": 9325 }, { "epoch": 0.19105932463702824, "grad_norm": 0.22850650548934937, "learning_rate": 0.0003, "loss": 4.918, "step": 9330 }, { "epoch": 0.19116171441443286, "grad_norm": 0.24966390430927277, "learning_rate": 0.0003, "loss": 4.8949, "step": 9335 }, { "epoch": 0.19126410419183748, "grad_norm": 0.2230202704668045, "learning_rate": 0.0003, "loss": 4.9572, "step": 9340 }, { "epoch": 0.1913664939692421, "grad_norm": 0.24259352684020996, "learning_rate": 0.0003, "loss": 5.009, "step": 9345 }, { "epoch": 0.19146888374664672, "grad_norm": 0.23545870184898376, "learning_rate": 0.0003, "loss": 5.0105, "step": 9350 }, { "epoch": 0.19157127352405137, "grad_norm": 0.2519451379776001, "learning_rate": 0.0003, "loss": 4.9496, "step": 9355 }, { "epoch": 0.191673663301456, "grad_norm": 0.23786629736423492, "learning_rate": 0.0003, "loss": 4.9777, "step": 9360 }, { "epoch": 0.19177605307886061, "grad_norm": 0.24780771136283875, "learning_rate": 0.0003, "loss": 4.983, "step": 9365 }, { "epoch": 0.19187844285626524, "grad_norm": 0.23074345290660858, "learning_rate": 0.0003, "loss": 4.9321, "step": 9370 }, { "epoch": 0.19198083263366986, "grad_norm": 0.2469666451215744, "learning_rate": 0.0003, "loss": 4.8674, "step": 9375 }, { "epoch": 0.19208322241107448, "grad_norm": 0.2651034891605377, "learning_rate": 0.0003, "loss": 4.9177, "step": 9380 }, { "epoch": 0.1921856121884791, "grad_norm": 0.218081533908844, "learning_rate": 0.0003, "loss": 4.925, "step": 9385 }, { "epoch": 0.19228800196588372, "grad_norm": 0.22809651494026184, "learning_rate": 0.0003, "loss": 4.9752, "step": 9390 }, { "epoch": 0.19239039174328834, "grad_norm": 0.25222814083099365, "learning_rate": 0.0003, "loss": 4.9522, "step": 9395 }, { "epoch": 0.19249278152069296, "grad_norm": 0.23496654629707336, "learning_rate": 0.0003, "loss": 4.887, "step": 9400 }, { "epoch": 0.1925951712980976, "grad_norm": 0.2361747771501541, "learning_rate": 0.0003, "loss": 4.9137, "step": 9405 }, { "epoch": 0.19269756107550223, "grad_norm": 0.2526848316192627, "learning_rate": 0.0003, "loss": 4.9855, "step": 9410 }, { "epoch": 0.19279995085290685, "grad_norm": 0.250507116317749, "learning_rate": 0.0003, "loss": 4.9262, "step": 9415 }, { "epoch": 0.19290234063031148, "grad_norm": 0.2319098562002182, "learning_rate": 0.0003, "loss": 4.8931, "step": 9420 }, { "epoch": 0.1930047304077161, "grad_norm": 0.23578836023807526, "learning_rate": 0.0003, "loss": 4.924, "step": 9425 }, { "epoch": 0.19310712018512072, "grad_norm": 0.22996222972869873, "learning_rate": 0.0003, "loss": 4.9866, "step": 9430 }, { "epoch": 0.19320950996252534, "grad_norm": 0.2504240870475769, "learning_rate": 0.0003, "loss": 4.9552, "step": 9435 }, { "epoch": 0.19331189973992996, "grad_norm": 0.2702684998512268, "learning_rate": 0.0003, "loss": 4.9467, "step": 9440 }, { "epoch": 0.19341428951733458, "grad_norm": 0.2502329349517822, "learning_rate": 0.0003, "loss": 4.9376, "step": 9445 }, { "epoch": 0.1935166792947392, "grad_norm": 0.24836665391921997, "learning_rate": 0.0003, "loss": 4.913, "step": 9450 }, { "epoch": 0.19361906907214382, "grad_norm": 0.23882754147052765, "learning_rate": 0.0003, "loss": 4.8785, "step": 9455 }, { "epoch": 0.19372145884954847, "grad_norm": 0.2295556366443634, "learning_rate": 0.0003, "loss": 4.9463, "step": 9460 }, { "epoch": 0.1938238486269531, "grad_norm": 0.22904041409492493, "learning_rate": 0.0003, "loss": 4.9021, "step": 9465 }, { "epoch": 0.19392623840435771, "grad_norm": 0.2299875169992447, "learning_rate": 0.0003, "loss": 4.9742, "step": 9470 }, { "epoch": 0.19402862818176234, "grad_norm": 0.23397061228752136, "learning_rate": 0.0003, "loss": 4.8921, "step": 9475 }, { "epoch": 0.19413101795916696, "grad_norm": 0.25410881638526917, "learning_rate": 0.0003, "loss": 4.9202, "step": 9480 }, { "epoch": 0.19423340773657158, "grad_norm": 0.23728393018245697, "learning_rate": 0.0003, "loss": 4.947, "step": 9485 }, { "epoch": 0.1943357975139762, "grad_norm": 0.2503977417945862, "learning_rate": 0.0003, "loss": 4.9673, "step": 9490 }, { "epoch": 0.19443818729138082, "grad_norm": 0.2601408362388611, "learning_rate": 0.0003, "loss": 4.9154, "step": 9495 }, { "epoch": 0.19454057706878544, "grad_norm": 0.2707257568836212, "learning_rate": 0.0003, "loss": 4.8975, "step": 9500 }, { "epoch": 0.19464296684619006, "grad_norm": 0.25026175379753113, "learning_rate": 0.0003, "loss": 4.9179, "step": 9505 }, { "epoch": 0.1947453566235947, "grad_norm": 0.2416429966688156, "learning_rate": 0.0003, "loss": 4.9306, "step": 9510 }, { "epoch": 0.19484774640099933, "grad_norm": 0.3844289183616638, "learning_rate": 0.0003, "loss": 4.925, "step": 9515 }, { "epoch": 0.19495013617840395, "grad_norm": 0.24641218781471252, "learning_rate": 0.0003, "loss": 4.9458, "step": 9520 }, { "epoch": 0.19505252595580858, "grad_norm": 0.24381622672080994, "learning_rate": 0.0003, "loss": 4.9495, "step": 9525 }, { "epoch": 0.1951549157332132, "grad_norm": 0.2696736454963684, "learning_rate": 0.0003, "loss": 4.9343, "step": 9530 }, { "epoch": 0.19525730551061782, "grad_norm": 0.24814894795417786, "learning_rate": 0.0003, "loss": 4.9781, "step": 9535 }, { "epoch": 0.19535969528802244, "grad_norm": 0.24902163445949554, "learning_rate": 0.0003, "loss": 4.922, "step": 9540 }, { "epoch": 0.19546208506542706, "grad_norm": 0.23567412793636322, "learning_rate": 0.0003, "loss": 4.9178, "step": 9545 }, { "epoch": 0.19556447484283168, "grad_norm": 0.23954512178897858, "learning_rate": 0.0003, "loss": 4.9485, "step": 9550 }, { "epoch": 0.1956668646202363, "grad_norm": 0.26197537779808044, "learning_rate": 0.0003, "loss": 4.9247, "step": 9555 }, { "epoch": 0.19576925439764095, "grad_norm": 0.23075419664382935, "learning_rate": 0.0003, "loss": 4.946, "step": 9560 }, { "epoch": 0.19587164417504557, "grad_norm": 0.29694515466690063, "learning_rate": 0.0003, "loss": 4.9019, "step": 9565 }, { "epoch": 0.1959740339524502, "grad_norm": 0.24621470272541046, "learning_rate": 0.0003, "loss": 4.8725, "step": 9570 }, { "epoch": 0.19607642372985482, "grad_norm": 0.2673153877258301, "learning_rate": 0.0003, "loss": 4.9209, "step": 9575 }, { "epoch": 0.19617881350725944, "grad_norm": 0.23953528702259064, "learning_rate": 0.0003, "loss": 4.953, "step": 9580 }, { "epoch": 0.19628120328466406, "grad_norm": 0.23474064469337463, "learning_rate": 0.0003, "loss": 4.9477, "step": 9585 }, { "epoch": 0.19638359306206868, "grad_norm": 0.24345049262046814, "learning_rate": 0.0003, "loss": 4.9062, "step": 9590 }, { "epoch": 0.1964859828394733, "grad_norm": 0.2565402686595917, "learning_rate": 0.0003, "loss": 4.9048, "step": 9595 }, { "epoch": 0.19658837261687792, "grad_norm": 0.2397555708885193, "learning_rate": 0.0003, "loss": 4.9502, "step": 9600 }, { "epoch": 0.19669076239428254, "grad_norm": 0.26241710782051086, "learning_rate": 0.0003, "loss": 4.9442, "step": 9605 }, { "epoch": 0.1967931521716872, "grad_norm": 0.32985183596611023, "learning_rate": 0.0003, "loss": 4.8082, "step": 9610 }, { "epoch": 0.1968955419490918, "grad_norm": 0.2346508502960205, "learning_rate": 0.0003, "loss": 4.934, "step": 9615 }, { "epoch": 0.19699793172649643, "grad_norm": 0.2326173037290573, "learning_rate": 0.0003, "loss": 4.97, "step": 9620 }, { "epoch": 0.19710032150390105, "grad_norm": 0.22407038509845734, "learning_rate": 0.0003, "loss": 4.9392, "step": 9625 }, { "epoch": 0.19720271128130568, "grad_norm": 0.23821721971035004, "learning_rate": 0.0003, "loss": 4.8939, "step": 9630 }, { "epoch": 0.1973051010587103, "grad_norm": 0.3187132775783539, "learning_rate": 0.0003, "loss": 4.9532, "step": 9635 }, { "epoch": 0.19740749083611492, "grad_norm": 0.2720498740673065, "learning_rate": 0.0003, "loss": 4.9322, "step": 9640 }, { "epoch": 0.19750988061351954, "grad_norm": 0.2887953221797943, "learning_rate": 0.0003, "loss": 4.9195, "step": 9645 }, { "epoch": 0.19761227039092416, "grad_norm": 0.2539193034172058, "learning_rate": 0.0003, "loss": 4.9229, "step": 9650 }, { "epoch": 0.19771466016832878, "grad_norm": 0.2742467224597931, "learning_rate": 0.0003, "loss": 4.919, "step": 9655 }, { "epoch": 0.19781704994573343, "grad_norm": 0.23102837800979614, "learning_rate": 0.0003, "loss": 4.924, "step": 9660 }, { "epoch": 0.19791943972313805, "grad_norm": 0.2715983986854553, "learning_rate": 0.0003, "loss": 4.9037, "step": 9665 }, { "epoch": 0.19802182950054267, "grad_norm": 0.24073027074337006, "learning_rate": 0.0003, "loss": 4.9378, "step": 9670 }, { "epoch": 0.1981242192779473, "grad_norm": 0.2399149239063263, "learning_rate": 0.0003, "loss": 4.9259, "step": 9675 }, { "epoch": 0.19822660905535192, "grad_norm": 0.2453446388244629, "learning_rate": 0.0003, "loss": 4.9253, "step": 9680 }, { "epoch": 0.19832899883275654, "grad_norm": 0.22920511662960052, "learning_rate": 0.0003, "loss": 4.931, "step": 9685 }, { "epoch": 0.19843138861016116, "grad_norm": 0.24713505804538727, "learning_rate": 0.0003, "loss": 4.8891, "step": 9690 }, { "epoch": 0.19853377838756578, "grad_norm": 0.22313618659973145, "learning_rate": 0.0003, "loss": 4.9379, "step": 9695 }, { "epoch": 0.1986361681649704, "grad_norm": 0.21806932985782623, "learning_rate": 0.0003, "loss": 4.8942, "step": 9700 }, { "epoch": 0.19873855794237502, "grad_norm": 0.22739502787590027, "learning_rate": 0.0003, "loss": 4.9851, "step": 9705 }, { "epoch": 0.19884094771977967, "grad_norm": 0.22084246575832367, "learning_rate": 0.0003, "loss": 4.9006, "step": 9710 }, { "epoch": 0.1989433374971843, "grad_norm": 0.2365892231464386, "learning_rate": 0.0003, "loss": 4.887, "step": 9715 }, { "epoch": 0.1990457272745889, "grad_norm": 0.23465533554553986, "learning_rate": 0.0003, "loss": 4.9341, "step": 9720 }, { "epoch": 0.19914811705199353, "grad_norm": 0.22784896194934845, "learning_rate": 0.0003, "loss": 4.9387, "step": 9725 }, { "epoch": 0.19925050682939816, "grad_norm": 0.2590842545032501, "learning_rate": 0.0003, "loss": 4.8865, "step": 9730 }, { "epoch": 0.19935289660680278, "grad_norm": 0.25291505455970764, "learning_rate": 0.0003, "loss": 4.9147, "step": 9735 }, { "epoch": 0.1994552863842074, "grad_norm": 0.2361178994178772, "learning_rate": 0.0003, "loss": 4.8865, "step": 9740 }, { "epoch": 0.19955767616161202, "grad_norm": 0.25283992290496826, "learning_rate": 0.0003, "loss": 4.8911, "step": 9745 }, { "epoch": 0.19966006593901664, "grad_norm": 0.2415904700756073, "learning_rate": 0.0003, "loss": 4.8535, "step": 9750 }, { "epoch": 0.19976245571642126, "grad_norm": 0.249093160033226, "learning_rate": 0.0003, "loss": 4.8886, "step": 9755 }, { "epoch": 0.1998648454938259, "grad_norm": 0.23306076228618622, "learning_rate": 0.0003, "loss": 4.9647, "step": 9760 }, { "epoch": 0.19996723527123053, "grad_norm": 0.2572837471961975, "learning_rate": 0.0003, "loss": 4.8966, "step": 9765 }, { "epoch": 0.20006962504863515, "grad_norm": 0.24322238564491272, "learning_rate": 0.0003, "loss": 4.9127, "step": 9770 }, { "epoch": 0.20017201482603977, "grad_norm": 0.35075312852859497, "learning_rate": 0.0003, "loss": 4.9215, "step": 9775 }, { "epoch": 0.2002744046034444, "grad_norm": 0.2796444892883301, "learning_rate": 0.0003, "loss": 4.9081, "step": 9780 }, { "epoch": 0.20037679438084902, "grad_norm": 0.2839694619178772, "learning_rate": 0.0003, "loss": 4.9978, "step": 9785 }, { "epoch": 0.20047918415825364, "grad_norm": 0.24952860176563263, "learning_rate": 0.0003, "loss": 4.9422, "step": 9790 }, { "epoch": 0.20058157393565826, "grad_norm": 0.2544446289539337, "learning_rate": 0.0003, "loss": 4.9141, "step": 9795 }, { "epoch": 0.20068396371306288, "grad_norm": 0.2726847231388092, "learning_rate": 0.0003, "loss": 4.9476, "step": 9800 }, { "epoch": 0.2007863534904675, "grad_norm": 0.27150022983551025, "learning_rate": 0.0003, "loss": 4.9017, "step": 9805 }, { "epoch": 0.20088874326787212, "grad_norm": 0.253474622964859, "learning_rate": 0.0003, "loss": 4.9364, "step": 9810 }, { "epoch": 0.20099113304527677, "grad_norm": 0.3557785749435425, "learning_rate": 0.0003, "loss": 4.8741, "step": 9815 }, { "epoch": 0.2010935228226814, "grad_norm": 0.2604922652244568, "learning_rate": 0.0003, "loss": 4.9599, "step": 9820 }, { "epoch": 0.201195912600086, "grad_norm": 0.32582342624664307, "learning_rate": 0.0003, "loss": 4.9457, "step": 9825 }, { "epoch": 0.20129830237749063, "grad_norm": 0.2412978559732437, "learning_rate": 0.0003, "loss": 4.9239, "step": 9830 }, { "epoch": 0.20140069215489526, "grad_norm": 0.24473164975643158, "learning_rate": 0.0003, "loss": 4.9257, "step": 9835 }, { "epoch": 0.20150308193229988, "grad_norm": 0.2287527620792389, "learning_rate": 0.0003, "loss": 4.8948, "step": 9840 }, { "epoch": 0.2016054717097045, "grad_norm": 0.22850751876831055, "learning_rate": 0.0003, "loss": 4.9297, "step": 9845 }, { "epoch": 0.20170786148710912, "grad_norm": 0.23829907178878784, "learning_rate": 0.0003, "loss": 4.9754, "step": 9850 }, { "epoch": 0.20181025126451374, "grad_norm": 0.23911869525909424, "learning_rate": 0.0003, "loss": 4.9296, "step": 9855 }, { "epoch": 0.20191264104191836, "grad_norm": 0.24259842932224274, "learning_rate": 0.0003, "loss": 4.9028, "step": 9860 }, { "epoch": 0.202015030819323, "grad_norm": 0.26722252368927, "learning_rate": 0.0003, "loss": 4.8914, "step": 9865 }, { "epoch": 0.20211742059672763, "grad_norm": 0.25296705961227417, "learning_rate": 0.0003, "loss": 4.9552, "step": 9870 }, { "epoch": 0.20221981037413225, "grad_norm": 0.2500793933868408, "learning_rate": 0.0003, "loss": 4.956, "step": 9875 }, { "epoch": 0.20232220015153687, "grad_norm": 0.2558973729610443, "learning_rate": 0.0003, "loss": 4.9346, "step": 9880 }, { "epoch": 0.2024245899289415, "grad_norm": 0.25476959347724915, "learning_rate": 0.0003, "loss": 4.9359, "step": 9885 }, { "epoch": 0.20252697970634612, "grad_norm": 0.23200826346874237, "learning_rate": 0.0003, "loss": 4.925, "step": 9890 }, { "epoch": 0.20262936948375074, "grad_norm": 0.9324480891227722, "learning_rate": 0.0003, "loss": 4.9134, "step": 9895 }, { "epoch": 0.20273175926115536, "grad_norm": 0.28359031677246094, "learning_rate": 0.0003, "loss": 4.9316, "step": 9900 }, { "epoch": 0.20283414903855998, "grad_norm": 0.2840028405189514, "learning_rate": 0.0003, "loss": 4.9371, "step": 9905 }, { "epoch": 0.2029365388159646, "grad_norm": 0.2340623140335083, "learning_rate": 0.0003, "loss": 4.9496, "step": 9910 }, { "epoch": 0.20303892859336925, "grad_norm": 0.2449018359184265, "learning_rate": 0.0003, "loss": 4.9336, "step": 9915 }, { "epoch": 0.20314131837077387, "grad_norm": 0.23518037796020508, "learning_rate": 0.0003, "loss": 4.9661, "step": 9920 }, { "epoch": 0.2032437081481785, "grad_norm": 0.2316855490207672, "learning_rate": 0.0003, "loss": 4.9192, "step": 9925 }, { "epoch": 0.2033460979255831, "grad_norm": 0.23387083411216736, "learning_rate": 0.0003, "loss": 4.919, "step": 9930 }, { "epoch": 0.20344848770298773, "grad_norm": 0.22854074835777283, "learning_rate": 0.0003, "loss": 4.9261, "step": 9935 }, { "epoch": 0.20355087748039236, "grad_norm": 0.24308951199054718, "learning_rate": 0.0003, "loss": 4.8899, "step": 9940 }, { "epoch": 0.20365326725779698, "grad_norm": 0.2530437111854553, "learning_rate": 0.0003, "loss": 4.9501, "step": 9945 }, { "epoch": 0.2037556570352016, "grad_norm": 0.23754967749118805, "learning_rate": 0.0003, "loss": 4.9646, "step": 9950 }, { "epoch": 0.20385804681260622, "grad_norm": 0.28361964225769043, "learning_rate": 0.0003, "loss": 4.8772, "step": 9955 }, { "epoch": 0.20396043659001084, "grad_norm": 0.23497337102890015, "learning_rate": 0.0003, "loss": 4.9215, "step": 9960 }, { "epoch": 0.2040628263674155, "grad_norm": 0.24981237947940826, "learning_rate": 0.0003, "loss": 4.8853, "step": 9965 }, { "epoch": 0.2041652161448201, "grad_norm": 0.25962939858436584, "learning_rate": 0.0003, "loss": 4.929, "step": 9970 }, { "epoch": 0.20426760592222473, "grad_norm": 0.24603718519210815, "learning_rate": 0.0003, "loss": 4.8615, "step": 9975 }, { "epoch": 0.20436999569962935, "grad_norm": 0.23433342576026917, "learning_rate": 0.0003, "loss": 4.9285, "step": 9980 }, { "epoch": 0.20447238547703397, "grad_norm": 0.251792848110199, "learning_rate": 0.0003, "loss": 4.9216, "step": 9985 }, { "epoch": 0.2045747752544386, "grad_norm": 0.2492300122976303, "learning_rate": 0.0003, "loss": 4.9555, "step": 9990 }, { "epoch": 0.20467716503184322, "grad_norm": 0.3164156675338745, "learning_rate": 0.0003, "loss": 4.879, "step": 9995 }, { "epoch": 0.20477955480924784, "grad_norm": 0.289057195186615, "learning_rate": 0.0003, "loss": 4.9641, "step": 10000 }, { "epoch": 0.20477955480924784, "eval_loss": 4.946022033691406, "eval_runtime": 17.2507, "eval_samples_per_second": 33.738, "eval_steps_per_second": 4.232, "step": 10000 }, { "epoch": 0.20488194458665246, "grad_norm": 0.3100838363170624, "learning_rate": 0.0003, "loss": 4.9897, "step": 10005 }, { "epoch": 0.20498433436405708, "grad_norm": 0.2570136487483978, "learning_rate": 0.0003, "loss": 4.9455, "step": 10010 }, { "epoch": 0.20508672414146173, "grad_norm": 0.23518770933151245, "learning_rate": 0.0003, "loss": 4.9661, "step": 10015 }, { "epoch": 0.20518911391886635, "grad_norm": 0.23959320783615112, "learning_rate": 0.0003, "loss": 4.9419, "step": 10020 }, { "epoch": 0.20529150369627097, "grad_norm": 0.32576480507850647, "learning_rate": 0.0003, "loss": 4.9426, "step": 10025 }, { "epoch": 0.2053938934736756, "grad_norm": 0.2734343409538269, "learning_rate": 0.0003, "loss": 4.9268, "step": 10030 }, { "epoch": 0.20549628325108021, "grad_norm": 0.32146018743515015, "learning_rate": 0.0003, "loss": 4.9162, "step": 10035 }, { "epoch": 0.20559867302848484, "grad_norm": 0.24999310076236725, "learning_rate": 0.0003, "loss": 4.9229, "step": 10040 }, { "epoch": 0.20570106280588946, "grad_norm": 0.2612939774990082, "learning_rate": 0.0003, "loss": 4.9008, "step": 10045 }, { "epoch": 0.20580345258329408, "grad_norm": 0.23676054179668427, "learning_rate": 0.0003, "loss": 4.9459, "step": 10050 }, { "epoch": 0.2059058423606987, "grad_norm": 0.23431552946567535, "learning_rate": 0.0003, "loss": 4.9652, "step": 10055 }, { "epoch": 0.20600823213810332, "grad_norm": 0.2386656105518341, "learning_rate": 0.0003, "loss": 4.9272, "step": 10060 }, { "epoch": 0.20611062191550797, "grad_norm": 0.2440197616815567, "learning_rate": 0.0003, "loss": 4.923, "step": 10065 }, { "epoch": 0.2062130116929126, "grad_norm": 0.24586227536201477, "learning_rate": 0.0003, "loss": 4.9104, "step": 10070 }, { "epoch": 0.2063154014703172, "grad_norm": 0.2599840462207794, "learning_rate": 0.0003, "loss": 4.9388, "step": 10075 }, { "epoch": 0.20641779124772183, "grad_norm": 0.2490772008895874, "learning_rate": 0.0003, "loss": 4.9334, "step": 10080 }, { "epoch": 0.20652018102512645, "grad_norm": 0.22175948321819305, "learning_rate": 0.0003, "loss": 4.9286, "step": 10085 }, { "epoch": 0.20662257080253107, "grad_norm": 0.23675376176834106, "learning_rate": 0.0003, "loss": 4.9357, "step": 10090 }, { "epoch": 0.2067249605799357, "grad_norm": 0.2420710176229477, "learning_rate": 0.0003, "loss": 4.9201, "step": 10095 }, { "epoch": 0.20682735035734032, "grad_norm": 0.2561867833137512, "learning_rate": 0.0003, "loss": 4.9254, "step": 10100 }, { "epoch": 0.20692974013474494, "grad_norm": 0.22612594068050385, "learning_rate": 0.0003, "loss": 4.9592, "step": 10105 }, { "epoch": 0.20703212991214956, "grad_norm": 0.23760299384593964, "learning_rate": 0.0003, "loss": 4.8737, "step": 10110 }, { "epoch": 0.2071345196895542, "grad_norm": 0.26986318826675415, "learning_rate": 0.0003, "loss": 4.9264, "step": 10115 }, { "epoch": 0.20723690946695883, "grad_norm": 0.24083851277828217, "learning_rate": 0.0003, "loss": 4.8706, "step": 10120 }, { "epoch": 0.20733929924436345, "grad_norm": 0.2366999238729477, "learning_rate": 0.0003, "loss": 4.8748, "step": 10125 }, { "epoch": 0.20744168902176807, "grad_norm": 0.2339482456445694, "learning_rate": 0.0003, "loss": 4.9069, "step": 10130 }, { "epoch": 0.2075440787991727, "grad_norm": 0.2478560507297516, "learning_rate": 0.0003, "loss": 4.8885, "step": 10135 }, { "epoch": 0.20764646857657731, "grad_norm": 0.23827484250068665, "learning_rate": 0.0003, "loss": 4.9083, "step": 10140 }, { "epoch": 0.20774885835398194, "grad_norm": 0.24935999512672424, "learning_rate": 0.0003, "loss": 4.9239, "step": 10145 }, { "epoch": 0.20785124813138656, "grad_norm": 0.2415740042924881, "learning_rate": 0.0003, "loss": 4.9198, "step": 10150 }, { "epoch": 0.20795363790879118, "grad_norm": 0.2714764475822449, "learning_rate": 0.0003, "loss": 4.953, "step": 10155 }, { "epoch": 0.2080560276861958, "grad_norm": 0.24922576546669006, "learning_rate": 0.0003, "loss": 4.954, "step": 10160 }, { "epoch": 0.20815841746360042, "grad_norm": 0.2823648452758789, "learning_rate": 0.0003, "loss": 4.8743, "step": 10165 }, { "epoch": 0.20826080724100507, "grad_norm": 0.2266201674938202, "learning_rate": 0.0003, "loss": 4.9497, "step": 10170 }, { "epoch": 0.2083631970184097, "grad_norm": 0.24112917482852936, "learning_rate": 0.0003, "loss": 4.8825, "step": 10175 }, { "epoch": 0.2084655867958143, "grad_norm": 0.23484036326408386, "learning_rate": 0.0003, "loss": 4.9199, "step": 10180 }, { "epoch": 0.20856797657321893, "grad_norm": 0.24271684885025024, "learning_rate": 0.0003, "loss": 4.961, "step": 10185 }, { "epoch": 0.20867036635062355, "grad_norm": 0.24168355762958527, "learning_rate": 0.0003, "loss": 4.9199, "step": 10190 }, { "epoch": 0.20877275612802818, "grad_norm": 0.24393868446350098, "learning_rate": 0.0003, "loss": 4.9405, "step": 10195 }, { "epoch": 0.2088751459054328, "grad_norm": 0.23558944463729858, "learning_rate": 0.0003, "loss": 4.9036, "step": 10200 }, { "epoch": 0.20897753568283742, "grad_norm": 0.24454809725284576, "learning_rate": 0.0003, "loss": 4.899, "step": 10205 }, { "epoch": 0.20907992546024204, "grad_norm": 0.27660131454467773, "learning_rate": 0.0003, "loss": 4.9094, "step": 10210 }, { "epoch": 0.20918231523764666, "grad_norm": 0.2513921856880188, "learning_rate": 0.0003, "loss": 4.8493, "step": 10215 }, { "epoch": 0.2092847050150513, "grad_norm": 0.23608632385730743, "learning_rate": 0.0003, "loss": 4.9348, "step": 10220 }, { "epoch": 0.20938709479245593, "grad_norm": 0.26476597785949707, "learning_rate": 0.0003, "loss": 4.8984, "step": 10225 }, { "epoch": 0.20948948456986055, "grad_norm": 0.24490049481391907, "learning_rate": 0.0003, "loss": 4.8976, "step": 10230 }, { "epoch": 0.20959187434726517, "grad_norm": 0.23808947205543518, "learning_rate": 0.0003, "loss": 4.8992, "step": 10235 }, { "epoch": 0.2096942641246698, "grad_norm": 0.24560321867465973, "learning_rate": 0.0003, "loss": 4.8878, "step": 10240 }, { "epoch": 0.20979665390207441, "grad_norm": 0.23638848960399628, "learning_rate": 0.0003, "loss": 4.9148, "step": 10245 }, { "epoch": 0.20989904367947904, "grad_norm": 0.24860762059688568, "learning_rate": 0.0003, "loss": 4.9318, "step": 10250 }, { "epoch": 0.21000143345688366, "grad_norm": 0.2630869448184967, "learning_rate": 0.0003, "loss": 4.957, "step": 10255 }, { "epoch": 0.21010382323428828, "grad_norm": 0.348331481218338, "learning_rate": 0.0003, "loss": 4.8954, "step": 10260 }, { "epoch": 0.2102062130116929, "grad_norm": 0.23188897967338562, "learning_rate": 0.0003, "loss": 4.9356, "step": 10265 }, { "epoch": 0.21030860278909755, "grad_norm": 0.23428316414356232, "learning_rate": 0.0003, "loss": 4.9342, "step": 10270 }, { "epoch": 0.21041099256650217, "grad_norm": 0.25637906789779663, "learning_rate": 0.0003, "loss": 4.9023, "step": 10275 }, { "epoch": 0.2105133823439068, "grad_norm": 0.2521513104438782, "learning_rate": 0.0003, "loss": 4.8935, "step": 10280 }, { "epoch": 0.2106157721213114, "grad_norm": 0.23467016220092773, "learning_rate": 0.0003, "loss": 4.932, "step": 10285 }, { "epoch": 0.21071816189871603, "grad_norm": 0.23410767316818237, "learning_rate": 0.0003, "loss": 4.9049, "step": 10290 }, { "epoch": 0.21082055167612065, "grad_norm": 0.22825035452842712, "learning_rate": 0.0003, "loss": 4.9483, "step": 10295 }, { "epoch": 0.21092294145352528, "grad_norm": 0.2342665195465088, "learning_rate": 0.0003, "loss": 4.8045, "step": 10300 }, { "epoch": 0.2110253312309299, "grad_norm": 0.24313782155513763, "learning_rate": 0.0003, "loss": 4.9105, "step": 10305 }, { "epoch": 0.21112772100833452, "grad_norm": 0.24948935210704803, "learning_rate": 0.0003, "loss": 4.9, "step": 10310 }, { "epoch": 0.21123011078573914, "grad_norm": 0.24122211337089539, "learning_rate": 0.0003, "loss": 4.9418, "step": 10315 }, { "epoch": 0.2113325005631438, "grad_norm": 0.23953549563884735, "learning_rate": 0.0003, "loss": 4.923, "step": 10320 }, { "epoch": 0.2114348903405484, "grad_norm": 0.36018356680870056, "learning_rate": 0.0003, "loss": 4.9288, "step": 10325 }, { "epoch": 0.21153728011795303, "grad_norm": 0.23670226335525513, "learning_rate": 0.0003, "loss": 4.9307, "step": 10330 }, { "epoch": 0.21163966989535765, "grad_norm": 0.23643650114536285, "learning_rate": 0.0003, "loss": 4.9081, "step": 10335 }, { "epoch": 0.21174205967276227, "grad_norm": 0.24532055854797363, "learning_rate": 0.0003, "loss": 4.8971, "step": 10340 }, { "epoch": 0.2118444494501669, "grad_norm": 0.2591317594051361, "learning_rate": 0.0003, "loss": 4.9349, "step": 10345 }, { "epoch": 0.21194683922757152, "grad_norm": 0.22916889190673828, "learning_rate": 0.0003, "loss": 4.9148, "step": 10350 }, { "epoch": 0.21204922900497614, "grad_norm": 0.22888129949569702, "learning_rate": 0.0003, "loss": 4.9326, "step": 10355 }, { "epoch": 0.21215161878238076, "grad_norm": 0.22662124037742615, "learning_rate": 0.0003, "loss": 4.9134, "step": 10360 }, { "epoch": 0.21225400855978538, "grad_norm": 0.24721461534500122, "learning_rate": 0.0003, "loss": 4.8911, "step": 10365 }, { "epoch": 0.21235639833719003, "grad_norm": 0.23616601526737213, "learning_rate": 0.0003, "loss": 4.9176, "step": 10370 }, { "epoch": 0.21245878811459465, "grad_norm": 0.23695924878120422, "learning_rate": 0.0003, "loss": 4.95, "step": 10375 }, { "epoch": 0.21256117789199927, "grad_norm": 0.24746832251548767, "learning_rate": 0.0003, "loss": 4.9246, "step": 10380 }, { "epoch": 0.2126635676694039, "grad_norm": 0.24671892821788788, "learning_rate": 0.0003, "loss": 4.8928, "step": 10385 }, { "epoch": 0.2127659574468085, "grad_norm": 0.2777179777622223, "learning_rate": 0.0003, "loss": 4.8935, "step": 10390 }, { "epoch": 0.21286834722421313, "grad_norm": 0.23937489092350006, "learning_rate": 0.0003, "loss": 4.8945, "step": 10395 }, { "epoch": 0.21297073700161775, "grad_norm": 0.2379358410835266, "learning_rate": 0.0003, "loss": 4.8815, "step": 10400 }, { "epoch": 0.21307312677902238, "grad_norm": 0.2482132464647293, "learning_rate": 0.0003, "loss": 4.8642, "step": 10405 }, { "epoch": 0.213175516556427, "grad_norm": 0.25206390023231506, "learning_rate": 0.0003, "loss": 4.9147, "step": 10410 }, { "epoch": 0.21327790633383162, "grad_norm": 0.23628085851669312, "learning_rate": 0.0003, "loss": 4.8867, "step": 10415 }, { "epoch": 0.21338029611123627, "grad_norm": 0.22866074740886688, "learning_rate": 0.0003, "loss": 4.8927, "step": 10420 }, { "epoch": 0.2134826858886409, "grad_norm": 0.24655304849147797, "learning_rate": 0.0003, "loss": 4.8836, "step": 10425 }, { "epoch": 0.2135850756660455, "grad_norm": 0.36821508407592773, "learning_rate": 0.0003, "loss": 4.9246, "step": 10430 }, { "epoch": 0.21368746544345013, "grad_norm": 0.24489273130893707, "learning_rate": 0.0003, "loss": 4.8621, "step": 10435 }, { "epoch": 0.21378985522085475, "grad_norm": 0.22568266093730927, "learning_rate": 0.0003, "loss": 4.883, "step": 10440 }, { "epoch": 0.21389224499825937, "grad_norm": 0.24610939621925354, "learning_rate": 0.0003, "loss": 4.9333, "step": 10445 }, { "epoch": 0.213994634775664, "grad_norm": 0.2335331290960312, "learning_rate": 0.0003, "loss": 4.8785, "step": 10450 }, { "epoch": 0.21409702455306862, "grad_norm": 0.2339749038219452, "learning_rate": 0.0003, "loss": 4.9098, "step": 10455 }, { "epoch": 0.21419941433047324, "grad_norm": 0.2380892038345337, "learning_rate": 0.0003, "loss": 4.8889, "step": 10460 }, { "epoch": 0.21430180410787786, "grad_norm": 0.24296675622463226, "learning_rate": 0.0003, "loss": 4.9245, "step": 10465 }, { "epoch": 0.2144041938852825, "grad_norm": 0.24508559703826904, "learning_rate": 0.0003, "loss": 4.8672, "step": 10470 }, { "epoch": 0.21450658366268713, "grad_norm": 0.23185114562511444, "learning_rate": 0.0003, "loss": 4.896, "step": 10475 }, { "epoch": 0.21460897344009175, "grad_norm": 0.2594311237335205, "learning_rate": 0.0003, "loss": 4.8997, "step": 10480 }, { "epoch": 0.21471136321749637, "grad_norm": 0.24049535393714905, "learning_rate": 0.0003, "loss": 4.9123, "step": 10485 }, { "epoch": 0.214813752994901, "grad_norm": 0.24497857689857483, "learning_rate": 0.0003, "loss": 4.9061, "step": 10490 }, { "epoch": 0.2149161427723056, "grad_norm": 0.2623315155506134, "learning_rate": 0.0003, "loss": 4.8854, "step": 10495 }, { "epoch": 0.21501853254971023, "grad_norm": 0.2425769716501236, "learning_rate": 0.0003, "loss": 4.8957, "step": 10500 }, { "epoch": 0.21512092232711486, "grad_norm": 0.27547577023506165, "learning_rate": 0.0003, "loss": 4.9533, "step": 10505 }, { "epoch": 0.21522331210451948, "grad_norm": 0.24182581901550293, "learning_rate": 0.0003, "loss": 4.877, "step": 10510 }, { "epoch": 0.2153257018819241, "grad_norm": 0.2818247079849243, "learning_rate": 0.0003, "loss": 4.9121, "step": 10515 }, { "epoch": 0.21542809165932875, "grad_norm": 0.2383284568786621, "learning_rate": 0.0003, "loss": 4.9165, "step": 10520 }, { "epoch": 0.21553048143673337, "grad_norm": 0.2394833266735077, "learning_rate": 0.0003, "loss": 4.9228, "step": 10525 }, { "epoch": 0.215632871214138, "grad_norm": 0.2611193358898163, "learning_rate": 0.0003, "loss": 4.9472, "step": 10530 }, { "epoch": 0.2157352609915426, "grad_norm": 0.2499607503414154, "learning_rate": 0.0003, "loss": 4.9171, "step": 10535 }, { "epoch": 0.21583765076894723, "grad_norm": 0.2569971978664398, "learning_rate": 0.0003, "loss": 4.9122, "step": 10540 }, { "epoch": 0.21594004054635185, "grad_norm": 0.25128480792045593, "learning_rate": 0.0003, "loss": 4.8578, "step": 10545 }, { "epoch": 0.21604243032375647, "grad_norm": 0.2561035752296448, "learning_rate": 0.0003, "loss": 4.9266, "step": 10550 }, { "epoch": 0.2161448201011611, "grad_norm": 0.2700226604938507, "learning_rate": 0.0003, "loss": 4.8567, "step": 10555 }, { "epoch": 0.21624720987856572, "grad_norm": 0.2432553470134735, "learning_rate": 0.0003, "loss": 4.9168, "step": 10560 }, { "epoch": 0.21634959965597034, "grad_norm": 0.2681170403957367, "learning_rate": 0.0003, "loss": 4.9353, "step": 10565 }, { "epoch": 0.21645198943337496, "grad_norm": 0.2788884937763214, "learning_rate": 0.0003, "loss": 4.888, "step": 10570 }, { "epoch": 0.2165543792107796, "grad_norm": 0.24926117062568665, "learning_rate": 0.0003, "loss": 4.9003, "step": 10575 }, { "epoch": 0.21665676898818423, "grad_norm": 0.24506379663944244, "learning_rate": 0.0003, "loss": 4.8823, "step": 10580 }, { "epoch": 0.21675915876558885, "grad_norm": 0.22609646618366241, "learning_rate": 0.0003, "loss": 4.942, "step": 10585 }, { "epoch": 0.21686154854299347, "grad_norm": 0.23347771167755127, "learning_rate": 0.0003, "loss": 4.913, "step": 10590 }, { "epoch": 0.2169639383203981, "grad_norm": 0.2599465548992157, "learning_rate": 0.0003, "loss": 4.8689, "step": 10595 }, { "epoch": 0.2170663280978027, "grad_norm": 0.24910423159599304, "learning_rate": 0.0003, "loss": 4.9238, "step": 10600 }, { "epoch": 0.21716871787520733, "grad_norm": 0.2776698172092438, "learning_rate": 0.0003, "loss": 4.9034, "step": 10605 }, { "epoch": 0.21727110765261196, "grad_norm": 0.27069202065467834, "learning_rate": 0.0003, "loss": 4.9207, "step": 10610 }, { "epoch": 0.21737349743001658, "grad_norm": 0.23688560724258423, "learning_rate": 0.0003, "loss": 4.9181, "step": 10615 }, { "epoch": 0.2174758872074212, "grad_norm": 0.24356743693351746, "learning_rate": 0.0003, "loss": 4.8475, "step": 10620 }, { "epoch": 0.21757827698482585, "grad_norm": 0.27072104811668396, "learning_rate": 0.0003, "loss": 4.918, "step": 10625 }, { "epoch": 0.21768066676223047, "grad_norm": 0.23528029024600983, "learning_rate": 0.0003, "loss": 4.9577, "step": 10630 }, { "epoch": 0.2177830565396351, "grad_norm": 0.2385890930891037, "learning_rate": 0.0003, "loss": 4.8956, "step": 10635 }, { "epoch": 0.2178854463170397, "grad_norm": 0.23565299808979034, "learning_rate": 0.0003, "loss": 4.9131, "step": 10640 }, { "epoch": 0.21798783609444433, "grad_norm": 0.2600311040878296, "learning_rate": 0.0003, "loss": 4.9202, "step": 10645 }, { "epoch": 0.21809022587184895, "grad_norm": 0.25743547081947327, "learning_rate": 0.0003, "loss": 4.8687, "step": 10650 }, { "epoch": 0.21819261564925357, "grad_norm": 0.2406614124774933, "learning_rate": 0.0003, "loss": 4.8673, "step": 10655 }, { "epoch": 0.2182950054266582, "grad_norm": 0.23556888103485107, "learning_rate": 0.0003, "loss": 4.8713, "step": 10660 }, { "epoch": 0.21839739520406282, "grad_norm": 0.2352827936410904, "learning_rate": 0.0003, "loss": 4.8852, "step": 10665 }, { "epoch": 0.21849978498146744, "grad_norm": 0.23802189528942108, "learning_rate": 0.0003, "loss": 4.8555, "step": 10670 }, { "epoch": 0.2186021747588721, "grad_norm": 0.24062120914459229, "learning_rate": 0.0003, "loss": 4.8945, "step": 10675 }, { "epoch": 0.2187045645362767, "grad_norm": 0.2625252902507782, "learning_rate": 0.0003, "loss": 4.869, "step": 10680 }, { "epoch": 0.21880695431368133, "grad_norm": 0.2866668701171875, "learning_rate": 0.0003, "loss": 4.9274, "step": 10685 }, { "epoch": 0.21890934409108595, "grad_norm": 0.31666335463523865, "learning_rate": 0.0003, "loss": 4.8808, "step": 10690 }, { "epoch": 0.21901173386849057, "grad_norm": 0.2611194849014282, "learning_rate": 0.0003, "loss": 4.892, "step": 10695 }, { "epoch": 0.2191141236458952, "grad_norm": 0.3305256962776184, "learning_rate": 0.0003, "loss": 4.8437, "step": 10700 }, { "epoch": 0.2192165134232998, "grad_norm": 0.24971061944961548, "learning_rate": 0.0003, "loss": 4.8621, "step": 10705 }, { "epoch": 0.21931890320070443, "grad_norm": 0.2453843057155609, "learning_rate": 0.0003, "loss": 4.9466, "step": 10710 }, { "epoch": 0.21942129297810906, "grad_norm": 0.2334168702363968, "learning_rate": 0.0003, "loss": 4.9143, "step": 10715 }, { "epoch": 0.21952368275551368, "grad_norm": 0.24947424232959747, "learning_rate": 0.0003, "loss": 4.8124, "step": 10720 }, { "epoch": 0.21962607253291833, "grad_norm": 0.2462509423494339, "learning_rate": 0.0003, "loss": 4.8933, "step": 10725 }, { "epoch": 0.21972846231032295, "grad_norm": 0.2200932651758194, "learning_rate": 0.0003, "loss": 4.9114, "step": 10730 }, { "epoch": 0.21983085208772757, "grad_norm": 0.22099857032299042, "learning_rate": 0.0003, "loss": 4.9182, "step": 10735 }, { "epoch": 0.2199332418651322, "grad_norm": 0.25395599007606506, "learning_rate": 0.0003, "loss": 4.9027, "step": 10740 }, { "epoch": 0.2200356316425368, "grad_norm": 0.24311977624893188, "learning_rate": 0.0003, "loss": 4.8598, "step": 10745 }, { "epoch": 0.22013802141994143, "grad_norm": 0.24510981142520905, "learning_rate": 0.0003, "loss": 4.8418, "step": 10750 }, { "epoch": 0.22024041119734605, "grad_norm": 0.25221526622772217, "learning_rate": 0.0003, "loss": 4.9188, "step": 10755 }, { "epoch": 0.22034280097475067, "grad_norm": 0.22577425837516785, "learning_rate": 0.0003, "loss": 4.8804, "step": 10760 }, { "epoch": 0.2204451907521553, "grad_norm": 0.23756377398967743, "learning_rate": 0.0003, "loss": 4.8441, "step": 10765 }, { "epoch": 0.22054758052955992, "grad_norm": 0.2356066256761551, "learning_rate": 0.00029999999540266433, "loss": 4.9067, "step": 10770 }, { "epoch": 0.22064997030696457, "grad_norm": 0.23691512644290924, "learning_rate": 0.00029999996730783644, "loss": 4.9293, "step": 10775 }, { "epoch": 0.2207523600843692, "grad_norm": 0.22614943981170654, "learning_rate": 0.00029999991367226075, "loss": 4.9012, "step": 10780 }, { "epoch": 0.2208547498617738, "grad_norm": 0.23372440040111542, "learning_rate": 0.0002999998344959465, "loss": 4.9256, "step": 10785 }, { "epoch": 0.22095713963917843, "grad_norm": 0.24283495545387268, "learning_rate": 0.000299999729778907, "loss": 4.8643, "step": 10790 }, { "epoch": 0.22105952941658305, "grad_norm": 0.27330297231674194, "learning_rate": 0.00029999959952116027, "loss": 4.897, "step": 10795 }, { "epoch": 0.22116191919398767, "grad_norm": 0.25471654534339905, "learning_rate": 0.00029999944372272833, "loss": 4.8822, "step": 10800 }, { "epoch": 0.2212643089713923, "grad_norm": 0.2778959274291992, "learning_rate": 0.00029999926238363783, "loss": 4.8658, "step": 10805 }, { "epoch": 0.22136669874879691, "grad_norm": 0.2582586705684662, "learning_rate": 0.00029999905550391956, "loss": 4.9524, "step": 10810 }, { "epoch": 0.22146908852620154, "grad_norm": 0.24232889711856842, "learning_rate": 0.0002999988230836088, "loss": 4.8638, "step": 10815 }, { "epoch": 0.22157147830360616, "grad_norm": 0.25435176491737366, "learning_rate": 0.0002999985651227451, "loss": 4.8973, "step": 10820 }, { "epoch": 0.2216738680810108, "grad_norm": 0.23583756387233734, "learning_rate": 0.0002999982816213724, "loss": 4.8829, "step": 10825 }, { "epoch": 0.22177625785841543, "grad_norm": 0.25342777371406555, "learning_rate": 0.0002999979725795389, "loss": 4.8806, "step": 10830 }, { "epoch": 0.22187864763582005, "grad_norm": 0.22450606524944305, "learning_rate": 0.00029999763799729733, "loss": 4.9066, "step": 10835 }, { "epoch": 0.22198103741322467, "grad_norm": 0.23883618414402008, "learning_rate": 0.0002999972778747046, "loss": 4.9482, "step": 10840 }, { "epoch": 0.2220834271906293, "grad_norm": 0.2697833776473999, "learning_rate": 0.0002999968922118221, "loss": 4.9225, "step": 10845 }, { "epoch": 0.2221858169680339, "grad_norm": 0.23498277366161346, "learning_rate": 0.00029999648100871535, "loss": 4.8926, "step": 10850 }, { "epoch": 0.22228820674543853, "grad_norm": 0.2613791525363922, "learning_rate": 0.0002999960442654545, "loss": 4.9152, "step": 10855 }, { "epoch": 0.22239059652284315, "grad_norm": 0.264046311378479, "learning_rate": 0.0002999955819821138, "loss": 4.9276, "step": 10860 }, { "epoch": 0.22249298630024777, "grad_norm": 0.2305372655391693, "learning_rate": 0.00029999509415877215, "loss": 4.8974, "step": 10865 }, { "epoch": 0.2225953760776524, "grad_norm": 0.22589483857154846, "learning_rate": 0.0002999945807955124, "loss": 4.8986, "step": 10870 }, { "epoch": 0.22269776585505704, "grad_norm": 0.24377058446407318, "learning_rate": 0.0002999940418924221, "loss": 4.8925, "step": 10875 }, { "epoch": 0.22280015563246167, "grad_norm": 0.23469287157058716, "learning_rate": 0.0002999934774495929, "loss": 4.8825, "step": 10880 }, { "epoch": 0.2229025454098663, "grad_norm": 0.2578478455543518, "learning_rate": 0.00029999288746712107, "loss": 4.8929, "step": 10885 }, { "epoch": 0.2230049351872709, "grad_norm": 0.24005863070487976, "learning_rate": 0.0002999922719451069, "loss": 4.8768, "step": 10890 }, { "epoch": 0.22310732496467553, "grad_norm": 0.3334100544452667, "learning_rate": 0.0002999916308836553, "loss": 4.89, "step": 10895 }, { "epoch": 0.22320971474208015, "grad_norm": 0.2511036992073059, "learning_rate": 0.00029999096428287537, "loss": 4.8613, "step": 10900 }, { "epoch": 0.22331210451948477, "grad_norm": 0.2527853548526764, "learning_rate": 0.0002999902721428807, "loss": 4.8834, "step": 10905 }, { "epoch": 0.2234144942968894, "grad_norm": 0.23672787845134735, "learning_rate": 0.000299989554463789, "loss": 4.9026, "step": 10910 }, { "epoch": 0.22351688407429401, "grad_norm": 0.26204660534858704, "learning_rate": 0.0002999888112457226, "loss": 4.9342, "step": 10915 }, { "epoch": 0.22361927385169864, "grad_norm": 0.24639709293842316, "learning_rate": 0.000299988042488808, "loss": 4.8823, "step": 10920 }, { "epoch": 0.22372166362910326, "grad_norm": 0.2559228539466858, "learning_rate": 0.000299987248193176, "loss": 4.9291, "step": 10925 }, { "epoch": 0.2238240534065079, "grad_norm": 0.2215857356786728, "learning_rate": 0.00029998642835896204, "loss": 4.8747, "step": 10930 }, { "epoch": 0.22392644318391253, "grad_norm": 0.23071123659610748, "learning_rate": 0.00029998558298630556, "loss": 4.8983, "step": 10935 }, { "epoch": 0.22402883296131715, "grad_norm": 0.24900750815868378, "learning_rate": 0.0002999847120753506, "loss": 4.9054, "step": 10940 }, { "epoch": 0.22413122273872177, "grad_norm": 0.2291107475757599, "learning_rate": 0.0002999838156262454, "loss": 4.8966, "step": 10945 }, { "epoch": 0.2242336125161264, "grad_norm": 0.24814331531524658, "learning_rate": 0.0002999828936391426, "loss": 4.855, "step": 10950 }, { "epoch": 0.224336002293531, "grad_norm": 0.24090422689914703, "learning_rate": 0.0002999819461141992, "loss": 4.8942, "step": 10955 }, { "epoch": 0.22443839207093563, "grad_norm": 0.22519832849502563, "learning_rate": 0.0002999809730515765, "loss": 4.9153, "step": 10960 }, { "epoch": 0.22454078184834025, "grad_norm": 0.29087841510772705, "learning_rate": 0.0002999799744514403, "loss": 4.9382, "step": 10965 }, { "epoch": 0.22464317162574488, "grad_norm": 0.2530803084373474, "learning_rate": 0.0002999789503139605, "loss": 4.8792, "step": 10970 }, { "epoch": 0.2247455614031495, "grad_norm": 0.26962804794311523, "learning_rate": 0.0002999779006393115, "loss": 4.8897, "step": 10975 }, { "epoch": 0.22484795118055415, "grad_norm": 0.2534395754337311, "learning_rate": 0.0002999768254276721, "loss": 4.9142, "step": 10980 }, { "epoch": 0.22495034095795877, "grad_norm": 0.27273815870285034, "learning_rate": 0.0002999757246792254, "loss": 4.9112, "step": 10985 }, { "epoch": 0.2250527307353634, "grad_norm": 0.2543729543685913, "learning_rate": 0.0002999745983941587, "loss": 4.8987, "step": 10990 }, { "epoch": 0.225155120512768, "grad_norm": 0.22888705134391785, "learning_rate": 0.00029997344657266387, "loss": 4.8616, "step": 10995 }, { "epoch": 0.22525751029017263, "grad_norm": 0.24257194995880127, "learning_rate": 0.00029997226921493693, "loss": 4.868, "step": 11000 }, { "epoch": 0.22525751029017263, "eval_loss": 4.919167518615723, "eval_runtime": 17.2454, "eval_samples_per_second": 33.748, "eval_steps_per_second": 4.233, "step": 11000 }, { "epoch": 0.22535990006757725, "grad_norm": 0.24592594802379608, "learning_rate": 0.0002999710663211785, "loss": 4.9788, "step": 11005 }, { "epoch": 0.22546228984498187, "grad_norm": 0.2423733025789261, "learning_rate": 0.0002999698378915933, "loss": 4.8767, "step": 11010 }, { "epoch": 0.2255646796223865, "grad_norm": 0.30392077565193176, "learning_rate": 0.00029996858392639054, "loss": 4.937, "step": 11015 }, { "epoch": 0.22566706939979111, "grad_norm": 0.2611886262893677, "learning_rate": 0.00029996730442578373, "loss": 4.8765, "step": 11020 }, { "epoch": 0.22576945917719574, "grad_norm": 0.24054093658924103, "learning_rate": 0.0002999659993899907, "loss": 4.931, "step": 11025 }, { "epoch": 0.22587184895460038, "grad_norm": 0.2398342490196228, "learning_rate": 0.00029996466881923367, "loss": 4.8824, "step": 11030 }, { "epoch": 0.225974238732005, "grad_norm": 0.25371286273002625, "learning_rate": 0.0002999633127137392, "loss": 4.9421, "step": 11035 }, { "epoch": 0.22607662850940963, "grad_norm": 0.26272109150886536, "learning_rate": 0.00029996193107373825, "loss": 4.8708, "step": 11040 }, { "epoch": 0.22617901828681425, "grad_norm": 0.24237163364887238, "learning_rate": 0.000299960523899466, "loss": 4.8722, "step": 11045 }, { "epoch": 0.22628140806421887, "grad_norm": 0.2590733766555786, "learning_rate": 0.0002999590911911621, "loss": 4.9091, "step": 11050 }, { "epoch": 0.2263837978416235, "grad_norm": 0.23286749422550201, "learning_rate": 0.00029995763294907044, "loss": 4.9017, "step": 11055 }, { "epoch": 0.2264861876190281, "grad_norm": 0.2581934630870819, "learning_rate": 0.0002999561491734394, "loss": 4.8817, "step": 11060 }, { "epoch": 0.22658857739643273, "grad_norm": 0.23137080669403076, "learning_rate": 0.00029995463986452156, "loss": 4.8804, "step": 11065 }, { "epoch": 0.22669096717383735, "grad_norm": 0.23512017726898193, "learning_rate": 0.0002999531050225739, "loss": 4.8719, "step": 11070 }, { "epoch": 0.22679335695124198, "grad_norm": 0.22481665015220642, "learning_rate": 0.00029995154464785786, "loss": 4.8657, "step": 11075 }, { "epoch": 0.22689574672864662, "grad_norm": 0.2558952569961548, "learning_rate": 0.00029994995874063905, "loss": 4.9027, "step": 11080 }, { "epoch": 0.22699813650605125, "grad_norm": 0.25101906061172485, "learning_rate": 0.0002999483473011875, "loss": 4.8701, "step": 11085 }, { "epoch": 0.22710052628345587, "grad_norm": 0.23361775279045105, "learning_rate": 0.00029994671032977763, "loss": 4.9379, "step": 11090 }, { "epoch": 0.2272029160608605, "grad_norm": 0.24297501146793365, "learning_rate": 0.0002999450478266881, "loss": 4.8984, "step": 11095 }, { "epoch": 0.2273053058382651, "grad_norm": 0.23249764740467072, "learning_rate": 0.0002999433597922021, "loss": 4.8988, "step": 11100 }, { "epoch": 0.22740769561566973, "grad_norm": 0.24362662434577942, "learning_rate": 0.00029994164622660697, "loss": 4.865, "step": 11105 }, { "epoch": 0.22751008539307435, "grad_norm": 0.2715204060077667, "learning_rate": 0.00029993990713019446, "loss": 4.8854, "step": 11110 }, { "epoch": 0.22761247517047897, "grad_norm": 0.2356962114572525, "learning_rate": 0.00029993814250326085, "loss": 4.8502, "step": 11115 }, { "epoch": 0.2277148649478836, "grad_norm": 0.2375289797782898, "learning_rate": 0.0002999363523461064, "loss": 4.9038, "step": 11120 }, { "epoch": 0.22781725472528822, "grad_norm": 0.26735198497772217, "learning_rate": 0.000299934536659036, "loss": 4.8722, "step": 11125 }, { "epoch": 0.22791964450269286, "grad_norm": 0.2644442617893219, "learning_rate": 0.00029993269544235885, "loss": 4.8949, "step": 11130 }, { "epoch": 0.22802203428009749, "grad_norm": 0.23039349913597107, "learning_rate": 0.00029993082869638843, "loss": 4.9066, "step": 11135 }, { "epoch": 0.2281244240575021, "grad_norm": 0.25083819031715393, "learning_rate": 0.0002999289364214426, "loss": 4.9457, "step": 11140 }, { "epoch": 0.22822681383490673, "grad_norm": 0.22685234248638153, "learning_rate": 0.0002999270186178435, "loss": 4.8967, "step": 11145 }, { "epoch": 0.22832920361231135, "grad_norm": 0.2520279884338379, "learning_rate": 0.00029992507528591784, "loss": 4.8651, "step": 11150 }, { "epoch": 0.22843159338971597, "grad_norm": 0.25449731945991516, "learning_rate": 0.00029992310642599633, "loss": 4.8679, "step": 11155 }, { "epoch": 0.2285339831671206, "grad_norm": 0.23374278843402863, "learning_rate": 0.0002999211120384143, "loss": 4.8691, "step": 11160 }, { "epoch": 0.2286363729445252, "grad_norm": 0.24310655891895294, "learning_rate": 0.00029991909212351135, "loss": 4.8958, "step": 11165 }, { "epoch": 0.22873876272192983, "grad_norm": 0.22431394457817078, "learning_rate": 0.00029991704668163135, "loss": 4.8878, "step": 11170 }, { "epoch": 0.22884115249933445, "grad_norm": 0.22711428999900818, "learning_rate": 0.00029991497571312263, "loss": 4.8992, "step": 11175 }, { "epoch": 0.2289435422767391, "grad_norm": 0.23749734461307526, "learning_rate": 0.0002999128792183378, "loss": 4.8963, "step": 11180 }, { "epoch": 0.22904593205414372, "grad_norm": 0.24261240661144257, "learning_rate": 0.0002999107571976338, "loss": 4.9011, "step": 11185 }, { "epoch": 0.22914832183154835, "grad_norm": 0.23800702393054962, "learning_rate": 0.00029990860965137207, "loss": 4.8943, "step": 11190 }, { "epoch": 0.22925071160895297, "grad_norm": 0.24408547580242157, "learning_rate": 0.0002999064365799182, "loss": 4.8213, "step": 11195 }, { "epoch": 0.2293531013863576, "grad_norm": 0.22900575399398804, "learning_rate": 0.00029990423798364216, "loss": 4.844, "step": 11200 }, { "epoch": 0.2294554911637622, "grad_norm": 0.2334994524717331, "learning_rate": 0.00029990201386291836, "loss": 4.8588, "step": 11205 }, { "epoch": 0.22955788094116683, "grad_norm": 0.2324739694595337, "learning_rate": 0.0002998997642181255, "loss": 4.8618, "step": 11210 }, { "epoch": 0.22966027071857145, "grad_norm": 0.24837611615657806, "learning_rate": 0.0002998974890496466, "loss": 4.8935, "step": 11215 }, { "epoch": 0.22976266049597607, "grad_norm": 0.24437487125396729, "learning_rate": 0.0002998951883578691, "loss": 4.8669, "step": 11220 }, { "epoch": 0.2298650502733807, "grad_norm": 0.3906613290309906, "learning_rate": 0.00029989286214318476, "loss": 4.8749, "step": 11225 }, { "epoch": 0.22996744005078534, "grad_norm": 0.24109190702438354, "learning_rate": 0.0002998905104059896, "loss": 4.9013, "step": 11230 }, { "epoch": 0.23006982982818996, "grad_norm": 0.24165146052837372, "learning_rate": 0.0002998881331466841, "loss": 4.8549, "step": 11235 }, { "epoch": 0.23017221960559459, "grad_norm": 0.24196457862854004, "learning_rate": 0.000299885730365673, "loss": 4.8945, "step": 11240 }, { "epoch": 0.2302746093829992, "grad_norm": 0.2327190488576889, "learning_rate": 0.00029988330206336545, "loss": 4.9292, "step": 11245 }, { "epoch": 0.23037699916040383, "grad_norm": 0.2443750500679016, "learning_rate": 0.00029988084824017497, "loss": 4.8996, "step": 11250 }, { "epoch": 0.23047938893780845, "grad_norm": 0.2533819377422333, "learning_rate": 0.0002998783688965193, "loss": 4.9055, "step": 11255 }, { "epoch": 0.23058177871521307, "grad_norm": 0.6916893720626831, "learning_rate": 0.0002998758640328206, "loss": 4.78, "step": 11260 }, { "epoch": 0.2306841684926177, "grad_norm": 0.2645145654678345, "learning_rate": 0.0002998733336495054, "loss": 4.8977, "step": 11265 }, { "epoch": 0.2307865582700223, "grad_norm": 0.2584141790866852, "learning_rate": 0.00029987077774700464, "loss": 4.8775, "step": 11270 }, { "epoch": 0.23088894804742693, "grad_norm": 0.22412125766277313, "learning_rate": 0.0002998681963257534, "loss": 4.8988, "step": 11275 }, { "epoch": 0.23099133782483156, "grad_norm": 0.317967027425766, "learning_rate": 0.0002998655893861913, "loss": 4.8529, "step": 11280 }, { "epoch": 0.2310937276022362, "grad_norm": 0.2685169577598572, "learning_rate": 0.0002998629569287621, "loss": 4.8384, "step": 11285 }, { "epoch": 0.23119611737964083, "grad_norm": 0.24802027642726898, "learning_rate": 0.0002998602989539142, "loss": 4.8708, "step": 11290 }, { "epoch": 0.23129850715704545, "grad_norm": 0.2655007243156433, "learning_rate": 0.00029985761546210007, "loss": 4.9067, "step": 11295 }, { "epoch": 0.23140089693445007, "grad_norm": 0.24444621801376343, "learning_rate": 0.0002998549064537767, "loss": 4.907, "step": 11300 }, { "epoch": 0.2315032867118547, "grad_norm": 0.22952929139137268, "learning_rate": 0.00029985217192940526, "loss": 4.9045, "step": 11305 }, { "epoch": 0.2316056764892593, "grad_norm": 0.2440677136182785, "learning_rate": 0.00029984941188945146, "loss": 4.9086, "step": 11310 }, { "epoch": 0.23170806626666393, "grad_norm": 0.22809574007987976, "learning_rate": 0.0002998466263343853, "loss": 4.8561, "step": 11315 }, { "epoch": 0.23181045604406855, "grad_norm": 0.24641495943069458, "learning_rate": 0.0002998438152646809, "loss": 4.899, "step": 11320 }, { "epoch": 0.23191284582147317, "grad_norm": 0.24032865464687347, "learning_rate": 0.000299840978680817, "loss": 4.8743, "step": 11325 }, { "epoch": 0.2320152355988778, "grad_norm": 0.24213042855262756, "learning_rate": 0.0002998381165832766, "loss": 4.8848, "step": 11330 }, { "epoch": 0.23211762537628244, "grad_norm": 0.24833174049854279, "learning_rate": 0.00029983522897254706, "loss": 4.8933, "step": 11335 }, { "epoch": 0.23222001515368706, "grad_norm": 0.26272082328796387, "learning_rate": 0.00029983231584912006, "loss": 4.8843, "step": 11340 }, { "epoch": 0.23232240493109169, "grad_norm": 0.2515399754047394, "learning_rate": 0.00029982937721349155, "loss": 4.918, "step": 11345 }, { "epoch": 0.2324247947084963, "grad_norm": 0.25837281346321106, "learning_rate": 0.00029982641306616195, "loss": 4.876, "step": 11350 }, { "epoch": 0.23252718448590093, "grad_norm": 0.23617346584796906, "learning_rate": 0.00029982342340763594, "loss": 4.9494, "step": 11355 }, { "epoch": 0.23262957426330555, "grad_norm": 0.2497721016407013, "learning_rate": 0.0002998204082384226, "loss": 4.8599, "step": 11360 }, { "epoch": 0.23273196404071017, "grad_norm": 0.22829598188400269, "learning_rate": 0.00029981736755903537, "loss": 4.8906, "step": 11365 }, { "epoch": 0.2328343538181148, "grad_norm": 0.24126386642456055, "learning_rate": 0.0002998143013699919, "loss": 4.9093, "step": 11370 }, { "epoch": 0.2329367435955194, "grad_norm": 0.2352132499217987, "learning_rate": 0.0002998112096718143, "loss": 4.8942, "step": 11375 }, { "epoch": 0.23303913337292403, "grad_norm": 0.2613617479801178, "learning_rate": 0.00029980809246502906, "loss": 4.9019, "step": 11380 }, { "epoch": 0.23314152315032868, "grad_norm": 0.2374405860900879, "learning_rate": 0.0002998049497501669, "loss": 4.8281, "step": 11385 }, { "epoch": 0.2332439129277333, "grad_norm": 0.2515256702899933, "learning_rate": 0.0002998017815277629, "loss": 4.8755, "step": 11390 }, { "epoch": 0.23334630270513793, "grad_norm": 0.25903964042663574, "learning_rate": 0.00029979858779835664, "loss": 4.8912, "step": 11395 }, { "epoch": 0.23344869248254255, "grad_norm": 0.251465767621994, "learning_rate": 0.0002997953685624918, "loss": 4.8704, "step": 11400 }, { "epoch": 0.23355108225994717, "grad_norm": 0.2379695475101471, "learning_rate": 0.00029979212382071654, "loss": 4.9282, "step": 11405 }, { "epoch": 0.2336534720373518, "grad_norm": 0.23356886208057404, "learning_rate": 0.0002997888535735834, "loss": 4.8989, "step": 11410 }, { "epoch": 0.2337558618147564, "grad_norm": 0.2495730221271515, "learning_rate": 0.00029978555782164923, "loss": 4.8938, "step": 11415 }, { "epoch": 0.23385825159216103, "grad_norm": 0.24322271347045898, "learning_rate": 0.00029978223656547506, "loss": 4.924, "step": 11420 }, { "epoch": 0.23396064136956565, "grad_norm": 0.31163012981414795, "learning_rate": 0.0002997788898056266, "loss": 4.8874, "step": 11425 }, { "epoch": 0.23406303114697027, "grad_norm": 0.25091469287872314, "learning_rate": 0.0002997755175426736, "loss": 4.9591, "step": 11430 }, { "epoch": 0.23416542092437492, "grad_norm": 0.2644343674182892, "learning_rate": 0.00029977211977719024, "loss": 4.8423, "step": 11435 }, { "epoch": 0.23426781070177954, "grad_norm": 0.24409982562065125, "learning_rate": 0.0002997686965097551, "loss": 4.9034, "step": 11440 }, { "epoch": 0.23437020047918417, "grad_norm": 0.2304883897304535, "learning_rate": 0.00029976524774095114, "loss": 4.9265, "step": 11445 }, { "epoch": 0.2344725902565888, "grad_norm": 0.24100245535373688, "learning_rate": 0.0002997617734713654, "loss": 4.9006, "step": 11450 }, { "epoch": 0.2345749800339934, "grad_norm": 0.24879340827465057, "learning_rate": 0.00029975827370158967, "loss": 4.9382, "step": 11455 }, { "epoch": 0.23467736981139803, "grad_norm": 0.2458147257566452, "learning_rate": 0.0002997547484322197, "loss": 4.9253, "step": 11460 }, { "epoch": 0.23477975958880265, "grad_norm": 0.2216917872428894, "learning_rate": 0.0002997511976638558, "loss": 4.8462, "step": 11465 }, { "epoch": 0.23488214936620727, "grad_norm": 0.22838068008422852, "learning_rate": 0.0002997476213971025, "loss": 4.8981, "step": 11470 }, { "epoch": 0.2349845391436119, "grad_norm": 0.2378031313419342, "learning_rate": 0.00029974401963256885, "loss": 4.9067, "step": 11475 }, { "epoch": 0.2350869289210165, "grad_norm": 0.2917433977127075, "learning_rate": 0.0002997403923708681, "loss": 4.9242, "step": 11480 }, { "epoch": 0.23518931869842116, "grad_norm": 0.24154146015644073, "learning_rate": 0.0002997367396126179, "loss": 4.8598, "step": 11485 }, { "epoch": 0.23529170847582578, "grad_norm": 0.24472050368785858, "learning_rate": 0.0002997330613584401, "loss": 4.8583, "step": 11490 }, { "epoch": 0.2353940982532304, "grad_norm": 0.25144362449645996, "learning_rate": 0.00029972935760896104, "loss": 4.8015, "step": 11495 }, { "epoch": 0.23549648803063503, "grad_norm": 0.24405333399772644, "learning_rate": 0.0002997256283648114, "loss": 4.9228, "step": 11500 }, { "epoch": 0.23559887780803965, "grad_norm": 0.25055980682373047, "learning_rate": 0.00029972187362662615, "loss": 4.9692, "step": 11505 }, { "epoch": 0.23570126758544427, "grad_norm": 0.28883928060531616, "learning_rate": 0.0002997180933950446, "loss": 4.8954, "step": 11510 }, { "epoch": 0.2358036573628489, "grad_norm": 0.23557475209236145, "learning_rate": 0.00029971428767071046, "loss": 4.8881, "step": 11515 }, { "epoch": 0.2359060471402535, "grad_norm": 0.2456962913274765, "learning_rate": 0.0002997104564542717, "loss": 4.9171, "step": 11520 }, { "epoch": 0.23600843691765813, "grad_norm": 0.2360301911830902, "learning_rate": 0.00029970659974638066, "loss": 4.9174, "step": 11525 }, { "epoch": 0.23611082669506275, "grad_norm": 0.2407263070344925, "learning_rate": 0.00029970271754769406, "loss": 4.8916, "step": 11530 }, { "epoch": 0.2362132164724674, "grad_norm": 0.2519512176513672, "learning_rate": 0.00029969880985887293, "loss": 4.884, "step": 11535 }, { "epoch": 0.23631560624987202, "grad_norm": 0.22996272146701813, "learning_rate": 0.00029969487668058266, "loss": 4.8716, "step": 11540 }, { "epoch": 0.23641799602727664, "grad_norm": 0.2510731816291809, "learning_rate": 0.00029969091801349283, "loss": 4.9083, "step": 11545 }, { "epoch": 0.23652038580468127, "grad_norm": 0.24260838329792023, "learning_rate": 0.0002996869338582776, "loss": 4.8984, "step": 11550 }, { "epoch": 0.2366227755820859, "grad_norm": 0.255476176738739, "learning_rate": 0.0002996829242156154, "loss": 4.8608, "step": 11555 }, { "epoch": 0.2367251653594905, "grad_norm": 0.2573225796222687, "learning_rate": 0.0002996788890861889, "loss": 4.8959, "step": 11560 }, { "epoch": 0.23682755513689513, "grad_norm": 0.25678911805152893, "learning_rate": 0.0002996748284706851, "loss": 4.8688, "step": 11565 }, { "epoch": 0.23692994491429975, "grad_norm": 0.24863708019256592, "learning_rate": 0.0002996707423697955, "loss": 4.915, "step": 11570 }, { "epoch": 0.23703233469170437, "grad_norm": 0.27415382862091064, "learning_rate": 0.0002996666307842158, "loss": 4.861, "step": 11575 }, { "epoch": 0.237134724469109, "grad_norm": 0.2612413167953491, "learning_rate": 0.0002996624937146461, "loss": 4.9256, "step": 11580 }, { "epoch": 0.23723711424651364, "grad_norm": 0.234205961227417, "learning_rate": 0.00029965833116179087, "loss": 4.8574, "step": 11585 }, { "epoch": 0.23733950402391826, "grad_norm": 0.25088369846343994, "learning_rate": 0.00029965414312635886, "loss": 4.8845, "step": 11590 }, { "epoch": 0.23744189380132288, "grad_norm": 0.25705236196517944, "learning_rate": 0.00029964992960906315, "loss": 4.8452, "step": 11595 }, { "epoch": 0.2375442835787275, "grad_norm": 0.31214144825935364, "learning_rate": 0.0002996456906106211, "loss": 4.8657, "step": 11600 }, { "epoch": 0.23764667335613213, "grad_norm": 0.2581920027732849, "learning_rate": 0.0002996414261317547, "loss": 4.9491, "step": 11605 }, { "epoch": 0.23774906313353675, "grad_norm": 0.2636168897151947, "learning_rate": 0.00029963713617318986, "loss": 4.8617, "step": 11610 }, { "epoch": 0.23785145291094137, "grad_norm": 0.2412351369857788, "learning_rate": 0.00029963282073565715, "loss": 4.8962, "step": 11615 }, { "epoch": 0.237953842688346, "grad_norm": 0.22004848718643188, "learning_rate": 0.0002996284798198913, "loss": 4.8952, "step": 11620 }, { "epoch": 0.2380562324657506, "grad_norm": 0.24782150983810425, "learning_rate": 0.00029962411342663155, "loss": 4.8736, "step": 11625 }, { "epoch": 0.23815862224315523, "grad_norm": 0.23767095804214478, "learning_rate": 0.0002996197215566213, "loss": 4.9123, "step": 11630 }, { "epoch": 0.23826101202055985, "grad_norm": 0.23496316373348236, "learning_rate": 0.00029961530421060835, "loss": 4.8902, "step": 11635 }, { "epoch": 0.2383634017979645, "grad_norm": 0.2419152557849884, "learning_rate": 0.00029961086138934485, "loss": 4.9154, "step": 11640 }, { "epoch": 0.23846579157536912, "grad_norm": 0.2437465935945511, "learning_rate": 0.00029960639309358736, "loss": 4.8857, "step": 11645 }, { "epoch": 0.23856818135277374, "grad_norm": 0.2309345155954361, "learning_rate": 0.0002996018993240966, "loss": 4.8832, "step": 11650 }, { "epoch": 0.23867057113017837, "grad_norm": 0.2655540406703949, "learning_rate": 0.00029959738008163785, "loss": 4.881, "step": 11655 }, { "epoch": 0.238772960907583, "grad_norm": 0.24720311164855957, "learning_rate": 0.0002995928353669805, "loss": 4.9194, "step": 11660 }, { "epoch": 0.2388753506849876, "grad_norm": 0.2300921380519867, "learning_rate": 0.00029958826518089844, "loss": 4.8955, "step": 11665 }, { "epoch": 0.23897774046239223, "grad_norm": 0.25959837436676025, "learning_rate": 0.0002995836695241698, "loss": 4.8663, "step": 11670 }, { "epoch": 0.23908013023979685, "grad_norm": 0.3259102702140808, "learning_rate": 0.00029957904839757714, "loss": 4.8782, "step": 11675 }, { "epoch": 0.23918252001720147, "grad_norm": 0.3216576874256134, "learning_rate": 0.0002995744018019073, "loss": 4.8729, "step": 11680 }, { "epoch": 0.2392849097946061, "grad_norm": 0.2604944109916687, "learning_rate": 0.0002995697297379515, "loss": 4.9039, "step": 11685 }, { "epoch": 0.23938729957201074, "grad_norm": 0.2581782639026642, "learning_rate": 0.0002995650322065051, "loss": 4.8791, "step": 11690 }, { "epoch": 0.23948968934941536, "grad_norm": 0.24211914837360382, "learning_rate": 0.00029956030920836815, "loss": 4.8697, "step": 11695 }, { "epoch": 0.23959207912681998, "grad_norm": 0.24143648147583008, "learning_rate": 0.00029955556074434474, "loss": 4.8289, "step": 11700 }, { "epoch": 0.2396944689042246, "grad_norm": 0.2300914078950882, "learning_rate": 0.00029955078681524343, "loss": 4.908, "step": 11705 }, { "epoch": 0.23979685868162923, "grad_norm": 0.2471146285533905, "learning_rate": 0.00029954598742187703, "loss": 4.9411, "step": 11710 }, { "epoch": 0.23989924845903385, "grad_norm": 0.25336185097694397, "learning_rate": 0.0002995411625650628, "loss": 4.9059, "step": 11715 }, { "epoch": 0.24000163823643847, "grad_norm": 0.261109858751297, "learning_rate": 0.00029953631224562236, "loss": 4.8506, "step": 11720 }, { "epoch": 0.2401040280138431, "grad_norm": 0.24551954865455627, "learning_rate": 0.0002995314364643814, "loss": 4.9237, "step": 11725 }, { "epoch": 0.2402064177912477, "grad_norm": 0.30039453506469727, "learning_rate": 0.00029952653522217023, "loss": 4.9264, "step": 11730 }, { "epoch": 0.24030880756865233, "grad_norm": 0.24575817584991455, "learning_rate": 0.00029952160851982336, "loss": 4.8702, "step": 11735 }, { "epoch": 0.24041119734605698, "grad_norm": 0.24071602523326874, "learning_rate": 0.0002995166563581797, "loss": 4.9219, "step": 11740 }, { "epoch": 0.2405135871234616, "grad_norm": 0.2818033993244171, "learning_rate": 0.0002995116787380825, "loss": 4.8766, "step": 11745 }, { "epoch": 0.24061597690086622, "grad_norm": 0.2729192078113556, "learning_rate": 0.00029950667566037916, "loss": 4.8786, "step": 11750 }, { "epoch": 0.24071836667827085, "grad_norm": 0.24694868922233582, "learning_rate": 0.0002995016471259217, "loss": 4.9232, "step": 11755 }, { "epoch": 0.24082075645567547, "grad_norm": 0.3318852186203003, "learning_rate": 0.0002994965931355663, "loss": 4.8491, "step": 11760 }, { "epoch": 0.2409231462330801, "grad_norm": 0.24177590012550354, "learning_rate": 0.0002994915136901735, "loss": 4.899, "step": 11765 }, { "epoch": 0.2410255360104847, "grad_norm": 0.24748991429805756, "learning_rate": 0.00029948640879060815, "loss": 4.8468, "step": 11770 }, { "epoch": 0.24112792578788933, "grad_norm": 0.24905535578727722, "learning_rate": 0.0002994812784377396, "loss": 4.8605, "step": 11775 }, { "epoch": 0.24123031556529395, "grad_norm": 0.23898392915725708, "learning_rate": 0.0002994761226324412, "loss": 4.8881, "step": 11780 }, { "epoch": 0.24133270534269857, "grad_norm": 0.2341669201850891, "learning_rate": 0.00029947094137559106, "loss": 4.8815, "step": 11785 }, { "epoch": 0.24143509512010322, "grad_norm": 0.2497628629207611, "learning_rate": 0.00029946573466807123, "loss": 4.8997, "step": 11790 }, { "epoch": 0.24153748489750784, "grad_norm": 0.252549409866333, "learning_rate": 0.0002994605025107684, "loss": 4.9143, "step": 11795 }, { "epoch": 0.24163987467491246, "grad_norm": 0.2625248432159424, "learning_rate": 0.0002994552449045733, "loss": 4.9114, "step": 11800 }, { "epoch": 0.24174226445231708, "grad_norm": 0.2671903371810913, "learning_rate": 0.0002994499618503813, "loss": 4.8908, "step": 11805 }, { "epoch": 0.2418446542297217, "grad_norm": 0.2578773498535156, "learning_rate": 0.00029944465334909184, "loss": 4.9297, "step": 11810 }, { "epoch": 0.24194704400712633, "grad_norm": 0.2517646253108978, "learning_rate": 0.0002994393194016089, "loss": 4.9183, "step": 11815 }, { "epoch": 0.24204943378453095, "grad_norm": 0.23537741601467133, "learning_rate": 0.00029943396000884064, "loss": 4.91, "step": 11820 }, { "epoch": 0.24215182356193557, "grad_norm": 0.2289832979440689, "learning_rate": 0.0002994285751716997, "loss": 4.8988, "step": 11825 }, { "epoch": 0.2422542133393402, "grad_norm": 0.244520902633667, "learning_rate": 0.0002994231648911028, "loss": 4.8668, "step": 11830 }, { "epoch": 0.2423566031167448, "grad_norm": 0.2290887087583542, "learning_rate": 0.0002994177291679713, "loss": 4.8668, "step": 11835 }, { "epoch": 0.24245899289414946, "grad_norm": 0.23854729533195496, "learning_rate": 0.0002994122680032307, "loss": 4.8915, "step": 11840 }, { "epoch": 0.24256138267155408, "grad_norm": 0.2502431273460388, "learning_rate": 0.0002994067813978109, "loss": 4.8691, "step": 11845 }, { "epoch": 0.2426637724489587, "grad_norm": 0.22970812022686005, "learning_rate": 0.0002994012693526461, "loss": 4.8593, "step": 11850 }, { "epoch": 0.24276616222636332, "grad_norm": 0.23039467632770538, "learning_rate": 0.00029939573186867486, "loss": 4.8874, "step": 11855 }, { "epoch": 0.24286855200376795, "grad_norm": 0.2997787296772003, "learning_rate": 0.00029939016894684004, "loss": 4.8941, "step": 11860 }, { "epoch": 0.24297094178117257, "grad_norm": 0.2484385073184967, "learning_rate": 0.0002993845805880888, "loss": 4.8978, "step": 11865 }, { "epoch": 0.2430733315585772, "grad_norm": 0.2619800865650177, "learning_rate": 0.0002993789667933728, "loss": 4.9135, "step": 11870 }, { "epoch": 0.2431757213359818, "grad_norm": 0.24602265655994415, "learning_rate": 0.0002993733275636478, "loss": 4.8797, "step": 11875 }, { "epoch": 0.24327811111338643, "grad_norm": 0.2266729325056076, "learning_rate": 0.0002993676628998741, "loss": 4.8751, "step": 11880 }, { "epoch": 0.24338050089079105, "grad_norm": 0.2554974853992462, "learning_rate": 0.0002993619728030161, "loss": 4.8515, "step": 11885 }, { "epoch": 0.2434828906681957, "grad_norm": 0.24222640693187714, "learning_rate": 0.00029935625727404275, "loss": 4.9201, "step": 11890 }, { "epoch": 0.24358528044560032, "grad_norm": 0.2396879941225052, "learning_rate": 0.00029935051631392725, "loss": 4.8785, "step": 11895 }, { "epoch": 0.24368767022300494, "grad_norm": 0.26316046714782715, "learning_rate": 0.0002993447499236471, "loss": 4.7969, "step": 11900 }, { "epoch": 0.24379006000040956, "grad_norm": 0.2509313225746155, "learning_rate": 0.00029933895810418416, "loss": 4.8781, "step": 11905 }, { "epoch": 0.24389244977781419, "grad_norm": 0.24141843616962433, "learning_rate": 0.0002993331408565246, "loss": 4.8658, "step": 11910 }, { "epoch": 0.2439948395552188, "grad_norm": 0.2386726289987564, "learning_rate": 0.00029932729818165895, "loss": 4.8963, "step": 11915 }, { "epoch": 0.24409722933262343, "grad_norm": 0.22077973186969757, "learning_rate": 0.00029932143008058206, "loss": 4.933, "step": 11920 }, { "epoch": 0.24419961911002805, "grad_norm": 0.22215411067008972, "learning_rate": 0.00029931553655429305, "loss": 4.8624, "step": 11925 }, { "epoch": 0.24430200888743267, "grad_norm": 0.23376335203647614, "learning_rate": 0.00029930961760379546, "loss": 4.9226, "step": 11930 }, { "epoch": 0.2444043986648373, "grad_norm": 0.2362065464258194, "learning_rate": 0.00029930367323009716, "loss": 4.8627, "step": 11935 }, { "epoch": 0.24450678844224194, "grad_norm": 0.24486564099788666, "learning_rate": 0.0002992977034342102, "loss": 4.899, "step": 11940 }, { "epoch": 0.24460917821964656, "grad_norm": 0.24312803149223328, "learning_rate": 0.0002992917082171512, "loss": 4.9063, "step": 11945 }, { "epoch": 0.24471156799705118, "grad_norm": 0.22825714945793152, "learning_rate": 0.0002992856875799408, "loss": 4.8959, "step": 11950 }, { "epoch": 0.2448139577744558, "grad_norm": 0.25117015838623047, "learning_rate": 0.00029927964152360437, "loss": 4.9115, "step": 11955 }, { "epoch": 0.24491634755186042, "grad_norm": 0.23526737093925476, "learning_rate": 0.0002992735700491712, "loss": 4.8645, "step": 11960 }, { "epoch": 0.24501873732926505, "grad_norm": 0.23061364889144897, "learning_rate": 0.00029926747315767513, "loss": 4.8657, "step": 11965 }, { "epoch": 0.24512112710666967, "grad_norm": 0.22766825556755066, "learning_rate": 0.00029926135085015437, "loss": 4.8939, "step": 11970 }, { "epoch": 0.2452235168840743, "grad_norm": 0.24208544194698334, "learning_rate": 0.00029925520312765126, "loss": 4.8886, "step": 11975 }, { "epoch": 0.2453259066614789, "grad_norm": 0.26806017756462097, "learning_rate": 0.0002992490299912127, "loss": 4.8273, "step": 11980 }, { "epoch": 0.24542829643888353, "grad_norm": 0.24627339839935303, "learning_rate": 0.0002992428314418897, "loss": 4.9321, "step": 11985 }, { "epoch": 0.24553068621628815, "grad_norm": 0.24083460867404938, "learning_rate": 0.00029923660748073773, "loss": 4.8596, "step": 11990 }, { "epoch": 0.2456330759936928, "grad_norm": 0.235739067196846, "learning_rate": 0.0002992303581088166, "loss": 4.9229, "step": 11995 }, { "epoch": 0.24573546577109742, "grad_norm": 0.27138984203338623, "learning_rate": 0.00029922408332719037, "loss": 4.8903, "step": 12000 }, { "epoch": 0.24573546577109742, "eval_loss": 4.899479389190674, "eval_runtime": 17.2369, "eval_samples_per_second": 33.765, "eval_steps_per_second": 4.235, "step": 12000 }, { "epoch": 0.24583785554850204, "grad_norm": 0.2613738179206848, "learning_rate": 0.00029921778313692744, "loss": 4.8717, "step": 12005 }, { "epoch": 0.24594024532590666, "grad_norm": 0.281594842672348, "learning_rate": 0.0002992114575391005, "loss": 4.9472, "step": 12010 }, { "epoch": 0.24604263510331129, "grad_norm": 0.23299552500247955, "learning_rate": 0.0002992051065347868, "loss": 4.8476, "step": 12015 }, { "epoch": 0.2461450248807159, "grad_norm": 0.2404499500989914, "learning_rate": 0.00029919873012506753, "loss": 4.8642, "step": 12020 }, { "epoch": 0.24624741465812053, "grad_norm": 0.24060982465744019, "learning_rate": 0.00029919232831102853, "loss": 4.8646, "step": 12025 }, { "epoch": 0.24634980443552515, "grad_norm": 0.24198798835277557, "learning_rate": 0.0002991859010937598, "loss": 4.888, "step": 12030 }, { "epoch": 0.24645219421292977, "grad_norm": 0.24580661952495575, "learning_rate": 0.0002991794484743558, "loss": 4.9298, "step": 12035 }, { "epoch": 0.2465545839903344, "grad_norm": 0.26965638995170593, "learning_rate": 0.0002991729704539151, "loss": 4.8499, "step": 12040 }, { "epoch": 0.24665697376773904, "grad_norm": 0.2403213232755661, "learning_rate": 0.00029916646703354076, "loss": 4.8562, "step": 12045 }, { "epoch": 0.24675936354514366, "grad_norm": 0.2616066038608551, "learning_rate": 0.0002991599382143402, "loss": 4.9207, "step": 12050 }, { "epoch": 0.24686175332254828, "grad_norm": 0.2665911018848419, "learning_rate": 0.0002991533839974251, "loss": 4.8644, "step": 12055 }, { "epoch": 0.2469641430999529, "grad_norm": 0.2643393576145172, "learning_rate": 0.00029914680438391134, "loss": 4.8855, "step": 12060 }, { "epoch": 0.24706653287735753, "grad_norm": 0.26985055208206177, "learning_rate": 0.0002991401993749193, "loss": 4.8476, "step": 12065 }, { "epoch": 0.24716892265476215, "grad_norm": 0.24749569594860077, "learning_rate": 0.00029913356897157367, "loss": 4.8275, "step": 12070 }, { "epoch": 0.24727131243216677, "grad_norm": 0.24287645518779755, "learning_rate": 0.0002991269131750033, "loss": 4.8895, "step": 12075 }, { "epoch": 0.2473737022095714, "grad_norm": 0.2698119878768921, "learning_rate": 0.0002991202319863417, "loss": 4.8322, "step": 12080 }, { "epoch": 0.247476091986976, "grad_norm": 0.2555227279663086, "learning_rate": 0.00029911352540672626, "loss": 4.8419, "step": 12085 }, { "epoch": 0.24757848176438063, "grad_norm": 0.2494737207889557, "learning_rate": 0.00029910679343729904, "loss": 4.8357, "step": 12090 }, { "epoch": 0.24768087154178528, "grad_norm": 0.2341720312833786, "learning_rate": 0.0002991000360792063, "loss": 4.8621, "step": 12095 }, { "epoch": 0.2477832613191899, "grad_norm": 0.25251176953315735, "learning_rate": 0.00029909325333359857, "loss": 4.8641, "step": 12100 }, { "epoch": 0.24788565109659452, "grad_norm": 0.23488686978816986, "learning_rate": 0.0002990864452016308, "loss": 4.8845, "step": 12105 }, { "epoch": 0.24798804087399914, "grad_norm": 0.2652224004268646, "learning_rate": 0.0002990796116844622, "loss": 4.89, "step": 12110 }, { "epoch": 0.24809043065140376, "grad_norm": 0.23602823913097382, "learning_rate": 0.0002990727527832564, "loss": 4.8609, "step": 12115 }, { "epoch": 0.2481928204288084, "grad_norm": 0.2633868455886841, "learning_rate": 0.0002990658684991812, "loss": 4.9009, "step": 12120 }, { "epoch": 0.248295210206213, "grad_norm": 0.2638199031352997, "learning_rate": 0.0002990589588334088, "loss": 4.9046, "step": 12125 }, { "epoch": 0.24839759998361763, "grad_norm": 0.30601924657821655, "learning_rate": 0.00029905202378711573, "loss": 4.9206, "step": 12130 }, { "epoch": 0.24849998976102225, "grad_norm": 0.2439941018819809, "learning_rate": 0.00029904506336148284, "loss": 4.8901, "step": 12135 }, { "epoch": 0.24860237953842687, "grad_norm": 0.27119070291519165, "learning_rate": 0.00029903807755769535, "loss": 4.9295, "step": 12140 }, { "epoch": 0.24870476931583152, "grad_norm": 0.2307243049144745, "learning_rate": 0.00029903106637694263, "loss": 4.8929, "step": 12145 }, { "epoch": 0.24880715909323614, "grad_norm": 0.24744997918605804, "learning_rate": 0.00029902402982041854, "loss": 4.8785, "step": 12150 }, { "epoch": 0.24890954887064076, "grad_norm": 0.2281043976545334, "learning_rate": 0.0002990169678893212, "loss": 4.8468, "step": 12155 }, { "epoch": 0.24901193864804538, "grad_norm": 0.21781419217586517, "learning_rate": 0.0002990098805848531, "loss": 4.9168, "step": 12160 }, { "epoch": 0.24911432842545, "grad_norm": 0.23341822624206543, "learning_rate": 0.000299002767908221, "loss": 4.8919, "step": 12165 }, { "epoch": 0.24921671820285463, "grad_norm": 0.2326321005821228, "learning_rate": 0.0002989956298606359, "loss": 4.9025, "step": 12170 }, { "epoch": 0.24931910798025925, "grad_norm": 0.2548215985298157, "learning_rate": 0.0002989884664433133, "loss": 4.874, "step": 12175 }, { "epoch": 0.24942149775766387, "grad_norm": 0.27959537506103516, "learning_rate": 0.0002989812776574729, "loss": 4.779, "step": 12180 }, { "epoch": 0.2495238875350685, "grad_norm": 0.2326943725347519, "learning_rate": 0.0002989740635043387, "loss": 4.8494, "step": 12185 }, { "epoch": 0.2496262773124731, "grad_norm": 0.25300073623657227, "learning_rate": 0.00029896682398513914, "loss": 4.8717, "step": 12190 }, { "epoch": 0.24972866708987776, "grad_norm": 0.24167612195014954, "learning_rate": 0.0002989595591011069, "loss": 4.8367, "step": 12195 }, { "epoch": 0.24983105686728238, "grad_norm": 0.25171294808387756, "learning_rate": 0.00029895226885347893, "loss": 4.8535, "step": 12200 }, { "epoch": 0.249933446644687, "grad_norm": 0.44656553864479065, "learning_rate": 0.00029894495324349655, "loss": 4.8807, "step": 12205 }, { "epoch": 0.2500358364220916, "grad_norm": 0.24718737602233887, "learning_rate": 0.0002989376122724055, "loss": 4.8775, "step": 12210 }, { "epoch": 0.2501382261994962, "grad_norm": 0.25999680161476135, "learning_rate": 0.0002989302459414556, "loss": 4.8474, "step": 12215 }, { "epoch": 0.2502406159769009, "grad_norm": 0.22106848657131195, "learning_rate": 0.0002989228542519013, "loss": 4.9025, "step": 12220 }, { "epoch": 0.2503430057543055, "grad_norm": 0.24296283721923828, "learning_rate": 0.00029891543720500096, "loss": 4.9432, "step": 12225 }, { "epoch": 0.25044539553171014, "grad_norm": 0.22640258073806763, "learning_rate": 0.00029890799480201773, "loss": 4.8549, "step": 12230 }, { "epoch": 0.25054778530911476, "grad_norm": 0.23242172598838806, "learning_rate": 0.0002989005270442187, "loss": 4.8673, "step": 12235 }, { "epoch": 0.2506501750865194, "grad_norm": 0.23328714072704315, "learning_rate": 0.00029889303393287544, "loss": 4.8547, "step": 12240 }, { "epoch": 0.250752564863924, "grad_norm": 0.30010512471199036, "learning_rate": 0.0002988855154692638, "loss": 4.8451, "step": 12245 }, { "epoch": 0.2508549546413286, "grad_norm": 0.24082158505916595, "learning_rate": 0.0002988779716546641, "loss": 4.8461, "step": 12250 }, { "epoch": 0.25095734441873324, "grad_norm": 0.2572222948074341, "learning_rate": 0.00029887040249036066, "loss": 4.8194, "step": 12255 }, { "epoch": 0.25105973419613786, "grad_norm": 0.23628303408622742, "learning_rate": 0.00029886280797764236, "loss": 4.9751, "step": 12260 }, { "epoch": 0.2511621239735425, "grad_norm": 0.2489302158355713, "learning_rate": 0.0002988551881178023, "loss": 4.8359, "step": 12265 }, { "epoch": 0.2512645137509471, "grad_norm": 0.23080943524837494, "learning_rate": 0.0002988475429121381, "loss": 4.8837, "step": 12270 }, { "epoch": 0.2513669035283517, "grad_norm": 0.23010952770709991, "learning_rate": 0.00029883987236195125, "loss": 4.8853, "step": 12275 }, { "epoch": 0.25146929330575635, "grad_norm": 0.239060178399086, "learning_rate": 0.000298832176468548, "loss": 4.8867, "step": 12280 }, { "epoch": 0.25157168308316097, "grad_norm": 0.2550260126590729, "learning_rate": 0.0002988244552332387, "loss": 4.8565, "step": 12285 }, { "epoch": 0.2516740728605656, "grad_norm": 0.24920465052127838, "learning_rate": 0.0002988167086573381, "loss": 4.8484, "step": 12290 }, { "epoch": 0.2517764626379702, "grad_norm": 0.24076999723911285, "learning_rate": 0.00029880893674216514, "loss": 4.9091, "step": 12295 }, { "epoch": 0.25187885241537483, "grad_norm": 0.24830858409404755, "learning_rate": 0.0002988011394890432, "loss": 4.8945, "step": 12300 }, { "epoch": 0.25198124219277945, "grad_norm": 0.2224910408258438, "learning_rate": 0.0002987933168993, "loss": 4.851, "step": 12305 }, { "epoch": 0.2520836319701841, "grad_norm": 0.24562443792819977, "learning_rate": 0.0002987854689742674, "loss": 4.8528, "step": 12310 }, { "epoch": 0.2521860217475887, "grad_norm": 0.23242530226707458, "learning_rate": 0.00029877759571528164, "loss": 4.8868, "step": 12315 }, { "epoch": 0.2522884115249934, "grad_norm": 0.23901917040348053, "learning_rate": 0.0002987696971236835, "loss": 4.8363, "step": 12320 }, { "epoch": 0.252390801302398, "grad_norm": 0.2646287977695465, "learning_rate": 0.0002987617732008177, "loss": 4.8992, "step": 12325 }, { "epoch": 0.2524931910798026, "grad_norm": 0.24285374581813812, "learning_rate": 0.00029875382394803357, "loss": 4.8793, "step": 12330 }, { "epoch": 0.25259558085720724, "grad_norm": 0.2439645379781723, "learning_rate": 0.0002987458493666846, "loss": 4.8692, "step": 12335 }, { "epoch": 0.25269797063461186, "grad_norm": 0.2646964192390442, "learning_rate": 0.00029873784945812866, "loss": 4.8714, "step": 12340 }, { "epoch": 0.2528003604120165, "grad_norm": 0.25290536880493164, "learning_rate": 0.00029872982422372786, "loss": 4.8899, "step": 12345 }, { "epoch": 0.2529027501894211, "grad_norm": 0.2795182466506958, "learning_rate": 0.00029872177366484877, "loss": 4.828, "step": 12350 }, { "epoch": 0.2530051399668257, "grad_norm": 0.2529841959476471, "learning_rate": 0.000298713697782862, "loss": 4.8898, "step": 12355 }, { "epoch": 0.25310752974423034, "grad_norm": 0.24668854475021362, "learning_rate": 0.0002987055965791428, "loss": 4.865, "step": 12360 }, { "epoch": 0.25320991952163496, "grad_norm": 0.23832497000694275, "learning_rate": 0.0002986974700550706, "loss": 4.8724, "step": 12365 }, { "epoch": 0.2533123092990396, "grad_norm": 0.24035145342350006, "learning_rate": 0.000298689318212029, "loss": 4.8931, "step": 12370 }, { "epoch": 0.2534146990764442, "grad_norm": 0.2509881258010864, "learning_rate": 0.00029868114105140604, "loss": 4.9036, "step": 12375 }, { "epoch": 0.2535170888538488, "grad_norm": 0.4640673100948334, "learning_rate": 0.0002986729385745941, "loss": 4.8843, "step": 12380 }, { "epoch": 0.25361947863125345, "grad_norm": 0.24764907360076904, "learning_rate": 0.00029866471078298984, "loss": 4.8745, "step": 12385 }, { "epoch": 0.25372186840865807, "grad_norm": 0.23006363213062286, "learning_rate": 0.00029865645767799414, "loss": 4.8311, "step": 12390 }, { "epoch": 0.2538242581860627, "grad_norm": 0.23863214254379272, "learning_rate": 0.0002986481792610124, "loss": 4.8998, "step": 12395 }, { "epoch": 0.2539266479634673, "grad_norm": 0.2536652088165283, "learning_rate": 0.00029863987553345407, "loss": 4.8854, "step": 12400 }, { "epoch": 0.25402903774087193, "grad_norm": 0.25096675753593445, "learning_rate": 0.0002986315464967331, "loss": 4.8637, "step": 12405 }, { "epoch": 0.25413142751827655, "grad_norm": 0.25348666310310364, "learning_rate": 0.00029862319215226776, "loss": 4.8907, "step": 12410 }, { "epoch": 0.2542338172956812, "grad_norm": 0.24533510208129883, "learning_rate": 0.00029861481250148043, "loss": 4.8715, "step": 12415 }, { "epoch": 0.2543362070730858, "grad_norm": 0.23900151252746582, "learning_rate": 0.000298606407545798, "loss": 4.8733, "step": 12420 }, { "epoch": 0.2544385968504905, "grad_norm": 0.501625657081604, "learning_rate": 0.00029859797728665156, "loss": 4.8067, "step": 12425 }, { "epoch": 0.2545409866278951, "grad_norm": 0.24328076839447021, "learning_rate": 0.00029858952172547655, "loss": 4.8499, "step": 12430 }, { "epoch": 0.2546433764052997, "grad_norm": 0.2383161336183548, "learning_rate": 0.00029858104086371277, "loss": 4.8578, "step": 12435 }, { "epoch": 0.25474576618270434, "grad_norm": 0.22936564683914185, "learning_rate": 0.00029857253470280416, "loss": 4.8867, "step": 12440 }, { "epoch": 0.25484815596010896, "grad_norm": 0.2559540271759033, "learning_rate": 0.00029856400324419926, "loss": 4.8367, "step": 12445 }, { "epoch": 0.2549505457375136, "grad_norm": 0.25993111729621887, "learning_rate": 0.00029855544648935053, "loss": 4.8831, "step": 12450 }, { "epoch": 0.2550529355149182, "grad_norm": 0.22727420926094055, "learning_rate": 0.000298546864439715, "loss": 4.8524, "step": 12455 }, { "epoch": 0.2551553252923228, "grad_norm": 0.4473954141139984, "learning_rate": 0.0002985382570967541, "loss": 4.8277, "step": 12460 }, { "epoch": 0.25525771506972744, "grad_norm": 0.2686767876148224, "learning_rate": 0.00029852962446193324, "loss": 4.9113, "step": 12465 }, { "epoch": 0.25536010484713206, "grad_norm": 0.24050889909267426, "learning_rate": 0.0002985209665367224, "loss": 4.8886, "step": 12470 }, { "epoch": 0.2554624946245367, "grad_norm": 0.22935850918293, "learning_rate": 0.00029851228332259576, "loss": 4.8715, "step": 12475 }, { "epoch": 0.2555648844019413, "grad_norm": 0.25805386900901794, "learning_rate": 0.00029850357482103174, "loss": 4.8766, "step": 12480 }, { "epoch": 0.2556672741793459, "grad_norm": 0.24458548426628113, "learning_rate": 0.00029849484103351335, "loss": 4.8769, "step": 12485 }, { "epoch": 0.25576966395675055, "grad_norm": 0.2588340640068054, "learning_rate": 0.0002984860819615275, "loss": 4.9156, "step": 12490 }, { "epoch": 0.25587205373415517, "grad_norm": 0.23135457932949066, "learning_rate": 0.0002984772976065657, "loss": 4.875, "step": 12495 }, { "epoch": 0.2559744435115598, "grad_norm": 0.24211369454860687, "learning_rate": 0.0002984684879701237, "loss": 4.9047, "step": 12500 }, { "epoch": 0.2560768332889644, "grad_norm": 0.22892974317073822, "learning_rate": 0.0002984596530537015, "loss": 4.8486, "step": 12505 }, { "epoch": 0.25617922306636903, "grad_norm": 0.28492414951324463, "learning_rate": 0.0002984507928588035, "loss": 4.811, "step": 12510 }, { "epoch": 0.25628161284377365, "grad_norm": 0.2366408109664917, "learning_rate": 0.0002984419073869382, "loss": 4.8877, "step": 12515 }, { "epoch": 0.2563840026211783, "grad_norm": 0.3380194306373596, "learning_rate": 0.00029843299663961866, "loss": 4.8395, "step": 12520 }, { "epoch": 0.25648639239858295, "grad_norm": 0.25046271085739136, "learning_rate": 0.0002984240606183621, "loss": 4.8949, "step": 12525 }, { "epoch": 0.2565887821759876, "grad_norm": 0.23867449164390564, "learning_rate": 0.00029841509932469014, "loss": 4.8595, "step": 12530 }, { "epoch": 0.2566911719533922, "grad_norm": 0.2649063766002655, "learning_rate": 0.00029840611276012847, "loss": 4.8243, "step": 12535 }, { "epoch": 0.2567935617307968, "grad_norm": 0.2422143667936325, "learning_rate": 0.00029839710092620733, "loss": 4.8398, "step": 12540 }, { "epoch": 0.25689595150820144, "grad_norm": 0.23876677453517914, "learning_rate": 0.00029838806382446125, "loss": 4.8657, "step": 12545 }, { "epoch": 0.25699834128560606, "grad_norm": 0.23439767956733704, "learning_rate": 0.00029837900145642893, "loss": 4.8701, "step": 12550 }, { "epoch": 0.2571007310630107, "grad_norm": 0.22320416569709778, "learning_rate": 0.0002983699138236534, "loss": 4.8458, "step": 12555 }, { "epoch": 0.2572031208404153, "grad_norm": 0.2350194752216339, "learning_rate": 0.0002983608009276821, "loss": 4.8874, "step": 12560 }, { "epoch": 0.2573055106178199, "grad_norm": 0.2400473803281784, "learning_rate": 0.00029835166277006667, "loss": 4.8909, "step": 12565 }, { "epoch": 0.25740790039522454, "grad_norm": 0.3497767150402069, "learning_rate": 0.000298342499352363, "loss": 4.8771, "step": 12570 }, { "epoch": 0.25751029017262916, "grad_norm": 0.25395315885543823, "learning_rate": 0.0002983333106761315, "loss": 4.8964, "step": 12575 }, { "epoch": 0.2576126799500338, "grad_norm": 0.2592972218990326, "learning_rate": 0.00029832409674293667, "loss": 4.8609, "step": 12580 }, { "epoch": 0.2577150697274384, "grad_norm": 0.2572644352912903, "learning_rate": 0.0002983148575543474, "loss": 4.9023, "step": 12585 }, { "epoch": 0.257817459504843, "grad_norm": 0.23417991399765015, "learning_rate": 0.0002983055931119368, "loss": 4.8572, "step": 12590 }, { "epoch": 0.25791984928224765, "grad_norm": 0.22180429100990295, "learning_rate": 0.00029829630341728243, "loss": 4.8973, "step": 12595 }, { "epoch": 0.25802223905965227, "grad_norm": 0.219478040933609, "learning_rate": 0.000298286988471966, "loss": 4.8552, "step": 12600 }, { "epoch": 0.2581246288370569, "grad_norm": 0.34473809599876404, "learning_rate": 0.0002982776482775736, "loss": 4.8242, "step": 12605 }, { "epoch": 0.2582270186144615, "grad_norm": 0.24101996421813965, "learning_rate": 0.0002982682828356956, "loss": 4.8951, "step": 12610 }, { "epoch": 0.25832940839186613, "grad_norm": 0.2439136803150177, "learning_rate": 0.00029825889214792674, "loss": 4.8801, "step": 12615 }, { "epoch": 0.25843179816927075, "grad_norm": 0.2350088506937027, "learning_rate": 0.00029824947621586584, "loss": 4.8369, "step": 12620 }, { "epoch": 0.25853418794667543, "grad_norm": 0.24532537162303925, "learning_rate": 0.0002982400350411163, "loss": 4.8539, "step": 12625 }, { "epoch": 0.25863657772408005, "grad_norm": 0.2509115934371948, "learning_rate": 0.0002982305686252856, "loss": 4.8934, "step": 12630 }, { "epoch": 0.2587389675014847, "grad_norm": 0.2993391454219818, "learning_rate": 0.0002982210769699857, "loss": 4.8307, "step": 12635 }, { "epoch": 0.2588413572788893, "grad_norm": 0.24147208034992218, "learning_rate": 0.0002982115600768326, "loss": 4.9087, "step": 12640 }, { "epoch": 0.2589437470562939, "grad_norm": 0.2247917801141739, "learning_rate": 0.00029820201794744693, "loss": 4.8464, "step": 12645 }, { "epoch": 0.25904613683369854, "grad_norm": 0.24929684400558472, "learning_rate": 0.0002981924505834534, "loss": 4.9148, "step": 12650 }, { "epoch": 0.25914852661110316, "grad_norm": 0.2546851336956024, "learning_rate": 0.00029818285798648093, "loss": 4.8648, "step": 12655 }, { "epoch": 0.2592509163885078, "grad_norm": 0.2752739489078522, "learning_rate": 0.00029817324015816304, "loss": 4.8639, "step": 12660 }, { "epoch": 0.2593533061659124, "grad_norm": 0.2405664026737213, "learning_rate": 0.0002981635971001373, "loss": 4.8622, "step": 12665 }, { "epoch": 0.259455695943317, "grad_norm": 0.2586182951927185, "learning_rate": 0.0002981539288140456, "loss": 4.8757, "step": 12670 }, { "epoch": 0.25955808572072164, "grad_norm": 0.27839696407318115, "learning_rate": 0.00029814423530153425, "loss": 4.8243, "step": 12675 }, { "epoch": 0.25966047549812626, "grad_norm": 0.23663580417633057, "learning_rate": 0.0002981345165642538, "loss": 4.8768, "step": 12680 }, { "epoch": 0.2597628652755309, "grad_norm": 0.2404603362083435, "learning_rate": 0.00029812477260385896, "loss": 4.8903, "step": 12685 }, { "epoch": 0.2598652550529355, "grad_norm": 0.2810831367969513, "learning_rate": 0.000298115003422009, "loss": 4.8141, "step": 12690 }, { "epoch": 0.2599676448303401, "grad_norm": 0.2602687180042267, "learning_rate": 0.0002981052090203672, "loss": 4.8472, "step": 12695 }, { "epoch": 0.26007003460774475, "grad_norm": 0.23098993301391602, "learning_rate": 0.0002980953894006014, "loss": 4.8435, "step": 12700 }, { "epoch": 0.26017242438514937, "grad_norm": 0.24064970016479492, "learning_rate": 0.00029808554456438346, "loss": 4.8473, "step": 12705 }, { "epoch": 0.260274814162554, "grad_norm": 0.44505807757377625, "learning_rate": 0.00029807567451338977, "loss": 4.8631, "step": 12710 }, { "epoch": 0.2603772039399586, "grad_norm": 0.24504883587360382, "learning_rate": 0.0002980657792493009, "loss": 4.8666, "step": 12715 }, { "epoch": 0.26047959371736323, "grad_norm": 0.23710520565509796, "learning_rate": 0.0002980558587738018, "loss": 4.9392, "step": 12720 }, { "epoch": 0.26058198349476785, "grad_norm": 0.2329605668783188, "learning_rate": 0.00029804591308858145, "loss": 4.8668, "step": 12725 }, { "epoch": 0.26068437327217253, "grad_norm": 0.2457711100578308, "learning_rate": 0.0002980359421953335, "loss": 4.8803, "step": 12730 }, { "epoch": 0.26078676304957715, "grad_norm": 0.2757936120033264, "learning_rate": 0.0002980259460957557, "loss": 4.8527, "step": 12735 }, { "epoch": 0.2608891528269818, "grad_norm": 0.245131254196167, "learning_rate": 0.00029801592479155003, "loss": 4.8267, "step": 12740 }, { "epoch": 0.2609915426043864, "grad_norm": 0.236470028758049, "learning_rate": 0.00029800587828442284, "loss": 4.8362, "step": 12745 }, { "epoch": 0.261093932381791, "grad_norm": 0.22589100897312164, "learning_rate": 0.0002979958065760848, "loss": 4.7946, "step": 12750 }, { "epoch": 0.26119632215919564, "grad_norm": 0.2344709187746048, "learning_rate": 0.0002979857096682509, "loss": 4.8593, "step": 12755 }, { "epoch": 0.26129871193660026, "grad_norm": 0.24362529814243317, "learning_rate": 0.0002979755875626402, "loss": 4.8996, "step": 12760 }, { "epoch": 0.2614011017140049, "grad_norm": 0.2457917481660843, "learning_rate": 0.0002979654402609763, "loss": 4.8396, "step": 12765 }, { "epoch": 0.2615034914914095, "grad_norm": 0.25447165966033936, "learning_rate": 0.000297955267764987, "loss": 4.8407, "step": 12770 }, { "epoch": 0.2616058812688141, "grad_norm": 0.271729975938797, "learning_rate": 0.00029794507007640446, "loss": 4.8571, "step": 12775 }, { "epoch": 0.26170827104621874, "grad_norm": 0.23931005597114563, "learning_rate": 0.0002979348471969649, "loss": 4.8729, "step": 12780 }, { "epoch": 0.26181066082362336, "grad_norm": 0.241059809923172, "learning_rate": 0.000297924599128409, "loss": 4.8818, "step": 12785 }, { "epoch": 0.261913050601028, "grad_norm": 0.2745431661605835, "learning_rate": 0.0002979143258724819, "loss": 4.8635, "step": 12790 }, { "epoch": 0.2620154403784326, "grad_norm": 0.2509162425994873, "learning_rate": 0.0002979040274309327, "loss": 4.7888, "step": 12795 }, { "epoch": 0.26211783015583723, "grad_norm": 0.23292668163776398, "learning_rate": 0.000297893703805515, "loss": 4.8462, "step": 12800 }, { "epoch": 0.26222021993324185, "grad_norm": 0.24121513962745667, "learning_rate": 0.00029788335499798645, "loss": 4.8779, "step": 12805 }, { "epoch": 0.26232260971064647, "grad_norm": 0.25357139110565186, "learning_rate": 0.0002978729810101094, "loss": 4.9119, "step": 12810 }, { "epoch": 0.2624249994880511, "grad_norm": 0.2415647953748703, "learning_rate": 0.00029786258184365006, "loss": 4.8324, "step": 12815 }, { "epoch": 0.2625273892654557, "grad_norm": 0.2385522425174713, "learning_rate": 0.00029785215750037926, "loss": 4.8612, "step": 12820 }, { "epoch": 0.26262977904286033, "grad_norm": 0.24159187078475952, "learning_rate": 0.0002978417079820718, "loss": 4.7787, "step": 12825 }, { "epoch": 0.262732168820265, "grad_norm": 0.25553110241889954, "learning_rate": 0.0002978312332905071, "loss": 4.8671, "step": 12830 }, { "epoch": 0.26283455859766963, "grad_norm": 0.23924241960048676, "learning_rate": 0.0002978207334274686, "loss": 4.9007, "step": 12835 }, { "epoch": 0.26293694837507425, "grad_norm": 0.24613811075687408, "learning_rate": 0.0002978102083947442, "loss": 4.8378, "step": 12840 }, { "epoch": 0.2630393381524789, "grad_norm": 0.23306281864643097, "learning_rate": 0.000297799658194126, "loss": 4.8707, "step": 12845 }, { "epoch": 0.2631417279298835, "grad_norm": 0.23660875856876373, "learning_rate": 0.00029778908282741037, "loss": 4.893, "step": 12850 }, { "epoch": 0.2632441177072881, "grad_norm": 0.2504987418651581, "learning_rate": 0.00029777848229639803, "loss": 4.8757, "step": 12855 }, { "epoch": 0.26334650748469274, "grad_norm": 0.26087918877601624, "learning_rate": 0.00029776785660289387, "loss": 4.8782, "step": 12860 }, { "epoch": 0.26344889726209736, "grad_norm": 0.2325216382741928, "learning_rate": 0.0002977572057487073, "loss": 4.8199, "step": 12865 }, { "epoch": 0.263551287039502, "grad_norm": 0.2563367187976837, "learning_rate": 0.00029774652973565166, "loss": 4.8593, "step": 12870 }, { "epoch": 0.2636536768169066, "grad_norm": 0.23316645622253418, "learning_rate": 0.00029773582856554493, "loss": 4.8754, "step": 12875 }, { "epoch": 0.2637560665943112, "grad_norm": 0.24804489314556122, "learning_rate": 0.0002977251022402092, "loss": 4.8731, "step": 12880 }, { "epoch": 0.26385845637171584, "grad_norm": 0.23226231336593628, "learning_rate": 0.00029771435076147073, "loss": 4.9199, "step": 12885 }, { "epoch": 0.26396084614912046, "grad_norm": 0.22917960584163666, "learning_rate": 0.00029770357413116033, "loss": 4.8899, "step": 12890 }, { "epoch": 0.2640632359265251, "grad_norm": 0.24159789085388184, "learning_rate": 0.0002976927723511129, "loss": 4.8238, "step": 12895 }, { "epoch": 0.2641656257039297, "grad_norm": 0.23341481387615204, "learning_rate": 0.0002976819454231677, "loss": 4.8309, "step": 12900 }, { "epoch": 0.26426801548133433, "grad_norm": 0.23765237629413605, "learning_rate": 0.00029767109334916824, "loss": 4.8656, "step": 12905 }, { "epoch": 0.26437040525873895, "grad_norm": 0.2955288589000702, "learning_rate": 0.0002976602161309623, "loss": 4.8886, "step": 12910 }, { "epoch": 0.26447279503614357, "grad_norm": 0.2644578218460083, "learning_rate": 0.000297649313770402, "loss": 4.8946, "step": 12915 }, { "epoch": 0.2645751848135482, "grad_norm": 0.441896915435791, "learning_rate": 0.0002976383862693437, "loss": 4.8514, "step": 12920 }, { "epoch": 0.2646775745909528, "grad_norm": 0.27758267521858215, "learning_rate": 0.000297627433629648, "loss": 4.849, "step": 12925 }, { "epoch": 0.2647799643683575, "grad_norm": 0.24811238050460815, "learning_rate": 0.0002976164558531798, "loss": 4.8608, "step": 12930 }, { "epoch": 0.2648823541457621, "grad_norm": 0.2523718476295471, "learning_rate": 0.00029760545294180846, "loss": 4.8731, "step": 12935 }, { "epoch": 0.26498474392316673, "grad_norm": 0.23273690044879913, "learning_rate": 0.0002975944248974073, "loss": 4.8509, "step": 12940 }, { "epoch": 0.26508713370057135, "grad_norm": 0.2602105438709259, "learning_rate": 0.0002975833717218541, "loss": 4.905, "step": 12945 }, { "epoch": 0.265189523477976, "grad_norm": 0.23075635731220245, "learning_rate": 0.00029757229341703103, "loss": 4.8708, "step": 12950 }, { "epoch": 0.2652919132553806, "grad_norm": 0.24693787097930908, "learning_rate": 0.00029756118998482427, "loss": 4.8541, "step": 12955 }, { "epoch": 0.2653943030327852, "grad_norm": 0.25319182872772217, "learning_rate": 0.0002975500614271245, "loss": 4.8526, "step": 12960 }, { "epoch": 0.26549669281018984, "grad_norm": 0.23757408559322357, "learning_rate": 0.0002975389077458266, "loss": 4.8878, "step": 12965 }, { "epoch": 0.26559908258759446, "grad_norm": 0.2358894944190979, "learning_rate": 0.0002975277289428297, "loss": 4.8995, "step": 12970 }, { "epoch": 0.2657014723649991, "grad_norm": 0.23779599368572235, "learning_rate": 0.0002975165250200372, "loss": 4.8562, "step": 12975 }, { "epoch": 0.2658038621424037, "grad_norm": 0.23578375577926636, "learning_rate": 0.0002975052959793569, "loss": 4.8733, "step": 12980 }, { "epoch": 0.2659062519198083, "grad_norm": 0.22678780555725098, "learning_rate": 0.0002974940418227007, "loss": 4.8401, "step": 12985 }, { "epoch": 0.26600864169721294, "grad_norm": 0.2594931423664093, "learning_rate": 0.0002974827625519849, "loss": 4.8202, "step": 12990 }, { "epoch": 0.26611103147461757, "grad_norm": 0.3577810823917389, "learning_rate": 0.0002974714581691301, "loss": 4.8104, "step": 12995 }, { "epoch": 0.2662134212520222, "grad_norm": 0.2552907168865204, "learning_rate": 0.00029746012867606095, "loss": 4.8498, "step": 13000 }, { "epoch": 0.2662134212520222, "eval_loss": 4.883366584777832, "eval_runtime": 17.076, "eval_samples_per_second": 34.083, "eval_steps_per_second": 4.275, "step": 13000 }, { "epoch": 0.2663158110294268, "grad_norm": 0.30066636204719543, "learning_rate": 0.0002974487740747067, "loss": 4.8731, "step": 13005 }, { "epoch": 0.26641820080683143, "grad_norm": 0.2680542469024658, "learning_rate": 0.0002974373943670007, "loss": 4.8089, "step": 13010 }, { "epoch": 0.26652059058423605, "grad_norm": 0.256352037191391, "learning_rate": 0.0002974259895548805, "loss": 4.8462, "step": 13015 }, { "epoch": 0.26662298036164067, "grad_norm": 0.25665661692619324, "learning_rate": 0.00029741455964028807, "loss": 4.8946, "step": 13020 }, { "epoch": 0.2667253701390453, "grad_norm": 0.23544175922870636, "learning_rate": 0.00029740310462516965, "loss": 4.9251, "step": 13025 }, { "epoch": 0.26682775991644997, "grad_norm": 0.2365446239709854, "learning_rate": 0.00029739162451147567, "loss": 4.8693, "step": 13030 }, { "epoch": 0.2669301496938546, "grad_norm": 0.27612054347991943, "learning_rate": 0.00029738011930116083, "loss": 4.8541, "step": 13035 }, { "epoch": 0.2670325394712592, "grad_norm": 0.2788892090320587, "learning_rate": 0.00029736858899618413, "loss": 4.888, "step": 13040 }, { "epoch": 0.26713492924866383, "grad_norm": 0.23531077802181244, "learning_rate": 0.00029735703359850895, "loss": 4.8474, "step": 13045 }, { "epoch": 0.26723731902606845, "grad_norm": 0.22987061738967896, "learning_rate": 0.0002973454531101028, "loss": 4.846, "step": 13050 }, { "epoch": 0.2673397088034731, "grad_norm": 0.24541215598583221, "learning_rate": 0.00029733384753293743, "loss": 4.8158, "step": 13055 }, { "epoch": 0.2674420985808777, "grad_norm": 0.23491857945919037, "learning_rate": 0.0002973222168689891, "loss": 4.8532, "step": 13060 }, { "epoch": 0.2675444883582823, "grad_norm": 0.2576743960380554, "learning_rate": 0.00029731056112023807, "loss": 4.8559, "step": 13065 }, { "epoch": 0.26764687813568694, "grad_norm": 0.23715195059776306, "learning_rate": 0.000297298880288669, "loss": 4.8553, "step": 13070 }, { "epoch": 0.26774926791309156, "grad_norm": 0.24965982139110565, "learning_rate": 0.00029728717437627083, "loss": 4.8552, "step": 13075 }, { "epoch": 0.2678516576904962, "grad_norm": 0.2617979645729065, "learning_rate": 0.00029727544338503664, "loss": 4.8221, "step": 13080 }, { "epoch": 0.2679540474679008, "grad_norm": 0.28162577748298645, "learning_rate": 0.00029726368731696406, "loss": 4.9248, "step": 13085 }, { "epoch": 0.2680564372453054, "grad_norm": 0.3132152259349823, "learning_rate": 0.00029725190617405474, "loss": 4.8982, "step": 13090 }, { "epoch": 0.26815882702271004, "grad_norm": 0.3432159423828125, "learning_rate": 0.0002972400999583146, "loss": 4.8901, "step": 13095 }, { "epoch": 0.26826121680011467, "grad_norm": 0.26496621966362, "learning_rate": 0.00029722826867175403, "loss": 4.828, "step": 13100 }, { "epoch": 0.2683636065775193, "grad_norm": 0.26306840777397156, "learning_rate": 0.00029721641231638746, "loss": 4.8419, "step": 13105 }, { "epoch": 0.2684659963549239, "grad_norm": 0.24219603836536407, "learning_rate": 0.00029720453089423376, "loss": 4.8359, "step": 13110 }, { "epoch": 0.26856838613232853, "grad_norm": 0.26120564341545105, "learning_rate": 0.000297192624407316, "loss": 4.8603, "step": 13115 }, { "epoch": 0.26867077590973315, "grad_norm": 0.2502914369106293, "learning_rate": 0.00029718069285766146, "loss": 4.8762, "step": 13120 }, { "epoch": 0.26877316568713777, "grad_norm": 0.522968053817749, "learning_rate": 0.0002971687362473018, "loss": 4.8406, "step": 13125 }, { "epoch": 0.2688755554645424, "grad_norm": 0.23913554847240448, "learning_rate": 0.00029715675457827283, "loss": 4.8191, "step": 13130 }, { "epoch": 0.26897794524194707, "grad_norm": 0.2351391315460205, "learning_rate": 0.00029714474785261474, "loss": 4.8869, "step": 13135 }, { "epoch": 0.2690803350193517, "grad_norm": 0.26714980602264404, "learning_rate": 0.00029713271607237194, "loss": 4.8483, "step": 13140 }, { "epoch": 0.2691827247967563, "grad_norm": 0.23896341025829315, "learning_rate": 0.0002971206592395931, "loss": 4.8974, "step": 13145 }, { "epoch": 0.26928511457416093, "grad_norm": 0.28090232610702515, "learning_rate": 0.00029710857735633107, "loss": 4.8072, "step": 13150 }, { "epoch": 0.26938750435156555, "grad_norm": 0.252164363861084, "learning_rate": 0.0002970964704246432, "loss": 4.8944, "step": 13155 }, { "epoch": 0.2694898941289702, "grad_norm": 0.2368669956922531, "learning_rate": 0.0002970843384465908, "loss": 4.8508, "step": 13160 }, { "epoch": 0.2695922839063748, "grad_norm": 0.24469216167926788, "learning_rate": 0.0002970721814242398, "loss": 4.8878, "step": 13165 }, { "epoch": 0.2696946736837794, "grad_norm": 0.22728322446346283, "learning_rate": 0.00029705999935966004, "loss": 4.8618, "step": 13170 }, { "epoch": 0.26979706346118404, "grad_norm": 0.2591262757778168, "learning_rate": 0.0002970477922549258, "loss": 4.8821, "step": 13175 }, { "epoch": 0.26989945323858866, "grad_norm": 0.24526582658290863, "learning_rate": 0.0002970355601121156, "loss": 4.8319, "step": 13180 }, { "epoch": 0.2700018430159933, "grad_norm": 0.2433377355337143, "learning_rate": 0.00029702330293331227, "loss": 4.8802, "step": 13185 }, { "epoch": 0.2701042327933979, "grad_norm": 0.25172290205955505, "learning_rate": 0.00029701102072060293, "loss": 4.845, "step": 13190 }, { "epoch": 0.2702066225708025, "grad_norm": 0.26094940304756165, "learning_rate": 0.0002969987134760787, "loss": 4.8879, "step": 13195 }, { "epoch": 0.27030901234820714, "grad_norm": 0.2812451124191284, "learning_rate": 0.0002969863812018353, "loss": 4.8625, "step": 13200 }, { "epoch": 0.27041140212561177, "grad_norm": 0.2290884107351303, "learning_rate": 0.0002969740238999726, "loss": 4.8672, "step": 13205 }, { "epoch": 0.2705137919030164, "grad_norm": 0.3153294622898102, "learning_rate": 0.0002969616415725945, "loss": 4.8075, "step": 13210 }, { "epoch": 0.270616181680421, "grad_norm": 0.2929977476596832, "learning_rate": 0.0002969492342218096, "loss": 4.8449, "step": 13215 }, { "epoch": 0.27071857145782563, "grad_norm": 0.23602008819580078, "learning_rate": 0.00029693680184973036, "loss": 4.9074, "step": 13220 }, { "epoch": 0.27082096123523025, "grad_norm": 0.25510868430137634, "learning_rate": 0.00029692434445847374, "loss": 4.8755, "step": 13225 }, { "epoch": 0.27092335101263487, "grad_norm": 0.24809250235557556, "learning_rate": 0.0002969118620501608, "loss": 4.8583, "step": 13230 }, { "epoch": 0.27102574079003955, "grad_norm": 1.9793117046356201, "learning_rate": 0.000296899354626917, "loss": 4.7931, "step": 13235 }, { "epoch": 0.27112813056744417, "grad_norm": 0.2557344138622284, "learning_rate": 0.0002968868221908721, "loss": 4.8424, "step": 13240 }, { "epoch": 0.2712305203448488, "grad_norm": 0.2346993088722229, "learning_rate": 0.0002968742647441598, "loss": 4.8449, "step": 13245 }, { "epoch": 0.2713329101222534, "grad_norm": 0.24267706274986267, "learning_rate": 0.00029686168228891844, "loss": 4.847, "step": 13250 }, { "epoch": 0.27143529989965803, "grad_norm": 0.2332083135843277, "learning_rate": 0.0002968490748272903, "loss": 4.9079, "step": 13255 }, { "epoch": 0.27153768967706265, "grad_norm": 0.2350085824728012, "learning_rate": 0.0002968364423614223, "loss": 4.8773, "step": 13260 }, { "epoch": 0.2716400794544673, "grad_norm": 0.22749336063861847, "learning_rate": 0.00029682378489346526, "loss": 4.8448, "step": 13265 }, { "epoch": 0.2717424692318719, "grad_norm": 0.25906381011009216, "learning_rate": 0.00029681110242557433, "loss": 4.822, "step": 13270 }, { "epoch": 0.2718448590092765, "grad_norm": 0.24409353733062744, "learning_rate": 0.00029679839495990915, "loss": 4.848, "step": 13275 }, { "epoch": 0.27194724878668114, "grad_norm": 0.24792829155921936, "learning_rate": 0.00029678566249863325, "loss": 4.8405, "step": 13280 }, { "epoch": 0.27204963856408576, "grad_norm": 0.2603752017021179, "learning_rate": 0.0002967729050439147, "loss": 4.8137, "step": 13285 }, { "epoch": 0.2721520283414904, "grad_norm": 0.2420881986618042, "learning_rate": 0.00029676012259792576, "loss": 4.8403, "step": 13290 }, { "epoch": 0.272254418118895, "grad_norm": 0.2657824456691742, "learning_rate": 0.00029674731516284285, "loss": 4.8592, "step": 13295 }, { "epoch": 0.2723568078962996, "grad_norm": 0.24368324875831604, "learning_rate": 0.0002967344827408468, "loss": 4.8381, "step": 13300 }, { "epoch": 0.27245919767370425, "grad_norm": 0.2560962438583374, "learning_rate": 0.0002967216253341225, "loss": 4.8776, "step": 13305 }, { "epoch": 0.27256158745110887, "grad_norm": 0.27904170751571655, "learning_rate": 0.0002967087429448593, "loss": 4.8394, "step": 13310 }, { "epoch": 0.2726639772285135, "grad_norm": 0.28567177057266235, "learning_rate": 0.0002966958355752506, "loss": 4.8572, "step": 13315 }, { "epoch": 0.2727663670059181, "grad_norm": 0.2559954524040222, "learning_rate": 0.00029668290322749436, "loss": 4.8638, "step": 13320 }, { "epoch": 0.27286875678332273, "grad_norm": 0.2415838986635208, "learning_rate": 0.00029666994590379236, "loss": 4.8609, "step": 13325 }, { "epoch": 0.27297114656072735, "grad_norm": 0.2440149486064911, "learning_rate": 0.000296656963606351, "loss": 4.7883, "step": 13330 }, { "epoch": 0.27307353633813203, "grad_norm": 0.23534487187862396, "learning_rate": 0.00029664395633738073, "loss": 4.8313, "step": 13335 }, { "epoch": 0.27317592611553665, "grad_norm": 0.2263362854719162, "learning_rate": 0.00029663092409909633, "loss": 4.8468, "step": 13340 }, { "epoch": 0.27327831589294127, "grad_norm": 0.28955426812171936, "learning_rate": 0.00029661786689371685, "loss": 4.7981, "step": 13345 }, { "epoch": 0.2733807056703459, "grad_norm": 0.2760336101055145, "learning_rate": 0.00029660478472346563, "loss": 4.8296, "step": 13350 }, { "epoch": 0.2734830954477505, "grad_norm": 0.26077723503112793, "learning_rate": 0.00029659167759057004, "loss": 4.8306, "step": 13355 }, { "epoch": 0.27358548522515513, "grad_norm": 0.24385151267051697, "learning_rate": 0.000296578545497262, "loss": 4.846, "step": 13360 }, { "epoch": 0.27368787500255976, "grad_norm": 0.2572381794452667, "learning_rate": 0.0002965653884457774, "loss": 4.8641, "step": 13365 }, { "epoch": 0.2737902647799644, "grad_norm": 0.2537418007850647, "learning_rate": 0.0002965522064383566, "loss": 4.8958, "step": 13370 }, { "epoch": 0.273892654557369, "grad_norm": 0.2534486651420593, "learning_rate": 0.00029653899947724413, "loss": 4.8643, "step": 13375 }, { "epoch": 0.2739950443347736, "grad_norm": 0.45812422037124634, "learning_rate": 0.0002965257675646887, "loss": 4.8877, "step": 13380 }, { "epoch": 0.27409743411217824, "grad_norm": 0.2593996226787567, "learning_rate": 0.00029651251070294334, "loss": 4.8917, "step": 13385 }, { "epoch": 0.27419982388958286, "grad_norm": 0.261886328458786, "learning_rate": 0.00029649922889426534, "loss": 4.8464, "step": 13390 }, { "epoch": 0.2743022136669875, "grad_norm": 0.32401710748672485, "learning_rate": 0.0002964859221409163, "loss": 4.8962, "step": 13395 }, { "epoch": 0.2744046034443921, "grad_norm": 0.276602178812027, "learning_rate": 0.00029647259044516183, "loss": 4.8667, "step": 13400 }, { "epoch": 0.2745069932217967, "grad_norm": 0.24835945665836334, "learning_rate": 0.000296459233809272, "loss": 4.8001, "step": 13405 }, { "epoch": 0.27460938299920135, "grad_norm": 0.24881890416145325, "learning_rate": 0.00029644585223552114, "loss": 4.8348, "step": 13410 }, { "epoch": 0.27471177277660597, "grad_norm": 0.2437090426683426, "learning_rate": 0.0002964324457261877, "loss": 4.7979, "step": 13415 }, { "epoch": 0.2748141625540106, "grad_norm": 0.23669248819351196, "learning_rate": 0.0002964190142835543, "loss": 4.8229, "step": 13420 }, { "epoch": 0.2749165523314152, "grad_norm": 0.26821815967559814, "learning_rate": 0.0002964055579099081, "loss": 4.8096, "step": 13425 }, { "epoch": 0.27501894210881983, "grad_norm": 0.2394394725561142, "learning_rate": 0.0002963920766075404, "loss": 4.8859, "step": 13430 }, { "epoch": 0.27512133188622445, "grad_norm": 0.25903841853141785, "learning_rate": 0.0002963785703787465, "loss": 4.8699, "step": 13435 }, { "epoch": 0.27522372166362913, "grad_norm": 0.2664482295513153, "learning_rate": 0.0002963650392258262, "loss": 4.8241, "step": 13440 }, { "epoch": 0.27532611144103375, "grad_norm": 0.31281593441963196, "learning_rate": 0.00029635148315108347, "loss": 4.852, "step": 13445 }, { "epoch": 0.27542850121843837, "grad_norm": 0.24482332170009613, "learning_rate": 0.00029633790215682654, "loss": 4.8072, "step": 13450 }, { "epoch": 0.275530890995843, "grad_norm": 0.2669447064399719, "learning_rate": 0.0002963242962453679, "loss": 4.8428, "step": 13455 }, { "epoch": 0.2756332807732476, "grad_norm": 0.22922296822071075, "learning_rate": 0.0002963106654190242, "loss": 4.8868, "step": 13460 }, { "epoch": 0.27573567055065223, "grad_norm": 0.2323535829782486, "learning_rate": 0.0002962970096801164, "loss": 4.8734, "step": 13465 }, { "epoch": 0.27583806032805686, "grad_norm": 0.22814953327178955, "learning_rate": 0.0002962833290309697, "loss": 4.8149, "step": 13470 }, { "epoch": 0.2759404501054615, "grad_norm": 0.26555827260017395, "learning_rate": 0.00029626962347391344, "loss": 4.8785, "step": 13475 }, { "epoch": 0.2760428398828661, "grad_norm": 0.24419274926185608, "learning_rate": 0.0002962558930112814, "loss": 4.8504, "step": 13480 }, { "epoch": 0.2761452296602707, "grad_norm": 0.3241846263408661, "learning_rate": 0.00029624213764541145, "loss": 4.9261, "step": 13485 }, { "epoch": 0.27624761943767534, "grad_norm": 0.3510860204696655, "learning_rate": 0.00029622835737864577, "loss": 4.7729, "step": 13490 }, { "epoch": 0.27635000921507996, "grad_norm": 0.23222967982292175, "learning_rate": 0.00029621455221333067, "loss": 4.8048, "step": 13495 }, { "epoch": 0.2764523989924846, "grad_norm": 0.23177790641784668, "learning_rate": 0.00029620072215181685, "loss": 4.8359, "step": 13500 }, { "epoch": 0.2765547887698892, "grad_norm": 0.25533327460289, "learning_rate": 0.00029618686719645916, "loss": 4.8698, "step": 13505 }, { "epoch": 0.2766571785472938, "grad_norm": 0.23750196397304535, "learning_rate": 0.00029617298734961665, "loss": 4.8783, "step": 13510 }, { "epoch": 0.27675956832469845, "grad_norm": 0.23804759979248047, "learning_rate": 0.0002961590826136528, "loss": 4.8569, "step": 13515 }, { "epoch": 0.27686195810210307, "grad_norm": 0.2607920169830322, "learning_rate": 0.00029614515299093507, "loss": 4.9458, "step": 13520 }, { "epoch": 0.2769643478795077, "grad_norm": 0.2700970470905304, "learning_rate": 0.0002961311984838353, "loss": 4.8696, "step": 13525 }, { "epoch": 0.2770667376569123, "grad_norm": 0.23598410189151764, "learning_rate": 0.0002961172190947296, "loss": 4.8557, "step": 13530 }, { "epoch": 0.27716912743431693, "grad_norm": 0.2305101603269577, "learning_rate": 0.0002961032148259982, "loss": 4.8504, "step": 13535 }, { "epoch": 0.2772715172117216, "grad_norm": 0.24045445024967194, "learning_rate": 0.0002960891856800257, "loss": 4.8541, "step": 13540 }, { "epoch": 0.27737390698912623, "grad_norm": 0.24124599993228912, "learning_rate": 0.00029607513165920077, "loss": 4.8321, "step": 13545 }, { "epoch": 0.27747629676653085, "grad_norm": 0.25752729177474976, "learning_rate": 0.0002960610527659165, "loss": 4.7887, "step": 13550 }, { "epoch": 0.27757868654393547, "grad_norm": 0.24083749949932098, "learning_rate": 0.0002960469490025701, "loss": 4.7622, "step": 13555 }, { "epoch": 0.2776810763213401, "grad_norm": 0.25222840905189514, "learning_rate": 0.00029603282037156305, "loss": 4.8629, "step": 13560 }, { "epoch": 0.2777834660987447, "grad_norm": 0.23185598850250244, "learning_rate": 0.00029601866687530107, "loss": 4.8439, "step": 13565 }, { "epoch": 0.27788585587614933, "grad_norm": 0.2794724404811859, "learning_rate": 0.000296004488516194, "loss": 4.8413, "step": 13570 }, { "epoch": 0.27798824565355396, "grad_norm": 0.25876951217651367, "learning_rate": 0.00029599028529665614, "loss": 4.8725, "step": 13575 }, { "epoch": 0.2780906354309586, "grad_norm": 0.25357869267463684, "learning_rate": 0.0002959760572191058, "loss": 4.8585, "step": 13580 }, { "epoch": 0.2781930252083632, "grad_norm": 0.2559994161128998, "learning_rate": 0.00029596180428596565, "loss": 4.8744, "step": 13585 }, { "epoch": 0.2782954149857678, "grad_norm": 0.24752305448055267, "learning_rate": 0.0002959475264996626, "loss": 4.8185, "step": 13590 }, { "epoch": 0.27839780476317244, "grad_norm": 0.2554483711719513, "learning_rate": 0.0002959332238626278, "loss": 4.8734, "step": 13595 }, { "epoch": 0.27850019454057706, "grad_norm": 0.23777826130390167, "learning_rate": 0.0002959188963772964, "loss": 4.8903, "step": 13600 }, { "epoch": 0.2786025843179817, "grad_norm": 0.23869380354881287, "learning_rate": 0.0002959045440461081, "loss": 4.795, "step": 13605 }, { "epoch": 0.2787049740953863, "grad_norm": 0.23788928985595703, "learning_rate": 0.0002958901668715067, "loss": 4.8753, "step": 13610 }, { "epoch": 0.2788073638727909, "grad_norm": 0.25508829951286316, "learning_rate": 0.0002958757648559402, "loss": 4.8589, "step": 13615 }, { "epoch": 0.27890975365019555, "grad_norm": 0.2538926303386688, "learning_rate": 0.0002958613380018609, "loss": 4.9016, "step": 13620 }, { "epoch": 0.27901214342760017, "grad_norm": 0.23998358845710754, "learning_rate": 0.00029584688631172513, "loss": 4.87, "step": 13625 }, { "epoch": 0.2791145332050048, "grad_norm": 0.2575380504131317, "learning_rate": 0.00029583240978799374, "loss": 4.7988, "step": 13630 }, { "epoch": 0.2792169229824094, "grad_norm": 0.24486133456230164, "learning_rate": 0.00029581790843313165, "loss": 4.9109, "step": 13635 }, { "epoch": 0.2793193127598141, "grad_norm": 0.2378109097480774, "learning_rate": 0.0002958033822496081, "loss": 4.8434, "step": 13640 }, { "epoch": 0.2794217025372187, "grad_norm": 0.24347257614135742, "learning_rate": 0.00029578883123989626, "loss": 4.7965, "step": 13645 }, { "epoch": 0.27952409231462333, "grad_norm": 0.2288026362657547, "learning_rate": 0.000295774255406474, "loss": 4.8634, "step": 13650 }, { "epoch": 0.27962648209202795, "grad_norm": 0.22999413311481476, "learning_rate": 0.00029575965475182306, "loss": 4.8546, "step": 13655 }, { "epoch": 0.27972887186943257, "grad_norm": 0.28599053621292114, "learning_rate": 0.00029574502927842954, "loss": 4.8263, "step": 13660 }, { "epoch": 0.2798312616468372, "grad_norm": 0.23187308013439178, "learning_rate": 0.0002957303789887838, "loss": 4.8318, "step": 13665 }, { "epoch": 0.2799336514242418, "grad_norm": 0.25390827655792236, "learning_rate": 0.0002957157038853802, "loss": 4.8723, "step": 13670 }, { "epoch": 0.28003604120164644, "grad_norm": 0.25755733251571655, "learning_rate": 0.0002957010039707177, "loss": 4.8037, "step": 13675 }, { "epoch": 0.28013843097905106, "grad_norm": 0.26187583804130554, "learning_rate": 0.0002956862792472991, "loss": 4.8296, "step": 13680 }, { "epoch": 0.2802408207564557, "grad_norm": 0.2727029025554657, "learning_rate": 0.0002956715297176317, "loss": 4.8459, "step": 13685 }, { "epoch": 0.2803432105338603, "grad_norm": 0.23636601865291595, "learning_rate": 0.00029565675538422695, "loss": 4.8011, "step": 13690 }, { "epoch": 0.2804456003112649, "grad_norm": 0.23167629539966583, "learning_rate": 0.0002956419562496004, "loss": 4.834, "step": 13695 }, { "epoch": 0.28054799008866954, "grad_norm": 0.2523535490036011, "learning_rate": 0.0002956271323162721, "loss": 4.8502, "step": 13700 }, { "epoch": 0.28065037986607416, "grad_norm": 0.23229660093784332, "learning_rate": 0.00029561228358676594, "loss": 4.8333, "step": 13705 }, { "epoch": 0.2807527696434788, "grad_norm": 0.24005243182182312, "learning_rate": 0.0002955974100636104, "loss": 4.8577, "step": 13710 }, { "epoch": 0.2808551594208834, "grad_norm": 0.2420342117547989, "learning_rate": 0.0002955825117493379, "loss": 4.8757, "step": 13715 }, { "epoch": 0.280957549198288, "grad_norm": 0.2549287676811218, "learning_rate": 0.0002955675886464853, "loss": 4.8719, "step": 13720 }, { "epoch": 0.28105993897569265, "grad_norm": 0.26122424006462097, "learning_rate": 0.00029555264075759345, "loss": 4.8487, "step": 13725 }, { "epoch": 0.28116232875309727, "grad_norm": 0.24591730535030365, "learning_rate": 0.0002955376680852077, "loss": 4.8564, "step": 13730 }, { "epoch": 0.2812647185305019, "grad_norm": 0.24601192772388458, "learning_rate": 0.00029552267063187744, "loss": 4.8877, "step": 13735 }, { "epoch": 0.28136710830790657, "grad_norm": 0.31291663646698, "learning_rate": 0.00029550764840015624, "loss": 4.7991, "step": 13740 }, { "epoch": 0.2814694980853112, "grad_norm": 0.28130653500556946, "learning_rate": 0.000295492601392602, "loss": 4.8278, "step": 13745 }, { "epoch": 0.2815718878627158, "grad_norm": 0.23157049715518951, "learning_rate": 0.00029547752961177684, "loss": 4.9106, "step": 13750 }, { "epoch": 0.28167427764012043, "grad_norm": 0.25931575894355774, "learning_rate": 0.000295462433060247, "loss": 4.8456, "step": 13755 }, { "epoch": 0.28177666741752505, "grad_norm": 0.2281424105167389, "learning_rate": 0.00029544731174058295, "loss": 4.8622, "step": 13760 }, { "epoch": 0.28187905719492967, "grad_norm": 0.2262158840894699, "learning_rate": 0.0002954321656553596, "loss": 4.8308, "step": 13765 }, { "epoch": 0.2819814469723343, "grad_norm": 0.23673996329307556, "learning_rate": 0.0002954169948071557, "loss": 4.828, "step": 13770 }, { "epoch": 0.2820838367497389, "grad_norm": 0.2535232901573181, "learning_rate": 0.00029540179919855454, "loss": 4.8334, "step": 13775 }, { "epoch": 0.28218622652714354, "grad_norm": 0.25045135617256165, "learning_rate": 0.0002953865788321434, "loss": 4.8997, "step": 13780 }, { "epoch": 0.28228861630454816, "grad_norm": 0.24860410392284393, "learning_rate": 0.00029537133371051403, "loss": 4.8624, "step": 13785 }, { "epoch": 0.2823910060819528, "grad_norm": 0.240471750497818, "learning_rate": 0.00029535606383626216, "loss": 4.8817, "step": 13790 }, { "epoch": 0.2824933958593574, "grad_norm": 0.2637968957424164, "learning_rate": 0.0002953407692119878, "loss": 4.8445, "step": 13795 }, { "epoch": 0.282595785636762, "grad_norm": 0.24042394757270813, "learning_rate": 0.00029532544984029515, "loss": 4.8302, "step": 13800 }, { "epoch": 0.28269817541416664, "grad_norm": 0.2555772364139557, "learning_rate": 0.00029531010572379275, "loss": 4.8607, "step": 13805 }, { "epoch": 0.28280056519157126, "grad_norm": 0.24765047430992126, "learning_rate": 0.00029529473686509323, "loss": 4.8576, "step": 13810 }, { "epoch": 0.2829029549689759, "grad_norm": 0.2348913550376892, "learning_rate": 0.0002952793432668135, "loss": 4.8319, "step": 13815 }, { "epoch": 0.2830053447463805, "grad_norm": 0.2364525943994522, "learning_rate": 0.00029526392493157456, "loss": 4.8501, "step": 13820 }, { "epoch": 0.2831077345237851, "grad_norm": 0.2292618453502655, "learning_rate": 0.00029524848186200187, "loss": 4.8406, "step": 13825 }, { "epoch": 0.28321012430118975, "grad_norm": 0.2529616057872772, "learning_rate": 0.00029523301406072486, "loss": 4.9217, "step": 13830 }, { "epoch": 0.28331251407859437, "grad_norm": 0.2316058874130249, "learning_rate": 0.0002952175215303773, "loss": 4.8789, "step": 13835 }, { "epoch": 0.283414903855999, "grad_norm": 0.28541165590286255, "learning_rate": 0.000295202004273597, "loss": 4.786, "step": 13840 }, { "epoch": 0.28351729363340367, "grad_norm": 0.22854579985141754, "learning_rate": 0.00029518646229302625, "loss": 4.8737, "step": 13845 }, { "epoch": 0.2836196834108083, "grad_norm": 0.2418985515832901, "learning_rate": 0.0002951708955913113, "loss": 4.8302, "step": 13850 }, { "epoch": 0.2837220731882129, "grad_norm": 0.2630443871021271, "learning_rate": 0.0002951553041711029, "loss": 4.8347, "step": 13855 }, { "epoch": 0.28382446296561753, "grad_norm": 0.2776937782764435, "learning_rate": 0.00029513968803505567, "loss": 4.8634, "step": 13860 }, { "epoch": 0.28392685274302215, "grad_norm": 0.26442214846611023, "learning_rate": 0.0002951240471858286, "loss": 4.8737, "step": 13865 }, { "epoch": 0.2840292425204268, "grad_norm": 0.35495322942733765, "learning_rate": 0.000295108381626085, "loss": 4.7939, "step": 13870 }, { "epoch": 0.2841316322978314, "grad_norm": 0.22926858067512512, "learning_rate": 0.0002950926913584921, "loss": 4.8723, "step": 13875 }, { "epoch": 0.284234022075236, "grad_norm": 0.22714608907699585, "learning_rate": 0.00029507697638572164, "loss": 4.8262, "step": 13880 }, { "epoch": 0.28433641185264064, "grad_norm": 0.2394273579120636, "learning_rate": 0.00029506123671044936, "loss": 4.8742, "step": 13885 }, { "epoch": 0.28443880163004526, "grad_norm": 0.2603166103363037, "learning_rate": 0.0002950454723353553, "loss": 4.8836, "step": 13890 }, { "epoch": 0.2845411914074499, "grad_norm": 0.2738681435585022, "learning_rate": 0.0002950296832631238, "loss": 4.8528, "step": 13895 }, { "epoch": 0.2846435811848545, "grad_norm": 0.25415581464767456, "learning_rate": 0.0002950138694964431, "loss": 4.7952, "step": 13900 }, { "epoch": 0.2847459709622591, "grad_norm": 0.23789221048355103, "learning_rate": 0.00029499803103800597, "loss": 4.8537, "step": 13905 }, { "epoch": 0.28484836073966374, "grad_norm": 0.28230541944503784, "learning_rate": 0.0002949821678905092, "loss": 4.8322, "step": 13910 }, { "epoch": 0.28495075051706836, "grad_norm": 0.28278619050979614, "learning_rate": 0.00029496628005665383, "loss": 4.8186, "step": 13915 }, { "epoch": 0.285053140294473, "grad_norm": 0.2463081181049347, "learning_rate": 0.0002949503675391452, "loss": 4.8754, "step": 13920 }, { "epoch": 0.2851555300718776, "grad_norm": 0.2316562682390213, "learning_rate": 0.0002949344303406926, "loss": 4.8454, "step": 13925 }, { "epoch": 0.2852579198492822, "grad_norm": 0.2407759726047516, "learning_rate": 0.0002949184684640098, "loss": 4.8185, "step": 13930 }, { "epoch": 0.28536030962668685, "grad_norm": 0.34712404012680054, "learning_rate": 0.00029490248191181464, "loss": 4.863, "step": 13935 }, { "epoch": 0.28546269940409147, "grad_norm": 0.25207993388175964, "learning_rate": 0.00029488647068682915, "loss": 4.8482, "step": 13940 }, { "epoch": 0.28556508918149615, "grad_norm": 0.23798006772994995, "learning_rate": 0.0002948704347917796, "loss": 4.8646, "step": 13945 }, { "epoch": 0.28566747895890077, "grad_norm": 0.24008354544639587, "learning_rate": 0.00029485437422939643, "loss": 4.9122, "step": 13950 }, { "epoch": 0.2857698687363054, "grad_norm": 0.25269708037376404, "learning_rate": 0.00029483828900241435, "loss": 4.8838, "step": 13955 }, { "epoch": 0.28587225851371, "grad_norm": 0.26204586029052734, "learning_rate": 0.0002948221791135722, "loss": 4.8624, "step": 13960 }, { "epoch": 0.28597464829111463, "grad_norm": 0.2489648014307022, "learning_rate": 0.00029480604456561297, "loss": 4.8598, "step": 13965 }, { "epoch": 0.28607703806851925, "grad_norm": 0.256124883890152, "learning_rate": 0.000294789885361284, "loss": 4.8417, "step": 13970 }, { "epoch": 0.2861794278459239, "grad_norm": 0.25204986333847046, "learning_rate": 0.0002947737015033367, "loss": 4.8558, "step": 13975 }, { "epoch": 0.2862818176233285, "grad_norm": 0.23605899512767792, "learning_rate": 0.0002947574929945268, "loss": 4.8755, "step": 13980 }, { "epoch": 0.2863842074007331, "grad_norm": 0.2371349036693573, "learning_rate": 0.00029474125983761405, "loss": 4.8112, "step": 13985 }, { "epoch": 0.28648659717813774, "grad_norm": 0.2582697868347168, "learning_rate": 0.0002947250020353625, "loss": 4.7941, "step": 13990 }, { "epoch": 0.28658898695554236, "grad_norm": 0.25083622336387634, "learning_rate": 0.0002947087195905405, "loss": 4.8397, "step": 13995 }, { "epoch": 0.286691376732947, "grad_norm": 0.2635503113269806, "learning_rate": 0.0002946924125059204, "loss": 4.8535, "step": 14000 }, { "epoch": 0.286691376732947, "eval_loss": 4.864429950714111, "eval_runtime": 17.2602, "eval_samples_per_second": 33.719, "eval_steps_per_second": 4.229, "step": 14000 }, { "epoch": 0.2867937665103516, "grad_norm": 0.26862239837646484, "learning_rate": 0.00029467608078427885, "loss": 4.8708, "step": 14005 }, { "epoch": 0.2868961562877562, "grad_norm": 0.25329339504241943, "learning_rate": 0.0002946597244283967, "loss": 4.8642, "step": 14010 }, { "epoch": 0.28699854606516084, "grad_norm": 0.24351774156093597, "learning_rate": 0.00029464334344105894, "loss": 4.8595, "step": 14015 }, { "epoch": 0.28710093584256546, "grad_norm": 0.23963317275047302, "learning_rate": 0.0002946269378250548, "loss": 4.8637, "step": 14020 }, { "epoch": 0.2872033256199701, "grad_norm": 0.29429295659065247, "learning_rate": 0.0002946105075831777, "loss": 4.8212, "step": 14025 }, { "epoch": 0.2873057153973747, "grad_norm": 0.28569620847702026, "learning_rate": 0.00029459405271822526, "loss": 4.8509, "step": 14030 }, { "epoch": 0.2874081051747793, "grad_norm": 0.23280423879623413, "learning_rate": 0.00029457757323299923, "loss": 4.866, "step": 14035 }, { "epoch": 0.28751049495218395, "grad_norm": 0.2550869584083557, "learning_rate": 0.00029456106913030563, "loss": 4.8956, "step": 14040 }, { "epoch": 0.2876128847295886, "grad_norm": 0.3817024230957031, "learning_rate": 0.0002945445404129547, "loss": 4.8865, "step": 14045 }, { "epoch": 0.28771527450699325, "grad_norm": 0.298347145318985, "learning_rate": 0.00029452798708376075, "loss": 4.8385, "step": 14050 }, { "epoch": 0.28781766428439787, "grad_norm": 0.24775293469429016, "learning_rate": 0.0002945114091455423, "loss": 4.808, "step": 14055 }, { "epoch": 0.2879200540618025, "grad_norm": 0.24405379593372345, "learning_rate": 0.0002944948066011222, "loss": 4.82, "step": 14060 }, { "epoch": 0.2880224438392071, "grad_norm": 0.24029859900474548, "learning_rate": 0.0002944781794533273, "loss": 4.8412, "step": 14065 }, { "epoch": 0.28812483361661173, "grad_norm": 0.23408369719982147, "learning_rate": 0.00029446152770498885, "loss": 4.8452, "step": 14070 }, { "epoch": 0.28822722339401635, "grad_norm": 0.22656705975532532, "learning_rate": 0.00029444485135894205, "loss": 4.8105, "step": 14075 }, { "epoch": 0.288329613171421, "grad_norm": 0.2891269028186798, "learning_rate": 0.0002944281504180264, "loss": 4.8055, "step": 14080 }, { "epoch": 0.2884320029488256, "grad_norm": 0.23477844893932343, "learning_rate": 0.00029441142488508573, "loss": 4.8317, "step": 14085 }, { "epoch": 0.2885343927262302, "grad_norm": 0.2410440742969513, "learning_rate": 0.00029439467476296785, "loss": 4.8635, "step": 14090 }, { "epoch": 0.28863678250363484, "grad_norm": 0.2463831752538681, "learning_rate": 0.0002943779000545248, "loss": 4.8682, "step": 14095 }, { "epoch": 0.28873917228103946, "grad_norm": 0.2598053216934204, "learning_rate": 0.0002943611007626129, "loss": 4.8892, "step": 14100 }, { "epoch": 0.2888415620584441, "grad_norm": 0.260195255279541, "learning_rate": 0.0002943442768900926, "loss": 4.8778, "step": 14105 }, { "epoch": 0.2889439518358487, "grad_norm": 0.2583038806915283, "learning_rate": 0.00029432742843982847, "loss": 4.7925, "step": 14110 }, { "epoch": 0.2890463416132533, "grad_norm": 0.23612385988235474, "learning_rate": 0.00029431055541468933, "loss": 4.8462, "step": 14115 }, { "epoch": 0.28914873139065794, "grad_norm": 0.25369763374328613, "learning_rate": 0.0002942936578175482, "loss": 4.8755, "step": 14120 }, { "epoch": 0.28925112116806256, "grad_norm": 0.23460794985294342, "learning_rate": 0.0002942767356512823, "loss": 4.7736, "step": 14125 }, { "epoch": 0.2893535109454672, "grad_norm": 0.2332044243812561, "learning_rate": 0.00029425978891877287, "loss": 4.8565, "step": 14130 }, { "epoch": 0.2894559007228718, "grad_norm": 0.23149293661117554, "learning_rate": 0.0002942428176229056, "loss": 4.8087, "step": 14135 }, { "epoch": 0.2895582905002764, "grad_norm": 0.27413442730903625, "learning_rate": 0.0002942258217665701, "loss": 4.8628, "step": 14140 }, { "epoch": 0.2896606802776811, "grad_norm": 0.2448301911354065, "learning_rate": 0.0002942088013526604, "loss": 4.8488, "step": 14145 }, { "epoch": 0.2897630700550857, "grad_norm": 0.24141919612884521, "learning_rate": 0.0002941917563840746, "loss": 4.8163, "step": 14150 }, { "epoch": 0.28986545983249035, "grad_norm": 0.23401378095149994, "learning_rate": 0.00029417468686371485, "loss": 4.872, "step": 14155 }, { "epoch": 0.28996784960989497, "grad_norm": 0.23848387598991394, "learning_rate": 0.0002941575927944877, "loss": 4.8367, "step": 14160 }, { "epoch": 0.2900702393872996, "grad_norm": 0.2547352910041809, "learning_rate": 0.00029414047417930376, "loss": 4.8486, "step": 14165 }, { "epoch": 0.2901726291647042, "grad_norm": 0.2544548213481903, "learning_rate": 0.0002941233310210778, "loss": 4.8464, "step": 14170 }, { "epoch": 0.29027501894210883, "grad_norm": 0.24377994239330292, "learning_rate": 0.00029410616332272896, "loss": 4.8373, "step": 14175 }, { "epoch": 0.29037740871951345, "grad_norm": 0.2727868854999542, "learning_rate": 0.0002940889710871802, "loss": 4.836, "step": 14180 }, { "epoch": 0.2904797984969181, "grad_norm": 0.24912340939044952, "learning_rate": 0.00029407175431735903, "loss": 4.8477, "step": 14185 }, { "epoch": 0.2905821882743227, "grad_norm": 0.24339748919010162, "learning_rate": 0.00029405451301619693, "loss": 4.7966, "step": 14190 }, { "epoch": 0.2906845780517273, "grad_norm": 0.26348021626472473, "learning_rate": 0.00029403724718662965, "loss": 4.8565, "step": 14195 }, { "epoch": 0.29078696782913194, "grad_norm": 0.24618175625801086, "learning_rate": 0.000294019956831597, "loss": 4.8437, "step": 14200 }, { "epoch": 0.29088935760653656, "grad_norm": 0.24233630299568176, "learning_rate": 0.00029400264195404306, "loss": 4.8455, "step": 14205 }, { "epoch": 0.2909917473839412, "grad_norm": 0.2411973625421524, "learning_rate": 0.00029398530255691607, "loss": 4.8245, "step": 14210 }, { "epoch": 0.2910941371613458, "grad_norm": 0.27775901556015015, "learning_rate": 0.00029396793864316847, "loss": 4.8144, "step": 14215 }, { "epoch": 0.2911965269387504, "grad_norm": 0.2820267081260681, "learning_rate": 0.00029395055021575676, "loss": 4.8259, "step": 14220 }, { "epoch": 0.29129891671615504, "grad_norm": 0.26808738708496094, "learning_rate": 0.0002939331372776418, "loss": 4.8684, "step": 14225 }, { "epoch": 0.29140130649355966, "grad_norm": 0.26385441422462463, "learning_rate": 0.00029391569983178845, "loss": 4.8392, "step": 14230 }, { "epoch": 0.2915036962709643, "grad_norm": 0.2610601782798767, "learning_rate": 0.0002938982378811658, "loss": 4.8455, "step": 14235 }, { "epoch": 0.2916060860483689, "grad_norm": 0.2848125100135803, "learning_rate": 0.0002938807514287472, "loss": 4.8184, "step": 14240 }, { "epoch": 0.2917084758257735, "grad_norm": 0.2858230471611023, "learning_rate": 0.0002938632404775101, "loss": 4.7833, "step": 14245 }, { "epoch": 0.2918108656031782, "grad_norm": 0.2503651976585388, "learning_rate": 0.000293845705030436, "loss": 4.8225, "step": 14250 }, { "epoch": 0.2919132553805828, "grad_norm": 0.22477854788303375, "learning_rate": 0.0002938281450905108, "loss": 4.8776, "step": 14255 }, { "epoch": 0.29201564515798745, "grad_norm": 0.2450319230556488, "learning_rate": 0.0002938105606607244, "loss": 4.8536, "step": 14260 }, { "epoch": 0.29211803493539207, "grad_norm": 0.2509637773036957, "learning_rate": 0.000293792951744071, "loss": 4.8472, "step": 14265 }, { "epoch": 0.2922204247127967, "grad_norm": 0.24204738438129425, "learning_rate": 0.00029377531834354887, "loss": 4.8526, "step": 14270 }, { "epoch": 0.2923228144902013, "grad_norm": 0.24912716448307037, "learning_rate": 0.0002937576604621604, "loss": 4.8502, "step": 14275 }, { "epoch": 0.29242520426760593, "grad_norm": 0.25580674409866333, "learning_rate": 0.0002937399781029123, "loss": 4.861, "step": 14280 }, { "epoch": 0.29252759404501055, "grad_norm": 0.2348536103963852, "learning_rate": 0.0002937222712688154, "loss": 4.8442, "step": 14285 }, { "epoch": 0.2926299838224152, "grad_norm": 0.2764236330986023, "learning_rate": 0.0002937045399628847, "loss": 4.8065, "step": 14290 }, { "epoch": 0.2927323735998198, "grad_norm": 0.3763650357723236, "learning_rate": 0.0002936867841881392, "loss": 4.822, "step": 14295 }, { "epoch": 0.2928347633772244, "grad_norm": 0.22743472456932068, "learning_rate": 0.0002936690039476023, "loss": 4.8379, "step": 14300 }, { "epoch": 0.29293715315462904, "grad_norm": 0.23576591908931732, "learning_rate": 0.0002936511992443015, "loss": 4.8546, "step": 14305 }, { "epoch": 0.29303954293203366, "grad_norm": 0.2846321761608124, "learning_rate": 0.0002936333700812684, "loss": 4.8515, "step": 14310 }, { "epoch": 0.2931419327094383, "grad_norm": 0.2503238022327423, "learning_rate": 0.00029361551646153874, "loss": 4.8576, "step": 14315 }, { "epoch": 0.2932443224868429, "grad_norm": 0.2515864372253418, "learning_rate": 0.00029359763838815255, "loss": 4.8107, "step": 14320 }, { "epoch": 0.2933467122642475, "grad_norm": 0.25199994444847107, "learning_rate": 0.00029357973586415393, "loss": 4.8071, "step": 14325 }, { "epoch": 0.29344910204165214, "grad_norm": 0.24264228343963623, "learning_rate": 0.00029356180889259127, "loss": 4.7938, "step": 14330 }, { "epoch": 0.29355149181905676, "grad_norm": 0.2395656406879425, "learning_rate": 0.0002935438574765169, "loss": 4.8557, "step": 14335 }, { "epoch": 0.2936538815964614, "grad_norm": 0.2653576731681824, "learning_rate": 0.00029352588161898755, "loss": 4.8134, "step": 14340 }, { "epoch": 0.293756271373866, "grad_norm": 0.2938157021999359, "learning_rate": 0.0002935078813230639, "loss": 4.8175, "step": 14345 }, { "epoch": 0.2938586611512707, "grad_norm": 0.2888815999031067, "learning_rate": 0.0002934898565918109, "loss": 4.7882, "step": 14350 }, { "epoch": 0.2939610509286753, "grad_norm": 0.26561999320983887, "learning_rate": 0.00029347180742829773, "loss": 4.8123, "step": 14355 }, { "epoch": 0.2940634407060799, "grad_norm": 0.23468156158924103, "learning_rate": 0.00029345373383559754, "loss": 4.8979, "step": 14360 }, { "epoch": 0.29416583048348455, "grad_norm": 0.23520389199256897, "learning_rate": 0.0002934356358167879, "loss": 4.8703, "step": 14365 }, { "epoch": 0.29426822026088917, "grad_norm": 0.26250308752059937, "learning_rate": 0.0002934175133749502, "loss": 4.8335, "step": 14370 }, { "epoch": 0.2943706100382938, "grad_norm": 0.24580250680446625, "learning_rate": 0.00029339936651317036, "loss": 4.8437, "step": 14375 }, { "epoch": 0.2944729998156984, "grad_norm": 0.2702936828136444, "learning_rate": 0.0002933811952345382, "loss": 4.8213, "step": 14380 }, { "epoch": 0.29457538959310303, "grad_norm": 0.2317942976951599, "learning_rate": 0.00029336299954214775, "loss": 4.7984, "step": 14385 }, { "epoch": 0.29467777937050765, "grad_norm": 0.24207837879657745, "learning_rate": 0.00029334477943909727, "loss": 4.8218, "step": 14390 }, { "epoch": 0.2947801691479123, "grad_norm": 0.2444547861814499, "learning_rate": 0.00029332653492848904, "loss": 4.7892, "step": 14395 }, { "epoch": 0.2948825589253169, "grad_norm": 0.24976152181625366, "learning_rate": 0.0002933082660134297, "loss": 4.8267, "step": 14400 }, { "epoch": 0.2949849487027215, "grad_norm": 0.25552061200141907, "learning_rate": 0.00029328997269702987, "loss": 4.8644, "step": 14405 }, { "epoch": 0.29508733848012614, "grad_norm": 0.23503315448760986, "learning_rate": 0.0002932716549824044, "loss": 4.8656, "step": 14410 }, { "epoch": 0.29518972825753076, "grad_norm": 0.23296082019805908, "learning_rate": 0.00029325331287267226, "loss": 4.8821, "step": 14415 }, { "epoch": 0.2952921180349354, "grad_norm": 0.23490382730960846, "learning_rate": 0.0002932349463709566, "loss": 4.8248, "step": 14420 }, { "epoch": 0.29539450781234, "grad_norm": 0.240232452750206, "learning_rate": 0.00029321655548038473, "loss": 4.8924, "step": 14425 }, { "epoch": 0.2954968975897446, "grad_norm": 0.2854427397251129, "learning_rate": 0.000293198140204088, "loss": 4.8129, "step": 14430 }, { "epoch": 0.29559928736714924, "grad_norm": 0.24002832174301147, "learning_rate": 0.0002931797005452022, "loss": 4.8113, "step": 14435 }, { "epoch": 0.29570167714455386, "grad_norm": 0.33993467688560486, "learning_rate": 0.00029316123650686694, "loss": 4.8248, "step": 14440 }, { "epoch": 0.2958040669219585, "grad_norm": 0.23282067477703094, "learning_rate": 0.00029314274809222617, "loss": 4.8296, "step": 14445 }, { "epoch": 0.29590645669936316, "grad_norm": 0.359872967004776, "learning_rate": 0.00029312423530442793, "loss": 4.8899, "step": 14450 }, { "epoch": 0.2960088464767678, "grad_norm": 0.23167221248149872, "learning_rate": 0.00029310569814662443, "loss": 4.8045, "step": 14455 }, { "epoch": 0.2961112362541724, "grad_norm": 0.25053495168685913, "learning_rate": 0.00029308713662197196, "loss": 4.7976, "step": 14460 }, { "epoch": 0.296213626031577, "grad_norm": 0.22132182121276855, "learning_rate": 0.0002930685507336311, "loss": 4.8098, "step": 14465 }, { "epoch": 0.29631601580898165, "grad_norm": 0.2549269497394562, "learning_rate": 0.00029304994048476654, "loss": 4.8825, "step": 14470 }, { "epoch": 0.29641840558638627, "grad_norm": 0.21993915736675262, "learning_rate": 0.00029303130587854696, "loss": 4.8111, "step": 14475 }, { "epoch": 0.2965207953637909, "grad_norm": 0.24117755889892578, "learning_rate": 0.0002930126469181454, "loss": 4.8284, "step": 14480 }, { "epoch": 0.2966231851411955, "grad_norm": 0.2512553632259369, "learning_rate": 0.0002929939636067389, "loss": 4.7943, "step": 14485 }, { "epoch": 0.29672557491860013, "grad_norm": 0.4628860652446747, "learning_rate": 0.00029297525594750874, "loss": 4.8659, "step": 14490 }, { "epoch": 0.29682796469600475, "grad_norm": 0.27421388030052185, "learning_rate": 0.00029295652394364024, "loss": 4.7795, "step": 14495 }, { "epoch": 0.2969303544734094, "grad_norm": 0.23088233172893524, "learning_rate": 0.000292937767598323, "loss": 4.8242, "step": 14500 }, { "epoch": 0.297032744250814, "grad_norm": 0.2422669231891632, "learning_rate": 0.0002929189869147507, "loss": 4.8384, "step": 14505 }, { "epoch": 0.2971351340282186, "grad_norm": 0.24192313849925995, "learning_rate": 0.00029290018189612107, "loss": 4.8465, "step": 14510 }, { "epoch": 0.29723752380562324, "grad_norm": 0.2521132230758667, "learning_rate": 0.0002928813525456362, "loss": 4.8136, "step": 14515 }, { "epoch": 0.29733991358302786, "grad_norm": 0.2397879958152771, "learning_rate": 0.000292862498866502, "loss": 4.8538, "step": 14520 }, { "epoch": 0.2974423033604325, "grad_norm": 0.2553879916667938, "learning_rate": 0.00029284362086192897, "loss": 4.8258, "step": 14525 }, { "epoch": 0.2975446931378371, "grad_norm": 0.2838282287120819, "learning_rate": 0.0002928247185351313, "loss": 4.881, "step": 14530 }, { "epoch": 0.2976470829152417, "grad_norm": 0.24460811913013458, "learning_rate": 0.0002928057918893276, "loss": 4.8377, "step": 14535 }, { "epoch": 0.29774947269264634, "grad_norm": 0.2518077492713928, "learning_rate": 0.00029278684092774054, "loss": 4.8534, "step": 14540 }, { "epoch": 0.29785186247005097, "grad_norm": 0.32397088408470154, "learning_rate": 0.0002927678656535969, "loss": 4.8601, "step": 14545 }, { "epoch": 0.2979542522474556, "grad_norm": 0.25878608226776123, "learning_rate": 0.00029274886607012773, "loss": 4.8296, "step": 14550 }, { "epoch": 0.29805664202486026, "grad_norm": 0.2423200160264969, "learning_rate": 0.000292729842180568, "loss": 4.8986, "step": 14555 }, { "epoch": 0.2981590318022649, "grad_norm": 0.23571667075157166, "learning_rate": 0.00029271079398815697, "loss": 4.8347, "step": 14560 }, { "epoch": 0.2982614215796695, "grad_norm": 0.22638769447803497, "learning_rate": 0.00029269172149613804, "loss": 4.8315, "step": 14565 }, { "epoch": 0.2983638113570741, "grad_norm": 0.29264798760414124, "learning_rate": 0.00029267262470775873, "loss": 4.7274, "step": 14570 }, { "epoch": 0.29846620113447875, "grad_norm": 0.25082406401634216, "learning_rate": 0.0002926535036262706, "loss": 4.8894, "step": 14575 }, { "epoch": 0.29856859091188337, "grad_norm": 0.2945980131626129, "learning_rate": 0.0002926343582549295, "loss": 4.7709, "step": 14580 }, { "epoch": 0.298670980689288, "grad_norm": 0.253180593252182, "learning_rate": 0.00029261518859699536, "loss": 4.8286, "step": 14585 }, { "epoch": 0.2987733704666926, "grad_norm": 0.250714510679245, "learning_rate": 0.0002925959946557322, "loss": 4.8418, "step": 14590 }, { "epoch": 0.29887576024409723, "grad_norm": 0.2238171398639679, "learning_rate": 0.0002925767764344082, "loss": 4.8599, "step": 14595 }, { "epoch": 0.29897815002150185, "grad_norm": 0.2589882016181946, "learning_rate": 0.00029255753393629566, "loss": 4.8653, "step": 14600 }, { "epoch": 0.2990805397989065, "grad_norm": 0.23701760172843933, "learning_rate": 0.0002925382671646711, "loss": 4.828, "step": 14605 }, { "epoch": 0.2991829295763111, "grad_norm": 0.2523413896560669, "learning_rate": 0.00029251897612281503, "loss": 4.7975, "step": 14610 }, { "epoch": 0.2992853193537157, "grad_norm": 0.2446378916501999, "learning_rate": 0.00029249966081401217, "loss": 4.8171, "step": 14615 }, { "epoch": 0.29938770913112034, "grad_norm": 0.24434706568717957, "learning_rate": 0.00029248032124155144, "loss": 4.8778, "step": 14620 }, { "epoch": 0.29949009890852496, "grad_norm": 0.2354562133550644, "learning_rate": 0.0002924609574087258, "loss": 4.8285, "step": 14625 }, { "epoch": 0.2995924886859296, "grad_norm": 0.26459982991218567, "learning_rate": 0.00029244156931883226, "loss": 4.8441, "step": 14630 }, { "epoch": 0.2996948784633342, "grad_norm": 0.23767104744911194, "learning_rate": 0.0002924221569751722, "loss": 4.8784, "step": 14635 }, { "epoch": 0.2997972682407388, "grad_norm": 0.22710983455181122, "learning_rate": 0.0002924027203810509, "loss": 4.8236, "step": 14640 }, { "epoch": 0.29989965801814344, "grad_norm": 0.27179643511772156, "learning_rate": 0.00029238325953977797, "loss": 4.8766, "step": 14645 }, { "epoch": 0.30000204779554807, "grad_norm": 0.2793235778808594, "learning_rate": 0.0002923637744546669, "loss": 4.831, "step": 14650 }, { "epoch": 0.30010443757295274, "grad_norm": 0.23421184718608856, "learning_rate": 0.0002923442651290356, "loss": 4.8611, "step": 14655 }, { "epoch": 0.30020682735035736, "grad_norm": 0.25046485662460327, "learning_rate": 0.0002923247315662058, "loss": 4.8713, "step": 14660 }, { "epoch": 0.300309217127762, "grad_norm": 0.24280314147472382, "learning_rate": 0.0002923051737695036, "loss": 4.7816, "step": 14665 }, { "epoch": 0.3004116069051666, "grad_norm": 0.24065694212913513, "learning_rate": 0.0002922855917422591, "loss": 4.8155, "step": 14670 }, { "epoch": 0.3005139966825712, "grad_norm": 0.23348140716552734, "learning_rate": 0.0002922659854878067, "loss": 4.8206, "step": 14675 }, { "epoch": 0.30061638645997585, "grad_norm": 0.23345831036567688, "learning_rate": 0.0002922463550094846, "loss": 4.7384, "step": 14680 }, { "epoch": 0.30071877623738047, "grad_norm": 0.23435938358306885, "learning_rate": 0.0002922267003106354, "loss": 4.8781, "step": 14685 }, { "epoch": 0.3008211660147851, "grad_norm": 0.2535153925418854, "learning_rate": 0.00029220702139460576, "loss": 4.8395, "step": 14690 }, { "epoch": 0.3009235557921897, "grad_norm": 0.23041364550590515, "learning_rate": 0.0002921873182647464, "loss": 4.8476, "step": 14695 }, { "epoch": 0.30102594556959433, "grad_norm": 0.2340639978647232, "learning_rate": 0.00029216759092441224, "loss": 4.8033, "step": 14700 }, { "epoch": 0.30112833534699895, "grad_norm": 0.23467111587524414, "learning_rate": 0.00029214783937696223, "loss": 4.8225, "step": 14705 }, { "epoch": 0.3012307251244036, "grad_norm": 0.2542557120323181, "learning_rate": 0.00029212806362575956, "loss": 4.8653, "step": 14710 }, { "epoch": 0.3013331149018082, "grad_norm": 0.25412076711654663, "learning_rate": 0.00029210826367417143, "loss": 4.8282, "step": 14715 }, { "epoch": 0.3014355046792128, "grad_norm": 0.23797860741615295, "learning_rate": 0.0002920884395255693, "loss": 4.8049, "step": 14720 }, { "epoch": 0.30153789445661744, "grad_norm": 0.2716714143753052, "learning_rate": 0.0002920685911833286, "loss": 4.8646, "step": 14725 }, { "epoch": 0.30164028423402206, "grad_norm": 0.23290424048900604, "learning_rate": 0.0002920487186508289, "loss": 4.8195, "step": 14730 }, { "epoch": 0.3017426740114267, "grad_norm": 0.23280218243598938, "learning_rate": 0.000292028821931454, "loss": 4.833, "step": 14735 }, { "epoch": 0.3018450637888313, "grad_norm": 0.2538447380065918, "learning_rate": 0.0002920089010285917, "loss": 4.8369, "step": 14740 }, { "epoch": 0.3019474535662359, "grad_norm": 0.24996918439865112, "learning_rate": 0.00029198895594563404, "loss": 4.8894, "step": 14745 }, { "epoch": 0.30204984334364054, "grad_norm": 0.2496694177389145, "learning_rate": 0.000291968986685977, "loss": 4.8217, "step": 14750 }, { "epoch": 0.3021522331210452, "grad_norm": 0.24708005785942078, "learning_rate": 0.00029194899325302084, "loss": 4.8339, "step": 14755 }, { "epoch": 0.30225462289844984, "grad_norm": 0.2296389639377594, "learning_rate": 0.0002919289756501698, "loss": 4.799, "step": 14760 }, { "epoch": 0.30235701267585446, "grad_norm": 0.24229353666305542, "learning_rate": 0.00029190893388083243, "loss": 4.7975, "step": 14765 }, { "epoch": 0.3024594024532591, "grad_norm": 0.2631322145462036, "learning_rate": 0.00029188886794842127, "loss": 4.7853, "step": 14770 }, { "epoch": 0.3025617922306637, "grad_norm": 0.2699955105781555, "learning_rate": 0.0002918687778563529, "loss": 4.8729, "step": 14775 }, { "epoch": 0.3026641820080683, "grad_norm": 0.2579222321510315, "learning_rate": 0.0002918486636080481, "loss": 4.8245, "step": 14780 }, { "epoch": 0.30276657178547295, "grad_norm": 0.23751991987228394, "learning_rate": 0.0002918285252069318, "loss": 4.8287, "step": 14785 }, { "epoch": 0.30286896156287757, "grad_norm": 0.24596819281578064, "learning_rate": 0.000291808362656433, "loss": 4.85, "step": 14790 }, { "epoch": 0.3029713513402822, "grad_norm": 0.24527274072170258, "learning_rate": 0.0002917881759599848, "loss": 4.8079, "step": 14795 }, { "epoch": 0.3030737411176868, "grad_norm": 0.2361818104982376, "learning_rate": 0.00029176796512102435, "loss": 4.8187, "step": 14800 }, { "epoch": 0.30317613089509143, "grad_norm": 0.2400951236486435, "learning_rate": 0.0002917477301429931, "loss": 4.8427, "step": 14805 }, { "epoch": 0.30327852067249605, "grad_norm": 0.23457743227481842, "learning_rate": 0.0002917274710293364, "loss": 4.8784, "step": 14810 }, { "epoch": 0.3033809104499007, "grad_norm": 0.23099301755428314, "learning_rate": 0.0002917071877835039, "loss": 4.8478, "step": 14815 }, { "epoch": 0.3034833002273053, "grad_norm": 0.29923340678215027, "learning_rate": 0.00029168688040894923, "loss": 4.8072, "step": 14820 }, { "epoch": 0.3035856900047099, "grad_norm": 0.23164720833301544, "learning_rate": 0.00029166654890913006, "loss": 4.82, "step": 14825 }, { "epoch": 0.30368807978211454, "grad_norm": 0.2423633486032486, "learning_rate": 0.0002916461932875084, "loss": 4.7873, "step": 14830 }, { "epoch": 0.30379046955951916, "grad_norm": 0.4875575304031372, "learning_rate": 0.0002916258135475502, "loss": 4.8053, "step": 14835 }, { "epoch": 0.3038928593369238, "grad_norm": 0.2630945146083832, "learning_rate": 0.0002916054096927255, "loss": 4.8467, "step": 14840 }, { "epoch": 0.3039952491143284, "grad_norm": 0.24198538064956665, "learning_rate": 0.0002915849817265085, "loss": 4.81, "step": 14845 }, { "epoch": 0.304097638891733, "grad_norm": 0.2396472841501236, "learning_rate": 0.0002915645296523777, "loss": 4.8306, "step": 14850 }, { "epoch": 0.3042000286691377, "grad_norm": 0.267425000667572, "learning_rate": 0.0002915440534738152, "loss": 4.8202, "step": 14855 }, { "epoch": 0.3043024184465423, "grad_norm": 0.25262871384620667, "learning_rate": 0.0002915235531943077, "loss": 4.8405, "step": 14860 }, { "epoch": 0.30440480822394694, "grad_norm": 0.25218236446380615, "learning_rate": 0.00029150302881734584, "loss": 4.8296, "step": 14865 }, { "epoch": 0.30450719800135156, "grad_norm": 0.2474459409713745, "learning_rate": 0.00029148248034642423, "loss": 4.8403, "step": 14870 }, { "epoch": 0.3046095877787562, "grad_norm": 0.27986475825309753, "learning_rate": 0.00029146190778504176, "loss": 4.7722, "step": 14875 }, { "epoch": 0.3047119775561608, "grad_norm": 0.24763359129428864, "learning_rate": 0.00029144131113670134, "loss": 4.8281, "step": 14880 }, { "epoch": 0.30481436733356543, "grad_norm": 0.23273034393787384, "learning_rate": 0.00029142069040490997, "loss": 4.7693, "step": 14885 }, { "epoch": 0.30491675711097005, "grad_norm": 0.26339665055274963, "learning_rate": 0.00029140004559317884, "loss": 4.8096, "step": 14890 }, { "epoch": 0.30501914688837467, "grad_norm": 0.2356628030538559, "learning_rate": 0.0002913793767050231, "loss": 4.8152, "step": 14895 }, { "epoch": 0.3051215366657793, "grad_norm": 0.23748421669006348, "learning_rate": 0.00029135868374396214, "loss": 4.8254, "step": 14900 }, { "epoch": 0.3052239264431839, "grad_norm": 0.23817649483680725, "learning_rate": 0.00029133796671351937, "loss": 4.851, "step": 14905 }, { "epoch": 0.30532631622058853, "grad_norm": 0.23400908708572388, "learning_rate": 0.0002913172256172223, "loss": 4.8095, "step": 14910 }, { "epoch": 0.30542870599799316, "grad_norm": 0.2905957102775574, "learning_rate": 0.00029129646045860253, "loss": 4.8246, "step": 14915 }, { "epoch": 0.3055310957753978, "grad_norm": 0.2719612419605255, "learning_rate": 0.00029127567124119584, "loss": 4.7834, "step": 14920 }, { "epoch": 0.3056334855528024, "grad_norm": 0.2600967586040497, "learning_rate": 0.00029125485796854204, "loss": 4.8093, "step": 14925 }, { "epoch": 0.305735875330207, "grad_norm": 0.2781115472316742, "learning_rate": 0.00029123402064418495, "loss": 4.8964, "step": 14930 }, { "epoch": 0.30583826510761164, "grad_norm": 0.23672987520694733, "learning_rate": 0.00029121315927167264, "loss": 4.7962, "step": 14935 }, { "epoch": 0.30594065488501626, "grad_norm": 0.2530001401901245, "learning_rate": 0.00029119227385455726, "loss": 4.8588, "step": 14940 }, { "epoch": 0.3060430446624209, "grad_norm": 0.2906632423400879, "learning_rate": 0.0002911713643963949, "loss": 4.8293, "step": 14945 }, { "epoch": 0.3061454344398255, "grad_norm": 0.30496054887771606, "learning_rate": 0.0002911504309007459, "loss": 4.8093, "step": 14950 }, { "epoch": 0.3062478242172301, "grad_norm": 0.24770379066467285, "learning_rate": 0.00029112947337117467, "loss": 4.8286, "step": 14955 }, { "epoch": 0.3063502139946348, "grad_norm": 0.23954397439956665, "learning_rate": 0.00029110849181124965, "loss": 4.8892, "step": 14960 }, { "epoch": 0.3064526037720394, "grad_norm": 0.23769916594028473, "learning_rate": 0.00029108748622454347, "loss": 4.8276, "step": 14965 }, { "epoch": 0.30655499354944404, "grad_norm": 0.24345830082893372, "learning_rate": 0.0002910664566146327, "loss": 4.7806, "step": 14970 }, { "epoch": 0.30665738332684866, "grad_norm": 0.25595176219940186, "learning_rate": 0.00029104540298509807, "loss": 4.8298, "step": 14975 }, { "epoch": 0.3067597731042533, "grad_norm": 0.28201404213905334, "learning_rate": 0.0002910243253395245, "loss": 4.8136, "step": 14980 }, { "epoch": 0.3068621628816579, "grad_norm": 0.24373804032802582, "learning_rate": 0.0002910032236815009, "loss": 4.833, "step": 14985 }, { "epoch": 0.30696455265906253, "grad_norm": 0.2403092235326767, "learning_rate": 0.00029098209801462025, "loss": 4.8195, "step": 14990 }, { "epoch": 0.30706694243646715, "grad_norm": 0.2325127273797989, "learning_rate": 0.0002909609483424796, "loss": 4.8381, "step": 14995 }, { "epoch": 0.30716933221387177, "grad_norm": 0.24421493709087372, "learning_rate": 0.0002909397746686803, "loss": 4.7955, "step": 15000 }, { "epoch": 0.30716933221387177, "eval_loss": 4.847011089324951, "eval_runtime": 17.2506, "eval_samples_per_second": 33.738, "eval_steps_per_second": 4.232, "step": 15000 }, { "epoch": 0.3072717219912764, "grad_norm": 0.2475053369998932, "learning_rate": 0.0002909185769968275, "loss": 4.8524, "step": 15005 }, { "epoch": 0.307374111768681, "grad_norm": 0.2444298416376114, "learning_rate": 0.0002908973553305307, "loss": 4.8274, "step": 15010 }, { "epoch": 0.30747650154608563, "grad_norm": 0.25403663516044617, "learning_rate": 0.0002908761096734031, "loss": 4.848, "step": 15015 }, { "epoch": 0.30757889132349026, "grad_norm": 0.2355048805475235, "learning_rate": 0.0002908548400290625, "loss": 4.8068, "step": 15020 }, { "epoch": 0.3076812811008949, "grad_norm": 0.2571440041065216, "learning_rate": 0.00029083354640113034, "loss": 4.8013, "step": 15025 }, { "epoch": 0.3077836708782995, "grad_norm": 0.2513095736503601, "learning_rate": 0.0002908122287932324, "loss": 4.808, "step": 15030 }, { "epoch": 0.3078860606557041, "grad_norm": 0.2425457090139389, "learning_rate": 0.00029079088720899846, "loss": 4.8485, "step": 15035 }, { "epoch": 0.30798845043310874, "grad_norm": 0.2379244714975357, "learning_rate": 0.00029076952165206233, "loss": 4.8135, "step": 15040 }, { "epoch": 0.30809084021051336, "grad_norm": 0.2496187686920166, "learning_rate": 0.00029074813212606206, "loss": 4.7596, "step": 15045 }, { "epoch": 0.308193229987918, "grad_norm": 0.293245792388916, "learning_rate": 0.0002907267186346396, "loss": 4.8237, "step": 15050 }, { "epoch": 0.3082956197653226, "grad_norm": 0.26889434456825256, "learning_rate": 0.0002907052811814411, "loss": 4.7961, "step": 15055 }, { "epoch": 0.3083980095427273, "grad_norm": 0.24317938089370728, "learning_rate": 0.0002906838197701167, "loss": 4.7785, "step": 15060 }, { "epoch": 0.3085003993201319, "grad_norm": 0.2693104147911072, "learning_rate": 0.00029066233440432077, "loss": 4.7457, "step": 15065 }, { "epoch": 0.3086027890975365, "grad_norm": 0.24795982241630554, "learning_rate": 0.0002906408250877116, "loss": 4.8225, "step": 15070 }, { "epoch": 0.30870517887494114, "grad_norm": 0.2525266408920288, "learning_rate": 0.00029061929182395155, "loss": 4.84, "step": 15075 }, { "epoch": 0.30880756865234577, "grad_norm": 0.2872849702835083, "learning_rate": 0.0002905977346167072, "loss": 4.8114, "step": 15080 }, { "epoch": 0.3089099584297504, "grad_norm": 0.256111204624176, "learning_rate": 0.00029057615346964917, "loss": 4.7953, "step": 15085 }, { "epoch": 0.309012348207155, "grad_norm": 0.25090688467025757, "learning_rate": 0.0002905545483864521, "loss": 4.8147, "step": 15090 }, { "epoch": 0.30911473798455963, "grad_norm": 0.24256761372089386, "learning_rate": 0.00029053291937079464, "loss": 4.8341, "step": 15095 }, { "epoch": 0.30921712776196425, "grad_norm": 0.29538941383361816, "learning_rate": 0.0002905112664263597, "loss": 4.8359, "step": 15100 }, { "epoch": 0.30931951753936887, "grad_norm": 0.3012793958187103, "learning_rate": 0.000290489589556834, "loss": 4.7685, "step": 15105 }, { "epoch": 0.3094219073167735, "grad_norm": 0.24757792055606842, "learning_rate": 0.0002904678887659088, "loss": 4.8886, "step": 15110 }, { "epoch": 0.3095242970941781, "grad_norm": 0.2345486283302307, "learning_rate": 0.0002904461640572788, "loss": 4.8281, "step": 15115 }, { "epoch": 0.30962668687158273, "grad_norm": 0.252042293548584, "learning_rate": 0.00029042441543464335, "loss": 4.8532, "step": 15120 }, { "epoch": 0.30972907664898736, "grad_norm": 0.23863114416599274, "learning_rate": 0.00029040264290170544, "loss": 4.8753, "step": 15125 }, { "epoch": 0.309831466426392, "grad_norm": 0.25628069043159485, "learning_rate": 0.00029038084646217246, "loss": 4.8412, "step": 15130 }, { "epoch": 0.3099338562037966, "grad_norm": 0.24536699056625366, "learning_rate": 0.0002903590261197557, "loss": 4.8406, "step": 15135 }, { "epoch": 0.3100362459812012, "grad_norm": 0.23125344514846802, "learning_rate": 0.0002903371818781704, "loss": 4.7983, "step": 15140 }, { "epoch": 0.31013863575860584, "grad_norm": 0.3031942546367645, "learning_rate": 0.00029031531374113624, "loss": 4.7796, "step": 15145 }, { "epoch": 0.31024102553601046, "grad_norm": 0.23981033265590668, "learning_rate": 0.00029029342171237656, "loss": 4.8364, "step": 15150 }, { "epoch": 0.3103434153134151, "grad_norm": 0.24899478256702423, "learning_rate": 0.000290271505795619, "loss": 4.8795, "step": 15155 }, { "epoch": 0.31044580509081976, "grad_norm": 0.23420363664627075, "learning_rate": 0.00029024956599459534, "loss": 4.8527, "step": 15160 }, { "epoch": 0.3105481948682244, "grad_norm": 0.22677026689052582, "learning_rate": 0.00029022760231304113, "loss": 4.8321, "step": 15165 }, { "epoch": 0.310650584645629, "grad_norm": 0.25254639983177185, "learning_rate": 0.0002902056147546963, "loss": 4.7307, "step": 15170 }, { "epoch": 0.3107529744230336, "grad_norm": 0.25167813897132874, "learning_rate": 0.0002901836033233047, "loss": 4.8039, "step": 15175 }, { "epoch": 0.31085536420043824, "grad_norm": 0.2575048506259918, "learning_rate": 0.0002901615680226141, "loss": 4.7933, "step": 15180 }, { "epoch": 0.31095775397784287, "grad_norm": 0.2427930384874344, "learning_rate": 0.0002901395088563767, "loss": 4.826, "step": 15185 }, { "epoch": 0.3110601437552475, "grad_norm": 0.2652527391910553, "learning_rate": 0.0002901174258283484, "loss": 4.78, "step": 15190 }, { "epoch": 0.3111625335326521, "grad_norm": 0.23744867742061615, "learning_rate": 0.00029009531894228933, "loss": 4.7958, "step": 15195 }, { "epoch": 0.31126492331005673, "grad_norm": 0.23774176836013794, "learning_rate": 0.0002900731882019638, "loss": 4.8793, "step": 15200 }, { "epoch": 0.31136731308746135, "grad_norm": 0.23590290546417236, "learning_rate": 0.0002900510336111398, "loss": 4.7966, "step": 15205 }, { "epoch": 0.31146970286486597, "grad_norm": 0.24009494483470917, "learning_rate": 0.0002900288551735899, "loss": 4.8661, "step": 15210 }, { "epoch": 0.3115720926422706, "grad_norm": 0.23347415030002594, "learning_rate": 0.00029000665289309033, "loss": 4.8165, "step": 15215 }, { "epoch": 0.3116744824196752, "grad_norm": 0.2547255754470825, "learning_rate": 0.0002899844267734214, "loss": 4.7903, "step": 15220 }, { "epoch": 0.31177687219707984, "grad_norm": 0.2570987343788147, "learning_rate": 0.00028996217681836784, "loss": 4.8647, "step": 15225 }, { "epoch": 0.31187926197448446, "grad_norm": 0.2709431052207947, "learning_rate": 0.000289939903031718, "loss": 4.844, "step": 15230 }, { "epoch": 0.3119816517518891, "grad_norm": 0.2512519955635071, "learning_rate": 0.00028991760541726455, "loss": 4.8062, "step": 15235 }, { "epoch": 0.3120840415292937, "grad_norm": 0.26664140820503235, "learning_rate": 0.00028989528397880415, "loss": 4.8416, "step": 15240 }, { "epoch": 0.3121864313066983, "grad_norm": 0.25389641523361206, "learning_rate": 0.00028987293872013737, "loss": 4.83, "step": 15245 }, { "epoch": 0.31228882108410294, "grad_norm": 0.2916410565376282, "learning_rate": 0.0002898505696450692, "loss": 4.8773, "step": 15250 }, { "epoch": 0.31239121086150756, "grad_norm": 0.24153843522071838, "learning_rate": 0.0002898281767574083, "loss": 4.8245, "step": 15255 }, { "epoch": 0.3124936006389122, "grad_norm": 0.23637597262859344, "learning_rate": 0.0002898057600609676, "loss": 4.828, "step": 15260 }, { "epoch": 0.31259599041631686, "grad_norm": 0.2532103359699249, "learning_rate": 0.000289783319559564, "loss": 4.8363, "step": 15265 }, { "epoch": 0.3126983801937215, "grad_norm": 0.3597238063812256, "learning_rate": 0.0002897608552570185, "loss": 4.8332, "step": 15270 }, { "epoch": 0.3128007699711261, "grad_norm": 0.2457640916109085, "learning_rate": 0.0002897383671571562, "loss": 4.8603, "step": 15275 }, { "epoch": 0.3129031597485307, "grad_norm": 0.23612958192825317, "learning_rate": 0.000289715855263806, "loss": 4.8515, "step": 15280 }, { "epoch": 0.31300554952593534, "grad_norm": 0.23397192358970642, "learning_rate": 0.00028969331958080127, "loss": 4.787, "step": 15285 }, { "epoch": 0.31310793930333997, "grad_norm": 0.29118800163269043, "learning_rate": 0.00028967076011197907, "loss": 4.8675, "step": 15290 }, { "epoch": 0.3132103290807446, "grad_norm": 0.23778823018074036, "learning_rate": 0.0002896481768611807, "loss": 4.8126, "step": 15295 }, { "epoch": 0.3133127188581492, "grad_norm": 0.2586337625980377, "learning_rate": 0.0002896255698322513, "loss": 4.8179, "step": 15300 }, { "epoch": 0.31341510863555383, "grad_norm": 0.3234255909919739, "learning_rate": 0.00028960293902904037, "loss": 4.8404, "step": 15305 }, { "epoch": 0.31351749841295845, "grad_norm": 0.24055859446525574, "learning_rate": 0.00028958028445540124, "loss": 4.7472, "step": 15310 }, { "epoch": 0.31361988819036307, "grad_norm": 0.23130378127098083, "learning_rate": 0.00028955760611519137, "loss": 4.84, "step": 15315 }, { "epoch": 0.3137222779677677, "grad_norm": 0.22796563804149628, "learning_rate": 0.0002895349040122722, "loss": 4.7846, "step": 15320 }, { "epoch": 0.3138246677451723, "grad_norm": 0.24777504801750183, "learning_rate": 0.0002895121781505092, "loss": 4.8461, "step": 15325 }, { "epoch": 0.31392705752257694, "grad_norm": 0.26132479310035706, "learning_rate": 0.0002894894285337721, "loss": 4.7783, "step": 15330 }, { "epoch": 0.31402944729998156, "grad_norm": 0.268828809261322, "learning_rate": 0.00028946665516593436, "loss": 4.822, "step": 15335 }, { "epoch": 0.3141318370773862, "grad_norm": 0.2682000398635864, "learning_rate": 0.0002894438580508737, "loss": 4.7456, "step": 15340 }, { "epoch": 0.3142342268547908, "grad_norm": 0.24134917557239532, "learning_rate": 0.00028942103719247186, "loss": 4.8673, "step": 15345 }, { "epoch": 0.3143366166321954, "grad_norm": 0.2589450776576996, "learning_rate": 0.00028939819259461453, "loss": 4.8995, "step": 15350 }, { "epoch": 0.31443900640960004, "grad_norm": 0.22643651068210602, "learning_rate": 0.00028937532426119155, "loss": 4.8269, "step": 15355 }, { "epoch": 0.31454139618700466, "grad_norm": 0.24379104375839233, "learning_rate": 0.0002893524321960967, "loss": 4.8361, "step": 15360 }, { "epoch": 0.31464378596440934, "grad_norm": 0.25083568692207336, "learning_rate": 0.0002893295164032279, "loss": 4.7882, "step": 15365 }, { "epoch": 0.31474617574181396, "grad_norm": 0.2672452926635742, "learning_rate": 0.00028930657688648696, "loss": 4.791, "step": 15370 }, { "epoch": 0.3148485655192186, "grad_norm": 1.3716267347335815, "learning_rate": 0.00028928361364977996, "loss": 4.8938, "step": 15375 }, { "epoch": 0.3149509552966232, "grad_norm": 0.23629119992256165, "learning_rate": 0.0002892606266970168, "loss": 4.8081, "step": 15380 }, { "epoch": 0.3150533450740278, "grad_norm": 0.2300252914428711, "learning_rate": 0.0002892376160321116, "loss": 4.8424, "step": 15385 }, { "epoch": 0.31515573485143245, "grad_norm": 0.2472316175699234, "learning_rate": 0.0002892145816589823, "loss": 4.8456, "step": 15390 }, { "epoch": 0.31525812462883707, "grad_norm": 0.2334398329257965, "learning_rate": 0.0002891915235815511, "loss": 4.8625, "step": 15395 }, { "epoch": 0.3153605144062417, "grad_norm": 0.27647051215171814, "learning_rate": 0.00028916844180374405, "loss": 4.8315, "step": 15400 }, { "epoch": 0.3154629041836463, "grad_norm": 0.24235866963863373, "learning_rate": 0.0002891453363294915, "loss": 4.8063, "step": 15405 }, { "epoch": 0.31556529396105093, "grad_norm": 0.2546367049217224, "learning_rate": 0.0002891222071627274, "loss": 4.7442, "step": 15410 }, { "epoch": 0.31566768373845555, "grad_norm": 0.25487855076789856, "learning_rate": 0.00028909905430739015, "loss": 4.7989, "step": 15415 }, { "epoch": 0.3157700735158602, "grad_norm": 0.26692599058151245, "learning_rate": 0.0002890758777674221, "loss": 4.8926, "step": 15420 }, { "epoch": 0.3158724632932648, "grad_norm": 0.2569117844104767, "learning_rate": 0.0002890526775467694, "loss": 4.8236, "step": 15425 }, { "epoch": 0.3159748530706694, "grad_norm": 0.24728327989578247, "learning_rate": 0.0002890294536493824, "loss": 4.7589, "step": 15430 }, { "epoch": 0.31607724284807404, "grad_norm": 0.27122560143470764, "learning_rate": 0.0002890062060792155, "loss": 4.8202, "step": 15435 }, { "epoch": 0.31617963262547866, "grad_norm": 0.26875802874565125, "learning_rate": 0.0002889829348402272, "loss": 4.7956, "step": 15440 }, { "epoch": 0.3162820224028833, "grad_norm": 0.26880842447280884, "learning_rate": 0.00028895963993637986, "loss": 4.8323, "step": 15445 }, { "epoch": 0.3163844121802879, "grad_norm": 0.2605154514312744, "learning_rate": 0.0002889363213716399, "loss": 4.8423, "step": 15450 }, { "epoch": 0.3164868019576925, "grad_norm": 0.25857120752334595, "learning_rate": 0.00028891297914997795, "loss": 4.7802, "step": 15455 }, { "epoch": 0.31658919173509714, "grad_norm": 0.26708948612213135, "learning_rate": 0.00028888961327536833, "loss": 4.7449, "step": 15460 }, { "epoch": 0.3166915815125018, "grad_norm": 0.23551607131958008, "learning_rate": 0.00028886622375178976, "loss": 4.8423, "step": 15465 }, { "epoch": 0.31679397128990644, "grad_norm": 0.24214282631874084, "learning_rate": 0.0002888428105832247, "loss": 4.7936, "step": 15470 }, { "epoch": 0.31689636106731106, "grad_norm": 0.23676030337810516, "learning_rate": 0.0002888193737736598, "loss": 4.8165, "step": 15475 }, { "epoch": 0.3169987508447157, "grad_norm": 0.6792005896568298, "learning_rate": 0.0002887959133270857, "loss": 4.8453, "step": 15480 }, { "epoch": 0.3171011406221203, "grad_norm": 0.2418385148048401, "learning_rate": 0.000288772429247497, "loss": 4.8382, "step": 15485 }, { "epoch": 0.3172035303995249, "grad_norm": 0.2369857281446457, "learning_rate": 0.0002887489215388924, "loss": 4.7731, "step": 15490 }, { "epoch": 0.31730592017692955, "grad_norm": 0.23815925419330597, "learning_rate": 0.00028872539020527465, "loss": 4.8536, "step": 15495 }, { "epoch": 0.31740830995433417, "grad_norm": 0.24218904972076416, "learning_rate": 0.00028870183525065037, "loss": 4.8051, "step": 15500 }, { "epoch": 0.3175106997317388, "grad_norm": 0.22303232550621033, "learning_rate": 0.00028867825667903036, "loss": 4.8295, "step": 15505 }, { "epoch": 0.3176130895091434, "grad_norm": 0.2368650883436203, "learning_rate": 0.00028865465449442936, "loss": 4.8234, "step": 15510 }, { "epoch": 0.31771547928654803, "grad_norm": 0.23938125371932983, "learning_rate": 0.0002886310287008662, "loss": 4.8194, "step": 15515 }, { "epoch": 0.31781786906395265, "grad_norm": 0.33564168214797974, "learning_rate": 0.0002886073793023637, "loss": 4.8487, "step": 15520 }, { "epoch": 0.3179202588413573, "grad_norm": 0.2706722319126129, "learning_rate": 0.00028858370630294855, "loss": 4.8275, "step": 15525 }, { "epoch": 0.3180226486187619, "grad_norm": 0.24289891123771667, "learning_rate": 0.0002885600097066517, "loss": 4.8184, "step": 15530 }, { "epoch": 0.3181250383961665, "grad_norm": 0.2418476790189743, "learning_rate": 0.000288536289517508, "loss": 4.7873, "step": 15535 }, { "epoch": 0.31822742817357114, "grad_norm": 0.2375362068414688, "learning_rate": 0.0002885125457395563, "loss": 4.8121, "step": 15540 }, { "epoch": 0.31832981795097576, "grad_norm": 0.24532565474510193, "learning_rate": 0.00028848877837683954, "loss": 4.8509, "step": 15545 }, { "epoch": 0.3184322077283804, "grad_norm": 0.2715478241443634, "learning_rate": 0.0002884649874334045, "loss": 4.847, "step": 15550 }, { "epoch": 0.318534597505785, "grad_norm": 0.2489650547504425, "learning_rate": 0.00028844117291330234, "loss": 4.7647, "step": 15555 }, { "epoch": 0.3186369872831896, "grad_norm": 0.24650003015995026, "learning_rate": 0.00028841733482058777, "loss": 4.8137, "step": 15560 }, { "epoch": 0.3187393770605943, "grad_norm": 0.2365977168083191, "learning_rate": 0.00028839347315931985, "loss": 4.8598, "step": 15565 }, { "epoch": 0.3188417668379989, "grad_norm": 0.25365331768989563, "learning_rate": 0.0002883695879335615, "loss": 4.8118, "step": 15570 }, { "epoch": 0.31894415661540354, "grad_norm": 0.24908719956874847, "learning_rate": 0.00028834567914737974, "loss": 4.8367, "step": 15575 }, { "epoch": 0.31904654639280816, "grad_norm": 0.24689409136772156, "learning_rate": 0.0002883217468048456, "loss": 4.8306, "step": 15580 }, { "epoch": 0.3191489361702128, "grad_norm": 0.23742550611495972, "learning_rate": 0.0002882977909100339, "loss": 4.81, "step": 15585 }, { "epoch": 0.3192513259476174, "grad_norm": 0.23478001356124878, "learning_rate": 0.00028827381146702386, "loss": 4.8192, "step": 15590 }, { "epoch": 0.319353715725022, "grad_norm": 0.2573738992214203, "learning_rate": 0.00028824980847989845, "loss": 4.839, "step": 15595 }, { "epoch": 0.31945610550242665, "grad_norm": 0.3302048444747925, "learning_rate": 0.0002882257819527446, "loss": 4.7653, "step": 15600 }, { "epoch": 0.31955849527983127, "grad_norm": 0.24334406852722168, "learning_rate": 0.0002882017318896534, "loss": 4.8084, "step": 15605 }, { "epoch": 0.3196608850572359, "grad_norm": 0.2528078258037567, "learning_rate": 0.0002881776582947199, "loss": 4.7564, "step": 15610 }, { "epoch": 0.3197632748346405, "grad_norm": 0.23048368096351624, "learning_rate": 0.0002881535611720432, "loss": 4.8018, "step": 15615 }, { "epoch": 0.31986566461204513, "grad_norm": 0.2321743220090866, "learning_rate": 0.0002881294405257263, "loss": 4.8037, "step": 15620 }, { "epoch": 0.31996805438944975, "grad_norm": 0.2332994043827057, "learning_rate": 0.0002881052963598763, "loss": 4.7536, "step": 15625 }, { "epoch": 0.3200704441668544, "grad_norm": 0.2396085262298584, "learning_rate": 0.0002880811286786042, "loss": 4.8436, "step": 15630 }, { "epoch": 0.320172833944259, "grad_norm": 0.4317147433757782, "learning_rate": 0.00028805693748602514, "loss": 4.8264, "step": 15635 }, { "epoch": 0.3202752237216636, "grad_norm": 0.23182272911071777, "learning_rate": 0.0002880327227862582, "loss": 4.8014, "step": 15640 }, { "epoch": 0.32037761349906824, "grad_norm": 0.24353723227977753, "learning_rate": 0.0002880084845834264, "loss": 4.8086, "step": 15645 }, { "epoch": 0.32048000327647286, "grad_norm": 0.2554088234901428, "learning_rate": 0.00028798422288165684, "loss": 4.845, "step": 15650 }, { "epoch": 0.3205823930538775, "grad_norm": 0.23942938446998596, "learning_rate": 0.0002879599376850807, "loss": 4.7911, "step": 15655 }, { "epoch": 0.3206847828312821, "grad_norm": 0.2640797197818756, "learning_rate": 0.00028793562899783284, "loss": 4.858, "step": 15660 }, { "epoch": 0.3207871726086867, "grad_norm": 0.2859177589416504, "learning_rate": 0.00028791129682405255, "loss": 4.8502, "step": 15665 }, { "epoch": 0.3208895623860914, "grad_norm": 0.24477677047252655, "learning_rate": 0.00028788694116788283, "loss": 4.8044, "step": 15670 }, { "epoch": 0.320991952163496, "grad_norm": 0.22414985299110413, "learning_rate": 0.00028786256203347076, "loss": 4.7959, "step": 15675 }, { "epoch": 0.32109434194090064, "grad_norm": 0.2535495460033417, "learning_rate": 0.00028783815942496737, "loss": 4.8198, "step": 15680 }, { "epoch": 0.32119673171830526, "grad_norm": 0.27068763971328735, "learning_rate": 0.00028781373334652786, "loss": 4.8548, "step": 15685 }, { "epoch": 0.3212991214957099, "grad_norm": 0.2561670243740082, "learning_rate": 0.0002877892838023112, "loss": 4.8365, "step": 15690 }, { "epoch": 0.3214015112731145, "grad_norm": 0.2500530481338501, "learning_rate": 0.0002877648107964805, "loss": 4.8265, "step": 15695 }, { "epoch": 0.3215039010505191, "grad_norm": 0.23399963974952698, "learning_rate": 0.00028774031433320273, "loss": 4.81, "step": 15700 }, { "epoch": 0.32160629082792375, "grad_norm": 0.22941014170646667, "learning_rate": 0.0002877157944166491, "loss": 4.8312, "step": 15705 }, { "epoch": 0.32170868060532837, "grad_norm": 0.2397286742925644, "learning_rate": 0.00028769125105099447, "loss": 4.7979, "step": 15710 }, { "epoch": 0.321811070382733, "grad_norm": 0.25594308972358704, "learning_rate": 0.000287666684240418, "loss": 4.8217, "step": 15715 }, { "epoch": 0.3219134601601376, "grad_norm": 0.24058277904987335, "learning_rate": 0.0002876420939891028, "loss": 4.8679, "step": 15720 }, { "epoch": 0.32201584993754223, "grad_norm": 0.24760036170482635, "learning_rate": 0.0002876174803012357, "loss": 4.7753, "step": 15725 }, { "epoch": 0.32211823971494685, "grad_norm": 0.249937504529953, "learning_rate": 0.00028759284318100785, "loss": 4.8472, "step": 15730 }, { "epoch": 0.3222206294923515, "grad_norm": 0.241938978433609, "learning_rate": 0.0002875681826326142, "loss": 4.7948, "step": 15735 }, { "epoch": 0.3223230192697561, "grad_norm": 0.2601494789123535, "learning_rate": 0.00028754349866025374, "loss": 4.826, "step": 15740 }, { "epoch": 0.3224254090471607, "grad_norm": 0.2735443115234375, "learning_rate": 0.0002875187912681295, "loss": 4.8471, "step": 15745 }, { "epoch": 0.32252779882456534, "grad_norm": 0.24875079095363617, "learning_rate": 0.00028749406046044845, "loss": 4.8263, "step": 15750 }, { "epoch": 0.32263018860196996, "grad_norm": 0.24867716431617737, "learning_rate": 0.00028746930624142143, "loss": 4.8503, "step": 15755 }, { "epoch": 0.3227325783793746, "grad_norm": 0.27599936723709106, "learning_rate": 0.0002874445286152635, "loss": 4.8091, "step": 15760 }, { "epoch": 0.3228349681567792, "grad_norm": 0.2574349641799927, "learning_rate": 0.00028741972758619357, "loss": 4.8073, "step": 15765 }, { "epoch": 0.3229373579341839, "grad_norm": 0.24333839118480682, "learning_rate": 0.00028739490315843455, "loss": 4.8281, "step": 15770 }, { "epoch": 0.3230397477115885, "grad_norm": 0.2696617543697357, "learning_rate": 0.0002873700553362133, "loss": 4.8326, "step": 15775 }, { "epoch": 0.3231421374889931, "grad_norm": 0.2507472336292267, "learning_rate": 0.0002873451841237607, "loss": 4.8961, "step": 15780 }, { "epoch": 0.32324452726639774, "grad_norm": 0.269682914018631, "learning_rate": 0.00028732028952531175, "loss": 4.8254, "step": 15785 }, { "epoch": 0.32334691704380236, "grad_norm": 0.2574978470802307, "learning_rate": 0.0002872953715451051, "loss": 4.7598, "step": 15790 }, { "epoch": 0.323449306821207, "grad_norm": 0.22101959586143494, "learning_rate": 0.00028727043018738366, "loss": 4.8443, "step": 15795 }, { "epoch": 0.3235516965986116, "grad_norm": 0.23810577392578125, "learning_rate": 0.0002872454654563943, "loss": 4.8215, "step": 15800 }, { "epoch": 0.3236540863760162, "grad_norm": 0.2515735924243927, "learning_rate": 0.0002872204773563877, "loss": 4.8268, "step": 15805 }, { "epoch": 0.32375647615342085, "grad_norm": 0.25298553705215454, "learning_rate": 0.00028719546589161865, "loss": 4.8677, "step": 15810 }, { "epoch": 0.32385886593082547, "grad_norm": 0.24797645211219788, "learning_rate": 0.0002871704310663459, "loss": 4.807, "step": 15815 }, { "epoch": 0.3239612557082301, "grad_norm": 0.2690029740333557, "learning_rate": 0.00028714537288483227, "loss": 4.7658, "step": 15820 }, { "epoch": 0.3240636454856347, "grad_norm": 0.26422470808029175, "learning_rate": 0.00028712029135134434, "loss": 4.8034, "step": 15825 }, { "epoch": 0.32416603526303933, "grad_norm": 0.27222222089767456, "learning_rate": 0.0002870951864701528, "loss": 4.8401, "step": 15830 }, { "epoch": 0.32426842504044395, "grad_norm": 0.23684225976467133, "learning_rate": 0.00028707005824553237, "loss": 4.8393, "step": 15835 }, { "epoch": 0.3243708148178486, "grad_norm": 0.2376585751771927, "learning_rate": 0.00028704490668176154, "loss": 4.8081, "step": 15840 }, { "epoch": 0.3244732045952532, "grad_norm": 0.25881075859069824, "learning_rate": 0.0002870197317831231, "loss": 4.808, "step": 15845 }, { "epoch": 0.3245755943726578, "grad_norm": 0.24747461080551147, "learning_rate": 0.0002869945335539035, "loss": 4.7748, "step": 15850 }, { "epoch": 0.32467798415006244, "grad_norm": 0.23320181667804718, "learning_rate": 0.00028696931199839325, "loss": 4.7779, "step": 15855 }, { "epoch": 0.32478037392746706, "grad_norm": 0.23846809566020966, "learning_rate": 0.00028694406712088694, "loss": 4.8357, "step": 15860 }, { "epoch": 0.3248827637048717, "grad_norm": 0.241712749004364, "learning_rate": 0.0002869187989256831, "loss": 4.8271, "step": 15865 }, { "epoch": 0.32498515348227636, "grad_norm": 0.2403462678194046, "learning_rate": 0.0002868935074170841, "loss": 4.8015, "step": 15870 }, { "epoch": 0.325087543259681, "grad_norm": 0.2553309202194214, "learning_rate": 0.00028686819259939636, "loss": 4.7891, "step": 15875 }, { "epoch": 0.3251899330370856, "grad_norm": 0.24671103060245514, "learning_rate": 0.00028684285447693034, "loss": 4.7901, "step": 15880 }, { "epoch": 0.3252923228144902, "grad_norm": 0.24250692129135132, "learning_rate": 0.0002868174930540004, "loss": 4.7807, "step": 15885 }, { "epoch": 0.32539471259189484, "grad_norm": 0.2553572654724121, "learning_rate": 0.00028679210833492476, "loss": 4.8591, "step": 15890 }, { "epoch": 0.32549710236929946, "grad_norm": 0.24860507249832153, "learning_rate": 0.00028676670032402586, "loss": 4.8435, "step": 15895 }, { "epoch": 0.3255994921467041, "grad_norm": 0.2537551820278168, "learning_rate": 0.0002867412690256299, "loss": 4.8049, "step": 15900 }, { "epoch": 0.3257018819241087, "grad_norm": 0.3558686375617981, "learning_rate": 0.0002867158144440672, "loss": 4.81, "step": 15905 }, { "epoch": 0.3258042717015133, "grad_norm": 0.24676696956157684, "learning_rate": 0.00028669033658367183, "loss": 4.8159, "step": 15910 }, { "epoch": 0.32590666147891795, "grad_norm": 0.23601582646369934, "learning_rate": 0.00028666483544878194, "loss": 4.8348, "step": 15915 }, { "epoch": 0.32600905125632257, "grad_norm": 0.22834748029708862, "learning_rate": 0.0002866393110437398, "loss": 4.7828, "step": 15920 }, { "epoch": 0.3261114410337272, "grad_norm": 0.23815374076366425, "learning_rate": 0.0002866137633728914, "loss": 4.833, "step": 15925 }, { "epoch": 0.3262138308111318, "grad_norm": 0.24401137232780457, "learning_rate": 0.0002865881924405867, "loss": 4.7999, "step": 15930 }, { "epoch": 0.32631622058853643, "grad_norm": 0.23739293217658997, "learning_rate": 0.00028656259825117985, "loss": 4.8044, "step": 15935 }, { "epoch": 0.32641861036594105, "grad_norm": 0.24494828283786774, "learning_rate": 0.00028653698080902877, "loss": 4.7714, "step": 15940 }, { "epoch": 0.3265210001433457, "grad_norm": 0.269447386264801, "learning_rate": 0.0002865113401184953, "loss": 4.8062, "step": 15945 }, { "epoch": 0.3266233899207503, "grad_norm": 0.2683109641075134, "learning_rate": 0.0002864856761839455, "loss": 4.7858, "step": 15950 }, { "epoch": 0.3267257796981549, "grad_norm": 0.24812977015972137, "learning_rate": 0.000286459989009749, "loss": 4.7581, "step": 15955 }, { "epoch": 0.32682816947555954, "grad_norm": 0.24161896109580994, "learning_rate": 0.0002864342786002798, "loss": 4.852, "step": 15960 }, { "epoch": 0.32693055925296416, "grad_norm": 0.22758758068084717, "learning_rate": 0.0002864085449599155, "loss": 4.8442, "step": 15965 }, { "epoch": 0.3270329490303688, "grad_norm": 0.2524091601371765, "learning_rate": 0.00028638278809303794, "loss": 4.8593, "step": 15970 }, { "epoch": 0.32713533880777346, "grad_norm": 0.2220301330089569, "learning_rate": 0.00028635700800403265, "loss": 4.8399, "step": 15975 }, { "epoch": 0.3272377285851781, "grad_norm": 0.27116480469703674, "learning_rate": 0.00028633120469728936, "loss": 4.8445, "step": 15980 }, { "epoch": 0.3273401183625827, "grad_norm": 0.25042563676834106, "learning_rate": 0.0002863053781772016, "loss": 4.8223, "step": 15985 }, { "epoch": 0.3274425081399873, "grad_norm": 0.22962479293346405, "learning_rate": 0.00028627952844816686, "loss": 4.8336, "step": 15990 }, { "epoch": 0.32754489791739194, "grad_norm": 0.2547265291213989, "learning_rate": 0.0002862536555145867, "loss": 4.8092, "step": 15995 }, { "epoch": 0.32764728769479656, "grad_norm": 0.24327710270881653, "learning_rate": 0.00028622775938086647, "loss": 4.8828, "step": 16000 }, { "epoch": 0.32764728769479656, "eval_loss": 4.832391738891602, "eval_runtime": 17.2365, "eval_samples_per_second": 33.765, "eval_steps_per_second": 4.235, "step": 16000 }, { "epoch": 0.3277496774722012, "grad_norm": 0.23021191358566284, "learning_rate": 0.00028620184005141556, "loss": 4.7917, "step": 16005 }, { "epoch": 0.3278520672496058, "grad_norm": 0.2317049205303192, "learning_rate": 0.00028617589753064735, "loss": 4.8406, "step": 16010 }, { "epoch": 0.3279544570270104, "grad_norm": 0.24882535636425018, "learning_rate": 0.00028614993182297905, "loss": 4.7988, "step": 16015 }, { "epoch": 0.32805684680441505, "grad_norm": 0.40424302220344543, "learning_rate": 0.00028612394293283197, "loss": 4.836, "step": 16020 }, { "epoch": 0.32815923658181967, "grad_norm": 0.2324492335319519, "learning_rate": 0.0002860979308646312, "loss": 4.8052, "step": 16025 }, { "epoch": 0.3282616263592243, "grad_norm": 0.2379092574119568, "learning_rate": 0.0002860718956228059, "loss": 4.7905, "step": 16030 }, { "epoch": 0.3283640161366289, "grad_norm": 0.23237118124961853, "learning_rate": 0.00028604583721178906, "loss": 4.8486, "step": 16035 }, { "epoch": 0.32846640591403353, "grad_norm": 0.24024012684822083, "learning_rate": 0.0002860197556360178, "loss": 4.7788, "step": 16040 }, { "epoch": 0.32856879569143815, "grad_norm": 0.26056140661239624, "learning_rate": 0.0002859936508999331, "loss": 4.7711, "step": 16045 }, { "epoch": 0.3286711854688428, "grad_norm": 0.2680843472480774, "learning_rate": 0.0002859675230079797, "loss": 4.8395, "step": 16050 }, { "epoch": 0.3287735752462474, "grad_norm": 0.23419305682182312, "learning_rate": 0.00028594137196460653, "loss": 4.8007, "step": 16055 }, { "epoch": 0.328875965023652, "grad_norm": 0.2893793284893036, "learning_rate": 0.0002859151977742664, "loss": 4.8185, "step": 16060 }, { "epoch": 0.32897835480105664, "grad_norm": 0.29999348521232605, "learning_rate": 0.00028588900044141595, "loss": 4.8263, "step": 16065 }, { "epoch": 0.32908074457846126, "grad_norm": 0.2500661611557007, "learning_rate": 0.0002858627799705159, "loss": 4.812, "step": 16070 }, { "epoch": 0.32918313435586594, "grad_norm": 0.2984530031681061, "learning_rate": 0.0002858365363660309, "loss": 4.7767, "step": 16075 }, { "epoch": 0.32928552413327056, "grad_norm": 0.22807948291301727, "learning_rate": 0.0002858102696324294, "loss": 4.7975, "step": 16080 }, { "epoch": 0.3293879139106752, "grad_norm": 0.2420845627784729, "learning_rate": 0.00028578397977418394, "loss": 4.8134, "step": 16085 }, { "epoch": 0.3294903036880798, "grad_norm": 0.24594122171401978, "learning_rate": 0.0002857576667957709, "loss": 4.789, "step": 16090 }, { "epoch": 0.3295926934654844, "grad_norm": 0.22552920877933502, "learning_rate": 0.00028573133070167075, "loss": 4.8137, "step": 16095 }, { "epoch": 0.32969508324288904, "grad_norm": 0.25765693187713623, "learning_rate": 0.0002857049714963676, "loss": 4.827, "step": 16100 }, { "epoch": 0.32979747302029366, "grad_norm": 0.25722846388816833, "learning_rate": 0.0002856785891843498, "loss": 4.8175, "step": 16105 }, { "epoch": 0.3298998627976983, "grad_norm": 0.2697278559207916, "learning_rate": 0.00028565218377010944, "loss": 4.8756, "step": 16110 }, { "epoch": 0.3300022525751029, "grad_norm": 0.24712322652339935, "learning_rate": 0.00028562575525814266, "loss": 4.8043, "step": 16115 }, { "epoch": 0.3301046423525075, "grad_norm": 0.23435640335083008, "learning_rate": 0.00028559930365294945, "loss": 4.8202, "step": 16120 }, { "epoch": 0.33020703212991215, "grad_norm": 0.2309722602367401, "learning_rate": 0.00028557282895903384, "loss": 4.8489, "step": 16125 }, { "epoch": 0.33030942190731677, "grad_norm": 0.24916371703147888, "learning_rate": 0.0002855463311809037, "loss": 4.7592, "step": 16130 }, { "epoch": 0.3304118116847214, "grad_norm": 0.2296198457479477, "learning_rate": 0.0002855198103230708, "loss": 4.7814, "step": 16135 }, { "epoch": 0.330514201462126, "grad_norm": 0.24471209943294525, "learning_rate": 0.00028549326639005086, "loss": 4.8081, "step": 16140 }, { "epoch": 0.33061659123953063, "grad_norm": 0.25593388080596924, "learning_rate": 0.0002854666993863637, "loss": 4.8149, "step": 16145 }, { "epoch": 0.33071898101693525, "grad_norm": 0.27216002345085144, "learning_rate": 0.00028544010931653285, "loss": 4.8326, "step": 16150 }, { "epoch": 0.3308213707943399, "grad_norm": 0.2517782151699066, "learning_rate": 0.00028541349618508584, "loss": 4.7682, "step": 16155 }, { "epoch": 0.3309237605717445, "grad_norm": 0.24866561591625214, "learning_rate": 0.0002853868599965541, "loss": 4.7985, "step": 16160 }, { "epoch": 0.3310261503491491, "grad_norm": 0.24357706308364868, "learning_rate": 0.0002853602007554731, "loss": 4.7943, "step": 16165 }, { "epoch": 0.33112854012655374, "grad_norm": 0.25437769293785095, "learning_rate": 0.00028533351846638205, "loss": 4.8135, "step": 16170 }, { "epoch": 0.3312309299039584, "grad_norm": 0.26568377017974854, "learning_rate": 0.0002853068131338243, "loss": 4.8396, "step": 16175 }, { "epoch": 0.33133331968136304, "grad_norm": 0.2585509717464447, "learning_rate": 0.00028528008476234694, "loss": 4.8335, "step": 16180 }, { "epoch": 0.33143570945876766, "grad_norm": 0.23786552250385284, "learning_rate": 0.0002852533333565011, "loss": 4.8348, "step": 16185 }, { "epoch": 0.3315380992361723, "grad_norm": 0.2424546480178833, "learning_rate": 0.0002852265589208417, "loss": 4.7527, "step": 16190 }, { "epoch": 0.3316404890135769, "grad_norm": 0.2438025027513504, "learning_rate": 0.0002851997614599278, "loss": 4.7954, "step": 16195 }, { "epoch": 0.3317428787909815, "grad_norm": 0.5066136121749878, "learning_rate": 0.00028517294097832216, "loss": 4.7434, "step": 16200 }, { "epoch": 0.33184526856838614, "grad_norm": 0.25915953516960144, "learning_rate": 0.0002851460974805916, "loss": 4.8257, "step": 16205 }, { "epoch": 0.33194765834579076, "grad_norm": 0.23952145874500275, "learning_rate": 0.0002851192309713067, "loss": 4.797, "step": 16210 }, { "epoch": 0.3320500481231954, "grad_norm": 0.28268367052078247, "learning_rate": 0.0002850923414550422, "loss": 4.7946, "step": 16215 }, { "epoch": 0.3321524379006, "grad_norm": 0.2623506188392639, "learning_rate": 0.0002850654289363766, "loss": 4.7887, "step": 16220 }, { "epoch": 0.3322548276780046, "grad_norm": 0.2400224804878235, "learning_rate": 0.00028503849341989223, "loss": 4.863, "step": 16225 }, { "epoch": 0.33235721745540925, "grad_norm": 0.23497094213962555, "learning_rate": 0.00028501153491017556, "loss": 4.8554, "step": 16230 }, { "epoch": 0.33245960723281387, "grad_norm": 0.23780083656311035, "learning_rate": 0.0002849845534118168, "loss": 4.8084, "step": 16235 }, { "epoch": 0.3325619970102185, "grad_norm": 0.26179882884025574, "learning_rate": 0.0002849575489294102, "loss": 4.8035, "step": 16240 }, { "epoch": 0.3326643867876231, "grad_norm": 0.2360720932483673, "learning_rate": 0.0002849305214675538, "loss": 4.7602, "step": 16245 }, { "epoch": 0.33276677656502773, "grad_norm": 0.2859875559806824, "learning_rate": 0.00028490347103084956, "loss": 4.8084, "step": 16250 }, { "epoch": 0.33286916634243235, "grad_norm": 0.2751981317996979, "learning_rate": 0.00028487639762390357, "loss": 4.8233, "step": 16255 }, { "epoch": 0.332971556119837, "grad_norm": 0.2609846889972687, "learning_rate": 0.00028484930125132553, "loss": 4.7336, "step": 16260 }, { "epoch": 0.3330739458972416, "grad_norm": 0.2535109221935272, "learning_rate": 0.0002848221819177292, "loss": 4.8385, "step": 16265 }, { "epoch": 0.3331763356746462, "grad_norm": 0.25714072585105896, "learning_rate": 0.0002847950396277323, "loss": 4.8218, "step": 16270 }, { "epoch": 0.3332787254520509, "grad_norm": 0.23650965094566345, "learning_rate": 0.00028476787438595633, "loss": 4.8234, "step": 16275 }, { "epoch": 0.3333811152294555, "grad_norm": 0.26363539695739746, "learning_rate": 0.0002847406861970268, "loss": 4.808, "step": 16280 }, { "epoch": 0.33348350500686014, "grad_norm": 0.2578558623790741, "learning_rate": 0.000284713475065573, "loss": 4.7717, "step": 16285 }, { "epoch": 0.33358589478426476, "grad_norm": 0.2533988654613495, "learning_rate": 0.0002846862409962284, "loss": 4.8438, "step": 16290 }, { "epoch": 0.3336882845616694, "grad_norm": 0.3612011969089508, "learning_rate": 0.00028465898399363, "loss": 4.8041, "step": 16295 }, { "epoch": 0.333790674339074, "grad_norm": 0.23150953650474548, "learning_rate": 0.00028463170406241897, "loss": 4.7842, "step": 16300 }, { "epoch": 0.3338930641164786, "grad_norm": 0.24778206646442413, "learning_rate": 0.0002846044012072403, "loss": 4.7862, "step": 16305 }, { "epoch": 0.33399545389388324, "grad_norm": 0.24051642417907715, "learning_rate": 0.00028457707543274295, "loss": 4.767, "step": 16310 }, { "epoch": 0.33409784367128786, "grad_norm": 0.23826947808265686, "learning_rate": 0.00028454972674357975, "loss": 4.8691, "step": 16315 }, { "epoch": 0.3342002334486925, "grad_norm": 0.2339300811290741, "learning_rate": 0.0002845223551444072, "loss": 4.8082, "step": 16320 }, { "epoch": 0.3343026232260971, "grad_norm": 0.2331026792526245, "learning_rate": 0.0002844949606398861, "loss": 4.8197, "step": 16325 }, { "epoch": 0.3344050130035017, "grad_norm": 0.22589321434497833, "learning_rate": 0.0002844675432346809, "loss": 4.8737, "step": 16330 }, { "epoch": 0.33450740278090635, "grad_norm": 0.24397091567516327, "learning_rate": 0.00028444010293346005, "loss": 4.8021, "step": 16335 }, { "epoch": 0.33460979255831097, "grad_norm": 0.24349862337112427, "learning_rate": 0.00028441263974089584, "loss": 4.791, "step": 16340 }, { "epoch": 0.3347121823357156, "grad_norm": 0.23792225122451782, "learning_rate": 0.0002843851536616644, "loss": 4.7971, "step": 16345 }, { "epoch": 0.3348145721131202, "grad_norm": 0.26191046833992004, "learning_rate": 0.0002843576447004459, "loss": 4.79, "step": 16350 }, { "epoch": 0.33491696189052483, "grad_norm": 0.27435147762298584, "learning_rate": 0.0002843301128619243, "loss": 4.8152, "step": 16355 }, { "epoch": 0.33501935166792945, "grad_norm": 0.2583461105823517, "learning_rate": 0.0002843025581507876, "loss": 4.7318, "step": 16360 }, { "epoch": 0.3351217414453341, "grad_norm": 0.2498168796300888, "learning_rate": 0.00028427498057172744, "loss": 4.8408, "step": 16365 }, { "epoch": 0.3352241312227387, "grad_norm": 0.2474629431962967, "learning_rate": 0.00028424738012943954, "loss": 4.7808, "step": 16370 }, { "epoch": 0.3353265210001433, "grad_norm": 0.2743283212184906, "learning_rate": 0.00028421975682862353, "loss": 4.8157, "step": 16375 }, { "epoch": 0.335428910777548, "grad_norm": 0.2380060851573944, "learning_rate": 0.00028419211067398284, "loss": 4.7808, "step": 16380 }, { "epoch": 0.3355313005549526, "grad_norm": 0.2658345699310303, "learning_rate": 0.0002841644416702248, "loss": 4.8096, "step": 16385 }, { "epoch": 0.33563369033235724, "grad_norm": 0.24597862362861633, "learning_rate": 0.0002841367498220607, "loss": 4.8153, "step": 16390 }, { "epoch": 0.33573608010976186, "grad_norm": 0.2558453381061554, "learning_rate": 0.0002841090351342057, "loss": 4.7838, "step": 16395 }, { "epoch": 0.3358384698871665, "grad_norm": 0.22768911719322205, "learning_rate": 0.0002840812976113788, "loss": 4.8122, "step": 16400 }, { "epoch": 0.3359408596645711, "grad_norm": 0.24096199870109558, "learning_rate": 0.00028405353725830294, "loss": 4.8449, "step": 16405 }, { "epoch": 0.3360432494419757, "grad_norm": 0.23811636865139008, "learning_rate": 0.0002840257540797048, "loss": 4.7902, "step": 16410 }, { "epoch": 0.33614563921938034, "grad_norm": 0.25714996457099915, "learning_rate": 0.00028399794808031524, "loss": 4.816, "step": 16415 }, { "epoch": 0.33624802899678496, "grad_norm": 0.28754061460494995, "learning_rate": 0.00028397011926486874, "loss": 4.7533, "step": 16420 }, { "epoch": 0.3363504187741896, "grad_norm": 0.27357572317123413, "learning_rate": 0.0002839422676381037, "loss": 4.7692, "step": 16425 }, { "epoch": 0.3364528085515942, "grad_norm": 0.23396272957324982, "learning_rate": 0.0002839143932047626, "loss": 4.8036, "step": 16430 }, { "epoch": 0.33655519832899883, "grad_norm": 0.24027958512306213, "learning_rate": 0.0002838864959695916, "loss": 4.7694, "step": 16435 }, { "epoch": 0.33665758810640345, "grad_norm": 0.2548987865447998, "learning_rate": 0.00028385857593734085, "loss": 4.8462, "step": 16440 }, { "epoch": 0.33675997788380807, "grad_norm": 0.24632757902145386, "learning_rate": 0.00028383063311276425, "loss": 4.7882, "step": 16445 }, { "epoch": 0.3368623676612127, "grad_norm": 0.22980652749538422, "learning_rate": 0.0002838026675006197, "loss": 4.8391, "step": 16450 }, { "epoch": 0.3369647574386173, "grad_norm": 0.2512337863445282, "learning_rate": 0.00028377467910566904, "loss": 4.7639, "step": 16455 }, { "epoch": 0.33706714721602193, "grad_norm": 0.2517258822917938, "learning_rate": 0.0002837466679326778, "loss": 4.8728, "step": 16460 }, { "epoch": 0.33716953699342656, "grad_norm": 0.2762230932712555, "learning_rate": 0.0002837186339864156, "loss": 4.8328, "step": 16465 }, { "epoch": 0.3372719267708312, "grad_norm": 0.2519664466381073, "learning_rate": 0.0002836905772716556, "loss": 4.8005, "step": 16470 }, { "epoch": 0.3373743165482358, "grad_norm": 0.28721392154693604, "learning_rate": 0.0002836624977931754, "loss": 4.7947, "step": 16475 }, { "epoch": 0.3374767063256405, "grad_norm": 0.25575771927833557, "learning_rate": 0.0002836343955557559, "loss": 4.8228, "step": 16480 }, { "epoch": 0.3375790961030451, "grad_norm": 0.2361585646867752, "learning_rate": 0.00028360627056418216, "loss": 4.8759, "step": 16485 }, { "epoch": 0.3376814858804497, "grad_norm": 0.2382163107395172, "learning_rate": 0.0002835781228232431, "loss": 4.804, "step": 16490 }, { "epoch": 0.33778387565785434, "grad_norm": 0.24462802708148956, "learning_rate": 0.0002835499523377314, "loss": 4.8405, "step": 16495 }, { "epoch": 0.33788626543525896, "grad_norm": 0.2449190318584442, "learning_rate": 0.0002835217591124438, "loss": 4.8327, "step": 16500 }, { "epoch": 0.3379886552126636, "grad_norm": 0.25404995679855347, "learning_rate": 0.00028349354315218084, "loss": 4.8044, "step": 16505 }, { "epoch": 0.3380910449900682, "grad_norm": 0.24927467107772827, "learning_rate": 0.00028346530446174677, "loss": 4.8553, "step": 16510 }, { "epoch": 0.3381934347674728, "grad_norm": 0.25905531644821167, "learning_rate": 0.00028343704304594993, "loss": 4.8353, "step": 16515 }, { "epoch": 0.33829582454487744, "grad_norm": 0.22443336248397827, "learning_rate": 0.0002834087589096024, "loss": 4.7935, "step": 16520 }, { "epoch": 0.33839821432228206, "grad_norm": 0.24087576568126678, "learning_rate": 0.00028338045205752023, "loss": 4.7795, "step": 16525 }, { "epoch": 0.3385006040996867, "grad_norm": 0.2546943128108978, "learning_rate": 0.00028335212249452316, "loss": 4.8057, "step": 16530 }, { "epoch": 0.3386029938770913, "grad_norm": 0.23839382827281952, "learning_rate": 0.000283323770225435, "loss": 4.8209, "step": 16535 }, { "epoch": 0.33870538365449593, "grad_norm": 0.29359230399131775, "learning_rate": 0.0002832953952550833, "loss": 4.8573, "step": 16540 }, { "epoch": 0.33880777343190055, "grad_norm": 0.2874332070350647, "learning_rate": 0.0002832669975882996, "loss": 4.84, "step": 16545 }, { "epoch": 0.33891016320930517, "grad_norm": 0.2460402101278305, "learning_rate": 0.0002832385772299191, "loss": 4.6877, "step": 16550 }, { "epoch": 0.3390125529867098, "grad_norm": 0.23004408180713654, "learning_rate": 0.000283210134184781, "loss": 4.793, "step": 16555 }, { "epoch": 0.3391149427641144, "grad_norm": 0.25409266352653503, "learning_rate": 0.00028318166845772843, "loss": 4.7968, "step": 16560 }, { "epoch": 0.33921733254151903, "grad_norm": 0.2459293007850647, "learning_rate": 0.0002831531800536082, "loss": 4.8258, "step": 16565 }, { "epoch": 0.33931972231892366, "grad_norm": 0.25078505277633667, "learning_rate": 0.0002831246689772712, "loss": 4.7815, "step": 16570 }, { "epoch": 0.3394221120963283, "grad_norm": 0.2511816620826721, "learning_rate": 0.0002830961352335719, "loss": 4.8209, "step": 16575 }, { "epoch": 0.33952450187373295, "grad_norm": 0.24562671780586243, "learning_rate": 0.00028306757882736887, "loss": 4.8267, "step": 16580 }, { "epoch": 0.3396268916511376, "grad_norm": 0.23564331233501434, "learning_rate": 0.0002830389997635245, "loss": 4.8405, "step": 16585 }, { "epoch": 0.3397292814285422, "grad_norm": 0.24330298602581024, "learning_rate": 0.00028301039804690494, "loss": 4.8183, "step": 16590 }, { "epoch": 0.3398316712059468, "grad_norm": 0.22338157892227173, "learning_rate": 0.0002829817736823802, "loss": 4.8447, "step": 16595 }, { "epoch": 0.33993406098335144, "grad_norm": 0.235164612531662, "learning_rate": 0.0002829531266748244, "loss": 4.7876, "step": 16600 }, { "epoch": 0.34003645076075606, "grad_norm": 0.2610068619251251, "learning_rate": 0.0002829244570291151, "loss": 4.8369, "step": 16605 }, { "epoch": 0.3401388405381607, "grad_norm": 0.23874932527542114, "learning_rate": 0.000282895764750134, "loss": 4.7797, "step": 16610 }, { "epoch": 0.3402412303155653, "grad_norm": 0.2389604151248932, "learning_rate": 0.00028286704984276665, "loss": 4.7596, "step": 16615 }, { "epoch": 0.3403436200929699, "grad_norm": 0.24898886680603027, "learning_rate": 0.00028283831231190224, "loss": 4.8472, "step": 16620 }, { "epoch": 0.34044600987037454, "grad_norm": 0.24569012224674225, "learning_rate": 0.0002828095521624341, "loss": 4.7886, "step": 16625 }, { "epoch": 0.34054839964777917, "grad_norm": 0.2282789796590805, "learning_rate": 0.00028278076939925926, "loss": 4.8108, "step": 16630 }, { "epoch": 0.3406507894251838, "grad_norm": 0.3270750343799591, "learning_rate": 0.0002827519640272785, "loss": 4.7373, "step": 16635 }, { "epoch": 0.3407531792025884, "grad_norm": 0.2560490369796753, "learning_rate": 0.0002827231360513966, "loss": 4.7895, "step": 16640 }, { "epoch": 0.34085556897999303, "grad_norm": 0.2517138123512268, "learning_rate": 0.0002826942854765222, "loss": 4.7704, "step": 16645 }, { "epoch": 0.34095795875739765, "grad_norm": 0.24209338426589966, "learning_rate": 0.00028266541230756773, "loss": 4.8018, "step": 16650 }, { "epoch": 0.34106034853480227, "grad_norm": 0.23627488315105438, "learning_rate": 0.00028263651654944944, "loss": 4.8031, "step": 16655 }, { "epoch": 0.3411627383122069, "grad_norm": 0.24600344896316528, "learning_rate": 0.0002826075982070875, "loss": 4.817, "step": 16660 }, { "epoch": 0.3412651280896115, "grad_norm": 0.23830489814281464, "learning_rate": 0.0002825786572854058, "loss": 4.8191, "step": 16665 }, { "epoch": 0.34136751786701613, "grad_norm": 0.2401708960533142, "learning_rate": 0.0002825496937893322, "loss": 4.7781, "step": 16670 }, { "epoch": 0.34146990764442076, "grad_norm": 0.23110555112361908, "learning_rate": 0.0002825207077237985, "loss": 4.7622, "step": 16675 }, { "epoch": 0.34157229742182543, "grad_norm": 0.23407591879367828, "learning_rate": 0.00028249169909373993, "loss": 4.8343, "step": 16680 }, { "epoch": 0.34167468719923005, "grad_norm": 0.258706659078598, "learning_rate": 0.0002824626679040961, "loss": 4.7401, "step": 16685 }, { "epoch": 0.3417770769766347, "grad_norm": 0.2502451539039612, "learning_rate": 0.00028243361415981004, "loss": 4.8391, "step": 16690 }, { "epoch": 0.3418794667540393, "grad_norm": 0.26287874579429626, "learning_rate": 0.00028240453786582885, "loss": 4.8496, "step": 16695 }, { "epoch": 0.3419818565314439, "grad_norm": 0.24094346165657043, "learning_rate": 0.0002823754390271034, "loss": 4.7413, "step": 16700 }, { "epoch": 0.34208424630884854, "grad_norm": 0.2397972196340561, "learning_rate": 0.0002823463176485883, "loss": 4.8279, "step": 16705 }, { "epoch": 0.34218663608625316, "grad_norm": 0.42894208431243896, "learning_rate": 0.0002823171737352423, "loss": 4.779, "step": 16710 }, { "epoch": 0.3422890258636578, "grad_norm": 0.24300961196422577, "learning_rate": 0.00028228800729202756, "loss": 4.7593, "step": 16715 }, { "epoch": 0.3423914156410624, "grad_norm": 0.2818509638309479, "learning_rate": 0.0002822588183239105, "loss": 4.774, "step": 16720 }, { "epoch": 0.342493805418467, "grad_norm": 0.2569493055343628, "learning_rate": 0.00028222960683586096, "loss": 4.8116, "step": 16725 }, { "epoch": 0.34259619519587164, "grad_norm": 0.24437934160232544, "learning_rate": 0.00028220037283285305, "loss": 4.8085, "step": 16730 }, { "epoch": 0.34269858497327627, "grad_norm": 0.31071799993515015, "learning_rate": 0.00028217111631986434, "loss": 4.8283, "step": 16735 }, { "epoch": 0.3428009747506809, "grad_norm": 0.2863980829715729, "learning_rate": 0.0002821418373018764, "loss": 4.83, "step": 16740 }, { "epoch": 0.3429033645280855, "grad_norm": 0.25225335359573364, "learning_rate": 0.0002821125357838747, "loss": 4.8388, "step": 16745 }, { "epoch": 0.34300575430549013, "grad_norm": 0.2698221206665039, "learning_rate": 0.0002820832117708484, "loss": 4.8276, "step": 16750 }, { "epoch": 0.34310814408289475, "grad_norm": 2.018913745880127, "learning_rate": 0.0002820538652677905, "loss": 4.8773, "step": 16755 }, { "epoch": 0.34321053386029937, "grad_norm": 0.28117841482162476, "learning_rate": 0.0002820244962796981, "loss": 4.7727, "step": 16760 }, { "epoch": 0.343312923637704, "grad_norm": 0.26004379987716675, "learning_rate": 0.0002819951048115716, "loss": 4.8188, "step": 16765 }, { "epoch": 0.3434153134151086, "grad_norm": 0.23128695785999298, "learning_rate": 0.00028196569086841566, "loss": 4.7956, "step": 16770 }, { "epoch": 0.34351770319251324, "grad_norm": 0.25050005316734314, "learning_rate": 0.00028193625445523877, "loss": 4.8066, "step": 16775 }, { "epoch": 0.34362009296991786, "grad_norm": 0.27198299765586853, "learning_rate": 0.0002819067955770529, "loss": 4.8297, "step": 16780 }, { "epoch": 0.34372248274732253, "grad_norm": 0.3195876181125641, "learning_rate": 0.00028187731423887427, "loss": 4.8254, "step": 16785 }, { "epoch": 0.34382487252472715, "grad_norm": 0.2708126902580261, "learning_rate": 0.0002818478104457226, "loss": 4.8057, "step": 16790 }, { "epoch": 0.3439272623021318, "grad_norm": 0.23975291848182678, "learning_rate": 0.00028181828420262154, "loss": 4.7726, "step": 16795 }, { "epoch": 0.3440296520795364, "grad_norm": 0.23758544027805328, "learning_rate": 0.00028178873551459866, "loss": 4.8273, "step": 16800 }, { "epoch": 0.344132041856941, "grad_norm": 0.24509690701961517, "learning_rate": 0.00028175916438668517, "loss": 4.8156, "step": 16805 }, { "epoch": 0.34423443163434564, "grad_norm": 0.24822518229484558, "learning_rate": 0.00028172957082391617, "loss": 4.7679, "step": 16810 }, { "epoch": 0.34433682141175026, "grad_norm": 0.24076735973358154, "learning_rate": 0.0002816999548313308, "loss": 4.8053, "step": 16815 }, { "epoch": 0.3444392111891549, "grad_norm": 0.24137085676193237, "learning_rate": 0.0002816703164139716, "loss": 4.7941, "step": 16820 }, { "epoch": 0.3445416009665595, "grad_norm": 0.26467767357826233, "learning_rate": 0.00028164065557688525, "loss": 4.8041, "step": 16825 }, { "epoch": 0.3446439907439641, "grad_norm": 0.3696797490119934, "learning_rate": 0.00028161097232512223, "loss": 4.8209, "step": 16830 }, { "epoch": 0.34474638052136874, "grad_norm": 0.24373362958431244, "learning_rate": 0.0002815812666637366, "loss": 4.7785, "step": 16835 }, { "epoch": 0.34484877029877337, "grad_norm": 0.2379792183637619, "learning_rate": 0.00028155153859778654, "loss": 4.7788, "step": 16840 }, { "epoch": 0.344951160076178, "grad_norm": 0.2604856491088867, "learning_rate": 0.00028152178813233377, "loss": 4.7654, "step": 16845 }, { "epoch": 0.3450535498535826, "grad_norm": 0.32233041524887085, "learning_rate": 0.000281492015272444, "loss": 4.7641, "step": 16850 }, { "epoch": 0.34515593963098723, "grad_norm": 0.2897481918334961, "learning_rate": 0.00028146222002318675, "loss": 4.8001, "step": 16855 }, { "epoch": 0.34525832940839185, "grad_norm": 0.2490362972021103, "learning_rate": 0.00028143240238963526, "loss": 4.8008, "step": 16860 }, { "epoch": 0.34536071918579647, "grad_norm": 0.24507276713848114, "learning_rate": 0.00028140256237686663, "loss": 4.696, "step": 16865 }, { "epoch": 0.3454631089632011, "grad_norm": 0.238366961479187, "learning_rate": 0.00028137269998996183, "loss": 4.8042, "step": 16870 }, { "epoch": 0.3455654987406057, "grad_norm": 0.25932565331459045, "learning_rate": 0.0002813428152340055, "loss": 4.7444, "step": 16875 }, { "epoch": 0.34566788851801034, "grad_norm": 0.22153566777706146, "learning_rate": 0.0002813129081140862, "loss": 4.8033, "step": 16880 }, { "epoch": 0.345770278295415, "grad_norm": 0.26651623845100403, "learning_rate": 0.0002812829786352963, "loss": 4.8489, "step": 16885 }, { "epoch": 0.34587266807281963, "grad_norm": 0.23320205509662628, "learning_rate": 0.0002812530268027318, "loss": 4.8217, "step": 16890 }, { "epoch": 0.34597505785022425, "grad_norm": 0.27574557065963745, "learning_rate": 0.0002812230526214928, "loss": 4.7301, "step": 16895 }, { "epoch": 0.3460774476276289, "grad_norm": 0.4646994173526764, "learning_rate": 0.0002811930560966831, "loss": 4.7735, "step": 16900 }, { "epoch": 0.3461798374050335, "grad_norm": 0.268422394990921, "learning_rate": 0.0002811630372334101, "loss": 4.7691, "step": 16905 }, { "epoch": 0.3462822271824381, "grad_norm": 0.2489718496799469, "learning_rate": 0.00028113299603678524, "loss": 4.7403, "step": 16910 }, { "epoch": 0.34638461695984274, "grad_norm": 0.2449781596660614, "learning_rate": 0.00028110293251192367, "loss": 4.7402, "step": 16915 }, { "epoch": 0.34648700673724736, "grad_norm": 0.2449360489845276, "learning_rate": 0.00028107284666394435, "loss": 4.8301, "step": 16920 }, { "epoch": 0.346589396514652, "grad_norm": 0.24715960025787354, "learning_rate": 0.00028104273849797014, "loss": 4.7768, "step": 16925 }, { "epoch": 0.3466917862920566, "grad_norm": 0.24429172277450562, "learning_rate": 0.0002810126080191275, "loss": 4.7959, "step": 16930 }, { "epoch": 0.3467941760694612, "grad_norm": 0.2417449951171875, "learning_rate": 0.00028098245523254674, "loss": 4.8027, "step": 16935 }, { "epoch": 0.34689656584686585, "grad_norm": 0.25111472606658936, "learning_rate": 0.0002809522801433622, "loss": 4.7777, "step": 16940 }, { "epoch": 0.34699895562427047, "grad_norm": 0.2312670648097992, "learning_rate": 0.0002809220827567117, "loss": 4.7974, "step": 16945 }, { "epoch": 0.3471013454016751, "grad_norm": 0.2415151447057724, "learning_rate": 0.0002808918630777371, "loss": 4.8084, "step": 16950 }, { "epoch": 0.3472037351790797, "grad_norm": 0.28880220651626587, "learning_rate": 0.0002808616211115839, "loss": 4.7984, "step": 16955 }, { "epoch": 0.34730612495648433, "grad_norm": 0.2454739809036255, "learning_rate": 0.0002808313568634015, "loss": 4.7436, "step": 16960 }, { "epoch": 0.34740851473388895, "grad_norm": 0.2548595368862152, "learning_rate": 0.00028080107033834296, "loss": 4.83, "step": 16965 }, { "epoch": 0.3475109045112936, "grad_norm": 0.2426546961069107, "learning_rate": 0.0002807707615415653, "loss": 4.7768, "step": 16970 }, { "epoch": 0.3476132942886982, "grad_norm": 0.27079081535339355, "learning_rate": 0.0002807404304782292, "loss": 4.7945, "step": 16975 }, { "epoch": 0.3477156840661028, "grad_norm": 0.23837393522262573, "learning_rate": 0.00028071007715349923, "loss": 4.8218, "step": 16980 }, { "epoch": 0.3478180738435075, "grad_norm": 0.25045403838157654, "learning_rate": 0.00028067970157254365, "loss": 4.7609, "step": 16985 }, { "epoch": 0.3479204636209121, "grad_norm": 0.4101354777812958, "learning_rate": 0.00028064930374053457, "loss": 4.757, "step": 16990 }, { "epoch": 0.34802285339831673, "grad_norm": 0.2520548701286316, "learning_rate": 0.0002806188836626479, "loss": 4.8007, "step": 16995 }, { "epoch": 0.34812524317572136, "grad_norm": 0.2483699470758438, "learning_rate": 0.00028058844134406326, "loss": 4.812, "step": 17000 }, { "epoch": 0.34812524317572136, "eval_loss": 4.819680690765381, "eval_runtime": 17.2382, "eval_samples_per_second": 33.762, "eval_steps_per_second": 4.235, "step": 17000 }, { "epoch": 0.348227632953126, "grad_norm": 0.24901323020458221, "learning_rate": 0.0002805579767899642, "loss": 4.8143, "step": 17005 }, { "epoch": 0.3483300227305306, "grad_norm": 0.22599072754383087, "learning_rate": 0.00028052749000553797, "loss": 4.7996, "step": 17010 }, { "epoch": 0.3484324125079352, "grad_norm": 0.26469743251800537, "learning_rate": 0.00028049698099597557, "loss": 4.824, "step": 17015 }, { "epoch": 0.34853480228533984, "grad_norm": 0.2392892986536026, "learning_rate": 0.0002804664497664718, "loss": 4.8004, "step": 17020 }, { "epoch": 0.34863719206274446, "grad_norm": 0.2556057870388031, "learning_rate": 0.00028043589632222526, "loss": 4.8016, "step": 17025 }, { "epoch": 0.3487395818401491, "grad_norm": 0.24753303825855255, "learning_rate": 0.00028040532066843837, "loss": 4.8308, "step": 17030 }, { "epoch": 0.3488419716175537, "grad_norm": 0.2821330726146698, "learning_rate": 0.0002803747228103173, "loss": 4.8326, "step": 17035 }, { "epoch": 0.3489443613949583, "grad_norm": 0.25612354278564453, "learning_rate": 0.000280344102753072, "loss": 4.8169, "step": 17040 }, { "epoch": 0.34904675117236295, "grad_norm": 0.3134109079837799, "learning_rate": 0.00028031346050191625, "loss": 4.7893, "step": 17045 }, { "epoch": 0.34914914094976757, "grad_norm": 0.2650546729564667, "learning_rate": 0.0002802827960620674, "loss": 4.7929, "step": 17050 }, { "epoch": 0.3492515307271722, "grad_norm": 0.3147332966327667, "learning_rate": 0.0002802521094387469, "loss": 4.7455, "step": 17055 }, { "epoch": 0.3493539205045768, "grad_norm": 0.2313137948513031, "learning_rate": 0.0002802214006371797, "loss": 4.8454, "step": 17060 }, { "epoch": 0.34945631028198143, "grad_norm": 0.26301109790802, "learning_rate": 0.0002801906696625947, "loss": 4.8467, "step": 17065 }, { "epoch": 0.34955870005938605, "grad_norm": 0.24875031411647797, "learning_rate": 0.0002801599165202245, "loss": 4.7881, "step": 17070 }, { "epoch": 0.3496610898367907, "grad_norm": 0.23368525505065918, "learning_rate": 0.0002801291412153055, "loss": 4.7991, "step": 17075 }, { "epoch": 0.3497634796141953, "grad_norm": 0.2556806206703186, "learning_rate": 0.0002800983437530778, "loss": 4.741, "step": 17080 }, { "epoch": 0.3498658693915999, "grad_norm": 0.2457631677389145, "learning_rate": 0.00028006752413878543, "loss": 4.819, "step": 17085 }, { "epoch": 0.3499682591690046, "grad_norm": 0.24281609058380127, "learning_rate": 0.0002800366823776761, "loss": 4.8416, "step": 17090 }, { "epoch": 0.3500706489464092, "grad_norm": 0.23999285697937012, "learning_rate": 0.00028000581847500126, "loss": 4.8062, "step": 17095 }, { "epoch": 0.35017303872381383, "grad_norm": 0.2510368227958679, "learning_rate": 0.0002799749324360161, "loss": 4.8053, "step": 17100 }, { "epoch": 0.35027542850121846, "grad_norm": 0.25193095207214355, "learning_rate": 0.00027994402426597965, "loss": 4.7974, "step": 17105 }, { "epoch": 0.3503778182786231, "grad_norm": 0.23930718004703522, "learning_rate": 0.00027991309397015475, "loss": 4.7517, "step": 17110 }, { "epoch": 0.3504802080560277, "grad_norm": 0.25219717621803284, "learning_rate": 0.00027988214155380797, "loss": 4.7869, "step": 17115 }, { "epoch": 0.3505825978334323, "grad_norm": 0.2704419493675232, "learning_rate": 0.00027985116702220955, "loss": 4.8298, "step": 17120 }, { "epoch": 0.35068498761083694, "grad_norm": 0.24905303120613098, "learning_rate": 0.0002798201703806336, "loss": 4.7257, "step": 17125 }, { "epoch": 0.35078737738824156, "grad_norm": 0.2514397203922272, "learning_rate": 0.000279789151634358, "loss": 4.7664, "step": 17130 }, { "epoch": 0.3508897671656462, "grad_norm": 0.26159238815307617, "learning_rate": 0.0002797581107886644, "loss": 4.7936, "step": 17135 }, { "epoch": 0.3509921569430508, "grad_norm": 0.24079452455043793, "learning_rate": 0.0002797270478488381, "loss": 4.7738, "step": 17140 }, { "epoch": 0.3510945467204554, "grad_norm": 0.2545469403266907, "learning_rate": 0.00027969596282016826, "loss": 4.8097, "step": 17145 }, { "epoch": 0.35119693649786005, "grad_norm": 0.22803018987178802, "learning_rate": 0.0002796648557079478, "loss": 4.8116, "step": 17150 }, { "epoch": 0.35129932627526467, "grad_norm": 0.25125497579574585, "learning_rate": 0.00027963372651747335, "loss": 4.7974, "step": 17155 }, { "epoch": 0.3514017160526693, "grad_norm": 0.24489642679691315, "learning_rate": 0.0002796025752540454, "loss": 4.8409, "step": 17160 }, { "epoch": 0.3515041058300739, "grad_norm": 0.2647576332092285, "learning_rate": 0.00027957140192296805, "loss": 4.7582, "step": 17165 }, { "epoch": 0.35160649560747853, "grad_norm": 0.25523173809051514, "learning_rate": 0.0002795402065295493, "loss": 4.7607, "step": 17170 }, { "epoch": 0.35170888538488315, "grad_norm": 0.29808324575424194, "learning_rate": 0.0002795089890791007, "loss": 4.7946, "step": 17175 }, { "epoch": 0.3518112751622878, "grad_norm": 0.25081580877304077, "learning_rate": 0.00027947774957693794, "loss": 4.8322, "step": 17180 }, { "epoch": 0.3519136649396924, "grad_norm": 0.23519980907440186, "learning_rate": 0.00027944648802838, "loss": 4.8076, "step": 17185 }, { "epoch": 0.35201605471709707, "grad_norm": 0.24327452480793, "learning_rate": 0.0002794152044387499, "loss": 4.761, "step": 17190 }, { "epoch": 0.3521184444945017, "grad_norm": 0.22986432909965515, "learning_rate": 0.0002793838988133745, "loss": 4.8265, "step": 17195 }, { "epoch": 0.3522208342719063, "grad_norm": 0.24117164313793182, "learning_rate": 0.00027935257115758403, "loss": 4.7674, "step": 17200 }, { "epoch": 0.35232322404931093, "grad_norm": 0.26325535774230957, "learning_rate": 0.00027932122147671286, "loss": 4.7465, "step": 17205 }, { "epoch": 0.35242561382671556, "grad_norm": 0.23438550531864166, "learning_rate": 0.0002792898497760989, "loss": 4.7598, "step": 17210 }, { "epoch": 0.3525280036041202, "grad_norm": 0.25688278675079346, "learning_rate": 0.0002792584560610838, "loss": 4.8517, "step": 17215 }, { "epoch": 0.3526303933815248, "grad_norm": 0.2520984411239624, "learning_rate": 0.0002792270403370132, "loss": 4.7871, "step": 17220 }, { "epoch": 0.3527327831589294, "grad_norm": 0.24474072456359863, "learning_rate": 0.0002791956026092361, "loss": 4.8164, "step": 17225 }, { "epoch": 0.35283517293633404, "grad_norm": 0.23748761415481567, "learning_rate": 0.00027916414288310556, "loss": 4.7758, "step": 17230 }, { "epoch": 0.35293756271373866, "grad_norm": 0.25483253598213196, "learning_rate": 0.00027913266116397824, "loss": 4.7706, "step": 17235 }, { "epoch": 0.3530399524911433, "grad_norm": 0.23142124712467194, "learning_rate": 0.00027910115745721466, "loss": 4.8061, "step": 17240 }, { "epoch": 0.3531423422685479, "grad_norm": 0.23943887650966644, "learning_rate": 0.00027906963176817896, "loss": 4.7508, "step": 17245 }, { "epoch": 0.3532447320459525, "grad_norm": 0.24452316761016846, "learning_rate": 0.0002790380841022391, "loss": 4.7999, "step": 17250 }, { "epoch": 0.35334712182335715, "grad_norm": 0.2289561629295349, "learning_rate": 0.00027900651446476673, "loss": 4.7526, "step": 17255 }, { "epoch": 0.35344951160076177, "grad_norm": 0.2562633752822876, "learning_rate": 0.0002789749228611372, "loss": 4.7962, "step": 17260 }, { "epoch": 0.3535519013781664, "grad_norm": 0.2539807856082916, "learning_rate": 0.0002789433092967298, "loss": 4.8088, "step": 17265 }, { "epoch": 0.353654291155571, "grad_norm": 0.2711130380630493, "learning_rate": 0.00027891167377692733, "loss": 4.7739, "step": 17270 }, { "epoch": 0.35375668093297563, "grad_norm": 0.23005102574825287, "learning_rate": 0.0002788800163071164, "loss": 4.7649, "step": 17275 }, { "epoch": 0.35385907071038025, "grad_norm": 0.27644792199134827, "learning_rate": 0.0002788483368926875, "loss": 4.811, "step": 17280 }, { "epoch": 0.3539614604877849, "grad_norm": 0.23939038813114166, "learning_rate": 0.0002788166355390347, "loss": 4.8298, "step": 17285 }, { "epoch": 0.35406385026518955, "grad_norm": 0.2482702136039734, "learning_rate": 0.00027878491225155573, "loss": 4.7856, "step": 17290 }, { "epoch": 0.35416624004259417, "grad_norm": 0.2748696208000183, "learning_rate": 0.00027875316703565226, "loss": 4.8458, "step": 17295 }, { "epoch": 0.3542686298199988, "grad_norm": 0.2343509942293167, "learning_rate": 0.00027872139989672967, "loss": 4.7806, "step": 17300 }, { "epoch": 0.3543710195974034, "grad_norm": 0.2615395188331604, "learning_rate": 0.0002786896108401968, "loss": 4.7473, "step": 17305 }, { "epoch": 0.35447340937480804, "grad_norm": 0.2535521388053894, "learning_rate": 0.0002786577998714666, "loss": 4.8143, "step": 17310 }, { "epoch": 0.35457579915221266, "grad_norm": 0.25139519572257996, "learning_rate": 0.0002786259669959556, "loss": 4.7792, "step": 17315 }, { "epoch": 0.3546781889296173, "grad_norm": 0.23582305014133453, "learning_rate": 0.0002785941122190839, "loss": 4.8135, "step": 17320 }, { "epoch": 0.3547805787070219, "grad_norm": 0.243644580245018, "learning_rate": 0.00027856223554627557, "loss": 4.8175, "step": 17325 }, { "epoch": 0.3548829684844265, "grad_norm": 0.226680725812912, "learning_rate": 0.0002785303369829582, "loss": 4.7672, "step": 17330 }, { "epoch": 0.35498535826183114, "grad_norm": 0.2555479109287262, "learning_rate": 0.00027849841653456337, "loss": 4.792, "step": 17335 }, { "epoch": 0.35508774803923576, "grad_norm": 0.24544550478458405, "learning_rate": 0.0002784664742065261, "loss": 4.758, "step": 17340 }, { "epoch": 0.3551901378166404, "grad_norm": 0.2856253981590271, "learning_rate": 0.00027843451000428526, "loss": 4.7925, "step": 17345 }, { "epoch": 0.355292527594045, "grad_norm": 0.33989331126213074, "learning_rate": 0.00027840252393328355, "loss": 4.7906, "step": 17350 }, { "epoch": 0.3553949173714496, "grad_norm": 0.2488049864768982, "learning_rate": 0.0002783705159989672, "loss": 4.8246, "step": 17355 }, { "epoch": 0.35549730714885425, "grad_norm": 0.2608996629714966, "learning_rate": 0.0002783384862067863, "loss": 4.8372, "step": 17360 }, { "epoch": 0.35559969692625887, "grad_norm": 0.24256552755832672, "learning_rate": 0.00027830643456219463, "loss": 4.7457, "step": 17365 }, { "epoch": 0.3557020867036635, "grad_norm": 0.23565340042114258, "learning_rate": 0.00027827436107064963, "loss": 4.8171, "step": 17370 }, { "epoch": 0.3558044764810681, "grad_norm": 0.23600706458091736, "learning_rate": 0.0002782422657376125, "loss": 4.7826, "step": 17375 }, { "epoch": 0.35590686625847273, "grad_norm": 0.25355175137519836, "learning_rate": 0.00027821014856854823, "loss": 4.8079, "step": 17380 }, { "epoch": 0.35600925603587735, "grad_norm": 0.24057556688785553, "learning_rate": 0.0002781780095689255, "loss": 4.7874, "step": 17385 }, { "epoch": 0.35611164581328203, "grad_norm": 0.2481364607810974, "learning_rate": 0.00027814584874421644, "loss": 4.7971, "step": 17390 }, { "epoch": 0.35621403559068665, "grad_norm": 0.23850895464420319, "learning_rate": 0.00027811366609989743, "loss": 4.8081, "step": 17395 }, { "epoch": 0.35631642536809127, "grad_norm": 0.24444949626922607, "learning_rate": 0.00027808146164144806, "loss": 4.8147, "step": 17400 }, { "epoch": 0.3564188151454959, "grad_norm": 0.24324198067188263, "learning_rate": 0.00027804923537435195, "loss": 4.8222, "step": 17405 }, { "epoch": 0.3565212049229005, "grad_norm": 0.33767616748809814, "learning_rate": 0.00027801698730409624, "loss": 4.7202, "step": 17410 }, { "epoch": 0.35662359470030514, "grad_norm": 0.24210509657859802, "learning_rate": 0.0002779847174361719, "loss": 4.7669, "step": 17415 }, { "epoch": 0.35672598447770976, "grad_norm": 0.2612592875957489, "learning_rate": 0.00027795242577607356, "loss": 4.7519, "step": 17420 }, { "epoch": 0.3568283742551144, "grad_norm": 0.2495550662279129, "learning_rate": 0.0002779201123292996, "loss": 4.8081, "step": 17425 }, { "epoch": 0.356930764032519, "grad_norm": 0.2539116144180298, "learning_rate": 0.0002778877771013521, "loss": 4.7959, "step": 17430 }, { "epoch": 0.3570331538099236, "grad_norm": 0.25741246342658997, "learning_rate": 0.0002778554200977368, "loss": 4.8034, "step": 17435 }, { "epoch": 0.35713554358732824, "grad_norm": 0.2593652009963989, "learning_rate": 0.00027782304132396314, "loss": 4.7875, "step": 17440 }, { "epoch": 0.35723793336473286, "grad_norm": 0.24774949252605438, "learning_rate": 0.0002777906407855444, "loss": 4.7676, "step": 17445 }, { "epoch": 0.3573403231421375, "grad_norm": 0.24466589093208313, "learning_rate": 0.00027775821848799746, "loss": 4.7929, "step": 17450 }, { "epoch": 0.3574427129195421, "grad_norm": 0.24842630326747894, "learning_rate": 0.0002777257744368428, "loss": 4.8114, "step": 17455 }, { "epoch": 0.3575451026969467, "grad_norm": 0.242864191532135, "learning_rate": 0.00027769330863760487, "loss": 4.7947, "step": 17460 }, { "epoch": 0.35764749247435135, "grad_norm": 0.24956855177879333, "learning_rate": 0.0002776608210958116, "loss": 4.7834, "step": 17465 }, { "epoch": 0.35774988225175597, "grad_norm": 0.25236186385154724, "learning_rate": 0.00027762831181699473, "loss": 4.7886, "step": 17470 }, { "epoch": 0.3578522720291606, "grad_norm": 0.2560493052005768, "learning_rate": 0.0002775957808066897, "loss": 4.7606, "step": 17475 }, { "epoch": 0.3579546618065652, "grad_norm": 0.27371951937675476, "learning_rate": 0.0002775632280704355, "loss": 4.774, "step": 17480 }, { "epoch": 0.35805705158396983, "grad_norm": 0.24178016185760498, "learning_rate": 0.0002775306536137751, "loss": 4.752, "step": 17485 }, { "epoch": 0.35815944136137445, "grad_norm": 0.24859295785427094, "learning_rate": 0.0002774980574422549, "loss": 4.8077, "step": 17490 }, { "epoch": 0.35826183113877913, "grad_norm": 0.2725426256656647, "learning_rate": 0.0002774654395614251, "loss": 4.8069, "step": 17495 }, { "epoch": 0.35836422091618375, "grad_norm": 0.25182583928108215, "learning_rate": 0.00027743279997683964, "loss": 4.7768, "step": 17500 }, { "epoch": 0.3584666106935884, "grad_norm": 0.2423197627067566, "learning_rate": 0.0002774001386940561, "loss": 4.78, "step": 17505 }, { "epoch": 0.358569000470993, "grad_norm": 0.2437697947025299, "learning_rate": 0.00027736745571863583, "loss": 4.8309, "step": 17510 }, { "epoch": 0.3586713902483976, "grad_norm": 0.25134211778640747, "learning_rate": 0.00027733475105614367, "loss": 4.7802, "step": 17515 }, { "epoch": 0.35877378002580224, "grad_norm": 0.2694650888442993, "learning_rate": 0.0002773020247121485, "loss": 4.772, "step": 17520 }, { "epoch": 0.35887616980320686, "grad_norm": 0.26452839374542236, "learning_rate": 0.00027726927669222254, "loss": 4.7724, "step": 17525 }, { "epoch": 0.3589785595806115, "grad_norm": 0.2345585823059082, "learning_rate": 0.0002772365070019419, "loss": 4.8204, "step": 17530 }, { "epoch": 0.3590809493580161, "grad_norm": 0.25094476342201233, "learning_rate": 0.0002772037156468863, "loss": 4.7761, "step": 17535 }, { "epoch": 0.3591833391354207, "grad_norm": 0.2713305652141571, "learning_rate": 0.00027717090263263924, "loss": 4.832, "step": 17540 }, { "epoch": 0.35928572891282534, "grad_norm": 0.2531552016735077, "learning_rate": 0.00027713806796478784, "loss": 4.7771, "step": 17545 }, { "epoch": 0.35938811869022996, "grad_norm": 0.2462606132030487, "learning_rate": 0.00027710521164892276, "loss": 4.7297, "step": 17550 }, { "epoch": 0.3594905084676346, "grad_norm": 0.23273545503616333, "learning_rate": 0.00027707233369063877, "loss": 4.7601, "step": 17555 }, { "epoch": 0.3595928982450392, "grad_norm": 0.2425830215215683, "learning_rate": 0.00027703943409553385, "loss": 4.8245, "step": 17560 }, { "epoch": 0.3596952880224438, "grad_norm": 0.25369793176651, "learning_rate": 0.00027700651286920993, "loss": 4.7523, "step": 17565 }, { "epoch": 0.35979767779984845, "grad_norm": 0.35871362686157227, "learning_rate": 0.0002769735700172726, "loss": 4.8501, "step": 17570 }, { "epoch": 0.35990006757725307, "grad_norm": 0.2735600173473358, "learning_rate": 0.00027694060554533104, "loss": 4.7961, "step": 17575 }, { "epoch": 0.3600024573546577, "grad_norm": 0.2527202367782593, "learning_rate": 0.0002769076194589982, "loss": 4.8039, "step": 17580 }, { "epoch": 0.3601048471320623, "grad_norm": 0.23288428783416748, "learning_rate": 0.0002768746117638906, "loss": 4.7856, "step": 17585 }, { "epoch": 0.36020723690946693, "grad_norm": 0.24862439930438995, "learning_rate": 0.0002768415824656287, "loss": 4.7309, "step": 17590 }, { "epoch": 0.3603096266868716, "grad_norm": 0.25607404112815857, "learning_rate": 0.00027680853156983624, "loss": 4.7547, "step": 17595 }, { "epoch": 0.36041201646427623, "grad_norm": 0.25481516122817993, "learning_rate": 0.00027677545908214095, "loss": 4.7573, "step": 17600 }, { "epoch": 0.36051440624168085, "grad_norm": 0.25555238127708435, "learning_rate": 0.00027674236500817415, "loss": 4.8065, "step": 17605 }, { "epoch": 0.3606167960190855, "grad_norm": 0.23385271430015564, "learning_rate": 0.0002767092493535708, "loss": 4.795, "step": 17610 }, { "epoch": 0.3607191857964901, "grad_norm": 0.24765196442604065, "learning_rate": 0.0002766761121239696, "loss": 4.8329, "step": 17615 }, { "epoch": 0.3608215755738947, "grad_norm": 0.2550165355205536, "learning_rate": 0.0002766429533250128, "loss": 4.793, "step": 17620 }, { "epoch": 0.36092396535129934, "grad_norm": 0.25023287534713745, "learning_rate": 0.0002766097729623465, "loss": 4.7971, "step": 17625 }, { "epoch": 0.36102635512870396, "grad_norm": 0.25133055448532104, "learning_rate": 0.0002765765710416203, "loss": 4.8053, "step": 17630 }, { "epoch": 0.3611287449061086, "grad_norm": 0.24358288943767548, "learning_rate": 0.00027654334756848763, "loss": 4.8307, "step": 17635 }, { "epoch": 0.3612311346835132, "grad_norm": 0.25189048051834106, "learning_rate": 0.0002765101025486054, "loss": 4.8124, "step": 17640 }, { "epoch": 0.3613335244609178, "grad_norm": 0.23731786012649536, "learning_rate": 0.0002764768359876343, "loss": 4.7909, "step": 17645 }, { "epoch": 0.36143591423832244, "grad_norm": 0.24712516367435455, "learning_rate": 0.00027644354789123886, "loss": 4.8159, "step": 17650 }, { "epoch": 0.36153830401572706, "grad_norm": 0.2414264678955078, "learning_rate": 0.0002764102382650869, "loss": 4.7784, "step": 17655 }, { "epoch": 0.3616406937931317, "grad_norm": 0.23889292776584625, "learning_rate": 0.0002763769071148502, "loss": 4.8693, "step": 17660 }, { "epoch": 0.3617430835705363, "grad_norm": 0.2331574559211731, "learning_rate": 0.00027634355444620403, "loss": 4.8343, "step": 17665 }, { "epoch": 0.3618454733479409, "grad_norm": 0.24381151795387268, "learning_rate": 0.0002763101802648275, "loss": 4.8148, "step": 17670 }, { "epoch": 0.36194786312534555, "grad_norm": 0.24279746413230896, "learning_rate": 0.0002762767845764033, "loss": 4.7826, "step": 17675 }, { "epoch": 0.36205025290275017, "grad_norm": 0.24833084642887115, "learning_rate": 0.0002762433673866177, "loss": 4.8797, "step": 17680 }, { "epoch": 0.3621526426801548, "grad_norm": 0.23561091721057892, "learning_rate": 0.0002762099287011607, "loss": 4.8394, "step": 17685 }, { "epoch": 0.3622550324575594, "grad_norm": 0.24129699170589447, "learning_rate": 0.00027617646852572597, "loss": 4.8028, "step": 17690 }, { "epoch": 0.3623574222349641, "grad_norm": 0.24755354225635529, "learning_rate": 0.0002761429868660109, "loss": 4.7667, "step": 17695 }, { "epoch": 0.3624598120123687, "grad_norm": 0.2509928047657013, "learning_rate": 0.00027610948372771643, "loss": 4.7892, "step": 17700 }, { "epoch": 0.36256220178977333, "grad_norm": 0.2602989971637726, "learning_rate": 0.0002760759591165471, "loss": 4.8036, "step": 17705 }, { "epoch": 0.36266459156717795, "grad_norm": 0.25540390610694885, "learning_rate": 0.0002760424130382113, "loss": 4.7706, "step": 17710 }, { "epoch": 0.3627669813445826, "grad_norm": 0.24948926270008087, "learning_rate": 0.000276008845498421, "loss": 4.8129, "step": 17715 }, { "epoch": 0.3628693711219872, "grad_norm": 0.2444363236427307, "learning_rate": 0.0002759752565028918, "loss": 4.8407, "step": 17720 }, { "epoch": 0.3629717608993918, "grad_norm": 0.2489061802625656, "learning_rate": 0.00027594164605734286, "loss": 4.7935, "step": 17725 }, { "epoch": 0.36307415067679644, "grad_norm": 0.25345760583877563, "learning_rate": 0.0002759080141674972, "loss": 4.773, "step": 17730 }, { "epoch": 0.36317654045420106, "grad_norm": 0.23497188091278076, "learning_rate": 0.0002758743608390813, "loss": 4.8028, "step": 17735 }, { "epoch": 0.3632789302316057, "grad_norm": 0.23632681369781494, "learning_rate": 0.00027584068607782537, "loss": 4.8242, "step": 17740 }, { "epoch": 0.3633813200090103, "grad_norm": 0.22838786244392395, "learning_rate": 0.0002758069898894633, "loss": 4.7714, "step": 17745 }, { "epoch": 0.3634837097864149, "grad_norm": 0.2365570217370987, "learning_rate": 0.00027577327227973255, "loss": 4.758, "step": 17750 }, { "epoch": 0.36358609956381954, "grad_norm": 0.22895501554012299, "learning_rate": 0.0002757395332543744, "loss": 4.7893, "step": 17755 }, { "epoch": 0.36368848934122416, "grad_norm": 0.2436460256576538, "learning_rate": 0.0002757057728191335, "loss": 4.7621, "step": 17760 }, { "epoch": 0.3637908791186288, "grad_norm": 0.2596724033355713, "learning_rate": 0.0002756719909797584, "loss": 4.7822, "step": 17765 }, { "epoch": 0.3638932688960334, "grad_norm": 0.24657489359378815, "learning_rate": 0.0002756381877420011, "loss": 4.7821, "step": 17770 }, { "epoch": 0.363995658673438, "grad_norm": 0.2288065105676651, "learning_rate": 0.0002756043631116175, "loss": 4.7956, "step": 17775 }, { "epoch": 0.36409804845084265, "grad_norm": 0.24526286125183105, "learning_rate": 0.0002755705170943667, "loss": 4.7953, "step": 17780 }, { "epoch": 0.36420043822824727, "grad_norm": 0.27630555629730225, "learning_rate": 0.000275536649696012, "loss": 4.8206, "step": 17785 }, { "epoch": 0.3643028280056519, "grad_norm": 0.24275371432304382, "learning_rate": 0.0002755027609223199, "loss": 4.7961, "step": 17790 }, { "epoch": 0.3644052177830565, "grad_norm": 0.25726425647735596, "learning_rate": 0.0002754688507790607, "loss": 4.8007, "step": 17795 }, { "epoch": 0.3645076075604612, "grad_norm": 0.26756492257118225, "learning_rate": 0.0002754349192720084, "loss": 4.846, "step": 17800 }, { "epoch": 0.3646099973378658, "grad_norm": 0.2504918873310089, "learning_rate": 0.0002754009664069406, "loss": 4.7891, "step": 17805 }, { "epoch": 0.36471238711527043, "grad_norm": 0.25351810455322266, "learning_rate": 0.0002753669921896384, "loss": 4.7709, "step": 17810 }, { "epoch": 0.36481477689267505, "grad_norm": 0.26117199659347534, "learning_rate": 0.0002753329966258868, "loss": 4.8621, "step": 17815 }, { "epoch": 0.3649171666700797, "grad_norm": 0.22292082011699677, "learning_rate": 0.00027529897972147413, "loss": 4.7652, "step": 17820 }, { "epoch": 0.3650195564474843, "grad_norm": 0.24843989312648773, "learning_rate": 0.0002752649414821926, "loss": 4.7668, "step": 17825 }, { "epoch": 0.3651219462248889, "grad_norm": 0.24700407683849335, "learning_rate": 0.0002752308819138379, "loss": 4.8542, "step": 17830 }, { "epoch": 0.36522433600229354, "grad_norm": 0.2571440041065216, "learning_rate": 0.00027519680102220946, "loss": 4.7304, "step": 17835 }, { "epoch": 0.36532672577969816, "grad_norm": 0.2338082641363144, "learning_rate": 0.0002751626988131102, "loss": 4.772, "step": 17840 }, { "epoch": 0.3654291155571028, "grad_norm": 0.24531778693199158, "learning_rate": 0.0002751285752923469, "loss": 4.7017, "step": 17845 }, { "epoch": 0.3655315053345074, "grad_norm": 0.25323912501335144, "learning_rate": 0.0002750944304657297, "loss": 4.8142, "step": 17850 }, { "epoch": 0.365633895111912, "grad_norm": 0.25650572776794434, "learning_rate": 0.00027506026433907264, "loss": 4.7979, "step": 17855 }, { "epoch": 0.36573628488931664, "grad_norm": 0.2664353549480438, "learning_rate": 0.0002750260769181931, "loss": 4.7791, "step": 17860 }, { "epoch": 0.36583867466672126, "grad_norm": 0.2455276995897293, "learning_rate": 0.0002749918682089123, "loss": 4.7854, "step": 17865 }, { "epoch": 0.3659410644441259, "grad_norm": 0.2544846832752228, "learning_rate": 0.000274957638217055, "loss": 4.7654, "step": 17870 }, { "epoch": 0.3660434542215305, "grad_norm": 0.2378363460302353, "learning_rate": 0.00027492338694844957, "loss": 4.763, "step": 17875 }, { "epoch": 0.3661458439989351, "grad_norm": 0.26293930411338806, "learning_rate": 0.00027488911440892814, "loss": 4.8123, "step": 17880 }, { "epoch": 0.36624823377633975, "grad_norm": 0.2636087238788605, "learning_rate": 0.00027485482060432624, "loss": 4.7265, "step": 17885 }, { "epoch": 0.36635062355374437, "grad_norm": 0.23257797956466675, "learning_rate": 0.0002748205055404832, "loss": 4.8015, "step": 17890 }, { "epoch": 0.366453013331149, "grad_norm": 0.25716593861579895, "learning_rate": 0.0002747861692232418, "loss": 4.793, "step": 17895 }, { "epoch": 0.36655540310855367, "grad_norm": 0.24235756695270538, "learning_rate": 0.0002747518116584487, "loss": 4.7958, "step": 17900 }, { "epoch": 0.3666577928859583, "grad_norm": 0.24651765823364258, "learning_rate": 0.00027471743285195383, "loss": 4.7845, "step": 17905 }, { "epoch": 0.3667601826633629, "grad_norm": 0.23017224669456482, "learning_rate": 0.00027468303280961116, "loss": 4.8313, "step": 17910 }, { "epoch": 0.36686257244076753, "grad_norm": 0.3161824941635132, "learning_rate": 0.00027464861153727785, "loss": 4.8361, "step": 17915 }, { "epoch": 0.36696496221817215, "grad_norm": 0.2504814565181732, "learning_rate": 0.00027461416904081494, "loss": 4.7972, "step": 17920 }, { "epoch": 0.3670673519955768, "grad_norm": 0.24127408862113953, "learning_rate": 0.00027457970532608703, "loss": 4.8106, "step": 17925 }, { "epoch": 0.3671697417729814, "grad_norm": 0.26364776492118835, "learning_rate": 0.00027454522039896227, "loss": 4.8322, "step": 17930 }, { "epoch": 0.367272131550386, "grad_norm": 0.6151398420333862, "learning_rate": 0.00027451071426531253, "loss": 4.8723, "step": 17935 }, { "epoch": 0.36737452132779064, "grad_norm": 0.2452814280986786, "learning_rate": 0.00027447618693101316, "loss": 4.7882, "step": 17940 }, { "epoch": 0.36747691110519526, "grad_norm": 0.2335476130247116, "learning_rate": 0.00027444163840194323, "loss": 4.8472, "step": 17945 }, { "epoch": 0.3675793008825999, "grad_norm": 0.2404443323612213, "learning_rate": 0.00027440706868398537, "loss": 4.6705, "step": 17950 }, { "epoch": 0.3676816906600045, "grad_norm": 0.2449806034564972, "learning_rate": 0.0002743724777830258, "loss": 4.8098, "step": 17955 }, { "epoch": 0.3677840804374091, "grad_norm": 0.2550186812877655, "learning_rate": 0.0002743378657049544, "loss": 4.7936, "step": 17960 }, { "epoch": 0.36788647021481374, "grad_norm": 0.24325023591518402, "learning_rate": 0.00027430323245566463, "loss": 4.7399, "step": 17965 }, { "epoch": 0.36798885999221836, "grad_norm": 0.25150230526924133, "learning_rate": 0.0002742685780410535, "loss": 4.8023, "step": 17970 }, { "epoch": 0.368091249769623, "grad_norm": 0.26221171021461487, "learning_rate": 0.0002742339024670217, "loss": 4.7608, "step": 17975 }, { "epoch": 0.3681936395470276, "grad_norm": 0.2458525151014328, "learning_rate": 0.00027419920573947356, "loss": 4.7636, "step": 17980 }, { "epoch": 0.36829602932443223, "grad_norm": 0.24908113479614258, "learning_rate": 0.00027416448786431684, "loss": 4.8019, "step": 17985 }, { "epoch": 0.36839841910183685, "grad_norm": 0.34281638264656067, "learning_rate": 0.00027412974884746317, "loss": 4.7678, "step": 17990 }, { "epoch": 0.36850080887924147, "grad_norm": 0.2360275685787201, "learning_rate": 0.00027409498869482746, "loss": 4.7555, "step": 17995 }, { "epoch": 0.36860319865664615, "grad_norm": 0.24784208834171295, "learning_rate": 0.00027406020741232843, "loss": 4.7721, "step": 18000 }, { "epoch": 0.36860319865664615, "eval_loss": 4.805654048919678, "eval_runtime": 17.2368, "eval_samples_per_second": 33.765, "eval_steps_per_second": 4.235, "step": 18000 }, { "epoch": 0.36870558843405077, "grad_norm": 0.2545544505119324, "learning_rate": 0.0002740254050058884, "loss": 4.8206, "step": 18005 }, { "epoch": 0.3688079782114554, "grad_norm": 0.2245105654001236, "learning_rate": 0.0002739905814814332, "loss": 4.8171, "step": 18010 }, { "epoch": 0.36891036798886, "grad_norm": 0.2906739413738251, "learning_rate": 0.0002739557368448923, "loss": 4.7607, "step": 18015 }, { "epoch": 0.36901275776626463, "grad_norm": 0.2782283425331116, "learning_rate": 0.0002739208711021986, "loss": 4.7538, "step": 18020 }, { "epoch": 0.36911514754366925, "grad_norm": 0.25412094593048096, "learning_rate": 0.000273885984259289, "loss": 4.7402, "step": 18025 }, { "epoch": 0.3692175373210739, "grad_norm": 0.2368987798690796, "learning_rate": 0.0002738510763221037, "loss": 4.815, "step": 18030 }, { "epoch": 0.3693199270984785, "grad_norm": 0.25744420289993286, "learning_rate": 0.00027381614729658635, "loss": 4.7611, "step": 18035 }, { "epoch": 0.3694223168758831, "grad_norm": 0.2621389925479889, "learning_rate": 0.0002737811971886846, "loss": 4.774, "step": 18040 }, { "epoch": 0.36952470665328774, "grad_norm": 0.24370889365673065, "learning_rate": 0.00027374622600434925, "loss": 4.8189, "step": 18045 }, { "epoch": 0.36962709643069236, "grad_norm": 0.24246658384799957, "learning_rate": 0.000273711233749535, "loss": 4.7838, "step": 18050 }, { "epoch": 0.369729486208097, "grad_norm": 0.28695717453956604, "learning_rate": 0.0002736762204302001, "loss": 4.8252, "step": 18055 }, { "epoch": 0.3698318759855016, "grad_norm": 0.2438812553882599, "learning_rate": 0.0002736411860523063, "loss": 4.8097, "step": 18060 }, { "epoch": 0.3699342657629062, "grad_norm": 0.2786584496498108, "learning_rate": 0.0002736061306218189, "loss": 4.7911, "step": 18065 }, { "epoch": 0.37003665554031084, "grad_norm": 0.2415219396352768, "learning_rate": 0.0002735710541447069, "loss": 4.796, "step": 18070 }, { "epoch": 0.37013904531771546, "grad_norm": 0.24593287706375122, "learning_rate": 0.0002735359566269428, "loss": 4.7672, "step": 18075 }, { "epoch": 0.3702414350951201, "grad_norm": 0.31544220447540283, "learning_rate": 0.0002735008380745027, "loss": 4.7847, "step": 18080 }, { "epoch": 0.3703438248725247, "grad_norm": 0.2856111228466034, "learning_rate": 0.00027346569849336636, "loss": 4.7575, "step": 18085 }, { "epoch": 0.37044621464992933, "grad_norm": 0.24270612001419067, "learning_rate": 0.000273430537889517, "loss": 4.8106, "step": 18090 }, { "epoch": 0.37054860442733395, "grad_norm": 0.24650530517101288, "learning_rate": 0.00027339535626894155, "loss": 4.787, "step": 18095 }, { "epoch": 0.3706509942047386, "grad_norm": 0.2618081867694855, "learning_rate": 0.00027336015363763034, "loss": 4.8112, "step": 18100 }, { "epoch": 0.37075338398214325, "grad_norm": 0.23776638507843018, "learning_rate": 0.0002733249300015774, "loss": 4.8249, "step": 18105 }, { "epoch": 0.37085577375954787, "grad_norm": 0.2623469829559326, "learning_rate": 0.0002732896853667804, "loss": 4.7815, "step": 18110 }, { "epoch": 0.3709581635369525, "grad_norm": 0.2656545341014862, "learning_rate": 0.00027325441973924045, "loss": 4.7354, "step": 18115 }, { "epoch": 0.3710605533143571, "grad_norm": 0.2753200829029083, "learning_rate": 0.0002732191331249622, "loss": 4.8207, "step": 18120 }, { "epoch": 0.37116294309176173, "grad_norm": 0.24409864842891693, "learning_rate": 0.00027318382552995416, "loss": 4.7847, "step": 18125 }, { "epoch": 0.37126533286916635, "grad_norm": 0.24308210611343384, "learning_rate": 0.0002731484969602281, "loss": 4.7126, "step": 18130 }, { "epoch": 0.371367722646571, "grad_norm": 0.2436607927083969, "learning_rate": 0.00027311314742179944, "loss": 4.7639, "step": 18135 }, { "epoch": 0.3714701124239756, "grad_norm": 0.23844410479068756, "learning_rate": 0.0002730777769206872, "loss": 4.7766, "step": 18140 }, { "epoch": 0.3715725022013802, "grad_norm": 0.23728613555431366, "learning_rate": 0.00027304238546291417, "loss": 4.7252, "step": 18145 }, { "epoch": 0.37167489197878484, "grad_norm": 0.25353166460990906, "learning_rate": 0.00027300697305450623, "loss": 4.7806, "step": 18150 }, { "epoch": 0.37177728175618946, "grad_norm": 0.23095594346523285, "learning_rate": 0.00027297153970149324, "loss": 4.8203, "step": 18155 }, { "epoch": 0.3718796715335941, "grad_norm": 0.23115617036819458, "learning_rate": 0.00027293608540990863, "loss": 4.7845, "step": 18160 }, { "epoch": 0.3719820613109987, "grad_norm": 0.2482440024614334, "learning_rate": 0.000272900610185789, "loss": 4.8398, "step": 18165 }, { "epoch": 0.3720844510884033, "grad_norm": 0.2375240921974182, "learning_rate": 0.000272865114035175, "loss": 4.7989, "step": 18170 }, { "epoch": 0.37218684086580794, "grad_norm": 0.26746800541877747, "learning_rate": 0.00027282959696411045, "loss": 4.8617, "step": 18175 }, { "epoch": 0.37228923064321257, "grad_norm": 0.2613542079925537, "learning_rate": 0.000272794058978643, "loss": 4.7514, "step": 18180 }, { "epoch": 0.3723916204206172, "grad_norm": 0.2713029086589813, "learning_rate": 0.00027275850008482376, "loss": 4.8221, "step": 18185 }, { "epoch": 0.3724940101980218, "grad_norm": 0.24614305794239044, "learning_rate": 0.0002727229202887074, "loss": 4.7811, "step": 18190 }, { "epoch": 0.37259639997542643, "grad_norm": 0.24781306087970734, "learning_rate": 0.0002726873195963521, "loss": 4.7565, "step": 18195 }, { "epoch": 0.37269878975283105, "grad_norm": 0.23562413454055786, "learning_rate": 0.00027265169801381976, "loss": 4.7957, "step": 18200 }, { "epoch": 0.3728011795302357, "grad_norm": 0.2612961530685425, "learning_rate": 0.00027261605554717563, "loss": 4.8467, "step": 18205 }, { "epoch": 0.37290356930764035, "grad_norm": 0.24376554787158966, "learning_rate": 0.0002725803922024886, "loss": 4.7415, "step": 18210 }, { "epoch": 0.37300595908504497, "grad_norm": 0.24161337316036224, "learning_rate": 0.0002725447079858312, "loss": 4.7421, "step": 18215 }, { "epoch": 0.3731083488624496, "grad_norm": 0.24609360098838806, "learning_rate": 0.0002725090029032794, "loss": 4.7671, "step": 18220 }, { "epoch": 0.3732107386398542, "grad_norm": 0.25302135944366455, "learning_rate": 0.00027247327696091277, "loss": 4.8153, "step": 18225 }, { "epoch": 0.37331312841725883, "grad_norm": 0.25249356031417847, "learning_rate": 0.00027243753016481444, "loss": 4.7782, "step": 18230 }, { "epoch": 0.37341551819466345, "grad_norm": 0.25398945808410645, "learning_rate": 0.00027240176252107105, "loss": 4.792, "step": 18235 }, { "epoch": 0.3735179079720681, "grad_norm": 0.23390665650367737, "learning_rate": 0.0002723659740357728, "loss": 4.798, "step": 18240 }, { "epoch": 0.3736202977494727, "grad_norm": 0.2441662847995758, "learning_rate": 0.00027233016471501354, "loss": 4.7666, "step": 18245 }, { "epoch": 0.3737226875268773, "grad_norm": 0.5501225590705872, "learning_rate": 0.00027229433456489057, "loss": 4.81, "step": 18250 }, { "epoch": 0.37382507730428194, "grad_norm": 0.25560975074768066, "learning_rate": 0.0002722584835915046, "loss": 4.8233, "step": 18255 }, { "epoch": 0.37392746708168656, "grad_norm": 0.29762735962867737, "learning_rate": 0.0002722226118009603, "loss": 4.8386, "step": 18260 }, { "epoch": 0.3740298568590912, "grad_norm": 0.28740373253822327, "learning_rate": 0.0002721867191993653, "loss": 4.8693, "step": 18265 }, { "epoch": 0.3741322466364958, "grad_norm": 0.31148993968963623, "learning_rate": 0.00027215080579283136, "loss": 4.7801, "step": 18270 }, { "epoch": 0.3742346364139004, "grad_norm": 0.30883365869522095, "learning_rate": 0.0002721148715874735, "loss": 4.8184, "step": 18275 }, { "epoch": 0.37433702619130504, "grad_norm": 0.2357916384935379, "learning_rate": 0.0002720789165894101, "loss": 4.7911, "step": 18280 }, { "epoch": 0.37443941596870967, "grad_norm": 0.28309592604637146, "learning_rate": 0.0002720429408047634, "loss": 4.8236, "step": 18285 }, { "epoch": 0.3745418057461143, "grad_norm": 0.23531167209148407, "learning_rate": 0.0002720069442396591, "loss": 4.7503, "step": 18290 }, { "epoch": 0.3746441955235189, "grad_norm": 0.2579338550567627, "learning_rate": 0.0002719709269002263, "loss": 4.8113, "step": 18295 }, { "epoch": 0.37474658530092353, "grad_norm": 0.23504072427749634, "learning_rate": 0.00027193488879259783, "loss": 4.7818, "step": 18300 }, { "epoch": 0.3748489750783282, "grad_norm": 0.23527486622333527, "learning_rate": 0.0002718988299229099, "loss": 4.8111, "step": 18305 }, { "epoch": 0.3749513648557328, "grad_norm": 0.2614368796348572, "learning_rate": 0.0002718627502973023, "loss": 4.7983, "step": 18310 }, { "epoch": 0.37505375463313745, "grad_norm": 0.25063908100128174, "learning_rate": 0.0002718266499219184, "loss": 4.7955, "step": 18315 }, { "epoch": 0.37515614441054207, "grad_norm": 0.24770070612430573, "learning_rate": 0.0002717905288029051, "loss": 4.7598, "step": 18320 }, { "epoch": 0.3752585341879467, "grad_norm": 0.2598324120044708, "learning_rate": 0.00027175438694641276, "loss": 4.7927, "step": 18325 }, { "epoch": 0.3753609239653513, "grad_norm": 0.23545841872692108, "learning_rate": 0.0002717182243585953, "loss": 4.759, "step": 18330 }, { "epoch": 0.37546331374275593, "grad_norm": 0.24323679506778717, "learning_rate": 0.00027168204104561025, "loss": 4.7429, "step": 18335 }, { "epoch": 0.37556570352016055, "grad_norm": 0.32741808891296387, "learning_rate": 0.00027164583701361856, "loss": 4.8504, "step": 18340 }, { "epoch": 0.3756680932975652, "grad_norm": 0.23775732517242432, "learning_rate": 0.00027160961226878474, "loss": 4.7991, "step": 18345 }, { "epoch": 0.3757704830749698, "grad_norm": 0.24511213600635529, "learning_rate": 0.0002715733668172769, "loss": 4.7407, "step": 18350 }, { "epoch": 0.3758728728523744, "grad_norm": 0.24636994302272797, "learning_rate": 0.00027153710066526646, "loss": 4.7733, "step": 18355 }, { "epoch": 0.37597526262977904, "grad_norm": 0.22909529507160187, "learning_rate": 0.00027150081381892874, "loss": 4.7889, "step": 18360 }, { "epoch": 0.37607765240718366, "grad_norm": 0.25632724165916443, "learning_rate": 0.0002714645062844422, "loss": 4.7977, "step": 18365 }, { "epoch": 0.3761800421845883, "grad_norm": 0.2533828914165497, "learning_rate": 0.00027142817806798906, "loss": 4.7285, "step": 18370 }, { "epoch": 0.3762824319619929, "grad_norm": 0.25821688771247864, "learning_rate": 0.0002713918291757549, "loss": 4.7774, "step": 18375 }, { "epoch": 0.3763848217393975, "grad_norm": 0.2438954859972, "learning_rate": 0.0002713554596139291, "loss": 4.7832, "step": 18380 }, { "epoch": 0.37648721151680214, "grad_norm": 0.2397865653038025, "learning_rate": 0.0002713190693887041, "loss": 4.8666, "step": 18385 }, { "epoch": 0.37658960129420677, "grad_norm": 0.2682671844959259, "learning_rate": 0.00027128265850627633, "loss": 4.7679, "step": 18390 }, { "epoch": 0.3766919910716114, "grad_norm": 0.2713569104671478, "learning_rate": 0.00027124622697284546, "loss": 4.7427, "step": 18395 }, { "epoch": 0.376794380849016, "grad_norm": 0.25199782848358154, "learning_rate": 0.00027120977479461476, "loss": 4.7515, "step": 18400 }, { "epoch": 0.3768967706264207, "grad_norm": 0.2549890875816345, "learning_rate": 0.000271173301977791, "loss": 4.8203, "step": 18405 }, { "epoch": 0.3769991604038253, "grad_norm": 0.25833216309547424, "learning_rate": 0.0002711368085285844, "loss": 4.7878, "step": 18410 }, { "epoch": 0.3771015501812299, "grad_norm": 0.2570663094520569, "learning_rate": 0.0002711002944532089, "loss": 4.8165, "step": 18415 }, { "epoch": 0.37720393995863455, "grad_norm": 0.24693752825260162, "learning_rate": 0.0002710637597578817, "loss": 4.794, "step": 18420 }, { "epoch": 0.37730632973603917, "grad_norm": 0.23728980123996735, "learning_rate": 0.0002710272044488237, "loss": 4.8135, "step": 18425 }, { "epoch": 0.3774087195134438, "grad_norm": 0.2658339738845825, "learning_rate": 0.0002709906285322592, "loss": 4.7731, "step": 18430 }, { "epoch": 0.3775111092908484, "grad_norm": 0.23852306604385376, "learning_rate": 0.00027095403201441603, "loss": 4.7737, "step": 18435 }, { "epoch": 0.37761349906825303, "grad_norm": 0.24067999422550201, "learning_rate": 0.00027091741490152554, "loss": 4.7919, "step": 18440 }, { "epoch": 0.37771588884565765, "grad_norm": 0.2539689540863037, "learning_rate": 0.0002708807771998226, "loss": 4.8109, "step": 18445 }, { "epoch": 0.3778182786230623, "grad_norm": 0.2400183230638504, "learning_rate": 0.0002708441189155456, "loss": 4.807, "step": 18450 }, { "epoch": 0.3779206684004669, "grad_norm": 0.26198068261146545, "learning_rate": 0.0002708074400549363, "loss": 4.7751, "step": 18455 }, { "epoch": 0.3780230581778715, "grad_norm": 0.23185764253139496, "learning_rate": 0.00027077074062424025, "loss": 4.7895, "step": 18460 }, { "epoch": 0.37812544795527614, "grad_norm": 0.23769047856330872, "learning_rate": 0.00027073402062970624, "loss": 4.7822, "step": 18465 }, { "epoch": 0.37822783773268076, "grad_norm": 0.24097198247909546, "learning_rate": 0.0002706972800775866, "loss": 4.8151, "step": 18470 }, { "epoch": 0.3783302275100854, "grad_norm": 0.25598886609077454, "learning_rate": 0.0002706605189741372, "loss": 4.7853, "step": 18475 }, { "epoch": 0.37843261728749, "grad_norm": 0.2380874752998352, "learning_rate": 0.00027062373732561745, "loss": 4.8168, "step": 18480 }, { "epoch": 0.3785350070648946, "grad_norm": 0.24340538680553436, "learning_rate": 0.0002705869351382903, "loss": 4.7703, "step": 18485 }, { "epoch": 0.37863739684229925, "grad_norm": 0.2664043605327606, "learning_rate": 0.00027055011241842195, "loss": 4.884, "step": 18490 }, { "epoch": 0.37873978661970387, "grad_norm": 0.2321472465991974, "learning_rate": 0.00027051326917228245, "loss": 4.7799, "step": 18495 }, { "epoch": 0.3788421763971085, "grad_norm": 0.2406431883573532, "learning_rate": 0.00027047640540614503, "loss": 4.7492, "step": 18500 }, { "epoch": 0.3789445661745131, "grad_norm": 0.2702486217021942, "learning_rate": 0.0002704395211262866, "loss": 4.7795, "step": 18505 }, { "epoch": 0.3790469559519178, "grad_norm": 0.2503416836261749, "learning_rate": 0.0002704026163389875, "loss": 4.7224, "step": 18510 }, { "epoch": 0.3791493457293224, "grad_norm": 0.25886625051498413, "learning_rate": 0.0002703656910505316, "loss": 4.7999, "step": 18515 }, { "epoch": 0.37925173550672703, "grad_norm": 0.22973226010799408, "learning_rate": 0.00027032874526720615, "loss": 4.7718, "step": 18520 }, { "epoch": 0.37935412528413165, "grad_norm": 0.27013394236564636, "learning_rate": 0.00027029177899530205, "loss": 4.81, "step": 18525 }, { "epoch": 0.37945651506153627, "grad_norm": 0.24470940232276917, "learning_rate": 0.0002702547922411136, "loss": 4.8061, "step": 18530 }, { "epoch": 0.3795589048389409, "grad_norm": 0.28320327401161194, "learning_rate": 0.0002702177850109386, "loss": 4.7697, "step": 18535 }, { "epoch": 0.3796612946163455, "grad_norm": 0.28084442019462585, "learning_rate": 0.00027018075731107823, "loss": 4.7264, "step": 18540 }, { "epoch": 0.37976368439375013, "grad_norm": 0.24529646337032318, "learning_rate": 0.0002701437091478374, "loss": 4.8008, "step": 18545 }, { "epoch": 0.37986607417115476, "grad_norm": 0.23677638173103333, "learning_rate": 0.00027010664052752424, "loss": 4.7524, "step": 18550 }, { "epoch": 0.3799684639485594, "grad_norm": 0.2633301615715027, "learning_rate": 0.00027006955145645066, "loss": 4.8039, "step": 18555 }, { "epoch": 0.380070853725964, "grad_norm": 0.2448771446943283, "learning_rate": 0.0002700324419409317, "loss": 4.8341, "step": 18560 }, { "epoch": 0.3801732435033686, "grad_norm": 0.30636823177337646, "learning_rate": 0.00026999531198728614, "loss": 4.703, "step": 18565 }, { "epoch": 0.38027563328077324, "grad_norm": 0.24533076584339142, "learning_rate": 0.0002699581616018362, "loss": 4.7632, "step": 18570 }, { "epoch": 0.38037802305817786, "grad_norm": 0.2559785544872284, "learning_rate": 0.0002699209907909074, "loss": 4.777, "step": 18575 }, { "epoch": 0.3804804128355825, "grad_norm": 0.2614833116531372, "learning_rate": 0.000269883799560829, "loss": 4.7689, "step": 18580 }, { "epoch": 0.3805828026129871, "grad_norm": 0.24599692225456238, "learning_rate": 0.0002698465879179336, "loss": 4.7589, "step": 18585 }, { "epoch": 0.3806851923903917, "grad_norm": 0.2603289783000946, "learning_rate": 0.00026980935586855726, "loss": 4.7759, "step": 18590 }, { "epoch": 0.38078758216779635, "grad_norm": 0.2516210079193115, "learning_rate": 0.0002697721034190396, "loss": 4.7509, "step": 18595 }, { "epoch": 0.38088997194520097, "grad_norm": 0.25227412581443787, "learning_rate": 0.00026973483057572353, "loss": 4.7953, "step": 18600 }, { "epoch": 0.3809923617226056, "grad_norm": 0.23142880201339722, "learning_rate": 0.0002696975373449557, "loss": 4.7931, "step": 18605 }, { "epoch": 0.38109475150001026, "grad_norm": 0.25117403268814087, "learning_rate": 0.000269660223733086, "loss": 4.7722, "step": 18610 }, { "epoch": 0.3811971412774149, "grad_norm": 0.26767903566360474, "learning_rate": 0.0002696228897464679, "loss": 4.7954, "step": 18615 }, { "epoch": 0.3812995310548195, "grad_norm": 0.30506137013435364, "learning_rate": 0.0002695855353914584, "loss": 4.7892, "step": 18620 }, { "epoch": 0.38140192083222413, "grad_norm": 0.2507057189941406, "learning_rate": 0.00026954816067441777, "loss": 4.8157, "step": 18625 }, { "epoch": 0.38150431060962875, "grad_norm": 0.24486896395683289, "learning_rate": 0.0002695107656017099, "loss": 4.7802, "step": 18630 }, { "epoch": 0.38160670038703337, "grad_norm": 0.23862124979496002, "learning_rate": 0.0002694733501797022, "loss": 4.7237, "step": 18635 }, { "epoch": 0.381709090164438, "grad_norm": 0.2557026743888855, "learning_rate": 0.00026943591441476537, "loss": 4.7983, "step": 18640 }, { "epoch": 0.3818114799418426, "grad_norm": 0.2563668489456177, "learning_rate": 0.00026939845831327363, "loss": 4.8076, "step": 18645 }, { "epoch": 0.38191386971924723, "grad_norm": 0.2395094484090805, "learning_rate": 0.0002693609818816048, "loss": 4.8071, "step": 18650 }, { "epoch": 0.38201625949665186, "grad_norm": 0.25724026560783386, "learning_rate": 0.00026932348512613997, "loss": 4.7311, "step": 18655 }, { "epoch": 0.3821186492740565, "grad_norm": 0.2464665174484253, "learning_rate": 0.0002692859680532638, "loss": 4.8108, "step": 18660 }, { "epoch": 0.3822210390514611, "grad_norm": 0.24866892397403717, "learning_rate": 0.0002692484306693644, "loss": 4.7831, "step": 18665 }, { "epoch": 0.3823234288288657, "grad_norm": 0.29046374559402466, "learning_rate": 0.0002692108729808333, "loss": 4.8312, "step": 18670 }, { "epoch": 0.38242581860627034, "grad_norm": 0.24792906641960144, "learning_rate": 0.00026917329499406557, "loss": 4.791, "step": 18675 }, { "epoch": 0.38252820838367496, "grad_norm": 0.2569676339626312, "learning_rate": 0.0002691356967154596, "loss": 4.7365, "step": 18680 }, { "epoch": 0.3826305981610796, "grad_norm": 0.2488475888967514, "learning_rate": 0.00026909807815141736, "loss": 4.7725, "step": 18685 }, { "epoch": 0.3827329879384842, "grad_norm": 0.24012289941310883, "learning_rate": 0.00026906043930834415, "loss": 4.7432, "step": 18690 }, { "epoch": 0.3828353777158888, "grad_norm": 0.28020796179771423, "learning_rate": 0.00026902278019264893, "loss": 4.7817, "step": 18695 }, { "epoch": 0.38293776749329345, "grad_norm": 0.24647168815135956, "learning_rate": 0.00026898510081074394, "loss": 4.7638, "step": 18700 }, { "epoch": 0.38304015727069807, "grad_norm": 0.24646998941898346, "learning_rate": 0.0002689474011690448, "loss": 4.8024, "step": 18705 }, { "epoch": 0.38314254704810274, "grad_norm": 0.2355469912290573, "learning_rate": 0.0002689096812739708, "loss": 4.7861, "step": 18710 }, { "epoch": 0.38324493682550737, "grad_norm": 0.2523438334465027, "learning_rate": 0.0002688719411319446, "loss": 4.7592, "step": 18715 }, { "epoch": 0.383347326602912, "grad_norm": 0.24443495273590088, "learning_rate": 0.00026883418074939215, "loss": 4.7463, "step": 18720 }, { "epoch": 0.3834497163803166, "grad_norm": 0.24216555058956146, "learning_rate": 0.00026879640013274304, "loss": 4.7913, "step": 18725 }, { "epoch": 0.38355210615772123, "grad_norm": 0.2408372312784195, "learning_rate": 0.0002687585992884303, "loss": 4.763, "step": 18730 }, { "epoch": 0.38365449593512585, "grad_norm": 0.2512153685092926, "learning_rate": 0.0002687207782228903, "loss": 4.7654, "step": 18735 }, { "epoch": 0.38375688571253047, "grad_norm": 0.2301095724105835, "learning_rate": 0.00026868293694256283, "loss": 4.7823, "step": 18740 }, { "epoch": 0.3838592754899351, "grad_norm": 0.2779386341571808, "learning_rate": 0.0002686450754538913, "loss": 4.7686, "step": 18745 }, { "epoch": 0.3839616652673397, "grad_norm": 0.24197597801685333, "learning_rate": 0.00026860719376332236, "loss": 4.7133, "step": 18750 }, { "epoch": 0.38406405504474433, "grad_norm": 0.40180352330207825, "learning_rate": 0.00026856929187730626, "loss": 4.7492, "step": 18755 }, { "epoch": 0.38416644482214896, "grad_norm": 0.2413758635520935, "learning_rate": 0.00026853136980229655, "loss": 4.7975, "step": 18760 }, { "epoch": 0.3842688345995536, "grad_norm": 0.24081037938594818, "learning_rate": 0.00026849342754475033, "loss": 4.7742, "step": 18765 }, { "epoch": 0.3843712243769582, "grad_norm": 0.27904465794563293, "learning_rate": 0.0002684554651111281, "loss": 4.7133, "step": 18770 }, { "epoch": 0.3844736141543628, "grad_norm": 0.23528353869915009, "learning_rate": 0.00026841748250789374, "loss": 4.7873, "step": 18775 }, { "epoch": 0.38457600393176744, "grad_norm": 0.23735223710536957, "learning_rate": 0.00026837947974151465, "loss": 4.8222, "step": 18780 }, { "epoch": 0.38467839370917206, "grad_norm": 0.2439945489168167, "learning_rate": 0.00026834145681846165, "loss": 4.7803, "step": 18785 }, { "epoch": 0.3847807834865767, "grad_norm": 0.2637277841567993, "learning_rate": 0.0002683034137452089, "loss": 4.8156, "step": 18790 }, { "epoch": 0.3848831732639813, "grad_norm": 0.2374177873134613, "learning_rate": 0.0002682653505282341, "loss": 4.7726, "step": 18795 }, { "epoch": 0.3849855630413859, "grad_norm": 0.24194015562534332, "learning_rate": 0.0002682272671740183, "loss": 4.8126, "step": 18800 }, { "epoch": 0.38508795281879055, "grad_norm": 0.2640838325023651, "learning_rate": 0.00026818916368904604, "loss": 4.8057, "step": 18805 }, { "epoch": 0.3851903425961952, "grad_norm": 0.24625428020954132, "learning_rate": 0.00026815104007980536, "loss": 4.8369, "step": 18810 }, { "epoch": 0.38529273237359984, "grad_norm": 0.24794188141822815, "learning_rate": 0.00026811289635278744, "loss": 4.816, "step": 18815 }, { "epoch": 0.38539512215100447, "grad_norm": 0.2448667287826538, "learning_rate": 0.00026807473251448725, "loss": 4.7601, "step": 18820 }, { "epoch": 0.3854975119284091, "grad_norm": 0.23488222062587738, "learning_rate": 0.0002680365485714029, "loss": 4.7819, "step": 18825 }, { "epoch": 0.3855999017058137, "grad_norm": 0.2530573010444641, "learning_rate": 0.00026799834453003615, "loss": 4.8268, "step": 18830 }, { "epoch": 0.38570229148321833, "grad_norm": 0.22342631220817566, "learning_rate": 0.0002679601203968919, "loss": 4.7262, "step": 18835 }, { "epoch": 0.38580468126062295, "grad_norm": 0.2440759539604187, "learning_rate": 0.00026792187617847874, "loss": 4.7564, "step": 18840 }, { "epoch": 0.38590707103802757, "grad_norm": 0.26362764835357666, "learning_rate": 0.0002678836118813086, "loss": 4.7602, "step": 18845 }, { "epoch": 0.3860094608154322, "grad_norm": 0.24483011662960052, "learning_rate": 0.0002678453275118968, "loss": 4.8533, "step": 18850 }, { "epoch": 0.3861118505928368, "grad_norm": 0.24865449965000153, "learning_rate": 0.000267807023076762, "loss": 4.7969, "step": 18855 }, { "epoch": 0.38621424037024144, "grad_norm": 0.23510952293872833, "learning_rate": 0.00026776869858242643, "loss": 4.7496, "step": 18860 }, { "epoch": 0.38631663014764606, "grad_norm": 0.24842463433742523, "learning_rate": 0.0002677303540354157, "loss": 4.7977, "step": 18865 }, { "epoch": 0.3864190199250507, "grad_norm": 0.2571222186088562, "learning_rate": 0.0002676919894422587, "loss": 4.7884, "step": 18870 }, { "epoch": 0.3865214097024553, "grad_norm": 0.2501884698867798, "learning_rate": 0.00026765360480948794, "loss": 4.8327, "step": 18875 }, { "epoch": 0.3866237994798599, "grad_norm": 0.462527871131897, "learning_rate": 0.00026761520014363915, "loss": 4.7341, "step": 18880 }, { "epoch": 0.38672618925726454, "grad_norm": 0.23223496973514557, "learning_rate": 0.00026757677545125166, "loss": 4.8168, "step": 18885 }, { "epoch": 0.38682857903466916, "grad_norm": 0.2507539987564087, "learning_rate": 0.000267538330738868, "loss": 4.777, "step": 18890 }, { "epoch": 0.3869309688120738, "grad_norm": 0.2501585781574249, "learning_rate": 0.0002674998660130342, "loss": 4.7874, "step": 18895 }, { "epoch": 0.3870333585894784, "grad_norm": 0.24588614702224731, "learning_rate": 0.00026746138128029985, "loss": 4.7358, "step": 18900 }, { "epoch": 0.387135748366883, "grad_norm": 0.2335381805896759, "learning_rate": 0.00026742287654721764, "loss": 4.7622, "step": 18905 }, { "epoch": 0.38723813814428765, "grad_norm": 1.543792724609375, "learning_rate": 0.00026738435182034403, "loss": 4.7863, "step": 18910 }, { "epoch": 0.3873405279216923, "grad_norm": 0.23785871267318726, "learning_rate": 0.0002673458071062385, "loss": 4.7614, "step": 18915 }, { "epoch": 0.38744291769909694, "grad_norm": 0.33225584030151367, "learning_rate": 0.0002673072424114643, "loss": 4.6968, "step": 18920 }, { "epoch": 0.38754530747650157, "grad_norm": 0.2608654201030731, "learning_rate": 0.00026726865774258775, "loss": 4.7292, "step": 18925 }, { "epoch": 0.3876476972539062, "grad_norm": 0.25150445103645325, "learning_rate": 0.0002672300531061788, "loss": 4.7513, "step": 18930 }, { "epoch": 0.3877500870313108, "grad_norm": 0.25601643323898315, "learning_rate": 0.00026719142850881075, "loss": 4.7742, "step": 18935 }, { "epoch": 0.38785247680871543, "grad_norm": 0.26587796211242676, "learning_rate": 0.0002671527839570603, "loss": 4.7692, "step": 18940 }, { "epoch": 0.38795486658612005, "grad_norm": 0.27763983607292175, "learning_rate": 0.0002671141194575074, "loss": 4.8354, "step": 18945 }, { "epoch": 0.38805725636352467, "grad_norm": 0.24708032608032227, "learning_rate": 0.00026707543501673555, "loss": 4.841, "step": 18950 }, { "epoch": 0.3881596461409293, "grad_norm": 0.25299885869026184, "learning_rate": 0.00026703673064133173, "loss": 4.7919, "step": 18955 }, { "epoch": 0.3882620359183339, "grad_norm": 0.24397580325603485, "learning_rate": 0.00026699800633788606, "loss": 4.7481, "step": 18960 }, { "epoch": 0.38836442569573854, "grad_norm": 0.23750890791416168, "learning_rate": 0.0002669592621129923, "loss": 4.7634, "step": 18965 }, { "epoch": 0.38846681547314316, "grad_norm": 0.2606174647808075, "learning_rate": 0.00026692049797324746, "loss": 4.7251, "step": 18970 }, { "epoch": 0.3885692052505478, "grad_norm": 0.26604393124580383, "learning_rate": 0.00026688171392525193, "loss": 4.7558, "step": 18975 }, { "epoch": 0.3886715950279524, "grad_norm": 0.23531261086463928, "learning_rate": 0.00026684290997560956, "loss": 4.8159, "step": 18980 }, { "epoch": 0.388773984805357, "grad_norm": 0.22895264625549316, "learning_rate": 0.00026680408613092763, "loss": 4.8155, "step": 18985 }, { "epoch": 0.38887637458276164, "grad_norm": 0.3297937512397766, "learning_rate": 0.00026676524239781663, "loss": 4.7798, "step": 18990 }, { "epoch": 0.38897876436016626, "grad_norm": 0.3952682912349701, "learning_rate": 0.00026672637878289064, "loss": 4.7737, "step": 18995 }, { "epoch": 0.3890811541375709, "grad_norm": 0.25012969970703125, "learning_rate": 0.000266687495292767, "loss": 4.7927, "step": 19000 }, { "epoch": 0.3890811541375709, "eval_loss": 4.8032073974609375, "eval_runtime": 22.9094, "eval_samples_per_second": 25.404, "eval_steps_per_second": 3.186, "step": 19000 }, { "epoch": 0.3891835439149755, "grad_norm": 0.2581552565097809, "learning_rate": 0.00026664859193406646, "loss": 4.7909, "step": 19005 }, { "epoch": 0.3892859336923801, "grad_norm": 0.26446202397346497, "learning_rate": 0.00026660966871341316, "loss": 4.7531, "step": 19010 }, { "epoch": 0.3893883234697848, "grad_norm": 0.24724797904491425, "learning_rate": 0.0002665707256374346, "loss": 4.799, "step": 19015 }, { "epoch": 0.3894907132471894, "grad_norm": 0.27956023812294006, "learning_rate": 0.00026653176271276174, "loss": 4.7934, "step": 19020 }, { "epoch": 0.38959310302459405, "grad_norm": 0.2637579143047333, "learning_rate": 0.00026649277994602883, "loss": 4.7787, "step": 19025 }, { "epoch": 0.38969549280199867, "grad_norm": 0.24771881103515625, "learning_rate": 0.00026645377734387356, "loss": 4.7993, "step": 19030 }, { "epoch": 0.3897978825794033, "grad_norm": 0.2541210949420929, "learning_rate": 0.00026641475491293693, "loss": 4.7925, "step": 19035 }, { "epoch": 0.3899002723568079, "grad_norm": 0.2655090093612671, "learning_rate": 0.00026637571265986337, "loss": 4.7878, "step": 19040 }, { "epoch": 0.39000266213421253, "grad_norm": 0.25652801990509033, "learning_rate": 0.0002663366505913007, "loss": 4.7968, "step": 19045 }, { "epoch": 0.39010505191161715, "grad_norm": 0.23862235248088837, "learning_rate": 0.00026629756871390006, "loss": 4.7485, "step": 19050 }, { "epoch": 0.3902074416890218, "grad_norm": 0.25127172470092773, "learning_rate": 0.0002662584670343159, "loss": 4.7509, "step": 19055 }, { "epoch": 0.3903098314664264, "grad_norm": 0.2848736047744751, "learning_rate": 0.0002662193455592063, "loss": 4.749, "step": 19060 }, { "epoch": 0.390412221243831, "grad_norm": 0.4911057651042938, "learning_rate": 0.00026618020429523235, "loss": 4.7249, "step": 19065 }, { "epoch": 0.39051461102123564, "grad_norm": 0.2578085958957672, "learning_rate": 0.0002661410432490589, "loss": 4.7815, "step": 19070 }, { "epoch": 0.39061700079864026, "grad_norm": 0.23536942899227142, "learning_rate": 0.00026610186242735385, "loss": 4.841, "step": 19075 }, { "epoch": 0.3907193905760449, "grad_norm": 0.2647155821323395, "learning_rate": 0.00026606266183678857, "loss": 4.7802, "step": 19080 }, { "epoch": 0.3908217803534495, "grad_norm": 0.2519124150276184, "learning_rate": 0.00026602344148403786, "loss": 4.7654, "step": 19085 }, { "epoch": 0.3909241701308541, "grad_norm": 0.24083812534809113, "learning_rate": 0.0002659842013757798, "loss": 4.7989, "step": 19090 }, { "epoch": 0.39102655990825874, "grad_norm": 0.26382654905319214, "learning_rate": 0.0002659449415186959, "loss": 4.78, "step": 19095 }, { "epoch": 0.39112894968566336, "grad_norm": 0.24345767498016357, "learning_rate": 0.000265905661919471, "loss": 4.8257, "step": 19100 }, { "epoch": 0.391231339463068, "grad_norm": 0.24560435116291046, "learning_rate": 0.0002658663625847933, "loss": 4.7817, "step": 19105 }, { "epoch": 0.3913337292404726, "grad_norm": 0.2307986617088318, "learning_rate": 0.0002658270435213544, "loss": 4.815, "step": 19110 }, { "epoch": 0.3914361190178773, "grad_norm": 0.24508611857891083, "learning_rate": 0.0002657877047358491, "loss": 4.7599, "step": 19115 }, { "epoch": 0.3915385087952819, "grad_norm": 0.25299644470214844, "learning_rate": 0.0002657483462349758, "loss": 4.8406, "step": 19120 }, { "epoch": 0.3916408985726865, "grad_norm": 0.2539313733577728, "learning_rate": 0.0002657089680254361, "loss": 4.7867, "step": 19125 }, { "epoch": 0.39174328835009115, "grad_norm": 0.23021337389945984, "learning_rate": 0.000265669570113935, "loss": 4.791, "step": 19130 }, { "epoch": 0.39184567812749577, "grad_norm": 0.25902292132377625, "learning_rate": 0.00026563015250718086, "loss": 4.7243, "step": 19135 }, { "epoch": 0.3919480679049004, "grad_norm": 0.25899389386177063, "learning_rate": 0.00026559071521188533, "loss": 4.8535, "step": 19140 }, { "epoch": 0.392050457682305, "grad_norm": 0.23541128635406494, "learning_rate": 0.00026555125823476353, "loss": 4.7362, "step": 19145 }, { "epoch": 0.39215284745970963, "grad_norm": 0.24286217987537384, "learning_rate": 0.0002655117815825338, "loss": 4.8065, "step": 19150 }, { "epoch": 0.39225523723711425, "grad_norm": 0.263705849647522, "learning_rate": 0.00026547228526191803, "loss": 4.749, "step": 19155 }, { "epoch": 0.3923576270145189, "grad_norm": 0.25759607553482056, "learning_rate": 0.00026543276927964114, "loss": 4.8114, "step": 19160 }, { "epoch": 0.3924600167919235, "grad_norm": 0.3094196021556854, "learning_rate": 0.0002653932336424317, "loss": 4.7475, "step": 19165 }, { "epoch": 0.3925624065693281, "grad_norm": 0.25293731689453125, "learning_rate": 0.00026535367835702145, "loss": 4.7546, "step": 19170 }, { "epoch": 0.39266479634673274, "grad_norm": 0.30960583686828613, "learning_rate": 0.00026531410343014563, "loss": 4.7823, "step": 19175 }, { "epoch": 0.39276718612413736, "grad_norm": 0.24387523531913757, "learning_rate": 0.0002652745088685426, "loss": 4.819, "step": 19180 }, { "epoch": 0.392869575901542, "grad_norm": 0.24154560267925262, "learning_rate": 0.0002652348946789543, "loss": 4.7687, "step": 19185 }, { "epoch": 0.3929719656789466, "grad_norm": 0.2489997297525406, "learning_rate": 0.0002651952608681259, "loss": 4.7429, "step": 19190 }, { "epoch": 0.3930743554563512, "grad_norm": 0.27558526396751404, "learning_rate": 0.00026515560744280585, "loss": 4.7931, "step": 19195 }, { "epoch": 0.39317674523375584, "grad_norm": 0.2533782422542572, "learning_rate": 0.000265115934409746, "loss": 4.7954, "step": 19200 }, { "epoch": 0.39327913501116046, "grad_norm": 0.24310936033725739, "learning_rate": 0.0002650762417757016, "loss": 4.762, "step": 19205 }, { "epoch": 0.3933815247885651, "grad_norm": 0.328246533870697, "learning_rate": 0.0002650365295474312, "loss": 4.7286, "step": 19210 }, { "epoch": 0.39348391456596976, "grad_norm": 0.2466014325618744, "learning_rate": 0.0002649967977316966, "loss": 4.7366, "step": 19215 }, { "epoch": 0.3935863043433744, "grad_norm": 0.2853134572505951, "learning_rate": 0.00026495704633526306, "loss": 4.7904, "step": 19220 }, { "epoch": 0.393688694120779, "grad_norm": 0.26766377687454224, "learning_rate": 0.00026491727536489905, "loss": 4.7574, "step": 19225 }, { "epoch": 0.3937910838981836, "grad_norm": 0.255753755569458, "learning_rate": 0.00026487748482737655, "loss": 4.7345, "step": 19230 }, { "epoch": 0.39389347367558825, "grad_norm": 0.24622055888175964, "learning_rate": 0.0002648376747294706, "loss": 4.8189, "step": 19235 }, { "epoch": 0.39399586345299287, "grad_norm": 0.23744042217731476, "learning_rate": 0.0002647978450779599, "loss": 4.7383, "step": 19240 }, { "epoch": 0.3940982532303975, "grad_norm": 0.23252515494823456, "learning_rate": 0.00026475799587962623, "loss": 4.7573, "step": 19245 }, { "epoch": 0.3942006430078021, "grad_norm": 0.2647829055786133, "learning_rate": 0.00026471812714125477, "loss": 4.8206, "step": 19250 }, { "epoch": 0.39430303278520673, "grad_norm": 0.2558216452598572, "learning_rate": 0.00026467823886963405, "loss": 4.7839, "step": 19255 }, { "epoch": 0.39440542256261135, "grad_norm": 0.46287602186203003, "learning_rate": 0.00026463833107155595, "loss": 4.7605, "step": 19260 }, { "epoch": 0.394507812340016, "grad_norm": 0.2570454776287079, "learning_rate": 0.0002645984037538156, "loss": 4.749, "step": 19265 }, { "epoch": 0.3946102021174206, "grad_norm": 0.2836085855960846, "learning_rate": 0.0002645584569232115, "loss": 4.8117, "step": 19270 }, { "epoch": 0.3947125918948252, "grad_norm": 0.2433154135942459, "learning_rate": 0.00026451849058654544, "loss": 4.7985, "step": 19275 }, { "epoch": 0.39481498167222984, "grad_norm": 0.2622375786304474, "learning_rate": 0.0002644785047506226, "loss": 4.8153, "step": 19280 }, { "epoch": 0.39491737144963446, "grad_norm": 0.2493102252483368, "learning_rate": 0.0002644384994222514, "loss": 4.7749, "step": 19285 }, { "epoch": 0.3950197612270391, "grad_norm": 0.24479986727237701, "learning_rate": 0.00026439847460824365, "loss": 4.7593, "step": 19290 }, { "epoch": 0.3951221510044437, "grad_norm": 0.2591876685619354, "learning_rate": 0.00026435843031541445, "loss": 4.7828, "step": 19295 }, { "epoch": 0.3952245407818483, "grad_norm": 0.2384757548570633, "learning_rate": 0.00026431836655058214, "loss": 4.7337, "step": 19300 }, { "epoch": 0.39532693055925294, "grad_norm": 0.23687398433685303, "learning_rate": 0.0002642782833205685, "loss": 4.7749, "step": 19305 }, { "epoch": 0.39542932033665756, "grad_norm": 0.24877889454364777, "learning_rate": 0.00026423818063219857, "loss": 4.7949, "step": 19310 }, { "epoch": 0.3955317101140622, "grad_norm": 0.24068373441696167, "learning_rate": 0.0002641980584923007, "loss": 4.7942, "step": 19315 }, { "epoch": 0.39563409989146686, "grad_norm": 0.232746422290802, "learning_rate": 0.0002641579169077065, "loss": 4.7772, "step": 19320 }, { "epoch": 0.3957364896688715, "grad_norm": 0.24453406035900116, "learning_rate": 0.000264117755885251, "loss": 4.7589, "step": 19325 }, { "epoch": 0.3958388794462761, "grad_norm": 0.2468212991952896, "learning_rate": 0.0002640775754317725, "loss": 4.7092, "step": 19330 }, { "epoch": 0.3959412692236807, "grad_norm": 0.27851566672325134, "learning_rate": 0.00026403737555411255, "loss": 4.7822, "step": 19335 }, { "epoch": 0.39604365900108535, "grad_norm": 0.25844505429267883, "learning_rate": 0.0002639971562591161, "loss": 4.7588, "step": 19340 }, { "epoch": 0.39614604877848997, "grad_norm": 0.23280110955238342, "learning_rate": 0.00026395691755363125, "loss": 4.7725, "step": 19345 }, { "epoch": 0.3962484385558946, "grad_norm": 0.2477782517671585, "learning_rate": 0.0002639166594445097, "loss": 4.7506, "step": 19350 }, { "epoch": 0.3963508283332992, "grad_norm": 0.23398302495479584, "learning_rate": 0.0002638763819386061, "loss": 4.7248, "step": 19355 }, { "epoch": 0.39645321811070383, "grad_norm": 0.24845682084560394, "learning_rate": 0.0002638360850427786, "loss": 4.7614, "step": 19360 }, { "epoch": 0.39655560788810845, "grad_norm": 0.2504514753818512, "learning_rate": 0.0002637957687638887, "loss": 4.744, "step": 19365 }, { "epoch": 0.3966579976655131, "grad_norm": 0.24058574438095093, "learning_rate": 0.00026375543310880095, "loss": 4.8003, "step": 19370 }, { "epoch": 0.3967603874429177, "grad_norm": 0.25265824794769287, "learning_rate": 0.0002637150780843836, "loss": 4.8176, "step": 19375 }, { "epoch": 0.3968627772203223, "grad_norm": 0.2544262707233429, "learning_rate": 0.0002636747036975078, "loss": 4.7584, "step": 19380 }, { "epoch": 0.39696516699772694, "grad_norm": 0.2531717121601105, "learning_rate": 0.00026363430995504823, "loss": 4.7528, "step": 19385 }, { "epoch": 0.39706755677513156, "grad_norm": 0.2622533440589905, "learning_rate": 0.0002635938968638828, "loss": 4.7799, "step": 19390 }, { "epoch": 0.3971699465525362, "grad_norm": 0.28076672554016113, "learning_rate": 0.0002635534644308927, "loss": 4.7745, "step": 19395 }, { "epoch": 0.3972723363299408, "grad_norm": 0.2232881784439087, "learning_rate": 0.00026351301266296247, "loss": 4.7611, "step": 19400 }, { "epoch": 0.3973747261073454, "grad_norm": 0.23840515315532684, "learning_rate": 0.0002634725415669798, "loss": 4.7596, "step": 19405 }, { "epoch": 0.39747711588475004, "grad_norm": 0.24760614335536957, "learning_rate": 0.0002634320511498359, "loss": 4.764, "step": 19410 }, { "epoch": 0.39757950566215466, "grad_norm": 0.2354411631822586, "learning_rate": 0.00026339154141842504, "loss": 4.758, "step": 19415 }, { "epoch": 0.39768189543955934, "grad_norm": 0.2832263708114624, "learning_rate": 0.00026335101237964494, "loss": 4.7504, "step": 19420 }, { "epoch": 0.39778428521696396, "grad_norm": 0.24699002504348755, "learning_rate": 0.0002633104640403965, "loss": 4.739, "step": 19425 }, { "epoch": 0.3978866749943686, "grad_norm": 0.23946571350097656, "learning_rate": 0.00026326989640758396, "loss": 4.7457, "step": 19430 }, { "epoch": 0.3979890647717732, "grad_norm": 0.2437146008014679, "learning_rate": 0.0002632293094881149, "loss": 4.7888, "step": 19435 }, { "epoch": 0.3980914545491778, "grad_norm": 0.2592165172100067, "learning_rate": 0.0002631887032889001, "loss": 4.7477, "step": 19440 }, { "epoch": 0.39819384432658245, "grad_norm": 0.24634739756584167, "learning_rate": 0.0002631480778168536, "loss": 4.8172, "step": 19445 }, { "epoch": 0.39829623410398707, "grad_norm": 0.2822176218032837, "learning_rate": 0.0002631074330788928, "loss": 4.7945, "step": 19450 }, { "epoch": 0.3983986238813917, "grad_norm": 0.26324695348739624, "learning_rate": 0.00026306676908193833, "loss": 4.8088, "step": 19455 }, { "epoch": 0.3985010136587963, "grad_norm": 0.249222069978714, "learning_rate": 0.00026302608583291415, "loss": 4.7186, "step": 19460 }, { "epoch": 0.39860340343620093, "grad_norm": 0.2556007504463196, "learning_rate": 0.00026298538333874744, "loss": 4.7699, "step": 19465 }, { "epoch": 0.39870579321360555, "grad_norm": 0.24117238819599152, "learning_rate": 0.00026294466160636874, "loss": 4.7778, "step": 19470 }, { "epoch": 0.3988081829910102, "grad_norm": 0.2642318606376648, "learning_rate": 0.0002629039206427117, "loss": 4.7332, "step": 19475 }, { "epoch": 0.3989105727684148, "grad_norm": 0.2724672555923462, "learning_rate": 0.0002628631604547134, "loss": 4.7972, "step": 19480 }, { "epoch": 0.3990129625458194, "grad_norm": 0.25742900371551514, "learning_rate": 0.0002628223810493142, "loss": 4.7281, "step": 19485 }, { "epoch": 0.39911535232322404, "grad_norm": 0.24829146265983582, "learning_rate": 0.0002627815824334576, "loss": 4.7601, "step": 19490 }, { "epoch": 0.39921774210062866, "grad_norm": 0.24778318405151367, "learning_rate": 0.00026274076461409054, "loss": 4.7738, "step": 19495 }, { "epoch": 0.3993201318780333, "grad_norm": 0.2703503966331482, "learning_rate": 0.000262699927598163, "loss": 4.7489, "step": 19500 }, { "epoch": 0.3994225216554379, "grad_norm": 0.24909624457359314, "learning_rate": 0.00026265907139262856, "loss": 4.7699, "step": 19505 }, { "epoch": 0.3995249114328425, "grad_norm": 0.27507728338241577, "learning_rate": 0.0002626181960044437, "loss": 4.8126, "step": 19510 }, { "epoch": 0.39962730121024714, "grad_norm": 0.26537808775901794, "learning_rate": 0.00026257730144056837, "loss": 4.7417, "step": 19515 }, { "epoch": 0.3997296909876518, "grad_norm": 0.27283066511154175, "learning_rate": 0.0002625363877079659, "loss": 4.7776, "step": 19520 }, { "epoch": 0.39983208076505644, "grad_norm": 0.2485787272453308, "learning_rate": 0.00026249545481360254, "loss": 4.7465, "step": 19525 }, { "epoch": 0.39993447054246106, "grad_norm": 0.24497090280056, "learning_rate": 0.00026245450276444815, "loss": 4.7873, "step": 19530 }, { "epoch": 0.4000368603198657, "grad_norm": 0.2390383630990982, "learning_rate": 0.00026241353156747566, "loss": 4.7217, "step": 19535 }, { "epoch": 0.4001392500972703, "grad_norm": 0.2366401106119156, "learning_rate": 0.00026237254122966127, "loss": 4.7863, "step": 19540 }, { "epoch": 0.4002416398746749, "grad_norm": 0.24308615922927856, "learning_rate": 0.00026233153175798447, "loss": 4.7597, "step": 19545 }, { "epoch": 0.40034402965207955, "grad_norm": 0.2379208356142044, "learning_rate": 0.0002622905031594281, "loss": 4.8152, "step": 19550 }, { "epoch": 0.40044641942948417, "grad_norm": 0.27165481448173523, "learning_rate": 0.0002622494554409781, "loss": 4.7024, "step": 19555 }, { "epoch": 0.4005488092068888, "grad_norm": 0.27947288751602173, "learning_rate": 0.00026220838860962375, "loss": 4.7907, "step": 19560 }, { "epoch": 0.4006511989842934, "grad_norm": 0.23369461297988892, "learning_rate": 0.00026216730267235755, "loss": 4.7771, "step": 19565 }, { "epoch": 0.40075358876169803, "grad_norm": 0.23187683522701263, "learning_rate": 0.0002621261976361753, "loss": 4.7554, "step": 19570 }, { "epoch": 0.40085597853910265, "grad_norm": 0.23148415982723236, "learning_rate": 0.000262085073508076, "loss": 4.769, "step": 19575 }, { "epoch": 0.4009583683165073, "grad_norm": 0.2489568144083023, "learning_rate": 0.0002620439302950619, "loss": 4.8002, "step": 19580 }, { "epoch": 0.4010607580939119, "grad_norm": 0.25447413325309753, "learning_rate": 0.0002620027680041386, "loss": 4.7785, "step": 19585 }, { "epoch": 0.4011631478713165, "grad_norm": 0.2597976326942444, "learning_rate": 0.0002619615866423148, "loss": 4.7758, "step": 19590 }, { "epoch": 0.40126553764872114, "grad_norm": 0.2571601867675781, "learning_rate": 0.00026192038621660266, "loss": 4.7568, "step": 19595 }, { "epoch": 0.40136792742612576, "grad_norm": 0.2353305220603943, "learning_rate": 0.00026187916673401724, "loss": 4.7823, "step": 19600 }, { "epoch": 0.4014703172035304, "grad_norm": 0.2579090893268585, "learning_rate": 0.0002618379282015772, "loss": 4.8025, "step": 19605 }, { "epoch": 0.401572706980935, "grad_norm": 0.25471198558807373, "learning_rate": 0.0002617966706263042, "loss": 4.7409, "step": 19610 }, { "epoch": 0.4016750967583396, "grad_norm": 0.2436644434928894, "learning_rate": 0.0002617553940152233, "loss": 4.7414, "step": 19615 }, { "epoch": 0.40177748653574424, "grad_norm": 0.2647945284843445, "learning_rate": 0.00026171409837536273, "loss": 4.7688, "step": 19620 }, { "epoch": 0.4018798763131489, "grad_norm": 0.2901569604873657, "learning_rate": 0.00026167278371375395, "loss": 4.763, "step": 19625 }, { "epoch": 0.40198226609055354, "grad_norm": 0.2863835096359253, "learning_rate": 0.00026163145003743165, "loss": 4.7224, "step": 19630 }, { "epoch": 0.40208465586795816, "grad_norm": 0.2397838532924652, "learning_rate": 0.0002615900973534339, "loss": 4.773, "step": 19635 }, { "epoch": 0.4021870456453628, "grad_norm": 0.2545151114463806, "learning_rate": 0.0002615487256688017, "loss": 4.7594, "step": 19640 }, { "epoch": 0.4022894354227674, "grad_norm": 0.2644392251968384, "learning_rate": 0.0002615073349905797, "loss": 4.8706, "step": 19645 }, { "epoch": 0.402391825200172, "grad_norm": 0.2648373246192932, "learning_rate": 0.0002614659253258154, "loss": 4.7467, "step": 19650 }, { "epoch": 0.40249421497757665, "grad_norm": 0.23473845422267914, "learning_rate": 0.0002614244966815597, "loss": 4.7412, "step": 19655 }, { "epoch": 0.40259660475498127, "grad_norm": 0.25363534688949585, "learning_rate": 0.0002613830490648669, "loss": 4.7506, "step": 19660 }, { "epoch": 0.4026989945323859, "grad_norm": 0.28193849325180054, "learning_rate": 0.0002613415824827941, "loss": 4.7699, "step": 19665 }, { "epoch": 0.4028013843097905, "grad_norm": 0.2774384617805481, "learning_rate": 0.00026130009694240203, "loss": 4.7576, "step": 19670 }, { "epoch": 0.40290377408719513, "grad_norm": 0.27691593766212463, "learning_rate": 0.00026125859245075447, "loss": 4.7844, "step": 19675 }, { "epoch": 0.40300616386459975, "grad_norm": 0.23163090646266937, "learning_rate": 0.00026121706901491857, "loss": 4.733, "step": 19680 }, { "epoch": 0.4031085536420044, "grad_norm": 0.23813559114933014, "learning_rate": 0.0002611755266419644, "loss": 4.7045, "step": 19685 }, { "epoch": 0.403210943419409, "grad_norm": 0.2549150586128235, "learning_rate": 0.0002611339653389656, "loss": 4.7576, "step": 19690 }, { "epoch": 0.4033133331968136, "grad_norm": 0.24459238350391388, "learning_rate": 0.0002610923851129988, "loss": 4.7647, "step": 19695 }, { "epoch": 0.40341572297421824, "grad_norm": 0.2550506889820099, "learning_rate": 0.000261050785971144, "loss": 4.7009, "step": 19700 }, { "epoch": 0.40351811275162286, "grad_norm": 0.23717327415943146, "learning_rate": 0.00026100916792048425, "loss": 4.7817, "step": 19705 }, { "epoch": 0.4036205025290275, "grad_norm": 0.27429184317588806, "learning_rate": 0.000260967530968106, "loss": 4.7967, "step": 19710 }, { "epoch": 0.4037228923064321, "grad_norm": 0.2548692226409912, "learning_rate": 0.0002609258751210989, "loss": 4.7328, "step": 19715 }, { "epoch": 0.4038252820838367, "grad_norm": 0.25398963689804077, "learning_rate": 0.00026088420038655573, "loss": 4.8318, "step": 19720 }, { "epoch": 0.4039276718612414, "grad_norm": 0.23279903829097748, "learning_rate": 0.0002608425067715724, "loss": 4.7604, "step": 19725 }, { "epoch": 0.404030061638646, "grad_norm": 0.2645326554775238, "learning_rate": 0.00026080079428324833, "loss": 4.7832, "step": 19730 }, { "epoch": 0.40413245141605064, "grad_norm": 0.26203349232673645, "learning_rate": 0.0002607590629286858, "loss": 4.7994, "step": 19735 }, { "epoch": 0.40423484119345526, "grad_norm": 0.256180077791214, "learning_rate": 0.00026071731271499067, "loss": 4.781, "step": 19740 }, { "epoch": 0.4043372309708599, "grad_norm": 0.257884681224823, "learning_rate": 0.00026067554364927163, "loss": 4.7952, "step": 19745 }, { "epoch": 0.4044396207482645, "grad_norm": 0.2481538951396942, "learning_rate": 0.00026063375573864085, "loss": 4.8146, "step": 19750 }, { "epoch": 0.4045420105256691, "grad_norm": 0.2525789141654968, "learning_rate": 0.0002605919489902137, "loss": 4.6853, "step": 19755 }, { "epoch": 0.40464440030307375, "grad_norm": 0.24754178524017334, "learning_rate": 0.0002605501234111086, "loss": 4.7383, "step": 19760 }, { "epoch": 0.40474679008047837, "grad_norm": 0.2472517043352127, "learning_rate": 0.00026050827900844727, "loss": 4.7838, "step": 19765 }, { "epoch": 0.404849179857883, "grad_norm": 0.24312861263751984, "learning_rate": 0.0002604664157893546, "loss": 4.8092, "step": 19770 }, { "epoch": 0.4049515696352876, "grad_norm": 0.2669880986213684, "learning_rate": 0.0002604245337609588, "loss": 4.7519, "step": 19775 }, { "epoch": 0.40505395941269223, "grad_norm": 0.27648940682411194, "learning_rate": 0.0002603826329303911, "loss": 4.7815, "step": 19780 }, { "epoch": 0.40515634919009685, "grad_norm": 0.251446008682251, "learning_rate": 0.00026034071330478613, "loss": 4.7885, "step": 19785 }, { "epoch": 0.4052587389675015, "grad_norm": 0.2564874589443207, "learning_rate": 0.0002602987748912816, "loss": 4.7329, "step": 19790 }, { "epoch": 0.4053611287449061, "grad_norm": 0.25271502137184143, "learning_rate": 0.0002602568176970182, "loss": 4.777, "step": 19795 }, { "epoch": 0.4054635185223107, "grad_norm": 0.24098694324493408, "learning_rate": 0.0002602148417291404, "loss": 4.7122, "step": 19800 }, { "epoch": 0.40556590829971534, "grad_norm": 0.2511517107486725, "learning_rate": 0.0002601728469947953, "loss": 4.7755, "step": 19805 }, { "epoch": 0.40566829807711996, "grad_norm": 0.23859263956546783, "learning_rate": 0.0002601308335011335, "loss": 4.8393, "step": 19810 }, { "epoch": 0.4057706878545246, "grad_norm": 0.27305012941360474, "learning_rate": 0.0002600888012553087, "loss": 4.7756, "step": 19815 }, { "epoch": 0.4058730776319292, "grad_norm": 0.296753466129303, "learning_rate": 0.0002600467502644777, "loss": 4.7139, "step": 19820 }, { "epoch": 0.4059754674093339, "grad_norm": 0.24156303703784943, "learning_rate": 0.0002600046805358007, "loss": 4.7338, "step": 19825 }, { "epoch": 0.4060778571867385, "grad_norm": 0.2724151611328125, "learning_rate": 0.000259962592076441, "loss": 4.7976, "step": 19830 }, { "epoch": 0.4061802469641431, "grad_norm": 0.2427988201379776, "learning_rate": 0.00025992048489356504, "loss": 4.7571, "step": 19835 }, { "epoch": 0.40628263674154774, "grad_norm": 0.24802915751934052, "learning_rate": 0.00025987835899434245, "loss": 4.7916, "step": 19840 }, { "epoch": 0.40638502651895236, "grad_norm": 0.2772034704685211, "learning_rate": 0.0002598362143859461, "loss": 4.7342, "step": 19845 }, { "epoch": 0.406487416296357, "grad_norm": 0.2626076340675354, "learning_rate": 0.0002597940510755521, "loss": 4.7238, "step": 19850 }, { "epoch": 0.4065898060737616, "grad_norm": 0.27586403489112854, "learning_rate": 0.0002597518690703395, "loss": 4.7822, "step": 19855 }, { "epoch": 0.4066921958511662, "grad_norm": 0.33457404375076294, "learning_rate": 0.00025970966837749083, "loss": 4.7788, "step": 19860 }, { "epoch": 0.40679458562857085, "grad_norm": 0.23057791590690613, "learning_rate": 0.0002596674490041916, "loss": 4.742, "step": 19865 }, { "epoch": 0.40689697540597547, "grad_norm": 0.2495986521244049, "learning_rate": 0.0002596252109576307, "loss": 4.7231, "step": 19870 }, { "epoch": 0.4069993651833801, "grad_norm": 0.23995955288410187, "learning_rate": 0.0002595829542449999, "loss": 4.7509, "step": 19875 }, { "epoch": 0.4071017549607847, "grad_norm": 0.32748815417289734, "learning_rate": 0.0002595406788734945, "loss": 4.7365, "step": 19880 }, { "epoch": 0.40720414473818933, "grad_norm": 0.24457184970378876, "learning_rate": 0.00025949838485031263, "loss": 4.8283, "step": 19885 }, { "epoch": 0.40730653451559395, "grad_norm": 0.23557250201702118, "learning_rate": 0.0002594560721826559, "loss": 4.7784, "step": 19890 }, { "epoch": 0.4074089242929986, "grad_norm": 0.2428516447544098, "learning_rate": 0.00025941374087772885, "loss": 4.7659, "step": 19895 }, { "epoch": 0.4075113140704032, "grad_norm": 0.24987810850143433, "learning_rate": 0.0002593713909427394, "loss": 4.7948, "step": 19900 }, { "epoch": 0.4076137038478078, "grad_norm": 0.2510607838630676, "learning_rate": 0.00025932902238489844, "loss": 4.7555, "step": 19905 }, { "epoch": 0.40771609362521244, "grad_norm": 0.2503218948841095, "learning_rate": 0.00025928663521142026, "loss": 4.7913, "step": 19910 }, { "epoch": 0.40781848340261706, "grad_norm": 0.2557161748409271, "learning_rate": 0.0002592442294295221, "loss": 4.7973, "step": 19915 }, { "epoch": 0.4079208731800217, "grad_norm": 0.25850632786750793, "learning_rate": 0.00025920180504642446, "loss": 4.7331, "step": 19920 }, { "epoch": 0.40802326295742636, "grad_norm": 0.25395697355270386, "learning_rate": 0.0002591593620693511, "loss": 4.7265, "step": 19925 }, { "epoch": 0.408125652734831, "grad_norm": 0.24870486557483673, "learning_rate": 0.0002591169005055288, "loss": 4.7729, "step": 19930 }, { "epoch": 0.4082280425122356, "grad_norm": 0.26790231466293335, "learning_rate": 0.00025907442036218753, "loss": 4.7463, "step": 19935 }, { "epoch": 0.4083304322896402, "grad_norm": 0.24698667228221893, "learning_rate": 0.0002590319216465605, "loss": 4.7259, "step": 19940 }, { "epoch": 0.40843282206704484, "grad_norm": 0.2523781359195709, "learning_rate": 0.0002589894043658841, "loss": 4.7681, "step": 19945 }, { "epoch": 0.40853521184444946, "grad_norm": 0.26594412326812744, "learning_rate": 0.00025894686852739765, "loss": 4.7841, "step": 19950 }, { "epoch": 0.4086376016218541, "grad_norm": 0.25075480341911316, "learning_rate": 0.00025890431413834393, "loss": 4.7966, "step": 19955 }, { "epoch": 0.4087399913992587, "grad_norm": 0.25574803352355957, "learning_rate": 0.0002588617412059687, "loss": 4.8014, "step": 19960 }, { "epoch": 0.4088423811766633, "grad_norm": 0.26465269923210144, "learning_rate": 0.00025881914973752097, "loss": 4.7336, "step": 19965 }, { "epoch": 0.40894477095406795, "grad_norm": 0.24622943997383118, "learning_rate": 0.0002587765397402529, "loss": 4.8056, "step": 19970 }, { "epoch": 0.40904716073147257, "grad_norm": 0.26525789499282837, "learning_rate": 0.0002587339112214196, "loss": 4.7851, "step": 19975 }, { "epoch": 0.4091495505088772, "grad_norm": 0.2609120309352875, "learning_rate": 0.00025869126418827956, "loss": 4.7708, "step": 19980 }, { "epoch": 0.4092519402862818, "grad_norm": 0.24709618091583252, "learning_rate": 0.0002586485986480945, "loss": 4.7359, "step": 19985 }, { "epoch": 0.40935433006368643, "grad_norm": 0.3181145191192627, "learning_rate": 0.00025860591460812905, "loss": 4.7702, "step": 19990 }, { "epoch": 0.40945671984109105, "grad_norm": 0.24727404117584229, "learning_rate": 0.00025856321207565105, "loss": 4.7353, "step": 19995 }, { "epoch": 0.4095591096184957, "grad_norm": 0.22693198919296265, "learning_rate": 0.0002585204910579316, "loss": 4.7821, "step": 20000 }, { "epoch": 0.4095591096184957, "eval_loss": 4.7799859046936035, "eval_runtime": 19.6546, "eval_samples_per_second": 29.611, "eval_steps_per_second": 3.714, "step": 20000 }, { "epoch": 0.4096614993959003, "grad_norm": 0.26770642399787903, "learning_rate": 0.00025847775156224485, "loss": 4.7473, "step": 20005 }, { "epoch": 0.4097638891733049, "grad_norm": 0.2308717966079712, "learning_rate": 0.00025843499359586817, "loss": 4.8047, "step": 20010 }, { "epoch": 0.40986627895070954, "grad_norm": 0.2362947165966034, "learning_rate": 0.00025839221716608197, "loss": 4.7703, "step": 20015 }, { "epoch": 0.40996866872811416, "grad_norm": 0.2500995397567749, "learning_rate": 0.0002583494222801699, "loss": 4.8034, "step": 20020 }, { "epoch": 0.4100710585055188, "grad_norm": 0.23485814034938812, "learning_rate": 0.0002583066089454187, "loss": 4.762, "step": 20025 }, { "epoch": 0.41017344828292346, "grad_norm": 0.23973077535629272, "learning_rate": 0.00025826377716911823, "loss": 4.7755, "step": 20030 }, { "epoch": 0.4102758380603281, "grad_norm": 0.2536141574382782, "learning_rate": 0.00025822092695856164, "loss": 4.7591, "step": 20035 }, { "epoch": 0.4103782278377327, "grad_norm": 0.23926681280136108, "learning_rate": 0.000258178058321045, "loss": 4.754, "step": 20040 }, { "epoch": 0.4104806176151373, "grad_norm": 0.24142976105213165, "learning_rate": 0.0002581351712638678, "loss": 4.7588, "step": 20045 }, { "epoch": 0.41058300739254194, "grad_norm": 0.24843670427799225, "learning_rate": 0.0002580922657943322, "loss": 4.7554, "step": 20050 }, { "epoch": 0.41068539716994656, "grad_norm": 0.2508491277694702, "learning_rate": 0.00025804934191974403, "loss": 4.7792, "step": 20055 }, { "epoch": 0.4107877869473512, "grad_norm": 0.24192366003990173, "learning_rate": 0.0002580063996474119, "loss": 4.7405, "step": 20060 }, { "epoch": 0.4108901767247558, "grad_norm": 0.25192683935165405, "learning_rate": 0.00025796343898464773, "loss": 4.7642, "step": 20065 }, { "epoch": 0.41099256650216043, "grad_norm": 0.24169355630874634, "learning_rate": 0.0002579204599387664, "loss": 4.7833, "step": 20070 }, { "epoch": 0.41109495627956505, "grad_norm": 0.27194544672966003, "learning_rate": 0.00025787746251708615, "loss": 4.721, "step": 20075 }, { "epoch": 0.41119734605696967, "grad_norm": 0.28333792090415955, "learning_rate": 0.00025783444672692815, "loss": 4.7635, "step": 20080 }, { "epoch": 0.4112997358343743, "grad_norm": 0.24671564996242523, "learning_rate": 0.00025779141257561684, "loss": 4.7218, "step": 20085 }, { "epoch": 0.4114021256117789, "grad_norm": 0.26813265681266785, "learning_rate": 0.0002577483600704796, "loss": 4.6989, "step": 20090 }, { "epoch": 0.41150451538918353, "grad_norm": 0.26639458537101746, "learning_rate": 0.00025770528921884717, "loss": 4.7613, "step": 20095 }, { "epoch": 0.41160690516658816, "grad_norm": 0.34087416529655457, "learning_rate": 0.0002576622000280532, "loss": 4.7471, "step": 20100 }, { "epoch": 0.4117092949439928, "grad_norm": 0.32042837142944336, "learning_rate": 0.0002576190925054346, "loss": 4.7166, "step": 20105 }, { "epoch": 0.4118116847213974, "grad_norm": 0.26964765787124634, "learning_rate": 0.00025757596665833136, "loss": 4.7626, "step": 20110 }, { "epoch": 0.411914074498802, "grad_norm": 0.24823670089244843, "learning_rate": 0.00025753282249408655, "loss": 4.7184, "step": 20115 }, { "epoch": 0.41201646427620664, "grad_norm": 0.24069556593894958, "learning_rate": 0.00025748966002004655, "loss": 4.7405, "step": 20120 }, { "epoch": 0.41211885405361126, "grad_norm": 0.24267128109931946, "learning_rate": 0.00025744647924356057, "loss": 4.7585, "step": 20125 }, { "epoch": 0.41222124383101594, "grad_norm": 0.27272501587867737, "learning_rate": 0.00025740328017198106, "loss": 4.7625, "step": 20130 }, { "epoch": 0.41232363360842056, "grad_norm": 0.2577752470970154, "learning_rate": 0.0002573600628126637, "loss": 4.7882, "step": 20135 }, { "epoch": 0.4124260233858252, "grad_norm": 0.26629868149757385, "learning_rate": 0.00025731682717296706, "loss": 4.7195, "step": 20140 }, { "epoch": 0.4125284131632298, "grad_norm": 0.23330718278884888, "learning_rate": 0.00025727357326025304, "loss": 4.7635, "step": 20145 }, { "epoch": 0.4126308029406344, "grad_norm": 0.2559824585914612, "learning_rate": 0.00025723030108188653, "loss": 4.7506, "step": 20150 }, { "epoch": 0.41273319271803904, "grad_norm": 0.2495318204164505, "learning_rate": 0.0002571870106452356, "loss": 4.7664, "step": 20155 }, { "epoch": 0.41283558249544366, "grad_norm": 0.2817722260951996, "learning_rate": 0.0002571437019576713, "loss": 4.7162, "step": 20160 }, { "epoch": 0.4129379722728483, "grad_norm": 0.26086512207984924, "learning_rate": 0.0002571003750265679, "loss": 4.7575, "step": 20165 }, { "epoch": 0.4130403620502529, "grad_norm": 0.3186280131340027, "learning_rate": 0.0002570570298593028, "loss": 4.7355, "step": 20170 }, { "epoch": 0.41314275182765753, "grad_norm": 0.33195388317108154, "learning_rate": 0.00025701366646325644, "loss": 4.7356, "step": 20175 }, { "epoch": 0.41324514160506215, "grad_norm": 0.2980879545211792, "learning_rate": 0.0002569702848458123, "loss": 4.763, "step": 20180 }, { "epoch": 0.41334753138246677, "grad_norm": 0.26233193278312683, "learning_rate": 0.00025692688501435714, "loss": 4.7139, "step": 20185 }, { "epoch": 0.4134499211598714, "grad_norm": 0.2996671795845032, "learning_rate": 0.0002568834669762807, "loss": 4.7229, "step": 20190 }, { "epoch": 0.413552310937276, "grad_norm": 0.24259331822395325, "learning_rate": 0.00025684003073897575, "loss": 4.7319, "step": 20195 }, { "epoch": 0.41365470071468063, "grad_norm": 0.2737988233566284, "learning_rate": 0.00025679657630983834, "loss": 4.6717, "step": 20200 }, { "epoch": 0.41375709049208526, "grad_norm": 0.23778383433818817, "learning_rate": 0.0002567531036962676, "loss": 4.7576, "step": 20205 }, { "epoch": 0.4138594802694899, "grad_norm": 0.24559806287288666, "learning_rate": 0.00025670961290566555, "loss": 4.781, "step": 20210 }, { "epoch": 0.4139618700468945, "grad_norm": 0.23543305695056915, "learning_rate": 0.00025666610394543745, "loss": 4.7329, "step": 20215 }, { "epoch": 0.4140642598242991, "grad_norm": 0.23338939249515533, "learning_rate": 0.00025662257682299174, "loss": 4.7182, "step": 20220 }, { "epoch": 0.41416664960170374, "grad_norm": 0.23803578317165375, "learning_rate": 0.00025657903154573976, "loss": 4.7271, "step": 20225 }, { "epoch": 0.4142690393791084, "grad_norm": 0.24348607659339905, "learning_rate": 0.0002565354681210961, "loss": 4.7413, "step": 20230 }, { "epoch": 0.41437142915651304, "grad_norm": 0.24406488239765167, "learning_rate": 0.00025649188655647837, "loss": 4.7605, "step": 20235 }, { "epoch": 0.41447381893391766, "grad_norm": 0.23760120570659637, "learning_rate": 0.0002564482868593073, "loss": 4.7714, "step": 20240 }, { "epoch": 0.4145762087113223, "grad_norm": 0.25055694580078125, "learning_rate": 0.00025640466903700663, "loss": 4.7181, "step": 20245 }, { "epoch": 0.4146785984887269, "grad_norm": 0.2715224325656891, "learning_rate": 0.0002563610330970032, "loss": 4.7606, "step": 20250 }, { "epoch": 0.4147809882661315, "grad_norm": 0.246859610080719, "learning_rate": 0.00025631737904672705, "loss": 4.7482, "step": 20255 }, { "epoch": 0.41488337804353614, "grad_norm": 0.25667813420295715, "learning_rate": 0.00025627370689361125, "loss": 4.7504, "step": 20260 }, { "epoch": 0.41498576782094077, "grad_norm": 0.24133725464344025, "learning_rate": 0.00025623001664509186, "loss": 4.7251, "step": 20265 }, { "epoch": 0.4150881575983454, "grad_norm": 0.25921839475631714, "learning_rate": 0.0002561863083086082, "loss": 4.7509, "step": 20270 }, { "epoch": 0.41519054737575, "grad_norm": 0.23406857252120972, "learning_rate": 0.0002561425818916024, "loss": 4.7663, "step": 20275 }, { "epoch": 0.41529293715315463, "grad_norm": 0.2401791661977768, "learning_rate": 0.00025609883740151994, "loss": 4.7768, "step": 20280 }, { "epoch": 0.41539532693055925, "grad_norm": 0.25224336981773376, "learning_rate": 0.0002560550748458093, "loss": 4.7404, "step": 20285 }, { "epoch": 0.41549771670796387, "grad_norm": 0.2500647008419037, "learning_rate": 0.00025601129423192187, "loss": 4.7744, "step": 20290 }, { "epoch": 0.4156001064853685, "grad_norm": 0.24695204198360443, "learning_rate": 0.00025596749556731235, "loss": 4.7639, "step": 20295 }, { "epoch": 0.4157024962627731, "grad_norm": 0.2416968047618866, "learning_rate": 0.00025592367885943847, "loss": 4.7445, "step": 20300 }, { "epoch": 0.41580488604017773, "grad_norm": 0.2485630363225937, "learning_rate": 0.0002558798441157608, "loss": 4.7599, "step": 20305 }, { "epoch": 0.41590727581758236, "grad_norm": 0.2630630135536194, "learning_rate": 0.0002558359913437433, "loss": 4.7542, "step": 20310 }, { "epoch": 0.416009665594987, "grad_norm": 0.2438594102859497, "learning_rate": 0.00025579212055085285, "loss": 4.7407, "step": 20315 }, { "epoch": 0.4161120553723916, "grad_norm": 0.2848489284515381, "learning_rate": 0.00025574823174455923, "loss": 4.766, "step": 20320 }, { "epoch": 0.4162144451497962, "grad_norm": 0.24623343348503113, "learning_rate": 0.00025570432493233575, "loss": 4.7318, "step": 20325 }, { "epoch": 0.41631683492720084, "grad_norm": 0.24903014302253723, "learning_rate": 0.00025566040012165825, "loss": 4.7153, "step": 20330 }, { "epoch": 0.4164192247046055, "grad_norm": 0.24170295894145966, "learning_rate": 0.00025561645732000604, "loss": 4.7581, "step": 20335 }, { "epoch": 0.41652161448201014, "grad_norm": 0.24682767689228058, "learning_rate": 0.00025557249653486127, "loss": 4.6759, "step": 20340 }, { "epoch": 0.41662400425941476, "grad_norm": 0.2513619363307953, "learning_rate": 0.0002555285177737091, "loss": 4.7425, "step": 20345 }, { "epoch": 0.4167263940368194, "grad_norm": 0.257161021232605, "learning_rate": 0.0002554845210440381, "loss": 4.7456, "step": 20350 }, { "epoch": 0.416828783814224, "grad_norm": 0.25595664978027344, "learning_rate": 0.00025544050635333953, "loss": 4.763, "step": 20355 }, { "epoch": 0.4169311735916286, "grad_norm": 0.26715409755706787, "learning_rate": 0.0002553964737091078, "loss": 4.7702, "step": 20360 }, { "epoch": 0.41703356336903324, "grad_norm": 0.25047749280929565, "learning_rate": 0.00025535242311884056, "loss": 4.7518, "step": 20365 }, { "epoch": 0.41713595314643787, "grad_norm": 0.25475624203681946, "learning_rate": 0.0002553083545900383, "loss": 4.7446, "step": 20370 }, { "epoch": 0.4172383429238425, "grad_norm": 0.25762513279914856, "learning_rate": 0.0002552642681302046, "loss": 4.7581, "step": 20375 }, { "epoch": 0.4173407327012471, "grad_norm": 0.24678002297878265, "learning_rate": 0.0002552201637468463, "loss": 4.7205, "step": 20380 }, { "epoch": 0.41744312247865173, "grad_norm": 0.2610575556755066, "learning_rate": 0.0002551760414474729, "loss": 4.793, "step": 20385 }, { "epoch": 0.41754551225605635, "grad_norm": 0.2367083877325058, "learning_rate": 0.00025513190123959735, "loss": 4.7223, "step": 20390 }, { "epoch": 0.41764790203346097, "grad_norm": 0.2624305486679077, "learning_rate": 0.0002550877431307354, "loss": 4.8046, "step": 20395 }, { "epoch": 0.4177502918108656, "grad_norm": 0.25630515813827515, "learning_rate": 0.00025504356712840597, "loss": 4.7957, "step": 20400 }, { "epoch": 0.4178526815882702, "grad_norm": 0.2714109420776367, "learning_rate": 0.0002549993732401309, "loss": 4.7571, "step": 20405 }, { "epoch": 0.41795507136567484, "grad_norm": 0.2652490437030792, "learning_rate": 0.00025495516147343533, "loss": 4.7841, "step": 20410 }, { "epoch": 0.41805746114307946, "grad_norm": 0.27956244349479675, "learning_rate": 0.00025491093183584705, "loss": 4.7662, "step": 20415 }, { "epoch": 0.4181598509204841, "grad_norm": 0.25715363025665283, "learning_rate": 0.00025486668433489726, "loss": 4.7849, "step": 20420 }, { "epoch": 0.4182622406978887, "grad_norm": 0.24916450679302216, "learning_rate": 0.0002548224189781201, "loss": 4.7661, "step": 20425 }, { "epoch": 0.4183646304752933, "grad_norm": 0.24065448343753815, "learning_rate": 0.0002547781357730525, "loss": 4.7972, "step": 20430 }, { "epoch": 0.418467020252698, "grad_norm": 0.3225402235984802, "learning_rate": 0.00025473383472723486, "loss": 4.7533, "step": 20435 }, { "epoch": 0.4185694100301026, "grad_norm": 0.25148963928222656, "learning_rate": 0.00025468951584821023, "loss": 4.7826, "step": 20440 }, { "epoch": 0.41867179980750724, "grad_norm": 0.25360456109046936, "learning_rate": 0.00025464517914352495, "loss": 4.7068, "step": 20445 }, { "epoch": 0.41877418958491186, "grad_norm": 0.2516668438911438, "learning_rate": 0.0002546008246207283, "loss": 4.7593, "step": 20450 }, { "epoch": 0.4188765793623165, "grad_norm": 0.2328442484140396, "learning_rate": 0.0002545564522873726, "loss": 4.7209, "step": 20455 }, { "epoch": 0.4189789691397211, "grad_norm": 0.25439298152923584, "learning_rate": 0.00025451206215101316, "loss": 4.7671, "step": 20460 }, { "epoch": 0.4190813589171257, "grad_norm": 0.30829206109046936, "learning_rate": 0.0002544676542192084, "loss": 4.6941, "step": 20465 }, { "epoch": 0.41918374869453034, "grad_norm": 0.2314857691526413, "learning_rate": 0.00025442322849951975, "loss": 4.7617, "step": 20470 }, { "epoch": 0.41928613847193497, "grad_norm": 0.27061936259269714, "learning_rate": 0.00025437878499951156, "loss": 4.7803, "step": 20475 }, { "epoch": 0.4193885282493396, "grad_norm": 0.24506454169750214, "learning_rate": 0.0002543343237267514, "loss": 4.7624, "step": 20480 }, { "epoch": 0.4194909180267442, "grad_norm": 0.26191064715385437, "learning_rate": 0.0002542898446888097, "loss": 4.769, "step": 20485 }, { "epoch": 0.41959330780414883, "grad_norm": 0.2626371681690216, "learning_rate": 0.00025424534789326005, "loss": 4.7364, "step": 20490 }, { "epoch": 0.41969569758155345, "grad_norm": 0.2845771610736847, "learning_rate": 0.00025420083334767895, "loss": 4.7913, "step": 20495 }, { "epoch": 0.41979808735895807, "grad_norm": 0.369780957698822, "learning_rate": 0.000254156301059646, "loss": 4.7815, "step": 20500 }, { "epoch": 0.4199004771363627, "grad_norm": 0.27729693055152893, "learning_rate": 0.0002541117510367437, "loss": 4.8546, "step": 20505 }, { "epoch": 0.4200028669137673, "grad_norm": 0.24958054721355438, "learning_rate": 0.00025406718328655777, "loss": 4.8021, "step": 20510 }, { "epoch": 0.42010525669117194, "grad_norm": 0.25474047660827637, "learning_rate": 0.00025402259781667676, "loss": 4.765, "step": 20515 }, { "epoch": 0.42020764646857656, "grad_norm": 0.24423550069332123, "learning_rate": 0.0002539779946346924, "loss": 4.7385, "step": 20520 }, { "epoch": 0.4203100362459812, "grad_norm": 0.24203768372535706, "learning_rate": 0.0002539333737481992, "loss": 4.7787, "step": 20525 }, { "epoch": 0.4204124260233858, "grad_norm": 0.25625908374786377, "learning_rate": 0.00025388873516479495, "loss": 4.7171, "step": 20530 }, { "epoch": 0.4205148158007905, "grad_norm": 0.24318137764930725, "learning_rate": 0.0002538440788920803, "loss": 4.7654, "step": 20535 }, { "epoch": 0.4206172055781951, "grad_norm": 0.2454153299331665, "learning_rate": 0.00025379940493765895, "loss": 4.7934, "step": 20540 }, { "epoch": 0.4207195953555997, "grad_norm": 0.2600332200527191, "learning_rate": 0.00025375471330913764, "loss": 4.8145, "step": 20545 }, { "epoch": 0.42082198513300434, "grad_norm": 0.23698943853378296, "learning_rate": 0.0002537100040141261, "loss": 4.7997, "step": 20550 }, { "epoch": 0.42092437491040896, "grad_norm": 0.2716785669326782, "learning_rate": 0.000253665277060237, "loss": 4.7199, "step": 20555 }, { "epoch": 0.4210267646878136, "grad_norm": 0.2378966361284256, "learning_rate": 0.00025362053245508607, "loss": 4.7607, "step": 20560 }, { "epoch": 0.4211291544652182, "grad_norm": 0.2592945396900177, "learning_rate": 0.00025357577020629205, "loss": 4.7984, "step": 20565 }, { "epoch": 0.4212315442426228, "grad_norm": 0.2741259038448334, "learning_rate": 0.0002535309903214768, "loss": 4.731, "step": 20570 }, { "epoch": 0.42133393402002745, "grad_norm": 0.2415311634540558, "learning_rate": 0.0002534861928082649, "loss": 4.7383, "step": 20575 }, { "epoch": 0.42143632379743207, "grad_norm": 0.2703542709350586, "learning_rate": 0.00025344137767428424, "loss": 4.7785, "step": 20580 }, { "epoch": 0.4215387135748367, "grad_norm": 0.23549619317054749, "learning_rate": 0.00025339654492716543, "loss": 4.7548, "step": 20585 }, { "epoch": 0.4216411033522413, "grad_norm": 0.2453819215297699, "learning_rate": 0.0002533516945745423, "loss": 4.7905, "step": 20590 }, { "epoch": 0.42174349312964593, "grad_norm": 0.26156413555145264, "learning_rate": 0.00025330682662405167, "loss": 4.7545, "step": 20595 }, { "epoch": 0.42184588290705055, "grad_norm": 0.28923532366752625, "learning_rate": 0.00025326194108333313, "loss": 4.7925, "step": 20600 }, { "epoch": 0.4219482726844552, "grad_norm": 0.24263493716716766, "learning_rate": 0.0002532170379600295, "loss": 4.7576, "step": 20605 }, { "epoch": 0.4220506624618598, "grad_norm": 0.2529463469982147, "learning_rate": 0.0002531721172617865, "loss": 4.7117, "step": 20610 }, { "epoch": 0.4221530522392644, "grad_norm": 0.2368769347667694, "learning_rate": 0.0002531271789962528, "loss": 4.7411, "step": 20615 }, { "epoch": 0.42225544201666904, "grad_norm": 0.253793329000473, "learning_rate": 0.00025308222317108026, "loss": 4.768, "step": 20620 }, { "epoch": 0.42235783179407366, "grad_norm": 0.2393905222415924, "learning_rate": 0.0002530372497939234, "loss": 4.7746, "step": 20625 }, { "epoch": 0.4224602215714783, "grad_norm": 0.24153396487236023, "learning_rate": 0.00025299225887244004, "loss": 4.7652, "step": 20630 }, { "epoch": 0.42256261134888295, "grad_norm": 0.24952054023742676, "learning_rate": 0.00025294725041429087, "loss": 4.699, "step": 20635 }, { "epoch": 0.4226650011262876, "grad_norm": 0.24828636646270752, "learning_rate": 0.0002529022244271394, "loss": 4.7754, "step": 20640 }, { "epoch": 0.4227673909036922, "grad_norm": 0.24602501094341278, "learning_rate": 0.0002528571809186525, "loss": 4.7367, "step": 20645 }, { "epoch": 0.4228697806810968, "grad_norm": 0.31363430619239807, "learning_rate": 0.00025281211989649957, "loss": 4.7321, "step": 20650 }, { "epoch": 0.42297217045850144, "grad_norm": 0.25800901651382446, "learning_rate": 0.00025276704136835345, "loss": 4.7531, "step": 20655 }, { "epoch": 0.42307456023590606, "grad_norm": 0.24754787981510162, "learning_rate": 0.0002527219453418896, "loss": 4.7282, "step": 20660 }, { "epoch": 0.4231769500133107, "grad_norm": 0.24348415434360504, "learning_rate": 0.00025267683182478664, "loss": 4.7162, "step": 20665 }, { "epoch": 0.4232793397907153, "grad_norm": 0.27512019872665405, "learning_rate": 0.00025263170082472607, "loss": 4.7666, "step": 20670 }, { "epoch": 0.4233817295681199, "grad_norm": 0.3168177902698517, "learning_rate": 0.0002525865523493925, "loss": 4.7289, "step": 20675 }, { "epoch": 0.42348411934552455, "grad_norm": 0.24315686523914337, "learning_rate": 0.0002525413864064734, "loss": 4.7274, "step": 20680 }, { "epoch": 0.42358650912292917, "grad_norm": 0.2777538001537323, "learning_rate": 0.0002524962030036593, "loss": 4.7145, "step": 20685 }, { "epoch": 0.4236888989003338, "grad_norm": 0.25496119260787964, "learning_rate": 0.00025245100214864357, "loss": 4.8036, "step": 20690 }, { "epoch": 0.4237912886777384, "grad_norm": 0.25127914547920227, "learning_rate": 0.0002524057838491227, "loss": 4.7427, "step": 20695 }, { "epoch": 0.42389367845514303, "grad_norm": 0.26537182927131653, "learning_rate": 0.000252360548112796, "loss": 4.7496, "step": 20700 }, { "epoch": 0.42399606823254765, "grad_norm": 0.2693651020526886, "learning_rate": 0.00025231529494736593, "loss": 4.7514, "step": 20705 }, { "epoch": 0.4240984580099523, "grad_norm": 0.2502761483192444, "learning_rate": 0.0002522700243605378, "loss": 4.7721, "step": 20710 }, { "epoch": 0.4242008477873569, "grad_norm": 0.263739675283432, "learning_rate": 0.00025222473636001995, "loss": 4.7405, "step": 20715 }, { "epoch": 0.4243032375647615, "grad_norm": 0.2528032958507538, "learning_rate": 0.0002521794309535235, "loss": 4.7784, "step": 20720 }, { "epoch": 0.42440562734216614, "grad_norm": 0.25531187653541565, "learning_rate": 0.00025213410814876284, "loss": 4.7421, "step": 20725 }, { "epoch": 0.42450801711957076, "grad_norm": 0.23182035982608795, "learning_rate": 0.0002520887679534551, "loss": 4.7566, "step": 20730 }, { "epoch": 0.4246104068969754, "grad_norm": 0.26235559582710266, "learning_rate": 0.0002520434103753204, "loss": 4.7696, "step": 20735 }, { "epoch": 0.42471279667438006, "grad_norm": 0.2579345405101776, "learning_rate": 0.0002519980354220819, "loss": 4.7361, "step": 20740 }, { "epoch": 0.4248151864517847, "grad_norm": 0.24372094869613647, "learning_rate": 0.00025195264310146564, "loss": 4.7092, "step": 20745 }, { "epoch": 0.4249175762291893, "grad_norm": 0.26634448766708374, "learning_rate": 0.0002519072334212006, "loss": 4.807, "step": 20750 }, { "epoch": 0.4250199660065939, "grad_norm": 0.2433907836675644, "learning_rate": 0.00025186180638901885, "loss": 4.737, "step": 20755 }, { "epoch": 0.42512235578399854, "grad_norm": 0.24627669155597687, "learning_rate": 0.00025181636201265534, "loss": 4.7326, "step": 20760 }, { "epoch": 0.42522474556140316, "grad_norm": 0.2820776700973511, "learning_rate": 0.00025177090029984787, "loss": 4.8008, "step": 20765 }, { "epoch": 0.4253271353388078, "grad_norm": 0.23752164840698242, "learning_rate": 0.00025172542125833736, "loss": 4.7742, "step": 20770 }, { "epoch": 0.4254295251162124, "grad_norm": 0.24931824207305908, "learning_rate": 0.0002516799248958675, "loss": 4.7357, "step": 20775 }, { "epoch": 0.425531914893617, "grad_norm": 0.28673723340034485, "learning_rate": 0.00025163441122018517, "loss": 4.7612, "step": 20780 }, { "epoch": 0.42563430467102165, "grad_norm": 0.2612729072570801, "learning_rate": 0.00025158888023903993, "loss": 4.736, "step": 20785 }, { "epoch": 0.42573669444842627, "grad_norm": 0.8941785097122192, "learning_rate": 0.00025154333196018456, "loss": 4.7835, "step": 20790 }, { "epoch": 0.4258390842258309, "grad_norm": 0.2597096264362335, "learning_rate": 0.0002514977663913745, "loss": 4.7651, "step": 20795 }, { "epoch": 0.4259414740032355, "grad_norm": 0.26881104707717896, "learning_rate": 0.0002514521835403684, "loss": 4.7823, "step": 20800 }, { "epoch": 0.42604386378064013, "grad_norm": 0.23725557327270508, "learning_rate": 0.00025140658341492767, "loss": 4.7017, "step": 20805 }, { "epoch": 0.42614625355804475, "grad_norm": 0.23989666998386383, "learning_rate": 0.00025136096602281665, "loss": 4.7279, "step": 20810 }, { "epoch": 0.4262486433354494, "grad_norm": 0.33615273237228394, "learning_rate": 0.0002513153313718028, "loss": 4.7719, "step": 20815 }, { "epoch": 0.426351033112854, "grad_norm": 0.2352549284696579, "learning_rate": 0.00025126967946965636, "loss": 4.7529, "step": 20820 }, { "epoch": 0.4264534228902586, "grad_norm": 0.2769213616847992, "learning_rate": 0.00025122401032415053, "loss": 4.7272, "step": 20825 }, { "epoch": 0.42655581266766324, "grad_norm": 0.2510347366333008, "learning_rate": 0.00025117832394306157, "loss": 4.7821, "step": 20830 }, { "epoch": 0.42665820244506786, "grad_norm": 0.2542051076889038, "learning_rate": 0.00025113262033416847, "loss": 4.7858, "step": 20835 }, { "epoch": 0.42676059222247253, "grad_norm": 0.24848873913288116, "learning_rate": 0.00025108689950525335, "loss": 4.8271, "step": 20840 }, { "epoch": 0.42686298199987716, "grad_norm": 0.3308088481426239, "learning_rate": 0.0002510411614641011, "loss": 4.7345, "step": 20845 }, { "epoch": 0.4269653717772818, "grad_norm": 0.24253885447978973, "learning_rate": 0.0002509954062184996, "loss": 4.7771, "step": 20850 }, { "epoch": 0.4270677615546864, "grad_norm": 0.24381297826766968, "learning_rate": 0.0002509496337762398, "loss": 4.7391, "step": 20855 }, { "epoch": 0.427170151332091, "grad_norm": 0.26204216480255127, "learning_rate": 0.0002509038441451153, "loss": 4.7607, "step": 20860 }, { "epoch": 0.42727254110949564, "grad_norm": 0.2552087604999542, "learning_rate": 0.00025085803733292285, "loss": 4.757, "step": 20865 }, { "epoch": 0.42737493088690026, "grad_norm": 0.2958095371723175, "learning_rate": 0.0002508122133474621, "loss": 4.7253, "step": 20870 }, { "epoch": 0.4274773206643049, "grad_norm": 0.2517816126346588, "learning_rate": 0.0002507663721965355, "loss": 4.7462, "step": 20875 }, { "epoch": 0.4275797104417095, "grad_norm": 0.22977468371391296, "learning_rate": 0.0002507205138879485, "loss": 4.7558, "step": 20880 }, { "epoch": 0.4276821002191141, "grad_norm": 0.25457119941711426, "learning_rate": 0.0002506746384295095, "loss": 4.7889, "step": 20885 }, { "epoch": 0.42778448999651875, "grad_norm": 0.261680543422699, "learning_rate": 0.0002506287458290298, "loss": 4.7809, "step": 20890 }, { "epoch": 0.42788687977392337, "grad_norm": 0.2524440884590149, "learning_rate": 0.00025058283609432356, "loss": 4.7378, "step": 20895 }, { "epoch": 0.427989269551328, "grad_norm": 0.25424009561538696, "learning_rate": 0.00025053690923320797, "loss": 4.7405, "step": 20900 }, { "epoch": 0.4280916593287326, "grad_norm": 0.24107684195041656, "learning_rate": 0.00025049096525350307, "loss": 4.7329, "step": 20905 }, { "epoch": 0.42819404910613723, "grad_norm": 0.2617661654949188, "learning_rate": 0.0002504450041630318, "loss": 4.7606, "step": 20910 }, { "epoch": 0.42829643888354185, "grad_norm": 0.2651958167552948, "learning_rate": 0.00025039902596962, "loss": 4.7175, "step": 20915 }, { "epoch": 0.4283988286609465, "grad_norm": 0.2467733472585678, "learning_rate": 0.00025035303068109653, "loss": 4.7578, "step": 20920 }, { "epoch": 0.4285012184383511, "grad_norm": 0.278435617685318, "learning_rate": 0.00025030701830529306, "loss": 4.7412, "step": 20925 }, { "epoch": 0.4286036082157557, "grad_norm": 0.27353397011756897, "learning_rate": 0.0002502609888500441, "loss": 4.7226, "step": 20930 }, { "epoch": 0.42870599799316034, "grad_norm": 0.2490510493516922, "learning_rate": 0.00025021494232318727, "loss": 4.7062, "step": 20935 }, { "epoch": 0.428808387770565, "grad_norm": 0.7028024792671204, "learning_rate": 0.00025016887873256295, "loss": 4.6703, "step": 20940 }, { "epoch": 0.42891077754796963, "grad_norm": 0.2787671983242035, "learning_rate": 0.0002501227980860145, "loss": 4.7194, "step": 20945 }, { "epoch": 0.42901316732537426, "grad_norm": 0.24122415482997894, "learning_rate": 0.0002500767003913881, "loss": 4.7723, "step": 20950 }, { "epoch": 0.4291155571027789, "grad_norm": 0.24152274429798126, "learning_rate": 0.00025003058565653297, "loss": 4.7381, "step": 20955 }, { "epoch": 0.4292179468801835, "grad_norm": 0.23557929694652557, "learning_rate": 0.000249984453889301, "loss": 4.7286, "step": 20960 }, { "epoch": 0.4293203366575881, "grad_norm": 0.30461108684539795, "learning_rate": 0.0002499383050975472, "loss": 4.7933, "step": 20965 }, { "epoch": 0.42942272643499274, "grad_norm": 0.24944448471069336, "learning_rate": 0.00024989213928912953, "loss": 4.7598, "step": 20970 }, { "epoch": 0.42952511621239736, "grad_norm": 0.23622015118598938, "learning_rate": 0.00024984595647190846, "loss": 4.7701, "step": 20975 }, { "epoch": 0.429627505989802, "grad_norm": 0.24173890054225922, "learning_rate": 0.00024979975665374783, "loss": 4.7568, "step": 20980 }, { "epoch": 0.4297298957672066, "grad_norm": 0.26110169291496277, "learning_rate": 0.0002497535398425141, "loss": 4.6862, "step": 20985 }, { "epoch": 0.4298322855446112, "grad_norm": 0.2331186681985855, "learning_rate": 0.00024970730604607656, "loss": 4.7182, "step": 20990 }, { "epoch": 0.42993467532201585, "grad_norm": 0.24164794385433197, "learning_rate": 0.0002496610552723077, "loss": 4.7811, "step": 20995 }, { "epoch": 0.43003706509942047, "grad_norm": 0.28270652890205383, "learning_rate": 0.0002496147875290827, "loss": 4.8024, "step": 21000 }, { "epoch": 0.43003706509942047, "eval_loss": 4.769644737243652, "eval_runtime": 17.2364, "eval_samples_per_second": 33.766, "eval_steps_per_second": 4.235, "step": 21000 }, { "epoch": 0.4301394548768251, "grad_norm": 0.24345621466636658, "learning_rate": 0.00024956850282427953, "loss": 4.7666, "step": 21005 }, { "epoch": 0.4302418446542297, "grad_norm": 0.2555341422557831, "learning_rate": 0.00024952220116577925, "loss": 4.7304, "step": 21010 }, { "epoch": 0.43034423443163433, "grad_norm": 0.25171372294425964, "learning_rate": 0.0002494758825614657, "loss": 4.8079, "step": 21015 }, { "epoch": 0.43044662420903895, "grad_norm": 0.25784316658973694, "learning_rate": 0.00024942954701922556, "loss": 4.815, "step": 21020 }, { "epoch": 0.4305490139864436, "grad_norm": 0.4336875379085541, "learning_rate": 0.0002493831945469486, "loss": 4.7852, "step": 21025 }, { "epoch": 0.4306514037638482, "grad_norm": 0.24383743107318878, "learning_rate": 0.00024933682515252716, "loss": 4.7327, "step": 21030 }, { "epoch": 0.4307537935412528, "grad_norm": 0.2472480684518814, "learning_rate": 0.0002492904388438568, "loss": 4.749, "step": 21035 }, { "epoch": 0.4308561833186575, "grad_norm": 0.25940173864364624, "learning_rate": 0.00024924403562883574, "loss": 4.7539, "step": 21040 }, { "epoch": 0.4309585730960621, "grad_norm": 0.27517426013946533, "learning_rate": 0.0002491976155153651, "loss": 4.7581, "step": 21045 }, { "epoch": 0.43106096287346674, "grad_norm": 0.272371768951416, "learning_rate": 0.00024915117851134896, "loss": 4.7662, "step": 21050 }, { "epoch": 0.43116335265087136, "grad_norm": 0.27990251779556274, "learning_rate": 0.00024910472462469417, "loss": 4.7864, "step": 21055 }, { "epoch": 0.431265742428276, "grad_norm": 0.320916086435318, "learning_rate": 0.00024905825386331055, "loss": 4.7337, "step": 21060 }, { "epoch": 0.4313681322056806, "grad_norm": 0.23804743587970734, "learning_rate": 0.0002490117662351108, "loss": 4.7596, "step": 21065 }, { "epoch": 0.4314705219830852, "grad_norm": 0.24924394488334656, "learning_rate": 0.00024896526174801034, "loss": 4.72, "step": 21070 }, { "epoch": 0.43157291176048984, "grad_norm": 0.2520267963409424, "learning_rate": 0.00024891874040992764, "loss": 4.6497, "step": 21075 }, { "epoch": 0.43167530153789446, "grad_norm": 0.2740488052368164, "learning_rate": 0.00024887220222878393, "loss": 4.7016, "step": 21080 }, { "epoch": 0.4317776913152991, "grad_norm": 0.2445664405822754, "learning_rate": 0.00024882564721250336, "loss": 4.7656, "step": 21085 }, { "epoch": 0.4318800810927037, "grad_norm": 0.28102514147758484, "learning_rate": 0.0002487790753690129, "loss": 4.749, "step": 21090 }, { "epoch": 0.4319824708701083, "grad_norm": 0.282574862241745, "learning_rate": 0.00024873248670624257, "loss": 4.7722, "step": 21095 }, { "epoch": 0.43208486064751295, "grad_norm": 0.2809067964553833, "learning_rate": 0.0002486858812321249, "loss": 4.7505, "step": 21100 }, { "epoch": 0.43218725042491757, "grad_norm": 0.27816784381866455, "learning_rate": 0.00024863925895459555, "loss": 4.7697, "step": 21105 }, { "epoch": 0.4322896402023222, "grad_norm": 0.25122854113578796, "learning_rate": 0.00024859261988159306, "loss": 4.742, "step": 21110 }, { "epoch": 0.4323920299797268, "grad_norm": 0.23729555308818817, "learning_rate": 0.0002485459640210586, "loss": 4.7465, "step": 21115 }, { "epoch": 0.43249441975713143, "grad_norm": 0.2549649178981781, "learning_rate": 0.0002484992913809364, "loss": 4.7455, "step": 21120 }, { "epoch": 0.43259680953453605, "grad_norm": 0.24888847768306732, "learning_rate": 0.0002484526019691736, "loss": 4.7683, "step": 21125 }, { "epoch": 0.4326991993119407, "grad_norm": 0.266985148191452, "learning_rate": 0.0002484058957937199, "loss": 4.7236, "step": 21130 }, { "epoch": 0.4328015890893453, "grad_norm": 0.2714773118495941, "learning_rate": 0.00024835917286252814, "loss": 4.767, "step": 21135 }, { "epoch": 0.4329039788667499, "grad_norm": 0.23292596638202667, "learning_rate": 0.00024831243318355396, "loss": 4.8115, "step": 21140 }, { "epoch": 0.4330063686441546, "grad_norm": 0.24832096695899963, "learning_rate": 0.0002482656767647557, "loss": 4.6607, "step": 21145 }, { "epoch": 0.4331087584215592, "grad_norm": 0.2521432042121887, "learning_rate": 0.0002482189036140947, "loss": 4.7279, "step": 21150 }, { "epoch": 0.43321114819896384, "grad_norm": 0.2571776509284973, "learning_rate": 0.0002481721137395351, "loss": 4.734, "step": 21155 }, { "epoch": 0.43331353797636846, "grad_norm": 0.2571667730808258, "learning_rate": 0.0002481253071490439, "loss": 4.7468, "step": 21160 }, { "epoch": 0.4334159277537731, "grad_norm": 0.25185245275497437, "learning_rate": 0.00024807848385059093, "loss": 4.72, "step": 21165 }, { "epoch": 0.4335183175311777, "grad_norm": 0.5772988796234131, "learning_rate": 0.00024803164385214885, "loss": 4.7641, "step": 21170 }, { "epoch": 0.4336207073085823, "grad_norm": 0.2447091042995453, "learning_rate": 0.0002479847871616932, "loss": 4.729, "step": 21175 }, { "epoch": 0.43372309708598694, "grad_norm": 0.29918625950813293, "learning_rate": 0.0002479379137872024, "loss": 4.7834, "step": 21180 }, { "epoch": 0.43382548686339156, "grad_norm": 0.24188275635242462, "learning_rate": 0.00024789102373665763, "loss": 4.7368, "step": 21185 }, { "epoch": 0.4339278766407962, "grad_norm": 0.24988071620464325, "learning_rate": 0.0002478441170180429, "loss": 4.7412, "step": 21190 }, { "epoch": 0.4340302664182008, "grad_norm": 0.2506921589374542, "learning_rate": 0.00024779719363934513, "loss": 4.7174, "step": 21195 }, { "epoch": 0.4341326561956054, "grad_norm": 0.25739920139312744, "learning_rate": 0.000247750253608554, "loss": 4.7428, "step": 21200 }, { "epoch": 0.43423504597301005, "grad_norm": 0.2615566551685333, "learning_rate": 0.00024770329693366206, "loss": 4.7792, "step": 21205 }, { "epoch": 0.43433743575041467, "grad_norm": 0.2501102685928345, "learning_rate": 0.00024765632362266487, "loss": 4.7425, "step": 21210 }, { "epoch": 0.4344398255278193, "grad_norm": 0.24423962831497192, "learning_rate": 0.0002476093336835604, "loss": 4.7163, "step": 21215 }, { "epoch": 0.4345422153052239, "grad_norm": 0.23754578828811646, "learning_rate": 0.0002475623271243499, "loss": 4.7604, "step": 21220 }, { "epoch": 0.43464460508262853, "grad_norm": 0.24553988873958588, "learning_rate": 0.0002475153039530372, "loss": 4.7635, "step": 21225 }, { "epoch": 0.43474699486003315, "grad_norm": 0.2519501745700836, "learning_rate": 0.000247468264177629, "loss": 4.7251, "step": 21230 }, { "epoch": 0.4348493846374378, "grad_norm": 0.2671903073787689, "learning_rate": 0.00024742120780613477, "loss": 4.7707, "step": 21235 }, { "epoch": 0.4349517744148424, "grad_norm": 0.27295219898223877, "learning_rate": 0.00024737413484656704, "loss": 4.7181, "step": 21240 }, { "epoch": 0.4350541641922471, "grad_norm": 0.24619005620479584, "learning_rate": 0.0002473270453069409, "loss": 4.7404, "step": 21245 }, { "epoch": 0.4351565539696517, "grad_norm": 0.26180773973464966, "learning_rate": 0.00024727993919527433, "loss": 4.7374, "step": 21250 }, { "epoch": 0.4352589437470563, "grad_norm": 0.2482661008834839, "learning_rate": 0.0002472328165195883, "loss": 4.7288, "step": 21255 }, { "epoch": 0.43536133352446094, "grad_norm": 0.25037941336631775, "learning_rate": 0.00024718567728790634, "loss": 4.7517, "step": 21260 }, { "epoch": 0.43546372330186556, "grad_norm": 0.23189325630664825, "learning_rate": 0.000247138521508255, "loss": 4.7501, "step": 21265 }, { "epoch": 0.4355661130792702, "grad_norm": 0.28822433948516846, "learning_rate": 0.0002470913491886635, "loss": 4.7335, "step": 21270 }, { "epoch": 0.4356685028566748, "grad_norm": 0.2456277757883072, "learning_rate": 0.00024704416033716404, "loss": 4.7491, "step": 21275 }, { "epoch": 0.4357708926340794, "grad_norm": 0.2418244481086731, "learning_rate": 0.0002469969549617915, "loss": 4.7966, "step": 21280 }, { "epoch": 0.43587328241148404, "grad_norm": 0.2540426552295685, "learning_rate": 0.00024694973307058356, "loss": 4.7318, "step": 21285 }, { "epoch": 0.43597567218888866, "grad_norm": 0.2391122579574585, "learning_rate": 0.00024690249467158095, "loss": 4.7782, "step": 21290 }, { "epoch": 0.4360780619662933, "grad_norm": 0.24399574100971222, "learning_rate": 0.00024685523977282684, "loss": 4.699, "step": 21295 }, { "epoch": 0.4361804517436979, "grad_norm": 0.2370738983154297, "learning_rate": 0.00024680796838236744, "loss": 4.7699, "step": 21300 }, { "epoch": 0.4362828415211025, "grad_norm": 0.24855202436447144, "learning_rate": 0.00024676068050825186, "loss": 4.7625, "step": 21305 }, { "epoch": 0.43638523129850715, "grad_norm": 0.2379562258720398, "learning_rate": 0.00024671337615853175, "loss": 4.7163, "step": 21310 }, { "epoch": 0.43648762107591177, "grad_norm": 0.2485109567642212, "learning_rate": 0.0002466660553412617, "loss": 4.7674, "step": 21315 }, { "epoch": 0.4365900108533164, "grad_norm": 0.2557595670223236, "learning_rate": 0.00024661871806449923, "loss": 4.7188, "step": 21320 }, { "epoch": 0.436692400630721, "grad_norm": 0.245519757270813, "learning_rate": 0.0002465713643363044, "loss": 4.7351, "step": 21325 }, { "epoch": 0.43679479040812563, "grad_norm": 0.2353067696094513, "learning_rate": 0.0002465239941647403, "loss": 4.7396, "step": 21330 }, { "epoch": 0.43689718018553025, "grad_norm": 0.24086587131023407, "learning_rate": 0.00024647660755787275, "loss": 4.7536, "step": 21335 }, { "epoch": 0.4369995699629349, "grad_norm": 0.28357189893722534, "learning_rate": 0.0002464292045237702, "loss": 4.7301, "step": 21340 }, { "epoch": 0.43710195974033955, "grad_norm": 0.2478339523077011, "learning_rate": 0.00024638178507050415, "loss": 4.8051, "step": 21345 }, { "epoch": 0.4372043495177442, "grad_norm": 0.2508557140827179, "learning_rate": 0.00024633434920614877, "loss": 4.7126, "step": 21350 }, { "epoch": 0.4373067392951488, "grad_norm": 0.25761789083480835, "learning_rate": 0.00024628689693878113, "loss": 4.7555, "step": 21355 }, { "epoch": 0.4374091290725534, "grad_norm": 0.2546573579311371, "learning_rate": 0.0002462394282764809, "loss": 4.7412, "step": 21360 }, { "epoch": 0.43751151884995804, "grad_norm": 0.2487485557794571, "learning_rate": 0.00024619194322733065, "loss": 4.6789, "step": 21365 }, { "epoch": 0.43761390862736266, "grad_norm": 0.24232380092144012, "learning_rate": 0.0002461444417994158, "loss": 4.7147, "step": 21370 }, { "epoch": 0.4377162984047673, "grad_norm": 0.25324276089668274, "learning_rate": 0.00024609692400082443, "loss": 4.7208, "step": 21375 }, { "epoch": 0.4378186881821719, "grad_norm": 0.2620543837547302, "learning_rate": 0.00024604938983964757, "loss": 4.7737, "step": 21380 }, { "epoch": 0.4379210779595765, "grad_norm": 0.23999658226966858, "learning_rate": 0.00024600183932397884, "loss": 4.7306, "step": 21385 }, { "epoch": 0.43802346773698114, "grad_norm": 0.25081706047058105, "learning_rate": 0.0002459542724619148, "loss": 4.7146, "step": 21390 }, { "epoch": 0.43812585751438576, "grad_norm": 0.304277241230011, "learning_rate": 0.00024590668926155473, "loss": 4.7099, "step": 21395 }, { "epoch": 0.4382282472917904, "grad_norm": 0.2500305473804474, "learning_rate": 0.0002458590897310007, "loss": 4.7558, "step": 21400 }, { "epoch": 0.438330637069195, "grad_norm": 0.2485106736421585, "learning_rate": 0.00024581147387835764, "loss": 4.7192, "step": 21405 }, { "epoch": 0.4384330268465996, "grad_norm": 0.25544965267181396, "learning_rate": 0.000245763841711733, "loss": 4.7744, "step": 21410 }, { "epoch": 0.43853541662400425, "grad_norm": 0.2420274317264557, "learning_rate": 0.00024571619323923737, "loss": 4.7154, "step": 21415 }, { "epoch": 0.43863780640140887, "grad_norm": 0.2997153103351593, "learning_rate": 0.00024566852846898384, "loss": 4.7067, "step": 21420 }, { "epoch": 0.4387401961788135, "grad_norm": 0.26231518387794495, "learning_rate": 0.0002456208474090884, "loss": 4.8259, "step": 21425 }, { "epoch": 0.4388425859562181, "grad_norm": 0.23175372183322906, "learning_rate": 0.00024557315006766984, "loss": 4.7194, "step": 21430 }, { "epoch": 0.43894497573362273, "grad_norm": 0.2603735029697418, "learning_rate": 0.0002455254364528495, "loss": 4.7663, "step": 21435 }, { "epoch": 0.43904736551102735, "grad_norm": 0.2584611475467682, "learning_rate": 0.0002454777065727518, "loss": 4.7981, "step": 21440 }, { "epoch": 0.439149755288432, "grad_norm": 0.24514956772327423, "learning_rate": 0.0002454299604355038, "loss": 4.7201, "step": 21445 }, { "epoch": 0.43925214506583665, "grad_norm": 0.24925577640533447, "learning_rate": 0.0002453821980492352, "loss": 4.7701, "step": 21450 }, { "epoch": 0.4393545348432413, "grad_norm": 0.2554652988910675, "learning_rate": 0.0002453344194220787, "loss": 4.7923, "step": 21455 }, { "epoch": 0.4394569246206459, "grad_norm": 0.32403281331062317, "learning_rate": 0.0002452866245621696, "loss": 4.7854, "step": 21460 }, { "epoch": 0.4395593143980505, "grad_norm": 0.2463287115097046, "learning_rate": 0.000245238813477646, "loss": 4.74, "step": 21465 }, { "epoch": 0.43966170417545514, "grad_norm": 0.2618178427219391, "learning_rate": 0.00024519098617664875, "loss": 4.7173, "step": 21470 }, { "epoch": 0.43976409395285976, "grad_norm": 0.24402408301830292, "learning_rate": 0.00024514314266732155, "loss": 4.7305, "step": 21475 }, { "epoch": 0.4398664837302644, "grad_norm": 0.34568819403648376, "learning_rate": 0.0002450952829578107, "loss": 4.7478, "step": 21480 }, { "epoch": 0.439968873507669, "grad_norm": 0.26306089758872986, "learning_rate": 0.0002450474070562655, "loss": 4.6287, "step": 21485 }, { "epoch": 0.4400712632850736, "grad_norm": 0.287149041891098, "learning_rate": 0.00024499951497083776, "loss": 4.7754, "step": 21490 }, { "epoch": 0.44017365306247824, "grad_norm": 0.3165466785430908, "learning_rate": 0.00024495160670968215, "loss": 4.6776, "step": 21495 }, { "epoch": 0.44027604283988286, "grad_norm": 0.26258593797683716, "learning_rate": 0.00024490368228095605, "loss": 4.8052, "step": 21500 }, { "epoch": 0.4403784326172875, "grad_norm": 0.26825210452079773, "learning_rate": 0.0002448557416928198, "loss": 4.7529, "step": 21505 }, { "epoch": 0.4404808223946921, "grad_norm": 0.2573893368244171, "learning_rate": 0.0002448077849534361, "loss": 4.7737, "step": 21510 }, { "epoch": 0.4405832121720967, "grad_norm": 0.24566176533699036, "learning_rate": 0.00024475981207097084, "loss": 4.73, "step": 21515 }, { "epoch": 0.44068560194950135, "grad_norm": 0.2359248548746109, "learning_rate": 0.00024471182305359224, "loss": 4.7148, "step": 21520 }, { "epoch": 0.44078799172690597, "grad_norm": 0.23471441864967346, "learning_rate": 0.00024466381790947165, "loss": 4.7509, "step": 21525 }, { "epoch": 0.4408903815043106, "grad_norm": 0.268425852060318, "learning_rate": 0.0002446157966467829, "loss": 4.7169, "step": 21530 }, { "epoch": 0.4409927712817152, "grad_norm": 0.24401435256004333, "learning_rate": 0.00024456775927370267, "loss": 4.7522, "step": 21535 }, { "epoch": 0.44109516105911983, "grad_norm": 0.24866032600402832, "learning_rate": 0.0002445197057984103, "loss": 4.6797, "step": 21540 }, { "epoch": 0.44119755083652445, "grad_norm": 0.2437106817960739, "learning_rate": 0.00024447163622908803, "loss": 4.794, "step": 21545 }, { "epoch": 0.44129994061392913, "grad_norm": 0.24281218647956848, "learning_rate": 0.00024442355057392067, "loss": 4.7386, "step": 21550 }, { "epoch": 0.44140233039133375, "grad_norm": 0.26372960209846497, "learning_rate": 0.0002443754488410959, "loss": 4.722, "step": 21555 }, { "epoch": 0.4415047201687384, "grad_norm": 0.25185683369636536, "learning_rate": 0.00024432733103880414, "loss": 4.7104, "step": 21560 }, { "epoch": 0.441607109946143, "grad_norm": 0.27980077266693115, "learning_rate": 0.0002442791971752383, "loss": 4.7497, "step": 21565 }, { "epoch": 0.4417094997235476, "grad_norm": 0.28467410802841187, "learning_rate": 0.0002442310472585944, "loss": 4.7019, "step": 21570 }, { "epoch": 0.44181188950095224, "grad_norm": 0.2608586251735687, "learning_rate": 0.0002441828812970709, "loss": 4.6961, "step": 21575 }, { "epoch": 0.44191427927835686, "grad_norm": 0.28476840257644653, "learning_rate": 0.00024413469929886918, "loss": 4.7437, "step": 21580 }, { "epoch": 0.4420166690557615, "grad_norm": 0.28951677680015564, "learning_rate": 0.00024408650127219324, "loss": 4.7665, "step": 21585 }, { "epoch": 0.4421190588331661, "grad_norm": 0.2582278251647949, "learning_rate": 0.0002440382872252498, "loss": 4.7652, "step": 21590 }, { "epoch": 0.4422214486105707, "grad_norm": 0.26101550459861755, "learning_rate": 0.00024399005716624835, "loss": 4.7786, "step": 21595 }, { "epoch": 0.44232383838797534, "grad_norm": 0.2525230348110199, "learning_rate": 0.00024394181110340117, "loss": 4.7128, "step": 21600 }, { "epoch": 0.44242622816537996, "grad_norm": 0.2606767416000366, "learning_rate": 0.00024389354904492315, "loss": 4.7473, "step": 21605 }, { "epoch": 0.4425286179427846, "grad_norm": 0.2644956707954407, "learning_rate": 0.00024384527099903194, "loss": 4.7048, "step": 21610 }, { "epoch": 0.4426310077201892, "grad_norm": 0.2370660901069641, "learning_rate": 0.00024379697697394796, "loss": 4.7694, "step": 21615 }, { "epoch": 0.44273339749759383, "grad_norm": 0.25547847151756287, "learning_rate": 0.00024374866697789429, "loss": 4.7107, "step": 21620 }, { "epoch": 0.44283578727499845, "grad_norm": 0.2429160326719284, "learning_rate": 0.0002437003410190968, "loss": 4.817, "step": 21625 }, { "epoch": 0.44293817705240307, "grad_norm": 0.25499096512794495, "learning_rate": 0.00024365199910578397, "loss": 4.7773, "step": 21630 }, { "epoch": 0.4430405668298077, "grad_norm": 0.23434969782829285, "learning_rate": 0.00024360364124618704, "loss": 4.7555, "step": 21635 }, { "epoch": 0.4431429566072123, "grad_norm": 0.24683192372322083, "learning_rate": 0.00024355526744854008, "loss": 4.6997, "step": 21640 }, { "epoch": 0.44324534638461693, "grad_norm": 0.2424478828907013, "learning_rate": 0.00024350687772107973, "loss": 4.6979, "step": 21645 }, { "epoch": 0.4433477361620216, "grad_norm": 0.27291345596313477, "learning_rate": 0.00024345847207204538, "loss": 4.7471, "step": 21650 }, { "epoch": 0.44345012593942623, "grad_norm": 0.25225764513015747, "learning_rate": 0.00024341005050967915, "loss": 4.7957, "step": 21655 }, { "epoch": 0.44355251571683085, "grad_norm": 0.2572467029094696, "learning_rate": 0.00024336161304222583, "loss": 4.7486, "step": 21660 }, { "epoch": 0.4436549054942355, "grad_norm": 0.45135772228240967, "learning_rate": 0.00024331315967793306, "loss": 4.6839, "step": 21665 }, { "epoch": 0.4437572952716401, "grad_norm": 0.23294630646705627, "learning_rate": 0.00024326469042505093, "loss": 4.7659, "step": 21670 }, { "epoch": 0.4438596850490447, "grad_norm": 0.26809754967689514, "learning_rate": 0.00024321620529183248, "loss": 4.7783, "step": 21675 }, { "epoch": 0.44396207482644934, "grad_norm": 0.24899956583976746, "learning_rate": 0.0002431677042865333, "loss": 4.7864, "step": 21680 }, { "epoch": 0.44406446460385396, "grad_norm": 0.27061086893081665, "learning_rate": 0.00024311918741741178, "loss": 4.7514, "step": 21685 }, { "epoch": 0.4441668543812586, "grad_norm": 0.2461511492729187, "learning_rate": 0.00024307065469272895, "loss": 4.7489, "step": 21690 }, { "epoch": 0.4442692441586632, "grad_norm": 0.6099108457565308, "learning_rate": 0.00024302210612074852, "loss": 4.6654, "step": 21695 }, { "epoch": 0.4443716339360678, "grad_norm": 0.25346803665161133, "learning_rate": 0.00024297354170973708, "loss": 4.7607, "step": 21700 }, { "epoch": 0.44447402371347244, "grad_norm": 0.27422893047332764, "learning_rate": 0.0002429249614679636, "loss": 4.7332, "step": 21705 }, { "epoch": 0.44457641349087706, "grad_norm": 0.25380709767341614, "learning_rate": 0.00024287636540369997, "loss": 4.7358, "step": 21710 }, { "epoch": 0.4446788032682817, "grad_norm": 0.2626173496246338, "learning_rate": 0.00024282775352522077, "loss": 4.7738, "step": 21715 }, { "epoch": 0.4447811930456863, "grad_norm": 0.25741249322891235, "learning_rate": 0.00024277912584080323, "loss": 4.7849, "step": 21720 }, { "epoch": 0.44488358282309093, "grad_norm": 0.253892183303833, "learning_rate": 0.0002427304823587272, "loss": 4.7328, "step": 21725 }, { "epoch": 0.44498597260049555, "grad_norm": 0.24379855394363403, "learning_rate": 0.00024268182308727533, "loss": 4.7645, "step": 21730 }, { "epoch": 0.44508836237790017, "grad_norm": 0.3527223467826843, "learning_rate": 0.00024263314803473296, "loss": 4.7587, "step": 21735 }, { "epoch": 0.4451907521553048, "grad_norm": 0.27313509583473206, "learning_rate": 0.000242584457209388, "loss": 4.775, "step": 21740 }, { "epoch": 0.4452931419327094, "grad_norm": 0.2845230996608734, "learning_rate": 0.00024253575061953115, "loss": 4.764, "step": 21745 }, { "epoch": 0.4453955317101141, "grad_norm": 0.25186723470687866, "learning_rate": 0.00024248702827345573, "loss": 4.7532, "step": 21750 }, { "epoch": 0.4454979214875187, "grad_norm": 0.27293211221694946, "learning_rate": 0.00024243829017945787, "loss": 4.7009, "step": 21755 }, { "epoch": 0.44560031126492333, "grad_norm": 0.2619374990463257, "learning_rate": 0.0002423895363458362, "loss": 4.7033, "step": 21760 }, { "epoch": 0.44570270104232795, "grad_norm": 0.25431960821151733, "learning_rate": 0.0002423407667808921, "loss": 4.7853, "step": 21765 }, { "epoch": 0.4458050908197326, "grad_norm": 0.24628354609012604, "learning_rate": 0.00024229198149292974, "loss": 4.7179, "step": 21770 }, { "epoch": 0.4459074805971372, "grad_norm": 0.27475932240486145, "learning_rate": 0.0002422431804902558, "loss": 4.7177, "step": 21775 }, { "epoch": 0.4460098703745418, "grad_norm": 0.2529583275318146, "learning_rate": 0.00024219436378117976, "loss": 4.724, "step": 21780 }, { "epoch": 0.44611226015194644, "grad_norm": 0.2600528299808502, "learning_rate": 0.00024214553137401364, "loss": 4.7666, "step": 21785 }, { "epoch": 0.44621464992935106, "grad_norm": 0.2494192272424698, "learning_rate": 0.00024209668327707229, "loss": 4.7305, "step": 21790 }, { "epoch": 0.4463170397067557, "grad_norm": 0.2513991594314575, "learning_rate": 0.00024204781949867314, "loss": 4.8177, "step": 21795 }, { "epoch": 0.4464194294841603, "grad_norm": 0.2659304738044739, "learning_rate": 0.00024199894004713632, "loss": 4.7779, "step": 21800 }, { "epoch": 0.4465218192615649, "grad_norm": 0.2444787472486496, "learning_rate": 0.0002419500449307846, "loss": 4.7648, "step": 21805 }, { "epoch": 0.44662420903896954, "grad_norm": 0.288303405046463, "learning_rate": 0.00024190113415794342, "loss": 4.7192, "step": 21810 }, { "epoch": 0.44672659881637417, "grad_norm": 0.24240879714488983, "learning_rate": 0.0002418522077369409, "loss": 4.7486, "step": 21815 }, { "epoch": 0.4468289885937788, "grad_norm": 0.24538648128509521, "learning_rate": 0.00024180326567610792, "loss": 4.7531, "step": 21820 }, { "epoch": 0.4469313783711834, "grad_norm": 0.24396349489688873, "learning_rate": 0.0002417543079837778, "loss": 4.7794, "step": 21825 }, { "epoch": 0.44703376814858803, "grad_norm": 0.2690068483352661, "learning_rate": 0.0002417053346682867, "loss": 4.7016, "step": 21830 }, { "epoch": 0.44713615792599265, "grad_norm": 0.2747132480144501, "learning_rate": 0.00024165634573797336, "loss": 4.7161, "step": 21835 }, { "epoch": 0.44723854770339727, "grad_norm": 0.26216763257980347, "learning_rate": 0.0002416073412011793, "loss": 4.7635, "step": 21840 }, { "epoch": 0.4473409374808019, "grad_norm": 0.2414170503616333, "learning_rate": 0.00024155832106624846, "loss": 4.7304, "step": 21845 }, { "epoch": 0.4474433272582065, "grad_norm": 0.26282960176467896, "learning_rate": 0.00024150928534152772, "loss": 4.718, "step": 21850 }, { "epoch": 0.4475457170356112, "grad_norm": 0.25752347707748413, "learning_rate": 0.00024146023403536637, "loss": 4.7169, "step": 21855 }, { "epoch": 0.4476481068130158, "grad_norm": 0.26910948753356934, "learning_rate": 0.00024141116715611655, "loss": 4.7501, "step": 21860 }, { "epoch": 0.44775049659042043, "grad_norm": 0.26379042863845825, "learning_rate": 0.00024136208471213294, "loss": 4.7613, "step": 21865 }, { "epoch": 0.44785288636782505, "grad_norm": 0.25984200835227966, "learning_rate": 0.00024131298671177282, "loss": 4.7327, "step": 21870 }, { "epoch": 0.4479552761452297, "grad_norm": 0.24715788662433624, "learning_rate": 0.00024126387316339624, "loss": 4.7458, "step": 21875 }, { "epoch": 0.4480576659226343, "grad_norm": 0.24957576394081116, "learning_rate": 0.00024121474407536584, "loss": 4.7534, "step": 21880 }, { "epoch": 0.4481600557000389, "grad_norm": 0.23778215050697327, "learning_rate": 0.0002411655994560469, "loss": 4.7317, "step": 21885 }, { "epoch": 0.44826244547744354, "grad_norm": 0.24345988035202026, "learning_rate": 0.00024111643931380743, "loss": 4.7047, "step": 21890 }, { "epoch": 0.44836483525484816, "grad_norm": 0.24695883691310883, "learning_rate": 0.00024106726365701793, "loss": 4.7261, "step": 21895 }, { "epoch": 0.4484672250322528, "grad_norm": 0.2646474838256836, "learning_rate": 0.00024101807249405163, "loss": 4.7856, "step": 21900 }, { "epoch": 0.4485696148096574, "grad_norm": 0.49546247720718384, "learning_rate": 0.0002409688658332844, "loss": 4.8024, "step": 21905 }, { "epoch": 0.448672004587062, "grad_norm": 0.3293178081512451, "learning_rate": 0.00024091964368309478, "loss": 4.747, "step": 21910 }, { "epoch": 0.44877439436446664, "grad_norm": 0.26098817586898804, "learning_rate": 0.00024087040605186383, "loss": 4.7359, "step": 21915 }, { "epoch": 0.44887678414187127, "grad_norm": 0.24753005802631378, "learning_rate": 0.00024082115294797541, "loss": 4.7252, "step": 21920 }, { "epoch": 0.4489791739192759, "grad_norm": 0.24103949964046478, "learning_rate": 0.00024077188437981588, "loss": 4.7508, "step": 21925 }, { "epoch": 0.4490815636966805, "grad_norm": 0.2502189576625824, "learning_rate": 0.0002407226003557743, "loss": 4.8036, "step": 21930 }, { "epoch": 0.44918395347408513, "grad_norm": 0.2760987877845764, "learning_rate": 0.00024067330088424238, "loss": 4.7353, "step": 21935 }, { "epoch": 0.44928634325148975, "grad_norm": 0.2822510004043579, "learning_rate": 0.00024062398597361433, "loss": 4.7491, "step": 21940 }, { "epoch": 0.44938873302889437, "grad_norm": 0.24212375283241272, "learning_rate": 0.00024057465563228714, "loss": 4.7312, "step": 21945 }, { "epoch": 0.449491122806299, "grad_norm": 0.25818729400634766, "learning_rate": 0.00024052530986866036, "loss": 4.7185, "step": 21950 }, { "epoch": 0.44959351258370367, "grad_norm": 0.2500952482223511, "learning_rate": 0.00024047594869113623, "loss": 4.725, "step": 21955 }, { "epoch": 0.4496959023611083, "grad_norm": 0.2536247968673706, "learning_rate": 0.00024042657210811947, "loss": 4.7612, "step": 21960 }, { "epoch": 0.4497982921385129, "grad_norm": 0.2786836326122284, "learning_rate": 0.00024037718012801757, "loss": 4.7726, "step": 21965 }, { "epoch": 0.44990068191591753, "grad_norm": 0.2442341446876526, "learning_rate": 0.00024032777275924053, "loss": 4.7498, "step": 21970 }, { "epoch": 0.45000307169332215, "grad_norm": 0.25481003522872925, "learning_rate": 0.00024027835001020113, "loss": 4.7003, "step": 21975 }, { "epoch": 0.4501054614707268, "grad_norm": 0.35789549350738525, "learning_rate": 0.00024022891188931462, "loss": 4.6755, "step": 21980 }, { "epoch": 0.4502078512481314, "grad_norm": 0.2654735743999481, "learning_rate": 0.00024017945840499886, "loss": 4.7754, "step": 21985 }, { "epoch": 0.450310241025536, "grad_norm": 0.24647344648838043, "learning_rate": 0.00024012998956567442, "loss": 4.7571, "step": 21990 }, { "epoch": 0.45041263080294064, "grad_norm": 0.256874680519104, "learning_rate": 0.00024008050537976442, "loss": 4.7364, "step": 21995 }, { "epoch": 0.45051502058034526, "grad_norm": 0.26698172092437744, "learning_rate": 0.00024003100585569465, "loss": 4.7593, "step": 22000 }, { "epoch": 0.45051502058034526, "eval_loss": 4.759491920471191, "eval_runtime": 17.2162, "eval_samples_per_second": 33.805, "eval_steps_per_second": 4.24, "step": 22000 }, { "epoch": 0.4506174103577499, "grad_norm": 0.22964538633823395, "learning_rate": 0.00023998149100189347, "loss": 4.7779, "step": 22005 }, { "epoch": 0.4507198001351545, "grad_norm": 0.25614720582962036, "learning_rate": 0.00023993196082679183, "loss": 4.7512, "step": 22010 }, { "epoch": 0.4508221899125591, "grad_norm": 0.2806569039821625, "learning_rate": 0.00023988241533882336, "loss": 4.7031, "step": 22015 }, { "epoch": 0.45092457968996374, "grad_norm": 0.2393021583557129, "learning_rate": 0.00023983285454642423, "loss": 4.7856, "step": 22020 }, { "epoch": 0.45102696946736837, "grad_norm": 0.24871160089969635, "learning_rate": 0.0002397832784580332, "loss": 4.7386, "step": 22025 }, { "epoch": 0.451129359244773, "grad_norm": 0.3182537853717804, "learning_rate": 0.00023973368708209172, "loss": 4.7502, "step": 22030 }, { "epoch": 0.4512317490221776, "grad_norm": 0.2552315592765808, "learning_rate": 0.00023968408042704384, "loss": 4.7061, "step": 22035 }, { "epoch": 0.45133413879958223, "grad_norm": 0.24092653393745422, "learning_rate": 0.0002396344585013361, "loss": 4.7735, "step": 22040 }, { "epoch": 0.45143652857698685, "grad_norm": 0.24381929636001587, "learning_rate": 0.00023958482131341774, "loss": 4.7371, "step": 22045 }, { "epoch": 0.45153891835439147, "grad_norm": 0.25864145159721375, "learning_rate": 0.00023953516887174054, "loss": 4.7348, "step": 22050 }, { "epoch": 0.45164130813179615, "grad_norm": 0.23586764931678772, "learning_rate": 0.0002394855011847589, "loss": 4.7175, "step": 22055 }, { "epoch": 0.45174369790920077, "grad_norm": 0.25393882393836975, "learning_rate": 0.0002394358182609299, "loss": 4.7234, "step": 22060 }, { "epoch": 0.4518460876866054, "grad_norm": 0.23844827711582184, "learning_rate": 0.00023938612010871303, "loss": 4.7125, "step": 22065 }, { "epoch": 0.45194847746401, "grad_norm": 0.2636282742023468, "learning_rate": 0.00023933640673657052, "loss": 4.74, "step": 22070 }, { "epoch": 0.45205086724141463, "grad_norm": 0.23698267340660095, "learning_rate": 0.00023928667815296723, "loss": 4.7738, "step": 22075 }, { "epoch": 0.45215325701881925, "grad_norm": 0.26341021060943604, "learning_rate": 0.0002392369343663704, "loss": 4.7323, "step": 22080 }, { "epoch": 0.4522556467962239, "grad_norm": 0.23851220309734344, "learning_rate": 0.00023918717538525002, "loss": 4.7549, "step": 22085 }, { "epoch": 0.4523580365736285, "grad_norm": 0.2447780817747116, "learning_rate": 0.0002391374012180787, "loss": 4.7244, "step": 22090 }, { "epoch": 0.4524604263510331, "grad_norm": 0.24237054586410522, "learning_rate": 0.0002390876118733315, "loss": 4.7143, "step": 22095 }, { "epoch": 0.45256281612843774, "grad_norm": 0.24432867765426636, "learning_rate": 0.0002390378073594862, "loss": 4.7231, "step": 22100 }, { "epoch": 0.45266520590584236, "grad_norm": 0.2664358913898468, "learning_rate": 0.0002389879876850231, "loss": 4.7944, "step": 22105 }, { "epoch": 0.452767595683247, "grad_norm": 0.2870197594165802, "learning_rate": 0.00023893815285842498, "loss": 4.7442, "step": 22110 }, { "epoch": 0.4528699854606516, "grad_norm": 0.23302584886550903, "learning_rate": 0.00023888830288817738, "loss": 4.7641, "step": 22115 }, { "epoch": 0.4529723752380562, "grad_norm": 0.2441381812095642, "learning_rate": 0.00023883843778276838, "loss": 4.771, "step": 22120 }, { "epoch": 0.45307476501546085, "grad_norm": 0.23622973263263702, "learning_rate": 0.00023878855755068846, "loss": 4.7065, "step": 22125 }, { "epoch": 0.45317715479286547, "grad_norm": 0.24003490805625916, "learning_rate": 0.00023873866220043094, "loss": 4.71, "step": 22130 }, { "epoch": 0.4532795445702701, "grad_norm": 0.234433114528656, "learning_rate": 0.00023868875174049158, "loss": 4.7812, "step": 22135 }, { "epoch": 0.4533819343476747, "grad_norm": 0.24909816682338715, "learning_rate": 0.00023863882617936857, "loss": 4.7732, "step": 22140 }, { "epoch": 0.45348432412507933, "grad_norm": 0.25630414485931396, "learning_rate": 0.000238588885525563, "loss": 4.703, "step": 22145 }, { "epoch": 0.45358671390248395, "grad_norm": 0.27781715989112854, "learning_rate": 0.0002385389297875783, "loss": 4.7235, "step": 22150 }, { "epoch": 0.4536891036798886, "grad_norm": 0.25707268714904785, "learning_rate": 0.00023848895897392045, "loss": 4.7447, "step": 22155 }, { "epoch": 0.45379149345729325, "grad_norm": 0.25520217418670654, "learning_rate": 0.00023843897309309812, "loss": 4.7422, "step": 22160 }, { "epoch": 0.45389388323469787, "grad_norm": 0.25873681902885437, "learning_rate": 0.00023838897215362246, "loss": 4.7242, "step": 22165 }, { "epoch": 0.4539962730121025, "grad_norm": 0.25034627318382263, "learning_rate": 0.00023833895616400726, "loss": 4.6886, "step": 22170 }, { "epoch": 0.4540986627895071, "grad_norm": 0.24327032268047333, "learning_rate": 0.0002382889251327688, "loss": 4.7031, "step": 22175 }, { "epoch": 0.45420105256691173, "grad_norm": 0.3189115524291992, "learning_rate": 0.00023823887906842593, "loss": 4.744, "step": 22180 }, { "epoch": 0.45430344234431635, "grad_norm": 0.25070321559906006, "learning_rate": 0.00023818881797950012, "loss": 4.778, "step": 22185 }, { "epoch": 0.454405832121721, "grad_norm": 0.27552253007888794, "learning_rate": 0.00023813874187451531, "loss": 4.7648, "step": 22190 }, { "epoch": 0.4545082218991256, "grad_norm": 0.24581551551818848, "learning_rate": 0.00023808865076199807, "loss": 4.7241, "step": 22195 }, { "epoch": 0.4546106116765302, "grad_norm": 0.25316470861434937, "learning_rate": 0.00023803854465047748, "loss": 4.7666, "step": 22200 }, { "epoch": 0.45471300145393484, "grad_norm": 0.24150055646896362, "learning_rate": 0.00023798842354848522, "loss": 4.6931, "step": 22205 }, { "epoch": 0.45481539123133946, "grad_norm": 0.2623421251773834, "learning_rate": 0.0002379382874645555, "loss": 4.7356, "step": 22210 }, { "epoch": 0.4549177810087441, "grad_norm": 0.2407127320766449, "learning_rate": 0.00023788813640722507, "loss": 4.7344, "step": 22215 }, { "epoch": 0.4550201707861487, "grad_norm": 0.2920372188091278, "learning_rate": 0.00023783797038503314, "loss": 4.753, "step": 22220 }, { "epoch": 0.4551225605635533, "grad_norm": 0.24697765707969666, "learning_rate": 0.0002377877894065217, "loss": 4.7418, "step": 22225 }, { "epoch": 0.45522495034095795, "grad_norm": 0.28075122833251953, "learning_rate": 0.0002377375934802351, "loss": 4.6846, "step": 22230 }, { "epoch": 0.45532734011836257, "grad_norm": 0.25362035632133484, "learning_rate": 0.00023768738261472028, "loss": 4.7605, "step": 22235 }, { "epoch": 0.4554297298957672, "grad_norm": 0.27194178104400635, "learning_rate": 0.00023763715681852674, "loss": 4.7013, "step": 22240 }, { "epoch": 0.4555321196731718, "grad_norm": 0.3001675009727478, "learning_rate": 0.00023758691610020646, "loss": 4.7209, "step": 22245 }, { "epoch": 0.45563450945057643, "grad_norm": 0.2572694420814514, "learning_rate": 0.00023753666046831407, "loss": 4.7078, "step": 22250 }, { "epoch": 0.45573689922798105, "grad_norm": 0.23649387061595917, "learning_rate": 0.00023748638993140669, "loss": 4.7168, "step": 22255 }, { "epoch": 0.45583928900538573, "grad_norm": 0.25332945585250854, "learning_rate": 0.00023743610449804392, "loss": 4.7397, "step": 22260 }, { "epoch": 0.45594167878279035, "grad_norm": 0.25807368755340576, "learning_rate": 0.00023738580417678794, "loss": 4.7024, "step": 22265 }, { "epoch": 0.45604406856019497, "grad_norm": 0.259543240070343, "learning_rate": 0.00023733548897620353, "loss": 4.7819, "step": 22270 }, { "epoch": 0.4561464583375996, "grad_norm": 0.26395493745803833, "learning_rate": 0.00023728515890485794, "loss": 4.7047, "step": 22275 }, { "epoch": 0.4562488481150042, "grad_norm": 0.9881982207298279, "learning_rate": 0.00023723481397132086, "loss": 4.6476, "step": 22280 }, { "epoch": 0.45635123789240883, "grad_norm": 0.24993281066417694, "learning_rate": 0.00023718445418416474, "loss": 4.712, "step": 22285 }, { "epoch": 0.45645362766981346, "grad_norm": 0.27237850427627563, "learning_rate": 0.00023713407955196436, "loss": 4.6818, "step": 22290 }, { "epoch": 0.4565560174472181, "grad_norm": 0.3099258542060852, "learning_rate": 0.00023708369008329703, "loss": 4.7796, "step": 22295 }, { "epoch": 0.4566584072246227, "grad_norm": 0.2862955331802368, "learning_rate": 0.00023703328578674281, "loss": 4.7674, "step": 22300 }, { "epoch": 0.4567607970020273, "grad_norm": 0.2516242563724518, "learning_rate": 0.00023698286667088393, "loss": 4.7262, "step": 22305 }, { "epoch": 0.45686318677943194, "grad_norm": 0.24596908688545227, "learning_rate": 0.00023693243274430553, "loss": 4.744, "step": 22310 }, { "epoch": 0.45696557655683656, "grad_norm": 0.2882581651210785, "learning_rate": 0.00023688198401559496, "loss": 4.7209, "step": 22315 }, { "epoch": 0.4570679663342412, "grad_norm": 0.2525341808795929, "learning_rate": 0.00023683152049334218, "loss": 4.7341, "step": 22320 }, { "epoch": 0.4571703561116458, "grad_norm": 0.24643680453300476, "learning_rate": 0.0002367810421861398, "loss": 4.7168, "step": 22325 }, { "epoch": 0.4572727458890504, "grad_norm": 0.25516244769096375, "learning_rate": 0.0002367305491025828, "loss": 4.7043, "step": 22330 }, { "epoch": 0.45737513566645505, "grad_norm": 0.24852685630321503, "learning_rate": 0.00023668004125126875, "loss": 4.7319, "step": 22335 }, { "epoch": 0.45747752544385967, "grad_norm": 0.26832687854766846, "learning_rate": 0.00023662951864079765, "loss": 4.7021, "step": 22340 }, { "epoch": 0.4575799152212643, "grad_norm": 0.25500577688217163, "learning_rate": 0.00023657898127977218, "loss": 4.7295, "step": 22345 }, { "epoch": 0.4576823049986689, "grad_norm": 0.253725528717041, "learning_rate": 0.00023652842917679727, "loss": 4.7624, "step": 22350 }, { "epoch": 0.45778469477607353, "grad_norm": 0.24359218776226044, "learning_rate": 0.00023647786234048062, "loss": 4.7484, "step": 22355 }, { "epoch": 0.4578870845534782, "grad_norm": 0.2668800354003906, "learning_rate": 0.00023642728077943228, "loss": 4.7343, "step": 22360 }, { "epoch": 0.45798947433088283, "grad_norm": 0.2451476752758026, "learning_rate": 0.0002363766845022649, "loss": 4.7147, "step": 22365 }, { "epoch": 0.45809186410828745, "grad_norm": 0.26905909180641174, "learning_rate": 0.0002363260735175936, "loss": 4.7641, "step": 22370 }, { "epoch": 0.45819425388569207, "grad_norm": 0.2415500432252884, "learning_rate": 0.0002362754478340359, "loss": 4.7479, "step": 22375 }, { "epoch": 0.4582966436630967, "grad_norm": 0.2391519844532013, "learning_rate": 0.000236224807460212, "loss": 4.7368, "step": 22380 }, { "epoch": 0.4583990334405013, "grad_norm": 0.2539454996585846, "learning_rate": 0.00023617415240474452, "loss": 4.6922, "step": 22385 }, { "epoch": 0.45850142321790593, "grad_norm": 0.5625364780426025, "learning_rate": 0.00023612348267625853, "loss": 4.7439, "step": 22390 }, { "epoch": 0.45860381299531056, "grad_norm": 0.23095041513442993, "learning_rate": 0.00023607279828338177, "loss": 4.6718, "step": 22395 }, { "epoch": 0.4587062027727152, "grad_norm": 0.26877298951148987, "learning_rate": 0.00023602209923474427, "loss": 4.7454, "step": 22400 }, { "epoch": 0.4588085925501198, "grad_norm": 0.2572716474533081, "learning_rate": 0.0002359713855389786, "loss": 4.656, "step": 22405 }, { "epoch": 0.4589109823275244, "grad_norm": 0.2368837147951126, "learning_rate": 0.00023592065720471996, "loss": 4.7213, "step": 22410 }, { "epoch": 0.45901337210492904, "grad_norm": 0.26700299978256226, "learning_rate": 0.00023586991424060592, "loss": 4.7123, "step": 22415 }, { "epoch": 0.45911576188233366, "grad_norm": 0.260018914937973, "learning_rate": 0.00023581915665527652, "loss": 4.7315, "step": 22420 }, { "epoch": 0.4592181516597383, "grad_norm": 0.26031601428985596, "learning_rate": 0.0002357683844573744, "loss": 4.7261, "step": 22425 }, { "epoch": 0.4593205414371429, "grad_norm": 0.24976696074008942, "learning_rate": 0.00023571759765554464, "loss": 4.7519, "step": 22430 }, { "epoch": 0.4594229312145475, "grad_norm": 0.25443023443222046, "learning_rate": 0.0002356667962584347, "loss": 4.7378, "step": 22435 }, { "epoch": 0.45952532099195215, "grad_norm": 0.24392306804656982, "learning_rate": 0.00023561598027469472, "loss": 4.7262, "step": 22440 }, { "epoch": 0.45962771076935677, "grad_norm": 0.2588663399219513, "learning_rate": 0.00023556514971297712, "loss": 4.7743, "step": 22445 }, { "epoch": 0.4597301005467614, "grad_norm": 0.24382399022579193, "learning_rate": 0.00023551430458193705, "loss": 4.7749, "step": 22450 }, { "epoch": 0.459832490324166, "grad_norm": 0.25846609473228455, "learning_rate": 0.0002354634448902319, "loss": 4.7296, "step": 22455 }, { "epoch": 0.4599348801015707, "grad_norm": 0.2652266323566437, "learning_rate": 0.0002354125706465216, "loss": 4.7466, "step": 22460 }, { "epoch": 0.4600372698789753, "grad_norm": 0.24637307226657867, "learning_rate": 0.0002353616818594687, "loss": 4.7446, "step": 22465 }, { "epoch": 0.46013965965637993, "grad_norm": 0.2609288692474365, "learning_rate": 0.00023531077853773804, "loss": 4.7121, "step": 22470 }, { "epoch": 0.46024204943378455, "grad_norm": 0.5267515778541565, "learning_rate": 0.00023525986068999706, "loss": 4.7199, "step": 22475 }, { "epoch": 0.46034443921118917, "grad_norm": 0.23161795735359192, "learning_rate": 0.00023520892832491557, "loss": 4.7501, "step": 22480 }, { "epoch": 0.4604468289885938, "grad_norm": 0.2667028605937958, "learning_rate": 0.00023515798145116597, "loss": 4.76, "step": 22485 }, { "epoch": 0.4605492187659984, "grad_norm": 0.2522803246974945, "learning_rate": 0.00023510702007742304, "loss": 4.7654, "step": 22490 }, { "epoch": 0.46065160854340303, "grad_norm": 0.28176870942115784, "learning_rate": 0.00023505604421236408, "loss": 4.7748, "step": 22495 }, { "epoch": 0.46075399832080766, "grad_norm": 0.27334338426589966, "learning_rate": 0.00023500505386466885, "loss": 4.7205, "step": 22500 }, { "epoch": 0.4608563880982123, "grad_norm": 0.2692910134792328, "learning_rate": 0.00023495404904301946, "loss": 4.7661, "step": 22505 }, { "epoch": 0.4609587778756169, "grad_norm": 0.24851375818252563, "learning_rate": 0.0002349030297561007, "loss": 4.7542, "step": 22510 }, { "epoch": 0.4610611676530215, "grad_norm": 0.25268369913101196, "learning_rate": 0.00023485199601259966, "loss": 4.7673, "step": 22515 }, { "epoch": 0.46116355743042614, "grad_norm": 0.24291718006134033, "learning_rate": 0.00023480094782120598, "loss": 4.7623, "step": 22520 }, { "epoch": 0.46126594720783076, "grad_norm": 0.2530502378940582, "learning_rate": 0.0002347498851906117, "loss": 4.7235, "step": 22525 }, { "epoch": 0.4613683369852354, "grad_norm": 0.2749159634113312, "learning_rate": 0.00023469880812951132, "loss": 4.7397, "step": 22530 }, { "epoch": 0.46147072676264, "grad_norm": 0.27258047461509705, "learning_rate": 0.00023464771664660183, "loss": 4.7494, "step": 22535 }, { "epoch": 0.4615731165400446, "grad_norm": 0.2481284886598587, "learning_rate": 0.00023459661075058268, "loss": 4.7916, "step": 22540 }, { "epoch": 0.46167550631744925, "grad_norm": 0.25946375727653503, "learning_rate": 0.0002345454904501557, "loss": 4.739, "step": 22545 }, { "epoch": 0.46177789609485387, "grad_norm": 0.25661179423332214, "learning_rate": 0.00023449435575402533, "loss": 4.695, "step": 22550 }, { "epoch": 0.4618802858722585, "grad_norm": 0.24974200129508972, "learning_rate": 0.00023444320667089828, "loss": 4.7466, "step": 22555 }, { "epoch": 0.4619826756496631, "grad_norm": 0.2404831200838089, "learning_rate": 0.0002343920432094838, "loss": 4.6757, "step": 22560 }, { "epoch": 0.4620850654270678, "grad_norm": 0.2554483115673065, "learning_rate": 0.00023434086537849365, "loss": 4.7205, "step": 22565 }, { "epoch": 0.4621874552044724, "grad_norm": 0.24611206352710724, "learning_rate": 0.00023428967318664187, "loss": 4.7079, "step": 22570 }, { "epoch": 0.46228984498187703, "grad_norm": 0.24876458942890167, "learning_rate": 0.00023423846664264507, "loss": 4.7568, "step": 22575 }, { "epoch": 0.46239223475928165, "grad_norm": 0.26299282908439636, "learning_rate": 0.00023418724575522236, "loss": 4.6751, "step": 22580 }, { "epoch": 0.46249462453668627, "grad_norm": 0.2555292844772339, "learning_rate": 0.00023413601053309506, "loss": 4.6836, "step": 22585 }, { "epoch": 0.4625970143140909, "grad_norm": 0.2545802891254425, "learning_rate": 0.00023408476098498717, "loss": 4.6831, "step": 22590 }, { "epoch": 0.4626994040914955, "grad_norm": 0.26119187474250793, "learning_rate": 0.000234033497119625, "loss": 4.7271, "step": 22595 }, { "epoch": 0.46280179386890014, "grad_norm": 0.2618061900138855, "learning_rate": 0.00023398221894573736, "loss": 4.752, "step": 22600 }, { "epoch": 0.46290418364630476, "grad_norm": 0.2994387447834015, "learning_rate": 0.0002339309264720555, "loss": 4.7242, "step": 22605 }, { "epoch": 0.4630065734237094, "grad_norm": 0.2792063355445862, "learning_rate": 0.00023387961970731301, "loss": 4.7183, "step": 22610 }, { "epoch": 0.463108963201114, "grad_norm": 1.7313650846481323, "learning_rate": 0.00023382829866024603, "loss": 4.6868, "step": 22615 }, { "epoch": 0.4632113529785186, "grad_norm": 0.25411492586135864, "learning_rate": 0.00023377696333959304, "loss": 4.7746, "step": 22620 }, { "epoch": 0.46331374275592324, "grad_norm": 0.23991341888904572, "learning_rate": 0.00023372561375409508, "loss": 4.7603, "step": 22625 }, { "epoch": 0.46341613253332786, "grad_norm": 0.2458183765411377, "learning_rate": 0.00023367424991249536, "loss": 4.7253, "step": 22630 }, { "epoch": 0.4635185223107325, "grad_norm": 0.26310741901397705, "learning_rate": 0.00023362287182353983, "loss": 4.7386, "step": 22635 }, { "epoch": 0.4636209120881371, "grad_norm": 0.2550501525402069, "learning_rate": 0.0002335714794959767, "loss": 4.7315, "step": 22640 }, { "epoch": 0.4637233018655417, "grad_norm": 0.25134924054145813, "learning_rate": 0.0002335200729385566, "loss": 4.7268, "step": 22645 }, { "epoch": 0.46382569164294635, "grad_norm": 0.24425670504570007, "learning_rate": 0.00023346865216003268, "loss": 4.7531, "step": 22650 }, { "epoch": 0.46392808142035097, "grad_norm": 0.25438258051872253, "learning_rate": 0.0002334172171691603, "loss": 4.7171, "step": 22655 }, { "epoch": 0.4640304711977556, "grad_norm": 0.2890762984752655, "learning_rate": 0.00023336576797469753, "loss": 4.6838, "step": 22660 }, { "epoch": 0.46413286097516027, "grad_norm": 0.23798921704292297, "learning_rate": 0.00023331430458540465, "loss": 4.7094, "step": 22665 }, { "epoch": 0.4642352507525649, "grad_norm": 0.3062776029109955, "learning_rate": 0.0002332628270100444, "loss": 4.7395, "step": 22670 }, { "epoch": 0.4643376405299695, "grad_norm": 0.24925167858600616, "learning_rate": 0.00023321133525738198, "loss": 4.7799, "step": 22675 }, { "epoch": 0.46444003030737413, "grad_norm": 0.2531532943248749, "learning_rate": 0.00023315982933618497, "loss": 4.7545, "step": 22680 }, { "epoch": 0.46454242008477875, "grad_norm": 0.24798624217510223, "learning_rate": 0.00023310830925522336, "loss": 4.7217, "step": 22685 }, { "epoch": 0.46464480986218337, "grad_norm": 0.2451784610748291, "learning_rate": 0.0002330567750232696, "loss": 4.7494, "step": 22690 }, { "epoch": 0.464747199639588, "grad_norm": 0.26014596223831177, "learning_rate": 0.0002330052266490985, "loss": 4.7399, "step": 22695 }, { "epoch": 0.4648495894169926, "grad_norm": 0.24459093809127808, "learning_rate": 0.00023295366414148724, "loss": 4.7729, "step": 22700 }, { "epoch": 0.46495197919439724, "grad_norm": 0.2558246850967407, "learning_rate": 0.00023290208750921545, "loss": 4.7228, "step": 22705 }, { "epoch": 0.46505436897180186, "grad_norm": 0.24167120456695557, "learning_rate": 0.0002328504967610653, "loss": 4.6931, "step": 22710 }, { "epoch": 0.4651567587492065, "grad_norm": 0.23493839800357819, "learning_rate": 0.00023279889190582108, "loss": 4.704, "step": 22715 }, { "epoch": 0.4652591485266111, "grad_norm": 0.25040683150291443, "learning_rate": 0.00023274727295226973, "loss": 4.7672, "step": 22720 }, { "epoch": 0.4653615383040157, "grad_norm": 0.25645115971565247, "learning_rate": 0.00023269563990920044, "loss": 4.6998, "step": 22725 }, { "epoch": 0.46546392808142034, "grad_norm": 0.27187323570251465, "learning_rate": 0.00023264399278540487, "loss": 4.7217, "step": 22730 }, { "epoch": 0.46556631785882496, "grad_norm": 0.25805917382240295, "learning_rate": 0.00023259233158967713, "loss": 4.7406, "step": 22735 }, { "epoch": 0.4656687076362296, "grad_norm": 0.24273861944675446, "learning_rate": 0.0002325406563308135, "loss": 4.7488, "step": 22740 }, { "epoch": 0.4657710974136342, "grad_norm": 0.24475401639938354, "learning_rate": 0.00023248896701761303, "loss": 4.7654, "step": 22745 }, { "epoch": 0.4658734871910388, "grad_norm": 0.23977327346801758, "learning_rate": 0.00023243726365887677, "loss": 4.7662, "step": 22750 }, { "epoch": 0.46597587696844345, "grad_norm": 0.2601017355918884, "learning_rate": 0.00023238554626340841, "loss": 4.7241, "step": 22755 }, { "epoch": 0.46607826674584807, "grad_norm": 0.2603398859500885, "learning_rate": 0.00023233381484001394, "loss": 4.7117, "step": 22760 }, { "epoch": 0.46618065652325275, "grad_norm": 0.6911476254463196, "learning_rate": 0.0002322820693975018, "loss": 4.789, "step": 22765 }, { "epoch": 0.46628304630065737, "grad_norm": 0.2640589773654938, "learning_rate": 0.0002322303099446827, "loss": 4.7677, "step": 22770 }, { "epoch": 0.466385436078062, "grad_norm": 0.2652299702167511, "learning_rate": 0.00023217853649036988, "loss": 4.6717, "step": 22775 }, { "epoch": 0.4664878258554666, "grad_norm": 0.2788437604904175, "learning_rate": 0.00023212674904337887, "loss": 4.7015, "step": 22780 }, { "epoch": 0.46659021563287123, "grad_norm": 0.24584704637527466, "learning_rate": 0.00023207494761252754, "loss": 4.7687, "step": 22785 }, { "epoch": 0.46669260541027585, "grad_norm": 0.22849427163600922, "learning_rate": 0.00023202313220663633, "loss": 4.6997, "step": 22790 }, { "epoch": 0.4667949951876805, "grad_norm": 0.2581017017364502, "learning_rate": 0.0002319713028345278, "loss": 4.763, "step": 22795 }, { "epoch": 0.4668973849650851, "grad_norm": 0.25005319714546204, "learning_rate": 0.00023191945950502718, "loss": 4.7233, "step": 22800 }, { "epoch": 0.4669997747424897, "grad_norm": 0.24875850975513458, "learning_rate": 0.0002318676022269618, "loss": 4.7219, "step": 22805 }, { "epoch": 0.46710216451989434, "grad_norm": 0.233003169298172, "learning_rate": 0.0002318157310091615, "loss": 4.748, "step": 22810 }, { "epoch": 0.46720455429729896, "grad_norm": 0.253690242767334, "learning_rate": 0.00023176384586045854, "loss": 4.7961, "step": 22815 }, { "epoch": 0.4673069440747036, "grad_norm": 0.2438765913248062, "learning_rate": 0.00023171194678968743, "loss": 4.7664, "step": 22820 }, { "epoch": 0.4674093338521082, "grad_norm": 0.25183239579200745, "learning_rate": 0.00023166003380568515, "loss": 4.7098, "step": 22825 }, { "epoch": 0.4675117236295128, "grad_norm": 0.24586036801338196, "learning_rate": 0.00023160810691729103, "loss": 4.7102, "step": 22830 }, { "epoch": 0.46761411340691744, "grad_norm": 0.3420575261116028, "learning_rate": 0.00023155616613334672, "loss": 4.7272, "step": 22835 }, { "epoch": 0.46771650318432206, "grad_norm": 0.2641162574291229, "learning_rate": 0.00023150421146269617, "loss": 4.7353, "step": 22840 }, { "epoch": 0.4678188929617267, "grad_norm": 0.25523972511291504, "learning_rate": 0.000231452242914186, "loss": 4.7731, "step": 22845 }, { "epoch": 0.4679212827391313, "grad_norm": 0.25930434465408325, "learning_rate": 0.0002314002604966648, "loss": 4.7182, "step": 22850 }, { "epoch": 0.4680236725165359, "grad_norm": 0.2686781883239746, "learning_rate": 0.00023134826421898375, "loss": 4.792, "step": 22855 }, { "epoch": 0.46812606229394055, "grad_norm": 0.2687591314315796, "learning_rate": 0.00023129625408999644, "loss": 4.7289, "step": 22860 }, { "epoch": 0.46822845207134517, "grad_norm": 0.24362003803253174, "learning_rate": 0.0002312442301185586, "loss": 4.7025, "step": 22865 }, { "epoch": 0.46833084184874985, "grad_norm": 0.25301697850227356, "learning_rate": 0.00023119219231352848, "loss": 4.7109, "step": 22870 }, { "epoch": 0.46843323162615447, "grad_norm": 0.2592464089393616, "learning_rate": 0.00023114014068376668, "loss": 4.7695, "step": 22875 }, { "epoch": 0.4685356214035591, "grad_norm": 0.2599638104438782, "learning_rate": 0.0002310880752381361, "loss": 4.6963, "step": 22880 }, { "epoch": 0.4686380111809637, "grad_norm": 0.25477394461631775, "learning_rate": 0.00023103599598550196, "loss": 4.7334, "step": 22885 }, { "epoch": 0.46874040095836833, "grad_norm": 0.2755983769893646, "learning_rate": 0.00023098390293473196, "loss": 4.7255, "step": 22890 }, { "epoch": 0.46884279073577295, "grad_norm": 0.26748162508010864, "learning_rate": 0.000230931796094696, "loss": 4.7001, "step": 22895 }, { "epoch": 0.4689451805131776, "grad_norm": 0.278424471616745, "learning_rate": 0.00023087967547426644, "loss": 4.7336, "step": 22900 }, { "epoch": 0.4690475702905822, "grad_norm": 0.2655482590198517, "learning_rate": 0.00023082754108231797, "loss": 4.7248, "step": 22905 }, { "epoch": 0.4691499600679868, "grad_norm": 0.2636497914791107, "learning_rate": 0.00023077539292772753, "loss": 4.7207, "step": 22910 }, { "epoch": 0.46925234984539144, "grad_norm": 0.27643537521362305, "learning_rate": 0.00023072323101937456, "loss": 4.772, "step": 22915 }, { "epoch": 0.46935473962279606, "grad_norm": 0.28624022006988525, "learning_rate": 0.0002306710553661407, "loss": 4.7101, "step": 22920 }, { "epoch": 0.4694571294002007, "grad_norm": 0.25350862741470337, "learning_rate": 0.00023061886597691, "loss": 4.715, "step": 22925 }, { "epoch": 0.4695595191776053, "grad_norm": 0.2603566646575928, "learning_rate": 0.00023056666286056884, "loss": 4.7286, "step": 22930 }, { "epoch": 0.4696619089550099, "grad_norm": 0.2457352727651596, "learning_rate": 0.0002305144460260059, "loss": 4.7614, "step": 22935 }, { "epoch": 0.46976429873241454, "grad_norm": 0.24755015969276428, "learning_rate": 0.0002304622154821123, "loss": 4.7627, "step": 22940 }, { "epoch": 0.46986668850981916, "grad_norm": 0.24351108074188232, "learning_rate": 0.0002304099712377813, "loss": 4.7481, "step": 22945 }, { "epoch": 0.4699690782872238, "grad_norm": 0.2639317810535431, "learning_rate": 0.00023035771330190877, "loss": 4.6929, "step": 22950 }, { "epoch": 0.4700714680646284, "grad_norm": 0.25225552916526794, "learning_rate": 0.00023030544168339264, "loss": 4.7569, "step": 22955 }, { "epoch": 0.470173857842033, "grad_norm": 0.27675190567970276, "learning_rate": 0.0002302531563911334, "loss": 4.7458, "step": 22960 }, { "epoch": 0.47027624761943765, "grad_norm": 0.2610032856464386, "learning_rate": 0.0002302008574340336, "loss": 4.7324, "step": 22965 }, { "epoch": 0.4703786373968423, "grad_norm": 0.24219560623168945, "learning_rate": 0.00023014854482099838, "loss": 4.7042, "step": 22970 }, { "epoch": 0.47048102717424695, "grad_norm": 0.25723206996917725, "learning_rate": 0.00023009621856093508, "loss": 4.762, "step": 22975 }, { "epoch": 0.47058341695165157, "grad_norm": 0.2543880343437195, "learning_rate": 0.00023004387866275338, "loss": 4.742, "step": 22980 }, { "epoch": 0.4706858067290562, "grad_norm": 0.2550751268863678, "learning_rate": 0.0002299915251353653, "loss": 4.7214, "step": 22985 }, { "epoch": 0.4707881965064608, "grad_norm": 0.23944367468357086, "learning_rate": 0.0002299391579876851, "loss": 4.7017, "step": 22990 }, { "epoch": 0.47089058628386543, "grad_norm": 0.5354617834091187, "learning_rate": 0.00022988677722862944, "loss": 4.6972, "step": 22995 }, { "epoch": 0.47099297606127005, "grad_norm": 0.24720068275928497, "learning_rate": 0.00022983438286711735, "loss": 4.7146, "step": 23000 }, { "epoch": 0.47099297606127005, "eval_loss": 4.7486653327941895, "eval_runtime": 31.0651, "eval_samples_per_second": 18.735, "eval_steps_per_second": 2.35, "step": 23000 }, { "epoch": 0.4710953658386747, "grad_norm": 0.24126434326171875, "learning_rate": 0.00022978197491207, "loss": 4.7339, "step": 23005 }, { "epoch": 0.4711977556160793, "grad_norm": 0.2385326325893402, "learning_rate": 0.00022972955337241112, "loss": 4.7227, "step": 23010 }, { "epoch": 0.4713001453934839, "grad_norm": 0.2848736047744751, "learning_rate": 0.00022967711825706648, "loss": 4.7474, "step": 23015 }, { "epoch": 0.47140253517088854, "grad_norm": 0.26521795988082886, "learning_rate": 0.00022962466957496437, "loss": 4.738, "step": 23020 }, { "epoch": 0.47150492494829316, "grad_norm": 0.24742761254310608, "learning_rate": 0.0002295722073350353, "loss": 4.6882, "step": 23025 }, { "epoch": 0.4716073147256978, "grad_norm": 0.2614652216434479, "learning_rate": 0.00022951973154621208, "loss": 4.7519, "step": 23030 }, { "epoch": 0.4717097045031024, "grad_norm": 0.24566341936588287, "learning_rate": 0.0002294672422174299, "loss": 4.6895, "step": 23035 }, { "epoch": 0.471812094280507, "grad_norm": 0.25319018959999084, "learning_rate": 0.00022941473935762612, "loss": 4.7537, "step": 23040 }, { "epoch": 0.47191448405791164, "grad_norm": 0.24296757578849792, "learning_rate": 0.0002293622229757406, "loss": 4.7501, "step": 23045 }, { "epoch": 0.47201687383531626, "grad_norm": 0.27099448442459106, "learning_rate": 0.00022930969308071532, "loss": 4.706, "step": 23050 }, { "epoch": 0.4721192636127209, "grad_norm": 0.2669123113155365, "learning_rate": 0.0002292571496814947, "loss": 4.7285, "step": 23055 }, { "epoch": 0.4722216533901255, "grad_norm": 0.26060131192207336, "learning_rate": 0.00022920459278702527, "loss": 4.7548, "step": 23060 }, { "epoch": 0.4723240431675301, "grad_norm": 0.2891134023666382, "learning_rate": 0.0002291520224062561, "loss": 4.6762, "step": 23065 }, { "epoch": 0.4724264329449348, "grad_norm": 0.246364563703537, "learning_rate": 0.0002290994385481384, "loss": 4.6962, "step": 23070 }, { "epoch": 0.4725288227223394, "grad_norm": 0.26051563024520874, "learning_rate": 0.0002290468412216257, "loss": 4.7511, "step": 23075 }, { "epoch": 0.47263121249974405, "grad_norm": 0.2498205602169037, "learning_rate": 0.00022899423043567385, "loss": 4.7198, "step": 23080 }, { "epoch": 0.47273360227714867, "grad_norm": 0.25932613015174866, "learning_rate": 0.00022894160619924095, "loss": 4.7391, "step": 23085 }, { "epoch": 0.4728359920545533, "grad_norm": 0.24442742764949799, "learning_rate": 0.00022888896852128748, "loss": 4.7416, "step": 23090 }, { "epoch": 0.4729383818319579, "grad_norm": 0.23264530301094055, "learning_rate": 0.0002288363174107761, "loss": 4.7422, "step": 23095 }, { "epoch": 0.47304077160936253, "grad_norm": 0.24659916758537292, "learning_rate": 0.00022878365287667181, "loss": 4.7565, "step": 23100 }, { "epoch": 0.47314316138676715, "grad_norm": 0.2867399752140045, "learning_rate": 0.00022873097492794188, "loss": 4.7029, "step": 23105 }, { "epoch": 0.4732455511641718, "grad_norm": 3.2085673809051514, "learning_rate": 0.00022867828357355587, "loss": 4.7253, "step": 23110 }, { "epoch": 0.4733479409415764, "grad_norm": 0.26261451840400696, "learning_rate": 0.00022862557882248562, "loss": 4.732, "step": 23115 }, { "epoch": 0.473450330718981, "grad_norm": 0.24705669283866882, "learning_rate": 0.00022857286068370532, "loss": 4.7413, "step": 23120 }, { "epoch": 0.47355272049638564, "grad_norm": 0.24625122547149658, "learning_rate": 0.00022852012916619135, "loss": 4.6805, "step": 23125 }, { "epoch": 0.47365511027379026, "grad_norm": 0.24632865190505981, "learning_rate": 0.00022846738427892232, "loss": 4.7188, "step": 23130 }, { "epoch": 0.4737575000511949, "grad_norm": 0.2599443197250366, "learning_rate": 0.00022841462603087923, "loss": 4.7463, "step": 23135 }, { "epoch": 0.4738598898285995, "grad_norm": 0.2681810259819031, "learning_rate": 0.0002283618544310454, "loss": 4.6996, "step": 23140 }, { "epoch": 0.4739622796060041, "grad_norm": 0.24931111931800842, "learning_rate": 0.00022830906948840624, "loss": 4.754, "step": 23145 }, { "epoch": 0.47406466938340874, "grad_norm": 0.2588179409503937, "learning_rate": 0.00022825627121194955, "loss": 4.7328, "step": 23150 }, { "epoch": 0.47416705916081336, "grad_norm": 0.2509627938270569, "learning_rate": 0.00022820345961066543, "loss": 4.7174, "step": 23155 }, { "epoch": 0.474269448938218, "grad_norm": 0.2449938803911209, "learning_rate": 0.0002281506346935461, "loss": 4.7319, "step": 23160 }, { "epoch": 0.4743718387156226, "grad_norm": 0.24512113630771637, "learning_rate": 0.0002280977964695863, "loss": 4.7549, "step": 23165 }, { "epoch": 0.4744742284930273, "grad_norm": 0.2512456476688385, "learning_rate": 0.00022804494494778276, "loss": 4.7605, "step": 23170 }, { "epoch": 0.4745766182704319, "grad_norm": 0.24117611348628998, "learning_rate": 0.00022799208013713466, "loss": 4.8048, "step": 23175 }, { "epoch": 0.4746790080478365, "grad_norm": 0.2468961775302887, "learning_rate": 0.0002279392020466433, "loss": 4.7062, "step": 23180 }, { "epoch": 0.47478139782524115, "grad_norm": 0.24405434727668762, "learning_rate": 0.00022788631068531244, "loss": 4.742, "step": 23185 }, { "epoch": 0.47488378760264577, "grad_norm": 0.34417128562927246, "learning_rate": 0.00022783340606214787, "loss": 4.761, "step": 23190 }, { "epoch": 0.4749861773800504, "grad_norm": 0.26643431186676025, "learning_rate": 0.00022778048818615783, "loss": 4.7816, "step": 23195 }, { "epoch": 0.475088567157455, "grad_norm": 0.26170021295547485, "learning_rate": 0.00022772755706635268, "loss": 4.7632, "step": 23200 }, { "epoch": 0.47519095693485963, "grad_norm": 0.2950766980648041, "learning_rate": 0.0002276746127117451, "loss": 4.7482, "step": 23205 }, { "epoch": 0.47529334671226425, "grad_norm": 0.2571079730987549, "learning_rate": 0.00022762165513135006, "loss": 4.7225, "step": 23210 }, { "epoch": 0.4753957364896689, "grad_norm": 0.26502344012260437, "learning_rate": 0.00022756868433418474, "loss": 4.7469, "step": 23215 }, { "epoch": 0.4754981262670735, "grad_norm": 0.26222342252731323, "learning_rate": 0.0002275157003292685, "loss": 4.7223, "step": 23220 }, { "epoch": 0.4756005160444781, "grad_norm": 0.2550445795059204, "learning_rate": 0.00022746270312562304, "loss": 4.6959, "step": 23225 }, { "epoch": 0.47570290582188274, "grad_norm": 0.25171804428100586, "learning_rate": 0.00022740969273227232, "loss": 4.7106, "step": 23230 }, { "epoch": 0.47580529559928736, "grad_norm": 0.23876318335533142, "learning_rate": 0.00022735666915824244, "loss": 4.678, "step": 23235 }, { "epoch": 0.475907685376692, "grad_norm": 0.29059797525405884, "learning_rate": 0.00022730363241256192, "loss": 4.708, "step": 23240 }, { "epoch": 0.4760100751540966, "grad_norm": 0.24677996337413788, "learning_rate": 0.0002272505825042613, "loss": 4.7111, "step": 23245 }, { "epoch": 0.4761124649315012, "grad_norm": 0.25260820984840393, "learning_rate": 0.00022719751944237355, "loss": 4.7577, "step": 23250 }, { "epoch": 0.47621485470890584, "grad_norm": 0.24880164861679077, "learning_rate": 0.0002271444432359338, "loss": 4.708, "step": 23255 }, { "epoch": 0.47631724448631046, "grad_norm": 0.2399342954158783, "learning_rate": 0.0002270913538939794, "loss": 4.7527, "step": 23260 }, { "epoch": 0.4764196342637151, "grad_norm": 0.25597113370895386, "learning_rate": 0.00022703825142555, "loss": 4.7383, "step": 23265 }, { "epoch": 0.4765220240411197, "grad_norm": 0.25812873244285583, "learning_rate": 0.00022698513583968737, "loss": 4.6974, "step": 23270 }, { "epoch": 0.4766244138185244, "grad_norm": 0.24173590540885925, "learning_rate": 0.0002269320071454356, "loss": 4.7384, "step": 23275 }, { "epoch": 0.476726803595929, "grad_norm": 0.25427424907684326, "learning_rate": 0.00022687886535184117, "loss": 4.7333, "step": 23280 }, { "epoch": 0.4768291933733336, "grad_norm": 0.24102525413036346, "learning_rate": 0.00022682571046795242, "loss": 4.751, "step": 23285 }, { "epoch": 0.47693158315073825, "grad_norm": 0.2437005639076233, "learning_rate": 0.00022677254250282026, "loss": 4.6898, "step": 23290 }, { "epoch": 0.47703397292814287, "grad_norm": 0.255088746547699, "learning_rate": 0.00022671936146549757, "loss": 4.7477, "step": 23295 }, { "epoch": 0.4771363627055475, "grad_norm": 0.2773576080799103, "learning_rate": 0.00022666616736503962, "loss": 4.7439, "step": 23300 }, { "epoch": 0.4772387524829521, "grad_norm": 0.2571176588535309, "learning_rate": 0.00022661296021050391, "loss": 4.7332, "step": 23305 }, { "epoch": 0.47734114226035673, "grad_norm": 0.2412247359752655, "learning_rate": 0.00022655974001095012, "loss": 4.7016, "step": 23310 }, { "epoch": 0.47744353203776135, "grad_norm": 0.2535102069377899, "learning_rate": 0.00022650650677544004, "loss": 4.7533, "step": 23315 }, { "epoch": 0.477545921815166, "grad_norm": 0.27320313453674316, "learning_rate": 0.00022645326051303784, "loss": 4.6689, "step": 23320 }, { "epoch": 0.4776483115925706, "grad_norm": 0.25510895252227783, "learning_rate": 0.0002264000012328099, "loss": 4.7309, "step": 23325 }, { "epoch": 0.4777507013699752, "grad_norm": 0.2580207586288452, "learning_rate": 0.00022634672894382464, "loss": 4.7344, "step": 23330 }, { "epoch": 0.47785309114737984, "grad_norm": 0.2654505968093872, "learning_rate": 0.00022629344365515295, "loss": 4.6744, "step": 23335 }, { "epoch": 0.47795548092478446, "grad_norm": 0.24057354032993317, "learning_rate": 0.00022624014537586776, "loss": 4.7311, "step": 23340 }, { "epoch": 0.4780578707021891, "grad_norm": 0.2518256604671478, "learning_rate": 0.00022618683411504423, "loss": 4.7433, "step": 23345 }, { "epoch": 0.4781602604795937, "grad_norm": 0.2699842154979706, "learning_rate": 0.00022613350988175983, "loss": 4.6862, "step": 23350 }, { "epoch": 0.4782626502569983, "grad_norm": 0.24945512413978577, "learning_rate": 0.0002260801726850941, "loss": 4.7247, "step": 23355 }, { "epoch": 0.47836504003440294, "grad_norm": 0.25566694140434265, "learning_rate": 0.00022602682253412886, "loss": 4.7578, "step": 23360 }, { "epoch": 0.47846742981180757, "grad_norm": 0.2524034082889557, "learning_rate": 0.0002259734594379482, "loss": 4.7608, "step": 23365 }, { "epoch": 0.4785698195892122, "grad_norm": 0.24215050041675568, "learning_rate": 0.0002259200834056382, "loss": 4.7336, "step": 23370 }, { "epoch": 0.47867220936661686, "grad_norm": 0.26179876923561096, "learning_rate": 0.00022586669444628747, "loss": 4.7207, "step": 23375 }, { "epoch": 0.4787745991440215, "grad_norm": 0.2582859694957733, "learning_rate": 0.0002258132925689865, "loss": 4.6859, "step": 23380 }, { "epoch": 0.4788769889214261, "grad_norm": 0.2548452913761139, "learning_rate": 0.00022575987778282819, "loss": 4.7869, "step": 23385 }, { "epoch": 0.4789793786988307, "grad_norm": 0.235408753156662, "learning_rate": 0.0002257064500969075, "loss": 4.7492, "step": 23390 }, { "epoch": 0.47908176847623535, "grad_norm": 0.24802841246128082, "learning_rate": 0.00022565300952032174, "loss": 4.6972, "step": 23395 }, { "epoch": 0.47918415825363997, "grad_norm": 0.4069477319717407, "learning_rate": 0.0002255995560621702, "loss": 4.8212, "step": 23400 }, { "epoch": 0.4792865480310446, "grad_norm": 0.2730862498283386, "learning_rate": 0.00022554608973155468, "loss": 4.7312, "step": 23405 }, { "epoch": 0.4793889378084492, "grad_norm": 0.9719353914260864, "learning_rate": 0.00022549261053757884, "loss": 4.6475, "step": 23410 }, { "epoch": 0.47949132758585383, "grad_norm": 0.25093239545822144, "learning_rate": 0.00022543911848934867, "loss": 4.6942, "step": 23415 }, { "epoch": 0.47959371736325845, "grad_norm": 0.2552156448364258, "learning_rate": 0.0002253856135959724, "loss": 4.716, "step": 23420 }, { "epoch": 0.4796961071406631, "grad_norm": 0.2522520422935486, "learning_rate": 0.00022533209586656037, "loss": 4.7008, "step": 23425 }, { "epoch": 0.4797984969180677, "grad_norm": 0.2818015515804291, "learning_rate": 0.00022527856531022523, "loss": 4.7332, "step": 23430 }, { "epoch": 0.4799008866954723, "grad_norm": 0.28834909200668335, "learning_rate": 0.00022522502193608154, "loss": 4.7325, "step": 23435 }, { "epoch": 0.48000327647287694, "grad_norm": 0.3477283716201782, "learning_rate": 0.00022517146575324638, "loss": 4.729, "step": 23440 }, { "epoch": 0.48010566625028156, "grad_norm": 0.24907995760440826, "learning_rate": 0.00022511789677083876, "loss": 4.7405, "step": 23445 }, { "epoch": 0.4802080560276862, "grad_norm": 0.2532285153865814, "learning_rate": 0.00022506431499798004, "loss": 4.7097, "step": 23450 }, { "epoch": 0.4803104458050908, "grad_norm": 0.2618994116783142, "learning_rate": 0.00022501072044379361, "loss": 4.7868, "step": 23455 }, { "epoch": 0.4804128355824954, "grad_norm": 0.259520024061203, "learning_rate": 0.00022495711311740509, "loss": 4.7504, "step": 23460 }, { "epoch": 0.48051522535990004, "grad_norm": 0.2821270823478699, "learning_rate": 0.00022490349302794235, "loss": 4.7465, "step": 23465 }, { "epoch": 0.48061761513730467, "grad_norm": 0.2771863639354706, "learning_rate": 0.00022484986018453534, "loss": 4.7235, "step": 23470 }, { "epoch": 0.48072000491470934, "grad_norm": 0.2585529386997223, "learning_rate": 0.00022479621459631623, "loss": 4.6119, "step": 23475 }, { "epoch": 0.48082239469211396, "grad_norm": 0.25866615772247314, "learning_rate": 0.00022474255627241935, "loss": 4.7308, "step": 23480 }, { "epoch": 0.4809247844695186, "grad_norm": 0.26243090629577637, "learning_rate": 0.00022468888522198115, "loss": 4.7026, "step": 23485 }, { "epoch": 0.4810271742469232, "grad_norm": 0.2763725817203522, "learning_rate": 0.00022463520145414036, "loss": 4.7614, "step": 23490 }, { "epoch": 0.4811295640243278, "grad_norm": 0.26582959294319153, "learning_rate": 0.00022458150497803775, "loss": 4.7269, "step": 23495 }, { "epoch": 0.48123195380173245, "grad_norm": 0.5712265372276306, "learning_rate": 0.00022452779580281636, "loss": 4.6891, "step": 23500 }, { "epoch": 0.48133434357913707, "grad_norm": 0.2590843141078949, "learning_rate": 0.00022447407393762125, "loss": 4.7279, "step": 23505 }, { "epoch": 0.4814367333565417, "grad_norm": 0.2678948640823364, "learning_rate": 0.00022442033939159984, "loss": 4.6712, "step": 23510 }, { "epoch": 0.4815391231339463, "grad_norm": 0.24214579164981842, "learning_rate": 0.00022436659217390153, "loss": 4.7322, "step": 23515 }, { "epoch": 0.48164151291135093, "grad_norm": 0.2527870237827301, "learning_rate": 0.00022431283229367802, "loss": 4.7619, "step": 23520 }, { "epoch": 0.48174390268875555, "grad_norm": 0.28357213735580444, "learning_rate": 0.000224259059760083, "loss": 4.6889, "step": 23525 }, { "epoch": 0.4818462924661602, "grad_norm": 0.2707435190677643, "learning_rate": 0.0002242052745822725, "loss": 4.7336, "step": 23530 }, { "epoch": 0.4819486822435648, "grad_norm": 0.29873546957969666, "learning_rate": 0.00022415147676940454, "loss": 4.7022, "step": 23535 }, { "epoch": 0.4820510720209694, "grad_norm": 0.25273212790489197, "learning_rate": 0.00022409766633063938, "loss": 4.7368, "step": 23540 }, { "epoch": 0.48215346179837404, "grad_norm": 0.25742873549461365, "learning_rate": 0.0002240438432751395, "loss": 4.689, "step": 23545 }, { "epoch": 0.48225585157577866, "grad_norm": 0.24842469394207, "learning_rate": 0.00022399000761206934, "loss": 4.6942, "step": 23550 }, { "epoch": 0.4823582413531833, "grad_norm": 0.26437339186668396, "learning_rate": 0.0002239361593505956, "loss": 4.6946, "step": 23555 }, { "epoch": 0.4824606311305879, "grad_norm": 0.27191320061683655, "learning_rate": 0.00022388229849988717, "loss": 4.7427, "step": 23560 }, { "epoch": 0.4825630209079925, "grad_norm": 0.2621234953403473, "learning_rate": 0.00022382842506911497, "loss": 4.7201, "step": 23565 }, { "epoch": 0.48266541068539714, "grad_norm": 0.2691076397895813, "learning_rate": 0.00022377453906745215, "loss": 4.742, "step": 23570 }, { "epoch": 0.4827678004628018, "grad_norm": 0.253271222114563, "learning_rate": 0.00022372064050407397, "loss": 4.6797, "step": 23575 }, { "epoch": 0.48287019024020644, "grad_norm": 0.24928729236125946, "learning_rate": 0.00022366672938815783, "loss": 4.7364, "step": 23580 }, { "epoch": 0.48297258001761106, "grad_norm": 0.2914057672023773, "learning_rate": 0.00022361280572888328, "loss": 4.7112, "step": 23585 }, { "epoch": 0.4830749697950157, "grad_norm": 0.46790847182273865, "learning_rate": 0.000223558869535432, "loss": 4.8043, "step": 23590 }, { "epoch": 0.4831773595724203, "grad_norm": 0.26335951685905457, "learning_rate": 0.0002235049208169877, "loss": 4.7362, "step": 23595 }, { "epoch": 0.4832797493498249, "grad_norm": 0.2944200932979584, "learning_rate": 0.00022345095958273641, "loss": 4.7088, "step": 23600 }, { "epoch": 0.48338213912722955, "grad_norm": 0.24995774030685425, "learning_rate": 0.00022339698584186623, "loss": 4.7012, "step": 23605 }, { "epoch": 0.48348452890463417, "grad_norm": 0.31491002440452576, "learning_rate": 0.00022334299960356726, "loss": 4.6802, "step": 23610 }, { "epoch": 0.4835869186820388, "grad_norm": 0.24595631659030914, "learning_rate": 0.00022328900087703195, "loss": 4.7284, "step": 23615 }, { "epoch": 0.4836893084594434, "grad_norm": 0.2661062777042389, "learning_rate": 0.00022323498967145462, "loss": 4.7217, "step": 23620 }, { "epoch": 0.48379169823684803, "grad_norm": 0.25394463539123535, "learning_rate": 0.0002231809659960319, "loss": 4.7482, "step": 23625 }, { "epoch": 0.48389408801425265, "grad_norm": 0.27144917845726013, "learning_rate": 0.00022312692985996255, "loss": 4.6937, "step": 23630 }, { "epoch": 0.4839964777916573, "grad_norm": 0.2578558027744293, "learning_rate": 0.00022307288127244732, "loss": 4.6935, "step": 23635 }, { "epoch": 0.4840988675690619, "grad_norm": 0.24560491740703583, "learning_rate": 0.0002230188202426892, "loss": 4.7088, "step": 23640 }, { "epoch": 0.4842012573464665, "grad_norm": 0.23649932444095612, "learning_rate": 0.00022296474677989324, "loss": 4.6857, "step": 23645 }, { "epoch": 0.48430364712387114, "grad_norm": 0.25463882088661194, "learning_rate": 0.00022291066089326662, "loss": 4.6973, "step": 23650 }, { "epoch": 0.48440603690127576, "grad_norm": 0.2707253694534302, "learning_rate": 0.00022285656259201866, "loss": 4.7359, "step": 23655 }, { "epoch": 0.4845084266786804, "grad_norm": 0.24852105975151062, "learning_rate": 0.0002228024518853607, "loss": 4.7687, "step": 23660 }, { "epoch": 0.484610816456085, "grad_norm": 0.25112882256507874, "learning_rate": 0.0002227483287825063, "loss": 4.7106, "step": 23665 }, { "epoch": 0.4847132062334896, "grad_norm": 0.25372880697250366, "learning_rate": 0.00022269419329267113, "loss": 4.7425, "step": 23670 }, { "epoch": 0.48481559601089425, "grad_norm": 0.2551497220993042, "learning_rate": 0.00022264004542507288, "loss": 4.7361, "step": 23675 }, { "epoch": 0.4849179857882989, "grad_norm": 0.25463464856147766, "learning_rate": 0.0002225858851889314, "loss": 4.7486, "step": 23680 }, { "epoch": 0.48502037556570354, "grad_norm": 0.25041845440864563, "learning_rate": 0.0002225317125934687, "loss": 4.7456, "step": 23685 }, { "epoch": 0.48512276534310816, "grad_norm": 0.26744791865348816, "learning_rate": 0.00022247752764790877, "loss": 4.7191, "step": 23690 }, { "epoch": 0.4852251551205128, "grad_norm": 0.27257540822029114, "learning_rate": 0.00022242333036147775, "loss": 4.6903, "step": 23695 }, { "epoch": 0.4853275448979174, "grad_norm": 0.24697956442832947, "learning_rate": 0.00022236912074340403, "loss": 4.7408, "step": 23700 }, { "epoch": 0.48542993467532203, "grad_norm": 0.28456252813339233, "learning_rate": 0.00022231489880291787, "loss": 4.6968, "step": 23705 }, { "epoch": 0.48553232445272665, "grad_norm": 0.2713773250579834, "learning_rate": 0.00022226066454925179, "loss": 4.7041, "step": 23710 }, { "epoch": 0.48563471423013127, "grad_norm": 0.25253403186798096, "learning_rate": 0.00022220641799164032, "loss": 4.7425, "step": 23715 }, { "epoch": 0.4857371040075359, "grad_norm": 0.26198431849479675, "learning_rate": 0.00022215215913932003, "loss": 4.7672, "step": 23720 }, { "epoch": 0.4858394937849405, "grad_norm": 0.2633711099624634, "learning_rate": 0.00022209788800152982, "loss": 4.7149, "step": 23725 }, { "epoch": 0.48594188356234513, "grad_norm": 0.2900274991989136, "learning_rate": 0.00022204360458751046, "loss": 4.6937, "step": 23730 }, { "epoch": 0.48604427333974976, "grad_norm": 0.23595717549324036, "learning_rate": 0.00022198930890650486, "loss": 4.7202, "step": 23735 }, { "epoch": 0.4861466631171544, "grad_norm": 0.24150396883487701, "learning_rate": 0.00022193500096775803, "loss": 4.7367, "step": 23740 }, { "epoch": 0.486249052894559, "grad_norm": 0.2442939579486847, "learning_rate": 0.00022188068078051712, "loss": 4.711, "step": 23745 }, { "epoch": 0.4863514426719636, "grad_norm": 0.272164523601532, "learning_rate": 0.00022182634835403125, "loss": 4.7202, "step": 23750 }, { "epoch": 0.48645383244936824, "grad_norm": 0.2705736756324768, "learning_rate": 0.0002217720036975518, "loss": 4.7096, "step": 23755 }, { "epoch": 0.48655622222677286, "grad_norm": 0.2533026337623596, "learning_rate": 0.0002217176468203321, "loss": 4.7322, "step": 23760 }, { "epoch": 0.4866586120041775, "grad_norm": 0.24849478900432587, "learning_rate": 0.00022166327773162742, "loss": 4.7143, "step": 23765 }, { "epoch": 0.4867610017815821, "grad_norm": 0.23685523867607117, "learning_rate": 0.00022160889644069553, "loss": 4.7141, "step": 23770 }, { "epoch": 0.4868633915589867, "grad_norm": 0.27315205335617065, "learning_rate": 0.00022155450295679584, "loss": 4.6996, "step": 23775 }, { "epoch": 0.4869657813363914, "grad_norm": 0.2574215531349182, "learning_rate": 0.00022150009728919015, "loss": 4.763, "step": 23780 }, { "epoch": 0.487068171113796, "grad_norm": 0.2711694836616516, "learning_rate": 0.0002214456794471421, "loss": 4.714, "step": 23785 }, { "epoch": 0.48717056089120064, "grad_norm": 0.2520192563533783, "learning_rate": 0.00022139124943991746, "loss": 4.7763, "step": 23790 }, { "epoch": 0.48727295066860526, "grad_norm": 0.3664661943912506, "learning_rate": 0.00022133680727678436, "loss": 4.7257, "step": 23795 }, { "epoch": 0.4873753404460099, "grad_norm": 0.26421457529067993, "learning_rate": 0.00022128235296701252, "loss": 4.7375, "step": 23800 }, { "epoch": 0.4874777302234145, "grad_norm": 0.2750388979911804, "learning_rate": 0.00022122788651987406, "loss": 4.7304, "step": 23805 }, { "epoch": 0.48758012000081913, "grad_norm": 0.25376227498054504, "learning_rate": 0.0002211734079446431, "loss": 4.7056, "step": 23810 }, { "epoch": 0.48768250977822375, "grad_norm": 0.26880383491516113, "learning_rate": 0.00022111891725059573, "loss": 4.7369, "step": 23815 }, { "epoch": 0.48778489955562837, "grad_norm": 0.28430506587028503, "learning_rate": 0.00022106441444701022, "loss": 4.748, "step": 23820 }, { "epoch": 0.487887289333033, "grad_norm": 0.27597060799598694, "learning_rate": 0.00022100989954316683, "loss": 4.7576, "step": 23825 }, { "epoch": 0.4879896791104376, "grad_norm": 0.2408474236726761, "learning_rate": 0.00022095537254834797, "loss": 4.7158, "step": 23830 }, { "epoch": 0.48809206888784223, "grad_norm": 0.249201238155365, "learning_rate": 0.0002209008334718379, "loss": 4.7159, "step": 23835 }, { "epoch": 0.48819445866524686, "grad_norm": 0.248459592461586, "learning_rate": 0.00022084628232292325, "loss": 4.6668, "step": 23840 }, { "epoch": 0.4882968484426515, "grad_norm": 0.24212941527366638, "learning_rate": 0.0002207917191108924, "loss": 4.765, "step": 23845 }, { "epoch": 0.4883992382200561, "grad_norm": 0.23875699937343597, "learning_rate": 0.00022073714384503605, "loss": 4.75, "step": 23850 }, { "epoch": 0.4885016279974607, "grad_norm": 0.251152366399765, "learning_rate": 0.00022068255653464675, "loss": 4.7478, "step": 23855 }, { "epoch": 0.48860401777486534, "grad_norm": 0.2696307897567749, "learning_rate": 0.0002206279571890191, "loss": 4.6872, "step": 23860 }, { "epoch": 0.48870640755226996, "grad_norm": 0.28312090039253235, "learning_rate": 0.00022057334581744993, "loss": 4.7609, "step": 23865 }, { "epoch": 0.4888087973296746, "grad_norm": 0.24442794919013977, "learning_rate": 0.00022051872242923802, "loss": 4.7271, "step": 23870 }, { "epoch": 0.4889111871070792, "grad_norm": 0.25698035955429077, "learning_rate": 0.0002204640870336841, "loss": 4.7861, "step": 23875 }, { "epoch": 0.4890135768844839, "grad_norm": 0.2503564953804016, "learning_rate": 0.00022040943964009107, "loss": 4.7242, "step": 23880 }, { "epoch": 0.4891159666618885, "grad_norm": 0.28338170051574707, "learning_rate": 0.00022035478025776386, "loss": 4.6962, "step": 23885 }, { "epoch": 0.4892183564392931, "grad_norm": 0.2796652019023895, "learning_rate": 0.00022030010889600939, "loss": 4.7119, "step": 23890 }, { "epoch": 0.48932074621669774, "grad_norm": 0.2485644519329071, "learning_rate": 0.00022024542556413668, "loss": 4.7181, "step": 23895 }, { "epoch": 0.48942313599410237, "grad_norm": 0.35051485896110535, "learning_rate": 0.00022019073027145669, "loss": 4.7348, "step": 23900 }, { "epoch": 0.489525525771507, "grad_norm": 0.25420159101486206, "learning_rate": 0.0002201360230272825, "loss": 4.7435, "step": 23905 }, { "epoch": 0.4896279155489116, "grad_norm": 0.25134554505348206, "learning_rate": 0.0002200813038409292, "loss": 4.7209, "step": 23910 }, { "epoch": 0.48973030532631623, "grad_norm": 0.2469230741262436, "learning_rate": 0.00022002657272171398, "loss": 4.7314, "step": 23915 }, { "epoch": 0.48983269510372085, "grad_norm": 0.2539413869380951, "learning_rate": 0.00021997182967895598, "loss": 4.7025, "step": 23920 }, { "epoch": 0.48993508488112547, "grad_norm": 0.2581627070903778, "learning_rate": 0.00021991707472197633, "loss": 4.7102, "step": 23925 }, { "epoch": 0.4900374746585301, "grad_norm": 0.2602821886539459, "learning_rate": 0.00021986230786009828, "loss": 4.6872, "step": 23930 }, { "epoch": 0.4901398644359347, "grad_norm": 0.251945823431015, "learning_rate": 0.0002198075291026471, "loss": 4.7519, "step": 23935 }, { "epoch": 0.49024225421333933, "grad_norm": 0.25386083126068115, "learning_rate": 0.00021975273845895003, "loss": 4.7025, "step": 23940 }, { "epoch": 0.49034464399074396, "grad_norm": 0.25990474224090576, "learning_rate": 0.0002196979359383364, "loss": 4.6574, "step": 23945 }, { "epoch": 0.4904470337681486, "grad_norm": 0.2560100853443146, "learning_rate": 0.0002196431215501375, "loss": 4.782, "step": 23950 }, { "epoch": 0.4905494235455532, "grad_norm": 0.2552954852581024, "learning_rate": 0.00021958829530368667, "loss": 4.7536, "step": 23955 }, { "epoch": 0.4906518133229578, "grad_norm": 0.24191182851791382, "learning_rate": 0.00021953345720831925, "loss": 4.7152, "step": 23960 }, { "epoch": 0.49075420310036244, "grad_norm": 0.2537633180618286, "learning_rate": 0.00021947860727337268, "loss": 4.6867, "step": 23965 }, { "epoch": 0.49085659287776706, "grad_norm": 0.24744947254657745, "learning_rate": 0.00021942374550818627, "loss": 4.7524, "step": 23970 }, { "epoch": 0.4909589826551717, "grad_norm": 0.27883878350257874, "learning_rate": 0.00021936887192210147, "loss": 4.7562, "step": 23975 }, { "epoch": 0.4910613724325763, "grad_norm": 0.2712114155292511, "learning_rate": 0.00021931398652446168, "loss": 4.7652, "step": 23980 }, { "epoch": 0.491163762209981, "grad_norm": 0.2643401026725769, "learning_rate": 0.00021925908932461237, "loss": 4.6906, "step": 23985 }, { "epoch": 0.4912661519873856, "grad_norm": 0.25378501415252686, "learning_rate": 0.00021920418033190097, "loss": 4.7286, "step": 23990 }, { "epoch": 0.4913685417647902, "grad_norm": 0.2989406883716583, "learning_rate": 0.0002191492595556769, "loss": 4.7293, "step": 23995 }, { "epoch": 0.49147093154219484, "grad_norm": 0.27138394117355347, "learning_rate": 0.00021909432700529158, "loss": 4.6571, "step": 24000 }, { "epoch": 0.49147093154219484, "eval_loss": 4.740637302398682, "eval_runtime": 17.1233, "eval_samples_per_second": 33.989, "eval_steps_per_second": 4.263, "step": 24000 }, { "epoch": 0.49157332131959947, "grad_norm": 0.24590237438678741, "learning_rate": 0.0002190393826900985, "loss": 4.722, "step": 24005 }, { "epoch": 0.4916757110970041, "grad_norm": 0.24786077439785004, "learning_rate": 0.0002189844266194532, "loss": 4.7176, "step": 24010 }, { "epoch": 0.4917781008744087, "grad_norm": 0.2574801445007324, "learning_rate": 0.00021892945880271303, "loss": 4.7583, "step": 24015 }, { "epoch": 0.49188049065181333, "grad_norm": 0.25858595967292786, "learning_rate": 0.00021887447924923752, "loss": 4.6977, "step": 24020 }, { "epoch": 0.49198288042921795, "grad_norm": 0.6016584038734436, "learning_rate": 0.0002188194879683881, "loss": 4.6965, "step": 24025 }, { "epoch": 0.49208527020662257, "grad_norm": 0.24698983132839203, "learning_rate": 0.0002187644849695282, "loss": 4.7379, "step": 24030 }, { "epoch": 0.4921876599840272, "grad_norm": 0.2544917166233063, "learning_rate": 0.00021870947026202331, "loss": 4.732, "step": 24035 }, { "epoch": 0.4922900497614318, "grad_norm": 0.24248680472373962, "learning_rate": 0.00021865444385524093, "loss": 4.6828, "step": 24040 }, { "epoch": 0.49239243953883644, "grad_norm": 0.2829208970069885, "learning_rate": 0.00021859940575855044, "loss": 4.711, "step": 24045 }, { "epoch": 0.49249482931624106, "grad_norm": 0.2585568130016327, "learning_rate": 0.00021854435598132324, "loss": 4.6618, "step": 24050 }, { "epoch": 0.4925972190936457, "grad_norm": 0.2694754898548126, "learning_rate": 0.0002184892945329328, "loss": 4.7447, "step": 24055 }, { "epoch": 0.4926996088710503, "grad_norm": 0.3631758689880371, "learning_rate": 0.0002184342214227545, "loss": 4.7033, "step": 24060 }, { "epoch": 0.4928019986484549, "grad_norm": 0.24882563948631287, "learning_rate": 0.00021837913666016576, "loss": 4.7405, "step": 24065 }, { "epoch": 0.49290438842585954, "grad_norm": 0.24229872226715088, "learning_rate": 0.00021832404025454592, "loss": 4.7549, "step": 24070 }, { "epoch": 0.49300677820326416, "grad_norm": 0.2614671289920807, "learning_rate": 0.00021826893221527636, "loss": 4.7364, "step": 24075 }, { "epoch": 0.4931091679806688, "grad_norm": 0.26160407066345215, "learning_rate": 0.00021821381255174042, "loss": 4.7489, "step": 24080 }, { "epoch": 0.49321155775807346, "grad_norm": 0.2651068866252899, "learning_rate": 0.0002181586812733234, "loss": 4.711, "step": 24085 }, { "epoch": 0.4933139475354781, "grad_norm": 0.2535269856452942, "learning_rate": 0.00021810353838941262, "loss": 4.7428, "step": 24090 }, { "epoch": 0.4934163373128827, "grad_norm": 0.3206854462623596, "learning_rate": 0.00021804838390939734, "loss": 4.7096, "step": 24095 }, { "epoch": 0.4935187270902873, "grad_norm": 0.2458593249320984, "learning_rate": 0.00021799321784266877, "loss": 4.713, "step": 24100 }, { "epoch": 0.49362111686769194, "grad_norm": 0.24413485825061798, "learning_rate": 0.00021793804019862024, "loss": 4.7451, "step": 24105 }, { "epoch": 0.49372350664509657, "grad_norm": 0.2457171082496643, "learning_rate": 0.00021788285098664682, "loss": 4.6979, "step": 24110 }, { "epoch": 0.4938258964225012, "grad_norm": 0.29445162415504456, "learning_rate": 0.0002178276502161457, "loss": 4.6508, "step": 24115 }, { "epoch": 0.4939282861999058, "grad_norm": 0.24881939589977264, "learning_rate": 0.00021777243789651603, "loss": 4.7094, "step": 24120 }, { "epoch": 0.49403067597731043, "grad_norm": 0.2543780207633972, "learning_rate": 0.00021771721403715896, "loss": 4.7056, "step": 24125 }, { "epoch": 0.49413306575471505, "grad_norm": 0.26519331336021423, "learning_rate": 0.00021766197864747742, "loss": 4.7269, "step": 24130 }, { "epoch": 0.49423545553211967, "grad_norm": 0.28295549750328064, "learning_rate": 0.00021760673173687657, "loss": 4.7403, "step": 24135 }, { "epoch": 0.4943378453095243, "grad_norm": 0.2623783349990845, "learning_rate": 0.00021755147331476327, "loss": 4.7187, "step": 24140 }, { "epoch": 0.4944402350869289, "grad_norm": 0.26957452297210693, "learning_rate": 0.00021749620339054653, "loss": 4.7374, "step": 24145 }, { "epoch": 0.49454262486433354, "grad_norm": 0.251097708940506, "learning_rate": 0.00021744092197363725, "loss": 4.7435, "step": 24150 }, { "epoch": 0.49464501464173816, "grad_norm": 0.24897147715091705, "learning_rate": 0.0002173856290734483, "loss": 4.7092, "step": 24155 }, { "epoch": 0.4947474044191428, "grad_norm": 0.27389004826545715, "learning_rate": 0.00021733032469939447, "loss": 4.763, "step": 24160 }, { "epoch": 0.4948497941965474, "grad_norm": 0.2514142096042633, "learning_rate": 0.00021727500886089255, "loss": 4.749, "step": 24165 }, { "epoch": 0.494952183973952, "grad_norm": 0.24971139430999756, "learning_rate": 0.00021721968156736122, "loss": 4.736, "step": 24170 }, { "epoch": 0.49505457375135664, "grad_norm": 0.2452549934387207, "learning_rate": 0.0002171643428282212, "loss": 4.732, "step": 24175 }, { "epoch": 0.49515696352876126, "grad_norm": 0.2623090445995331, "learning_rate": 0.00021710899265289514, "loss": 4.6775, "step": 24180 }, { "epoch": 0.49525935330616594, "grad_norm": 0.24953871965408325, "learning_rate": 0.0002170536310508075, "loss": 4.7024, "step": 24185 }, { "epoch": 0.49536174308357056, "grad_norm": 0.2427460104227066, "learning_rate": 0.0002169982580313849, "loss": 4.725, "step": 24190 }, { "epoch": 0.4954641328609752, "grad_norm": 0.24066436290740967, "learning_rate": 0.00021694287360405574, "loss": 4.7284, "step": 24195 }, { "epoch": 0.4955665226383798, "grad_norm": 0.25126057863235474, "learning_rate": 0.00021688747777825042, "loss": 4.7395, "step": 24200 }, { "epoch": 0.4956689124157844, "grad_norm": 0.3491162061691284, "learning_rate": 0.00021683207056340134, "loss": 4.7523, "step": 24205 }, { "epoch": 0.49577130219318905, "grad_norm": 0.27271589636802673, "learning_rate": 0.00021677665196894269, "loss": 4.7545, "step": 24210 }, { "epoch": 0.49587369197059367, "grad_norm": 0.2561257779598236, "learning_rate": 0.0002167212220043107, "loss": 4.7135, "step": 24215 }, { "epoch": 0.4959760817479983, "grad_norm": 0.2725783586502075, "learning_rate": 0.00021666578067894364, "loss": 4.7224, "step": 24220 }, { "epoch": 0.4960784715254029, "grad_norm": 0.2379966825246811, "learning_rate": 0.00021661032800228145, "loss": 4.6912, "step": 24225 }, { "epoch": 0.49618086130280753, "grad_norm": 0.2789928913116455, "learning_rate": 0.00021655486398376624, "loss": 4.7488, "step": 24230 }, { "epoch": 0.49628325108021215, "grad_norm": 0.26592832803726196, "learning_rate": 0.0002164993886328419, "loss": 4.7053, "step": 24235 }, { "epoch": 0.4963856408576168, "grad_norm": 0.268175333738327, "learning_rate": 0.0002164439019589544, "loss": 4.6547, "step": 24240 }, { "epoch": 0.4964880306350214, "grad_norm": 0.2482205480337143, "learning_rate": 0.00021638840397155147, "loss": 4.738, "step": 24245 }, { "epoch": 0.496590420412426, "grad_norm": 0.25238728523254395, "learning_rate": 0.0002163328946800829, "loss": 4.7493, "step": 24250 }, { "epoch": 0.49669281018983064, "grad_norm": 0.24435508251190186, "learning_rate": 0.0002162773740940003, "loss": 4.681, "step": 24255 }, { "epoch": 0.49679519996723526, "grad_norm": 0.24986614286899567, "learning_rate": 0.00021622184222275725, "loss": 4.7567, "step": 24260 }, { "epoch": 0.4968975897446399, "grad_norm": 0.25959527492523193, "learning_rate": 0.00021616629907580934, "loss": 4.6976, "step": 24265 }, { "epoch": 0.4969999795220445, "grad_norm": 0.24627871811389923, "learning_rate": 0.00021611074466261392, "loss": 4.7185, "step": 24270 }, { "epoch": 0.4971023692994491, "grad_norm": 0.2620392441749573, "learning_rate": 0.0002160551789926304, "loss": 4.7508, "step": 24275 }, { "epoch": 0.49720475907685374, "grad_norm": 0.26908978819847107, "learning_rate": 0.0002159996020753199, "loss": 4.7035, "step": 24280 }, { "epoch": 0.4973071488542584, "grad_norm": 0.25050944089889526, "learning_rate": 0.00021594401392014572, "loss": 4.7076, "step": 24285 }, { "epoch": 0.49740953863166304, "grad_norm": 0.26778820157051086, "learning_rate": 0.00021588841453657295, "loss": 4.7205, "step": 24290 }, { "epoch": 0.49751192840906766, "grad_norm": 0.24914218485355377, "learning_rate": 0.00021583280393406852, "loss": 4.6853, "step": 24295 }, { "epoch": 0.4976143181864723, "grad_norm": 0.2422572374343872, "learning_rate": 0.00021577718212210145, "loss": 4.7562, "step": 24300 }, { "epoch": 0.4977167079638769, "grad_norm": 0.26418182253837585, "learning_rate": 0.0002157215491101424, "loss": 4.7084, "step": 24305 }, { "epoch": 0.4978190977412815, "grad_norm": 0.26421383023262024, "learning_rate": 0.00021566590490766424, "loss": 4.7017, "step": 24310 }, { "epoch": 0.49792148751868615, "grad_norm": 0.2425071895122528, "learning_rate": 0.00021561024952414156, "loss": 4.767, "step": 24315 }, { "epoch": 0.49802387729609077, "grad_norm": 0.2629650831222534, "learning_rate": 0.00021555458296905087, "loss": 4.6466, "step": 24320 }, { "epoch": 0.4981262670734954, "grad_norm": 0.28130167722702026, "learning_rate": 0.00021549890525187062, "loss": 4.6324, "step": 24325 }, { "epoch": 0.4982286568509, "grad_norm": 0.2503277659416199, "learning_rate": 0.00021544321638208115, "loss": 4.7381, "step": 24330 }, { "epoch": 0.49833104662830463, "grad_norm": 0.2546005845069885, "learning_rate": 0.0002153875163691647, "loss": 4.6521, "step": 24335 }, { "epoch": 0.49843343640570925, "grad_norm": 3.1699318885803223, "learning_rate": 0.0002153318052226054, "loss": 4.7349, "step": 24340 }, { "epoch": 0.4985358261831139, "grad_norm": 0.25961750745773315, "learning_rate": 0.0002152760829518893, "loss": 4.6934, "step": 24345 }, { "epoch": 0.4986382159605185, "grad_norm": 0.2727004587650299, "learning_rate": 0.00021522034956650434, "loss": 4.7158, "step": 24350 }, { "epoch": 0.4987406057379231, "grad_norm": 0.25228407979011536, "learning_rate": 0.00021516460507594025, "loss": 4.7123, "step": 24355 }, { "epoch": 0.49884299551532774, "grad_norm": 0.2500511109828949, "learning_rate": 0.00021510884948968884, "loss": 4.708, "step": 24360 }, { "epoch": 0.49894538529273236, "grad_norm": 0.258484423160553, "learning_rate": 0.00021505308281724367, "loss": 4.7536, "step": 24365 }, { "epoch": 0.499047775070137, "grad_norm": 0.24826906621456146, "learning_rate": 0.0002149973050681002, "loss": 4.7293, "step": 24370 }, { "epoch": 0.4991501648475416, "grad_norm": 0.23662418127059937, "learning_rate": 0.00021494151625175585, "loss": 4.7134, "step": 24375 }, { "epoch": 0.4992525546249462, "grad_norm": 0.25076043605804443, "learning_rate": 0.0002148857163777098, "loss": 4.7352, "step": 24380 }, { "epoch": 0.49935494440235084, "grad_norm": 0.2418327033519745, "learning_rate": 0.0002148299054554633, "loss": 4.7292, "step": 24385 }, { "epoch": 0.4994573341797555, "grad_norm": 0.25087326765060425, "learning_rate": 0.0002147740834945193, "loss": 4.7174, "step": 24390 }, { "epoch": 0.49955972395716014, "grad_norm": 0.282532274723053, "learning_rate": 0.00021471825050438267, "loss": 4.6723, "step": 24395 }, { "epoch": 0.49966211373456476, "grad_norm": 0.252346009016037, "learning_rate": 0.00021466240649456022, "loss": 4.733, "step": 24400 }, { "epoch": 0.4997645035119694, "grad_norm": 0.2513236701488495, "learning_rate": 0.00021460655147456065, "loss": 4.7049, "step": 24405 }, { "epoch": 0.499866893289374, "grad_norm": 0.2769632339477539, "learning_rate": 0.00021455068545389442, "loss": 4.6374, "step": 24410 }, { "epoch": 0.4999692830667786, "grad_norm": 0.24667078256607056, "learning_rate": 0.000214494808442074, "loss": 4.6938, "step": 24415 }, { "epoch": 0.5000716728441832, "grad_norm": 0.2368348091840744, "learning_rate": 0.00021443892044861355, "loss": 4.6575, "step": 24420 }, { "epoch": 0.5001740626215878, "grad_norm": 0.24722762405872345, "learning_rate": 0.00021438302148302931, "loss": 4.6666, "step": 24425 }, { "epoch": 0.5002764523989924, "grad_norm": 0.26125872135162354, "learning_rate": 0.0002143271115548393, "loss": 4.7624, "step": 24430 }, { "epoch": 0.500378842176397, "grad_norm": 0.24780777096748352, "learning_rate": 0.00021427119067356337, "loss": 4.7497, "step": 24435 }, { "epoch": 0.5004812319538018, "grad_norm": 0.25851914286613464, "learning_rate": 0.00021421525884872327, "loss": 4.6976, "step": 24440 }, { "epoch": 0.5005836217312064, "grad_norm": 0.2696174085140228, "learning_rate": 0.00021415931608984254, "loss": 4.6988, "step": 24445 }, { "epoch": 0.500686011508611, "grad_norm": 0.2505180835723877, "learning_rate": 0.00021410336240644672, "loss": 4.7444, "step": 24450 }, { "epoch": 0.5007884012860156, "grad_norm": 0.2612415552139282, "learning_rate": 0.00021404739780806313, "loss": 4.6558, "step": 24455 }, { "epoch": 0.5008907910634203, "grad_norm": 0.26742544770240784, "learning_rate": 0.00021399142230422096, "loss": 4.7245, "step": 24460 }, { "epoch": 0.5009931808408249, "grad_norm": 0.27145445346832275, "learning_rate": 0.0002139354359044512, "loss": 4.7398, "step": 24465 }, { "epoch": 0.5010955706182295, "grad_norm": 0.268049418926239, "learning_rate": 0.00021387943861828684, "loss": 4.7316, "step": 24470 }, { "epoch": 0.5011979603956341, "grad_norm": 0.2802181839942932, "learning_rate": 0.00021382343045526253, "loss": 4.7352, "step": 24475 }, { "epoch": 0.5013003501730388, "grad_norm": 0.25395095348358154, "learning_rate": 0.00021376741142491493, "loss": 4.7198, "step": 24480 }, { "epoch": 0.5014027399504434, "grad_norm": 0.27157557010650635, "learning_rate": 0.00021371138153678253, "loss": 4.7411, "step": 24485 }, { "epoch": 0.501505129727848, "grad_norm": 0.252315491437912, "learning_rate": 0.0002136553408004056, "loss": 4.7233, "step": 24490 }, { "epoch": 0.5016075195052526, "grad_norm": 0.2508240044116974, "learning_rate": 0.00021359928922532625, "loss": 4.7015, "step": 24495 }, { "epoch": 0.5017099092826572, "grad_norm": 0.27361515164375305, "learning_rate": 0.00021354322682108853, "loss": 4.7081, "step": 24500 }, { "epoch": 0.5018122990600619, "grad_norm": 0.2511800527572632, "learning_rate": 0.00021348715359723825, "loss": 4.7356, "step": 24505 }, { "epoch": 0.5019146888374665, "grad_norm": 0.25007495284080505, "learning_rate": 0.00021343106956332313, "loss": 4.7204, "step": 24510 }, { "epoch": 0.5020170786148711, "grad_norm": 0.2599777281284332, "learning_rate": 0.00021337497472889263, "loss": 4.7074, "step": 24515 }, { "epoch": 0.5021194683922757, "grad_norm": 0.26869091391563416, "learning_rate": 0.00021331886910349815, "loss": 4.7148, "step": 24520 }, { "epoch": 0.5022218581696803, "grad_norm": 0.27923208475112915, "learning_rate": 0.00021326275269669296, "loss": 4.655, "step": 24525 }, { "epoch": 0.502324247947085, "grad_norm": 0.2597445547580719, "learning_rate": 0.000213206625518032, "loss": 4.7167, "step": 24530 }, { "epoch": 0.5024266377244896, "grad_norm": 0.3291465938091278, "learning_rate": 0.00021315048757707215, "loss": 4.7631, "step": 24535 }, { "epoch": 0.5025290275018942, "grad_norm": 0.25353655219078064, "learning_rate": 0.0002130943388833721, "loss": 4.7166, "step": 24540 }, { "epoch": 0.5026314172792988, "grad_norm": 0.24162156879901886, "learning_rate": 0.00021303817944649242, "loss": 4.7468, "step": 24545 }, { "epoch": 0.5027338070567035, "grad_norm": 0.24550728499889374, "learning_rate": 0.00021298200927599546, "loss": 4.7383, "step": 24550 }, { "epoch": 0.5028361968341081, "grad_norm": 0.2765519320964813, "learning_rate": 0.00021292582838144546, "loss": 4.7263, "step": 24555 }, { "epoch": 0.5029385866115127, "grad_norm": 0.2508819103240967, "learning_rate": 0.00021286963677240832, "loss": 4.6969, "step": 24560 }, { "epoch": 0.5030409763889173, "grad_norm": 0.2590639889240265, "learning_rate": 0.00021281343445845197, "loss": 4.7415, "step": 24565 }, { "epoch": 0.5031433661663219, "grad_norm": 0.26587069034576416, "learning_rate": 0.00021275722144914607, "loss": 4.7175, "step": 24570 }, { "epoch": 0.5032457559437266, "grad_norm": 0.25631362199783325, "learning_rate": 0.00021270099775406206, "loss": 4.7137, "step": 24575 }, { "epoch": 0.5033481457211312, "grad_norm": 0.261626660823822, "learning_rate": 0.0002126447633827733, "loss": 4.6867, "step": 24580 }, { "epoch": 0.5034505354985358, "grad_norm": 0.2893737256526947, "learning_rate": 0.00021258851834485485, "loss": 4.7523, "step": 24585 }, { "epoch": 0.5035529252759404, "grad_norm": 0.27564457058906555, "learning_rate": 0.00021253226264988367, "loss": 4.7037, "step": 24590 }, { "epoch": 0.503655315053345, "grad_norm": 0.26541513204574585, "learning_rate": 0.00021247599630743857, "loss": 4.645, "step": 24595 }, { "epoch": 0.5037577048307497, "grad_norm": 0.24438264966011047, "learning_rate": 0.00021241971932710004, "loss": 4.7093, "step": 24600 }, { "epoch": 0.5038600946081543, "grad_norm": 0.2826894223690033, "learning_rate": 0.00021236343171845048, "loss": 4.7249, "step": 24605 }, { "epoch": 0.5039624843855589, "grad_norm": 0.2549968957901001, "learning_rate": 0.00021230713349107406, "loss": 4.708, "step": 24610 }, { "epoch": 0.5040648741629635, "grad_norm": 0.24997398257255554, "learning_rate": 0.00021225082465455684, "loss": 4.6947, "step": 24615 }, { "epoch": 0.5041672639403681, "grad_norm": 0.2487078458070755, "learning_rate": 0.0002121945052184865, "loss": 4.7086, "step": 24620 }, { "epoch": 0.5042696537177728, "grad_norm": 0.244797483086586, "learning_rate": 0.00021213817519245276, "loss": 4.6922, "step": 24625 }, { "epoch": 0.5043720434951774, "grad_norm": 0.24336354434490204, "learning_rate": 0.00021208183458604698, "loss": 4.7448, "step": 24630 }, { "epoch": 0.504474433272582, "grad_norm": 0.2476019561290741, "learning_rate": 0.00021202548340886235, "loss": 4.6856, "step": 24635 }, { "epoch": 0.5045768230499867, "grad_norm": 0.2573014199733734, "learning_rate": 0.00021196912167049397, "loss": 4.7023, "step": 24640 }, { "epoch": 0.5046792128273914, "grad_norm": 0.26645344495773315, "learning_rate": 0.00021191274938053853, "loss": 4.705, "step": 24645 }, { "epoch": 0.504781602604796, "grad_norm": 0.25060755014419556, "learning_rate": 0.0002118563665485947, "loss": 4.6944, "step": 24650 }, { "epoch": 0.5048839923822006, "grad_norm": 0.245570108294487, "learning_rate": 0.00021179997318426287, "loss": 4.7521, "step": 24655 }, { "epoch": 0.5049863821596052, "grad_norm": 0.2431022971868515, "learning_rate": 0.00021174356929714523, "loss": 4.7062, "step": 24660 }, { "epoch": 0.5050887719370099, "grad_norm": 0.25514674186706543, "learning_rate": 0.00021168715489684582, "loss": 4.6733, "step": 24665 }, { "epoch": 0.5051911617144145, "grad_norm": 0.25629234313964844, "learning_rate": 0.00021163072999297032, "loss": 4.7279, "step": 24670 }, { "epoch": 0.5052935514918191, "grad_norm": 0.2432234287261963, "learning_rate": 0.00021157429459512636, "loss": 4.7288, "step": 24675 }, { "epoch": 0.5053959412692237, "grad_norm": 0.2488624006509781, "learning_rate": 0.00021151784871292322, "loss": 4.7273, "step": 24680 }, { "epoch": 0.5054983310466283, "grad_norm": 0.2719564139842987, "learning_rate": 0.00021146139235597212, "loss": 4.6633, "step": 24685 }, { "epoch": 0.505600720824033, "grad_norm": 0.26692262291908264, "learning_rate": 0.0002114049255338859, "loss": 4.7522, "step": 24690 }, { "epoch": 0.5057031106014376, "grad_norm": 0.26064640283584595, "learning_rate": 0.00021134844825627935, "loss": 4.7518, "step": 24695 }, { "epoch": 0.5058055003788422, "grad_norm": 0.2623884677886963, "learning_rate": 0.00021129196053276892, "loss": 4.6863, "step": 24700 }, { "epoch": 0.5059078901562468, "grad_norm": 0.2715488374233246, "learning_rate": 0.0002112354623729728, "loss": 4.7502, "step": 24705 }, { "epoch": 0.5060102799336514, "grad_norm": 0.2498241662979126, "learning_rate": 0.00021117895378651113, "loss": 4.6931, "step": 24710 }, { "epoch": 0.5061126697110561, "grad_norm": 0.26348575949668884, "learning_rate": 0.00021112243478300566, "loss": 4.6818, "step": 24715 }, { "epoch": 0.5062150594884607, "grad_norm": 0.2448241412639618, "learning_rate": 0.00021106590537207999, "loss": 4.6994, "step": 24720 }, { "epoch": 0.5063174492658653, "grad_norm": 0.2700660228729248, "learning_rate": 0.00021100936556335946, "loss": 4.729, "step": 24725 }, { "epoch": 0.5064198390432699, "grad_norm": 0.25830090045928955, "learning_rate": 0.00021095281536647126, "loss": 4.6994, "step": 24730 }, { "epoch": 0.5065222288206745, "grad_norm": 0.2823108434677124, "learning_rate": 0.00021089625479104424, "loss": 4.6936, "step": 24735 }, { "epoch": 0.5066246185980792, "grad_norm": 0.27551400661468506, "learning_rate": 0.00021083968384670907, "loss": 4.6821, "step": 24740 }, { "epoch": 0.5067270083754838, "grad_norm": 0.25167158246040344, "learning_rate": 0.00021078310254309815, "loss": 4.6748, "step": 24745 }, { "epoch": 0.5068293981528884, "grad_norm": 0.28884032368659973, "learning_rate": 0.0002107265108898457, "loss": 4.7889, "step": 24750 }, { "epoch": 0.506931787930293, "grad_norm": 0.26263895630836487, "learning_rate": 0.0002106699088965877, "loss": 4.7195, "step": 24755 }, { "epoch": 0.5070341777076977, "grad_norm": 0.26021355390548706, "learning_rate": 0.00021061329657296184, "loss": 4.7819, "step": 24760 }, { "epoch": 0.5071365674851023, "grad_norm": 0.2741910517215729, "learning_rate": 0.00021055667392860764, "loss": 4.7208, "step": 24765 }, { "epoch": 0.5072389572625069, "grad_norm": 0.3258890211582184, "learning_rate": 0.00021050004097316623, "loss": 4.6659, "step": 24770 }, { "epoch": 0.5073413470399115, "grad_norm": 0.2708551287651062, "learning_rate": 0.00021044339771628072, "loss": 4.7837, "step": 24775 }, { "epoch": 0.5074437368173161, "grad_norm": 0.26590174436569214, "learning_rate": 0.0002103867441675958, "loss": 4.7363, "step": 24780 }, { "epoch": 0.5075461265947208, "grad_norm": 0.25326523184776306, "learning_rate": 0.00021033008033675794, "loss": 4.726, "step": 24785 }, { "epoch": 0.5076485163721254, "grad_norm": 0.2555302381515503, "learning_rate": 0.00021027340623341542, "loss": 4.6985, "step": 24790 }, { "epoch": 0.50775090614953, "grad_norm": 0.25960206985473633, "learning_rate": 0.00021021672186721826, "loss": 4.7171, "step": 24795 }, { "epoch": 0.5078532959269346, "grad_norm": 0.27280300855636597, "learning_rate": 0.00021016002724781808, "loss": 4.6646, "step": 24800 }, { "epoch": 0.5079556857043392, "grad_norm": 0.28094059228897095, "learning_rate": 0.00021010332238486853, "loss": 4.7707, "step": 24805 }, { "epoch": 0.5080580754817439, "grad_norm": 0.26008355617523193, "learning_rate": 0.0002100466072880248, "loss": 4.7649, "step": 24810 }, { "epoch": 0.5081604652591485, "grad_norm": 0.2612993121147156, "learning_rate": 0.00020998988196694378, "loss": 4.6929, "step": 24815 }, { "epoch": 0.5082628550365531, "grad_norm": 0.2467452585697174, "learning_rate": 0.00020993314643128424, "loss": 4.7253, "step": 24820 }, { "epoch": 0.5083652448139577, "grad_norm": 0.28310737013816833, "learning_rate": 0.00020987640069070667, "loss": 4.6759, "step": 24825 }, { "epoch": 0.5084676345913623, "grad_norm": 0.270855575799942, "learning_rate": 0.0002098196447548732, "loss": 4.699, "step": 24830 }, { "epoch": 0.508570024368767, "grad_norm": 0.2647227644920349, "learning_rate": 0.0002097628786334478, "loss": 4.7276, "step": 24835 }, { "epoch": 0.5086724141461716, "grad_norm": 0.28125500679016113, "learning_rate": 0.0002097061023360961, "loss": 4.7177, "step": 24840 }, { "epoch": 0.5087748039235763, "grad_norm": 0.24579207599163055, "learning_rate": 0.00020964931587248557, "loss": 4.7113, "step": 24845 }, { "epoch": 0.508877193700981, "grad_norm": 0.2567944824695587, "learning_rate": 0.00020959251925228528, "loss": 4.7238, "step": 24850 }, { "epoch": 0.5089795834783856, "grad_norm": 0.2535143196582794, "learning_rate": 0.00020953571248516606, "loss": 4.7567, "step": 24855 }, { "epoch": 0.5090819732557902, "grad_norm": 0.2610457241535187, "learning_rate": 0.00020947889558080052, "loss": 4.7, "step": 24860 }, { "epoch": 0.5091843630331948, "grad_norm": 0.24878627061843872, "learning_rate": 0.00020942206854886303, "loss": 4.7271, "step": 24865 }, { "epoch": 0.5092867528105994, "grad_norm": 0.2622509002685547, "learning_rate": 0.00020936523139902948, "loss": 4.6982, "step": 24870 }, { "epoch": 0.509389142588004, "grad_norm": 0.26678067445755005, "learning_rate": 0.0002093083841409778, "loss": 4.6707, "step": 24875 }, { "epoch": 0.5094915323654087, "grad_norm": 0.24062874913215637, "learning_rate": 0.00020925152678438736, "loss": 4.6962, "step": 24880 }, { "epoch": 0.5095939221428133, "grad_norm": 0.24843214452266693, "learning_rate": 0.0002091946593389394, "loss": 4.7047, "step": 24885 }, { "epoch": 0.5096963119202179, "grad_norm": 0.25527897477149963, "learning_rate": 0.0002091377818143168, "loss": 4.709, "step": 24890 }, { "epoch": 0.5097987016976225, "grad_norm": 0.2503810524940491, "learning_rate": 0.00020908089422020426, "loss": 4.7533, "step": 24895 }, { "epoch": 0.5099010914750272, "grad_norm": 0.23670713603496552, "learning_rate": 0.000209023996566288, "loss": 4.7076, "step": 24900 }, { "epoch": 0.5100034812524318, "grad_norm": 0.24563787877559662, "learning_rate": 0.00020896708886225624, "loss": 4.6996, "step": 24905 }, { "epoch": 0.5101058710298364, "grad_norm": 0.27107053995132446, "learning_rate": 0.00020891017111779862, "loss": 4.6914, "step": 24910 }, { "epoch": 0.510208260807241, "grad_norm": 0.25058692693710327, "learning_rate": 0.00020885324334260667, "loss": 4.6982, "step": 24915 }, { "epoch": 0.5103106505846456, "grad_norm": 0.2572046220302582, "learning_rate": 0.00020879630554637365, "loss": 4.6911, "step": 24920 }, { "epoch": 0.5104130403620503, "grad_norm": 0.26619139313697815, "learning_rate": 0.0002087393577387943, "loss": 4.7387, "step": 24925 }, { "epoch": 0.5105154301394549, "grad_norm": 0.24175913631916046, "learning_rate": 0.00020868239992956537, "loss": 4.7599, "step": 24930 }, { "epoch": 0.5106178199168595, "grad_norm": 0.24409683048725128, "learning_rate": 0.0002086254321283851, "loss": 4.6678, "step": 24935 }, { "epoch": 0.5107202096942641, "grad_norm": 0.26042482256889343, "learning_rate": 0.0002085684543449534, "loss": 4.7323, "step": 24940 }, { "epoch": 0.5108225994716687, "grad_norm": 0.284839928150177, "learning_rate": 0.00020851146658897214, "loss": 4.763, "step": 24945 }, { "epoch": 0.5109249892490734, "grad_norm": 0.284509539604187, "learning_rate": 0.00020845446887014465, "loss": 4.6373, "step": 24950 }, { "epoch": 0.511027379026478, "grad_norm": 0.2701762616634369, "learning_rate": 0.00020839746119817598, "loss": 4.7613, "step": 24955 }, { "epoch": 0.5111297688038826, "grad_norm": 0.26169511675834656, "learning_rate": 0.00020834044358277297, "loss": 4.7125, "step": 24960 }, { "epoch": 0.5112321585812872, "grad_norm": 0.34681040048599243, "learning_rate": 0.00020828341603364414, "loss": 4.7033, "step": 24965 }, { "epoch": 0.5113345483586919, "grad_norm": 0.26275140047073364, "learning_rate": 0.0002082263785604996, "loss": 4.7016, "step": 24970 }, { "epoch": 0.5114369381360965, "grad_norm": 0.4569779336452484, "learning_rate": 0.00020816933117305124, "loss": 4.6567, "step": 24975 }, { "epoch": 0.5115393279135011, "grad_norm": 0.27598029375076294, "learning_rate": 0.00020811227388101264, "loss": 4.6915, "step": 24980 }, { "epoch": 0.5116417176909057, "grad_norm": 0.2594294250011444, "learning_rate": 0.000208055206694099, "loss": 4.7079, "step": 24985 }, { "epoch": 0.5117441074683103, "grad_norm": 0.27375197410583496, "learning_rate": 0.0002079981296220273, "loss": 4.6944, "step": 24990 }, { "epoch": 0.511846497245715, "grad_norm": 0.26600924134254456, "learning_rate": 0.00020794104267451605, "loss": 4.7308, "step": 24995 }, { "epoch": 0.5119488870231196, "grad_norm": 0.27649980783462524, "learning_rate": 0.0002078839458612857, "loss": 4.7236, "step": 25000 }, { "epoch": 0.5119488870231196, "eval_loss": 4.726166725158691, "eval_runtime": 17.1335, "eval_samples_per_second": 33.969, "eval_steps_per_second": 4.261, "step": 25000 }, { "epoch": 0.5120512768005242, "grad_norm": 0.27099522948265076, "learning_rate": 0.0002078268391920581, "loss": 4.74, "step": 25005 }, { "epoch": 0.5121536665779288, "grad_norm": 0.2632291913032532, "learning_rate": 0.00020776972267655687, "loss": 4.7022, "step": 25010 }, { "epoch": 0.5122560563553334, "grad_norm": 0.3414386212825775, "learning_rate": 0.00020771259632450747, "loss": 4.6967, "step": 25015 }, { "epoch": 0.5123584461327381, "grad_norm": 0.2538946568965912, "learning_rate": 0.0002076554601456368, "loss": 4.6859, "step": 25020 }, { "epoch": 0.5124608359101427, "grad_norm": 0.2705353796482086, "learning_rate": 0.00020759831414967356, "loss": 4.6758, "step": 25025 }, { "epoch": 0.5125632256875473, "grad_norm": 0.2563190162181854, "learning_rate": 0.00020754115834634808, "loss": 4.6858, "step": 25030 }, { "epoch": 0.5126656154649519, "grad_norm": 0.2726873457431793, "learning_rate": 0.00020748399274539238, "loss": 4.6825, "step": 25035 }, { "epoch": 0.5127680052423566, "grad_norm": 0.24901781976222992, "learning_rate": 0.00020742681735654015, "loss": 4.6136, "step": 25040 }, { "epoch": 0.5128703950197612, "grad_norm": 0.28644707798957825, "learning_rate": 0.00020736963218952677, "loss": 4.7109, "step": 25045 }, { "epoch": 0.5129727847971659, "grad_norm": 0.2551325559616089, "learning_rate": 0.00020731243725408922, "loss": 4.6807, "step": 25050 }, { "epoch": 0.5130751745745705, "grad_norm": 0.27223217487335205, "learning_rate": 0.00020725523255996615, "loss": 4.7506, "step": 25055 }, { "epoch": 0.5131775643519751, "grad_norm": 0.25584861636161804, "learning_rate": 0.00020719801811689797, "loss": 4.7083, "step": 25060 }, { "epoch": 0.5132799541293798, "grad_norm": 0.2572309076786041, "learning_rate": 0.00020714079393462667, "loss": 4.7186, "step": 25065 }, { "epoch": 0.5133823439067844, "grad_norm": 0.2758883237838745, "learning_rate": 0.00020708356002289584, "loss": 4.6903, "step": 25070 }, { "epoch": 0.513484733684189, "grad_norm": 0.2584627866744995, "learning_rate": 0.00020702631639145087, "loss": 4.7015, "step": 25075 }, { "epoch": 0.5135871234615936, "grad_norm": 0.2779619097709656, "learning_rate": 0.00020696906305003862, "loss": 4.6788, "step": 25080 }, { "epoch": 0.5136895132389983, "grad_norm": 0.2768387198448181, "learning_rate": 0.0002069118000084079, "loss": 4.7122, "step": 25085 }, { "epoch": 0.5137919030164029, "grad_norm": 0.2851027846336365, "learning_rate": 0.00020685452727630882, "loss": 4.683, "step": 25090 }, { "epoch": 0.5138942927938075, "grad_norm": 0.25141647458076477, "learning_rate": 0.00020679724486349335, "loss": 4.6939, "step": 25095 }, { "epoch": 0.5139966825712121, "grad_norm": 0.2560679614543915, "learning_rate": 0.00020673995277971512, "loss": 4.7098, "step": 25100 }, { "epoch": 0.5140990723486167, "grad_norm": 0.24338766932487488, "learning_rate": 0.00020668265103472924, "loss": 4.7023, "step": 25105 }, { "epoch": 0.5142014621260214, "grad_norm": 0.26247096061706543, "learning_rate": 0.00020662533963829266, "loss": 4.7534, "step": 25110 }, { "epoch": 0.514303851903426, "grad_norm": 0.23718619346618652, "learning_rate": 0.0002065680186001639, "loss": 4.7087, "step": 25115 }, { "epoch": 0.5144062416808306, "grad_norm": 0.2554474472999573, "learning_rate": 0.00020651068793010305, "loss": 4.6975, "step": 25120 }, { "epoch": 0.5145086314582352, "grad_norm": 0.3297739326953888, "learning_rate": 0.00020645334763787194, "loss": 4.701, "step": 25125 }, { "epoch": 0.5146110212356398, "grad_norm": 0.28824537992477417, "learning_rate": 0.00020639599773323395, "loss": 4.6585, "step": 25130 }, { "epoch": 0.5147134110130445, "grad_norm": 0.29799553751945496, "learning_rate": 0.0002063386382259542, "loss": 4.7184, "step": 25135 }, { "epoch": 0.5148158007904491, "grad_norm": 0.2818523645401001, "learning_rate": 0.0002062812691257994, "loss": 4.7205, "step": 25140 }, { "epoch": 0.5149181905678537, "grad_norm": 0.5003705620765686, "learning_rate": 0.00020622389044253785, "loss": 4.7401, "step": 25145 }, { "epoch": 0.5150205803452583, "grad_norm": 0.24921785295009613, "learning_rate": 0.00020616650218593946, "loss": 4.7274, "step": 25150 }, { "epoch": 0.515122970122663, "grad_norm": 0.30534884333610535, "learning_rate": 0.00020610910436577595, "loss": 4.6842, "step": 25155 }, { "epoch": 0.5152253599000676, "grad_norm": 0.26093873381614685, "learning_rate": 0.00020605169699182044, "loss": 4.7124, "step": 25160 }, { "epoch": 0.5153277496774722, "grad_norm": 0.2568979263305664, "learning_rate": 0.00020599428007384783, "loss": 4.713, "step": 25165 }, { "epoch": 0.5154301394548768, "grad_norm": 0.25099727511405945, "learning_rate": 0.0002059368536216346, "loss": 4.679, "step": 25170 }, { "epoch": 0.5155325292322814, "grad_norm": 0.2550967037677765, "learning_rate": 0.0002058794176449588, "loss": 4.6942, "step": 25175 }, { "epoch": 0.515634919009686, "grad_norm": 0.2863609194755554, "learning_rate": 0.00020582197215360018, "loss": 4.7301, "step": 25180 }, { "epoch": 0.5157373087870907, "grad_norm": 0.2509368062019348, "learning_rate": 0.00020576451715734014, "loss": 4.7251, "step": 25185 }, { "epoch": 0.5158396985644953, "grad_norm": 0.26072463393211365, "learning_rate": 0.00020570705266596156, "loss": 4.7283, "step": 25190 }, { "epoch": 0.5159420883418999, "grad_norm": 0.24949073791503906, "learning_rate": 0.00020564957868924902, "loss": 4.6715, "step": 25195 }, { "epoch": 0.5160444781193045, "grad_norm": 0.27192243933677673, "learning_rate": 0.00020559209523698873, "loss": 4.7054, "step": 25200 }, { "epoch": 0.5161468678967092, "grad_norm": 0.2541794180870056, "learning_rate": 0.0002055346023189685, "loss": 4.7155, "step": 25205 }, { "epoch": 0.5162492576741138, "grad_norm": 0.24772867560386658, "learning_rate": 0.00020547709994497777, "loss": 4.7165, "step": 25210 }, { "epoch": 0.5163516474515184, "grad_norm": 0.25157585740089417, "learning_rate": 0.00020541958812480751, "loss": 4.7328, "step": 25215 }, { "epoch": 0.516454037228923, "grad_norm": 0.2582609951496124, "learning_rate": 0.0002053620668682504, "loss": 4.6885, "step": 25220 }, { "epoch": 0.5165564270063276, "grad_norm": 0.2692011594772339, "learning_rate": 0.00020530453618510064, "loss": 4.7216, "step": 25225 }, { "epoch": 0.5166588167837323, "grad_norm": 0.2550601363182068, "learning_rate": 0.00020524699608515416, "loss": 4.7391, "step": 25230 }, { "epoch": 0.5167612065611369, "grad_norm": 0.2598309814929962, "learning_rate": 0.00020518944657820827, "loss": 4.7313, "step": 25235 }, { "epoch": 0.5168635963385415, "grad_norm": 0.2678385078907013, "learning_rate": 0.00020513188767406216, "loss": 4.7061, "step": 25240 }, { "epoch": 0.5169659861159461, "grad_norm": 0.251507431268692, "learning_rate": 0.00020507431938251641, "loss": 4.729, "step": 25245 }, { "epoch": 0.5170683758933509, "grad_norm": 0.2598932981491089, "learning_rate": 0.0002050167417133733, "loss": 4.7031, "step": 25250 }, { "epoch": 0.5171707656707555, "grad_norm": 0.25043338537216187, "learning_rate": 0.00020495915467643664, "loss": 4.7563, "step": 25255 }, { "epoch": 0.5172731554481601, "grad_norm": 0.25743046402931213, "learning_rate": 0.00020490155828151185, "loss": 4.6892, "step": 25260 }, { "epoch": 0.5173755452255647, "grad_norm": 0.2507784962654114, "learning_rate": 0.0002048439525384061, "loss": 4.6957, "step": 25265 }, { "epoch": 0.5174779350029693, "grad_norm": 0.2602917551994324, "learning_rate": 0.00020478633745692784, "loss": 4.6799, "step": 25270 }, { "epoch": 0.517580324780374, "grad_norm": 0.32359644770622253, "learning_rate": 0.00020472871304688743, "loss": 4.7462, "step": 25275 }, { "epoch": 0.5176827145577786, "grad_norm": 0.27157193422317505, "learning_rate": 0.00020467107931809663, "loss": 4.7443, "step": 25280 }, { "epoch": 0.5177851043351832, "grad_norm": 0.2694609463214874, "learning_rate": 0.00020461343628036882, "loss": 4.7407, "step": 25285 }, { "epoch": 0.5178874941125878, "grad_norm": 0.25350046157836914, "learning_rate": 0.00020455578394351894, "loss": 4.6895, "step": 25290 }, { "epoch": 0.5179898838899925, "grad_norm": 0.2541326880455017, "learning_rate": 0.0002044981223173636, "loss": 4.7259, "step": 25295 }, { "epoch": 0.5180922736673971, "grad_norm": 0.2908807098865509, "learning_rate": 0.00020444045141172095, "loss": 4.7015, "step": 25300 }, { "epoch": 0.5181946634448017, "grad_norm": 0.24267639219760895, "learning_rate": 0.00020438277123641067, "loss": 4.6619, "step": 25305 }, { "epoch": 0.5182970532222063, "grad_norm": 0.2692250907421112, "learning_rate": 0.00020432508180125413, "loss": 4.6891, "step": 25310 }, { "epoch": 0.5183994429996109, "grad_norm": 0.6035641431808472, "learning_rate": 0.0002042673831160741, "loss": 4.7316, "step": 25315 }, { "epoch": 0.5185018327770156, "grad_norm": 0.2575273811817169, "learning_rate": 0.00020420967519069517, "loss": 4.7513, "step": 25320 }, { "epoch": 0.5186042225544202, "grad_norm": 0.25130707025527954, "learning_rate": 0.00020415195803494325, "loss": 4.7517, "step": 25325 }, { "epoch": 0.5187066123318248, "grad_norm": 0.2885972857475281, "learning_rate": 0.00020409423165864598, "loss": 4.6608, "step": 25330 }, { "epoch": 0.5188090021092294, "grad_norm": 0.25472113490104675, "learning_rate": 0.00020403649607163253, "loss": 4.7282, "step": 25335 }, { "epoch": 0.518911391886634, "grad_norm": 0.3163636326789856, "learning_rate": 0.0002039787512837336, "loss": 4.7619, "step": 25340 }, { "epoch": 0.5190137816640387, "grad_norm": 0.24546130001544952, "learning_rate": 0.00020392099730478158, "loss": 4.6859, "step": 25345 }, { "epoch": 0.5191161714414433, "grad_norm": 0.2689363658428192, "learning_rate": 0.00020386323414461026, "loss": 4.8244, "step": 25350 }, { "epoch": 0.5192185612188479, "grad_norm": 0.25374606251716614, "learning_rate": 0.00020380546181305513, "loss": 4.6985, "step": 25355 }, { "epoch": 0.5193209509962525, "grad_norm": 0.242715984582901, "learning_rate": 0.00020374768031995307, "loss": 4.7155, "step": 25360 }, { "epoch": 0.5194233407736571, "grad_norm": 0.2540392577648163, "learning_rate": 0.00020368988967514275, "loss": 4.6721, "step": 25365 }, { "epoch": 0.5195257305510618, "grad_norm": 0.2697451114654541, "learning_rate": 0.00020363208988846424, "loss": 4.6991, "step": 25370 }, { "epoch": 0.5196281203284664, "grad_norm": 0.24948923289775848, "learning_rate": 0.00020357428096975922, "loss": 4.7085, "step": 25375 }, { "epoch": 0.519730510105871, "grad_norm": 0.25771522521972656, "learning_rate": 0.0002035164629288709, "loss": 4.7562, "step": 25380 }, { "epoch": 0.5198328998832756, "grad_norm": 0.2981756627559662, "learning_rate": 0.00020345863577564402, "loss": 4.6578, "step": 25385 }, { "epoch": 0.5199352896606803, "grad_norm": 0.25592824816703796, "learning_rate": 0.00020340079951992497, "loss": 4.7013, "step": 25390 }, { "epoch": 0.5200376794380849, "grad_norm": 0.2738116979598999, "learning_rate": 0.0002033429541715616, "loss": 4.7512, "step": 25395 }, { "epoch": 0.5201400692154895, "grad_norm": 0.2536337375640869, "learning_rate": 0.00020328509974040332, "loss": 4.7228, "step": 25400 }, { "epoch": 0.5202424589928941, "grad_norm": 0.24680015444755554, "learning_rate": 0.0002032272362363011, "loss": 4.6811, "step": 25405 }, { "epoch": 0.5203448487702987, "grad_norm": 0.2532963454723358, "learning_rate": 0.0002031693636691075, "loss": 4.6936, "step": 25410 }, { "epoch": 0.5204472385477034, "grad_norm": 0.2937077283859253, "learning_rate": 0.00020311148204867652, "loss": 4.6856, "step": 25415 }, { "epoch": 0.520549628325108, "grad_norm": 0.2579612731933594, "learning_rate": 0.00020305359138486385, "loss": 4.7116, "step": 25420 }, { "epoch": 0.5206520181025126, "grad_norm": 0.27243170142173767, "learning_rate": 0.00020299569168752657, "loss": 4.6914, "step": 25425 }, { "epoch": 0.5207544078799172, "grad_norm": 0.25620657205581665, "learning_rate": 0.00020293778296652328, "loss": 4.6985, "step": 25430 }, { "epoch": 0.5208567976573218, "grad_norm": 0.2616143226623535, "learning_rate": 0.0002028798652317143, "loss": 4.6425, "step": 25435 }, { "epoch": 0.5209591874347265, "grad_norm": 0.30034929513931274, "learning_rate": 0.00020282193849296143, "loss": 4.7288, "step": 25440 }, { "epoch": 0.5210615772121311, "grad_norm": 0.25712305307388306, "learning_rate": 0.00020276400276012785, "loss": 4.7359, "step": 25445 }, { "epoch": 0.5211639669895357, "grad_norm": 0.2787604331970215, "learning_rate": 0.00020270605804307843, "loss": 4.6328, "step": 25450 }, { "epoch": 0.5212663567669404, "grad_norm": 0.26481419801712036, "learning_rate": 0.00020264810435167944, "loss": 4.6835, "step": 25455 }, { "epoch": 0.5213687465443451, "grad_norm": 0.2781488299369812, "learning_rate": 0.0002025901416957988, "loss": 4.7679, "step": 25460 }, { "epoch": 0.5214711363217497, "grad_norm": 0.29370707273483276, "learning_rate": 0.00020253217008530598, "loss": 4.7493, "step": 25465 }, { "epoch": 0.5215735260991543, "grad_norm": 0.28654757142066956, "learning_rate": 0.0002024741895300718, "loss": 4.7388, "step": 25470 }, { "epoch": 0.5216759158765589, "grad_norm": 0.2706935405731201, "learning_rate": 0.00020241620003996877, "loss": 4.6992, "step": 25475 }, { "epoch": 0.5217783056539635, "grad_norm": 0.27572545409202576, "learning_rate": 0.0002023582016248708, "loss": 4.6964, "step": 25480 }, { "epoch": 0.5218806954313682, "grad_norm": 0.2670837640762329, "learning_rate": 0.00020230019429465343, "loss": 4.7087, "step": 25485 }, { "epoch": 0.5219830852087728, "grad_norm": 0.24532637000083923, "learning_rate": 0.00020224217805919363, "loss": 4.7042, "step": 25490 }, { "epoch": 0.5220854749861774, "grad_norm": 0.2530348300933838, "learning_rate": 0.00020218415292837, "loss": 4.7387, "step": 25495 }, { "epoch": 0.522187864763582, "grad_norm": 0.23660606145858765, "learning_rate": 0.00020212611891206247, "loss": 4.7322, "step": 25500 }, { "epoch": 0.5222902545409867, "grad_norm": 0.28594502806663513, "learning_rate": 0.0002020680760201526, "loss": 4.7184, "step": 25505 }, { "epoch": 0.5223926443183913, "grad_norm": 0.2688608169555664, "learning_rate": 0.00020201002426252358, "loss": 4.7371, "step": 25510 }, { "epoch": 0.5224950340957959, "grad_norm": 0.26218748092651367, "learning_rate": 0.00020195196364905982, "loss": 4.7225, "step": 25515 }, { "epoch": 0.5225974238732005, "grad_norm": 0.24824967980384827, "learning_rate": 0.0002018938941896475, "loss": 4.6963, "step": 25520 }, { "epoch": 0.5226998136506051, "grad_norm": 0.30935096740722656, "learning_rate": 0.0002018358158941742, "loss": 4.767, "step": 25525 }, { "epoch": 0.5228022034280098, "grad_norm": 0.27268844842910767, "learning_rate": 0.0002017777287725289, "loss": 4.6938, "step": 25530 }, { "epoch": 0.5229045932054144, "grad_norm": 0.2782441973686218, "learning_rate": 0.00020171963283460237, "loss": 4.6773, "step": 25535 }, { "epoch": 0.523006982982819, "grad_norm": 0.2511165142059326, "learning_rate": 0.00020166152809028654, "loss": 4.7178, "step": 25540 }, { "epoch": 0.5231093727602236, "grad_norm": 0.28301969170570374, "learning_rate": 0.00020160341454947514, "loss": 4.6913, "step": 25545 }, { "epoch": 0.5232117625376282, "grad_norm": 0.25940707325935364, "learning_rate": 0.0002015452922220631, "loss": 4.7002, "step": 25550 }, { "epoch": 0.5233141523150329, "grad_norm": 0.25021982192993164, "learning_rate": 0.00020148716111794717, "loss": 4.6538, "step": 25555 }, { "epoch": 0.5234165420924375, "grad_norm": 0.2511366009712219, "learning_rate": 0.00020142902124702534, "loss": 4.7248, "step": 25560 }, { "epoch": 0.5235189318698421, "grad_norm": 0.25236549973487854, "learning_rate": 0.00020137087261919723, "loss": 4.7055, "step": 25565 }, { "epoch": 0.5236213216472467, "grad_norm": 0.42735183238983154, "learning_rate": 0.00020131271524436388, "loss": 4.6687, "step": 25570 }, { "epoch": 0.5237237114246514, "grad_norm": 0.26809757947921753, "learning_rate": 0.00020125454913242778, "loss": 4.7019, "step": 25575 }, { "epoch": 0.523826101202056, "grad_norm": 0.3365897238254547, "learning_rate": 0.0002011963742932931, "loss": 4.6834, "step": 25580 }, { "epoch": 0.5239284909794606, "grad_norm": 0.2797791063785553, "learning_rate": 0.00020113819073686528, "loss": 4.7245, "step": 25585 }, { "epoch": 0.5240308807568652, "grad_norm": 0.25630486011505127, "learning_rate": 0.0002010799984730514, "loss": 4.7393, "step": 25590 }, { "epoch": 0.5241332705342698, "grad_norm": 0.26039737462997437, "learning_rate": 0.00020102179751175982, "loss": 4.7142, "step": 25595 }, { "epoch": 0.5242356603116745, "grad_norm": 0.2776634097099304, "learning_rate": 0.00020096358786290069, "loss": 4.676, "step": 25600 }, { "epoch": 0.5243380500890791, "grad_norm": 0.2771133482456207, "learning_rate": 0.00020090536953638535, "loss": 4.7037, "step": 25605 }, { "epoch": 0.5244404398664837, "grad_norm": 0.24834643304347992, "learning_rate": 0.00020084714254212677, "loss": 4.6868, "step": 25610 }, { "epoch": 0.5245428296438883, "grad_norm": 0.2764914929866791, "learning_rate": 0.00020078890689003936, "loss": 4.7024, "step": 25615 }, { "epoch": 0.5246452194212929, "grad_norm": 0.2530945837497711, "learning_rate": 0.00020073066259003897, "loss": 4.7163, "step": 25620 }, { "epoch": 0.5247476091986976, "grad_norm": 0.24554969370365143, "learning_rate": 0.000200672409652043, "loss": 4.692, "step": 25625 }, { "epoch": 0.5248499989761022, "grad_norm": 0.25487956404685974, "learning_rate": 0.00020061414808597027, "loss": 4.7367, "step": 25630 }, { "epoch": 0.5249523887535068, "grad_norm": 0.24713367223739624, "learning_rate": 0.00020055587790174107, "loss": 4.6828, "step": 25635 }, { "epoch": 0.5250547785309114, "grad_norm": 0.27729615569114685, "learning_rate": 0.00020049759910927716, "loss": 4.6629, "step": 25640 }, { "epoch": 0.525157168308316, "grad_norm": 0.25373226404190063, "learning_rate": 0.00020043931171850177, "loss": 4.6707, "step": 25645 }, { "epoch": 0.5252595580857207, "grad_norm": 0.2616504728794098, "learning_rate": 0.00020038101573933956, "loss": 4.745, "step": 25650 }, { "epoch": 0.5253619478631254, "grad_norm": 0.2694254219532013, "learning_rate": 0.0002003227111817167, "loss": 4.659, "step": 25655 }, { "epoch": 0.52546433764053, "grad_norm": 0.23335018754005432, "learning_rate": 0.0002002643980555609, "loss": 4.6865, "step": 25660 }, { "epoch": 0.5255667274179346, "grad_norm": 0.2629484236240387, "learning_rate": 0.00020020607637080104, "loss": 4.7246, "step": 25665 }, { "epoch": 0.5256691171953393, "grad_norm": 0.24341432750225067, "learning_rate": 0.0002001477461373678, "loss": 4.6833, "step": 25670 }, { "epoch": 0.5257715069727439, "grad_norm": 0.24989259243011475, "learning_rate": 0.0002000894073651932, "loss": 4.6567, "step": 25675 }, { "epoch": 0.5258738967501485, "grad_norm": 0.2554193437099457, "learning_rate": 0.00020003106006421056, "loss": 4.7217, "step": 25680 }, { "epoch": 0.5259762865275531, "grad_norm": 0.2519819140434265, "learning_rate": 0.0001999727042443548, "loss": 4.7194, "step": 25685 }, { "epoch": 0.5260786763049577, "grad_norm": 0.2581077218055725, "learning_rate": 0.00019991433991556234, "loss": 4.7198, "step": 25690 }, { "epoch": 0.5261810660823624, "grad_norm": 0.24898824095726013, "learning_rate": 0.0001998559670877709, "loss": 4.7215, "step": 25695 }, { "epoch": 0.526283455859767, "grad_norm": 0.24466998875141144, "learning_rate": 0.00019979758577091976, "loss": 4.6968, "step": 25700 }, { "epoch": 0.5263858456371716, "grad_norm": 0.28194326162338257, "learning_rate": 0.00019973919597494959, "loss": 4.6962, "step": 25705 }, { "epoch": 0.5264882354145762, "grad_norm": 0.2566097676753998, "learning_rate": 0.0001996807977098025, "loss": 4.6931, "step": 25710 }, { "epoch": 0.5265906251919809, "grad_norm": 0.23906411230564117, "learning_rate": 0.00019962239098542207, "loss": 4.685, "step": 25715 }, { "epoch": 0.5266930149693855, "grad_norm": 0.2488403469324112, "learning_rate": 0.00019956397581175333, "loss": 4.6711, "step": 25720 }, { "epoch": 0.5267954047467901, "grad_norm": 0.2994610369205475, "learning_rate": 0.00019950555219874273, "loss": 4.64, "step": 25725 }, { "epoch": 0.5268977945241947, "grad_norm": 0.26433423161506653, "learning_rate": 0.00019944712015633817, "loss": 4.6997, "step": 25730 }, { "epoch": 0.5270001843015993, "grad_norm": 0.26021307706832886, "learning_rate": 0.00019938867969448892, "loss": 4.7127, "step": 25735 }, { "epoch": 0.527102574079004, "grad_norm": 0.2602417767047882, "learning_rate": 0.00019933023082314573, "loss": 4.7322, "step": 25740 }, { "epoch": 0.5272049638564086, "grad_norm": 0.25909242033958435, "learning_rate": 0.0001992717735522609, "loss": 4.7043, "step": 25745 }, { "epoch": 0.5273073536338132, "grad_norm": 0.2980313301086426, "learning_rate": 0.00019921330789178794, "loss": 4.7074, "step": 25750 }, { "epoch": 0.5274097434112178, "grad_norm": 0.2667423188686371, "learning_rate": 0.00019915483385168196, "loss": 4.6994, "step": 25755 }, { "epoch": 0.5275121331886224, "grad_norm": 0.25480931997299194, "learning_rate": 0.00019909635144189932, "loss": 4.7004, "step": 25760 }, { "epoch": 0.5276145229660271, "grad_norm": 0.2579711973667145, "learning_rate": 0.00019903786067239807, "loss": 4.6608, "step": 25765 }, { "epoch": 0.5277169127434317, "grad_norm": 0.26290157437324524, "learning_rate": 0.00019897936155313748, "loss": 4.7055, "step": 25770 }, { "epoch": 0.5278193025208363, "grad_norm": 0.23719623684883118, "learning_rate": 0.00019892085409407826, "loss": 4.6872, "step": 25775 }, { "epoch": 0.5279216922982409, "grad_norm": 0.2714715600013733, "learning_rate": 0.00019886233830518257, "loss": 4.7071, "step": 25780 }, { "epoch": 0.5280240820756456, "grad_norm": 0.25175049901008606, "learning_rate": 0.000198803814196414, "loss": 4.6765, "step": 25785 }, { "epoch": 0.5281264718530502, "grad_norm": 0.2570139169692993, "learning_rate": 0.00019874528177773763, "loss": 4.7269, "step": 25790 }, { "epoch": 0.5282288616304548, "grad_norm": 0.2545175552368164, "learning_rate": 0.00019868674105911972, "loss": 4.6839, "step": 25795 }, { "epoch": 0.5283312514078594, "grad_norm": 0.24411524832248688, "learning_rate": 0.0001986281920505283, "loss": 4.7008, "step": 25800 }, { "epoch": 0.528433641185264, "grad_norm": 0.2793164849281311, "learning_rate": 0.00019856963476193236, "loss": 4.6892, "step": 25805 }, { "epoch": 0.5285360309626687, "grad_norm": 0.2509508728981018, "learning_rate": 0.00019851106920330277, "loss": 4.7038, "step": 25810 }, { "epoch": 0.5286384207400733, "grad_norm": 0.2581671476364136, "learning_rate": 0.00019845249538461145, "loss": 4.6889, "step": 25815 }, { "epoch": 0.5287408105174779, "grad_norm": 0.2587572932243347, "learning_rate": 0.00019839391331583192, "loss": 4.711, "step": 25820 }, { "epoch": 0.5288432002948825, "grad_norm": 0.24888691306114197, "learning_rate": 0.00019833532300693907, "loss": 4.6986, "step": 25825 }, { "epoch": 0.5289455900722871, "grad_norm": 0.2639336585998535, "learning_rate": 0.00019827672446790914, "loss": 4.7135, "step": 25830 }, { "epoch": 0.5290479798496918, "grad_norm": 0.2600058913230896, "learning_rate": 0.00019821811770871976, "loss": 4.6807, "step": 25835 }, { "epoch": 0.5291503696270964, "grad_norm": 0.2666216492652893, "learning_rate": 0.0001981595027393501, "loss": 4.6908, "step": 25840 }, { "epoch": 0.529252759404501, "grad_norm": 0.2627016305923462, "learning_rate": 0.00019810087956978055, "loss": 4.712, "step": 25845 }, { "epoch": 0.5293551491819056, "grad_norm": 0.27691221237182617, "learning_rate": 0.00019804224820999295, "loss": 4.7209, "step": 25850 }, { "epoch": 0.5294575389593102, "grad_norm": 0.24884915351867676, "learning_rate": 0.00019798360866997065, "loss": 4.6783, "step": 25855 }, { "epoch": 0.529559928736715, "grad_norm": 0.25666943192481995, "learning_rate": 0.00019792496095969827, "loss": 4.6969, "step": 25860 }, { "epoch": 0.5296623185141196, "grad_norm": 0.23978044092655182, "learning_rate": 0.00019786630508916183, "loss": 4.7119, "step": 25865 }, { "epoch": 0.5297647082915242, "grad_norm": 0.2427990585565567, "learning_rate": 0.0001978076410683488, "loss": 4.7305, "step": 25870 }, { "epoch": 0.5298670980689288, "grad_norm": 0.2595253884792328, "learning_rate": 0.00019774896890724798, "loss": 4.6835, "step": 25875 }, { "epoch": 0.5299694878463335, "grad_norm": 0.2647393047809601, "learning_rate": 0.00019769028861584955, "loss": 4.7471, "step": 25880 }, { "epoch": 0.5300718776237381, "grad_norm": 0.2709347903728485, "learning_rate": 0.00019763160020414518, "loss": 4.709, "step": 25885 }, { "epoch": 0.5301742674011427, "grad_norm": 0.2865538001060486, "learning_rate": 0.00019757290368212777, "loss": 4.7528, "step": 25890 }, { "epoch": 0.5302766571785473, "grad_norm": 0.2557232081890106, "learning_rate": 0.00019751419905979177, "loss": 4.7577, "step": 25895 }, { "epoch": 0.530379046955952, "grad_norm": 0.26313456892967224, "learning_rate": 0.00019745548634713276, "loss": 4.7332, "step": 25900 }, { "epoch": 0.5304814367333566, "grad_norm": 0.2618166506290436, "learning_rate": 0.00019739676555414795, "loss": 4.7336, "step": 25905 }, { "epoch": 0.5305838265107612, "grad_norm": 0.26226091384887695, "learning_rate": 0.0001973380366908359, "loss": 4.7233, "step": 25910 }, { "epoch": 0.5306862162881658, "grad_norm": 0.24330765008926392, "learning_rate": 0.00019727929976719633, "loss": 4.7354, "step": 25915 }, { "epoch": 0.5307886060655704, "grad_norm": 0.2726133465766907, "learning_rate": 0.00019722055479323057, "loss": 4.7441, "step": 25920 }, { "epoch": 0.530890995842975, "grad_norm": 0.2631099224090576, "learning_rate": 0.00019716180177894118, "loss": 4.7255, "step": 25925 }, { "epoch": 0.5309933856203797, "grad_norm": 0.28416886925697327, "learning_rate": 0.00019710304073433215, "loss": 4.6626, "step": 25930 }, { "epoch": 0.5310957753977843, "grad_norm": 0.25636976957321167, "learning_rate": 0.0001970442716694088, "loss": 4.6742, "step": 25935 }, { "epoch": 0.5311981651751889, "grad_norm": 0.25970378518104553, "learning_rate": 0.00019698549459417792, "loss": 4.6955, "step": 25940 }, { "epoch": 0.5313005549525935, "grad_norm": 0.26659950613975525, "learning_rate": 0.0001969267095186475, "loss": 4.7669, "step": 25945 }, { "epoch": 0.5314029447299982, "grad_norm": 0.3476724624633789, "learning_rate": 0.00019686791645282697, "loss": 4.6512, "step": 25950 }, { "epoch": 0.5315053345074028, "grad_norm": 0.2607884705066681, "learning_rate": 0.00019680911540672716, "loss": 4.6727, "step": 25955 }, { "epoch": 0.5316077242848074, "grad_norm": 0.3115827143192291, "learning_rate": 0.00019675030639036022, "loss": 4.7441, "step": 25960 }, { "epoch": 0.531710114062212, "grad_norm": 0.26234233379364014, "learning_rate": 0.00019669148941373966, "loss": 4.7156, "step": 25965 }, { "epoch": 0.5318125038396166, "grad_norm": 0.25020137429237366, "learning_rate": 0.0001966326644868803, "loss": 4.7013, "step": 25970 }, { "epoch": 0.5319148936170213, "grad_norm": 0.26293113827705383, "learning_rate": 0.00019657383161979846, "loss": 4.7357, "step": 25975 }, { "epoch": 0.5320172833944259, "grad_norm": 0.2511340081691742, "learning_rate": 0.00019651499082251167, "loss": 4.686, "step": 25980 }, { "epoch": 0.5321196731718305, "grad_norm": 0.26991066336631775, "learning_rate": 0.0001964561421050388, "loss": 4.6865, "step": 25985 }, { "epoch": 0.5322220629492351, "grad_norm": 0.26026806235313416, "learning_rate": 0.00019639728547740014, "loss": 4.7252, "step": 25990 }, { "epoch": 0.5323244527266398, "grad_norm": 0.2697601914405823, "learning_rate": 0.00019633842094961732, "loss": 4.6772, "step": 25995 }, { "epoch": 0.5324268425040444, "grad_norm": 0.27490803599357605, "learning_rate": 0.00019627954853171332, "loss": 4.6808, "step": 26000 }, { "epoch": 0.5324268425040444, "eval_loss": 4.715832233428955, "eval_runtime": 17.1217, "eval_samples_per_second": 33.992, "eval_steps_per_second": 4.264, "step": 26000 }, { "epoch": 0.532529232281449, "grad_norm": 0.2695653438568115, "learning_rate": 0.00019622066823371247, "loss": 4.7279, "step": 26005 }, { "epoch": 0.5326316220588536, "grad_norm": 0.2591971457004547, "learning_rate": 0.00019616178006564032, "loss": 4.7927, "step": 26010 }, { "epoch": 0.5327340118362582, "grad_norm": 0.26702797412872314, "learning_rate": 0.00019610288403752393, "loss": 4.6334, "step": 26015 }, { "epoch": 0.5328364016136629, "grad_norm": 0.26594123244285583, "learning_rate": 0.0001960439801593916, "loss": 4.7251, "step": 26020 }, { "epoch": 0.5329387913910675, "grad_norm": 0.25022926926612854, "learning_rate": 0.00019598506844127305, "loss": 4.6757, "step": 26025 }, { "epoch": 0.5330411811684721, "grad_norm": 0.26219940185546875, "learning_rate": 0.0001959261488931992, "loss": 4.7274, "step": 26030 }, { "epoch": 0.5331435709458767, "grad_norm": 0.26517900824546814, "learning_rate": 0.0001958672215252024, "loss": 4.6714, "step": 26035 }, { "epoch": 0.5332459607232813, "grad_norm": 0.24350258708000183, "learning_rate": 0.0001958082863473164, "loss": 4.7222, "step": 26040 }, { "epoch": 0.533348350500686, "grad_norm": 0.2614833414554596, "learning_rate": 0.00019574934336957604, "loss": 4.717, "step": 26045 }, { "epoch": 0.5334507402780906, "grad_norm": 0.2601943910121918, "learning_rate": 0.00019569039260201775, "loss": 4.74, "step": 26050 }, { "epoch": 0.5335531300554952, "grad_norm": 0.28414246439933777, "learning_rate": 0.00019563143405467915, "loss": 4.7131, "step": 26055 }, { "epoch": 0.5336555198328999, "grad_norm": 0.2536947727203369, "learning_rate": 0.00019557246773759923, "loss": 4.7276, "step": 26060 }, { "epoch": 0.5337579096103046, "grad_norm": 0.2547471523284912, "learning_rate": 0.00019551349366081822, "loss": 4.6709, "step": 26065 }, { "epoch": 0.5338602993877092, "grad_norm": 0.27495211362838745, "learning_rate": 0.00019545451183437777, "loss": 4.7397, "step": 26070 }, { "epoch": 0.5339626891651138, "grad_norm": 0.2624116837978363, "learning_rate": 0.00019539552226832086, "loss": 4.6874, "step": 26075 }, { "epoch": 0.5340650789425184, "grad_norm": 0.25751790404319763, "learning_rate": 0.0001953365249726917, "loss": 4.6688, "step": 26080 }, { "epoch": 0.534167468719923, "grad_norm": 0.2581915557384491, "learning_rate": 0.00019527751995753587, "loss": 4.7396, "step": 26085 }, { "epoch": 0.5342698584973277, "grad_norm": 0.26121985912323, "learning_rate": 0.0001952185072329002, "loss": 4.7678, "step": 26090 }, { "epoch": 0.5343722482747323, "grad_norm": 0.2548855245113373, "learning_rate": 0.00019515948680883297, "loss": 4.7034, "step": 26095 }, { "epoch": 0.5344746380521369, "grad_norm": 0.24417440593242645, "learning_rate": 0.00019510045869538365, "loss": 4.7148, "step": 26100 }, { "epoch": 0.5345770278295415, "grad_norm": 0.2615807056427002, "learning_rate": 0.0001950414229026031, "loss": 4.6572, "step": 26105 }, { "epoch": 0.5346794176069462, "grad_norm": 0.27709394693374634, "learning_rate": 0.00019498237944054337, "loss": 4.7087, "step": 26110 }, { "epoch": 0.5347818073843508, "grad_norm": 0.44070112705230713, "learning_rate": 0.00019492332831925785, "loss": 4.7618, "step": 26115 }, { "epoch": 0.5348841971617554, "grad_norm": 0.2976377308368683, "learning_rate": 0.00019486426954880143, "loss": 4.6753, "step": 26120 }, { "epoch": 0.53498658693916, "grad_norm": 0.241999089717865, "learning_rate": 0.0001948052031392301, "loss": 4.6477, "step": 26125 }, { "epoch": 0.5350889767165646, "grad_norm": 0.2862304449081421, "learning_rate": 0.0001947461291006011, "loss": 4.7064, "step": 26130 }, { "epoch": 0.5351913664939693, "grad_norm": 0.2612556219100952, "learning_rate": 0.0001946870474429731, "loss": 4.6663, "step": 26135 }, { "epoch": 0.5352937562713739, "grad_norm": 0.2638161778450012, "learning_rate": 0.00019462795817640612, "loss": 4.6824, "step": 26140 }, { "epoch": 0.5353961460487785, "grad_norm": 0.2660267949104309, "learning_rate": 0.0001945688613109613, "loss": 4.6729, "step": 26145 }, { "epoch": 0.5354985358261831, "grad_norm": 0.26595360040664673, "learning_rate": 0.00019450975685670122, "loss": 4.7145, "step": 26150 }, { "epoch": 0.5356009256035877, "grad_norm": 0.24708572030067444, "learning_rate": 0.00019445064482368965, "loss": 4.749, "step": 26155 }, { "epoch": 0.5357033153809924, "grad_norm": 0.26008662581443787, "learning_rate": 0.0001943915252219917, "loss": 4.7328, "step": 26160 }, { "epoch": 0.535805705158397, "grad_norm": 0.2674421966075897, "learning_rate": 0.0001943323980616738, "loss": 4.7129, "step": 26165 }, { "epoch": 0.5359080949358016, "grad_norm": 0.2631687521934509, "learning_rate": 0.0001942732633528036, "loss": 4.6784, "step": 26170 }, { "epoch": 0.5360104847132062, "grad_norm": 0.26723283529281616, "learning_rate": 0.00019421412110545008, "loss": 4.7509, "step": 26175 }, { "epoch": 0.5361128744906108, "grad_norm": 0.2441859096288681, "learning_rate": 0.00019415497132968353, "loss": 4.6971, "step": 26180 }, { "epoch": 0.5362152642680155, "grad_norm": 0.2712872624397278, "learning_rate": 0.00019409581403557533, "loss": 4.6796, "step": 26185 }, { "epoch": 0.5363176540454201, "grad_norm": 0.6035029292106628, "learning_rate": 0.0001940366492331985, "loss": 4.6581, "step": 26190 }, { "epoch": 0.5364200438228247, "grad_norm": 0.27305108308792114, "learning_rate": 0.000193977476932627, "loss": 4.6978, "step": 26195 }, { "epoch": 0.5365224336002293, "grad_norm": 0.2726241648197174, "learning_rate": 0.00019391829714393626, "loss": 4.7578, "step": 26200 }, { "epoch": 0.536624823377634, "grad_norm": 0.25249484181404114, "learning_rate": 0.00019385910987720282, "loss": 4.7464, "step": 26205 }, { "epoch": 0.5367272131550386, "grad_norm": 0.2539021074771881, "learning_rate": 0.00019379991514250472, "loss": 4.6595, "step": 26210 }, { "epoch": 0.5368296029324432, "grad_norm": 0.2574290633201599, "learning_rate": 0.0001937407129499211, "loss": 4.6557, "step": 26215 }, { "epoch": 0.5369319927098478, "grad_norm": 0.3053162097930908, "learning_rate": 0.00019368150330953242, "loss": 4.6835, "step": 26220 }, { "epoch": 0.5370343824872524, "grad_norm": 0.24399952590465546, "learning_rate": 0.00019362228623142039, "loss": 4.69, "step": 26225 }, { "epoch": 0.5371367722646571, "grad_norm": 0.2516840994358063, "learning_rate": 0.00019356306172566796, "loss": 4.7066, "step": 26230 }, { "epoch": 0.5372391620420617, "grad_norm": 0.26467376947402954, "learning_rate": 0.0001935038298023595, "loss": 4.7172, "step": 26235 }, { "epoch": 0.5373415518194663, "grad_norm": 0.25264087319374084, "learning_rate": 0.00019344459047158046, "loss": 4.7694, "step": 26240 }, { "epoch": 0.5374439415968709, "grad_norm": 0.2514682710170746, "learning_rate": 0.0001933853437434176, "loss": 4.7084, "step": 26245 }, { "epoch": 0.5375463313742755, "grad_norm": 0.2545846402645111, "learning_rate": 0.00019332608962795904, "loss": 4.7126, "step": 26250 }, { "epoch": 0.5376487211516802, "grad_norm": 0.254913866519928, "learning_rate": 0.00019326682813529392, "loss": 4.6826, "step": 26255 }, { "epoch": 0.5377511109290848, "grad_norm": 0.27251002192497253, "learning_rate": 0.000193207559275513, "loss": 4.669, "step": 26260 }, { "epoch": 0.5378535007064895, "grad_norm": 0.26807644963264465, "learning_rate": 0.00019314828305870796, "loss": 4.7196, "step": 26265 }, { "epoch": 0.5379558904838941, "grad_norm": 0.2559725046157837, "learning_rate": 0.00019308899949497187, "loss": 4.6049, "step": 26270 }, { "epoch": 0.5380582802612988, "grad_norm": 0.2622315585613251, "learning_rate": 0.0001930297085943991, "loss": 4.7031, "step": 26275 }, { "epoch": 0.5381606700387034, "grad_norm": 0.2548885941505432, "learning_rate": 0.00019297041036708517, "loss": 4.7116, "step": 26280 }, { "epoch": 0.538263059816108, "grad_norm": 0.2712334096431732, "learning_rate": 0.00019291110482312686, "loss": 4.6726, "step": 26285 }, { "epoch": 0.5383654495935126, "grad_norm": 0.34245362877845764, "learning_rate": 0.00019285179197262232, "loss": 4.7, "step": 26290 }, { "epoch": 0.5384678393709172, "grad_norm": 0.2666618824005127, "learning_rate": 0.00019279247182567077, "loss": 4.6762, "step": 26295 }, { "epoch": 0.5385702291483219, "grad_norm": 0.24612635374069214, "learning_rate": 0.0001927331443923727, "loss": 4.746, "step": 26300 }, { "epoch": 0.5386726189257265, "grad_norm": 0.2689967453479767, "learning_rate": 0.00019267380968283004, "loss": 4.7046, "step": 26305 }, { "epoch": 0.5387750087031311, "grad_norm": 0.25115466117858887, "learning_rate": 0.00019261446770714571, "loss": 4.6499, "step": 26310 }, { "epoch": 0.5388773984805357, "grad_norm": 0.2397141456604004, "learning_rate": 0.00019255511847542404, "loss": 4.6902, "step": 26315 }, { "epoch": 0.5389797882579404, "grad_norm": 0.2797822654247284, "learning_rate": 0.0001924957619977704, "loss": 4.7158, "step": 26320 }, { "epoch": 0.539082178035345, "grad_norm": 0.27893519401550293, "learning_rate": 0.00019243639828429163, "loss": 4.6728, "step": 26325 }, { "epoch": 0.5391845678127496, "grad_norm": 0.2753532826900482, "learning_rate": 0.00019237702734509568, "loss": 4.7019, "step": 26330 }, { "epoch": 0.5392869575901542, "grad_norm": 0.2707187533378601, "learning_rate": 0.0001923176491902917, "loss": 4.729, "step": 26335 }, { "epoch": 0.5393893473675588, "grad_norm": 0.26808810234069824, "learning_rate": 0.0001922582638299901, "loss": 4.6933, "step": 26340 }, { "epoch": 0.5394917371449635, "grad_norm": 0.2533009946346283, "learning_rate": 0.00019219887127430255, "loss": 4.6881, "step": 26345 }, { "epoch": 0.5395941269223681, "grad_norm": 0.27040114998817444, "learning_rate": 0.00019213947153334194, "loss": 4.6711, "step": 26350 }, { "epoch": 0.5396965166997727, "grad_norm": 0.26400721073150635, "learning_rate": 0.00019208006461722232, "loss": 4.6366, "step": 26355 }, { "epoch": 0.5397989064771773, "grad_norm": 0.2703004777431488, "learning_rate": 0.00019202065053605904, "loss": 4.6477, "step": 26360 }, { "epoch": 0.5399012962545819, "grad_norm": 0.2598544657230377, "learning_rate": 0.00019196122929996865, "loss": 4.6492, "step": 26365 }, { "epoch": 0.5400036860319866, "grad_norm": 0.2817550599575043, "learning_rate": 0.00019190180091906885, "loss": 4.7068, "step": 26370 }, { "epoch": 0.5401060758093912, "grad_norm": 0.2642737925052643, "learning_rate": 0.00019184236540347862, "loss": 4.7371, "step": 26375 }, { "epoch": 0.5402084655867958, "grad_norm": 0.24548190832138062, "learning_rate": 0.00019178292276331822, "loss": 4.7364, "step": 26380 }, { "epoch": 0.5403108553642004, "grad_norm": 0.6843188405036926, "learning_rate": 0.00019172347300870898, "loss": 4.717, "step": 26385 }, { "epoch": 0.540413245141605, "grad_norm": 0.269150048494339, "learning_rate": 0.00019166401614977355, "loss": 4.6323, "step": 26390 }, { "epoch": 0.5405156349190097, "grad_norm": 0.26181933283805847, "learning_rate": 0.00019160455219663562, "loss": 4.7094, "step": 26395 }, { "epoch": 0.5406180246964143, "grad_norm": 0.2523230016231537, "learning_rate": 0.00019154508115942044, "loss": 4.6977, "step": 26400 }, { "epoch": 0.5407204144738189, "grad_norm": 0.25122886896133423, "learning_rate": 0.00019148560304825405, "loss": 4.6931, "step": 26405 }, { "epoch": 0.5408228042512235, "grad_norm": 0.25125694274902344, "learning_rate": 0.000191426117873264, "loss": 4.7033, "step": 26410 }, { "epoch": 0.5409251940286282, "grad_norm": 0.25516924262046814, "learning_rate": 0.00019136662564457892, "loss": 4.7436, "step": 26415 }, { "epoch": 0.5410275838060328, "grad_norm": 0.2656814455986023, "learning_rate": 0.00019130712637232855, "loss": 4.7096, "step": 26420 }, { "epoch": 0.5411299735834374, "grad_norm": 0.27654868364334106, "learning_rate": 0.00019124762006664405, "loss": 4.6982, "step": 26425 }, { "epoch": 0.541232363360842, "grad_norm": 0.26052772998809814, "learning_rate": 0.00019118810673765765, "loss": 4.7112, "step": 26430 }, { "epoch": 0.5413347531382466, "grad_norm": 0.24654258787631989, "learning_rate": 0.00019112858639550276, "loss": 4.6829, "step": 26435 }, { "epoch": 0.5414371429156513, "grad_norm": 0.2553759217262268, "learning_rate": 0.00019106905905031396, "loss": 4.71, "step": 26440 }, { "epoch": 0.5415395326930559, "grad_norm": 0.2568046450614929, "learning_rate": 0.00019100952471222713, "loss": 4.7044, "step": 26445 }, { "epoch": 0.5416419224704605, "grad_norm": 0.3430255949497223, "learning_rate": 0.00019094998339137925, "loss": 4.6998, "step": 26450 }, { "epoch": 0.5417443122478651, "grad_norm": 0.25559189915657043, "learning_rate": 0.00019089043509790859, "loss": 4.6574, "step": 26455 }, { "epoch": 0.5418467020252697, "grad_norm": 0.283968985080719, "learning_rate": 0.00019083087984195443, "loss": 4.7017, "step": 26460 }, { "epoch": 0.5419490918026745, "grad_norm": 0.2678341269493103, "learning_rate": 0.0001907713176336574, "loss": 4.6707, "step": 26465 }, { "epoch": 0.5420514815800791, "grad_norm": 0.2516711950302124, "learning_rate": 0.00019071174848315928, "loss": 4.7012, "step": 26470 }, { "epoch": 0.5421538713574837, "grad_norm": 0.25756508111953735, "learning_rate": 0.00019065217240060302, "loss": 4.7477, "step": 26475 }, { "epoch": 0.5422562611348883, "grad_norm": 0.2792092263698578, "learning_rate": 0.00019059258939613266, "loss": 4.7309, "step": 26480 }, { "epoch": 0.542358650912293, "grad_norm": 0.2740444242954254, "learning_rate": 0.00019053299947989358, "loss": 4.7099, "step": 26485 }, { "epoch": 0.5424610406896976, "grad_norm": 0.2703920602798462, "learning_rate": 0.0001904734026620322, "loss": 4.7371, "step": 26490 }, { "epoch": 0.5425634304671022, "grad_norm": 0.2684652805328369, "learning_rate": 0.00019041379895269622, "loss": 4.7206, "step": 26495 }, { "epoch": 0.5426658202445068, "grad_norm": 0.27988335490226746, "learning_rate": 0.00019035418836203444, "loss": 4.7138, "step": 26500 }, { "epoch": 0.5427682100219114, "grad_norm": 0.27285775542259216, "learning_rate": 0.00019029457090019686, "loss": 4.7015, "step": 26505 }, { "epoch": 0.5428705997993161, "grad_norm": 0.28652337193489075, "learning_rate": 0.0001902349465773346, "loss": 4.665, "step": 26510 }, { "epoch": 0.5429729895767207, "grad_norm": 0.2635771632194519, "learning_rate": 0.00019017531540360002, "loss": 4.7252, "step": 26515 }, { "epoch": 0.5430753793541253, "grad_norm": 0.2735646069049835, "learning_rate": 0.0001901156773891467, "loss": 4.6917, "step": 26520 }, { "epoch": 0.5431777691315299, "grad_norm": 0.25290530920028687, "learning_rate": 0.0001900560325441292, "loss": 4.6999, "step": 26525 }, { "epoch": 0.5432801589089346, "grad_norm": 0.26772749423980713, "learning_rate": 0.0001899963808787034, "loss": 4.7049, "step": 26530 }, { "epoch": 0.5433825486863392, "grad_norm": 0.2506352663040161, "learning_rate": 0.00018993672240302627, "loss": 4.6646, "step": 26535 }, { "epoch": 0.5434849384637438, "grad_norm": 0.2651379108428955, "learning_rate": 0.00018987705712725597, "loss": 4.7177, "step": 26540 }, { "epoch": 0.5435873282411484, "grad_norm": 0.25979286432266235, "learning_rate": 0.00018981738506155182, "loss": 4.7103, "step": 26545 }, { "epoch": 0.543689718018553, "grad_norm": 0.2624056935310364, "learning_rate": 0.00018975770621607423, "loss": 4.7129, "step": 26550 }, { "epoch": 0.5437921077959577, "grad_norm": 0.26970383524894714, "learning_rate": 0.00018969802060098492, "loss": 4.7168, "step": 26555 }, { "epoch": 0.5438944975733623, "grad_norm": 0.2662735879421234, "learning_rate": 0.00018963832822644657, "loss": 4.6698, "step": 26560 }, { "epoch": 0.5439968873507669, "grad_norm": 0.2513354420661926, "learning_rate": 0.00018957862910262306, "loss": 4.6928, "step": 26565 }, { "epoch": 0.5440992771281715, "grad_norm": 0.2497677505016327, "learning_rate": 0.00018951892323967962, "loss": 4.644, "step": 26570 }, { "epoch": 0.5442016669055761, "grad_norm": 0.24549132585525513, "learning_rate": 0.00018945921064778237, "loss": 4.6796, "step": 26575 }, { "epoch": 0.5443040566829808, "grad_norm": 0.2421257197856903, "learning_rate": 0.00018939949133709865, "loss": 4.6875, "step": 26580 }, { "epoch": 0.5444064464603854, "grad_norm": 0.2439357191324234, "learning_rate": 0.00018933976531779702, "loss": 4.704, "step": 26585 }, { "epoch": 0.54450883623779, "grad_norm": 0.29698094725608826, "learning_rate": 0.00018928003260004712, "loss": 4.5639, "step": 26590 }, { "epoch": 0.5446112260151946, "grad_norm": 0.27436554431915283, "learning_rate": 0.00018922029319401966, "loss": 4.6947, "step": 26595 }, { "epoch": 0.5447136157925992, "grad_norm": 0.24455125629901886, "learning_rate": 0.00018916054710988673, "loss": 4.6753, "step": 26600 }, { "epoch": 0.5448160055700039, "grad_norm": 0.2663513720035553, "learning_rate": 0.00018910079435782124, "loss": 4.7759, "step": 26605 }, { "epoch": 0.5449183953474085, "grad_norm": 0.2627456486225128, "learning_rate": 0.00018904103494799744, "loss": 4.6826, "step": 26610 }, { "epoch": 0.5450207851248131, "grad_norm": 0.2518863379955292, "learning_rate": 0.00018898126889059074, "loss": 4.6251, "step": 26615 }, { "epoch": 0.5451231749022177, "grad_norm": 0.2908294200897217, "learning_rate": 0.00018892149619577757, "loss": 4.6404, "step": 26620 }, { "epoch": 0.5452255646796224, "grad_norm": 0.2405155599117279, "learning_rate": 0.00018886171687373547, "loss": 4.6741, "step": 26625 }, { "epoch": 0.545327954457027, "grad_norm": 0.24741297960281372, "learning_rate": 0.00018880193093464323, "loss": 4.7229, "step": 26630 }, { "epoch": 0.5454303442344316, "grad_norm": 0.26555123925209045, "learning_rate": 0.00018874213838868063, "loss": 4.7053, "step": 26635 }, { "epoch": 0.5455327340118362, "grad_norm": 0.26296570897102356, "learning_rate": 0.00018868233924602872, "loss": 4.6912, "step": 26640 }, { "epoch": 0.5456351237892408, "grad_norm": 0.2616727650165558, "learning_rate": 0.00018862253351686962, "loss": 4.6611, "step": 26645 }, { "epoch": 0.5457375135666455, "grad_norm": 0.2673822343349457, "learning_rate": 0.00018856272121138646, "loss": 4.671, "step": 26650 }, { "epoch": 0.5458399033440501, "grad_norm": 0.25794851779937744, "learning_rate": 0.00018850290233976362, "loss": 4.6675, "step": 26655 }, { "epoch": 0.5459422931214547, "grad_norm": 0.2626766860485077, "learning_rate": 0.0001884430769121866, "loss": 4.6574, "step": 26660 }, { "epoch": 0.5460446828988593, "grad_norm": 0.25485607981681824, "learning_rate": 0.00018838324493884197, "loss": 4.6978, "step": 26665 }, { "epoch": 0.5461470726762641, "grad_norm": 0.24853378534317017, "learning_rate": 0.00018832340642991737, "loss": 4.6955, "step": 26670 }, { "epoch": 0.5462494624536687, "grad_norm": 0.25438764691352844, "learning_rate": 0.00018826356139560165, "loss": 4.6484, "step": 26675 }, { "epoch": 0.5463518522310733, "grad_norm": 0.28246045112609863, "learning_rate": 0.00018820370984608468, "loss": 4.6657, "step": 26680 }, { "epoch": 0.5464542420084779, "grad_norm": 0.261288583278656, "learning_rate": 0.00018814385179155755, "loss": 4.6753, "step": 26685 }, { "epoch": 0.5465566317858825, "grad_norm": 0.26489171385765076, "learning_rate": 0.00018808398724221232, "loss": 4.7225, "step": 26690 }, { "epoch": 0.5466590215632872, "grad_norm": 0.2542685568332672, "learning_rate": 0.0001880241162082423, "loss": 4.6645, "step": 26695 }, { "epoch": 0.5467614113406918, "grad_norm": 0.31563809514045715, "learning_rate": 0.00018796423869984176, "loss": 4.6155, "step": 26700 }, { "epoch": 0.5468638011180964, "grad_norm": 0.2422453612089157, "learning_rate": 0.00018790435472720618, "loss": 4.7036, "step": 26705 }, { "epoch": 0.546966190895501, "grad_norm": 0.2594524025917053, "learning_rate": 0.00018784446430053213, "loss": 4.6983, "step": 26710 }, { "epoch": 0.5470685806729056, "grad_norm": 0.26316750049591064, "learning_rate": 0.00018778456743001723, "loss": 4.6909, "step": 26715 }, { "epoch": 0.5471709704503103, "grad_norm": 0.270597904920578, "learning_rate": 0.0001877246641258602, "loss": 4.6347, "step": 26720 }, { "epoch": 0.5472733602277149, "grad_norm": 0.26570531725883484, "learning_rate": 0.00018766475439826089, "loss": 4.7231, "step": 26725 }, { "epoch": 0.5473757500051195, "grad_norm": 0.26886647939682007, "learning_rate": 0.00018760483825742028, "loss": 4.6587, "step": 26730 }, { "epoch": 0.5474781397825241, "grad_norm": 0.24525582790374756, "learning_rate": 0.0001875449157135403, "loss": 4.6629, "step": 26735 }, { "epoch": 0.5475805295599288, "grad_norm": 0.2805182635784149, "learning_rate": 0.00018748498677682418, "loss": 4.6969, "step": 26740 }, { "epoch": 0.5476829193373334, "grad_norm": 0.24619364738464355, "learning_rate": 0.00018742505145747596, "loss": 4.6606, "step": 26745 }, { "epoch": 0.547785309114738, "grad_norm": 0.2541234493255615, "learning_rate": 0.00018736510976570105, "loss": 4.644, "step": 26750 }, { "epoch": 0.5478876988921426, "grad_norm": 0.32706886529922485, "learning_rate": 0.00018730516171170587, "loss": 4.6714, "step": 26755 }, { "epoch": 0.5479900886695472, "grad_norm": 0.25250479578971863, "learning_rate": 0.00018724520730569773, "loss": 4.7627, "step": 26760 }, { "epoch": 0.5480924784469519, "grad_norm": 0.24195893108844757, "learning_rate": 0.00018718524655788525, "loss": 4.6815, "step": 26765 }, { "epoch": 0.5481948682243565, "grad_norm": 0.24525094032287598, "learning_rate": 0.00018712527947847805, "loss": 4.7774, "step": 26770 }, { "epoch": 0.5482972580017611, "grad_norm": 0.25464239716529846, "learning_rate": 0.00018706530607768678, "loss": 4.6538, "step": 26775 }, { "epoch": 0.5483996477791657, "grad_norm": 0.2611331045627594, "learning_rate": 0.0001870053263657233, "loss": 4.6961, "step": 26780 }, { "epoch": 0.5485020375565703, "grad_norm": 0.24742406606674194, "learning_rate": 0.00018694534035280038, "loss": 4.6935, "step": 26785 }, { "epoch": 0.548604427333975, "grad_norm": 0.26102006435394287, "learning_rate": 0.00018688534804913197, "loss": 4.7044, "step": 26790 }, { "epoch": 0.5487068171113796, "grad_norm": 0.26690933108329773, "learning_rate": 0.000186825349464933, "loss": 4.7174, "step": 26795 }, { "epoch": 0.5488092068887842, "grad_norm": 0.2613051235675812, "learning_rate": 0.00018676534461041966, "loss": 4.7217, "step": 26800 }, { "epoch": 0.5489115966661888, "grad_norm": 0.2524051070213318, "learning_rate": 0.00018670533349580894, "loss": 4.6833, "step": 26805 }, { "epoch": 0.5490139864435934, "grad_norm": 0.2654576003551483, "learning_rate": 0.00018664531613131913, "loss": 4.6963, "step": 26810 }, { "epoch": 0.5491163762209981, "grad_norm": 0.27269455790519714, "learning_rate": 0.00018658529252716944, "loss": 4.5475, "step": 26815 }, { "epoch": 0.5492187659984027, "grad_norm": 0.27000007033348083, "learning_rate": 0.0001865252626935802, "loss": 4.6638, "step": 26820 }, { "epoch": 0.5493211557758073, "grad_norm": 0.26834815740585327, "learning_rate": 0.00018646522664077282, "loss": 4.6984, "step": 26825 }, { "epoch": 0.5494235455532119, "grad_norm": 0.2604410946369171, "learning_rate": 0.0001864051843789697, "loss": 4.6812, "step": 26830 }, { "epoch": 0.5495259353306166, "grad_norm": 0.2676681876182556, "learning_rate": 0.00018634513591839436, "loss": 4.6832, "step": 26835 }, { "epoch": 0.5496283251080212, "grad_norm": 0.26295915246009827, "learning_rate": 0.0001862850812692713, "loss": 4.6604, "step": 26840 }, { "epoch": 0.5497307148854258, "grad_norm": 0.25580817461013794, "learning_rate": 0.0001862250204418262, "loss": 4.638, "step": 26845 }, { "epoch": 0.5498331046628304, "grad_norm": 0.24330177903175354, "learning_rate": 0.0001861649534462857, "loss": 4.7265, "step": 26850 }, { "epoch": 0.549935494440235, "grad_norm": 0.25567755103111267, "learning_rate": 0.00018610488029287752, "loss": 4.6356, "step": 26855 }, { "epoch": 0.5500378842176397, "grad_norm": 0.280666321516037, "learning_rate": 0.00018604480099183033, "loss": 4.6588, "step": 26860 }, { "epoch": 0.5501402739950443, "grad_norm": 0.23846778273582458, "learning_rate": 0.00018598471555337403, "loss": 4.7061, "step": 26865 }, { "epoch": 0.5502426637724489, "grad_norm": 0.27296262979507446, "learning_rate": 0.00018592462398773945, "loss": 4.6588, "step": 26870 }, { "epoch": 0.5503450535498536, "grad_norm": 0.2844870686531067, "learning_rate": 0.00018586452630515847, "loss": 4.6414, "step": 26875 }, { "epoch": 0.5504474433272583, "grad_norm": 0.26245835423469543, "learning_rate": 0.00018580442251586406, "loss": 4.6617, "step": 26880 }, { "epoch": 0.5505498331046629, "grad_norm": 0.2403590828180313, "learning_rate": 0.00018574431263009009, "loss": 4.6952, "step": 26885 }, { "epoch": 0.5506522228820675, "grad_norm": 0.28531143069267273, "learning_rate": 0.00018568419665807167, "loss": 4.6971, "step": 26890 }, { "epoch": 0.5507546126594721, "grad_norm": 0.2594854533672333, "learning_rate": 0.00018562407461004482, "loss": 4.6822, "step": 26895 }, { "epoch": 0.5508570024368767, "grad_norm": 0.2994579076766968, "learning_rate": 0.00018556394649624664, "loss": 4.6721, "step": 26900 }, { "epoch": 0.5509593922142814, "grad_norm": 0.25801777839660645, "learning_rate": 0.00018550381232691524, "loss": 4.6886, "step": 26905 }, { "epoch": 0.551061781991686, "grad_norm": 0.28385981917381287, "learning_rate": 0.00018544367211228969, "loss": 4.6774, "step": 26910 }, { "epoch": 0.5511641717690906, "grad_norm": 0.2582709789276123, "learning_rate": 0.00018538352586261025, "loss": 4.6501, "step": 26915 }, { "epoch": 0.5512665615464952, "grad_norm": 0.2655521631240845, "learning_rate": 0.00018532337358811816, "loss": 4.6338, "step": 26920 }, { "epoch": 0.5513689513238998, "grad_norm": 0.2572382986545563, "learning_rate": 0.00018526321529905554, "loss": 4.6787, "step": 26925 }, { "epoch": 0.5514713411013045, "grad_norm": 0.25258669257164, "learning_rate": 0.00018520305100566567, "loss": 4.6881, "step": 26930 }, { "epoch": 0.5515737308787091, "grad_norm": 0.7811693549156189, "learning_rate": 0.0001851428807181929, "loss": 4.6534, "step": 26935 }, { "epoch": 0.5516761206561137, "grad_norm": 0.2604113519191742, "learning_rate": 0.0001850827044468825, "loss": 4.7078, "step": 26940 }, { "epoch": 0.5517785104335183, "grad_norm": 0.2545107305049896, "learning_rate": 0.0001850225222019807, "loss": 4.7246, "step": 26945 }, { "epoch": 0.551880900210923, "grad_norm": 0.24491417407989502, "learning_rate": 0.00018496233399373495, "loss": 4.6983, "step": 26950 }, { "epoch": 0.5519832899883276, "grad_norm": 0.4022284150123596, "learning_rate": 0.0001849021398323935, "loss": 4.753, "step": 26955 }, { "epoch": 0.5520856797657322, "grad_norm": 0.2897582948207855, "learning_rate": 0.00018484193972820575, "loss": 4.7317, "step": 26960 }, { "epoch": 0.5521880695431368, "grad_norm": 0.24569040536880493, "learning_rate": 0.00018478173369142212, "loss": 4.6981, "step": 26965 }, { "epoch": 0.5522904593205414, "grad_norm": 0.25419145822525024, "learning_rate": 0.00018472152173229392, "loss": 4.7112, "step": 26970 }, { "epoch": 0.5523928490979461, "grad_norm": 0.2585573196411133, "learning_rate": 0.00018466130386107362, "loss": 4.6837, "step": 26975 }, { "epoch": 0.5524952388753507, "grad_norm": 0.24350248277187347, "learning_rate": 0.00018460108008801454, "loss": 4.7195, "step": 26980 }, { "epoch": 0.5525976286527553, "grad_norm": 0.26366153359413147, "learning_rate": 0.00018454085042337107, "loss": 4.6877, "step": 26985 }, { "epoch": 0.5527000184301599, "grad_norm": 3.328211545944214, "learning_rate": 0.00018448061487739875, "loss": 4.6177, "step": 26990 }, { "epoch": 0.5528024082075645, "grad_norm": 0.2649926245212555, "learning_rate": 0.00018442037346035387, "loss": 4.6967, "step": 26995 }, { "epoch": 0.5529047979849692, "grad_norm": 0.26315954327583313, "learning_rate": 0.00018436012618249385, "loss": 4.6918, "step": 27000 }, { "epoch": 0.5529047979849692, "eval_loss": 4.707661151885986, "eval_runtime": 17.1438, "eval_samples_per_second": 33.948, "eval_steps_per_second": 4.258, "step": 27000 }, { "epoch": 0.5530071877623738, "grad_norm": 0.2600516974925995, "learning_rate": 0.0001842998730540771, "loss": 4.6866, "step": 27005 }, { "epoch": 0.5531095775397784, "grad_norm": 0.24640217423439026, "learning_rate": 0.0001842396140853631, "loss": 4.7314, "step": 27010 }, { "epoch": 0.553211967317183, "grad_norm": 0.2561544179916382, "learning_rate": 0.0001841793492866121, "loss": 4.6893, "step": 27015 }, { "epoch": 0.5533143570945876, "grad_norm": 0.2652214467525482, "learning_rate": 0.00018411907866808567, "loss": 4.6634, "step": 27020 }, { "epoch": 0.5534167468719923, "grad_norm": 0.2577473521232605, "learning_rate": 0.000184058802240046, "loss": 4.7232, "step": 27025 }, { "epoch": 0.5535191366493969, "grad_norm": 0.24600835144519806, "learning_rate": 0.00018399852001275662, "loss": 4.6832, "step": 27030 }, { "epoch": 0.5536215264268015, "grad_norm": 0.24829618632793427, "learning_rate": 0.00018393823199648179, "loss": 4.6178, "step": 27035 }, { "epoch": 0.5537239162042061, "grad_norm": 0.2764621078968048, "learning_rate": 0.00018387793820148687, "loss": 4.723, "step": 27040 }, { "epoch": 0.5538263059816108, "grad_norm": 0.25059133768081665, "learning_rate": 0.00018381763863803822, "loss": 4.6999, "step": 27045 }, { "epoch": 0.5539286957590154, "grad_norm": 0.2450833022594452, "learning_rate": 0.0001837573333164031, "loss": 4.6377, "step": 27050 }, { "epoch": 0.55403108553642, "grad_norm": 0.25288718938827515, "learning_rate": 0.00018369702224684982, "loss": 4.6913, "step": 27055 }, { "epoch": 0.5541334753138246, "grad_norm": 0.24497826397418976, "learning_rate": 0.00018363670543964768, "loss": 4.6703, "step": 27060 }, { "epoch": 0.5542358650912292, "grad_norm": 0.2574631869792938, "learning_rate": 0.0001835763829050669, "loss": 4.6749, "step": 27065 }, { "epoch": 0.5543382548686339, "grad_norm": 0.28040191531181335, "learning_rate": 0.0001835160546533787, "loss": 4.7238, "step": 27070 }, { "epoch": 0.5544406446460386, "grad_norm": 0.2564941644668579, "learning_rate": 0.00018345572069485523, "loss": 4.7229, "step": 27075 }, { "epoch": 0.5545430344234432, "grad_norm": 0.2596915662288666, "learning_rate": 0.00018339538103976974, "loss": 4.6761, "step": 27080 }, { "epoch": 0.5546454242008478, "grad_norm": 0.2704494297504425, "learning_rate": 0.00018333503569839627, "loss": 4.7243, "step": 27085 }, { "epoch": 0.5547478139782525, "grad_norm": 0.25303056836128235, "learning_rate": 0.00018327468468101003, "loss": 4.7087, "step": 27090 }, { "epoch": 0.5548502037556571, "grad_norm": 0.2724940776824951, "learning_rate": 0.00018321432799788697, "loss": 4.7116, "step": 27095 }, { "epoch": 0.5549525935330617, "grad_norm": 0.26068201661109924, "learning_rate": 0.0001831539656593042, "loss": 4.657, "step": 27100 }, { "epoch": 0.5550549833104663, "grad_norm": 0.2497115582227707, "learning_rate": 0.00018309359767553976, "loss": 4.6816, "step": 27105 }, { "epoch": 0.5551573730878709, "grad_norm": 0.2723061740398407, "learning_rate": 0.00018303322405687248, "loss": 4.6453, "step": 27110 }, { "epoch": 0.5552597628652756, "grad_norm": 0.3353561758995056, "learning_rate": 0.00018297284481358242, "loss": 4.5887, "step": 27115 }, { "epoch": 0.5553621526426802, "grad_norm": 0.2825780212879181, "learning_rate": 0.00018291245995595033, "loss": 4.6267, "step": 27120 }, { "epoch": 0.5554645424200848, "grad_norm": 0.2716694474220276, "learning_rate": 0.00018285206949425812, "loss": 4.7231, "step": 27125 }, { "epoch": 0.5555669321974894, "grad_norm": 0.2531558871269226, "learning_rate": 0.00018279167343878859, "loss": 4.6276, "step": 27130 }, { "epoch": 0.555669321974894, "grad_norm": 0.2798287868499756, "learning_rate": 0.0001827312717998254, "loss": 4.7516, "step": 27135 }, { "epoch": 0.5557717117522987, "grad_norm": 0.2748793661594391, "learning_rate": 0.0001826708645876533, "loss": 4.6299, "step": 27140 }, { "epoch": 0.5558741015297033, "grad_norm": 0.24465809762477875, "learning_rate": 0.0001826104518125579, "loss": 4.6925, "step": 27145 }, { "epoch": 0.5559764913071079, "grad_norm": 0.29484617710113525, "learning_rate": 0.00018255003348482577, "loss": 4.6911, "step": 27150 }, { "epoch": 0.5560788810845125, "grad_norm": 0.28605300188064575, "learning_rate": 0.0001824896096147445, "loss": 4.7485, "step": 27155 }, { "epoch": 0.5561812708619172, "grad_norm": 0.2805459797382355, "learning_rate": 0.00018242918021260256, "loss": 4.7061, "step": 27160 }, { "epoch": 0.5562836606393218, "grad_norm": 0.3476477861404419, "learning_rate": 0.00018236874528868927, "loss": 4.6931, "step": 27165 }, { "epoch": 0.5563860504167264, "grad_norm": 0.26884692907333374, "learning_rate": 0.00018230830485329506, "loss": 4.7284, "step": 27170 }, { "epoch": 0.556488440194131, "grad_norm": 0.25742143392562866, "learning_rate": 0.00018224785891671124, "loss": 4.6757, "step": 27175 }, { "epoch": 0.5565908299715356, "grad_norm": 0.27536290884017944, "learning_rate": 0.00018218740748923002, "loss": 4.6719, "step": 27180 }, { "epoch": 0.5566932197489403, "grad_norm": 0.24896275997161865, "learning_rate": 0.00018212695058114457, "loss": 4.7062, "step": 27185 }, { "epoch": 0.5567956095263449, "grad_norm": 0.2572430968284607, "learning_rate": 0.00018206648820274894, "loss": 4.6957, "step": 27190 }, { "epoch": 0.5568979993037495, "grad_norm": 0.28539586067199707, "learning_rate": 0.00018200602036433825, "loss": 4.7002, "step": 27195 }, { "epoch": 0.5570003890811541, "grad_norm": 0.2676612138748169, "learning_rate": 0.00018194554707620842, "loss": 4.6843, "step": 27200 }, { "epoch": 0.5571027788585587, "grad_norm": 0.2554614543914795, "learning_rate": 0.00018188506834865636, "loss": 4.6578, "step": 27205 }, { "epoch": 0.5572051686359634, "grad_norm": 0.27748146653175354, "learning_rate": 0.00018182458419197984, "loss": 4.6778, "step": 27210 }, { "epoch": 0.557307558413368, "grad_norm": 0.2704891860485077, "learning_rate": 0.0001817640946164776, "loss": 4.697, "step": 27215 }, { "epoch": 0.5574099481907726, "grad_norm": 0.30365341901779175, "learning_rate": 0.00018170359963244934, "loss": 4.6481, "step": 27220 }, { "epoch": 0.5575123379681772, "grad_norm": 0.2857930362224579, "learning_rate": 0.00018164309925019565, "loss": 4.7163, "step": 27225 }, { "epoch": 0.5576147277455819, "grad_norm": 0.3207056522369385, "learning_rate": 0.000181582593480018, "loss": 4.7017, "step": 27230 }, { "epoch": 0.5577171175229865, "grad_norm": 0.462871789932251, "learning_rate": 0.00018152208233221878, "loss": 4.701, "step": 27235 }, { "epoch": 0.5578195073003911, "grad_norm": 0.26674333214759827, "learning_rate": 0.00018146156581710137, "loss": 4.7067, "step": 27240 }, { "epoch": 0.5579218970777957, "grad_norm": 0.2612088918685913, "learning_rate": 0.00018140104394497007, "loss": 4.5949, "step": 27245 }, { "epoch": 0.5580242868552003, "grad_norm": 0.2627280056476593, "learning_rate": 0.00018134051672612992, "loss": 4.6408, "step": 27250 }, { "epoch": 0.558126676632605, "grad_norm": 0.24926099181175232, "learning_rate": 0.0001812799841708871, "loss": 4.6388, "step": 27255 }, { "epoch": 0.5582290664100096, "grad_norm": 0.27249667048454285, "learning_rate": 0.00018121944628954852, "loss": 4.6831, "step": 27260 }, { "epoch": 0.5583314561874142, "grad_norm": 0.29811960458755493, "learning_rate": 0.0001811589030924221, "loss": 4.6652, "step": 27265 }, { "epoch": 0.5584338459648188, "grad_norm": 0.28687939047813416, "learning_rate": 0.00018109835458981666, "loss": 4.6839, "step": 27270 }, { "epoch": 0.5585362357422234, "grad_norm": 0.27148640155792236, "learning_rate": 0.00018103780079204186, "loss": 4.6828, "step": 27275 }, { "epoch": 0.5586386255196282, "grad_norm": 0.25214913487434387, "learning_rate": 0.00018097724170940824, "loss": 4.6627, "step": 27280 }, { "epoch": 0.5587410152970328, "grad_norm": 0.26098278164863586, "learning_rate": 0.00018091667735222738, "loss": 4.6741, "step": 27285 }, { "epoch": 0.5588434050744374, "grad_norm": 0.27879834175109863, "learning_rate": 0.0001808561077308117, "loss": 4.6534, "step": 27290 }, { "epoch": 0.558945794851842, "grad_norm": 0.2642155587673187, "learning_rate": 0.0001807955328554744, "loss": 4.7127, "step": 27295 }, { "epoch": 0.5590481846292467, "grad_norm": 0.26316457986831665, "learning_rate": 0.00018073495273652972, "loss": 4.6811, "step": 27300 }, { "epoch": 0.5591505744066513, "grad_norm": 0.25082215666770935, "learning_rate": 0.00018067436738429275, "loss": 4.699, "step": 27305 }, { "epoch": 0.5592529641840559, "grad_norm": 0.278416246175766, "learning_rate": 0.0001806137768090794, "loss": 4.6951, "step": 27310 }, { "epoch": 0.5593553539614605, "grad_norm": 0.3086649477481842, "learning_rate": 0.0001805531810212066, "loss": 4.7411, "step": 27315 }, { "epoch": 0.5594577437388651, "grad_norm": 0.2515885531902313, "learning_rate": 0.00018049258003099207, "loss": 4.7039, "step": 27320 }, { "epoch": 0.5595601335162698, "grad_norm": 0.2530098855495453, "learning_rate": 0.00018043197384875445, "loss": 4.7325, "step": 27325 }, { "epoch": 0.5596625232936744, "grad_norm": 1.3420644998550415, "learning_rate": 0.00018037136248481321, "loss": 4.6194, "step": 27330 }, { "epoch": 0.559764913071079, "grad_norm": 0.26888757944107056, "learning_rate": 0.0001803107459494888, "loss": 4.694, "step": 27335 }, { "epoch": 0.5598673028484836, "grad_norm": 0.26449164748191833, "learning_rate": 0.00018025012425310248, "loss": 4.656, "step": 27340 }, { "epoch": 0.5599696926258882, "grad_norm": 0.2640646696090698, "learning_rate": 0.00018018949740597647, "loss": 4.6354, "step": 27345 }, { "epoch": 0.5600720824032929, "grad_norm": 0.26847779750823975, "learning_rate": 0.00018012886541843366, "loss": 4.6841, "step": 27350 }, { "epoch": 0.5601744721806975, "grad_norm": 0.26787444949150085, "learning_rate": 0.00018006822830079808, "loss": 4.6685, "step": 27355 }, { "epoch": 0.5602768619581021, "grad_norm": 0.26082074642181396, "learning_rate": 0.00018000758606339455, "loss": 4.725, "step": 27360 }, { "epoch": 0.5603792517355067, "grad_norm": 0.26512712240219116, "learning_rate": 0.00017994693871654859, "loss": 4.6762, "step": 27365 }, { "epoch": 0.5604816415129114, "grad_norm": 0.2519755959510803, "learning_rate": 0.00017988628627058683, "loss": 4.6567, "step": 27370 }, { "epoch": 0.560584031290316, "grad_norm": 0.25382062792778015, "learning_rate": 0.00017982562873583664, "loss": 4.6997, "step": 27375 }, { "epoch": 0.5606864210677206, "grad_norm": 0.24935616552829742, "learning_rate": 0.00017976496612262622, "loss": 4.7334, "step": 27380 }, { "epoch": 0.5607888108451252, "grad_norm": 1.4657069444656372, "learning_rate": 0.0001797042984412848, "loss": 4.7416, "step": 27385 }, { "epoch": 0.5608912006225298, "grad_norm": 0.27165108919143677, "learning_rate": 0.0001796436257021423, "loss": 4.7026, "step": 27390 }, { "epoch": 0.5609935903999345, "grad_norm": 0.2536027431488037, "learning_rate": 0.00017958294791552961, "loss": 4.7337, "step": 27395 }, { "epoch": 0.5610959801773391, "grad_norm": 0.28293851017951965, "learning_rate": 0.0001795222650917784, "loss": 4.6477, "step": 27400 }, { "epoch": 0.5611983699547437, "grad_norm": 0.2583474814891815, "learning_rate": 0.0001794615772412212, "loss": 4.7176, "step": 27405 }, { "epoch": 0.5613007597321483, "grad_norm": 0.25620806217193604, "learning_rate": 0.00017940088437419155, "loss": 4.7288, "step": 27410 }, { "epoch": 0.561403149509553, "grad_norm": 0.29453903436660767, "learning_rate": 0.00017934018650102366, "loss": 4.6706, "step": 27415 }, { "epoch": 0.5615055392869576, "grad_norm": 0.2408190816640854, "learning_rate": 0.00017927948363205266, "loss": 4.6994, "step": 27420 }, { "epoch": 0.5616079290643622, "grad_norm": 0.2770805060863495, "learning_rate": 0.0001792187757776145, "loss": 4.6142, "step": 27425 }, { "epoch": 0.5617103188417668, "grad_norm": 0.2456747442483902, "learning_rate": 0.0001791580629480461, "loss": 4.7398, "step": 27430 }, { "epoch": 0.5618127086191714, "grad_norm": 0.2638939619064331, "learning_rate": 0.00017909734515368503, "loss": 4.7796, "step": 27435 }, { "epoch": 0.561915098396576, "grad_norm": 0.25840067863464355, "learning_rate": 0.0001790366224048699, "loss": 4.6846, "step": 27440 }, { "epoch": 0.5620174881739807, "grad_norm": 0.25280097126960754, "learning_rate": 0.00017897589471193998, "loss": 4.6698, "step": 27445 }, { "epoch": 0.5621198779513853, "grad_norm": 0.27671337127685547, "learning_rate": 0.00017891516208523556, "loss": 4.6482, "step": 27450 }, { "epoch": 0.5622222677287899, "grad_norm": 0.25754889845848083, "learning_rate": 0.00017885442453509766, "loss": 4.6554, "step": 27455 }, { "epoch": 0.5623246575061945, "grad_norm": 0.25561729073524475, "learning_rate": 0.00017879368207186815, "loss": 4.6459, "step": 27460 }, { "epoch": 0.5624270472835992, "grad_norm": 0.27110201120376587, "learning_rate": 0.00017873293470588978, "loss": 4.7197, "step": 27465 }, { "epoch": 0.5625294370610038, "grad_norm": 0.2641063332557678, "learning_rate": 0.0001786721824475061, "loss": 4.6728, "step": 27470 }, { "epoch": 0.5626318268384084, "grad_norm": 0.2695915400981903, "learning_rate": 0.00017861142530706145, "loss": 4.6747, "step": 27475 }, { "epoch": 0.5627342166158131, "grad_norm": 0.26590949296951294, "learning_rate": 0.00017855066329490113, "loss": 4.6944, "step": 27480 }, { "epoch": 0.5628366063932178, "grad_norm": 0.2543744742870331, "learning_rate": 0.00017848989642137118, "loss": 4.7019, "step": 27485 }, { "epoch": 0.5629389961706224, "grad_norm": 0.3397654592990875, "learning_rate": 0.00017842912469681838, "loss": 4.6556, "step": 27490 }, { "epoch": 0.563041385948027, "grad_norm": 0.27330344915390015, "learning_rate": 0.00017836834813159054, "loss": 4.6798, "step": 27495 }, { "epoch": 0.5631437757254316, "grad_norm": 0.27935123443603516, "learning_rate": 0.00017830756673603614, "loss": 4.6958, "step": 27500 }, { "epoch": 0.5632461655028362, "grad_norm": 0.3052602708339691, "learning_rate": 0.00017824678052050455, "loss": 4.6644, "step": 27505 }, { "epoch": 0.5633485552802409, "grad_norm": 0.27773189544677734, "learning_rate": 0.00017818598949534593, "loss": 4.7347, "step": 27510 }, { "epoch": 0.5634509450576455, "grad_norm": 0.2945863902568817, "learning_rate": 0.0001781251936709113, "loss": 4.6685, "step": 27515 }, { "epoch": 0.5635533348350501, "grad_norm": 0.26481300592422485, "learning_rate": 0.0001780643930575524, "loss": 4.7081, "step": 27520 }, { "epoch": 0.5636557246124547, "grad_norm": 0.26331692934036255, "learning_rate": 0.00017800358766562196, "loss": 4.6974, "step": 27525 }, { "epoch": 0.5637581143898593, "grad_norm": 0.25365984439849854, "learning_rate": 0.00017794277750547328, "loss": 4.7501, "step": 27530 }, { "epoch": 0.563860504167264, "grad_norm": 0.25934815406799316, "learning_rate": 0.00017788196258746074, "loss": 4.7036, "step": 27535 }, { "epoch": 0.5639628939446686, "grad_norm": 0.24910928308963776, "learning_rate": 0.00017782114292193933, "loss": 4.6434, "step": 27540 }, { "epoch": 0.5640652837220732, "grad_norm": 0.26640039682388306, "learning_rate": 0.0001777603185192649, "loss": 4.687, "step": 27545 }, { "epoch": 0.5641676734994778, "grad_norm": 0.2597857713699341, "learning_rate": 0.00017769948938979417, "loss": 4.6525, "step": 27550 }, { "epoch": 0.5642700632768824, "grad_norm": 0.2706809937953949, "learning_rate": 0.00017763865554388465, "loss": 4.6735, "step": 27555 }, { "epoch": 0.5643724530542871, "grad_norm": 0.2589665651321411, "learning_rate": 0.00017757781699189446, "loss": 4.7307, "step": 27560 }, { "epoch": 0.5644748428316917, "grad_norm": 0.26661667227745056, "learning_rate": 0.00017751697374418287, "loss": 4.6581, "step": 27565 }, { "epoch": 0.5645772326090963, "grad_norm": 0.27051541209220886, "learning_rate": 0.0001774561258111097, "loss": 4.6218, "step": 27570 }, { "epoch": 0.5646796223865009, "grad_norm": 0.26295870542526245, "learning_rate": 0.00017739527320303556, "loss": 4.7004, "step": 27575 }, { "epoch": 0.5647820121639056, "grad_norm": 0.25100234150886536, "learning_rate": 0.00017733441593032205, "loss": 4.7355, "step": 27580 }, { "epoch": 0.5648844019413102, "grad_norm": 0.2552400827407837, "learning_rate": 0.00017727355400333134, "loss": 4.6514, "step": 27585 }, { "epoch": 0.5649867917187148, "grad_norm": 0.2714831531047821, "learning_rate": 0.00017721268743242654, "loss": 4.6485, "step": 27590 }, { "epoch": 0.5650891814961194, "grad_norm": 0.24230805039405823, "learning_rate": 0.00017715181622797148, "loss": 4.6641, "step": 27595 }, { "epoch": 0.565191571273524, "grad_norm": 0.28542134165763855, "learning_rate": 0.0001770909404003308, "loss": 4.7616, "step": 27600 }, { "epoch": 0.5652939610509287, "grad_norm": 0.25475913286209106, "learning_rate": 0.00017703005995986997, "loss": 4.6521, "step": 27605 }, { "epoch": 0.5653963508283333, "grad_norm": 0.300935298204422, "learning_rate": 0.00017696917491695518, "loss": 4.6605, "step": 27610 }, { "epoch": 0.5654987406057379, "grad_norm": 0.2584412395954132, "learning_rate": 0.0001769082852819534, "loss": 4.6719, "step": 27615 }, { "epoch": 0.5656011303831425, "grad_norm": 0.25238272547721863, "learning_rate": 0.00017684739106523247, "loss": 4.7001, "step": 27620 }, { "epoch": 0.5657035201605471, "grad_norm": 0.26562386751174927, "learning_rate": 0.00017678649227716094, "loss": 4.6864, "step": 27625 }, { "epoch": 0.5658059099379518, "grad_norm": 0.2750887870788574, "learning_rate": 0.00017672558892810808, "loss": 4.6785, "step": 27630 }, { "epoch": 0.5659082997153564, "grad_norm": 0.25745710730552673, "learning_rate": 0.0001766646810284441, "loss": 4.626, "step": 27635 }, { "epoch": 0.566010689492761, "grad_norm": 0.29567331075668335, "learning_rate": 0.00017660376858853982, "loss": 4.6796, "step": 27640 }, { "epoch": 0.5661130792701656, "grad_norm": 0.26070085167884827, "learning_rate": 0.00017654285161876695, "loss": 4.6649, "step": 27645 }, { "epoch": 0.5662154690475703, "grad_norm": 0.266722172498703, "learning_rate": 0.0001764819301294979, "loss": 4.6549, "step": 27650 }, { "epoch": 0.5663178588249749, "grad_norm": 0.2570223808288574, "learning_rate": 0.00017642100413110587, "loss": 4.6951, "step": 27655 }, { "epoch": 0.5664202486023795, "grad_norm": 0.2593010365962982, "learning_rate": 0.00017636007363396486, "loss": 4.6673, "step": 27660 }, { "epoch": 0.5665226383797841, "grad_norm": 0.2559462785720825, "learning_rate": 0.0001762991386484496, "loss": 4.6072, "step": 27665 }, { "epoch": 0.5666250281571887, "grad_norm": 0.28588035702705383, "learning_rate": 0.00017623819918493556, "loss": 4.7359, "step": 27670 }, { "epoch": 0.5667274179345934, "grad_norm": 0.25574782490730286, "learning_rate": 0.00017617725525379907, "loss": 4.5986, "step": 27675 }, { "epoch": 0.566829807711998, "grad_norm": 0.2513871490955353, "learning_rate": 0.00017611630686541712, "loss": 4.6416, "step": 27680 }, { "epoch": 0.5669321974894027, "grad_norm": 0.2426123172044754, "learning_rate": 0.00017605535403016745, "loss": 4.6405, "step": 27685 }, { "epoch": 0.5670345872668073, "grad_norm": 0.25610238313674927, "learning_rate": 0.00017599439675842867, "loss": 4.707, "step": 27690 }, { "epoch": 0.567136977044212, "grad_norm": 0.24854488670825958, "learning_rate": 0.00017593343506058003, "loss": 4.7291, "step": 27695 }, { "epoch": 0.5672393668216166, "grad_norm": 0.2657903730869293, "learning_rate": 0.0001758724689470016, "loss": 4.6798, "step": 27700 }, { "epoch": 0.5673417565990212, "grad_norm": 0.27302059531211853, "learning_rate": 0.00017581149842807426, "loss": 4.5984, "step": 27705 }, { "epoch": 0.5674441463764258, "grad_norm": 0.25416168570518494, "learning_rate": 0.00017575052351417942, "loss": 4.6871, "step": 27710 }, { "epoch": 0.5675465361538304, "grad_norm": 0.26361799240112305, "learning_rate": 0.00017568954421569945, "loss": 4.7508, "step": 27715 }, { "epoch": 0.5676489259312351, "grad_norm": 0.25274792313575745, "learning_rate": 0.00017562856054301746, "loss": 4.6261, "step": 27720 }, { "epoch": 0.5677513157086397, "grad_norm": 0.2605419158935547, "learning_rate": 0.0001755675725065171, "loss": 4.7088, "step": 27725 }, { "epoch": 0.5678537054860443, "grad_norm": 0.26386305689811707, "learning_rate": 0.00017550658011658305, "loss": 4.7259, "step": 27730 }, { "epoch": 0.5679560952634489, "grad_norm": 0.26385360956192017, "learning_rate": 0.0001754455833836005, "loss": 4.6939, "step": 27735 }, { "epoch": 0.5680584850408535, "grad_norm": 0.2676510214805603, "learning_rate": 0.0001753845823179555, "loss": 4.6886, "step": 27740 }, { "epoch": 0.5681608748182582, "grad_norm": 0.2556968331336975, "learning_rate": 0.0001753235769300348, "loss": 4.7153, "step": 27745 }, { "epoch": 0.5682632645956628, "grad_norm": 0.25364434719085693, "learning_rate": 0.0001752625672302259, "loss": 4.6915, "step": 27750 }, { "epoch": 0.5683656543730674, "grad_norm": 0.3793516755104065, "learning_rate": 0.000175201553228917, "loss": 4.6868, "step": 27755 }, { "epoch": 0.568468044150472, "grad_norm": 0.35917139053344727, "learning_rate": 0.00017514053493649704, "loss": 4.6942, "step": 27760 }, { "epoch": 0.5685704339278767, "grad_norm": 0.35769927501678467, "learning_rate": 0.00017507951236335578, "loss": 4.6506, "step": 27765 }, { "epoch": 0.5686728237052813, "grad_norm": 0.2890172600746155, "learning_rate": 0.00017501848551988359, "loss": 4.6713, "step": 27770 }, { "epoch": 0.5687752134826859, "grad_norm": 0.2619645595550537, "learning_rate": 0.0001749574544164716, "loss": 4.6714, "step": 27775 }, { "epoch": 0.5688776032600905, "grad_norm": 0.2589600384235382, "learning_rate": 0.00017489641906351172, "loss": 4.5968, "step": 27780 }, { "epoch": 0.5689799930374951, "grad_norm": 0.259791761636734, "learning_rate": 0.00017483537947139645, "loss": 4.6749, "step": 27785 }, { "epoch": 0.5690823828148998, "grad_norm": 0.28064924478530884, "learning_rate": 0.00017477433565051925, "loss": 4.7284, "step": 27790 }, { "epoch": 0.5691847725923044, "grad_norm": 0.2724134027957916, "learning_rate": 0.00017471328761127405, "loss": 4.6741, "step": 27795 }, { "epoch": 0.569287162369709, "grad_norm": 0.26181870698928833, "learning_rate": 0.00017465223536405566, "loss": 4.6555, "step": 27800 }, { "epoch": 0.5693895521471136, "grad_norm": 0.26660504937171936, "learning_rate": 0.00017459117891925946, "loss": 4.6492, "step": 27805 }, { "epoch": 0.5694919419245182, "grad_norm": 0.26408490538597107, "learning_rate": 0.00017453011828728171, "loss": 4.6067, "step": 27810 }, { "epoch": 0.5695943317019229, "grad_norm": 0.25006911158561707, "learning_rate": 0.00017446905347851931, "loss": 4.6796, "step": 27815 }, { "epoch": 0.5696967214793275, "grad_norm": 0.2554749548435211, "learning_rate": 0.0001744079845033699, "loss": 4.6869, "step": 27820 }, { "epoch": 0.5697991112567321, "grad_norm": 0.2597162127494812, "learning_rate": 0.0001743469113722317, "loss": 4.6845, "step": 27825 }, { "epoch": 0.5699015010341367, "grad_norm": 0.5917623043060303, "learning_rate": 0.00017428583409550373, "loss": 4.5729, "step": 27830 }, { "epoch": 0.5700038908115413, "grad_norm": 0.25182023644447327, "learning_rate": 0.00017422475268358585, "loss": 4.693, "step": 27835 }, { "epoch": 0.570106280588946, "grad_norm": 0.2560945153236389, "learning_rate": 0.00017416366714687834, "loss": 4.7037, "step": 27840 }, { "epoch": 0.5702086703663506, "grad_norm": 0.2556745409965515, "learning_rate": 0.0001741025774957825, "loss": 4.6833, "step": 27845 }, { "epoch": 0.5703110601437552, "grad_norm": 0.2526347041130066, "learning_rate": 0.00017404148374070003, "loss": 4.6932, "step": 27850 }, { "epoch": 0.5704134499211598, "grad_norm": 0.2605088949203491, "learning_rate": 0.00017398038589203354, "loss": 4.7266, "step": 27855 }, { "epoch": 0.5705158396985645, "grad_norm": 0.25798657536506653, "learning_rate": 0.00017391928396018623, "loss": 4.6746, "step": 27860 }, { "epoch": 0.5706182294759691, "grad_norm": 0.27800747752189636, "learning_rate": 0.00017385817795556206, "loss": 4.6993, "step": 27865 }, { "epoch": 0.5707206192533737, "grad_norm": 0.2565612494945526, "learning_rate": 0.00017379706788856565, "loss": 4.6535, "step": 27870 }, { "epoch": 0.5708230090307783, "grad_norm": 0.25342145562171936, "learning_rate": 0.00017373595376960225, "loss": 4.5707, "step": 27875 }, { "epoch": 0.5709253988081829, "grad_norm": 0.27122899889945984, "learning_rate": 0.00017367483560907793, "loss": 4.7282, "step": 27880 }, { "epoch": 0.5710277885855877, "grad_norm": 0.24127766489982605, "learning_rate": 0.00017361371341739944, "loss": 4.6964, "step": 27885 }, { "epoch": 0.5711301783629923, "grad_norm": 0.2535822093486786, "learning_rate": 0.000173552587204974, "loss": 4.6916, "step": 27890 }, { "epoch": 0.5712325681403969, "grad_norm": 0.28374695777893066, "learning_rate": 0.00017349145698220981, "loss": 4.6998, "step": 27895 }, { "epoch": 0.5713349579178015, "grad_norm": 0.2652948498725891, "learning_rate": 0.0001734303227595155, "loss": 4.6927, "step": 27900 }, { "epoch": 0.5714373476952062, "grad_norm": 0.28421682119369507, "learning_rate": 0.00017336918454730063, "loss": 4.6617, "step": 27905 }, { "epoch": 0.5715397374726108, "grad_norm": 0.27349361777305603, "learning_rate": 0.0001733080423559752, "loss": 4.6623, "step": 27910 }, { "epoch": 0.5716421272500154, "grad_norm": 0.269765168428421, "learning_rate": 0.00017324689619595008, "loss": 4.6512, "step": 27915 }, { "epoch": 0.57174451702742, "grad_norm": 0.2743627727031708, "learning_rate": 0.00017318574607763664, "loss": 4.6922, "step": 27920 }, { "epoch": 0.5718469068048246, "grad_norm": 0.2587744891643524, "learning_rate": 0.00017312459201144706, "loss": 4.6939, "step": 27925 }, { "epoch": 0.5719492965822293, "grad_norm": 0.24722817540168762, "learning_rate": 0.00017306343400779416, "loss": 4.6897, "step": 27930 }, { "epoch": 0.5720516863596339, "grad_norm": 0.2554013133049011, "learning_rate": 0.0001730022720770914, "loss": 4.6531, "step": 27935 }, { "epoch": 0.5721540761370385, "grad_norm": 0.2546253800392151, "learning_rate": 0.00017294110622975294, "loss": 4.7333, "step": 27940 }, { "epoch": 0.5722564659144431, "grad_norm": 0.2455330789089203, "learning_rate": 0.00017287993647619352, "loss": 4.5944, "step": 27945 }, { "epoch": 0.5723588556918477, "grad_norm": 0.26898321509361267, "learning_rate": 0.0001728187628268287, "loss": 4.6866, "step": 27950 }, { "epoch": 0.5724612454692524, "grad_norm": 0.26398465037345886, "learning_rate": 0.0001727575852920746, "loss": 4.7012, "step": 27955 }, { "epoch": 0.572563635246657, "grad_norm": 0.27200570702552795, "learning_rate": 0.00017269640388234804, "loss": 4.6234, "step": 27960 }, { "epoch": 0.5726660250240616, "grad_norm": 0.2645496129989624, "learning_rate": 0.00017263521860806644, "loss": 4.6344, "step": 27965 }, { "epoch": 0.5727684148014662, "grad_norm": 0.2591147720813751, "learning_rate": 0.00017257402947964793, "loss": 4.7042, "step": 27970 }, { "epoch": 0.5728708045788709, "grad_norm": 0.254060298204422, "learning_rate": 0.00017251283650751133, "loss": 4.7294, "step": 27975 }, { "epoch": 0.5729731943562755, "grad_norm": 0.26018354296684265, "learning_rate": 0.00017245163970207598, "loss": 4.6871, "step": 27980 }, { "epoch": 0.5730755841336801, "grad_norm": 0.2611655294895172, "learning_rate": 0.0001723904390737621, "loss": 4.6886, "step": 27985 }, { "epoch": 0.5731779739110847, "grad_norm": 0.2722248136997223, "learning_rate": 0.0001723292346329903, "loss": 4.6262, "step": 27990 }, { "epoch": 0.5732803636884893, "grad_norm": 0.2603926658630371, "learning_rate": 0.00017226802639018195, "loss": 4.6824, "step": 27995 }, { "epoch": 0.573382753465894, "grad_norm": 0.25823864340782166, "learning_rate": 0.00017220681435575923, "loss": 4.6908, "step": 28000 }, { "epoch": 0.573382753465894, "eval_loss": 4.6973042488098145, "eval_runtime": 17.1349, "eval_samples_per_second": 33.966, "eval_steps_per_second": 4.26, "step": 28000 }, { "epoch": 0.5734851432432986, "grad_norm": 0.2666439712047577, "learning_rate": 0.00017214559854014468, "loss": 4.6527, "step": 28005 }, { "epoch": 0.5735875330207032, "grad_norm": 0.26047781109809875, "learning_rate": 0.00017208437895376168, "loss": 4.7092, "step": 28010 }, { "epoch": 0.5736899227981078, "grad_norm": 0.25635117292404175, "learning_rate": 0.00017202315560703416, "loss": 4.7167, "step": 28015 }, { "epoch": 0.5737923125755124, "grad_norm": 0.26363733410835266, "learning_rate": 0.00017196192851038676, "loss": 4.6531, "step": 28020 }, { "epoch": 0.5738947023529171, "grad_norm": 0.2589953541755676, "learning_rate": 0.00017190069767424472, "loss": 4.6882, "step": 28025 }, { "epoch": 0.5739970921303217, "grad_norm": 0.28121355175971985, "learning_rate": 0.0001718394631090339, "loss": 4.7118, "step": 28030 }, { "epoch": 0.5740994819077263, "grad_norm": 0.2536048889160156, "learning_rate": 0.00017177822482518078, "loss": 4.6569, "step": 28035 }, { "epoch": 0.5742018716851309, "grad_norm": 0.24868465960025787, "learning_rate": 0.00017171698283311254, "loss": 4.6585, "step": 28040 }, { "epoch": 0.5743042614625355, "grad_norm": 0.2640113830566406, "learning_rate": 0.00017165573714325697, "loss": 4.656, "step": 28045 }, { "epoch": 0.5744066512399402, "grad_norm": 0.24657516181468964, "learning_rate": 0.0001715944877660424, "loss": 4.6864, "step": 28050 }, { "epoch": 0.5745090410173448, "grad_norm": 0.2633567452430725, "learning_rate": 0.00017153323471189803, "loss": 4.7001, "step": 28055 }, { "epoch": 0.5746114307947494, "grad_norm": 0.29227760434150696, "learning_rate": 0.00017147197799125333, "loss": 4.645, "step": 28060 }, { "epoch": 0.574713820572154, "grad_norm": 0.26054689288139343, "learning_rate": 0.00017141071761453872, "loss": 4.6636, "step": 28065 }, { "epoch": 0.5748162103495587, "grad_norm": 0.2804853320121765, "learning_rate": 0.00017134945359218508, "loss": 4.7059, "step": 28070 }, { "epoch": 0.5749186001269633, "grad_norm": 0.26142027974128723, "learning_rate": 0.0001712881859346239, "loss": 4.6987, "step": 28075 }, { "epoch": 0.5750209899043679, "grad_norm": 0.2628867030143738, "learning_rate": 0.00017122691465228733, "loss": 4.6591, "step": 28080 }, { "epoch": 0.5751233796817725, "grad_norm": 0.26298439502716064, "learning_rate": 0.00017116563975560817, "loss": 4.7099, "step": 28085 }, { "epoch": 0.5752257694591772, "grad_norm": 0.2536146640777588, "learning_rate": 0.00017110436125501976, "loss": 4.6849, "step": 28090 }, { "epoch": 0.5753281592365819, "grad_norm": 0.2704731225967407, "learning_rate": 0.00017104307916095615, "loss": 4.6969, "step": 28095 }, { "epoch": 0.5754305490139865, "grad_norm": 0.24779964983463287, "learning_rate": 0.0001709817934838519, "loss": 4.6344, "step": 28100 }, { "epoch": 0.5755329387913911, "grad_norm": 0.2553950250148773, "learning_rate": 0.00017092050423414226, "loss": 4.718, "step": 28105 }, { "epoch": 0.5756353285687957, "grad_norm": 0.27989503741264343, "learning_rate": 0.00017085921142226298, "loss": 4.6681, "step": 28110 }, { "epoch": 0.5757377183462004, "grad_norm": 0.26345035433769226, "learning_rate": 0.00017079791505865058, "loss": 4.6979, "step": 28115 }, { "epoch": 0.575840108123605, "grad_norm": 0.2537969648838043, "learning_rate": 0.00017073661515374205, "loss": 4.7218, "step": 28120 }, { "epoch": 0.5759424979010096, "grad_norm": 0.27907902002334595, "learning_rate": 0.00017067531171797504, "loss": 4.6075, "step": 28125 }, { "epoch": 0.5760448876784142, "grad_norm": 0.2688951790332794, "learning_rate": 0.00017061400476178778, "loss": 4.6963, "step": 28130 }, { "epoch": 0.5761472774558188, "grad_norm": 0.25790733098983765, "learning_rate": 0.0001705526942956191, "loss": 4.6751, "step": 28135 }, { "epoch": 0.5762496672332235, "grad_norm": 0.24855847656726837, "learning_rate": 0.00017049138032990845, "loss": 4.6854, "step": 28140 }, { "epoch": 0.5763520570106281, "grad_norm": 0.2617311179637909, "learning_rate": 0.0001704300628750959, "loss": 4.6834, "step": 28145 }, { "epoch": 0.5764544467880327, "grad_norm": 0.26312047243118286, "learning_rate": 0.00017036874194162202, "loss": 4.6839, "step": 28150 }, { "epoch": 0.5765568365654373, "grad_norm": 0.26814526319503784, "learning_rate": 0.00017030741753992806, "loss": 4.6945, "step": 28155 }, { "epoch": 0.576659226342842, "grad_norm": 0.399715393781662, "learning_rate": 0.00017024608968045578, "loss": 4.7197, "step": 28160 }, { "epoch": 0.5767616161202466, "grad_norm": 0.25246548652648926, "learning_rate": 0.00017018475837364766, "loss": 4.6981, "step": 28165 }, { "epoch": 0.5768640058976512, "grad_norm": 0.2590174674987793, "learning_rate": 0.00017012342362994663, "loss": 4.6296, "step": 28170 }, { "epoch": 0.5769663956750558, "grad_norm": 0.2699030041694641, "learning_rate": 0.0001700620854597963, "loss": 4.6516, "step": 28175 }, { "epoch": 0.5770687854524604, "grad_norm": 0.2671308219432831, "learning_rate": 0.00017000074387364073, "loss": 4.7291, "step": 28180 }, { "epoch": 0.577171175229865, "grad_norm": 0.2662724256515503, "learning_rate": 0.00016993939888192478, "loss": 4.6775, "step": 28185 }, { "epoch": 0.5772735650072697, "grad_norm": 0.283224493265152, "learning_rate": 0.0001698780504950937, "loss": 4.7108, "step": 28190 }, { "epoch": 0.5773759547846743, "grad_norm": 0.2565319836139679, "learning_rate": 0.00016981669872359335, "loss": 4.6752, "step": 28195 }, { "epoch": 0.5774783445620789, "grad_norm": 0.26022616028785706, "learning_rate": 0.00016975534357787026, "loss": 4.6665, "step": 28200 }, { "epoch": 0.5775807343394835, "grad_norm": 0.2849796712398529, "learning_rate": 0.00016969398506837144, "loss": 4.7026, "step": 28205 }, { "epoch": 0.5776831241168882, "grad_norm": 0.2585475444793701, "learning_rate": 0.00016963262320554456, "loss": 4.7047, "step": 28210 }, { "epoch": 0.5777855138942928, "grad_norm": 0.2459850162267685, "learning_rate": 0.00016957125799983775, "loss": 4.6862, "step": 28215 }, { "epoch": 0.5778879036716974, "grad_norm": 0.26823312044143677, "learning_rate": 0.00016950988946169983, "loss": 4.7114, "step": 28220 }, { "epoch": 0.577990293449102, "grad_norm": 0.28757771849632263, "learning_rate": 0.00016944851760158003, "loss": 4.6745, "step": 28225 }, { "epoch": 0.5780926832265066, "grad_norm": 0.2935873866081238, "learning_rate": 0.00016938714242992827, "loss": 4.6881, "step": 28230 }, { "epoch": 0.5781950730039113, "grad_norm": 0.25757095217704773, "learning_rate": 0.00016932576395719507, "loss": 4.6828, "step": 28235 }, { "epoch": 0.5782974627813159, "grad_norm": 0.2548655867576599, "learning_rate": 0.0001692643821938314, "loss": 4.6973, "step": 28240 }, { "epoch": 0.5783998525587205, "grad_norm": 0.2801399528980255, "learning_rate": 0.00016920299715028882, "loss": 4.6526, "step": 28245 }, { "epoch": 0.5785022423361251, "grad_norm": 0.2520582675933838, "learning_rate": 0.0001691416088370195, "loss": 4.6791, "step": 28250 }, { "epoch": 0.5786046321135297, "grad_norm": 0.25987929105758667, "learning_rate": 0.00016908021726447613, "loss": 4.6524, "step": 28255 }, { "epoch": 0.5787070218909344, "grad_norm": 0.2543611526489258, "learning_rate": 0.00016901882244311188, "loss": 4.668, "step": 28260 }, { "epoch": 0.578809411668339, "grad_norm": 0.2713223993778229, "learning_rate": 0.00016895742438338067, "loss": 4.6145, "step": 28265 }, { "epoch": 0.5789118014457436, "grad_norm": 0.2809205651283264, "learning_rate": 0.0001688960230957368, "loss": 4.6519, "step": 28270 }, { "epoch": 0.5790141912231482, "grad_norm": 0.27665913105010986, "learning_rate": 0.00016883461859063513, "loss": 4.7191, "step": 28275 }, { "epoch": 0.5791165810005529, "grad_norm": 0.2994121313095093, "learning_rate": 0.00016877321087853118, "loss": 4.6631, "step": 28280 }, { "epoch": 0.5792189707779575, "grad_norm": 0.2573554515838623, "learning_rate": 0.00016871179996988083, "loss": 4.6867, "step": 28285 }, { "epoch": 0.5793213605553622, "grad_norm": 0.2663307189941406, "learning_rate": 0.00016865038587514077, "loss": 4.6789, "step": 28290 }, { "epoch": 0.5794237503327668, "grad_norm": 0.2640919089317322, "learning_rate": 0.00016858896860476799, "loss": 4.6987, "step": 28295 }, { "epoch": 0.5795261401101715, "grad_norm": 0.2629927098751068, "learning_rate": 0.0001685275481692201, "loss": 4.6202, "step": 28300 }, { "epoch": 0.5796285298875761, "grad_norm": 0.5783554315567017, "learning_rate": 0.00016846612457895534, "loss": 4.6934, "step": 28305 }, { "epoch": 0.5797309196649807, "grad_norm": 0.29603996872901917, "learning_rate": 0.0001684046978444324, "loss": 4.6555, "step": 28310 }, { "epoch": 0.5798333094423853, "grad_norm": 0.2576543986797333, "learning_rate": 0.00016834326797611037, "loss": 4.6764, "step": 28315 }, { "epoch": 0.5799356992197899, "grad_norm": 0.2601107656955719, "learning_rate": 0.00016828183498444915, "loss": 4.6808, "step": 28320 }, { "epoch": 0.5800380889971946, "grad_norm": 0.25905126333236694, "learning_rate": 0.00016822039887990907, "loss": 4.6454, "step": 28325 }, { "epoch": 0.5801404787745992, "grad_norm": 0.27695906162261963, "learning_rate": 0.00016815895967295085, "loss": 4.6744, "step": 28330 }, { "epoch": 0.5802428685520038, "grad_norm": 0.2757084369659424, "learning_rate": 0.00016809751737403592, "loss": 4.7141, "step": 28335 }, { "epoch": 0.5803452583294084, "grad_norm": 0.27058765292167664, "learning_rate": 0.00016803607199362614, "loss": 4.6815, "step": 28340 }, { "epoch": 0.580447648106813, "grad_norm": 0.26202329993247986, "learning_rate": 0.00016797462354218388, "loss": 4.6967, "step": 28345 }, { "epoch": 0.5805500378842177, "grad_norm": 0.2569383382797241, "learning_rate": 0.00016791317203017217, "loss": 4.6194, "step": 28350 }, { "epoch": 0.5806524276616223, "grad_norm": 0.2605290710926056, "learning_rate": 0.0001678517174680544, "loss": 4.707, "step": 28355 }, { "epoch": 0.5807548174390269, "grad_norm": 0.31713876128196716, "learning_rate": 0.00016779025986629452, "loss": 4.6597, "step": 28360 }, { "epoch": 0.5808572072164315, "grad_norm": 0.25696828961372375, "learning_rate": 0.00016772879923535706, "loss": 4.69, "step": 28365 }, { "epoch": 0.5809595969938361, "grad_norm": 0.31100475788116455, "learning_rate": 0.00016766733558570696, "loss": 4.6482, "step": 28370 }, { "epoch": 0.5810619867712408, "grad_norm": 0.26407870650291443, "learning_rate": 0.0001676058689278099, "loss": 4.7087, "step": 28375 }, { "epoch": 0.5811643765486454, "grad_norm": 0.26597467064857483, "learning_rate": 0.00016754439927213174, "loss": 4.6945, "step": 28380 }, { "epoch": 0.58126676632605, "grad_norm": 0.25429606437683105, "learning_rate": 0.00016748292662913908, "loss": 4.6643, "step": 28385 }, { "epoch": 0.5813691561034546, "grad_norm": 0.2809353172779083, "learning_rate": 0.00016742145100929898, "loss": 4.7229, "step": 28390 }, { "epoch": 0.5814715458808593, "grad_norm": 0.2482432723045349, "learning_rate": 0.00016735997242307903, "loss": 4.6838, "step": 28395 }, { "epoch": 0.5815739356582639, "grad_norm": 0.265804648399353, "learning_rate": 0.0001672984908809472, "loss": 4.6401, "step": 28400 }, { "epoch": 0.5816763254356685, "grad_norm": 0.2800675928592682, "learning_rate": 0.00016723700639337217, "loss": 4.6847, "step": 28405 }, { "epoch": 0.5817787152130731, "grad_norm": 0.38202622532844543, "learning_rate": 0.00016717551897082287, "loss": 4.7052, "step": 28410 }, { "epoch": 0.5818811049904777, "grad_norm": 0.2673313319683075, "learning_rate": 0.00016711402862376902, "loss": 4.6676, "step": 28415 }, { "epoch": 0.5819834947678824, "grad_norm": 0.27206218242645264, "learning_rate": 0.00016705253536268062, "loss": 4.6282, "step": 28420 }, { "epoch": 0.582085884545287, "grad_norm": 0.26041173934936523, "learning_rate": 0.00016699103919802817, "loss": 4.6551, "step": 28425 }, { "epoch": 0.5821882743226916, "grad_norm": 0.2748391926288605, "learning_rate": 0.0001669295401402828, "loss": 4.6714, "step": 28430 }, { "epoch": 0.5822906641000962, "grad_norm": 0.2595105469226837, "learning_rate": 0.00016686803819991605, "loss": 4.6219, "step": 28435 }, { "epoch": 0.5823930538775008, "grad_norm": 0.2541680335998535, "learning_rate": 0.00016680653338739992, "loss": 4.6717, "step": 28440 }, { "epoch": 0.5824954436549055, "grad_norm": 0.2788894474506378, "learning_rate": 0.000166745025713207, "loss": 4.6475, "step": 28445 }, { "epoch": 0.5825978334323101, "grad_norm": 0.2586911916732788, "learning_rate": 0.0001666835151878103, "loss": 4.6912, "step": 28450 }, { "epoch": 0.5827002232097147, "grad_norm": 0.4290415942668915, "learning_rate": 0.00016662200182168322, "loss": 4.7443, "step": 28455 }, { "epoch": 0.5828026129871193, "grad_norm": 0.26729732751846313, "learning_rate": 0.00016656048562529982, "loss": 4.6921, "step": 28460 }, { "epoch": 0.582905002764524, "grad_norm": 0.27923545241355896, "learning_rate": 0.00016649896660913462, "loss": 4.6826, "step": 28465 }, { "epoch": 0.5830073925419286, "grad_norm": 0.25673234462738037, "learning_rate": 0.00016643744478366245, "loss": 4.667, "step": 28470 }, { "epoch": 0.5831097823193332, "grad_norm": 0.2606159448623657, "learning_rate": 0.00016637592015935886, "loss": 4.6494, "step": 28475 }, { "epoch": 0.5832121720967378, "grad_norm": 0.25604692101478577, "learning_rate": 0.00016631439274669964, "loss": 4.6766, "step": 28480 }, { "epoch": 0.5833145618741424, "grad_norm": 0.25201162695884705, "learning_rate": 0.00016625286255616124, "loss": 4.7532, "step": 28485 }, { "epoch": 0.583416951651547, "grad_norm": 0.24749386310577393, "learning_rate": 0.00016619132959822046, "loss": 4.7172, "step": 28490 }, { "epoch": 0.5835193414289518, "grad_norm": 0.25692281126976013, "learning_rate": 0.0001661297938833547, "loss": 4.6548, "step": 28495 }, { "epoch": 0.5836217312063564, "grad_norm": 0.24823105335235596, "learning_rate": 0.00016606825542204163, "loss": 4.6655, "step": 28500 }, { "epoch": 0.583724120983761, "grad_norm": 0.25307193398475647, "learning_rate": 0.00016600671422475959, "loss": 4.7401, "step": 28505 }, { "epoch": 0.5838265107611657, "grad_norm": 0.25224077701568604, "learning_rate": 0.00016594517030198727, "loss": 4.7756, "step": 28510 }, { "epoch": 0.5839289005385703, "grad_norm": 0.2719027101993561, "learning_rate": 0.0001658836236642039, "loss": 4.6432, "step": 28515 }, { "epoch": 0.5840312903159749, "grad_norm": 0.2612490653991699, "learning_rate": 0.00016582207432188912, "loss": 4.759, "step": 28520 }, { "epoch": 0.5841336800933795, "grad_norm": 0.25117358565330505, "learning_rate": 0.00016576052228552298, "loss": 4.7374, "step": 28525 }, { "epoch": 0.5842360698707841, "grad_norm": 0.24579162895679474, "learning_rate": 0.0001656989675655861, "loss": 4.6565, "step": 28530 }, { "epoch": 0.5843384596481888, "grad_norm": 0.3122329115867615, "learning_rate": 0.00016563741017255952, "loss": 4.669, "step": 28535 }, { "epoch": 0.5844408494255934, "grad_norm": 0.2683679461479187, "learning_rate": 0.0001655758501169247, "loss": 4.664, "step": 28540 }, { "epoch": 0.584543239202998, "grad_norm": 0.275326669216156, "learning_rate": 0.00016551428740916357, "loss": 4.7266, "step": 28545 }, { "epoch": 0.5846456289804026, "grad_norm": 0.30032962560653687, "learning_rate": 0.00016545272205975852, "loss": 4.661, "step": 28550 }, { "epoch": 0.5847480187578072, "grad_norm": 0.36011090874671936, "learning_rate": 0.00016539115407919236, "loss": 4.6642, "step": 28555 }, { "epoch": 0.5848504085352119, "grad_norm": 0.30276724696159363, "learning_rate": 0.00016532958347794844, "loss": 4.6804, "step": 28560 }, { "epoch": 0.5849527983126165, "grad_norm": 0.2697668969631195, "learning_rate": 0.00016526801026651045, "loss": 4.6549, "step": 28565 }, { "epoch": 0.5850551880900211, "grad_norm": 0.28318217396736145, "learning_rate": 0.00016520643445536257, "loss": 4.6579, "step": 28570 }, { "epoch": 0.5851575778674257, "grad_norm": 0.26577475666999817, "learning_rate": 0.00016514485605498936, "loss": 4.6787, "step": 28575 }, { "epoch": 0.5852599676448303, "grad_norm": 0.27212128043174744, "learning_rate": 0.00016508327507587595, "loss": 4.6718, "step": 28580 }, { "epoch": 0.585362357422235, "grad_norm": 0.27527156472206116, "learning_rate": 0.00016502169152850786, "loss": 4.6573, "step": 28585 }, { "epoch": 0.5854647471996396, "grad_norm": 0.2678073048591614, "learning_rate": 0.000164960105423371, "loss": 4.6498, "step": 28590 }, { "epoch": 0.5855671369770442, "grad_norm": 0.2557547688484192, "learning_rate": 0.00016489851677095165, "loss": 4.6869, "step": 28595 }, { "epoch": 0.5856695267544488, "grad_norm": 0.3396357595920563, "learning_rate": 0.0001648369255817367, "loss": 4.6218, "step": 28600 }, { "epoch": 0.5857719165318535, "grad_norm": 0.2767936885356903, "learning_rate": 0.00016477533186621345, "loss": 4.7138, "step": 28605 }, { "epoch": 0.5858743063092581, "grad_norm": 0.28629255294799805, "learning_rate": 0.0001647137356348694, "loss": 4.7251, "step": 28610 }, { "epoch": 0.5859766960866627, "grad_norm": 0.30186545848846436, "learning_rate": 0.0001646521368981928, "loss": 4.6946, "step": 28615 }, { "epoch": 0.5860790858640673, "grad_norm": 0.3243028223514557, "learning_rate": 0.0001645905356666721, "loss": 4.6657, "step": 28620 }, { "epoch": 0.5861814756414719, "grad_norm": 0.3595491349697113, "learning_rate": 0.00016452893195079618, "loss": 4.5852, "step": 28625 }, { "epoch": 0.5862838654188766, "grad_norm": 0.25961947441101074, "learning_rate": 0.00016446732576105453, "loss": 4.6496, "step": 28630 }, { "epoch": 0.5863862551962812, "grad_norm": 0.4880898892879486, "learning_rate": 0.00016440571710793692, "loss": 4.6133, "step": 28635 }, { "epoch": 0.5864886449736858, "grad_norm": 0.2825993001461029, "learning_rate": 0.00016434410600193347, "loss": 4.7074, "step": 28640 }, { "epoch": 0.5865910347510904, "grad_norm": 0.2694116532802582, "learning_rate": 0.00016428249245353487, "loss": 4.6548, "step": 28645 }, { "epoch": 0.586693424528495, "grad_norm": 0.27538102865219116, "learning_rate": 0.00016422087647323218, "loss": 4.6847, "step": 28650 }, { "epoch": 0.5867958143058997, "grad_norm": 0.25950124859809875, "learning_rate": 0.00016415925807151683, "loss": 4.6639, "step": 28655 }, { "epoch": 0.5868982040833043, "grad_norm": 0.25048917531967163, "learning_rate": 0.00016409763725888074, "loss": 4.6673, "step": 28660 }, { "epoch": 0.5870005938607089, "grad_norm": 0.2542320787906647, "learning_rate": 0.00016403601404581608, "loss": 4.6901, "step": 28665 }, { "epoch": 0.5871029836381135, "grad_norm": 0.25401172041893005, "learning_rate": 0.00016397438844281562, "loss": 4.6929, "step": 28670 }, { "epoch": 0.5872053734155182, "grad_norm": 0.2795599400997162, "learning_rate": 0.00016391276046037243, "loss": 4.6901, "step": 28675 }, { "epoch": 0.5873077631929228, "grad_norm": 0.26690590381622314, "learning_rate": 0.00016385113010898003, "loss": 4.6625, "step": 28680 }, { "epoch": 0.5874101529703274, "grad_norm": 0.2658803164958954, "learning_rate": 0.00016378949739913234, "loss": 4.6648, "step": 28685 }, { "epoch": 0.587512542747732, "grad_norm": 0.2667374908924103, "learning_rate": 0.00016372786234132363, "loss": 4.6663, "step": 28690 }, { "epoch": 0.5876149325251366, "grad_norm": 0.2675791084766388, "learning_rate": 0.00016366622494604858, "loss": 4.6718, "step": 28695 }, { "epoch": 0.5877173223025414, "grad_norm": 0.2973770499229431, "learning_rate": 0.00016360458522380234, "loss": 4.6598, "step": 28700 }, { "epoch": 0.587819712079946, "grad_norm": 0.2763424217700958, "learning_rate": 0.00016354294318508044, "loss": 4.5858, "step": 28705 }, { "epoch": 0.5879221018573506, "grad_norm": 0.2831302285194397, "learning_rate": 0.00016348129884037867, "loss": 4.6891, "step": 28710 }, { "epoch": 0.5880244916347552, "grad_norm": 0.2575315237045288, "learning_rate": 0.00016341965220019342, "loss": 4.6859, "step": 28715 }, { "epoch": 0.5881268814121599, "grad_norm": 0.28888994455337524, "learning_rate": 0.0001633580032750213, "loss": 4.6963, "step": 28720 }, { "epoch": 0.5882292711895645, "grad_norm": 0.2642802894115448, "learning_rate": 0.0001632963520753594, "loss": 4.6827, "step": 28725 }, { "epoch": 0.5883316609669691, "grad_norm": 0.2706809341907501, "learning_rate": 0.00016323469861170522, "loss": 4.6905, "step": 28730 }, { "epoch": 0.5884340507443737, "grad_norm": 0.3463543653488159, "learning_rate": 0.0001631730428945565, "loss": 4.6254, "step": 28735 }, { "epoch": 0.5885364405217783, "grad_norm": 0.26060375571250916, "learning_rate": 0.00016311138493441154, "loss": 4.6735, "step": 28740 }, { "epoch": 0.588638830299183, "grad_norm": 0.2775377333164215, "learning_rate": 0.00016304972474176895, "loss": 4.6673, "step": 28745 }, { "epoch": 0.5887412200765876, "grad_norm": 0.29451581835746765, "learning_rate": 0.00016298806232712762, "loss": 4.647, "step": 28750 }, { "epoch": 0.5888436098539922, "grad_norm": 0.28171879053115845, "learning_rate": 0.00016292639770098708, "loss": 4.6576, "step": 28755 }, { "epoch": 0.5889459996313968, "grad_norm": 0.2790437638759613, "learning_rate": 0.00016286473087384694, "loss": 4.7157, "step": 28760 }, { "epoch": 0.5890483894088014, "grad_norm": 0.27042701840400696, "learning_rate": 0.0001628030618562073, "loss": 4.7097, "step": 28765 }, { "epoch": 0.5891507791862061, "grad_norm": 0.35401538014411926, "learning_rate": 0.00016274139065856872, "loss": 4.725, "step": 28770 }, { "epoch": 0.5892531689636107, "grad_norm": 0.2562393546104431, "learning_rate": 0.00016267971729143205, "loss": 4.6841, "step": 28775 }, { "epoch": 0.5893555587410153, "grad_norm": 0.2634783387184143, "learning_rate": 0.00016261804176529848, "loss": 4.6525, "step": 28780 }, { "epoch": 0.5894579485184199, "grad_norm": 0.37353622913360596, "learning_rate": 0.00016255636409066965, "loss": 4.6772, "step": 28785 }, { "epoch": 0.5895603382958245, "grad_norm": 0.31675294041633606, "learning_rate": 0.00016249468427804745, "loss": 4.6367, "step": 28790 }, { "epoch": 0.5896627280732292, "grad_norm": 0.2585216760635376, "learning_rate": 0.0001624330023379343, "loss": 4.6942, "step": 28795 }, { "epoch": 0.5897651178506338, "grad_norm": 0.28349608182907104, "learning_rate": 0.0001623713182808329, "loss": 4.7013, "step": 28800 }, { "epoch": 0.5898675076280384, "grad_norm": 0.25947660207748413, "learning_rate": 0.00016230963211724613, "loss": 4.6381, "step": 28805 }, { "epoch": 0.589969897405443, "grad_norm": 0.2591255009174347, "learning_rate": 0.0001622479438576776, "loss": 4.6237, "step": 28810 }, { "epoch": 0.5900722871828477, "grad_norm": 0.2591240406036377, "learning_rate": 0.00016218625351263093, "loss": 4.6259, "step": 28815 }, { "epoch": 0.5901746769602523, "grad_norm": 0.275452196598053, "learning_rate": 0.0001621245610926103, "loss": 4.6352, "step": 28820 }, { "epoch": 0.5902770667376569, "grad_norm": 0.2508513331413269, "learning_rate": 0.0001620628666081202, "loss": 4.6222, "step": 28825 }, { "epoch": 0.5903794565150615, "grad_norm": 0.25792360305786133, "learning_rate": 0.00016200117006966543, "loss": 4.6654, "step": 28830 }, { "epoch": 0.5904818462924661, "grad_norm": 0.263846218585968, "learning_rate": 0.0001619394714877511, "loss": 4.6983, "step": 28835 }, { "epoch": 0.5905842360698708, "grad_norm": 0.26363155245780945, "learning_rate": 0.00016187777087288286, "loss": 4.7044, "step": 28840 }, { "epoch": 0.5906866258472754, "grad_norm": 0.29336559772491455, "learning_rate": 0.0001618160682355665, "loss": 4.7013, "step": 28845 }, { "epoch": 0.59078901562468, "grad_norm": 0.24546614289283752, "learning_rate": 0.00016175436358630823, "loss": 4.669, "step": 28850 }, { "epoch": 0.5908914054020846, "grad_norm": 0.2472277730703354, "learning_rate": 0.00016169265693561464, "loss": 4.589, "step": 28855 }, { "epoch": 0.5909937951794892, "grad_norm": 0.37090569734573364, "learning_rate": 0.00016163094829399258, "loss": 4.6549, "step": 28860 }, { "epoch": 0.5910961849568939, "grad_norm": 0.25700247287750244, "learning_rate": 0.00016156923767194937, "loss": 4.6596, "step": 28865 }, { "epoch": 0.5911985747342985, "grad_norm": 0.27771201729774475, "learning_rate": 0.00016150752507999248, "loss": 4.7326, "step": 28870 }, { "epoch": 0.5913009645117031, "grad_norm": 0.26109370589256287, "learning_rate": 0.00016144581052862983, "loss": 4.6741, "step": 28875 }, { "epoch": 0.5914033542891077, "grad_norm": 0.2527990937232971, "learning_rate": 0.00016138409402836974, "loss": 4.7112, "step": 28880 }, { "epoch": 0.5915057440665124, "grad_norm": 0.2642720639705658, "learning_rate": 0.0001613223755897207, "loss": 4.6787, "step": 28885 }, { "epoch": 0.591608133843917, "grad_norm": 0.2582034170627594, "learning_rate": 0.0001612606552231917, "loss": 4.6967, "step": 28890 }, { "epoch": 0.5917105236213216, "grad_norm": 0.26001131534576416, "learning_rate": 0.00016119893293929186, "loss": 4.6871, "step": 28895 }, { "epoch": 0.5918129133987263, "grad_norm": 0.2612674832344055, "learning_rate": 0.00016113720874853083, "loss": 4.6856, "step": 28900 }, { "epoch": 0.591915303176131, "grad_norm": 0.269471675157547, "learning_rate": 0.00016107548266141843, "loss": 4.7022, "step": 28905 }, { "epoch": 0.5920176929535356, "grad_norm": 0.26346471905708313, "learning_rate": 0.0001610137546884649, "loss": 4.7134, "step": 28910 }, { "epoch": 0.5921200827309402, "grad_norm": 0.2597687840461731, "learning_rate": 0.00016095202484018076, "loss": 4.6904, "step": 28915 }, { "epoch": 0.5922224725083448, "grad_norm": 0.27877506613731384, "learning_rate": 0.00016089029312707685, "loss": 4.5869, "step": 28920 }, { "epoch": 0.5923248622857494, "grad_norm": 0.27964040637016296, "learning_rate": 0.00016082855955966438, "loss": 4.7057, "step": 28925 }, { "epoch": 0.592427252063154, "grad_norm": 0.7393932342529297, "learning_rate": 0.00016076682414845468, "loss": 4.6607, "step": 28930 }, { "epoch": 0.5925296418405587, "grad_norm": 0.27063611149787903, "learning_rate": 0.00016070508690395978, "loss": 4.6405, "step": 28935 }, { "epoch": 0.5926320316179633, "grad_norm": 0.28568169474601746, "learning_rate": 0.00016064334783669158, "loss": 4.7347, "step": 28940 }, { "epoch": 0.5927344213953679, "grad_norm": 0.27155593037605286, "learning_rate": 0.00016058160695716256, "loss": 4.6198, "step": 28945 }, { "epoch": 0.5928368111727725, "grad_norm": 0.2821517884731293, "learning_rate": 0.0001605198642758855, "loss": 4.7014, "step": 28950 }, { "epoch": 0.5929392009501772, "grad_norm": 0.27491268515586853, "learning_rate": 0.00016045811980337335, "loss": 4.691, "step": 28955 }, { "epoch": 0.5930415907275818, "grad_norm": 0.27691519260406494, "learning_rate": 0.00016039637355013946, "loss": 4.6568, "step": 28960 }, { "epoch": 0.5931439805049864, "grad_norm": 0.25760793685913086, "learning_rate": 0.0001603346255266975, "loss": 4.6677, "step": 28965 }, { "epoch": 0.593246370282391, "grad_norm": 0.297439306974411, "learning_rate": 0.00016027287574356148, "loss": 4.6604, "step": 28970 }, { "epoch": 0.5933487600597956, "grad_norm": 0.27333223819732666, "learning_rate": 0.00016021112421124546, "loss": 4.6476, "step": 28975 }, { "epoch": 0.5934511498372003, "grad_norm": 0.28570982813835144, "learning_rate": 0.00016014937094026407, "loss": 4.6237, "step": 28980 }, { "epoch": 0.5935535396146049, "grad_norm": 0.25136297941207886, "learning_rate": 0.00016008761594113222, "loss": 4.6771, "step": 28985 }, { "epoch": 0.5936559293920095, "grad_norm": 0.26670604944229126, "learning_rate": 0.00016002585922436492, "loss": 4.7002, "step": 28990 }, { "epoch": 0.5937583191694141, "grad_norm": 0.26492464542388916, "learning_rate": 0.00015996410080047765, "loss": 4.685, "step": 28995 }, { "epoch": 0.5938607089468187, "grad_norm": 0.26494765281677246, "learning_rate": 0.0001599023406799861, "loss": 4.6326, "step": 29000 }, { "epoch": 0.5938607089468187, "eval_loss": 4.6867289543151855, "eval_runtime": 17.1353, "eval_samples_per_second": 33.965, "eval_steps_per_second": 4.26, "step": 29000 }, { "epoch": 0.5939630987242234, "grad_norm": 0.28002622723579407, "learning_rate": 0.00015984057887340626, "loss": 4.6923, "step": 29005 }, { "epoch": 0.594065488501628, "grad_norm": 0.2913799285888672, "learning_rate": 0.00015977881539125448, "loss": 4.6041, "step": 29010 }, { "epoch": 0.5941678782790326, "grad_norm": 0.27965831756591797, "learning_rate": 0.00015971705024404726, "loss": 4.6498, "step": 29015 }, { "epoch": 0.5942702680564372, "grad_norm": 0.27786508202552795, "learning_rate": 0.00015965528344230148, "loss": 4.6857, "step": 29020 }, { "epoch": 0.5943726578338419, "grad_norm": 0.26868125796318054, "learning_rate": 0.0001595935149965343, "loss": 4.7213, "step": 29025 }, { "epoch": 0.5944750476112465, "grad_norm": 0.28863799571990967, "learning_rate": 0.00015953174491726307, "loss": 4.7019, "step": 29030 }, { "epoch": 0.5945774373886511, "grad_norm": 0.3015957176685333, "learning_rate": 0.00015946997321500563, "loss": 4.6739, "step": 29035 }, { "epoch": 0.5946798271660557, "grad_norm": 0.2558874189853668, "learning_rate": 0.0001594081999002798, "loss": 4.6626, "step": 29040 }, { "epoch": 0.5947822169434603, "grad_norm": 0.26330509781837463, "learning_rate": 0.00015934642498360388, "loss": 4.7061, "step": 29045 }, { "epoch": 0.594884606720865, "grad_norm": 0.2671198546886444, "learning_rate": 0.00015928464847549635, "loss": 4.627, "step": 29050 }, { "epoch": 0.5949869964982696, "grad_norm": 0.27151599526405334, "learning_rate": 0.00015922287038647615, "loss": 4.6833, "step": 29055 }, { "epoch": 0.5950893862756742, "grad_norm": 0.24983324110507965, "learning_rate": 0.00015916109072706215, "loss": 4.6917, "step": 29060 }, { "epoch": 0.5951917760530788, "grad_norm": 0.26332369446754456, "learning_rate": 0.0001590993095077738, "loss": 4.6526, "step": 29065 }, { "epoch": 0.5952941658304834, "grad_norm": 0.26414334774017334, "learning_rate": 0.00015903752673913061, "loss": 4.7179, "step": 29070 }, { "epoch": 0.5953965556078881, "grad_norm": 0.2591826021671295, "learning_rate": 0.0001589757424316525, "loss": 4.6453, "step": 29075 }, { "epoch": 0.5954989453852927, "grad_norm": 0.26921385526657104, "learning_rate": 0.00015891395659585963, "loss": 4.6801, "step": 29080 }, { "epoch": 0.5956013351626973, "grad_norm": 0.2525412142276764, "learning_rate": 0.00015885216924227223, "loss": 4.7421, "step": 29085 }, { "epoch": 0.5957037249401019, "grad_norm": 0.2510041892528534, "learning_rate": 0.00015879038038141106, "loss": 4.7037, "step": 29090 }, { "epoch": 0.5958061147175066, "grad_norm": 0.27487054467201233, "learning_rate": 0.00015872859002379695, "loss": 4.7657, "step": 29095 }, { "epoch": 0.5959085044949112, "grad_norm": 0.25881633162498474, "learning_rate": 0.00015866679817995108, "loss": 4.6379, "step": 29100 }, { "epoch": 0.5960108942723159, "grad_norm": 0.27736732363700867, "learning_rate": 0.00015860500486039489, "loss": 4.6497, "step": 29105 }, { "epoch": 0.5961132840497205, "grad_norm": 0.2656888961791992, "learning_rate": 0.00015854321007564997, "loss": 4.6758, "step": 29110 }, { "epoch": 0.5962156738271251, "grad_norm": 0.25772184133529663, "learning_rate": 0.00015848141383623823, "loss": 4.6871, "step": 29115 }, { "epoch": 0.5963180636045298, "grad_norm": 1.7596418857574463, "learning_rate": 0.00015841961615268183, "loss": 4.6907, "step": 29120 }, { "epoch": 0.5964204533819344, "grad_norm": 0.26713013648986816, "learning_rate": 0.0001583578170355032, "loss": 4.6423, "step": 29125 }, { "epoch": 0.596522843159339, "grad_norm": 0.2562316358089447, "learning_rate": 0.0001582960164952249, "loss": 4.6231, "step": 29130 }, { "epoch": 0.5966252329367436, "grad_norm": 0.2903379201889038, "learning_rate": 0.00015823421454236992, "loss": 4.6779, "step": 29135 }, { "epoch": 0.5967276227141483, "grad_norm": 0.2811809480190277, "learning_rate": 0.00015817241118746124, "loss": 4.6525, "step": 29140 }, { "epoch": 0.5968300124915529, "grad_norm": 0.29069632291793823, "learning_rate": 0.00015811060644102233, "loss": 4.6847, "step": 29145 }, { "epoch": 0.5969324022689575, "grad_norm": 0.25295203924179077, "learning_rate": 0.0001580488003135768, "loss": 4.6821, "step": 29150 }, { "epoch": 0.5970347920463621, "grad_norm": 0.2610161602497101, "learning_rate": 0.00015798699281564845, "loss": 4.6196, "step": 29155 }, { "epoch": 0.5971371818237667, "grad_norm": 0.26808518171310425, "learning_rate": 0.00015792518395776133, "loss": 4.6069, "step": 29160 }, { "epoch": 0.5972395716011714, "grad_norm": 0.260791152715683, "learning_rate": 0.00015786337375043973, "loss": 4.7065, "step": 29165 }, { "epoch": 0.597341961378576, "grad_norm": 0.26363709568977356, "learning_rate": 0.00015780156220420818, "loss": 4.6931, "step": 29170 }, { "epoch": 0.5974443511559806, "grad_norm": 0.26608097553253174, "learning_rate": 0.00015773974932959154, "loss": 4.6345, "step": 29175 }, { "epoch": 0.5975467409333852, "grad_norm": 0.2536289095878601, "learning_rate": 0.0001576779351371147, "loss": 4.693, "step": 29180 }, { "epoch": 0.5976491307107898, "grad_norm": 0.24805046617984772, "learning_rate": 0.00015761611963730282, "loss": 4.6448, "step": 29185 }, { "epoch": 0.5977515204881945, "grad_norm": 0.26312559843063354, "learning_rate": 0.00015755430284068137, "loss": 4.6967, "step": 29190 }, { "epoch": 0.5978539102655991, "grad_norm": 0.2719932794570923, "learning_rate": 0.0001574924847577761, "loss": 4.7166, "step": 29195 }, { "epoch": 0.5979563000430037, "grad_norm": 0.39098620414733887, "learning_rate": 0.00015743066539911275, "loss": 4.6323, "step": 29200 }, { "epoch": 0.5980586898204083, "grad_norm": 0.2763932943344116, "learning_rate": 0.00015736884477521749, "loss": 4.6671, "step": 29205 }, { "epoch": 0.598161079597813, "grad_norm": 0.27419716119766235, "learning_rate": 0.0001573070228966165, "loss": 4.6826, "step": 29210 }, { "epoch": 0.5982634693752176, "grad_norm": 0.26688000559806824, "learning_rate": 0.00015724519977383642, "loss": 4.6959, "step": 29215 }, { "epoch": 0.5983658591526222, "grad_norm": 0.2649906575679779, "learning_rate": 0.00015718337541740398, "loss": 4.6427, "step": 29220 }, { "epoch": 0.5984682489300268, "grad_norm": 0.25840264558792114, "learning_rate": 0.00015712154983784602, "loss": 4.6831, "step": 29225 }, { "epoch": 0.5985706387074314, "grad_norm": 0.2627979815006256, "learning_rate": 0.00015705972304568978, "loss": 4.6888, "step": 29230 }, { "epoch": 0.5986730284848361, "grad_norm": 0.2715907692909241, "learning_rate": 0.00015699789505146258, "loss": 4.6857, "step": 29235 }, { "epoch": 0.5987754182622407, "grad_norm": 0.28489550948143005, "learning_rate": 0.00015693606586569193, "loss": 4.6919, "step": 29240 }, { "epoch": 0.5988778080396453, "grad_norm": 0.2671533524990082, "learning_rate": 0.00015687423549890566, "loss": 4.6655, "step": 29245 }, { "epoch": 0.5989801978170499, "grad_norm": 0.26588207483291626, "learning_rate": 0.00015681240396163174, "loss": 4.6856, "step": 29250 }, { "epoch": 0.5990825875944545, "grad_norm": 0.3119635581970215, "learning_rate": 0.00015675057126439824, "loss": 4.6705, "step": 29255 }, { "epoch": 0.5991849773718592, "grad_norm": 0.25209978222846985, "learning_rate": 0.00015668873741773355, "loss": 4.6643, "step": 29260 }, { "epoch": 0.5992873671492638, "grad_norm": 0.25916942954063416, "learning_rate": 0.00015662690243216628, "loss": 4.6447, "step": 29265 }, { "epoch": 0.5993897569266684, "grad_norm": 0.26691484451293945, "learning_rate": 0.00015656506631822514, "loss": 4.6826, "step": 29270 }, { "epoch": 0.599492146704073, "grad_norm": 0.35600513219833374, "learning_rate": 0.00015650322908643908, "loss": 4.6915, "step": 29275 }, { "epoch": 0.5995945364814776, "grad_norm": 0.2651926875114441, "learning_rate": 0.00015644139074733716, "loss": 4.6604, "step": 29280 }, { "epoch": 0.5996969262588823, "grad_norm": 0.2603910565376282, "learning_rate": 0.0001563795513114488, "loss": 4.6681, "step": 29285 }, { "epoch": 0.5997993160362869, "grad_norm": 0.26804107427597046, "learning_rate": 0.00015631771078930347, "loss": 4.675, "step": 29290 }, { "epoch": 0.5999017058136915, "grad_norm": 0.2554345428943634, "learning_rate": 0.00015625586919143085, "loss": 4.6594, "step": 29295 }, { "epoch": 0.6000040955910961, "grad_norm": 0.2568386197090149, "learning_rate": 0.00015619402652836085, "loss": 4.6619, "step": 29300 }, { "epoch": 0.6001064853685009, "grad_norm": 0.28163617849349976, "learning_rate": 0.00015613218281062343, "loss": 4.637, "step": 29305 }, { "epoch": 0.6002088751459055, "grad_norm": 0.2598473131656647, "learning_rate": 0.00015607033804874892, "loss": 4.7283, "step": 29310 }, { "epoch": 0.6003112649233101, "grad_norm": 0.2979266345500946, "learning_rate": 0.0001560084922532677, "loss": 4.6401, "step": 29315 }, { "epoch": 0.6004136547007147, "grad_norm": 0.251546174287796, "learning_rate": 0.00015594664543471039, "loss": 4.6493, "step": 29320 }, { "epoch": 0.6005160444781193, "grad_norm": 0.26767000555992126, "learning_rate": 0.00015588479760360766, "loss": 4.6874, "step": 29325 }, { "epoch": 0.600618434255524, "grad_norm": 0.26352837681770325, "learning_rate": 0.0001558229487704905, "loss": 4.6588, "step": 29330 }, { "epoch": 0.6007208240329286, "grad_norm": 0.319023996591568, "learning_rate": 0.00015576109894589008, "loss": 4.7051, "step": 29335 }, { "epoch": 0.6008232138103332, "grad_norm": 0.24961285293102264, "learning_rate": 0.00015569924814033752, "loss": 4.6927, "step": 29340 }, { "epoch": 0.6009256035877378, "grad_norm": 0.26609739661216736, "learning_rate": 0.00015563739636436445, "loss": 4.6607, "step": 29345 }, { "epoch": 0.6010279933651425, "grad_norm": 0.2591767907142639, "learning_rate": 0.00015557554362850232, "loss": 4.6741, "step": 29350 }, { "epoch": 0.6011303831425471, "grad_norm": 0.2783168852329254, "learning_rate": 0.00015551368994328298, "loss": 4.6915, "step": 29355 }, { "epoch": 0.6012327729199517, "grad_norm": 0.30141308903694153, "learning_rate": 0.00015545183531923834, "loss": 4.6651, "step": 29360 }, { "epoch": 0.6013351626973563, "grad_norm": 0.2937859296798706, "learning_rate": 0.0001553899797669005, "loss": 4.6587, "step": 29365 }, { "epoch": 0.6014375524747609, "grad_norm": 0.2556871175765991, "learning_rate": 0.00015532812329680168, "loss": 4.6832, "step": 29370 }, { "epoch": 0.6015399422521656, "grad_norm": 0.2573263943195343, "learning_rate": 0.0001552662659194743, "loss": 4.6928, "step": 29375 }, { "epoch": 0.6016423320295702, "grad_norm": 0.249510258436203, "learning_rate": 0.00015520440764545096, "loss": 4.604, "step": 29380 }, { "epoch": 0.6017447218069748, "grad_norm": 0.27031224966049194, "learning_rate": 0.00015514254848526433, "loss": 4.667, "step": 29385 }, { "epoch": 0.6018471115843794, "grad_norm": 0.2602192461490631, "learning_rate": 0.00015508068844944734, "loss": 4.6744, "step": 29390 }, { "epoch": 0.601949501361784, "grad_norm": 0.2823655605316162, "learning_rate": 0.00015501882754853286, "loss": 4.6708, "step": 29395 }, { "epoch": 0.6020518911391887, "grad_norm": 0.2946068048477173, "learning_rate": 0.00015495696579305413, "loss": 4.6657, "step": 29400 }, { "epoch": 0.6021542809165933, "grad_norm": 0.2830444574356079, "learning_rate": 0.00015489510319354456, "loss": 4.659, "step": 29405 }, { "epoch": 0.6022566706939979, "grad_norm": 0.27121251821517944, "learning_rate": 0.00015483323976053743, "loss": 4.6996, "step": 29410 }, { "epoch": 0.6023590604714025, "grad_norm": 0.2876390218734741, "learning_rate": 0.00015477137550456643, "loss": 4.6174, "step": 29415 }, { "epoch": 0.6024614502488072, "grad_norm": 0.2655010223388672, "learning_rate": 0.00015470951043616523, "loss": 4.6378, "step": 29420 }, { "epoch": 0.6025638400262118, "grad_norm": 0.2648998200893402, "learning_rate": 0.00015464764456586775, "loss": 4.6921, "step": 29425 }, { "epoch": 0.6026662298036164, "grad_norm": 0.26296836137771606, "learning_rate": 0.000154585777904208, "loss": 4.682, "step": 29430 }, { "epoch": 0.602768619581021, "grad_norm": 0.2766362428665161, "learning_rate": 0.0001545239104617201, "loss": 4.6228, "step": 29435 }, { "epoch": 0.6028710093584256, "grad_norm": 0.2620740532875061, "learning_rate": 0.00015446204224893833, "loss": 4.6544, "step": 29440 }, { "epoch": 0.6029733991358303, "grad_norm": 0.264326810836792, "learning_rate": 0.00015440017327639704, "loss": 4.7159, "step": 29445 }, { "epoch": 0.6030757889132349, "grad_norm": 0.2602043151855469, "learning_rate": 0.00015433830355463083, "loss": 4.672, "step": 29450 }, { "epoch": 0.6031781786906395, "grad_norm": 0.26989299058914185, "learning_rate": 0.00015427643309417439, "loss": 4.679, "step": 29455 }, { "epoch": 0.6032805684680441, "grad_norm": 0.2631540298461914, "learning_rate": 0.00015421456190556246, "loss": 4.6572, "step": 29460 }, { "epoch": 0.6033829582454487, "grad_norm": 0.2787879407405853, "learning_rate": 0.00015415268999932993, "loss": 4.6719, "step": 29465 }, { "epoch": 0.6034853480228534, "grad_norm": 0.261043906211853, "learning_rate": 0.00015409081738601183, "loss": 4.6791, "step": 29470 }, { "epoch": 0.603587737800258, "grad_norm": 0.2684559226036072, "learning_rate": 0.00015402894407614342, "loss": 4.6567, "step": 29475 }, { "epoch": 0.6036901275776626, "grad_norm": 0.28989318013191223, "learning_rate": 0.00015396707008025982, "loss": 4.7154, "step": 29480 }, { "epoch": 0.6037925173550672, "grad_norm": 0.26652899384498596, "learning_rate": 0.00015390519540889658, "loss": 4.6232, "step": 29485 }, { "epoch": 0.6038949071324718, "grad_norm": 0.269520103931427, "learning_rate": 0.00015384332007258904, "loss": 4.6293, "step": 29490 }, { "epoch": 0.6039972969098765, "grad_norm": 0.29781582951545715, "learning_rate": 0.00015378144408187294, "loss": 4.6637, "step": 29495 }, { "epoch": 0.6040996866872811, "grad_norm": 0.2606029212474823, "learning_rate": 0.000153719567447284, "loss": 4.7466, "step": 29500 }, { "epoch": 0.6042020764646857, "grad_norm": 0.27624937891960144, "learning_rate": 0.000153657690179358, "loss": 4.5986, "step": 29505 }, { "epoch": 0.6043044662420904, "grad_norm": 0.28331923484802246, "learning_rate": 0.0001535958122886309, "loss": 4.6918, "step": 29510 }, { "epoch": 0.6044068560194951, "grad_norm": 0.26128119230270386, "learning_rate": 0.0001535339337856388, "loss": 4.7142, "step": 29515 }, { "epoch": 0.6045092457968997, "grad_norm": 0.3133602440357208, "learning_rate": 0.0001534720546809178, "loss": 4.6729, "step": 29520 }, { "epoch": 0.6046116355743043, "grad_norm": 0.265758752822876, "learning_rate": 0.0001534101749850042, "loss": 4.708, "step": 29525 }, { "epoch": 0.6047140253517089, "grad_norm": 0.25712907314300537, "learning_rate": 0.00015334829470843437, "loss": 4.674, "step": 29530 }, { "epoch": 0.6048164151291135, "grad_norm": 0.27626919746398926, "learning_rate": 0.00015328641386174466, "loss": 4.6511, "step": 29535 }, { "epoch": 0.6049188049065182, "grad_norm": 0.30588817596435547, "learning_rate": 0.00015322453245547176, "loss": 4.6458, "step": 29540 }, { "epoch": 0.6050211946839228, "grad_norm": 0.25631025433540344, "learning_rate": 0.00015316265050015234, "loss": 4.6855, "step": 29545 }, { "epoch": 0.6051235844613274, "grad_norm": 0.26386743783950806, "learning_rate": 0.000153100768006323, "loss": 4.6843, "step": 29550 }, { "epoch": 0.605225974238732, "grad_norm": 0.310792475938797, "learning_rate": 0.00015303888498452068, "loss": 4.6306, "step": 29555 }, { "epoch": 0.6053283640161367, "grad_norm": 0.2991289496421814, "learning_rate": 0.00015297700144528226, "loss": 4.6873, "step": 29560 }, { "epoch": 0.6054307537935413, "grad_norm": 0.2646171450614929, "learning_rate": 0.00015291511739914476, "loss": 4.6461, "step": 29565 }, { "epoch": 0.6055331435709459, "grad_norm": 0.2500417232513428, "learning_rate": 0.00015285323285664536, "loss": 4.6622, "step": 29570 }, { "epoch": 0.6056355333483505, "grad_norm": 0.26394060254096985, "learning_rate": 0.00015279134782832113, "loss": 4.6331, "step": 29575 }, { "epoch": 0.6057379231257551, "grad_norm": 0.25219351053237915, "learning_rate": 0.00015272946232470943, "loss": 4.6639, "step": 29580 }, { "epoch": 0.6058403129031598, "grad_norm": 0.2596396803855896, "learning_rate": 0.00015266757635634752, "loss": 4.6249, "step": 29585 }, { "epoch": 0.6059427026805644, "grad_norm": 0.33561503887176514, "learning_rate": 0.00015260568993377292, "loss": 4.6586, "step": 29590 }, { "epoch": 0.606045092457969, "grad_norm": 0.2690029740333557, "learning_rate": 0.00015254380306752312, "loss": 4.6338, "step": 29595 }, { "epoch": 0.6061474822353736, "grad_norm": 0.27321794629096985, "learning_rate": 0.00015248191576813563, "loss": 4.6541, "step": 29600 }, { "epoch": 0.6062498720127782, "grad_norm": 0.26653915643692017, "learning_rate": 0.00015242002804614815, "loss": 4.6559, "step": 29605 }, { "epoch": 0.6063522617901829, "grad_norm": 0.44555196166038513, "learning_rate": 0.0001523581399120984, "loss": 4.5864, "step": 29610 }, { "epoch": 0.6064546515675875, "grad_norm": 0.26160576939582825, "learning_rate": 0.00015229625137652423, "loss": 4.688, "step": 29615 }, { "epoch": 0.6065570413449921, "grad_norm": 0.26186686754226685, "learning_rate": 0.00015223436244996341, "loss": 4.6661, "step": 29620 }, { "epoch": 0.6066594311223967, "grad_norm": 0.2802843749523163, "learning_rate": 0.00015217247314295397, "loss": 4.6789, "step": 29625 }, { "epoch": 0.6067618208998014, "grad_norm": 0.6387742161750793, "learning_rate": 0.00015211058346603382, "loss": 4.784, "step": 29630 }, { "epoch": 0.606864210677206, "grad_norm": 0.2868768274784088, "learning_rate": 0.00015204869342974104, "loss": 4.6126, "step": 29635 }, { "epoch": 0.6069666004546106, "grad_norm": 0.2657756805419922, "learning_rate": 0.00015198680304461386, "loss": 4.6796, "step": 29640 }, { "epoch": 0.6070689902320152, "grad_norm": 0.25940439105033875, "learning_rate": 0.00015192491232119025, "loss": 4.6374, "step": 29645 }, { "epoch": 0.6071713800094198, "grad_norm": 0.2724342346191406, "learning_rate": 0.0001518630212700087, "loss": 4.6236, "step": 29650 }, { "epoch": 0.6072737697868245, "grad_norm": 0.26338621973991394, "learning_rate": 0.00015180112990160726, "loss": 4.6229, "step": 29655 }, { "epoch": 0.6073761595642291, "grad_norm": 0.2676234543323517, "learning_rate": 0.00015173923822652444, "loss": 4.6505, "step": 29660 }, { "epoch": 0.6074785493416337, "grad_norm": 0.2560276389122009, "learning_rate": 0.0001516773462552986, "loss": 4.6728, "step": 29665 }, { "epoch": 0.6075809391190383, "grad_norm": 0.25123026967048645, "learning_rate": 0.00015161545399846815, "loss": 4.6781, "step": 29670 }, { "epoch": 0.6076833288964429, "grad_norm": 0.2650813162326813, "learning_rate": 0.00015155356146657163, "loss": 4.6428, "step": 29675 }, { "epoch": 0.6077857186738476, "grad_norm": 0.2570786476135254, "learning_rate": 0.00015149166867014754, "loss": 4.654, "step": 29680 }, { "epoch": 0.6078881084512522, "grad_norm": 0.26854678988456726, "learning_rate": 0.0001514297756197346, "loss": 4.6582, "step": 29685 }, { "epoch": 0.6079904982286568, "grad_norm": 0.25913646817207336, "learning_rate": 0.0001513678823258713, "loss": 4.6373, "step": 29690 }, { "epoch": 0.6080928880060614, "grad_norm": 0.2833520174026489, "learning_rate": 0.00015130598879909636, "loss": 4.6458, "step": 29695 }, { "epoch": 0.608195277783466, "grad_norm": 0.27404841780662537, "learning_rate": 0.00015124409504994847, "loss": 4.696, "step": 29700 }, { "epoch": 0.6082976675608707, "grad_norm": 0.27438125014305115, "learning_rate": 0.00015118220108896645, "loss": 4.698, "step": 29705 }, { "epoch": 0.6084000573382754, "grad_norm": 0.2868211269378662, "learning_rate": 0.00015112030692668906, "loss": 4.6449, "step": 29710 }, { "epoch": 0.60850244711568, "grad_norm": 0.2659040093421936, "learning_rate": 0.0001510584125736551, "loss": 4.674, "step": 29715 }, { "epoch": 0.6086048368930846, "grad_norm": 0.27274179458618164, "learning_rate": 0.00015099651804040346, "loss": 4.7177, "step": 29720 }, { "epoch": 0.6087072266704893, "grad_norm": 0.24936515092849731, "learning_rate": 0.00015093462333747298, "loss": 4.6933, "step": 29725 }, { "epoch": 0.6088096164478939, "grad_norm": 0.2781531810760498, "learning_rate": 0.00015087272847540261, "loss": 4.6862, "step": 29730 }, { "epoch": 0.6089120062252985, "grad_norm": 0.2787362039089203, "learning_rate": 0.00015081083346473134, "loss": 4.6542, "step": 29735 }, { "epoch": 0.6090143960027031, "grad_norm": 0.2624753713607788, "learning_rate": 0.00015074893831599806, "loss": 4.6621, "step": 29740 }, { "epoch": 0.6091167857801078, "grad_norm": 0.26399970054626465, "learning_rate": 0.00015068704303974176, "loss": 4.6525, "step": 29745 }, { "epoch": 0.6092191755575124, "grad_norm": 0.2662067413330078, "learning_rate": 0.0001506251476465015, "loss": 4.6695, "step": 29750 }, { "epoch": 0.609321565334917, "grad_norm": 0.27489542961120605, "learning_rate": 0.0001505632521468163, "loss": 4.6658, "step": 29755 }, { "epoch": 0.6094239551123216, "grad_norm": 0.2619228661060333, "learning_rate": 0.0001505013565512252, "loss": 4.6896, "step": 29760 }, { "epoch": 0.6095263448897262, "grad_norm": 0.3220798671245575, "learning_rate": 0.00015043946087026728, "loss": 4.6468, "step": 29765 }, { "epoch": 0.6096287346671309, "grad_norm": 0.26916709542274475, "learning_rate": 0.0001503775651144816, "loss": 4.7052, "step": 29770 }, { "epoch": 0.6097311244445355, "grad_norm": 0.2660595178604126, "learning_rate": 0.0001503156692944073, "loss": 4.6184, "step": 29775 }, { "epoch": 0.6098335142219401, "grad_norm": 0.26139718294143677, "learning_rate": 0.00015025377342058342, "loss": 4.6833, "step": 29780 }, { "epoch": 0.6099359039993447, "grad_norm": 0.25445857644081116, "learning_rate": 0.00015019187750354914, "loss": 4.6651, "step": 29785 }, { "epoch": 0.6100382937767493, "grad_norm": 0.24757958948612213, "learning_rate": 0.00015012998155384353, "loss": 4.6792, "step": 29790 }, { "epoch": 0.610140683554154, "grad_norm": 0.24935737252235413, "learning_rate": 0.00015006808558200576, "loss": 4.6747, "step": 29795 }, { "epoch": 0.6102430733315586, "grad_norm": 0.2751321792602539, "learning_rate": 0.00015000618959857493, "loss": 4.6873, "step": 29800 }, { "epoch": 0.6103454631089632, "grad_norm": 0.2626931667327881, "learning_rate": 0.0001499442936140902, "loss": 4.7037, "step": 29805 }, { "epoch": 0.6104478528863678, "grad_norm": 0.26975181698799133, "learning_rate": 0.00014988239763909068, "loss": 4.621, "step": 29810 }, { "epoch": 0.6105502426637724, "grad_norm": 0.25670450925827026, "learning_rate": 0.00014982050168411555, "loss": 4.5853, "step": 29815 }, { "epoch": 0.6106526324411771, "grad_norm": 0.2627800703048706, "learning_rate": 0.00014975860575970384, "loss": 4.6774, "step": 29820 }, { "epoch": 0.6107550222185817, "grad_norm": 0.2657567858695984, "learning_rate": 0.00014969670987639473, "loss": 4.6183, "step": 29825 }, { "epoch": 0.6108574119959863, "grad_norm": 0.25893691182136536, "learning_rate": 0.00014963481404472743, "loss": 4.6916, "step": 29830 }, { "epoch": 0.6109598017733909, "grad_norm": 0.26803627610206604, "learning_rate": 0.0001495729182752409, "loss": 4.6461, "step": 29835 }, { "epoch": 0.6110621915507956, "grad_norm": 0.26486125588417053, "learning_rate": 0.00014951102257847426, "loss": 4.6816, "step": 29840 }, { "epoch": 0.6111645813282002, "grad_norm": 0.2670956552028656, "learning_rate": 0.0001494491269649667, "loss": 4.5994, "step": 29845 }, { "epoch": 0.6112669711056048, "grad_norm": 0.2605704665184021, "learning_rate": 0.00014938723144525718, "loss": 4.6656, "step": 29850 }, { "epoch": 0.6113693608830094, "grad_norm": 0.2715194821357727, "learning_rate": 0.00014932533602988478, "loss": 4.6504, "step": 29855 }, { "epoch": 0.611471750660414, "grad_norm": 0.245295450091362, "learning_rate": 0.00014926344072938865, "loss": 4.6806, "step": 29860 }, { "epoch": 0.6115741404378187, "grad_norm": 0.6133278012275696, "learning_rate": 0.0001492015455543076, "loss": 4.6174, "step": 29865 }, { "epoch": 0.6116765302152233, "grad_norm": 0.2752784192562103, "learning_rate": 0.00014913965051518077, "loss": 4.6169, "step": 29870 }, { "epoch": 0.6117789199926279, "grad_norm": 0.2706621587276459, "learning_rate": 0.00014907775562254707, "loss": 4.6717, "step": 29875 }, { "epoch": 0.6118813097700325, "grad_norm": 0.3045152425765991, "learning_rate": 0.0001490158608869456, "loss": 4.6855, "step": 29880 }, { "epoch": 0.6119836995474371, "grad_norm": 0.7316267490386963, "learning_rate": 0.00014895396631891506, "loss": 4.6194, "step": 29885 }, { "epoch": 0.6120860893248418, "grad_norm": 0.29746928811073303, "learning_rate": 0.00014889207192899444, "loss": 4.6297, "step": 29890 }, { "epoch": 0.6121884791022464, "grad_norm": 0.26595771312713623, "learning_rate": 0.00014883017772772264, "loss": 4.6982, "step": 29895 }, { "epoch": 0.612290868879651, "grad_norm": 0.27452102303504944, "learning_rate": 0.00014876828372563844, "loss": 4.6291, "step": 29900 }, { "epoch": 0.6123932586570556, "grad_norm": 0.28861257433891296, "learning_rate": 0.00014870638993328062, "loss": 4.6327, "step": 29905 }, { "epoch": 0.6124956484344602, "grad_norm": 0.27628979086875916, "learning_rate": 0.00014864449636118804, "loss": 4.6293, "step": 29910 }, { "epoch": 0.612598038211865, "grad_norm": 0.28083112835884094, "learning_rate": 0.00014858260301989928, "loss": 4.5978, "step": 29915 }, { "epoch": 0.6127004279892696, "grad_norm": 0.2867166996002197, "learning_rate": 0.0001485207099199531, "loss": 4.6514, "step": 29920 }, { "epoch": 0.6128028177666742, "grad_norm": 0.27758774161338806, "learning_rate": 0.00014845881707188822, "loss": 4.6544, "step": 29925 }, { "epoch": 0.6129052075440788, "grad_norm": 0.26441365480422974, "learning_rate": 0.00014839692448624309, "loss": 4.645, "step": 29930 }, { "epoch": 0.6130075973214835, "grad_norm": 0.26363876461982727, "learning_rate": 0.00014833503217355636, "loss": 4.6959, "step": 29935 }, { "epoch": 0.6131099870988881, "grad_norm": 0.2617667019367218, "learning_rate": 0.00014827314014436652, "loss": 4.6219, "step": 29940 }, { "epoch": 0.6132123768762927, "grad_norm": 0.2581588625907898, "learning_rate": 0.000148211248409212, "loss": 4.6411, "step": 29945 }, { "epoch": 0.6133147666536973, "grad_norm": 0.25130411982536316, "learning_rate": 0.00014814935697863123, "loss": 4.6319, "step": 29950 }, { "epoch": 0.613417156431102, "grad_norm": 0.26718178391456604, "learning_rate": 0.00014808746586316264, "loss": 4.6441, "step": 29955 }, { "epoch": 0.6135195462085066, "grad_norm": 0.2654898166656494, "learning_rate": 0.00014802557507334437, "loss": 4.7215, "step": 29960 }, { "epoch": 0.6136219359859112, "grad_norm": 0.2652888000011444, "learning_rate": 0.0001479636846197148, "loss": 4.6066, "step": 29965 }, { "epoch": 0.6137243257633158, "grad_norm": 0.2645012140274048, "learning_rate": 0.00014790179451281215, "loss": 4.7078, "step": 29970 }, { "epoch": 0.6138267155407204, "grad_norm": 0.29193446040153503, "learning_rate": 0.00014783990476317443, "loss": 4.5795, "step": 29975 }, { "epoch": 0.6139291053181251, "grad_norm": 0.26642873883247375, "learning_rate": 0.00014777801538133978, "loss": 4.6555, "step": 29980 }, { "epoch": 0.6140314950955297, "grad_norm": 0.26247337460517883, "learning_rate": 0.00014771612637784622, "loss": 4.6602, "step": 29985 }, { "epoch": 0.6141338848729343, "grad_norm": 0.28664398193359375, "learning_rate": 0.00014765423776323168, "loss": 4.6469, "step": 29990 }, { "epoch": 0.6142362746503389, "grad_norm": 0.2641092538833618, "learning_rate": 0.000147592349548034, "loss": 4.6224, "step": 29995 }, { "epoch": 0.6143386644277435, "grad_norm": 0.2592602074146271, "learning_rate": 0.0001475304617427911, "loss": 4.6271, "step": 30000 }, { "epoch": 0.6143386644277435, "eval_loss": 4.678186416625977, "eval_runtime": 17.1354, "eval_samples_per_second": 33.965, "eval_steps_per_second": 4.26, "step": 30000 }, { "epoch": 0.6144410542051482, "grad_norm": 0.27276498079299927, "learning_rate": 0.0001474685743580406, "loss": 4.6502, "step": 30005 }, { "epoch": 0.6145434439825528, "grad_norm": 0.30016595125198364, "learning_rate": 0.00014740668740432017, "loss": 4.6169, "step": 30010 }, { "epoch": 0.6146458337599574, "grad_norm": 0.27015364170074463, "learning_rate": 0.00014734480089216752, "loss": 4.6303, "step": 30015 }, { "epoch": 0.614748223537362, "grad_norm": 0.25933071970939636, "learning_rate": 0.00014728291483212015, "loss": 4.655, "step": 30020 }, { "epoch": 0.6148506133147666, "grad_norm": 0.2642268240451813, "learning_rate": 0.00014722102923471538, "loss": 4.6966, "step": 30025 }, { "epoch": 0.6149530030921713, "grad_norm": 0.26965948939323425, "learning_rate": 0.0001471591441104907, "loss": 4.6566, "step": 30030 }, { "epoch": 0.6150553928695759, "grad_norm": 0.26817694306373596, "learning_rate": 0.00014709725946998335, "loss": 4.6389, "step": 30035 }, { "epoch": 0.6151577826469805, "grad_norm": 0.27362197637557983, "learning_rate": 0.0001470353753237305, "loss": 4.633, "step": 30040 }, { "epoch": 0.6152601724243851, "grad_norm": 0.25554922223091125, "learning_rate": 0.00014697349168226928, "loss": 4.6694, "step": 30045 }, { "epoch": 0.6153625622017898, "grad_norm": 0.25772884488105774, "learning_rate": 0.00014691160855613683, "loss": 4.6689, "step": 30050 }, { "epoch": 0.6154649519791944, "grad_norm": 0.27397212386131287, "learning_rate": 0.00014684972595586994, "loss": 4.6297, "step": 30055 }, { "epoch": 0.615567341756599, "grad_norm": 0.2920845150947571, "learning_rate": 0.0001467878438920055, "loss": 4.6813, "step": 30060 }, { "epoch": 0.6156697315340036, "grad_norm": 0.27796587347984314, "learning_rate": 0.00014672596237508043, "loss": 4.6721, "step": 30065 }, { "epoch": 0.6157721213114082, "grad_norm": 0.27973082661628723, "learning_rate": 0.00014666408141563117, "loss": 4.6396, "step": 30070 }, { "epoch": 0.6158745110888129, "grad_norm": 0.2719156742095947, "learning_rate": 0.00014660220102419443, "loss": 4.6355, "step": 30075 }, { "epoch": 0.6159769008662175, "grad_norm": 0.26830217242240906, "learning_rate": 0.00014654032121130667, "loss": 4.55, "step": 30080 }, { "epoch": 0.6160792906436221, "grad_norm": 0.28855401277542114, "learning_rate": 0.00014647844198750423, "loss": 4.692, "step": 30085 }, { "epoch": 0.6161816804210267, "grad_norm": 0.27030399441719055, "learning_rate": 0.00014641656336332346, "loss": 4.6713, "step": 30090 }, { "epoch": 0.6162840701984313, "grad_norm": 0.25906726717948914, "learning_rate": 0.00014635468534930052, "loss": 4.7163, "step": 30095 }, { "epoch": 0.616386459975836, "grad_norm": 0.2601703405380249, "learning_rate": 0.0001462928079559714, "loss": 4.6709, "step": 30100 }, { "epoch": 0.6164888497532406, "grad_norm": 0.2649032473564148, "learning_rate": 0.0001462309311938721, "loss": 4.6223, "step": 30105 }, { "epoch": 0.6165912395306452, "grad_norm": 0.26456424593925476, "learning_rate": 0.00014616905507353864, "loss": 4.6261, "step": 30110 }, { "epoch": 0.6166936293080498, "grad_norm": 0.2632828950881958, "learning_rate": 0.00014610717960550655, "loss": 4.662, "step": 30115 }, { "epoch": 0.6167960190854546, "grad_norm": 0.28897950053215027, "learning_rate": 0.00014604530480031154, "loss": 4.6395, "step": 30120 }, { "epoch": 0.6168984088628592, "grad_norm": 0.2776760756969452, "learning_rate": 0.00014598343066848923, "loss": 4.6749, "step": 30125 }, { "epoch": 0.6170007986402638, "grad_norm": 0.3099904954433441, "learning_rate": 0.0001459215572205749, "loss": 4.6695, "step": 30130 }, { "epoch": 0.6171031884176684, "grad_norm": 0.28448745608329773, "learning_rate": 0.00014585968446710395, "loss": 4.6232, "step": 30135 }, { "epoch": 0.617205578195073, "grad_norm": 0.2703884541988373, "learning_rate": 0.00014579781241861158, "loss": 4.6903, "step": 30140 }, { "epoch": 0.6173079679724777, "grad_norm": 0.2632048428058624, "learning_rate": 0.0001457359410856327, "loss": 4.6358, "step": 30145 }, { "epoch": 0.6174103577498823, "grad_norm": 0.2635573148727417, "learning_rate": 0.0001456740704787023, "loss": 4.6238, "step": 30150 }, { "epoch": 0.6175127475272869, "grad_norm": 0.27382394671440125, "learning_rate": 0.00014561220060835527, "loss": 4.6477, "step": 30155 }, { "epoch": 0.6176151373046915, "grad_norm": 0.2560978829860687, "learning_rate": 0.0001455503314851263, "loss": 4.653, "step": 30160 }, { "epoch": 0.6177175270820962, "grad_norm": 0.26267510652542114, "learning_rate": 0.00014548846311954986, "loss": 4.6569, "step": 30165 }, { "epoch": 0.6178199168595008, "grad_norm": 0.2728870213031769, "learning_rate": 0.0001454265955221604, "loss": 4.6395, "step": 30170 }, { "epoch": 0.6179223066369054, "grad_norm": 0.2571091651916504, "learning_rate": 0.00014536472870349228, "loss": 4.6942, "step": 30175 }, { "epoch": 0.61802469641431, "grad_norm": 0.27143728733062744, "learning_rate": 0.00014530286267407962, "loss": 4.5975, "step": 30180 }, { "epoch": 0.6181270861917146, "grad_norm": 0.2803422808647156, "learning_rate": 0.00014524099744445647, "loss": 4.6274, "step": 30185 }, { "epoch": 0.6182294759691193, "grad_norm": 0.25826266407966614, "learning_rate": 0.00014517913302515678, "loss": 4.6414, "step": 30190 }, { "epoch": 0.6183318657465239, "grad_norm": 0.2646903693675995, "learning_rate": 0.00014511726942671417, "loss": 4.7091, "step": 30195 }, { "epoch": 0.6184342555239285, "grad_norm": 0.27455615997314453, "learning_rate": 0.0001450554066596623, "loss": 4.6893, "step": 30200 }, { "epoch": 0.6185366453013331, "grad_norm": 0.2511138617992401, "learning_rate": 0.0001449935447345348, "loss": 4.682, "step": 30205 }, { "epoch": 0.6186390350787377, "grad_norm": 0.27153676748275757, "learning_rate": 0.00014493168366186483, "loss": 4.6948, "step": 30210 }, { "epoch": 0.6187414248561424, "grad_norm": 0.2816159725189209, "learning_rate": 0.00014486982345218564, "loss": 4.6846, "step": 30215 }, { "epoch": 0.618843814633547, "grad_norm": 0.2648754119873047, "learning_rate": 0.00014480796411603034, "loss": 4.6257, "step": 30220 }, { "epoch": 0.6189462044109516, "grad_norm": 0.2981020212173462, "learning_rate": 0.0001447461056639317, "loss": 4.6035, "step": 30225 }, { "epoch": 0.6190485941883562, "grad_norm": 0.270654559135437, "learning_rate": 0.00014468424810642252, "loss": 4.6823, "step": 30230 }, { "epoch": 0.6191509839657608, "grad_norm": 0.30308738350868225, "learning_rate": 0.00014462239145403546, "loss": 4.6935, "step": 30235 }, { "epoch": 0.6192533737431655, "grad_norm": 0.25525858998298645, "learning_rate": 0.0001445605357173028, "loss": 4.6432, "step": 30240 }, { "epoch": 0.6193557635205701, "grad_norm": 0.2812856435775757, "learning_rate": 0.0001444986809067569, "loss": 4.6421, "step": 30245 }, { "epoch": 0.6194581532979747, "grad_norm": 0.2733539044857025, "learning_rate": 0.00014443682703293, "loss": 4.6383, "step": 30250 }, { "epoch": 0.6195605430753793, "grad_norm": 0.2560039758682251, "learning_rate": 0.00014437497410635389, "loss": 4.6515, "step": 30255 }, { "epoch": 0.619662932852784, "grad_norm": 0.2572624981403351, "learning_rate": 0.00014431312213756043, "loss": 4.6698, "step": 30260 }, { "epoch": 0.6197653226301886, "grad_norm": 0.25209107995033264, "learning_rate": 0.0001442512711370813, "loss": 4.6784, "step": 30265 }, { "epoch": 0.6198677124075932, "grad_norm": 0.3089885115623474, "learning_rate": 0.0001441894211154479, "loss": 4.6849, "step": 30270 }, { "epoch": 0.6199701021849978, "grad_norm": 0.27978983521461487, "learning_rate": 0.0001441275720831916, "loss": 4.6624, "step": 30275 }, { "epoch": 0.6200724919624024, "grad_norm": 0.29287540912628174, "learning_rate": 0.00014406572405084358, "loss": 4.6392, "step": 30280 }, { "epoch": 0.6201748817398071, "grad_norm": 0.25806277990341187, "learning_rate": 0.0001440038770289347, "loss": 4.6299, "step": 30285 }, { "epoch": 0.6202772715172117, "grad_norm": 0.2711588740348816, "learning_rate": 0.00014394203102799575, "loss": 4.6432, "step": 30290 }, { "epoch": 0.6203796612946163, "grad_norm": 0.261165052652359, "learning_rate": 0.0001438801860585574, "loss": 4.712, "step": 30295 }, { "epoch": 0.6204820510720209, "grad_norm": 0.2696019411087036, "learning_rate": 0.00014381834213115022, "loss": 4.6437, "step": 30300 }, { "epoch": 0.6205844408494255, "grad_norm": 0.2878361642360687, "learning_rate": 0.0001437564992563043, "loss": 4.6677, "step": 30305 }, { "epoch": 0.6206868306268302, "grad_norm": 0.285234272480011, "learning_rate": 0.00014369465744454977, "loss": 4.6823, "step": 30310 }, { "epoch": 0.6207892204042348, "grad_norm": 0.27212634682655334, "learning_rate": 0.00014363281670641664, "loss": 4.6405, "step": 30315 }, { "epoch": 0.6208916101816395, "grad_norm": 0.2737378776073456, "learning_rate": 0.0001435709770524345, "loss": 4.6519, "step": 30320 }, { "epoch": 0.6209939999590441, "grad_norm": 0.2617710828781128, "learning_rate": 0.00014350913849313295, "loss": 4.6914, "step": 30325 }, { "epoch": 0.6210963897364488, "grad_norm": 0.30267632007598877, "learning_rate": 0.00014344730103904143, "loss": 4.6453, "step": 30330 }, { "epoch": 0.6211987795138534, "grad_norm": 0.2610156834125519, "learning_rate": 0.00014338546470068895, "loss": 4.6167, "step": 30335 }, { "epoch": 0.621301169291258, "grad_norm": 0.2700619101524353, "learning_rate": 0.00014332362948860458, "loss": 4.6627, "step": 30340 }, { "epoch": 0.6214035590686626, "grad_norm": 0.27404138445854187, "learning_rate": 0.00014326179541331716, "loss": 4.6283, "step": 30345 }, { "epoch": 0.6215059488460672, "grad_norm": 0.2615413963794708, "learning_rate": 0.00014319996248535514, "loss": 4.6532, "step": 30350 }, { "epoch": 0.6216083386234719, "grad_norm": 0.27365297079086304, "learning_rate": 0.000143138130715247, "loss": 4.6558, "step": 30355 }, { "epoch": 0.6217107284008765, "grad_norm": 0.25237101316452026, "learning_rate": 0.00014307630011352093, "loss": 4.6647, "step": 30360 }, { "epoch": 0.6218131181782811, "grad_norm": 0.2741037607192993, "learning_rate": 0.00014301447069070496, "loss": 4.668, "step": 30365 }, { "epoch": 0.6219155079556857, "grad_norm": 0.2604462802410126, "learning_rate": 0.00014295264245732682, "loss": 4.6666, "step": 30370 }, { "epoch": 0.6220178977330904, "grad_norm": 0.26137575507164, "learning_rate": 0.00014289081542391425, "loss": 4.6485, "step": 30375 }, { "epoch": 0.622120287510495, "grad_norm": 0.2824261486530304, "learning_rate": 0.00014282898960099445, "loss": 4.5958, "step": 30380 }, { "epoch": 0.6222226772878996, "grad_norm": 0.2548956871032715, "learning_rate": 0.0001427671649990947, "loss": 4.6923, "step": 30385 }, { "epoch": 0.6223250670653042, "grad_norm": 0.2591424882411957, "learning_rate": 0.00014270534162874203, "loss": 4.6416, "step": 30390 }, { "epoch": 0.6224274568427088, "grad_norm": 0.2611396014690399, "learning_rate": 0.00014264351950046312, "loss": 4.6187, "step": 30395 }, { "epoch": 0.6225298466201135, "grad_norm": 0.27166223526000977, "learning_rate": 0.0001425816986247846, "loss": 4.6821, "step": 30400 }, { "epoch": 0.6226322363975181, "grad_norm": 0.2582108676433563, "learning_rate": 0.00014251987901223276, "loss": 4.6494, "step": 30405 }, { "epoch": 0.6227346261749227, "grad_norm": 0.2771899700164795, "learning_rate": 0.00014245806067333377, "loss": 4.6585, "step": 30410 }, { "epoch": 0.6228370159523273, "grad_norm": 0.29010680317878723, "learning_rate": 0.0001423962436186135, "loss": 4.6465, "step": 30415 }, { "epoch": 0.6229394057297319, "grad_norm": 0.2566431164741516, "learning_rate": 0.00014233442785859774, "loss": 4.6721, "step": 30420 }, { "epoch": 0.6230417955071366, "grad_norm": 0.27273184061050415, "learning_rate": 0.00014227261340381186, "loss": 4.6872, "step": 30425 }, { "epoch": 0.6231441852845412, "grad_norm": 0.26049673557281494, "learning_rate": 0.0001422108002647811, "loss": 4.6581, "step": 30430 }, { "epoch": 0.6232465750619458, "grad_norm": 0.2618211507797241, "learning_rate": 0.00014214898845203052, "loss": 4.6511, "step": 30435 }, { "epoch": 0.6233489648393504, "grad_norm": 0.2608950734138489, "learning_rate": 0.00014208717797608504, "loss": 4.6518, "step": 30440 }, { "epoch": 0.623451354616755, "grad_norm": 0.2676299214363098, "learning_rate": 0.00014202536884746906, "loss": 4.6916, "step": 30445 }, { "epoch": 0.6235537443941597, "grad_norm": 0.2924249768257141, "learning_rate": 0.00014196356107670698, "loss": 4.6485, "step": 30450 }, { "epoch": 0.6236561341715643, "grad_norm": 0.27814215421676636, "learning_rate": 0.00014190175467432297, "loss": 4.6598, "step": 30455 }, { "epoch": 0.6237585239489689, "grad_norm": 0.2750813663005829, "learning_rate": 0.00014183994965084084, "loss": 4.6518, "step": 30460 }, { "epoch": 0.6238609137263735, "grad_norm": 0.2613241672515869, "learning_rate": 0.00014177814601678429, "loss": 4.6314, "step": 30465 }, { "epoch": 0.6239633035037782, "grad_norm": 0.25416532158851624, "learning_rate": 0.00014171634378267673, "loss": 4.6913, "step": 30470 }, { "epoch": 0.6240656932811828, "grad_norm": 0.2595835030078888, "learning_rate": 0.00014165454295904124, "loss": 4.6661, "step": 30475 }, { "epoch": 0.6241680830585874, "grad_norm": 0.27630704641342163, "learning_rate": 0.00014159274355640084, "loss": 4.7037, "step": 30480 }, { "epoch": 0.624270472835992, "grad_norm": 0.3200695514678955, "learning_rate": 0.0001415309455852782, "loss": 4.6411, "step": 30485 }, { "epoch": 0.6243728626133966, "grad_norm": 0.26501190662384033, "learning_rate": 0.00014146914905619574, "loss": 4.6098, "step": 30490 }, { "epoch": 0.6244752523908013, "grad_norm": 0.26884472370147705, "learning_rate": 0.00014140735397967567, "loss": 4.627, "step": 30495 }, { "epoch": 0.6245776421682059, "grad_norm": 0.2895987033843994, "learning_rate": 0.00014134556036624, "loss": 4.6695, "step": 30500 }, { "epoch": 0.6246800319456105, "grad_norm": 0.30213648080825806, "learning_rate": 0.00014128376822641032, "loss": 4.6489, "step": 30505 }, { "epoch": 0.6247824217230151, "grad_norm": 0.25780001282691956, "learning_rate": 0.0001412219775707082, "loss": 4.6279, "step": 30510 }, { "epoch": 0.6248848115004197, "grad_norm": 0.2536289393901825, "learning_rate": 0.00014116018840965478, "loss": 4.6697, "step": 30515 }, { "epoch": 0.6249872012778244, "grad_norm": 0.2722889482975006, "learning_rate": 0.000141098400753771, "loss": 4.679, "step": 30520 }, { "epoch": 0.6250895910552291, "grad_norm": 0.26849550008773804, "learning_rate": 0.00014103661461357752, "loss": 4.6523, "step": 30525 }, { "epoch": 0.6251919808326337, "grad_norm": 0.28604739904403687, "learning_rate": 0.00014097482999959483, "loss": 4.6449, "step": 30530 }, { "epoch": 0.6252943706100383, "grad_norm": 0.2754095196723938, "learning_rate": 0.0001409130469223431, "loss": 4.6632, "step": 30535 }, { "epoch": 0.625396760387443, "grad_norm": 0.26504576206207275, "learning_rate": 0.00014085126539234216, "loss": 4.742, "step": 30540 }, { "epoch": 0.6254991501648476, "grad_norm": 0.26037588715553284, "learning_rate": 0.0001407894854201118, "loss": 4.6251, "step": 30545 }, { "epoch": 0.6256015399422522, "grad_norm": 0.25871458649635315, "learning_rate": 0.00014072770701617124, "loss": 4.685, "step": 30550 }, { "epoch": 0.6257039297196568, "grad_norm": 0.27008435130119324, "learning_rate": 0.00014066593019103967, "loss": 4.6656, "step": 30555 }, { "epoch": 0.6258063194970614, "grad_norm": 0.29850319027900696, "learning_rate": 0.00014060415495523602, "loss": 4.713, "step": 30560 }, { "epoch": 0.6259087092744661, "grad_norm": 0.3362254798412323, "learning_rate": 0.0001405423813192787, "loss": 4.6315, "step": 30565 }, { "epoch": 0.6260110990518707, "grad_norm": 0.26641160249710083, "learning_rate": 0.00014048060929368607, "loss": 4.643, "step": 30570 }, { "epoch": 0.6261134888292753, "grad_norm": 0.2786027491092682, "learning_rate": 0.00014041883888897623, "loss": 4.6421, "step": 30575 }, { "epoch": 0.6262158786066799, "grad_norm": 0.25587013363838196, "learning_rate": 0.0001403570701156668, "loss": 4.6483, "step": 30580 }, { "epoch": 0.6263182683840846, "grad_norm": 0.2661448121070862, "learning_rate": 0.00014029530298427536, "loss": 4.6201, "step": 30585 }, { "epoch": 0.6264206581614892, "grad_norm": 0.26119163632392883, "learning_rate": 0.00014023353750531906, "loss": 4.6333, "step": 30590 }, { "epoch": 0.6265230479388938, "grad_norm": 0.26306217908859253, "learning_rate": 0.00014017177368931487, "loss": 4.6623, "step": 30595 }, { "epoch": 0.6266254377162984, "grad_norm": 0.2618086338043213, "learning_rate": 0.0001401100115467793, "loss": 4.6762, "step": 30600 }, { "epoch": 0.626727827493703, "grad_norm": 0.2748504877090454, "learning_rate": 0.0001400482510882288, "loss": 4.7071, "step": 30605 }, { "epoch": 0.6268302172711077, "grad_norm": 0.2736903727054596, "learning_rate": 0.00013998649232417948, "loss": 4.6199, "step": 30610 }, { "epoch": 0.6269326070485123, "grad_norm": 0.27250659465789795, "learning_rate": 0.00013992473526514693, "loss": 4.643, "step": 30615 }, { "epoch": 0.6270349968259169, "grad_norm": 0.2631660997867584, "learning_rate": 0.0001398629799216467, "loss": 4.6248, "step": 30620 }, { "epoch": 0.6271373866033215, "grad_norm": 0.262580007314682, "learning_rate": 0.00013980122630419408, "loss": 4.6678, "step": 30625 }, { "epoch": 0.6272397763807261, "grad_norm": 0.26323434710502625, "learning_rate": 0.0001397394744233038, "loss": 4.6388, "step": 30630 }, { "epoch": 0.6273421661581308, "grad_norm": 0.26835930347442627, "learning_rate": 0.00013967772428949058, "loss": 4.6852, "step": 30635 }, { "epoch": 0.6274445559355354, "grad_norm": 0.2563098073005676, "learning_rate": 0.0001396159759132687, "loss": 4.6606, "step": 30640 }, { "epoch": 0.62754694571294, "grad_norm": 0.2582773268222809, "learning_rate": 0.0001395542293051521, "loss": 4.6492, "step": 30645 }, { "epoch": 0.6276493354903446, "grad_norm": 0.26118898391723633, "learning_rate": 0.0001394924844756545, "loss": 4.6497, "step": 30650 }, { "epoch": 0.6277517252677492, "grad_norm": 0.2593550980091095, "learning_rate": 0.00013943074143528942, "loss": 4.6605, "step": 30655 }, { "epoch": 0.6278541150451539, "grad_norm": 0.274387389421463, "learning_rate": 0.00013936900019456976, "loss": 4.6924, "step": 30660 }, { "epoch": 0.6279565048225585, "grad_norm": 0.28171506524086, "learning_rate": 0.0001393072607640084, "loss": 4.6535, "step": 30665 }, { "epoch": 0.6280588945999631, "grad_norm": 0.27562254667282104, "learning_rate": 0.00013924552315411784, "loss": 4.6853, "step": 30670 }, { "epoch": 0.6281612843773677, "grad_norm": 0.2885560095310211, "learning_rate": 0.00013918378737541018, "loss": 4.6677, "step": 30675 }, { "epoch": 0.6282636741547724, "grad_norm": 0.28729793429374695, "learning_rate": 0.0001391220534383973, "loss": 4.6241, "step": 30680 }, { "epoch": 0.628366063932177, "grad_norm": 0.2732503414154053, "learning_rate": 0.00013906032135359078, "loss": 4.6551, "step": 30685 }, { "epoch": 0.6284684537095816, "grad_norm": 0.2720591127872467, "learning_rate": 0.0001389985911315018, "loss": 4.6453, "step": 30690 }, { "epoch": 0.6285708434869862, "grad_norm": 0.2801700234413147, "learning_rate": 0.00013893686278264127, "loss": 4.6355, "step": 30695 }, { "epoch": 0.6286732332643908, "grad_norm": 0.25980231165885925, "learning_rate": 0.0001388751363175199, "loss": 4.6317, "step": 30700 }, { "epoch": 0.6287756230417955, "grad_norm": 0.2581658363342285, "learning_rate": 0.00013881341174664776, "loss": 4.6678, "step": 30705 }, { "epoch": 0.6288780128192001, "grad_norm": 0.27167370915412903, "learning_rate": 0.00013875168908053493, "loss": 4.6547, "step": 30710 }, { "epoch": 0.6289804025966047, "grad_norm": 0.2577280104160309, "learning_rate": 0.00013868996832969098, "loss": 4.6499, "step": 30715 }, { "epoch": 0.6290827923740093, "grad_norm": 0.2653399109840393, "learning_rate": 0.00013862824950462523, "loss": 4.626, "step": 30720 }, { "epoch": 0.6291851821514141, "grad_norm": 0.2757514417171478, "learning_rate": 0.00013856653261584662, "loss": 4.5893, "step": 30725 }, { "epoch": 0.6292875719288187, "grad_norm": 0.27251172065734863, "learning_rate": 0.00013850481767386383, "loss": 4.7225, "step": 30730 }, { "epoch": 0.6293899617062233, "grad_norm": 0.2618646025657654, "learning_rate": 0.00013844310468918517, "loss": 4.6658, "step": 30735 }, { "epoch": 0.6294923514836279, "grad_norm": 0.256133109331131, "learning_rate": 0.00013838139367231857, "loss": 4.7011, "step": 30740 }, { "epoch": 0.6295947412610325, "grad_norm": 0.2629673480987549, "learning_rate": 0.0001383196846337717, "loss": 4.649, "step": 30745 }, { "epoch": 0.6296971310384372, "grad_norm": 0.2708331346511841, "learning_rate": 0.00013825797758405188, "loss": 4.6626, "step": 30750 }, { "epoch": 0.6297995208158418, "grad_norm": 0.26925358176231384, "learning_rate": 0.000138196272533666, "loss": 4.6328, "step": 30755 }, { "epoch": 0.6299019105932464, "grad_norm": 0.25351065397262573, "learning_rate": 0.00013813456949312075, "loss": 4.6229, "step": 30760 }, { "epoch": 0.630004300370651, "grad_norm": 0.257271945476532, "learning_rate": 0.00013807286847292242, "loss": 4.6823, "step": 30765 }, { "epoch": 0.6301066901480556, "grad_norm": 0.2567111551761627, "learning_rate": 0.00013801116948357686, "loss": 4.6937, "step": 30770 }, { "epoch": 0.6302090799254603, "grad_norm": 0.2745841443538666, "learning_rate": 0.00013794947253558974, "loss": 4.6696, "step": 30775 }, { "epoch": 0.6303114697028649, "grad_norm": 0.26817578077316284, "learning_rate": 0.0001378877776394663, "loss": 4.7009, "step": 30780 }, { "epoch": 0.6304138594802695, "grad_norm": 0.25737127661705017, "learning_rate": 0.0001378260848057114, "loss": 4.6116, "step": 30785 }, { "epoch": 0.6305162492576741, "grad_norm": 0.2685026228427887, "learning_rate": 0.0001377643940448296, "loss": 4.6796, "step": 30790 }, { "epoch": 0.6306186390350788, "grad_norm": 0.2860910892486572, "learning_rate": 0.00013770270536732517, "loss": 4.697, "step": 30795 }, { "epoch": 0.6307210288124834, "grad_norm": 0.2825358808040619, "learning_rate": 0.00013764101878370176, "loss": 4.6613, "step": 30800 }, { "epoch": 0.630823418589888, "grad_norm": 0.2780124247074127, "learning_rate": 0.00013757933430446297, "loss": 4.6359, "step": 30805 }, { "epoch": 0.6309258083672926, "grad_norm": 0.26248306035995483, "learning_rate": 0.00013751765194011195, "loss": 4.6537, "step": 30810 }, { "epoch": 0.6310281981446972, "grad_norm": 0.27073562145233154, "learning_rate": 0.0001374559717011514, "loss": 4.6388, "step": 30815 }, { "epoch": 0.6311305879221019, "grad_norm": 0.25821781158447266, "learning_rate": 0.0001373942935980837, "loss": 4.6805, "step": 30820 }, { "epoch": 0.6312329776995065, "grad_norm": 0.30033940076828003, "learning_rate": 0.00013733261764141096, "loss": 4.6187, "step": 30825 }, { "epoch": 0.6313353674769111, "grad_norm": 0.26460811495780945, "learning_rate": 0.00013727094384163478, "loss": 4.7039, "step": 30830 }, { "epoch": 0.6314377572543157, "grad_norm": 0.2702391445636749, "learning_rate": 0.0001372092722092565, "loss": 4.6402, "step": 30835 }, { "epoch": 0.6315401470317203, "grad_norm": 0.2818358838558197, "learning_rate": 0.0001371476027547771, "loss": 4.643, "step": 30840 }, { "epoch": 0.631642536809125, "grad_norm": 0.26885858178138733, "learning_rate": 0.000137085935488697, "loss": 4.6741, "step": 30845 }, { "epoch": 0.6317449265865296, "grad_norm": 0.2621515989303589, "learning_rate": 0.0001370242704215165, "loss": 4.6976, "step": 30850 }, { "epoch": 0.6318473163639342, "grad_norm": 0.2500268816947937, "learning_rate": 0.0001369626075637354, "loss": 4.7203, "step": 30855 }, { "epoch": 0.6319497061413388, "grad_norm": 0.26177504658699036, "learning_rate": 0.0001369009469258531, "loss": 4.6571, "step": 30860 }, { "epoch": 0.6320520959187435, "grad_norm": 0.27457505464553833, "learning_rate": 0.0001368392885183687, "loss": 4.6319, "step": 30865 }, { "epoch": 0.6321544856961481, "grad_norm": 0.26758092641830444, "learning_rate": 0.0001367776323517809, "loss": 4.6879, "step": 30870 }, { "epoch": 0.6322568754735527, "grad_norm": 0.2595585882663727, "learning_rate": 0.00013671597843658794, "loss": 4.7071, "step": 30875 }, { "epoch": 0.6323592652509573, "grad_norm": 0.25526174902915955, "learning_rate": 0.0001366543267832878, "loss": 4.6196, "step": 30880 }, { "epoch": 0.6324616550283619, "grad_norm": 0.26654160022735596, "learning_rate": 0.00013659267740237792, "loss": 4.7092, "step": 30885 }, { "epoch": 0.6325640448057666, "grad_norm": 0.25316905975341797, "learning_rate": 0.0001365310303043556, "loss": 4.6671, "step": 30890 }, { "epoch": 0.6326664345831712, "grad_norm": 0.27131474018096924, "learning_rate": 0.00013646938549971742, "loss": 4.5842, "step": 30895 }, { "epoch": 0.6327688243605758, "grad_norm": 0.2656306028366089, "learning_rate": 0.0001364077429989598, "loss": 4.6664, "step": 30900 }, { "epoch": 0.6328712141379804, "grad_norm": 0.2851974368095398, "learning_rate": 0.00013634610281257882, "loss": 4.6328, "step": 30905 }, { "epoch": 0.632973603915385, "grad_norm": 0.28098243474960327, "learning_rate": 0.0001362844649510699, "loss": 4.6705, "step": 30910 }, { "epoch": 0.6330759936927897, "grad_norm": 0.2572101056575775, "learning_rate": 0.00013622282942492831, "loss": 4.6928, "step": 30915 }, { "epoch": 0.6331783834701943, "grad_norm": 0.27140459418296814, "learning_rate": 0.00013616119624464888, "loss": 4.6777, "step": 30920 }, { "epoch": 0.6332807732475989, "grad_norm": 0.2552447021007538, "learning_rate": 0.00013609956542072588, "loss": 4.6599, "step": 30925 }, { "epoch": 0.6333831630250036, "grad_norm": 0.2926422655582428, "learning_rate": 0.00013603793696365335, "loss": 4.6575, "step": 30930 }, { "epoch": 0.6334855528024083, "grad_norm": 0.3174777925014496, "learning_rate": 0.00013597631088392494, "loss": 4.6245, "step": 30935 }, { "epoch": 0.6335879425798129, "grad_norm": 0.2757773697376251, "learning_rate": 0.0001359146871920337, "loss": 4.7168, "step": 30940 }, { "epoch": 0.6336903323572175, "grad_norm": 0.2805185616016388, "learning_rate": 0.00013585306589847246, "loss": 4.6643, "step": 30945 }, { "epoch": 0.6337927221346221, "grad_norm": 0.2652919888496399, "learning_rate": 0.0001357914470137336, "loss": 4.6943, "step": 30950 }, { "epoch": 0.6338951119120267, "grad_norm": 0.271441787481308, "learning_rate": 0.00013572983054830904, "loss": 4.6667, "step": 30955 }, { "epoch": 0.6339975016894314, "grad_norm": 0.25050222873687744, "learning_rate": 0.00013566821651269032, "loss": 4.6832, "step": 30960 }, { "epoch": 0.634099891466836, "grad_norm": 0.25222840905189514, "learning_rate": 0.00013560660491736864, "loss": 4.6669, "step": 30965 }, { "epoch": 0.6342022812442406, "grad_norm": 0.27370142936706543, "learning_rate": 0.00013554499577283455, "loss": 4.6698, "step": 30970 }, { "epoch": 0.6343046710216452, "grad_norm": 0.2555899918079376, "learning_rate": 0.0001354833890895785, "loss": 4.6676, "step": 30975 }, { "epoch": 0.6344070607990498, "grad_norm": 0.2767821252346039, "learning_rate": 0.00013542178487809035, "loss": 4.6232, "step": 30980 }, { "epoch": 0.6345094505764545, "grad_norm": 0.26810166239738464, "learning_rate": 0.00013536018314885947, "loss": 4.6715, "step": 30985 }, { "epoch": 0.6346118403538591, "grad_norm": 0.26725059747695923, "learning_rate": 0.00013529858391237492, "loss": 4.6095, "step": 30990 }, { "epoch": 0.6347142301312637, "grad_norm": 0.26766064763069153, "learning_rate": 0.00013523698717912534, "loss": 4.6691, "step": 30995 }, { "epoch": 0.6348166199086683, "grad_norm": 0.27194923162460327, "learning_rate": 0.00013517539295959885, "loss": 4.6947, "step": 31000 }, { "epoch": 0.6348166199086683, "eval_loss": 4.668798923492432, "eval_runtime": 17.1246, "eval_samples_per_second": 33.986, "eval_steps_per_second": 4.263, "step": 31000 }, { "epoch": 0.634919009686073, "grad_norm": 0.27915051579475403, "learning_rate": 0.00013511380126428324, "loss": 4.6686, "step": 31005 }, { "epoch": 0.6350213994634776, "grad_norm": 0.27530568838119507, "learning_rate": 0.00013505221210366583, "loss": 4.647, "step": 31010 }, { "epoch": 0.6351237892408822, "grad_norm": 0.271456778049469, "learning_rate": 0.00013499062548823353, "loss": 4.6884, "step": 31015 }, { "epoch": 0.6352261790182868, "grad_norm": 0.26434803009033203, "learning_rate": 0.00013492904142847276, "loss": 4.6459, "step": 31020 }, { "epoch": 0.6353285687956914, "grad_norm": 0.2877468168735504, "learning_rate": 0.00013486745993486956, "loss": 4.6195, "step": 31025 }, { "epoch": 0.6354309585730961, "grad_norm": 0.2787456214427948, "learning_rate": 0.00013480588101790958, "loss": 4.586, "step": 31030 }, { "epoch": 0.6355333483505007, "grad_norm": 0.2634255588054657, "learning_rate": 0.00013474430468807784, "loss": 4.6536, "step": 31035 }, { "epoch": 0.6356357381279053, "grad_norm": 0.2897188365459442, "learning_rate": 0.0001346827309558591, "loss": 4.6444, "step": 31040 }, { "epoch": 0.6357381279053099, "grad_norm": 0.28307461738586426, "learning_rate": 0.00013462115983173766, "loss": 4.6021, "step": 31045 }, { "epoch": 0.6358405176827145, "grad_norm": 0.2803022563457489, "learning_rate": 0.00013455959132619726, "loss": 4.6446, "step": 31050 }, { "epoch": 0.6359429074601192, "grad_norm": 0.2702583074569702, "learning_rate": 0.00013449802544972134, "loss": 4.6254, "step": 31055 }, { "epoch": 0.6360452972375238, "grad_norm": 0.2827287018299103, "learning_rate": 0.00013443646221279287, "loss": 4.6596, "step": 31060 }, { "epoch": 0.6361476870149284, "grad_norm": 0.2646181285381317, "learning_rate": 0.00013437490162589416, "loss": 4.6375, "step": 31065 }, { "epoch": 0.636250076792333, "grad_norm": 0.26009178161621094, "learning_rate": 0.00013431334369950739, "loss": 4.64, "step": 31070 }, { "epoch": 0.6363524665697377, "grad_norm": 0.2934807240962982, "learning_rate": 0.00013425178844411414, "loss": 4.6307, "step": 31075 }, { "epoch": 0.6364548563471423, "grad_norm": 0.2529505789279938, "learning_rate": 0.00013419023587019538, "loss": 4.6737, "step": 31080 }, { "epoch": 0.6365572461245469, "grad_norm": 0.2632039189338684, "learning_rate": 0.00013412868598823186, "loss": 4.6603, "step": 31085 }, { "epoch": 0.6366596359019515, "grad_norm": 0.2729152739048004, "learning_rate": 0.00013406713880870385, "loss": 4.6461, "step": 31090 }, { "epoch": 0.6367620256793561, "grad_norm": 0.29835352301597595, "learning_rate": 0.00013400559434209095, "loss": 4.7061, "step": 31095 }, { "epoch": 0.6368644154567608, "grad_norm": 0.28775349259376526, "learning_rate": 0.00013394405259887257, "loss": 4.5874, "step": 31100 }, { "epoch": 0.6369668052341654, "grad_norm": 0.2684108018875122, "learning_rate": 0.00013388251358952748, "loss": 4.6305, "step": 31105 }, { "epoch": 0.63706919501157, "grad_norm": 0.26177436113357544, "learning_rate": 0.00013382097732453393, "loss": 4.6794, "step": 31110 }, { "epoch": 0.6371715847889746, "grad_norm": 0.2835049331188202, "learning_rate": 0.00013375944381436997, "loss": 4.6515, "step": 31115 }, { "epoch": 0.6372739745663792, "grad_norm": 0.26999524235725403, "learning_rate": 0.00013369791306951296, "loss": 4.6284, "step": 31120 }, { "epoch": 0.6373763643437839, "grad_norm": 0.26571357250213623, "learning_rate": 0.00013363638510043978, "loss": 4.6305, "step": 31125 }, { "epoch": 0.6374787541211886, "grad_norm": 0.2659294605255127, "learning_rate": 0.00013357485991762695, "loss": 4.6506, "step": 31130 }, { "epoch": 0.6375811438985932, "grad_norm": 0.26096558570861816, "learning_rate": 0.0001335133375315505, "loss": 4.6697, "step": 31135 }, { "epoch": 0.6376835336759978, "grad_norm": 0.26418620347976685, "learning_rate": 0.00013345181795268584, "loss": 4.6154, "step": 31140 }, { "epoch": 0.6377859234534025, "grad_norm": 0.2718810439109802, "learning_rate": 0.00013339030119150812, "loss": 4.5965, "step": 31145 }, { "epoch": 0.6378883132308071, "grad_norm": 0.27645042538642883, "learning_rate": 0.00013332878725849187, "loss": 4.6343, "step": 31150 }, { "epoch": 0.6379907030082117, "grad_norm": 0.2620978355407715, "learning_rate": 0.0001332672761641112, "loss": 4.6612, "step": 31155 }, { "epoch": 0.6380930927856163, "grad_norm": 0.27411043643951416, "learning_rate": 0.0001332057679188396, "loss": 4.6513, "step": 31160 }, { "epoch": 0.6381954825630209, "grad_norm": 0.25224819779396057, "learning_rate": 0.0001331442625331503, "loss": 4.5905, "step": 31165 }, { "epoch": 0.6382978723404256, "grad_norm": 0.29067060351371765, "learning_rate": 0.00013308276001751596, "loss": 4.6709, "step": 31170 }, { "epoch": 0.6384002621178302, "grad_norm": 0.2866936922073364, "learning_rate": 0.00013302126038240855, "loss": 4.612, "step": 31175 }, { "epoch": 0.6385026518952348, "grad_norm": 0.2631337642669678, "learning_rate": 0.00013295976363829982, "loss": 4.6359, "step": 31180 }, { "epoch": 0.6386050416726394, "grad_norm": 0.2646958827972412, "learning_rate": 0.00013289826979566096, "loss": 4.6162, "step": 31185 }, { "epoch": 0.638707431450044, "grad_norm": 0.2835996448993683, "learning_rate": 0.00013283677886496252, "loss": 4.6625, "step": 31190 }, { "epoch": 0.6388098212274487, "grad_norm": 0.26449674367904663, "learning_rate": 0.00013277529085667475, "loss": 4.6393, "step": 31195 }, { "epoch": 0.6389122110048533, "grad_norm": 0.27873530983924866, "learning_rate": 0.00013271380578126738, "loss": 4.6502, "step": 31200 }, { "epoch": 0.6390146007822579, "grad_norm": 0.2593189775943756, "learning_rate": 0.00013265232364920938, "loss": 4.6423, "step": 31205 }, { "epoch": 0.6391169905596625, "grad_norm": 0.2723395824432373, "learning_rate": 0.00013259084447096955, "loss": 4.6721, "step": 31210 }, { "epoch": 0.6392193803370672, "grad_norm": 0.2658664882183075, "learning_rate": 0.0001325293682570161, "loss": 4.6312, "step": 31215 }, { "epoch": 0.6393217701144718, "grad_norm": 0.2618985176086426, "learning_rate": 0.00013246789501781658, "loss": 4.6702, "step": 31220 }, { "epoch": 0.6394241598918764, "grad_norm": 0.27947092056274414, "learning_rate": 0.00013240642476383818, "loss": 4.6387, "step": 31225 }, { "epoch": 0.639526549669281, "grad_norm": 0.27498918771743774, "learning_rate": 0.00013234495750554757, "loss": 4.5731, "step": 31230 }, { "epoch": 0.6396289394466856, "grad_norm": 0.27096331119537354, "learning_rate": 0.00013228349325341082, "loss": 4.6752, "step": 31235 }, { "epoch": 0.6397313292240903, "grad_norm": 0.26269224286079407, "learning_rate": 0.0001322220320178936, "loss": 4.687, "step": 31240 }, { "epoch": 0.6398337190014949, "grad_norm": 0.26830875873565674, "learning_rate": 0.0001321605738094611, "loss": 4.6535, "step": 31245 }, { "epoch": 0.6399361087788995, "grad_norm": 0.26473885774612427, "learning_rate": 0.00013209911863857775, "loss": 4.6585, "step": 31250 }, { "epoch": 0.6400384985563041, "grad_norm": 0.27840396761894226, "learning_rate": 0.00013203766651570767, "loss": 4.6332, "step": 31255 }, { "epoch": 0.6401408883337087, "grad_norm": 0.26939570903778076, "learning_rate": 0.00013197621745131453, "loss": 4.6491, "step": 31260 }, { "epoch": 0.6402432781111134, "grad_norm": 0.2661779820919037, "learning_rate": 0.00013191477145586123, "loss": 4.6901, "step": 31265 }, { "epoch": 0.640345667888518, "grad_norm": 0.2732379734516144, "learning_rate": 0.00013185332853981035, "loss": 4.6541, "step": 31270 }, { "epoch": 0.6404480576659226, "grad_norm": 0.2726680040359497, "learning_rate": 0.0001317918887136239, "loss": 4.6936, "step": 31275 }, { "epoch": 0.6405504474433272, "grad_norm": 0.2657804787158966, "learning_rate": 0.00013173045198776326, "loss": 4.6607, "step": 31280 }, { "epoch": 0.6406528372207319, "grad_norm": 0.26409026980400085, "learning_rate": 0.0001316690183726894, "loss": 4.6207, "step": 31285 }, { "epoch": 0.6407552269981365, "grad_norm": 0.26010459661483765, "learning_rate": 0.00013160758787886278, "loss": 4.5902, "step": 31290 }, { "epoch": 0.6408576167755411, "grad_norm": 0.26215532422065735, "learning_rate": 0.00013154616051674328, "loss": 4.6221, "step": 31295 }, { "epoch": 0.6409600065529457, "grad_norm": 0.24972034990787506, "learning_rate": 0.0001314847362967901, "loss": 4.6785, "step": 31300 }, { "epoch": 0.6410623963303503, "grad_norm": 0.2610864043235779, "learning_rate": 0.00013142331522946217, "loss": 4.6598, "step": 31305 }, { "epoch": 0.641164786107755, "grad_norm": 0.2775006592273712, "learning_rate": 0.00013136189732521782, "loss": 4.6107, "step": 31310 }, { "epoch": 0.6412671758851596, "grad_norm": 0.27934861183166504, "learning_rate": 0.00013130048259451458, "loss": 4.6204, "step": 31315 }, { "epoch": 0.6413695656625642, "grad_norm": 0.2705919146537781, "learning_rate": 0.0001312390710478098, "loss": 4.6791, "step": 31320 }, { "epoch": 0.6414719554399688, "grad_norm": 0.26356828212738037, "learning_rate": 0.00013117766269556008, "loss": 4.6094, "step": 31325 }, { "epoch": 0.6415743452173734, "grad_norm": 0.281583309173584, "learning_rate": 0.0001311162575482215, "loss": 4.6482, "step": 31330 }, { "epoch": 0.6416767349947782, "grad_norm": 0.2770513594150543, "learning_rate": 0.0001310548556162496, "loss": 4.6195, "step": 31335 }, { "epoch": 0.6417791247721828, "grad_norm": 0.27134597301483154, "learning_rate": 0.00013099345691009953, "loss": 4.6189, "step": 31340 }, { "epoch": 0.6418815145495874, "grad_norm": 0.2557509243488312, "learning_rate": 0.00013093206144022557, "loss": 4.6669, "step": 31345 }, { "epoch": 0.641983904326992, "grad_norm": 0.2638404965400696, "learning_rate": 0.0001308706692170817, "loss": 4.6664, "step": 31350 }, { "epoch": 0.6420862941043967, "grad_norm": 0.26745322346687317, "learning_rate": 0.00013080928025112137, "loss": 4.6851, "step": 31355 }, { "epoch": 0.6421886838818013, "grad_norm": 0.26370060443878174, "learning_rate": 0.00013074789455279725, "loss": 4.6362, "step": 31360 }, { "epoch": 0.6422910736592059, "grad_norm": 0.2882365882396698, "learning_rate": 0.00013068651213256165, "loss": 4.6646, "step": 31365 }, { "epoch": 0.6423934634366105, "grad_norm": 0.2712881863117218, "learning_rate": 0.00013062513300086625, "loss": 4.7021, "step": 31370 }, { "epoch": 0.6424958532140151, "grad_norm": 0.2631036043167114, "learning_rate": 0.00013056375716816217, "loss": 4.611, "step": 31375 }, { "epoch": 0.6425982429914198, "grad_norm": 0.28645893931388855, "learning_rate": 0.00013050238464489997, "loss": 4.6319, "step": 31380 }, { "epoch": 0.6427006327688244, "grad_norm": 0.2656135559082031, "learning_rate": 0.00013044101544152975, "loss": 4.6208, "step": 31385 }, { "epoch": 0.642803022546229, "grad_norm": 0.2631455361843109, "learning_rate": 0.00013037964956850076, "loss": 4.6291, "step": 31390 }, { "epoch": 0.6429054123236336, "grad_norm": 0.26318174600601196, "learning_rate": 0.00013031828703626198, "loss": 4.6321, "step": 31395 }, { "epoch": 0.6430078021010383, "grad_norm": 0.26336249709129333, "learning_rate": 0.00013025692785526176, "loss": 4.6368, "step": 31400 }, { "epoch": 0.6431101918784429, "grad_norm": 0.2661740183830261, "learning_rate": 0.00013019557203594774, "loss": 4.6368, "step": 31405 }, { "epoch": 0.6432125816558475, "grad_norm": 0.2788700461387634, "learning_rate": 0.0001301342195887671, "loss": 4.6237, "step": 31410 }, { "epoch": 0.6433149714332521, "grad_norm": 0.2701438069343567, "learning_rate": 0.00013007287052416652, "loss": 4.6335, "step": 31415 }, { "epoch": 0.6434173612106567, "grad_norm": 0.27556711435317993, "learning_rate": 0.00013001152485259186, "loss": 4.6613, "step": 31420 }, { "epoch": 0.6435197509880614, "grad_norm": 0.2829678952693939, "learning_rate": 0.00012995018258448864, "loss": 4.6782, "step": 31425 }, { "epoch": 0.643622140765466, "grad_norm": 0.27381062507629395, "learning_rate": 0.00012988884373030165, "loss": 4.6606, "step": 31430 }, { "epoch": 0.6437245305428706, "grad_norm": 0.2785673141479492, "learning_rate": 0.0001298275083004753, "loss": 4.6144, "step": 31435 }, { "epoch": 0.6438269203202752, "grad_norm": 0.2944762110710144, "learning_rate": 0.00012976617630545314, "loss": 4.6269, "step": 31440 }, { "epoch": 0.6439293100976798, "grad_norm": 0.2634918689727783, "learning_rate": 0.00012970484775567826, "loss": 4.6674, "step": 31445 }, { "epoch": 0.6440316998750845, "grad_norm": 0.2722005248069763, "learning_rate": 0.0001296435226615933, "loss": 4.5888, "step": 31450 }, { "epoch": 0.6441340896524891, "grad_norm": 0.2757498025894165, "learning_rate": 0.00012958220103364006, "loss": 4.6541, "step": 31455 }, { "epoch": 0.6442364794298937, "grad_norm": 0.27535200119018555, "learning_rate": 0.00012952088288225997, "loss": 4.6466, "step": 31460 }, { "epoch": 0.6443388692072983, "grad_norm": 0.26694583892822266, "learning_rate": 0.00012945956821789373, "loss": 4.6304, "step": 31465 }, { "epoch": 0.644441258984703, "grad_norm": 0.2681289315223694, "learning_rate": 0.0001293982570509815, "loss": 4.6285, "step": 31470 }, { "epoch": 0.6445436487621076, "grad_norm": 0.2661706805229187, "learning_rate": 0.0001293369493919628, "loss": 4.6666, "step": 31475 }, { "epoch": 0.6446460385395122, "grad_norm": 0.2730381190776825, "learning_rate": 0.0001292756452512767, "loss": 4.6437, "step": 31480 }, { "epoch": 0.6447484283169168, "grad_norm": 0.27071261405944824, "learning_rate": 0.00012921434463936143, "loss": 4.6378, "step": 31485 }, { "epoch": 0.6448508180943214, "grad_norm": 0.25690731406211853, "learning_rate": 0.00012915304756665479, "loss": 4.5901, "step": 31490 }, { "epoch": 0.644953207871726, "grad_norm": 0.26359280943870544, "learning_rate": 0.00012909175404359402, "loss": 4.6265, "step": 31495 }, { "epoch": 0.6450555976491307, "grad_norm": 0.2577619254589081, "learning_rate": 0.00012903046408061555, "loss": 4.6133, "step": 31500 }, { "epoch": 0.6451579874265353, "grad_norm": 0.27889639139175415, "learning_rate": 0.00012896917768815536, "loss": 4.6308, "step": 31505 }, { "epoch": 0.6452603772039399, "grad_norm": 0.2662416398525238, "learning_rate": 0.00012890789487664882, "loss": 4.6571, "step": 31510 }, { "epoch": 0.6453627669813445, "grad_norm": 0.2627926468849182, "learning_rate": 0.00012884661565653064, "loss": 4.6088, "step": 31515 }, { "epoch": 0.6454651567587492, "grad_norm": 0.26997891068458557, "learning_rate": 0.00012878534003823492, "loss": 4.6122, "step": 31520 }, { "epoch": 0.6455675465361538, "grad_norm": 0.2604016363620758, "learning_rate": 0.00012872406803219526, "loss": 4.6726, "step": 31525 }, { "epoch": 0.6456699363135584, "grad_norm": 0.2653314471244812, "learning_rate": 0.00012866279964884438, "loss": 4.6571, "step": 31530 }, { "epoch": 0.6457723260909631, "grad_norm": 0.25445958971977234, "learning_rate": 0.00012860153489861468, "loss": 4.6389, "step": 31535 }, { "epoch": 0.6458747158683678, "grad_norm": 0.26626482605934143, "learning_rate": 0.00012854027379193778, "loss": 4.6169, "step": 31540 }, { "epoch": 0.6459771056457724, "grad_norm": 0.2648249864578247, "learning_rate": 0.00012847901633924466, "loss": 4.6308, "step": 31545 }, { "epoch": 0.646079495423177, "grad_norm": 0.27532705664634705, "learning_rate": 0.00012841776255096579, "loss": 4.6544, "step": 31550 }, { "epoch": 0.6461818852005816, "grad_norm": 0.26371702551841736, "learning_rate": 0.00012835651243753098, "loss": 4.683, "step": 31555 }, { "epoch": 0.6462842749779862, "grad_norm": 0.26055431365966797, "learning_rate": 0.0001282952660093693, "loss": 4.6081, "step": 31560 }, { "epoch": 0.6463866647553909, "grad_norm": 0.31222742795944214, "learning_rate": 0.00012823402327690934, "loss": 4.677, "step": 31565 }, { "epoch": 0.6464890545327955, "grad_norm": 0.2880314886569977, "learning_rate": 0.000128172784250579, "loss": 4.6435, "step": 31570 }, { "epoch": 0.6465914443102001, "grad_norm": 0.2676008641719818, "learning_rate": 0.0001281115489408056, "loss": 4.6519, "step": 31575 }, { "epoch": 0.6466938340876047, "grad_norm": 0.25955820083618164, "learning_rate": 0.00012805031735801568, "loss": 4.7021, "step": 31580 }, { "epoch": 0.6467962238650093, "grad_norm": 0.26532554626464844, "learning_rate": 0.00012798908951263527, "loss": 4.6321, "step": 31585 }, { "epoch": 0.646898613642414, "grad_norm": 0.27449071407318115, "learning_rate": 0.00012792786541508983, "loss": 4.6442, "step": 31590 }, { "epoch": 0.6470010034198186, "grad_norm": 0.2585086226463318, "learning_rate": 0.00012786664507580397, "loss": 4.6519, "step": 31595 }, { "epoch": 0.6471033931972232, "grad_norm": 0.2651681900024414, "learning_rate": 0.00012780542850520185, "loss": 4.6639, "step": 31600 }, { "epoch": 0.6472057829746278, "grad_norm": 0.2532927989959717, "learning_rate": 0.0001277442157137069, "loss": 4.6369, "step": 31605 }, { "epoch": 0.6473081727520325, "grad_norm": 0.27088263630867004, "learning_rate": 0.00012768300671174194, "loss": 4.5084, "step": 31610 }, { "epoch": 0.6474105625294371, "grad_norm": 0.2733481824398041, "learning_rate": 0.00012762180150972907, "loss": 4.6762, "step": 31615 }, { "epoch": 0.6475129523068417, "grad_norm": 0.2534061372280121, "learning_rate": 0.00012756060011808997, "loss": 4.6468, "step": 31620 }, { "epoch": 0.6476153420842463, "grad_norm": 0.32298529148101807, "learning_rate": 0.0001274994025472453, "loss": 4.6615, "step": 31625 }, { "epoch": 0.6477177318616509, "grad_norm": 0.270778626203537, "learning_rate": 0.00012743820880761533, "loss": 4.6509, "step": 31630 }, { "epoch": 0.6478201216390556, "grad_norm": 0.277631551027298, "learning_rate": 0.0001273770189096197, "loss": 4.6263, "step": 31635 }, { "epoch": 0.6479225114164602, "grad_norm": 0.2573779821395874, "learning_rate": 0.00012731583286367725, "loss": 4.6196, "step": 31640 }, { "epoch": 0.6480249011938648, "grad_norm": 0.2725055515766144, "learning_rate": 0.00012725465068020623, "loss": 4.6395, "step": 31645 }, { "epoch": 0.6481272909712694, "grad_norm": 0.27602913975715637, "learning_rate": 0.00012719347236962432, "loss": 4.6314, "step": 31650 }, { "epoch": 0.648229680748674, "grad_norm": 0.2726741433143616, "learning_rate": 0.0001271322979423483, "loss": 4.6858, "step": 31655 }, { "epoch": 0.6483320705260787, "grad_norm": 0.27213045954704285, "learning_rate": 0.00012707112740879456, "loss": 4.6656, "step": 31660 }, { "epoch": 0.6484344603034833, "grad_norm": 0.27257275581359863, "learning_rate": 0.00012700996077937875, "loss": 4.6502, "step": 31665 }, { "epoch": 0.6485368500808879, "grad_norm": 0.2729254961013794, "learning_rate": 0.00012694879806451567, "loss": 4.6305, "step": 31670 }, { "epoch": 0.6486392398582925, "grad_norm": 0.2568114399909973, "learning_rate": 0.00012688763927461968, "loss": 4.6921, "step": 31675 }, { "epoch": 0.6487416296356971, "grad_norm": 0.2689872086048126, "learning_rate": 0.00012682648442010442, "loss": 4.6759, "step": 31680 }, { "epoch": 0.6488440194131018, "grad_norm": 0.260543555021286, "learning_rate": 0.00012676533351138275, "loss": 4.6728, "step": 31685 }, { "epoch": 0.6489464091905064, "grad_norm": 0.2777576744556427, "learning_rate": 0.00012670418655886699, "loss": 4.6378, "step": 31690 }, { "epoch": 0.649048798967911, "grad_norm": 0.2914818525314331, "learning_rate": 0.00012664304357296876, "loss": 4.6359, "step": 31695 }, { "epoch": 0.6491511887453156, "grad_norm": 0.2843390107154846, "learning_rate": 0.00012658190456409893, "loss": 4.6131, "step": 31700 }, { "epoch": 0.6492535785227203, "grad_norm": 0.279347687959671, "learning_rate": 0.00012652076954266774, "loss": 4.583, "step": 31705 }, { "epoch": 0.6493559683001249, "grad_norm": 0.2818780541419983, "learning_rate": 0.00012645963851908476, "loss": 4.6393, "step": 31710 }, { "epoch": 0.6494583580775295, "grad_norm": 0.2638985514640808, "learning_rate": 0.00012639851150375898, "loss": 4.6603, "step": 31715 }, { "epoch": 0.6495607478549341, "grad_norm": 0.2956162691116333, "learning_rate": 0.00012633738850709845, "loss": 4.635, "step": 31720 }, { "epoch": 0.6496631376323387, "grad_norm": 0.2939032316207886, "learning_rate": 0.0001262762695395107, "loss": 4.7239, "step": 31725 }, { "epoch": 0.6497655274097434, "grad_norm": 0.28255006670951843, "learning_rate": 0.00012621515461140267, "loss": 4.598, "step": 31730 }, { "epoch": 0.649867917187148, "grad_norm": 0.274516224861145, "learning_rate": 0.0001261540437331804, "loss": 4.6531, "step": 31735 }, { "epoch": 0.6499703069645527, "grad_norm": 0.2611735761165619, "learning_rate": 0.00012609293691524936, "loss": 4.5758, "step": 31740 }, { "epoch": 0.6500726967419573, "grad_norm": 0.28692975640296936, "learning_rate": 0.00012603183416801437, "loss": 4.64, "step": 31745 }, { "epoch": 0.650175086519362, "grad_norm": 0.2735917568206787, "learning_rate": 0.00012597073550187941, "loss": 4.6833, "step": 31750 }, { "epoch": 0.6502774762967666, "grad_norm": 0.26063647866249084, "learning_rate": 0.0001259096409272479, "loss": 4.6374, "step": 31755 }, { "epoch": 0.6503798660741712, "grad_norm": 0.3492627441883087, "learning_rate": 0.0001258485504545226, "loss": 4.5286, "step": 31760 }, { "epoch": 0.6504822558515758, "grad_norm": 0.30432385206222534, "learning_rate": 0.00012578746409410533, "loss": 4.6764, "step": 31765 }, { "epoch": 0.6505846456289804, "grad_norm": 0.2966804802417755, "learning_rate": 0.00012572638185639744, "loss": 4.6625, "step": 31770 }, { "epoch": 0.6506870354063851, "grad_norm": 0.26955991983413696, "learning_rate": 0.00012566530375179952, "loss": 4.6665, "step": 31775 }, { "epoch": 0.6507894251837897, "grad_norm": 0.27826356887817383, "learning_rate": 0.00012560422979071141, "loss": 4.6067, "step": 31780 }, { "epoch": 0.6508918149611943, "grad_norm": 0.26850879192352295, "learning_rate": 0.00012554315998353232, "loss": 4.6274, "step": 31785 }, { "epoch": 0.6509942047385989, "grad_norm": 0.34333401918411255, "learning_rate": 0.0001254820943406607, "loss": 4.6176, "step": 31790 }, { "epoch": 0.6510965945160035, "grad_norm": 0.26371055841445923, "learning_rate": 0.00012542103287249424, "loss": 4.6695, "step": 31795 }, { "epoch": 0.6511989842934082, "grad_norm": 0.2705726623535156, "learning_rate": 0.00012535997558943, "loss": 4.6457, "step": 31800 }, { "epoch": 0.6513013740708128, "grad_norm": 0.26995766162872314, "learning_rate": 0.00012529892250186443, "loss": 4.6459, "step": 31805 }, { "epoch": 0.6514037638482174, "grad_norm": 0.26318830251693726, "learning_rate": 0.00012523787362019297, "loss": 4.6526, "step": 31810 }, { "epoch": 0.651506153625622, "grad_norm": 0.2708052098751068, "learning_rate": 0.0001251768289548106, "loss": 4.6319, "step": 31815 }, { "epoch": 0.6516085434030267, "grad_norm": 0.25832897424697876, "learning_rate": 0.0001251157885161115, "loss": 4.6209, "step": 31820 }, { "epoch": 0.6517109331804313, "grad_norm": 0.2967028319835663, "learning_rate": 0.0001250547523144891, "loss": 4.6093, "step": 31825 }, { "epoch": 0.6518133229578359, "grad_norm": 0.27143245935440063, "learning_rate": 0.00012499372036033613, "loss": 4.6182, "step": 31830 }, { "epoch": 0.6519157127352405, "grad_norm": 0.28250429034233093, "learning_rate": 0.00012493269266404468, "loss": 4.6537, "step": 31835 }, { "epoch": 0.6520181025126451, "grad_norm": 0.26262298226356506, "learning_rate": 0.00012487166923600597, "loss": 4.7006, "step": 31840 }, { "epoch": 0.6521204922900498, "grad_norm": 0.27226799726486206, "learning_rate": 0.00012481065008661054, "loss": 4.7019, "step": 31845 }, { "epoch": 0.6522228820674544, "grad_norm": 0.2654237151145935, "learning_rate": 0.0001247496352262483, "loss": 4.647, "step": 31850 }, { "epoch": 0.652325271844859, "grad_norm": 0.2802788019180298, "learning_rate": 0.00012468862466530835, "loss": 4.6323, "step": 31855 }, { "epoch": 0.6524276616222636, "grad_norm": 0.26382607221603394, "learning_rate": 0.00012462761841417895, "loss": 4.5944, "step": 31860 }, { "epoch": 0.6525300513996682, "grad_norm": 0.278029203414917, "learning_rate": 0.00012456661648324786, "loss": 4.6604, "step": 31865 }, { "epoch": 0.6526324411770729, "grad_norm": 0.2683279514312744, "learning_rate": 0.00012450561888290192, "loss": 4.6052, "step": 31870 }, { "epoch": 0.6527348309544775, "grad_norm": 0.2615223824977875, "learning_rate": 0.0001244446256235273, "loss": 4.6708, "step": 31875 }, { "epoch": 0.6528372207318821, "grad_norm": 0.25342893600463867, "learning_rate": 0.00012438363671550944, "loss": 4.6668, "step": 31880 }, { "epoch": 0.6529396105092867, "grad_norm": 0.2844807803630829, "learning_rate": 0.00012432265216923304, "loss": 4.6415, "step": 31885 }, { "epoch": 0.6530420002866913, "grad_norm": 0.27546849846839905, "learning_rate": 0.000124261671995082, "loss": 4.5932, "step": 31890 }, { "epoch": 0.653144390064096, "grad_norm": 0.26301658153533936, "learning_rate": 0.00012420069620343953, "loss": 4.5966, "step": 31895 }, { "epoch": 0.6532467798415006, "grad_norm": 0.27280715107917786, "learning_rate": 0.00012413972480468816, "loss": 4.6455, "step": 31900 }, { "epoch": 0.6533491696189052, "grad_norm": 0.29577556252479553, "learning_rate": 0.00012407875780920944, "loss": 4.5637, "step": 31905 }, { "epoch": 0.6534515593963098, "grad_norm": 0.2669891119003296, "learning_rate": 0.0001240177952273844, "loss": 4.6018, "step": 31910 }, { "epoch": 0.6535539491737145, "grad_norm": 0.27085086703300476, "learning_rate": 0.00012395683706959332, "loss": 4.6722, "step": 31915 }, { "epoch": 0.6536563389511191, "grad_norm": 0.3017023503780365, "learning_rate": 0.00012389588334621552, "loss": 4.6263, "step": 31920 }, { "epoch": 0.6537587287285237, "grad_norm": 0.2614699900150299, "learning_rate": 0.00012383493406762974, "loss": 4.6629, "step": 31925 }, { "epoch": 0.6538611185059283, "grad_norm": 0.2590034306049347, "learning_rate": 0.00012377398924421398, "loss": 4.6495, "step": 31930 }, { "epoch": 0.6539635082833329, "grad_norm": 0.27455946803092957, "learning_rate": 0.00012371304888634532, "loss": 4.6404, "step": 31935 }, { "epoch": 0.6540658980607376, "grad_norm": 0.2596089541912079, "learning_rate": 0.00012365211300440022, "loss": 4.6562, "step": 31940 }, { "epoch": 0.6541682878381423, "grad_norm": 0.2656812071800232, "learning_rate": 0.00012359118160875442, "loss": 4.6422, "step": 31945 }, { "epoch": 0.6542706776155469, "grad_norm": 0.5823690295219421, "learning_rate": 0.00012353025470978265, "loss": 4.6762, "step": 31950 }, { "epoch": 0.6543730673929515, "grad_norm": 0.3039045035839081, "learning_rate": 0.0001234693323178591, "loss": 4.6236, "step": 31955 }, { "epoch": 0.6544754571703562, "grad_norm": 0.288870245218277, "learning_rate": 0.00012340841444335716, "loss": 4.6372, "step": 31960 }, { "epoch": 0.6545778469477608, "grad_norm": 0.2619956433773041, "learning_rate": 0.00012334750109664941, "loss": 4.6564, "step": 31965 }, { "epoch": 0.6546802367251654, "grad_norm": 0.28278493881225586, "learning_rate": 0.00012328659228810765, "loss": 4.6393, "step": 31970 }, { "epoch": 0.65478262650257, "grad_norm": 0.2665233016014099, "learning_rate": 0.00012322568802810297, "loss": 4.6199, "step": 31975 }, { "epoch": 0.6548850162799746, "grad_norm": 0.2708400785923004, "learning_rate": 0.00012316478832700558, "loss": 4.6483, "step": 31980 }, { "epoch": 0.6549874060573793, "grad_norm": 0.28332042694091797, "learning_rate": 0.000123103893195185, "loss": 4.6458, "step": 31985 }, { "epoch": 0.6550897958347839, "grad_norm": 0.26691529154777527, "learning_rate": 0.00012304300264300995, "loss": 4.6489, "step": 31990 }, { "epoch": 0.6551921856121885, "grad_norm": 0.2727983593940735, "learning_rate": 0.0001229821166808484, "loss": 4.6138, "step": 31995 }, { "epoch": 0.6552945753895931, "grad_norm": 0.268181711435318, "learning_rate": 0.00012292123531906744, "loss": 4.6083, "step": 32000 }, { "epoch": 0.6552945753895931, "eval_loss": 4.658926963806152, "eval_runtime": 17.137, "eval_samples_per_second": 33.962, "eval_steps_per_second": 4.26, "step": 32000 }, { "epoch": 0.6553969651669977, "grad_norm": 0.2773132920265198, "learning_rate": 0.00012286035856803347, "loss": 4.6312, "step": 32005 }, { "epoch": 0.6554993549444024, "grad_norm": 0.27755749225616455, "learning_rate": 0.00012279948643811214, "loss": 4.6845, "step": 32010 }, { "epoch": 0.655601744721807, "grad_norm": 0.2796289920806885, "learning_rate": 0.00012273861893966813, "loss": 4.6156, "step": 32015 }, { "epoch": 0.6557041344992116, "grad_norm": 0.2649158239364624, "learning_rate": 0.0001226777560830655, "loss": 4.6517, "step": 32020 }, { "epoch": 0.6558065242766162, "grad_norm": 0.26124560832977295, "learning_rate": 0.00012261689787866755, "loss": 4.6151, "step": 32025 }, { "epoch": 0.6559089140540209, "grad_norm": 0.27674198150634766, "learning_rate": 0.0001225560443368366, "loss": 4.6402, "step": 32030 }, { "epoch": 0.6560113038314255, "grad_norm": 0.26916542649269104, "learning_rate": 0.00012249519546793432, "loss": 4.6554, "step": 32035 }, { "epoch": 0.6561136936088301, "grad_norm": 0.3102434277534485, "learning_rate": 0.00012243435128232166, "loss": 4.6373, "step": 32040 }, { "epoch": 0.6562160833862347, "grad_norm": 0.26134636998176575, "learning_rate": 0.00012237351179035847, "loss": 4.6307, "step": 32045 }, { "epoch": 0.6563184731636393, "grad_norm": 0.2624574899673462, "learning_rate": 0.00012231267700240408, "loss": 4.6192, "step": 32050 }, { "epoch": 0.656420862941044, "grad_norm": 0.2781785726547241, "learning_rate": 0.00012225184692881696, "loss": 4.6371, "step": 32055 }, { "epoch": 0.6565232527184486, "grad_norm": 0.2835623025894165, "learning_rate": 0.0001221910215799547, "loss": 4.6352, "step": 32060 }, { "epoch": 0.6566256424958532, "grad_norm": 0.2875203490257263, "learning_rate": 0.0001221302009661742, "loss": 4.6118, "step": 32065 }, { "epoch": 0.6567280322732578, "grad_norm": 0.27794086933135986, "learning_rate": 0.00012206938509783146, "loss": 4.6267, "step": 32070 }, { "epoch": 0.6568304220506624, "grad_norm": 0.26146331429481506, "learning_rate": 0.00012200857398528166, "loss": 4.6389, "step": 32075 }, { "epoch": 0.6569328118280671, "grad_norm": 0.2839362621307373, "learning_rate": 0.00012194776763887922, "loss": 4.6356, "step": 32080 }, { "epoch": 0.6570352016054717, "grad_norm": 0.279504656791687, "learning_rate": 0.00012188696606897784, "loss": 4.633, "step": 32085 }, { "epoch": 0.6571375913828763, "grad_norm": 0.2745186686515808, "learning_rate": 0.00012182616928593016, "loss": 4.6342, "step": 32090 }, { "epoch": 0.6572399811602809, "grad_norm": 0.2650163769721985, "learning_rate": 0.00012176537730008822, "loss": 4.6658, "step": 32095 }, { "epoch": 0.6573423709376855, "grad_norm": 0.2790212333202362, "learning_rate": 0.0001217045901218032, "loss": 4.5968, "step": 32100 }, { "epoch": 0.6574447607150902, "grad_norm": 0.2662540674209595, "learning_rate": 0.00012164380776142539, "loss": 4.5886, "step": 32105 }, { "epoch": 0.6575471504924948, "grad_norm": 0.30496945977211, "learning_rate": 0.00012158303022930432, "loss": 4.6244, "step": 32110 }, { "epoch": 0.6576495402698994, "grad_norm": 0.2908116579055786, "learning_rate": 0.00012152225753578871, "loss": 4.6374, "step": 32115 }, { "epoch": 0.657751930047304, "grad_norm": 0.27005699276924133, "learning_rate": 0.00012146148969122637, "loss": 4.5542, "step": 32120 }, { "epoch": 0.6578543198247087, "grad_norm": 0.26197630167007446, "learning_rate": 0.00012140072670596438, "loss": 4.6129, "step": 32125 }, { "epoch": 0.6579567096021133, "grad_norm": 0.2653680145740509, "learning_rate": 0.00012133996859034896, "loss": 4.6552, "step": 32130 }, { "epoch": 0.6580590993795179, "grad_norm": 0.2568564713001251, "learning_rate": 0.00012127921535472554, "loss": 4.6844, "step": 32135 }, { "epoch": 0.6581614891569225, "grad_norm": 0.29861655831336975, "learning_rate": 0.00012121846700943857, "loss": 4.6407, "step": 32140 }, { "epoch": 0.6582638789343273, "grad_norm": 0.2732132077217102, "learning_rate": 0.00012115772356483185, "loss": 4.6709, "step": 32145 }, { "epoch": 0.6583662687117319, "grad_norm": 0.2853640019893646, "learning_rate": 0.00012109698503124825, "loss": 4.5996, "step": 32150 }, { "epoch": 0.6584686584891365, "grad_norm": 0.27273640036582947, "learning_rate": 0.00012103625141902981, "loss": 4.6275, "step": 32155 }, { "epoch": 0.6585710482665411, "grad_norm": 0.274423748254776, "learning_rate": 0.00012097552273851775, "loss": 4.6608, "step": 32160 }, { "epoch": 0.6586734380439457, "grad_norm": 0.2623249292373657, "learning_rate": 0.00012091479900005251, "loss": 4.6841, "step": 32165 }, { "epoch": 0.6587758278213504, "grad_norm": 0.25459566712379456, "learning_rate": 0.00012085408021397349, "loss": 4.6704, "step": 32170 }, { "epoch": 0.658878217598755, "grad_norm": 0.2660377323627472, "learning_rate": 0.00012079336639061948, "loss": 4.6507, "step": 32175 }, { "epoch": 0.6589806073761596, "grad_norm": 0.2769128084182739, "learning_rate": 0.00012073265754032839, "loss": 4.6678, "step": 32180 }, { "epoch": 0.6590829971535642, "grad_norm": 0.27020207047462463, "learning_rate": 0.00012067195367343701, "loss": 4.6802, "step": 32185 }, { "epoch": 0.6591853869309688, "grad_norm": 0.29727283120155334, "learning_rate": 0.00012061125480028164, "loss": 4.6547, "step": 32190 }, { "epoch": 0.6592877767083735, "grad_norm": 0.28506189584732056, "learning_rate": 0.0001205505609311976, "loss": 4.643, "step": 32195 }, { "epoch": 0.6593901664857781, "grad_norm": 0.27035385370254517, "learning_rate": 0.00012048987207651921, "loss": 4.6403, "step": 32200 }, { "epoch": 0.6594925562631827, "grad_norm": 0.2715008556842804, "learning_rate": 0.00012042918824658017, "loss": 4.641, "step": 32205 }, { "epoch": 0.6595949460405873, "grad_norm": 0.35396960377693176, "learning_rate": 0.00012036850945171322, "loss": 4.6404, "step": 32210 }, { "epoch": 0.659697335817992, "grad_norm": 0.2653484046459198, "learning_rate": 0.00012030783570225017, "loss": 4.6788, "step": 32215 }, { "epoch": 0.6597997255953966, "grad_norm": 0.25991418957710266, "learning_rate": 0.0001202471670085221, "loss": 4.6357, "step": 32220 }, { "epoch": 0.6599021153728012, "grad_norm": 0.25958484411239624, "learning_rate": 0.00012018650338085918, "loss": 4.6591, "step": 32225 }, { "epoch": 0.6600045051502058, "grad_norm": 0.25597479939460754, "learning_rate": 0.00012012584482959064, "loss": 4.5933, "step": 32230 }, { "epoch": 0.6601068949276104, "grad_norm": 0.28442850708961487, "learning_rate": 0.00012006519136504495, "loss": 4.5905, "step": 32235 }, { "epoch": 0.660209284705015, "grad_norm": 0.2714941203594208, "learning_rate": 0.00012000454299754971, "loss": 4.6265, "step": 32240 }, { "epoch": 0.6603116744824197, "grad_norm": 0.2652340531349182, "learning_rate": 0.00011994389973743159, "loss": 4.5818, "step": 32245 }, { "epoch": 0.6604140642598243, "grad_norm": 0.28526750206947327, "learning_rate": 0.00011988326159501638, "loss": 4.6545, "step": 32250 }, { "epoch": 0.6605164540372289, "grad_norm": 0.27121856808662415, "learning_rate": 0.00011982262858062918, "loss": 4.6917, "step": 32255 }, { "epoch": 0.6606188438146335, "grad_norm": 0.27800634503364563, "learning_rate": 0.00011976200070459385, "loss": 4.6577, "step": 32260 }, { "epoch": 0.6607212335920382, "grad_norm": 0.277902752161026, "learning_rate": 0.00011970137797723378, "loss": 4.5837, "step": 32265 }, { "epoch": 0.6608236233694428, "grad_norm": 0.2635326087474823, "learning_rate": 0.00011964076040887125, "loss": 4.6866, "step": 32270 }, { "epoch": 0.6609260131468474, "grad_norm": 0.27133452892303467, "learning_rate": 0.00011958014800982777, "loss": 4.5943, "step": 32275 }, { "epoch": 0.661028402924252, "grad_norm": 0.27321186661720276, "learning_rate": 0.00011951954079042381, "loss": 4.6676, "step": 32280 }, { "epoch": 0.6611307927016566, "grad_norm": 0.26956549286842346, "learning_rate": 0.00011945893876097913, "loss": 4.6727, "step": 32285 }, { "epoch": 0.6612331824790613, "grad_norm": 0.29847729206085205, "learning_rate": 0.00011939834193181255, "loss": 4.6154, "step": 32290 }, { "epoch": 0.6613355722564659, "grad_norm": 0.31802478432655334, "learning_rate": 0.00011933775031324192, "loss": 4.6107, "step": 32295 }, { "epoch": 0.6614379620338705, "grad_norm": 0.2739986777305603, "learning_rate": 0.00011927716391558436, "loss": 4.6971, "step": 32300 }, { "epoch": 0.6615403518112751, "grad_norm": 0.2841625213623047, "learning_rate": 0.000119216582749156, "loss": 4.6881, "step": 32305 }, { "epoch": 0.6616427415886798, "grad_norm": 0.26190415024757385, "learning_rate": 0.00011915600682427206, "loss": 4.7229, "step": 32310 }, { "epoch": 0.6617451313660844, "grad_norm": 0.2650793492794037, "learning_rate": 0.00011909543615124691, "loss": 4.6629, "step": 32315 }, { "epoch": 0.661847521143489, "grad_norm": 0.2658609449863434, "learning_rate": 0.00011903487074039413, "loss": 4.6527, "step": 32320 }, { "epoch": 0.6619499109208936, "grad_norm": 0.2587004601955414, "learning_rate": 0.00011897431060202613, "loss": 4.6361, "step": 32325 }, { "epoch": 0.6620523006982982, "grad_norm": 0.25448375940322876, "learning_rate": 0.00011891375574645467, "loss": 4.6343, "step": 32330 }, { "epoch": 0.6621546904757029, "grad_norm": 0.25848206877708435, "learning_rate": 0.00011885320618399056, "loss": 4.6221, "step": 32335 }, { "epoch": 0.6622570802531075, "grad_norm": 0.28528285026550293, "learning_rate": 0.00011879266192494358, "loss": 4.631, "step": 32340 }, { "epoch": 0.6623594700305121, "grad_norm": 0.2573157548904419, "learning_rate": 0.00011873212297962277, "loss": 4.6226, "step": 32345 }, { "epoch": 0.6624618598079168, "grad_norm": 0.2667756676673889, "learning_rate": 0.00011867158935833625, "loss": 4.6758, "step": 32350 }, { "epoch": 0.6625642495853215, "grad_norm": 0.26147395372390747, "learning_rate": 0.00011861106107139103, "loss": 4.5983, "step": 32355 }, { "epoch": 0.6626666393627261, "grad_norm": 0.2658372223377228, "learning_rate": 0.00011855053812909347, "loss": 4.6626, "step": 32360 }, { "epoch": 0.6627690291401307, "grad_norm": 0.26490965485572815, "learning_rate": 0.00011849002054174898, "loss": 4.6241, "step": 32365 }, { "epoch": 0.6628714189175353, "grad_norm": 0.26870647072792053, "learning_rate": 0.00011842950831966185, "loss": 4.6698, "step": 32370 }, { "epoch": 0.6629738086949399, "grad_norm": 0.2598232924938202, "learning_rate": 0.00011836900147313564, "loss": 4.6495, "step": 32375 }, { "epoch": 0.6630761984723446, "grad_norm": 0.26024144887924194, "learning_rate": 0.000118308500012473, "loss": 4.63, "step": 32380 }, { "epoch": 0.6631785882497492, "grad_norm": 0.30041414499282837, "learning_rate": 0.00011824800394797556, "loss": 4.6471, "step": 32385 }, { "epoch": 0.6632809780271538, "grad_norm": 0.2771242558956146, "learning_rate": 0.0001181875132899441, "loss": 4.6601, "step": 32390 }, { "epoch": 0.6633833678045584, "grad_norm": 0.26687443256378174, "learning_rate": 0.00011812702804867856, "loss": 4.6366, "step": 32395 }, { "epoch": 0.663485757581963, "grad_norm": 0.2735798954963684, "learning_rate": 0.0001180665482344777, "loss": 4.6597, "step": 32400 }, { "epoch": 0.6635881473593677, "grad_norm": 0.29402485489845276, "learning_rate": 0.00011800607385763962, "loss": 4.6251, "step": 32405 }, { "epoch": 0.6636905371367723, "grad_norm": 0.28436583280563354, "learning_rate": 0.00011794560492846138, "loss": 4.6099, "step": 32410 }, { "epoch": 0.6637929269141769, "grad_norm": 0.26935702562332153, "learning_rate": 0.00011788514145723918, "loss": 4.6544, "step": 32415 }, { "epoch": 0.6638953166915815, "grad_norm": 0.26539263129234314, "learning_rate": 0.00011782468345426813, "loss": 4.6665, "step": 32420 }, { "epoch": 0.6639977064689861, "grad_norm": 0.2789909541606903, "learning_rate": 0.00011776423092984256, "loss": 4.6838, "step": 32425 }, { "epoch": 0.6641000962463908, "grad_norm": 0.8912553191184998, "learning_rate": 0.00011770378389425586, "loss": 4.6717, "step": 32430 }, { "epoch": 0.6642024860237954, "grad_norm": 0.27865198254585266, "learning_rate": 0.0001176433423578004, "loss": 4.6275, "step": 32435 }, { "epoch": 0.6643048758012, "grad_norm": 0.33352676033973694, "learning_rate": 0.00011758290633076768, "loss": 4.6398, "step": 32440 }, { "epoch": 0.6644072655786046, "grad_norm": 0.2718777060508728, "learning_rate": 0.00011752247582344831, "loss": 4.6421, "step": 32445 }, { "epoch": 0.6645096553560093, "grad_norm": 0.2849903702735901, "learning_rate": 0.00011746205084613173, "loss": 4.6293, "step": 32450 }, { "epoch": 0.6646120451334139, "grad_norm": 0.27411556243896484, "learning_rate": 0.00011740163140910676, "loss": 4.6501, "step": 32455 }, { "epoch": 0.6647144349108185, "grad_norm": 0.27114495635032654, "learning_rate": 0.0001173412175226611, "loss": 4.6542, "step": 32460 }, { "epoch": 0.6648168246882231, "grad_norm": 0.2645331919193268, "learning_rate": 0.00011728080919708144, "loss": 4.6172, "step": 32465 }, { "epoch": 0.6649192144656277, "grad_norm": 0.2525201737880707, "learning_rate": 0.00011722040644265365, "loss": 4.6177, "step": 32470 }, { "epoch": 0.6650216042430324, "grad_norm": 0.3081844747066498, "learning_rate": 0.00011716000926966264, "loss": 4.693, "step": 32475 }, { "epoch": 0.665123994020437, "grad_norm": 0.37288954854011536, "learning_rate": 0.00011709961768839231, "loss": 4.6399, "step": 32480 }, { "epoch": 0.6652263837978416, "grad_norm": 0.3004577159881592, "learning_rate": 0.00011703923170912562, "loss": 4.6216, "step": 32485 }, { "epoch": 0.6653287735752462, "grad_norm": 0.27851006388664246, "learning_rate": 0.00011697885134214468, "loss": 4.6834, "step": 32490 }, { "epoch": 0.6654311633526508, "grad_norm": 0.2763761579990387, "learning_rate": 0.0001169184765977304, "loss": 4.4737, "step": 32495 }, { "epoch": 0.6655335531300555, "grad_norm": 0.2804718315601349, "learning_rate": 0.000116858107486163, "loss": 4.6298, "step": 32500 }, { "epoch": 0.6656359429074601, "grad_norm": 0.25102970004081726, "learning_rate": 0.0001167977440177217, "loss": 4.6373, "step": 32505 }, { "epoch": 0.6657383326848647, "grad_norm": 0.2638339698314667, "learning_rate": 0.0001167373862026845, "loss": 4.6466, "step": 32510 }, { "epoch": 0.6658407224622693, "grad_norm": 0.276727557182312, "learning_rate": 0.00011667703405132877, "loss": 4.6038, "step": 32515 }, { "epoch": 0.665943112239674, "grad_norm": 0.26491910219192505, "learning_rate": 0.00011661668757393073, "loss": 4.6227, "step": 32520 }, { "epoch": 0.6660455020170786, "grad_norm": 0.26587238907814026, "learning_rate": 0.00011655634678076566, "loss": 4.6304, "step": 32525 }, { "epoch": 0.6661478917944832, "grad_norm": 0.2602909803390503, "learning_rate": 0.00011649601168210791, "loss": 4.6159, "step": 32530 }, { "epoch": 0.6662502815718878, "grad_norm": 0.2717461585998535, "learning_rate": 0.00011643568228823087, "loss": 4.7061, "step": 32535 }, { "epoch": 0.6663526713492924, "grad_norm": 0.2562514841556549, "learning_rate": 0.00011637535860940687, "loss": 4.6409, "step": 32540 }, { "epoch": 0.6664550611266971, "grad_norm": 0.28094133734703064, "learning_rate": 0.00011631504065590727, "loss": 4.615, "step": 32545 }, { "epoch": 0.6665574509041018, "grad_norm": 0.27160724997520447, "learning_rate": 0.00011625472843800265, "loss": 4.646, "step": 32550 }, { "epoch": 0.6666598406815064, "grad_norm": 0.2829086482524872, "learning_rate": 0.00011619442196596248, "loss": 4.6315, "step": 32555 }, { "epoch": 0.666762230458911, "grad_norm": 0.6439187526702881, "learning_rate": 0.00011613412125005509, "loss": 4.6353, "step": 32560 }, { "epoch": 0.6668646202363157, "grad_norm": 0.31718677282333374, "learning_rate": 0.00011607382630054808, "loss": 4.6793, "step": 32565 }, { "epoch": 0.6669670100137203, "grad_norm": 0.2708340585231781, "learning_rate": 0.000116013537127708, "loss": 4.6164, "step": 32570 }, { "epoch": 0.6670693997911249, "grad_norm": 0.2751806378364563, "learning_rate": 0.0001159532537418003, "loss": 4.6222, "step": 32575 }, { "epoch": 0.6671717895685295, "grad_norm": 0.273276150226593, "learning_rate": 0.0001158929761530896, "loss": 4.6342, "step": 32580 }, { "epoch": 0.6672741793459341, "grad_norm": 0.2833123505115509, "learning_rate": 0.00011583270437183953, "loss": 4.6526, "step": 32585 }, { "epoch": 0.6673765691233388, "grad_norm": 0.2649253010749817, "learning_rate": 0.00011577243840831248, "loss": 4.6608, "step": 32590 }, { "epoch": 0.6674789589007434, "grad_norm": 0.2850389778614044, "learning_rate": 0.00011571217827277018, "loss": 4.6305, "step": 32595 }, { "epoch": 0.667581348678148, "grad_norm": 0.2866162061691284, "learning_rate": 0.00011565192397547326, "loss": 4.7226, "step": 32600 }, { "epoch": 0.6676837384555526, "grad_norm": 0.2693106532096863, "learning_rate": 0.0001155916755266812, "loss": 4.6469, "step": 32605 }, { "epoch": 0.6677861282329572, "grad_norm": 0.2595615088939667, "learning_rate": 0.00011553143293665264, "loss": 4.6919, "step": 32610 }, { "epoch": 0.6678885180103619, "grad_norm": 0.2672061324119568, "learning_rate": 0.00011547119621564525, "loss": 4.6689, "step": 32615 }, { "epoch": 0.6679909077877665, "grad_norm": 0.276113897562027, "learning_rate": 0.00011541096537391555, "loss": 4.6552, "step": 32620 }, { "epoch": 0.6680932975651711, "grad_norm": 0.2687895894050598, "learning_rate": 0.00011535074042171921, "loss": 4.6789, "step": 32625 }, { "epoch": 0.6681956873425757, "grad_norm": 0.2709292769432068, "learning_rate": 0.00011529052136931089, "loss": 4.6898, "step": 32630 }, { "epoch": 0.6682980771199803, "grad_norm": 0.28728988766670227, "learning_rate": 0.00011523030822694403, "loss": 4.668, "step": 32635 }, { "epoch": 0.668400466897385, "grad_norm": 0.2673599421977997, "learning_rate": 0.00011517010100487129, "loss": 4.604, "step": 32640 }, { "epoch": 0.6685028566747896, "grad_norm": 0.26948410272598267, "learning_rate": 0.00011510989971334435, "loss": 4.6317, "step": 32645 }, { "epoch": 0.6686052464521942, "grad_norm": 0.25881901383399963, "learning_rate": 0.00011504970436261367, "loss": 4.614, "step": 32650 }, { "epoch": 0.6687076362295988, "grad_norm": 0.29786792397499084, "learning_rate": 0.00011498951496292886, "loss": 4.5969, "step": 32655 }, { "epoch": 0.6688100260070035, "grad_norm": 0.2857189178466797, "learning_rate": 0.00011492933152453849, "loss": 4.6861, "step": 32660 }, { "epoch": 0.6689124157844081, "grad_norm": 0.2728564143180847, "learning_rate": 0.00011486915405769003, "loss": 4.6531, "step": 32665 }, { "epoch": 0.6690148055618127, "grad_norm": 0.26709920167922974, "learning_rate": 0.00011480898257263005, "loss": 4.615, "step": 32670 }, { "epoch": 0.6691171953392173, "grad_norm": 0.32166990637779236, "learning_rate": 0.00011474881707960411, "loss": 4.6531, "step": 32675 }, { "epoch": 0.6692195851166219, "grad_norm": 0.2602331340312958, "learning_rate": 0.00011468865758885653, "loss": 4.6367, "step": 32680 }, { "epoch": 0.6693219748940266, "grad_norm": 0.2882189154624939, "learning_rate": 0.00011462850411063084, "loss": 4.6099, "step": 32685 }, { "epoch": 0.6694243646714312, "grad_norm": 0.26194214820861816, "learning_rate": 0.0001145683566551696, "loss": 4.6382, "step": 32690 }, { "epoch": 0.6695267544488358, "grad_norm": 0.25957241654396057, "learning_rate": 0.00011450821523271404, "loss": 4.6092, "step": 32695 }, { "epoch": 0.6696291442262404, "grad_norm": 0.26672765612602234, "learning_rate": 0.00011444807985350463, "loss": 4.6781, "step": 32700 }, { "epoch": 0.669731534003645, "grad_norm": 0.2787538468837738, "learning_rate": 0.00011438795052778069, "loss": 4.6599, "step": 32705 }, { "epoch": 0.6698339237810497, "grad_norm": 0.25827673077583313, "learning_rate": 0.00011432782726578058, "loss": 4.6233, "step": 32710 }, { "epoch": 0.6699363135584543, "grad_norm": 0.2902181148529053, "learning_rate": 0.00011426771007774154, "loss": 4.5983, "step": 32715 }, { "epoch": 0.6700387033358589, "grad_norm": 0.27394747734069824, "learning_rate": 0.00011420759897389986, "loss": 4.6603, "step": 32720 }, { "epoch": 0.6701410931132635, "grad_norm": 0.2899582087993622, "learning_rate": 0.00011414749396449081, "loss": 4.6078, "step": 32725 }, { "epoch": 0.6702434828906682, "grad_norm": 0.27544349431991577, "learning_rate": 0.00011408739505974846, "loss": 4.6579, "step": 32730 }, { "epoch": 0.6703458726680728, "grad_norm": 0.2853153944015503, "learning_rate": 0.00011402730226990598, "loss": 4.6325, "step": 32735 }, { "epoch": 0.6704482624454774, "grad_norm": 0.27195265889167786, "learning_rate": 0.00011396721560519558, "loss": 4.6751, "step": 32740 }, { "epoch": 0.670550652222882, "grad_norm": 0.2665818929672241, "learning_rate": 0.00011390713507584815, "loss": 4.6516, "step": 32745 }, { "epoch": 0.6706530420002866, "grad_norm": 0.26640045642852783, "learning_rate": 0.0001138470606920938, "loss": 4.6279, "step": 32750 }, { "epoch": 0.6707554317776914, "grad_norm": 0.26836442947387695, "learning_rate": 0.00011378699246416152, "loss": 4.6299, "step": 32755 }, { "epoch": 0.670857821555096, "grad_norm": 0.26381194591522217, "learning_rate": 0.00011372693040227916, "loss": 4.6103, "step": 32760 }, { "epoch": 0.6709602113325006, "grad_norm": 0.27477508783340454, "learning_rate": 0.00011366687451667363, "loss": 4.6513, "step": 32765 }, { "epoch": 0.6710626011099052, "grad_norm": 0.26479724049568176, "learning_rate": 0.00011360682481757081, "loss": 4.6292, "step": 32770 }, { "epoch": 0.6711649908873099, "grad_norm": 0.2636280059814453, "learning_rate": 0.0001135467813151953, "loss": 4.6685, "step": 32775 }, { "epoch": 0.6712673806647145, "grad_norm": 0.26332464814186096, "learning_rate": 0.00011348674401977089, "loss": 4.6522, "step": 32780 }, { "epoch": 0.6713697704421191, "grad_norm": 0.25292447209358215, "learning_rate": 0.00011342671294152031, "loss": 4.6101, "step": 32785 }, { "epoch": 0.6714721602195237, "grad_norm": 0.2679852247238159, "learning_rate": 0.00011336668809066505, "loss": 4.6559, "step": 32790 }, { "epoch": 0.6715745499969283, "grad_norm": 0.2763931453227997, "learning_rate": 0.00011330666947742567, "loss": 4.6216, "step": 32795 }, { "epoch": 0.671676939774333, "grad_norm": 0.2601395547389984, "learning_rate": 0.00011324665711202168, "loss": 4.6413, "step": 32800 }, { "epoch": 0.6717793295517376, "grad_norm": 0.31791824102401733, "learning_rate": 0.00011318665100467142, "loss": 4.6098, "step": 32805 }, { "epoch": 0.6718817193291422, "grad_norm": 0.2943015396595001, "learning_rate": 0.00011312665116559227, "loss": 4.6116, "step": 32810 }, { "epoch": 0.6719841091065468, "grad_norm": 0.2961193919181824, "learning_rate": 0.00011306665760500056, "loss": 4.6739, "step": 32815 }, { "epoch": 0.6720864988839514, "grad_norm": 0.26749667525291443, "learning_rate": 0.00011300667033311137, "loss": 4.6054, "step": 32820 }, { "epoch": 0.6721888886613561, "grad_norm": 0.2678840458393097, "learning_rate": 0.0001129466893601389, "loss": 4.6751, "step": 32825 }, { "epoch": 0.6722912784387607, "grad_norm": 0.2867529094219208, "learning_rate": 0.00011288671469629626, "loss": 4.6906, "step": 32830 }, { "epoch": 0.6723936682161653, "grad_norm": 0.2722114026546478, "learning_rate": 0.00011282674635179534, "loss": 4.6143, "step": 32835 }, { "epoch": 0.6724960579935699, "grad_norm": 0.2897540032863617, "learning_rate": 0.00011276678433684711, "loss": 4.6943, "step": 32840 }, { "epoch": 0.6725984477709746, "grad_norm": 0.27606886625289917, "learning_rate": 0.00011270682866166139, "loss": 4.658, "step": 32845 }, { "epoch": 0.6727008375483792, "grad_norm": 0.2667522430419922, "learning_rate": 0.00011264687933644695, "loss": 4.6368, "step": 32850 }, { "epoch": 0.6728032273257838, "grad_norm": 0.2723812758922577, "learning_rate": 0.00011258693637141145, "loss": 4.6668, "step": 32855 }, { "epoch": 0.6729056171031884, "grad_norm": 0.27237269282341003, "learning_rate": 0.00011252699977676146, "loss": 4.6197, "step": 32860 }, { "epoch": 0.673008006880593, "grad_norm": 0.2706470191478729, "learning_rate": 0.00011246706956270257, "loss": 4.6622, "step": 32865 }, { "epoch": 0.6731103966579977, "grad_norm": 0.2632223963737488, "learning_rate": 0.00011240714573943907, "loss": 4.6628, "step": 32870 }, { "epoch": 0.6732127864354023, "grad_norm": 0.27151986956596375, "learning_rate": 0.00011234722831717432, "loss": 4.613, "step": 32875 }, { "epoch": 0.6733151762128069, "grad_norm": 0.2898189425468445, "learning_rate": 0.00011228731730611071, "loss": 4.682, "step": 32880 }, { "epoch": 0.6734175659902115, "grad_norm": 0.2605278491973877, "learning_rate": 0.00011222741271644919, "loss": 4.6921, "step": 32885 }, { "epoch": 0.6735199557676161, "grad_norm": 0.26354148983955383, "learning_rate": 0.0001121675145583899, "loss": 4.6374, "step": 32890 }, { "epoch": 0.6736223455450208, "grad_norm": 0.2641417980194092, "learning_rate": 0.00011210762284213185, "loss": 4.6463, "step": 32895 }, { "epoch": 0.6737247353224254, "grad_norm": 0.2970120906829834, "learning_rate": 0.00011204773757787282, "loss": 4.6127, "step": 32900 }, { "epoch": 0.67382712509983, "grad_norm": 0.2714480757713318, "learning_rate": 0.00011198785877580959, "loss": 4.5852, "step": 32905 }, { "epoch": 0.6739295148772346, "grad_norm": 0.2645569145679474, "learning_rate": 0.00011192798644613792, "loss": 4.5893, "step": 32910 }, { "epoch": 0.6740319046546392, "grad_norm": 0.2686305046081543, "learning_rate": 0.00011186812059905222, "loss": 4.6661, "step": 32915 }, { "epoch": 0.6741342944320439, "grad_norm": 0.27101930975914, "learning_rate": 0.00011180826124474606, "loss": 4.6242, "step": 32920 }, { "epoch": 0.6742366842094485, "grad_norm": 0.2710999846458435, "learning_rate": 0.00011174840839341177, "loss": 4.6523, "step": 32925 }, { "epoch": 0.6743390739868531, "grad_norm": 0.5384253263473511, "learning_rate": 0.00011168856205524054, "loss": 4.6621, "step": 32930 }, { "epoch": 0.6744414637642577, "grad_norm": 0.2712736427783966, "learning_rate": 0.00011162872224042257, "loss": 4.6247, "step": 32935 }, { "epoch": 0.6745438535416624, "grad_norm": 0.2693881392478943, "learning_rate": 0.0001115688889591469, "loss": 4.6727, "step": 32940 }, { "epoch": 0.674646243319067, "grad_norm": 0.28540852665901184, "learning_rate": 0.00011150906222160136, "loss": 4.625, "step": 32945 }, { "epoch": 0.6747486330964716, "grad_norm": 0.3191269636154175, "learning_rate": 0.00011144924203797283, "loss": 4.6696, "step": 32950 }, { "epoch": 0.6748510228738763, "grad_norm": 0.26214319467544556, "learning_rate": 0.00011138942841844703, "loss": 4.6142, "step": 32955 }, { "epoch": 0.674953412651281, "grad_norm": 0.28582412004470825, "learning_rate": 0.00011132962137320837, "loss": 4.6199, "step": 32960 }, { "epoch": 0.6750558024286856, "grad_norm": 0.26200976967811584, "learning_rate": 0.00011126982091244041, "loss": 4.6284, "step": 32965 }, { "epoch": 0.6751581922060902, "grad_norm": 0.2682289481163025, "learning_rate": 0.00011121002704632548, "loss": 4.6267, "step": 32970 }, { "epoch": 0.6752605819834948, "grad_norm": 0.26642361283302307, "learning_rate": 0.0001111502397850447, "loss": 4.6857, "step": 32975 }, { "epoch": 0.6753629717608994, "grad_norm": 0.27904707193374634, "learning_rate": 0.00011109045913877825, "loss": 4.6061, "step": 32980 }, { "epoch": 0.675465361538304, "grad_norm": 0.26362839341163635, "learning_rate": 0.00011103068511770499, "loss": 4.6184, "step": 32985 }, { "epoch": 0.6755677513157087, "grad_norm": 0.2662644386291504, "learning_rate": 0.00011097091773200285, "loss": 4.6761, "step": 32990 }, { "epoch": 0.6756701410931133, "grad_norm": 0.28967827558517456, "learning_rate": 0.00011091115699184843, "loss": 4.6337, "step": 32995 }, { "epoch": 0.6757725308705179, "grad_norm": 0.2763548791408539, "learning_rate": 0.00011085140290741731, "loss": 4.6565, "step": 33000 }, { "epoch": 0.6757725308705179, "eval_loss": 4.649252891540527, "eval_runtime": 17.1281, "eval_samples_per_second": 33.979, "eval_steps_per_second": 4.262, "step": 33000 }, { "epoch": 0.6758749206479225, "grad_norm": 0.26872551441192627, "learning_rate": 0.00011079165548888403, "loss": 4.6717, "step": 33005 }, { "epoch": 0.6759773104253272, "grad_norm": 0.3147912323474884, "learning_rate": 0.00011073191474642168, "loss": 4.6426, "step": 33010 }, { "epoch": 0.6760797002027318, "grad_norm": 0.2672083377838135, "learning_rate": 0.00011067218069020256, "loss": 4.5943, "step": 33015 }, { "epoch": 0.6761820899801364, "grad_norm": 0.304654985666275, "learning_rate": 0.00011061245333039767, "loss": 4.6745, "step": 33020 }, { "epoch": 0.676284479757541, "grad_norm": 0.2691044807434082, "learning_rate": 0.00011055273267717684, "loss": 4.6056, "step": 33025 }, { "epoch": 0.6763868695349456, "grad_norm": 0.2770857810974121, "learning_rate": 0.00011049301874070886, "loss": 4.6658, "step": 33030 }, { "epoch": 0.6764892593123503, "grad_norm": 0.2694748640060425, "learning_rate": 0.0001104333115311613, "loss": 4.6105, "step": 33035 }, { "epoch": 0.6765916490897549, "grad_norm": 0.2787577211856842, "learning_rate": 0.00011037361105870058, "loss": 4.6421, "step": 33040 }, { "epoch": 0.6766940388671595, "grad_norm": 0.27521374821662903, "learning_rate": 0.00011031391733349206, "loss": 4.6528, "step": 33045 }, { "epoch": 0.6767964286445641, "grad_norm": 0.26650211215019226, "learning_rate": 0.00011025423036569988, "loss": 4.6218, "step": 33050 }, { "epoch": 0.6768988184219688, "grad_norm": 0.26677361130714417, "learning_rate": 0.000110194550165487, "loss": 4.6694, "step": 33055 }, { "epoch": 0.6770012081993734, "grad_norm": 0.262565940618515, "learning_rate": 0.00011013487674301526, "loss": 4.6456, "step": 33060 }, { "epoch": 0.677103597976778, "grad_norm": 0.2637573778629303, "learning_rate": 0.00011007521010844543, "loss": 4.642, "step": 33065 }, { "epoch": 0.6772059877541826, "grad_norm": 0.28802940249443054, "learning_rate": 0.00011001555027193698, "loss": 4.6428, "step": 33070 }, { "epoch": 0.6773083775315872, "grad_norm": 0.26708462834358215, "learning_rate": 0.0001099558972436483, "loss": 4.648, "step": 33075 }, { "epoch": 0.6774107673089919, "grad_norm": 0.2635321021080017, "learning_rate": 0.00010989625103373665, "loss": 4.6278, "step": 33080 }, { "epoch": 0.6775131570863965, "grad_norm": 0.27121537923812866, "learning_rate": 0.00010983661165235806, "loss": 4.6934, "step": 33085 }, { "epoch": 0.6776155468638011, "grad_norm": 0.2633886933326721, "learning_rate": 0.00010977697910966741, "loss": 4.6117, "step": 33090 }, { "epoch": 0.6777179366412057, "grad_norm": 0.2765393555164337, "learning_rate": 0.00010971735341581854, "loss": 4.6651, "step": 33095 }, { "epoch": 0.6778203264186103, "grad_norm": 0.2615356147289276, "learning_rate": 0.00010965773458096387, "loss": 4.6445, "step": 33100 }, { "epoch": 0.677922716196015, "grad_norm": 0.2716805636882782, "learning_rate": 0.00010959812261525484, "loss": 4.6622, "step": 33105 }, { "epoch": 0.6780251059734196, "grad_norm": 0.2718418836593628, "learning_rate": 0.00010953851752884178, "loss": 4.5962, "step": 33110 }, { "epoch": 0.6781274957508242, "grad_norm": 0.2732146084308624, "learning_rate": 0.00010947891933187362, "loss": 4.6422, "step": 33115 }, { "epoch": 0.6782298855282288, "grad_norm": 0.25753968954086304, "learning_rate": 0.0001094193280344983, "loss": 4.556, "step": 33120 }, { "epoch": 0.6783322753056334, "grad_norm": 0.2574491798877716, "learning_rate": 0.00010935974364686253, "loss": 4.636, "step": 33125 }, { "epoch": 0.6784346650830381, "grad_norm": 0.5101560354232788, "learning_rate": 0.00010930016617911188, "loss": 4.6327, "step": 33130 }, { "epoch": 0.6785370548604427, "grad_norm": 0.2720508873462677, "learning_rate": 0.00010924059564139067, "loss": 4.6171, "step": 33135 }, { "epoch": 0.6786394446378473, "grad_norm": 0.2660973072052002, "learning_rate": 0.00010918103204384204, "loss": 4.6107, "step": 33140 }, { "epoch": 0.6787418344152519, "grad_norm": 0.26955458521842957, "learning_rate": 0.00010912147539660808, "loss": 4.622, "step": 33145 }, { "epoch": 0.6788442241926566, "grad_norm": 0.2750113606452942, "learning_rate": 0.0001090619257098295, "loss": 4.6351, "step": 33150 }, { "epoch": 0.6789466139700612, "grad_norm": 0.28483280539512634, "learning_rate": 0.00010900238299364595, "loss": 4.6206, "step": 33155 }, { "epoch": 0.6790490037474659, "grad_norm": 0.26131486892700195, "learning_rate": 0.00010894284725819595, "loss": 4.6404, "step": 33160 }, { "epoch": 0.6791513935248705, "grad_norm": 0.26914942264556885, "learning_rate": 0.00010888331851361663, "loss": 4.6868, "step": 33165 }, { "epoch": 0.6792537833022751, "grad_norm": 0.28682154417037964, "learning_rate": 0.00010882379677004412, "loss": 4.6411, "step": 33170 }, { "epoch": 0.6793561730796798, "grad_norm": 0.26749324798583984, "learning_rate": 0.00010876428203761329, "loss": 4.6421, "step": 33175 }, { "epoch": 0.6794585628570844, "grad_norm": 0.26501280069351196, "learning_rate": 0.00010870477432645779, "loss": 4.6195, "step": 33180 }, { "epoch": 0.679560952634489, "grad_norm": 0.26755979657173157, "learning_rate": 0.00010864527364671011, "loss": 4.6563, "step": 33185 }, { "epoch": 0.6796633424118936, "grad_norm": 0.2696143686771393, "learning_rate": 0.00010858578000850157, "loss": 4.5667, "step": 33190 }, { "epoch": 0.6797657321892983, "grad_norm": 0.26739928126335144, "learning_rate": 0.00010852629342196217, "loss": 4.6025, "step": 33195 }, { "epoch": 0.6798681219667029, "grad_norm": 0.27433258295059204, "learning_rate": 0.00010846681389722083, "loss": 4.669, "step": 33200 }, { "epoch": 0.6799705117441075, "grad_norm": 0.2671515941619873, "learning_rate": 0.00010840734144440524, "loss": 4.6127, "step": 33205 }, { "epoch": 0.6800729015215121, "grad_norm": 0.26458939909935, "learning_rate": 0.00010834787607364187, "loss": 4.6186, "step": 33210 }, { "epoch": 0.6801752912989167, "grad_norm": 0.274859219789505, "learning_rate": 0.00010828841779505596, "loss": 4.5913, "step": 33215 }, { "epoch": 0.6802776810763214, "grad_norm": 0.29292112588882446, "learning_rate": 0.00010822896661877165, "loss": 4.5322, "step": 33220 }, { "epoch": 0.680380070853726, "grad_norm": 0.2590983808040619, "learning_rate": 0.00010816952255491167, "loss": 4.6205, "step": 33225 }, { "epoch": 0.6804824606311306, "grad_norm": 0.27183979749679565, "learning_rate": 0.00010811008561359777, "loss": 4.5831, "step": 33230 }, { "epoch": 0.6805848504085352, "grad_norm": 0.2869648337364197, "learning_rate": 0.0001080506558049504, "loss": 4.6256, "step": 33235 }, { "epoch": 0.6806872401859398, "grad_norm": 0.2832396626472473, "learning_rate": 0.00010799123313908863, "loss": 4.6307, "step": 33240 }, { "epoch": 0.6807896299633445, "grad_norm": 0.2963630259037018, "learning_rate": 0.00010793181762613053, "loss": 4.6324, "step": 33245 }, { "epoch": 0.6808920197407491, "grad_norm": 0.2685511112213135, "learning_rate": 0.00010787240927619295, "loss": 4.5991, "step": 33250 }, { "epoch": 0.6809944095181537, "grad_norm": 0.5867858529090881, "learning_rate": 0.00010781300809939135, "loss": 4.6226, "step": 33255 }, { "epoch": 0.6810967992955583, "grad_norm": 0.268569678068161, "learning_rate": 0.00010775361410584011, "loss": 4.6311, "step": 33260 }, { "epoch": 0.681199189072963, "grad_norm": 0.26071876287460327, "learning_rate": 0.00010769422730565234, "loss": 4.6936, "step": 33265 }, { "epoch": 0.6813015788503676, "grad_norm": 0.2950969636440277, "learning_rate": 0.00010763484770894, "loss": 4.6456, "step": 33270 }, { "epoch": 0.6814039686277722, "grad_norm": 0.29240158200263977, "learning_rate": 0.00010757547532581362, "loss": 4.653, "step": 33275 }, { "epoch": 0.6815063584051768, "grad_norm": 0.26814985275268555, "learning_rate": 0.00010751611016638274, "loss": 4.6649, "step": 33280 }, { "epoch": 0.6816087481825814, "grad_norm": 0.29863035678863525, "learning_rate": 0.00010745675224075558, "loss": 4.5942, "step": 33285 }, { "epoch": 0.6817111379599861, "grad_norm": 0.3239007890224457, "learning_rate": 0.00010739740155903899, "loss": 4.6163, "step": 33290 }, { "epoch": 0.6818135277373907, "grad_norm": 0.30782201886177063, "learning_rate": 0.0001073380581313388, "loss": 4.6345, "step": 33295 }, { "epoch": 0.6819159175147953, "grad_norm": 0.28785884380340576, "learning_rate": 0.00010727872196775951, "loss": 4.6283, "step": 33300 }, { "epoch": 0.6820183072921999, "grad_norm": 0.30212146043777466, "learning_rate": 0.00010721939307840436, "loss": 4.6903, "step": 33305 }, { "epoch": 0.6821206970696045, "grad_norm": 0.27822816371917725, "learning_rate": 0.00010716007147337542, "loss": 4.6804, "step": 33310 }, { "epoch": 0.6822230868470092, "grad_norm": 0.2746981978416443, "learning_rate": 0.00010710075716277348, "loss": 4.6548, "step": 33315 }, { "epoch": 0.6823254766244138, "grad_norm": 0.35819563269615173, "learning_rate": 0.00010704145015669804, "loss": 4.6212, "step": 33320 }, { "epoch": 0.6824278664018184, "grad_norm": 0.2573152780532837, "learning_rate": 0.00010698215046524744, "loss": 4.6293, "step": 33325 }, { "epoch": 0.682530256179223, "grad_norm": 0.2639943063259125, "learning_rate": 0.0001069228580985188, "loss": 4.6469, "step": 33330 }, { "epoch": 0.6826326459566276, "grad_norm": 0.26783081889152527, "learning_rate": 0.00010686357306660777, "loss": 4.6176, "step": 33335 }, { "epoch": 0.6827350357340323, "grad_norm": 0.2767474353313446, "learning_rate": 0.00010680429537960904, "loss": 4.6259, "step": 33340 }, { "epoch": 0.6828374255114369, "grad_norm": 0.271543949842453, "learning_rate": 0.0001067450250476159, "loss": 4.6345, "step": 33345 }, { "epoch": 0.6829398152888415, "grad_norm": 0.2669256627559662, "learning_rate": 0.00010668576208072036, "loss": 4.6357, "step": 33350 }, { "epoch": 0.6830422050662461, "grad_norm": 0.27458763122558594, "learning_rate": 0.00010662650648901328, "loss": 4.5843, "step": 33355 }, { "epoch": 0.6831445948436509, "grad_norm": 0.2733932137489319, "learning_rate": 0.00010656725828258422, "loss": 4.662, "step": 33360 }, { "epoch": 0.6832469846210555, "grad_norm": 0.32873204350471497, "learning_rate": 0.0001065080174715214, "loss": 4.6297, "step": 33365 }, { "epoch": 0.6833493743984601, "grad_norm": 0.27415570616722107, "learning_rate": 0.00010644878406591192, "loss": 4.6236, "step": 33370 }, { "epoch": 0.6834517641758647, "grad_norm": 0.2847776710987091, "learning_rate": 0.00010638955807584158, "loss": 4.622, "step": 33375 }, { "epoch": 0.6835541539532694, "grad_norm": 0.2700304388999939, "learning_rate": 0.00010633033951139478, "loss": 4.6632, "step": 33380 }, { "epoch": 0.683656543730674, "grad_norm": 0.2543303072452545, "learning_rate": 0.00010627112838265482, "loss": 4.6415, "step": 33385 }, { "epoch": 0.6837589335080786, "grad_norm": 0.2689971625804901, "learning_rate": 0.00010621192469970373, "loss": 4.6474, "step": 33390 }, { "epoch": 0.6838613232854832, "grad_norm": 0.2835336923599243, "learning_rate": 0.00010615272847262213, "loss": 4.6392, "step": 33395 }, { "epoch": 0.6839637130628878, "grad_norm": 0.26349833607673645, "learning_rate": 0.00010609353971148948, "loss": 4.6257, "step": 33400 }, { "epoch": 0.6840661028402925, "grad_norm": 0.2775791883468628, "learning_rate": 0.000106034358426384, "loss": 4.7107, "step": 33405 }, { "epoch": 0.6841684926176971, "grad_norm": 0.3532986640930176, "learning_rate": 0.00010597518462738261, "loss": 4.5854, "step": 33410 }, { "epoch": 0.6842708823951017, "grad_norm": 0.2774605453014374, "learning_rate": 0.00010591601832456084, "loss": 4.6145, "step": 33415 }, { "epoch": 0.6843732721725063, "grad_norm": 0.27627623081207275, "learning_rate": 0.00010585685952799308, "loss": 4.6395, "step": 33420 }, { "epoch": 0.6844756619499109, "grad_norm": 0.7018665075302124, "learning_rate": 0.00010579770824775246, "loss": 4.5492, "step": 33425 }, { "epoch": 0.6845780517273156, "grad_norm": 0.2698715627193451, "learning_rate": 0.00010573856449391065, "loss": 4.5794, "step": 33430 }, { "epoch": 0.6846804415047202, "grad_norm": 0.26496490836143494, "learning_rate": 0.00010567942827653822, "loss": 4.6022, "step": 33435 }, { "epoch": 0.6847828312821248, "grad_norm": 0.27712804079055786, "learning_rate": 0.00010562029960570443, "loss": 4.6484, "step": 33440 }, { "epoch": 0.6848852210595294, "grad_norm": 0.284860759973526, "learning_rate": 0.00010556117849147713, "loss": 4.619, "step": 33445 }, { "epoch": 0.684987610836934, "grad_norm": 0.26327624917030334, "learning_rate": 0.00010550206494392301, "loss": 4.6726, "step": 33450 }, { "epoch": 0.6850900006143387, "grad_norm": 0.26512905955314636, "learning_rate": 0.0001054429589731075, "loss": 4.646, "step": 33455 }, { "epoch": 0.6851923903917433, "grad_norm": 0.28669658303260803, "learning_rate": 0.00010538386058909458, "loss": 4.6625, "step": 33460 }, { "epoch": 0.6852947801691479, "grad_norm": 0.27869224548339844, "learning_rate": 0.00010532476980194706, "loss": 4.645, "step": 33465 }, { "epoch": 0.6853971699465525, "grad_norm": 0.28113362193107605, "learning_rate": 0.00010526568662172649, "loss": 4.615, "step": 33470 }, { "epoch": 0.6854995597239572, "grad_norm": 0.3861667811870575, "learning_rate": 0.00010520661105849297, "loss": 4.6463, "step": 33475 }, { "epoch": 0.6856019495013618, "grad_norm": 0.28890690207481384, "learning_rate": 0.0001051475431223054, "loss": 4.5874, "step": 33480 }, { "epoch": 0.6857043392787664, "grad_norm": 0.28581398725509644, "learning_rate": 0.0001050884828232215, "loss": 4.6466, "step": 33485 }, { "epoch": 0.685806729056171, "grad_norm": 0.2695760428905487, "learning_rate": 0.00010502943017129741, "loss": 4.6051, "step": 33490 }, { "epoch": 0.6859091188335756, "grad_norm": 0.28361669182777405, "learning_rate": 0.00010497038517658821, "loss": 4.6244, "step": 33495 }, { "epoch": 0.6860115086109803, "grad_norm": 0.2638608515262604, "learning_rate": 0.00010491134784914762, "loss": 4.6105, "step": 33500 }, { "epoch": 0.6861138983883849, "grad_norm": 0.2701319754123688, "learning_rate": 0.00010485231819902791, "loss": 4.6511, "step": 33505 }, { "epoch": 0.6862162881657895, "grad_norm": 0.2725416123867035, "learning_rate": 0.00010479329623628023, "loss": 4.6427, "step": 33510 }, { "epoch": 0.6863186779431941, "grad_norm": 0.26230761408805847, "learning_rate": 0.00010473428197095443, "loss": 4.5727, "step": 33515 }, { "epoch": 0.6864210677205987, "grad_norm": 0.27070945501327515, "learning_rate": 0.00010467527541309882, "loss": 4.6456, "step": 33520 }, { "epoch": 0.6865234574980034, "grad_norm": 0.2823569178581238, "learning_rate": 0.00010461627657276063, "loss": 4.644, "step": 33525 }, { "epoch": 0.686625847275408, "grad_norm": 0.2608167827129364, "learning_rate": 0.0001045572854599857, "loss": 4.5949, "step": 33530 }, { "epoch": 0.6867282370528126, "grad_norm": 0.32188713550567627, "learning_rate": 0.00010449830208481846, "loss": 4.646, "step": 33535 }, { "epoch": 0.6868306268302172, "grad_norm": 0.25692039728164673, "learning_rate": 0.0001044393264573022, "loss": 4.6223, "step": 33540 }, { "epoch": 0.6869330166076218, "grad_norm": 0.2823508083820343, "learning_rate": 0.00010438035858747876, "loss": 4.6348, "step": 33545 }, { "epoch": 0.6870354063850265, "grad_norm": 0.285574346780777, "learning_rate": 0.00010432139848538875, "loss": 4.6248, "step": 33550 }, { "epoch": 0.6871377961624311, "grad_norm": 0.2840411067008972, "learning_rate": 0.00010426244616107133, "loss": 4.5983, "step": 33555 }, { "epoch": 0.6872401859398357, "grad_norm": 0.27879831194877625, "learning_rate": 0.00010420350162456444, "loss": 4.5808, "step": 33560 }, { "epoch": 0.6873425757172404, "grad_norm": 0.2666771113872528, "learning_rate": 0.00010414456488590476, "loss": 4.6369, "step": 33565 }, { "epoch": 0.6874449654946451, "grad_norm": 0.2672293186187744, "learning_rate": 0.00010408563595512738, "loss": 4.656, "step": 33570 }, { "epoch": 0.6875473552720497, "grad_norm": 0.2605060040950775, "learning_rate": 0.0001040267148422663, "loss": 4.627, "step": 33575 }, { "epoch": 0.6876497450494543, "grad_norm": 0.26467475295066833, "learning_rate": 0.00010396780155735417, "loss": 4.6612, "step": 33580 }, { "epoch": 0.6877521348268589, "grad_norm": 0.2730306386947632, "learning_rate": 0.00010390889611042218, "loss": 4.6743, "step": 33585 }, { "epoch": 0.6878545246042636, "grad_norm": 0.2668982744216919, "learning_rate": 0.00010384999851150029, "loss": 4.6168, "step": 33590 }, { "epoch": 0.6879569143816682, "grad_norm": 0.2712850272655487, "learning_rate": 0.00010379110877061716, "loss": 4.5973, "step": 33595 }, { "epoch": 0.6880593041590728, "grad_norm": 0.27507054805755615, "learning_rate": 0.00010373222689779988, "loss": 4.5955, "step": 33600 }, { "epoch": 0.6881616939364774, "grad_norm": 0.30687814950942993, "learning_rate": 0.0001036733529030745, "loss": 4.6138, "step": 33605 }, { "epoch": 0.688264083713882, "grad_norm": 0.2817098796367645, "learning_rate": 0.00010361448679646563, "loss": 4.5709, "step": 33610 }, { "epoch": 0.6883664734912867, "grad_norm": 0.3625824451446533, "learning_rate": 0.00010355562858799636, "loss": 4.6557, "step": 33615 }, { "epoch": 0.6884688632686913, "grad_norm": 0.26989036798477173, "learning_rate": 0.00010349677828768865, "loss": 4.6515, "step": 33620 }, { "epoch": 0.6885712530460959, "grad_norm": 0.2847817838191986, "learning_rate": 0.00010343793590556308, "loss": 4.6436, "step": 33625 }, { "epoch": 0.6886736428235005, "grad_norm": 0.28808367252349854, "learning_rate": 0.00010337910145163876, "loss": 4.6747, "step": 33630 }, { "epoch": 0.6887760326009051, "grad_norm": 0.2900857627391815, "learning_rate": 0.00010332027493593359, "loss": 4.6243, "step": 33635 }, { "epoch": 0.6888784223783098, "grad_norm": 0.42385783791542053, "learning_rate": 0.00010326145636846408, "loss": 4.648, "step": 33640 }, { "epoch": 0.6889808121557144, "grad_norm": 0.2676391303539276, "learning_rate": 0.00010320264575924525, "loss": 4.6579, "step": 33645 }, { "epoch": 0.689083201933119, "grad_norm": 0.2803729772567749, "learning_rate": 0.00010314384311829099, "loss": 4.6161, "step": 33650 }, { "epoch": 0.6891855917105236, "grad_norm": 0.2810579240322113, "learning_rate": 0.00010308504845561377, "loss": 4.6226, "step": 33655 }, { "epoch": 0.6892879814879282, "grad_norm": 0.27086740732192993, "learning_rate": 0.0001030262617812245, "loss": 4.6079, "step": 33660 }, { "epoch": 0.6893903712653329, "grad_norm": 0.27384790778160095, "learning_rate": 0.00010296748310513298, "loss": 4.6516, "step": 33665 }, { "epoch": 0.6894927610427375, "grad_norm": 0.270010769367218, "learning_rate": 0.00010290871243734759, "loss": 4.638, "step": 33670 }, { "epoch": 0.6895951508201421, "grad_norm": 0.2743312418460846, "learning_rate": 0.00010284994978787521, "loss": 4.6167, "step": 33675 }, { "epoch": 0.6896975405975467, "grad_norm": 0.27382251620292664, "learning_rate": 0.00010279119516672151, "loss": 4.6374, "step": 33680 }, { "epoch": 0.6897999303749514, "grad_norm": 0.26476651430130005, "learning_rate": 0.00010273244858389074, "loss": 4.6424, "step": 33685 }, { "epoch": 0.689902320152356, "grad_norm": 0.2602652311325073, "learning_rate": 0.00010267371004938586, "loss": 4.6489, "step": 33690 }, { "epoch": 0.6900047099297606, "grad_norm": 0.519544780254364, "learning_rate": 0.00010261497957320821, "loss": 4.5621, "step": 33695 }, { "epoch": 0.6901070997071652, "grad_norm": 0.28236058354377747, "learning_rate": 0.00010255625716535802, "loss": 4.5656, "step": 33700 }, { "epoch": 0.6902094894845698, "grad_norm": 0.28950345516204834, "learning_rate": 0.00010249754283583412, "loss": 4.6169, "step": 33705 }, { "epoch": 0.6903118792619745, "grad_norm": 0.3103809952735901, "learning_rate": 0.00010243883659463378, "loss": 4.6476, "step": 33710 }, { "epoch": 0.6904142690393791, "grad_norm": 0.2706397473812103, "learning_rate": 0.00010238013845175304, "loss": 4.5904, "step": 33715 }, { "epoch": 0.6905166588167837, "grad_norm": 0.26464343070983887, "learning_rate": 0.00010232144841718657, "loss": 4.5943, "step": 33720 }, { "epoch": 0.6906190485941883, "grad_norm": 0.26613783836364746, "learning_rate": 0.0001022627665009276, "loss": 4.6609, "step": 33725 }, { "epoch": 0.6907214383715929, "grad_norm": 0.28778529167175293, "learning_rate": 0.00010220409271296799, "loss": 4.6519, "step": 33730 }, { "epoch": 0.6908238281489976, "grad_norm": 0.2581515610218048, "learning_rate": 0.00010214542706329829, "loss": 4.5733, "step": 33735 }, { "epoch": 0.6909262179264022, "grad_norm": 0.27304625511169434, "learning_rate": 0.00010208676956190744, "loss": 4.6519, "step": 33740 }, { "epoch": 0.6910286077038068, "grad_norm": 0.26651403307914734, "learning_rate": 0.00010202812021878331, "loss": 4.6317, "step": 33745 }, { "epoch": 0.6911309974812114, "grad_norm": 0.274996817111969, "learning_rate": 0.00010196947904391222, "loss": 4.612, "step": 33750 }, { "epoch": 0.691233387258616, "grad_norm": 0.2680855393409729, "learning_rate": 0.00010191084604727899, "loss": 4.5888, "step": 33755 }, { "epoch": 0.6913357770360207, "grad_norm": 0.2705151438713074, "learning_rate": 0.00010185222123886722, "loss": 4.6379, "step": 33760 }, { "epoch": 0.6914381668134253, "grad_norm": 0.29698050022125244, "learning_rate": 0.0001017936046286591, "loss": 4.6247, "step": 33765 }, { "epoch": 0.69154055659083, "grad_norm": 0.27303194999694824, "learning_rate": 0.0001017349962266353, "loss": 4.6425, "step": 33770 }, { "epoch": 0.6916429463682346, "grad_norm": 0.2617448568344116, "learning_rate": 0.00010167639604277519, "loss": 4.6372, "step": 33775 }, { "epoch": 0.6917453361456393, "grad_norm": 0.2696899473667145, "learning_rate": 0.00010161780408705681, "loss": 4.6355, "step": 33780 }, { "epoch": 0.6918477259230439, "grad_norm": 0.28130942583084106, "learning_rate": 0.00010155922036945658, "loss": 4.5759, "step": 33785 }, { "epoch": 0.6919501157004485, "grad_norm": 0.28385117650032043, "learning_rate": 0.00010150064489994965, "loss": 4.6041, "step": 33790 }, { "epoch": 0.6920525054778531, "grad_norm": 0.2776240408420563, "learning_rate": 0.00010144207768850994, "loss": 4.6411, "step": 33795 }, { "epoch": 0.6921548952552578, "grad_norm": 0.272550493478775, "learning_rate": 0.0001013835187451096, "loss": 4.6128, "step": 33800 }, { "epoch": 0.6922572850326624, "grad_norm": 0.31850337982177734, "learning_rate": 0.00010132496807971964, "loss": 4.5977, "step": 33805 }, { "epoch": 0.692359674810067, "grad_norm": 0.28273338079452515, "learning_rate": 0.0001012664257023096, "loss": 4.6208, "step": 33810 }, { "epoch": 0.6924620645874716, "grad_norm": 0.267389178276062, "learning_rate": 0.00010120789162284753, "loss": 4.6402, "step": 33815 }, { "epoch": 0.6925644543648762, "grad_norm": 0.3180309236049652, "learning_rate": 0.00010114936585130016, "loss": 4.6548, "step": 33820 }, { "epoch": 0.6926668441422809, "grad_norm": 0.28237926959991455, "learning_rate": 0.00010109084839763277, "loss": 4.5276, "step": 33825 }, { "epoch": 0.6927692339196855, "grad_norm": 0.27617645263671875, "learning_rate": 0.00010103233927180929, "loss": 4.6769, "step": 33830 }, { "epoch": 0.6928716236970901, "grad_norm": 0.48459896445274353, "learning_rate": 0.00010097383848379198, "loss": 4.5951, "step": 33835 }, { "epoch": 0.6929740134744947, "grad_norm": 0.2679106593132019, "learning_rate": 0.00010091534604354204, "loss": 4.5875, "step": 33840 }, { "epoch": 0.6930764032518993, "grad_norm": 0.26665279269218445, "learning_rate": 0.00010085686196101909, "loss": 4.6357, "step": 33845 }, { "epoch": 0.693178793029304, "grad_norm": 0.30987781286239624, "learning_rate": 0.00010079838624618119, "loss": 4.6206, "step": 33850 }, { "epoch": 0.6932811828067086, "grad_norm": 0.26174652576446533, "learning_rate": 0.00010073991890898516, "loss": 4.6197, "step": 33855 }, { "epoch": 0.6933835725841132, "grad_norm": 0.2697709798812866, "learning_rate": 0.00010068145995938637, "loss": 4.6602, "step": 33860 }, { "epoch": 0.6934859623615178, "grad_norm": 0.2693204879760742, "learning_rate": 0.00010062300940733863, "loss": 4.6778, "step": 33865 }, { "epoch": 0.6935883521389224, "grad_norm": 0.2792659103870392, "learning_rate": 0.0001005645672627945, "loss": 4.6052, "step": 33870 }, { "epoch": 0.6936907419163271, "grad_norm": 0.27753615379333496, "learning_rate": 0.00010050613353570505, "loss": 4.6265, "step": 33875 }, { "epoch": 0.6937931316937317, "grad_norm": 0.25686487555503845, "learning_rate": 0.00010044770823601975, "loss": 4.673, "step": 33880 }, { "epoch": 0.6938955214711363, "grad_norm": 0.2649760842323303, "learning_rate": 0.00010038929137368683, "loss": 4.6462, "step": 33885 }, { "epoch": 0.6939979112485409, "grad_norm": 0.26999253034591675, "learning_rate": 0.00010033088295865316, "loss": 4.6194, "step": 33890 }, { "epoch": 0.6941003010259456, "grad_norm": 0.29455089569091797, "learning_rate": 0.00010027248300086385, "loss": 4.6102, "step": 33895 }, { "epoch": 0.6942026908033502, "grad_norm": 0.27291572093963623, "learning_rate": 0.00010021409151026287, "loss": 4.6452, "step": 33900 }, { "epoch": 0.6943050805807548, "grad_norm": 0.2754967510700226, "learning_rate": 0.00010015570849679263, "loss": 4.6245, "step": 33905 }, { "epoch": 0.6944074703581594, "grad_norm": 0.2637985944747925, "learning_rate": 0.00010009733397039405, "loss": 4.6501, "step": 33910 }, { "epoch": 0.694509860135564, "grad_norm": 0.27101102471351624, "learning_rate": 0.0001000389679410067, "loss": 4.6049, "step": 33915 }, { "epoch": 0.6946122499129687, "grad_norm": 0.2719937264919281, "learning_rate": 9.998061041856871e-05, "loss": 4.632, "step": 33920 }, { "epoch": 0.6947146396903733, "grad_norm": 0.2812808156013489, "learning_rate": 9.992226141301661e-05, "loss": 4.6184, "step": 33925 }, { "epoch": 0.6948170294677779, "grad_norm": 0.28687840700149536, "learning_rate": 9.986392093428561e-05, "loss": 4.6098, "step": 33930 }, { "epoch": 0.6949194192451825, "grad_norm": 0.28334131836891174, "learning_rate": 9.980558899230953e-05, "loss": 4.5923, "step": 33935 }, { "epoch": 0.6950218090225871, "grad_norm": 0.2913331389427185, "learning_rate": 9.974726559702055e-05, "loss": 4.6653, "step": 33940 }, { "epoch": 0.6951241987999918, "grad_norm": 0.29087406396865845, "learning_rate": 9.96889507583495e-05, "loss": 4.6465, "step": 33945 }, { "epoch": 0.6952265885773964, "grad_norm": 0.2694760859012604, "learning_rate": 9.963064448622583e-05, "loss": 4.6269, "step": 33950 }, { "epoch": 0.695328978354801, "grad_norm": 0.2800590693950653, "learning_rate": 9.957234679057735e-05, "loss": 4.611, "step": 33955 }, { "epoch": 0.6954313681322056, "grad_norm": 0.2676231265068054, "learning_rate": 9.951405768133056e-05, "loss": 4.6176, "step": 33960 }, { "epoch": 0.6955337579096103, "grad_norm": 0.2661445736885071, "learning_rate": 9.945577716841043e-05, "loss": 4.582, "step": 33965 }, { "epoch": 0.695636147687015, "grad_norm": 0.2714359164237976, "learning_rate": 9.939750526174052e-05, "loss": 4.586, "step": 33970 }, { "epoch": 0.6957385374644196, "grad_norm": 0.2764875888824463, "learning_rate": 9.933924197124281e-05, "loss": 4.6132, "step": 33975 }, { "epoch": 0.6958409272418242, "grad_norm": 0.2633768916130066, "learning_rate": 9.92809873068379e-05, "loss": 4.5958, "step": 33980 }, { "epoch": 0.6959433170192288, "grad_norm": 0.33859845995903015, "learning_rate": 9.922274127844503e-05, "loss": 4.5989, "step": 33985 }, { "epoch": 0.6960457067966335, "grad_norm": 0.2747366428375244, "learning_rate": 9.916450389598169e-05, "loss": 4.6358, "step": 33990 }, { "epoch": 0.6961480965740381, "grad_norm": 0.28387874364852905, "learning_rate": 9.910627516936414e-05, "loss": 4.6491, "step": 33995 }, { "epoch": 0.6962504863514427, "grad_norm": 0.28758618235588074, "learning_rate": 9.904805510850709e-05, "loss": 4.6424, "step": 34000 }, { "epoch": 0.6962504863514427, "eval_loss": 4.641878128051758, "eval_runtime": 17.1317, "eval_samples_per_second": 33.972, "eval_steps_per_second": 4.261, "step": 34000 }, { "epoch": 0.6963528761288473, "grad_norm": 0.2643364369869232, "learning_rate": 9.898984372332372e-05, "loss": 4.6064, "step": 34005 }, { "epoch": 0.696455265906252, "grad_norm": 0.26672685146331787, "learning_rate": 9.893164102372579e-05, "loss": 4.6363, "step": 34010 }, { "epoch": 0.6965576556836566, "grad_norm": 0.26538074016571045, "learning_rate": 9.887344701962368e-05, "loss": 4.6558, "step": 34015 }, { "epoch": 0.6966600454610612, "grad_norm": 0.2577105164527893, "learning_rate": 9.881526172092603e-05, "loss": 4.6318, "step": 34020 }, { "epoch": 0.6967624352384658, "grad_norm": 0.2822088599205017, "learning_rate": 9.875708513754017e-05, "loss": 4.56, "step": 34025 }, { "epoch": 0.6968648250158704, "grad_norm": 0.27342158555984497, "learning_rate": 9.869891727937206e-05, "loss": 4.5965, "step": 34030 }, { "epoch": 0.6969672147932751, "grad_norm": 0.30411356687545776, "learning_rate": 9.864075815632589e-05, "loss": 4.5894, "step": 34035 }, { "epoch": 0.6970696045706797, "grad_norm": 0.2588144540786743, "learning_rate": 9.858260777830456e-05, "loss": 4.6367, "step": 34040 }, { "epoch": 0.6971719943480843, "grad_norm": 0.2642608880996704, "learning_rate": 9.852446615520948e-05, "loss": 4.6082, "step": 34045 }, { "epoch": 0.6972743841254889, "grad_norm": 0.27088019251823425, "learning_rate": 9.846633329694046e-05, "loss": 4.6136, "step": 34050 }, { "epoch": 0.6973767739028935, "grad_norm": 0.27060768008232117, "learning_rate": 9.840820921339587e-05, "loss": 4.5861, "step": 34055 }, { "epoch": 0.6974791636802982, "grad_norm": 0.26521429419517517, "learning_rate": 9.835009391447272e-05, "loss": 4.6138, "step": 34060 }, { "epoch": 0.6975815534577028, "grad_norm": 0.28483012318611145, "learning_rate": 9.829198741006624e-05, "loss": 4.5893, "step": 34065 }, { "epoch": 0.6976839432351074, "grad_norm": 0.28838014602661133, "learning_rate": 9.823388971007041e-05, "loss": 4.6379, "step": 34070 }, { "epoch": 0.697786333012512, "grad_norm": 0.26538369059562683, "learning_rate": 9.817580082437761e-05, "loss": 4.6638, "step": 34075 }, { "epoch": 0.6978887227899166, "grad_norm": 0.2743552625179291, "learning_rate": 9.81177207628787e-05, "loss": 4.6491, "step": 34080 }, { "epoch": 0.6979911125673213, "grad_norm": 0.29531776905059814, "learning_rate": 9.805964953546311e-05, "loss": 4.6233, "step": 34085 }, { "epoch": 0.6980935023447259, "grad_norm": 0.27311116456985474, "learning_rate": 9.800158715201874e-05, "loss": 4.6743, "step": 34090 }, { "epoch": 0.6981958921221305, "grad_norm": 0.29840296506881714, "learning_rate": 9.794353362243192e-05, "loss": 4.6264, "step": 34095 }, { "epoch": 0.6982982818995351, "grad_norm": 0.28220632672309875, "learning_rate": 9.788548895658754e-05, "loss": 4.6377, "step": 34100 }, { "epoch": 0.6984006716769398, "grad_norm": 0.2777146100997925, "learning_rate": 9.782745316436897e-05, "loss": 4.6099, "step": 34105 }, { "epoch": 0.6985030614543444, "grad_norm": 0.2898336946964264, "learning_rate": 9.776942625565812e-05, "loss": 4.6604, "step": 34110 }, { "epoch": 0.698605451231749, "grad_norm": 0.28175127506256104, "learning_rate": 9.771140824033523e-05, "loss": 4.5905, "step": 34115 }, { "epoch": 0.6987078410091536, "grad_norm": 0.2771725356578827, "learning_rate": 9.765339912827911e-05, "loss": 4.6262, "step": 34120 }, { "epoch": 0.6988102307865582, "grad_norm": 0.27140840888023376, "learning_rate": 9.759539892936725e-05, "loss": 4.5966, "step": 34125 }, { "epoch": 0.6989126205639629, "grad_norm": 0.2764374911785126, "learning_rate": 9.753740765347522e-05, "loss": 4.6608, "step": 34130 }, { "epoch": 0.6990150103413675, "grad_norm": 0.26501938700675964, "learning_rate": 9.747942531047742e-05, "loss": 4.6205, "step": 34135 }, { "epoch": 0.6991174001187721, "grad_norm": 0.27190810441970825, "learning_rate": 9.742145191024661e-05, "loss": 4.5789, "step": 34140 }, { "epoch": 0.6992197898961767, "grad_norm": 0.2597341239452362, "learning_rate": 9.736348746265394e-05, "loss": 4.6178, "step": 34145 }, { "epoch": 0.6993221796735813, "grad_norm": 0.27105921506881714, "learning_rate": 9.730553197756915e-05, "loss": 4.6153, "step": 34150 }, { "epoch": 0.699424569450986, "grad_norm": 0.26925212144851685, "learning_rate": 9.724758546486048e-05, "loss": 4.5839, "step": 34155 }, { "epoch": 0.6995269592283906, "grad_norm": 0.2827601432800293, "learning_rate": 9.718964793439447e-05, "loss": 4.6578, "step": 34160 }, { "epoch": 0.6996293490057952, "grad_norm": 0.2963789999485016, "learning_rate": 9.713171939603631e-05, "loss": 4.6487, "step": 34165 }, { "epoch": 0.6997317387831998, "grad_norm": 0.2923237383365631, "learning_rate": 9.70737998596496e-05, "loss": 4.6496, "step": 34170 }, { "epoch": 0.6998341285606046, "grad_norm": 0.274325966835022, "learning_rate": 9.701588933509635e-05, "loss": 4.6433, "step": 34175 }, { "epoch": 0.6999365183380092, "grad_norm": 0.2733668386936188, "learning_rate": 9.69579878322371e-05, "loss": 4.6413, "step": 34180 }, { "epoch": 0.7000389081154138, "grad_norm": 0.270188570022583, "learning_rate": 9.690009536093086e-05, "loss": 4.6081, "step": 34185 }, { "epoch": 0.7001412978928184, "grad_norm": 0.27390149235725403, "learning_rate": 9.684221193103505e-05, "loss": 4.6254, "step": 34190 }, { "epoch": 0.700243687670223, "grad_norm": 0.27170562744140625, "learning_rate": 9.678433755240558e-05, "loss": 4.6129, "step": 34195 }, { "epoch": 0.7003460774476277, "grad_norm": 0.330304354429245, "learning_rate": 9.672647223489689e-05, "loss": 4.6597, "step": 34200 }, { "epoch": 0.7004484672250323, "grad_norm": 0.2667689323425293, "learning_rate": 9.66686159883617e-05, "loss": 4.6014, "step": 34205 }, { "epoch": 0.7005508570024369, "grad_norm": 0.26405391097068787, "learning_rate": 9.661076882265132e-05, "loss": 4.5914, "step": 34210 }, { "epoch": 0.7006532467798415, "grad_norm": 0.27725109457969666, "learning_rate": 9.655293074761554e-05, "loss": 4.6374, "step": 34215 }, { "epoch": 0.7007556365572462, "grad_norm": 0.27563926577568054, "learning_rate": 9.649510177310247e-05, "loss": 4.6258, "step": 34220 }, { "epoch": 0.7008580263346508, "grad_norm": 0.28803661465644836, "learning_rate": 9.643728190895878e-05, "loss": 4.6535, "step": 34225 }, { "epoch": 0.7009604161120554, "grad_norm": 0.2780574858188629, "learning_rate": 9.637947116502959e-05, "loss": 4.6744, "step": 34230 }, { "epoch": 0.70106280588946, "grad_norm": 0.2649870812892914, "learning_rate": 9.632166955115839e-05, "loss": 4.6309, "step": 34235 }, { "epoch": 0.7011651956668646, "grad_norm": 0.2766774892807007, "learning_rate": 9.626387707718715e-05, "loss": 4.6788, "step": 34240 }, { "epoch": 0.7012675854442693, "grad_norm": 0.286602646112442, "learning_rate": 9.620609375295632e-05, "loss": 4.6395, "step": 34245 }, { "epoch": 0.7013699752216739, "grad_norm": 0.26506146788597107, "learning_rate": 9.614831958830484e-05, "loss": 4.6513, "step": 34250 }, { "epoch": 0.7014723649990785, "grad_norm": 0.280479371547699, "learning_rate": 9.609055459306986e-05, "loss": 4.6245, "step": 34255 }, { "epoch": 0.7015747547764831, "grad_norm": 0.28955379128456116, "learning_rate": 9.603279877708718e-05, "loss": 4.6074, "step": 34260 }, { "epoch": 0.7016771445538877, "grad_norm": 0.26712292432785034, "learning_rate": 9.597505215019104e-05, "loss": 4.6086, "step": 34265 }, { "epoch": 0.7017795343312924, "grad_norm": 0.2787074148654938, "learning_rate": 9.591731472221397e-05, "loss": 4.6254, "step": 34270 }, { "epoch": 0.701881924108697, "grad_norm": 0.26819899678230286, "learning_rate": 9.585958650298706e-05, "loss": 4.5885, "step": 34275 }, { "epoch": 0.7019843138861016, "grad_norm": 0.2676922380924225, "learning_rate": 9.580186750233983e-05, "loss": 4.5781, "step": 34280 }, { "epoch": 0.7020867036635062, "grad_norm": 0.263573557138443, "learning_rate": 9.574415773010015e-05, "loss": 4.6143, "step": 34285 }, { "epoch": 0.7021890934409108, "grad_norm": 0.5991362929344177, "learning_rate": 9.568645719609433e-05, "loss": 4.6701, "step": 34290 }, { "epoch": 0.7022914832183155, "grad_norm": 0.26259157061576843, "learning_rate": 9.562876591014725e-05, "loss": 4.5848, "step": 34295 }, { "epoch": 0.7023938729957201, "grad_norm": 0.2694006562232971, "learning_rate": 9.557108388208195e-05, "loss": 4.6143, "step": 34300 }, { "epoch": 0.7024962627731247, "grad_norm": 0.2679491341114044, "learning_rate": 9.551341112172013e-05, "loss": 4.6394, "step": 34305 }, { "epoch": 0.7025986525505293, "grad_norm": 0.2616468667984009, "learning_rate": 9.545574763888186e-05, "loss": 4.6475, "step": 34310 }, { "epoch": 0.702701042327934, "grad_norm": 0.2682160437107086, "learning_rate": 9.539809344338551e-05, "loss": 4.6331, "step": 34315 }, { "epoch": 0.7028034321053386, "grad_norm": 0.2596127390861511, "learning_rate": 9.534044854504802e-05, "loss": 4.5799, "step": 34320 }, { "epoch": 0.7029058218827432, "grad_norm": 0.28490668535232544, "learning_rate": 9.528281295368469e-05, "loss": 4.6401, "step": 34325 }, { "epoch": 0.7030082116601478, "grad_norm": 0.26502755284309387, "learning_rate": 9.522518667910917e-05, "loss": 4.6217, "step": 34330 }, { "epoch": 0.7031106014375524, "grad_norm": 0.2625313997268677, "learning_rate": 9.516756973113364e-05, "loss": 4.5792, "step": 34335 }, { "epoch": 0.7032129912149571, "grad_norm": 0.3207727074623108, "learning_rate": 9.510996211956867e-05, "loss": 4.6167, "step": 34340 }, { "epoch": 0.7033153809923617, "grad_norm": 0.30994778871536255, "learning_rate": 9.50523638542231e-05, "loss": 4.6362, "step": 34345 }, { "epoch": 0.7034177707697663, "grad_norm": 0.26821982860565186, "learning_rate": 9.499477494490433e-05, "loss": 4.6061, "step": 34350 }, { "epoch": 0.7035201605471709, "grad_norm": 0.2646442949771881, "learning_rate": 9.493719540141815e-05, "loss": 4.6474, "step": 34355 }, { "epoch": 0.7036225503245755, "grad_norm": 0.25915464758872986, "learning_rate": 9.487962523356868e-05, "loss": 4.6719, "step": 34360 }, { "epoch": 0.7037249401019802, "grad_norm": 0.2941591143608093, "learning_rate": 9.48220644511585e-05, "loss": 4.6098, "step": 34365 }, { "epoch": 0.7038273298793848, "grad_norm": 0.26016825437545776, "learning_rate": 9.476451306398864e-05, "loss": 4.6346, "step": 34370 }, { "epoch": 0.7039297196567895, "grad_norm": 0.2864300608634949, "learning_rate": 9.470697108185838e-05, "loss": 4.678, "step": 34375 }, { "epoch": 0.7040321094341941, "grad_norm": 0.2741391658782959, "learning_rate": 9.464943851456556e-05, "loss": 4.6368, "step": 34380 }, { "epoch": 0.7041344992115988, "grad_norm": 0.2671283781528473, "learning_rate": 9.459191537190631e-05, "loss": 4.6042, "step": 34385 }, { "epoch": 0.7042368889890034, "grad_norm": 0.2896394729614258, "learning_rate": 9.453440166367529e-05, "loss": 4.6453, "step": 34390 }, { "epoch": 0.704339278766408, "grad_norm": 0.2839994728565216, "learning_rate": 9.447689739966527e-05, "loss": 4.6129, "step": 34395 }, { "epoch": 0.7044416685438126, "grad_norm": 0.27805691957473755, "learning_rate": 9.441940258966774e-05, "loss": 4.5951, "step": 34400 }, { "epoch": 0.7045440583212172, "grad_norm": 0.27397221326828003, "learning_rate": 9.43619172434724e-05, "loss": 4.5941, "step": 34405 }, { "epoch": 0.7046464480986219, "grad_norm": 0.2726888358592987, "learning_rate": 9.430444137086738e-05, "loss": 4.6141, "step": 34410 }, { "epoch": 0.7047488378760265, "grad_norm": 0.2829132080078125, "learning_rate": 9.424697498163917e-05, "loss": 4.6236, "step": 34415 }, { "epoch": 0.7048512276534311, "grad_norm": 0.30598700046539307, "learning_rate": 9.418951808557271e-05, "loss": 4.6069, "step": 34420 }, { "epoch": 0.7049536174308357, "grad_norm": 0.27091097831726074, "learning_rate": 9.413207069245125e-05, "loss": 4.6398, "step": 34425 }, { "epoch": 0.7050560072082404, "grad_norm": 0.26590096950531006, "learning_rate": 9.407463281205647e-05, "loss": 4.6303, "step": 34430 }, { "epoch": 0.705158396985645, "grad_norm": 0.2824147939682007, "learning_rate": 9.401720445416847e-05, "loss": 4.5699, "step": 34435 }, { "epoch": 0.7052607867630496, "grad_norm": 0.28604599833488464, "learning_rate": 9.395978562856553e-05, "loss": 4.6345, "step": 34440 }, { "epoch": 0.7053631765404542, "grad_norm": 0.2786464989185333, "learning_rate": 9.390237634502457e-05, "loss": 4.6476, "step": 34445 }, { "epoch": 0.7054655663178588, "grad_norm": 0.2748449742794037, "learning_rate": 9.384497661332075e-05, "loss": 4.6295, "step": 34450 }, { "epoch": 0.7055679560952635, "grad_norm": 0.2649158537387848, "learning_rate": 9.378758644322756e-05, "loss": 4.57, "step": 34455 }, { "epoch": 0.7056703458726681, "grad_norm": 0.29101142287254333, "learning_rate": 9.373020584451698e-05, "loss": 4.6764, "step": 34460 }, { "epoch": 0.7057727356500727, "grad_norm": 0.2751317322254181, "learning_rate": 9.36728348269593e-05, "loss": 4.6075, "step": 34465 }, { "epoch": 0.7058751254274773, "grad_norm": 0.27171221375465393, "learning_rate": 9.361547340032315e-05, "loss": 4.6013, "step": 34470 }, { "epoch": 0.705977515204882, "grad_norm": 0.2699592113494873, "learning_rate": 9.355812157437553e-05, "loss": 4.5751, "step": 34475 }, { "epoch": 0.7060799049822866, "grad_norm": 0.5079491138458252, "learning_rate": 9.350077935888198e-05, "loss": 4.6284, "step": 34480 }, { "epoch": 0.7061822947596912, "grad_norm": 0.26962020993232727, "learning_rate": 9.344344676360606e-05, "loss": 4.626, "step": 34485 }, { "epoch": 0.7062846845370958, "grad_norm": 0.2690640091896057, "learning_rate": 9.338612379830999e-05, "loss": 4.6348, "step": 34490 }, { "epoch": 0.7063870743145004, "grad_norm": 0.28714755177497864, "learning_rate": 9.332881047275427e-05, "loss": 4.6122, "step": 34495 }, { "epoch": 0.706489464091905, "grad_norm": 0.27294567227363586, "learning_rate": 9.327150679669766e-05, "loss": 4.6603, "step": 34500 }, { "epoch": 0.7065918538693097, "grad_norm": 0.27827540040016174, "learning_rate": 9.32142127798974e-05, "loss": 4.6319, "step": 34505 }, { "epoch": 0.7066942436467143, "grad_norm": 0.2705303132534027, "learning_rate": 9.315692843210905e-05, "loss": 4.628, "step": 34510 }, { "epoch": 0.7067966334241189, "grad_norm": 0.28062865138053894, "learning_rate": 9.309965376308648e-05, "loss": 4.6471, "step": 34515 }, { "epoch": 0.7068990232015235, "grad_norm": 0.2768203020095825, "learning_rate": 9.304238878258196e-05, "loss": 4.6557, "step": 34520 }, { "epoch": 0.7070014129789282, "grad_norm": 0.26975247263908386, "learning_rate": 9.29851335003461e-05, "loss": 4.64, "step": 34525 }, { "epoch": 0.7071038027563328, "grad_norm": 0.3466828763484955, "learning_rate": 9.292788792612791e-05, "loss": 4.6212, "step": 34530 }, { "epoch": 0.7072061925337374, "grad_norm": 0.2951856851577759, "learning_rate": 9.287065206967457e-05, "loss": 4.6391, "step": 34535 }, { "epoch": 0.707308582311142, "grad_norm": 0.27098989486694336, "learning_rate": 9.281342594073176e-05, "loss": 4.6481, "step": 34540 }, { "epoch": 0.7074109720885466, "grad_norm": 0.26753920316696167, "learning_rate": 9.275620954904355e-05, "loss": 4.634, "step": 34545 }, { "epoch": 0.7075133618659513, "grad_norm": 0.26743990182876587, "learning_rate": 9.269900290435219e-05, "loss": 4.6, "step": 34550 }, { "epoch": 0.7076157516433559, "grad_norm": 0.26732194423675537, "learning_rate": 9.264180601639838e-05, "loss": 4.5985, "step": 34555 }, { "epoch": 0.7077181414207605, "grad_norm": 0.25865405797958374, "learning_rate": 9.258461889492116e-05, "loss": 4.5589, "step": 34560 }, { "epoch": 0.7078205311981651, "grad_norm": 0.2651233673095703, "learning_rate": 9.252744154965783e-05, "loss": 4.629, "step": 34565 }, { "epoch": 0.7079229209755697, "grad_norm": 0.26940491795539856, "learning_rate": 9.247027399034409e-05, "loss": 4.5988, "step": 34570 }, { "epoch": 0.7080253107529744, "grad_norm": 0.2711552679538727, "learning_rate": 9.241311622671397e-05, "loss": 4.581, "step": 34575 }, { "epoch": 0.7081277005303791, "grad_norm": 0.27430421113967896, "learning_rate": 9.23559682684998e-05, "loss": 4.6122, "step": 34580 }, { "epoch": 0.7082300903077837, "grad_norm": 0.3444492220878601, "learning_rate": 9.229883012543225e-05, "loss": 4.6314, "step": 34585 }, { "epoch": 0.7083324800851883, "grad_norm": 0.2723939120769501, "learning_rate": 9.224170180724037e-05, "loss": 4.6284, "step": 34590 }, { "epoch": 0.708434869862593, "grad_norm": 0.274140864610672, "learning_rate": 9.218458332365144e-05, "loss": 4.6356, "step": 34595 }, { "epoch": 0.7085372596399976, "grad_norm": 0.2797831594944, "learning_rate": 9.212747468439117e-05, "loss": 4.6264, "step": 34600 }, { "epoch": 0.7086396494174022, "grad_norm": 0.27547451853752136, "learning_rate": 9.207037589918353e-05, "loss": 4.6597, "step": 34605 }, { "epoch": 0.7087420391948068, "grad_norm": 0.2748309373855591, "learning_rate": 9.20132869777508e-05, "loss": 4.6356, "step": 34610 }, { "epoch": 0.7088444289722114, "grad_norm": 0.2922535538673401, "learning_rate": 9.195620792981362e-05, "loss": 4.6006, "step": 34615 }, { "epoch": 0.7089468187496161, "grad_norm": 0.27265986800193787, "learning_rate": 9.189913876509102e-05, "loss": 4.6749, "step": 34620 }, { "epoch": 0.7090492085270207, "grad_norm": 0.26631078124046326, "learning_rate": 9.184207949330012e-05, "loss": 4.6095, "step": 34625 }, { "epoch": 0.7091515983044253, "grad_norm": 0.2759389579296112, "learning_rate": 9.178503012415658e-05, "loss": 4.5712, "step": 34630 }, { "epoch": 0.7092539880818299, "grad_norm": 0.2792028784751892, "learning_rate": 9.172799066737429e-05, "loss": 4.6401, "step": 34635 }, { "epoch": 0.7093563778592346, "grad_norm": 0.2706100642681122, "learning_rate": 9.167096113266541e-05, "loss": 4.6037, "step": 34640 }, { "epoch": 0.7094587676366392, "grad_norm": 0.27416983246803284, "learning_rate": 9.16139415297405e-05, "loss": 4.5819, "step": 34645 }, { "epoch": 0.7095611574140438, "grad_norm": 0.28448888659477234, "learning_rate": 9.155693186830839e-05, "loss": 4.6489, "step": 34650 }, { "epoch": 0.7096635471914484, "grad_norm": 0.27139759063720703, "learning_rate": 9.149993215807619e-05, "loss": 4.6525, "step": 34655 }, { "epoch": 0.709765936968853, "grad_norm": 0.2676253616809845, "learning_rate": 9.144294240874933e-05, "loss": 4.66, "step": 34660 }, { "epoch": 0.7098683267462577, "grad_norm": 0.2800234854221344, "learning_rate": 9.138596263003153e-05, "loss": 4.6374, "step": 34665 }, { "epoch": 0.7099707165236623, "grad_norm": 0.2750464081764221, "learning_rate": 9.132899283162498e-05, "loss": 4.643, "step": 34670 }, { "epoch": 0.7100731063010669, "grad_norm": 0.3019641041755676, "learning_rate": 9.127203302322981e-05, "loss": 4.6366, "step": 34675 }, { "epoch": 0.7101754960784715, "grad_norm": 0.39563891291618347, "learning_rate": 9.121508321454478e-05, "loss": 4.6529, "step": 34680 }, { "epoch": 0.7102778858558761, "grad_norm": 0.279168039560318, "learning_rate": 9.115814341526683e-05, "loss": 4.5976, "step": 34685 }, { "epoch": 0.7103802756332808, "grad_norm": 0.2793745994567871, "learning_rate": 9.110121363509114e-05, "loss": 4.5832, "step": 34690 }, { "epoch": 0.7104826654106854, "grad_norm": 0.264337956905365, "learning_rate": 9.104429388371129e-05, "loss": 4.6195, "step": 34695 }, { "epoch": 0.71058505518809, "grad_norm": 0.27568039298057556, "learning_rate": 9.098738417081911e-05, "loss": 4.6585, "step": 34700 }, { "epoch": 0.7106874449654946, "grad_norm": 0.31890881061553955, "learning_rate": 9.093048450610467e-05, "loss": 4.6064, "step": 34705 }, { "epoch": 0.7107898347428993, "grad_norm": 0.2892616391181946, "learning_rate": 9.08735948992564e-05, "loss": 4.5892, "step": 34710 }, { "epoch": 0.7108922245203039, "grad_norm": 0.28401005268096924, "learning_rate": 9.081671535996104e-05, "loss": 4.5841, "step": 34715 }, { "epoch": 0.7109946142977085, "grad_norm": 0.27535170316696167, "learning_rate": 9.075984589790344e-05, "loss": 4.6577, "step": 34720 }, { "epoch": 0.7110970040751131, "grad_norm": 0.272706538438797, "learning_rate": 9.070298652276693e-05, "loss": 4.5934, "step": 34725 }, { "epoch": 0.7111993938525177, "grad_norm": 0.26996663212776184, "learning_rate": 9.064613724423308e-05, "loss": 4.6322, "step": 34730 }, { "epoch": 0.7113017836299224, "grad_norm": 0.30381351709365845, "learning_rate": 9.058929807198166e-05, "loss": 4.5868, "step": 34735 }, { "epoch": 0.711404173407327, "grad_norm": 0.2938435971736908, "learning_rate": 9.053246901569081e-05, "loss": 4.6256, "step": 34740 }, { "epoch": 0.7115065631847316, "grad_norm": 0.2768428325653076, "learning_rate": 9.047565008503689e-05, "loss": 4.5957, "step": 34745 }, { "epoch": 0.7116089529621362, "grad_norm": 0.3060249388217926, "learning_rate": 9.041884128969455e-05, "loss": 4.5915, "step": 34750 }, { "epoch": 0.7117113427395408, "grad_norm": 0.273645281791687, "learning_rate": 9.036204263933672e-05, "loss": 4.6285, "step": 34755 }, { "epoch": 0.7118137325169455, "grad_norm": 0.2825133502483368, "learning_rate": 9.030525414363468e-05, "loss": 4.6175, "step": 34760 }, { "epoch": 0.7119161222943501, "grad_norm": 0.26800164580345154, "learning_rate": 9.024847581225777e-05, "loss": 4.6281, "step": 34765 }, { "epoch": 0.7120185120717547, "grad_norm": 0.2778143584728241, "learning_rate": 9.019170765487379e-05, "loss": 4.6035, "step": 34770 }, { "epoch": 0.7121209018491593, "grad_norm": 0.26916933059692383, "learning_rate": 9.013494968114879e-05, "loss": 4.6818, "step": 34775 }, { "epoch": 0.7122232916265641, "grad_norm": 0.2809462547302246, "learning_rate": 9.007820190074699e-05, "loss": 4.4804, "step": 34780 }, { "epoch": 0.7123256814039687, "grad_norm": 0.3131517469882965, "learning_rate": 9.002146432333095e-05, "loss": 4.6172, "step": 34785 }, { "epoch": 0.7124280711813733, "grad_norm": 0.29829517006874084, "learning_rate": 8.996473695856151e-05, "loss": 4.6227, "step": 34790 }, { "epoch": 0.7125304609587779, "grad_norm": 0.2847922742366791, "learning_rate": 8.990801981609765e-05, "loss": 4.5335, "step": 34795 }, { "epoch": 0.7126328507361825, "grad_norm": 0.2825530171394348, "learning_rate": 8.985131290559676e-05, "loss": 4.5997, "step": 34800 }, { "epoch": 0.7127352405135872, "grad_norm": 0.27248820662498474, "learning_rate": 8.979461623671445e-05, "loss": 4.6408, "step": 34805 }, { "epoch": 0.7128376302909918, "grad_norm": 0.3130579888820648, "learning_rate": 8.973792981910445e-05, "loss": 4.6386, "step": 34810 }, { "epoch": 0.7129400200683964, "grad_norm": 0.26889511942863464, "learning_rate": 8.968125366241895e-05, "loss": 4.6432, "step": 34815 }, { "epoch": 0.713042409845801, "grad_norm": 0.2863365113735199, "learning_rate": 8.962458777630824e-05, "loss": 4.5673, "step": 34820 }, { "epoch": 0.7131447996232056, "grad_norm": 0.2696453034877777, "learning_rate": 8.956793217042095e-05, "loss": 4.615, "step": 34825 }, { "epoch": 0.7132471894006103, "grad_norm": 0.28150179982185364, "learning_rate": 8.95112868544039e-05, "loss": 4.5932, "step": 34830 }, { "epoch": 0.7133495791780149, "grad_norm": 0.287253737449646, "learning_rate": 8.945465183790219e-05, "loss": 4.495, "step": 34835 }, { "epoch": 0.7134519689554195, "grad_norm": 0.2708670496940613, "learning_rate": 8.93980271305592e-05, "loss": 4.6332, "step": 34840 }, { "epoch": 0.7135543587328241, "grad_norm": 0.30999526381492615, "learning_rate": 8.934141274201645e-05, "loss": 4.5616, "step": 34845 }, { "epoch": 0.7136567485102288, "grad_norm": 0.2686518132686615, "learning_rate": 8.92848086819138e-05, "loss": 4.6334, "step": 34850 }, { "epoch": 0.7137591382876334, "grad_norm": 0.30306416749954224, "learning_rate": 8.922821495988936e-05, "loss": 4.5848, "step": 34855 }, { "epoch": 0.713861528065038, "grad_norm": 0.33884480595588684, "learning_rate": 8.917163158557935e-05, "loss": 4.5813, "step": 34860 }, { "epoch": 0.7139639178424426, "grad_norm": 0.2970154881477356, "learning_rate": 8.911505856861836e-05, "loss": 4.6241, "step": 34865 }, { "epoch": 0.7140663076198472, "grad_norm": 0.27281296253204346, "learning_rate": 8.90584959186392e-05, "loss": 4.6425, "step": 34870 }, { "epoch": 0.7141686973972519, "grad_norm": 0.28088513016700745, "learning_rate": 8.900194364527283e-05, "loss": 4.6148, "step": 34875 }, { "epoch": 0.7142710871746565, "grad_norm": 0.3154902458190918, "learning_rate": 8.894540175814854e-05, "loss": 4.6612, "step": 34880 }, { "epoch": 0.7143734769520611, "grad_norm": 0.27968519926071167, "learning_rate": 8.888887026689386e-05, "loss": 4.5898, "step": 34885 }, { "epoch": 0.7144758667294657, "grad_norm": 0.26756688952445984, "learning_rate": 8.883234918113436e-05, "loss": 4.5878, "step": 34890 }, { "epoch": 0.7145782565068703, "grad_norm": 0.28028222918510437, "learning_rate": 8.877583851049411e-05, "loss": 4.6771, "step": 34895 }, { "epoch": 0.714680646284275, "grad_norm": 0.2732871174812317, "learning_rate": 8.871933826459528e-05, "loss": 4.5864, "step": 34900 }, { "epoch": 0.7147830360616796, "grad_norm": 0.3130098581314087, "learning_rate": 8.866284845305818e-05, "loss": 4.6858, "step": 34905 }, { "epoch": 0.7148854258390842, "grad_norm": 0.2788325846195221, "learning_rate": 8.860636908550144e-05, "loss": 4.6588, "step": 34910 }, { "epoch": 0.7149878156164888, "grad_norm": 0.2613585889339447, "learning_rate": 8.854990017154197e-05, "loss": 4.5716, "step": 34915 }, { "epoch": 0.7150902053938935, "grad_norm": 0.2701241374015808, "learning_rate": 8.849344172079473e-05, "loss": 4.6231, "step": 34920 }, { "epoch": 0.7151925951712981, "grad_norm": 0.3269418179988861, "learning_rate": 8.843699374287307e-05, "loss": 4.621, "step": 34925 }, { "epoch": 0.7152949849487027, "grad_norm": 0.31243759393692017, "learning_rate": 8.83805562473885e-05, "loss": 4.6237, "step": 34930 }, { "epoch": 0.7153973747261073, "grad_norm": 0.29419705271720886, "learning_rate": 8.832412924395061e-05, "loss": 4.6194, "step": 34935 }, { "epoch": 0.7154997645035119, "grad_norm": 0.2717318534851074, "learning_rate": 8.826771274216741e-05, "loss": 4.5815, "step": 34940 }, { "epoch": 0.7156021542809166, "grad_norm": 0.27413737773895264, "learning_rate": 8.82113067516451e-05, "loss": 4.6183, "step": 34945 }, { "epoch": 0.7157045440583212, "grad_norm": 0.2781221270561218, "learning_rate": 8.815491128198787e-05, "loss": 4.6954, "step": 34950 }, { "epoch": 0.7158069338357258, "grad_norm": 0.28618308901786804, "learning_rate": 8.809852634279835e-05, "loss": 4.6715, "step": 34955 }, { "epoch": 0.7159093236131304, "grad_norm": 0.27211809158325195, "learning_rate": 8.804215194367728e-05, "loss": 4.5879, "step": 34960 }, { "epoch": 0.716011713390535, "grad_norm": 0.300509512424469, "learning_rate": 8.798578809422367e-05, "loss": 4.6073, "step": 34965 }, { "epoch": 0.7161141031679397, "grad_norm": 0.2684769332408905, "learning_rate": 8.792943480403463e-05, "loss": 4.5932, "step": 34970 }, { "epoch": 0.7162164929453443, "grad_norm": 0.28665891289711, "learning_rate": 8.787309208270556e-05, "loss": 4.5766, "step": 34975 }, { "epoch": 0.7163188827227489, "grad_norm": 0.29640817642211914, "learning_rate": 8.781675993983009e-05, "loss": 4.5813, "step": 34980 }, { "epoch": 0.7164212725001536, "grad_norm": 0.27339333295822144, "learning_rate": 8.776043838499982e-05, "loss": 4.6273, "step": 34985 }, { "epoch": 0.7165236622775583, "grad_norm": 0.27096307277679443, "learning_rate": 8.770412742780487e-05, "loss": 4.6263, "step": 34990 }, { "epoch": 0.7166260520549629, "grad_norm": 0.2680208086967468, "learning_rate": 8.76478270778334e-05, "loss": 4.7049, "step": 34995 }, { "epoch": 0.7167284418323675, "grad_norm": 0.28621193766593933, "learning_rate": 8.759153734467166e-05, "loss": 4.6445, "step": 35000 }, { "epoch": 0.7167284418323675, "eval_loss": 4.63367223739624, "eval_runtime": 17.1299, "eval_samples_per_second": 33.976, "eval_steps_per_second": 4.262, "step": 35000 }, { "epoch": 0.7168308316097721, "grad_norm": 0.3055669963359833, "learning_rate": 8.753525823790423e-05, "loss": 4.6183, "step": 35005 }, { "epoch": 0.7169332213871767, "grad_norm": 0.2594819664955139, "learning_rate": 8.747898976711399e-05, "loss": 4.6443, "step": 35010 }, { "epoch": 0.7170356111645814, "grad_norm": 0.26975128054618835, "learning_rate": 8.742273194188165e-05, "loss": 4.6332, "step": 35015 }, { "epoch": 0.717138000941986, "grad_norm": 0.2627929151058197, "learning_rate": 8.736648477178645e-05, "loss": 4.654, "step": 35020 }, { "epoch": 0.7172403907193906, "grad_norm": 0.2685806453227997, "learning_rate": 8.73102482664057e-05, "loss": 4.5743, "step": 35025 }, { "epoch": 0.7173427804967952, "grad_norm": 0.27302104234695435, "learning_rate": 8.725402243531477e-05, "loss": 4.6513, "step": 35030 }, { "epoch": 0.7174451702741999, "grad_norm": 0.2837328612804413, "learning_rate": 8.719780728808749e-05, "loss": 4.6101, "step": 35035 }, { "epoch": 0.7175475600516045, "grad_norm": 0.27872976660728455, "learning_rate": 8.714160283429561e-05, "loss": 4.6854, "step": 35040 }, { "epoch": 0.7176499498290091, "grad_norm": 0.284140944480896, "learning_rate": 8.708540908350913e-05, "loss": 4.6184, "step": 35045 }, { "epoch": 0.7177523396064137, "grad_norm": 0.27065593004226685, "learning_rate": 8.702922604529637e-05, "loss": 4.6628, "step": 35050 }, { "epoch": 0.7178547293838183, "grad_norm": 0.2855406701564789, "learning_rate": 8.697305372922363e-05, "loss": 4.5895, "step": 35055 }, { "epoch": 0.717957119161223, "grad_norm": 0.284546434879303, "learning_rate": 8.691689214485547e-05, "loss": 4.6093, "step": 35060 }, { "epoch": 0.7180595089386276, "grad_norm": 0.27335232496261597, "learning_rate": 8.686074130175456e-05, "loss": 4.6557, "step": 35065 }, { "epoch": 0.7181618987160322, "grad_norm": 0.27810123562812805, "learning_rate": 8.680460120948193e-05, "loss": 4.6233, "step": 35070 }, { "epoch": 0.7182642884934368, "grad_norm": 0.2812967002391815, "learning_rate": 8.674847187759658e-05, "loss": 4.6, "step": 35075 }, { "epoch": 0.7183666782708414, "grad_norm": 0.27613675594329834, "learning_rate": 8.669235331565569e-05, "loss": 4.568, "step": 35080 }, { "epoch": 0.7184690680482461, "grad_norm": 0.28897684812545776, "learning_rate": 8.663624553321477e-05, "loss": 4.5767, "step": 35085 }, { "epoch": 0.7185714578256507, "grad_norm": 0.2786373794078827, "learning_rate": 8.658014853982736e-05, "loss": 4.5984, "step": 35090 }, { "epoch": 0.7186738476030553, "grad_norm": 0.2863420844078064, "learning_rate": 8.652406234504512e-05, "loss": 4.6394, "step": 35095 }, { "epoch": 0.7187762373804599, "grad_norm": 0.29722583293914795, "learning_rate": 8.6467986958418e-05, "loss": 4.6108, "step": 35100 }, { "epoch": 0.7188786271578645, "grad_norm": 0.28767991065979004, "learning_rate": 8.641192238949406e-05, "loss": 4.6258, "step": 35105 }, { "epoch": 0.7189810169352692, "grad_norm": 0.27794715762138367, "learning_rate": 8.635586864781948e-05, "loss": 4.6028, "step": 35110 }, { "epoch": 0.7190834067126738, "grad_norm": 0.2857860326766968, "learning_rate": 8.629982574293858e-05, "loss": 4.6554, "step": 35115 }, { "epoch": 0.7191857964900784, "grad_norm": 0.2754746079444885, "learning_rate": 8.624379368439399e-05, "loss": 4.6117, "step": 35120 }, { "epoch": 0.719288186267483, "grad_norm": 0.29061755537986755, "learning_rate": 8.618777248172632e-05, "loss": 4.5841, "step": 35125 }, { "epoch": 0.7193905760448877, "grad_norm": 0.31932756304740906, "learning_rate": 8.613176214447435e-05, "loss": 4.6827, "step": 35130 }, { "epoch": 0.7194929658222923, "grad_norm": 0.28680405020713806, "learning_rate": 8.607576268217515e-05, "loss": 4.5784, "step": 35135 }, { "epoch": 0.7195953555996969, "grad_norm": 0.2824251055717468, "learning_rate": 8.601977410436381e-05, "loss": 4.5641, "step": 35140 }, { "epoch": 0.7196977453771015, "grad_norm": 0.2622056305408478, "learning_rate": 8.596379642057355e-05, "loss": 4.6209, "step": 35145 }, { "epoch": 0.7198001351545061, "grad_norm": 0.28085857629776, "learning_rate": 8.59078296403359e-05, "loss": 4.5532, "step": 35150 }, { "epoch": 0.7199025249319108, "grad_norm": 0.2631789743900299, "learning_rate": 8.585187377318025e-05, "loss": 4.6004, "step": 35155 }, { "epoch": 0.7200049147093154, "grad_norm": 0.2821058928966522, "learning_rate": 8.579592882863444e-05, "loss": 4.6997, "step": 35160 }, { "epoch": 0.72010730448672, "grad_norm": 0.29496118426322937, "learning_rate": 8.573999481622427e-05, "loss": 4.618, "step": 35165 }, { "epoch": 0.7202096942641246, "grad_norm": 0.30153515934944153, "learning_rate": 8.568407174547363e-05, "loss": 4.5909, "step": 35170 }, { "epoch": 0.7203120840415292, "grad_norm": 0.2813776433467865, "learning_rate": 8.562815962590481e-05, "loss": 4.5943, "step": 35175 }, { "epoch": 0.7204144738189339, "grad_norm": 0.29179972410202026, "learning_rate": 8.557225846703795e-05, "loss": 4.6066, "step": 35180 }, { "epoch": 0.7205168635963386, "grad_norm": 0.274844229221344, "learning_rate": 8.551636827839137e-05, "loss": 4.6262, "step": 35185 }, { "epoch": 0.7206192533737432, "grad_norm": 0.2772488296031952, "learning_rate": 8.546048906948176e-05, "loss": 4.6229, "step": 35190 }, { "epoch": 0.7207216431511478, "grad_norm": 0.26569491624832153, "learning_rate": 8.540462084982366e-05, "loss": 4.5865, "step": 35195 }, { "epoch": 0.7208240329285525, "grad_norm": 0.25651228427886963, "learning_rate": 8.534876362892985e-05, "loss": 4.6133, "step": 35200 }, { "epoch": 0.7209264227059571, "grad_norm": 0.2695825397968292, "learning_rate": 8.52929174163112e-05, "loss": 4.6394, "step": 35205 }, { "epoch": 0.7210288124833617, "grad_norm": 0.2850077152252197, "learning_rate": 8.523708222147684e-05, "loss": 4.6254, "step": 35210 }, { "epoch": 0.7211312022607663, "grad_norm": 0.2739093601703644, "learning_rate": 8.518125805393386e-05, "loss": 4.6471, "step": 35215 }, { "epoch": 0.721233592038171, "grad_norm": 0.43531596660614014, "learning_rate": 8.512544492318748e-05, "loss": 4.582, "step": 35220 }, { "epoch": 0.7213359818155756, "grad_norm": 0.27666881680488586, "learning_rate": 8.506964283874124e-05, "loss": 4.6576, "step": 35225 }, { "epoch": 0.7214383715929802, "grad_norm": 0.2962600886821747, "learning_rate": 8.501385181009657e-05, "loss": 4.7266, "step": 35230 }, { "epoch": 0.7215407613703848, "grad_norm": 0.2809500992298126, "learning_rate": 8.495807184675303e-05, "loss": 4.5417, "step": 35235 }, { "epoch": 0.7216431511477894, "grad_norm": 0.27637147903442383, "learning_rate": 8.49023029582085e-05, "loss": 4.6507, "step": 35240 }, { "epoch": 0.721745540925194, "grad_norm": 0.28118348121643066, "learning_rate": 8.48465451539588e-05, "loss": 4.6487, "step": 35245 }, { "epoch": 0.7218479307025987, "grad_norm": 0.27437835931777954, "learning_rate": 8.47907984434979e-05, "loss": 4.5611, "step": 35250 }, { "epoch": 0.7219503204800033, "grad_norm": 0.27034005522727966, "learning_rate": 8.473506283631781e-05, "loss": 4.5887, "step": 35255 }, { "epoch": 0.7220527102574079, "grad_norm": 0.2759099006652832, "learning_rate": 8.467933834190885e-05, "loss": 4.6269, "step": 35260 }, { "epoch": 0.7221551000348125, "grad_norm": 0.44436919689178467, "learning_rate": 8.462362496975926e-05, "loss": 4.5557, "step": 35265 }, { "epoch": 0.7222574898122172, "grad_norm": 0.27312707901000977, "learning_rate": 8.456792272935541e-05, "loss": 4.6757, "step": 35270 }, { "epoch": 0.7223598795896218, "grad_norm": 0.30309104919433594, "learning_rate": 8.451223163018192e-05, "loss": 4.5831, "step": 35275 }, { "epoch": 0.7224622693670264, "grad_norm": 0.2631854712963104, "learning_rate": 8.445655168172136e-05, "loss": 4.6387, "step": 35280 }, { "epoch": 0.722564659144431, "grad_norm": 0.30008718371391296, "learning_rate": 8.440088289345438e-05, "loss": 4.5845, "step": 35285 }, { "epoch": 0.7226670489218356, "grad_norm": 0.287648469209671, "learning_rate": 8.434522527486e-05, "loss": 4.646, "step": 35290 }, { "epoch": 0.7227694386992403, "grad_norm": 0.277083158493042, "learning_rate": 8.428957883541483e-05, "loss": 4.65, "step": 35295 }, { "epoch": 0.7228718284766449, "grad_norm": 0.26783913373947144, "learning_rate": 8.423394358459412e-05, "loss": 4.6158, "step": 35300 }, { "epoch": 0.7229742182540495, "grad_norm": 0.2731381058692932, "learning_rate": 8.417831953187093e-05, "loss": 4.6213, "step": 35305 }, { "epoch": 0.7230766080314541, "grad_norm": 0.2617172598838806, "learning_rate": 8.412270668671637e-05, "loss": 4.6631, "step": 35310 }, { "epoch": 0.7231789978088587, "grad_norm": 0.27751821279525757, "learning_rate": 8.406710505859985e-05, "loss": 4.6043, "step": 35315 }, { "epoch": 0.7232813875862634, "grad_norm": 0.2831033170223236, "learning_rate": 8.40115146569887e-05, "loss": 4.6168, "step": 35320 }, { "epoch": 0.723383777363668, "grad_norm": 0.26409485936164856, "learning_rate": 8.395593549134836e-05, "loss": 4.6152, "step": 35325 }, { "epoch": 0.7234861671410726, "grad_norm": 0.2689374089241028, "learning_rate": 8.390036757114246e-05, "loss": 4.6712, "step": 35330 }, { "epoch": 0.7235885569184772, "grad_norm": 0.30068737268447876, "learning_rate": 8.384481090583263e-05, "loss": 4.6699, "step": 35335 }, { "epoch": 0.7236909466958819, "grad_norm": 0.27237340807914734, "learning_rate": 8.378926550487855e-05, "loss": 4.6071, "step": 35340 }, { "epoch": 0.7237933364732865, "grad_norm": 0.27116408944129944, "learning_rate": 8.3733731377738e-05, "loss": 4.6052, "step": 35345 }, { "epoch": 0.7238957262506911, "grad_norm": 0.26923108100891113, "learning_rate": 8.3678208533867e-05, "loss": 4.6141, "step": 35350 }, { "epoch": 0.7239981160280957, "grad_norm": 0.27835944294929504, "learning_rate": 8.362269698271943e-05, "loss": 4.5921, "step": 35355 }, { "epoch": 0.7241005058055003, "grad_norm": 0.28105825185775757, "learning_rate": 8.356719673374729e-05, "loss": 4.6204, "step": 35360 }, { "epoch": 0.724202895582905, "grad_norm": 0.28705254197120667, "learning_rate": 8.351170779640087e-05, "loss": 4.6037, "step": 35365 }, { "epoch": 0.7243052853603096, "grad_norm": 0.2829848527908325, "learning_rate": 8.345623018012814e-05, "loss": 4.5822, "step": 35370 }, { "epoch": 0.7244076751377142, "grad_norm": 0.2970631420612335, "learning_rate": 8.340076389437548e-05, "loss": 4.6239, "step": 35375 }, { "epoch": 0.7245100649151188, "grad_norm": 0.27193063497543335, "learning_rate": 8.33453089485873e-05, "loss": 4.6157, "step": 35380 }, { "epoch": 0.7246124546925234, "grad_norm": 0.35156217217445374, "learning_rate": 8.328986535220593e-05, "loss": 4.562, "step": 35385 }, { "epoch": 0.7247148444699282, "grad_norm": 0.26313531398773193, "learning_rate": 8.323443311467184e-05, "loss": 4.6291, "step": 35390 }, { "epoch": 0.7248172342473328, "grad_norm": 0.28509581089019775, "learning_rate": 8.317901224542354e-05, "loss": 4.5954, "step": 35395 }, { "epoch": 0.7249196240247374, "grad_norm": 0.27369093894958496, "learning_rate": 8.312360275389775e-05, "loss": 4.6007, "step": 35400 }, { "epoch": 0.725022013802142, "grad_norm": 0.2805980443954468, "learning_rate": 8.306820464952902e-05, "loss": 4.5833, "step": 35405 }, { "epoch": 0.7251244035795467, "grad_norm": 0.2740568518638611, "learning_rate": 8.301281794175011e-05, "loss": 4.6279, "step": 35410 }, { "epoch": 0.7252267933569513, "grad_norm": 0.2633547782897949, "learning_rate": 8.295744263999185e-05, "loss": 4.6034, "step": 35415 }, { "epoch": 0.7253291831343559, "grad_norm": 0.3053842782974243, "learning_rate": 8.290207875368307e-05, "loss": 4.5261, "step": 35420 }, { "epoch": 0.7254315729117605, "grad_norm": 0.2739124596118927, "learning_rate": 8.284672629225061e-05, "loss": 4.5951, "step": 35425 }, { "epoch": 0.7255339626891651, "grad_norm": 0.27901989221572876, "learning_rate": 8.279138526511959e-05, "loss": 4.6457, "step": 35430 }, { "epoch": 0.7256363524665698, "grad_norm": 0.27331113815307617, "learning_rate": 8.273605568171278e-05, "loss": 4.6224, "step": 35435 }, { "epoch": 0.7257387422439744, "grad_norm": 0.312152624130249, "learning_rate": 8.268073755145145e-05, "loss": 4.6285, "step": 35440 }, { "epoch": 0.725841132021379, "grad_norm": 0.28793373703956604, "learning_rate": 8.26254308837546e-05, "loss": 4.6206, "step": 35445 }, { "epoch": 0.7259435217987836, "grad_norm": 0.30180874466896057, "learning_rate": 8.25701356880394e-05, "loss": 4.6198, "step": 35450 }, { "epoch": 0.7260459115761883, "grad_norm": 0.27114638686180115, "learning_rate": 8.25148519737211e-05, "loss": 4.6382, "step": 35455 }, { "epoch": 0.7261483013535929, "grad_norm": 0.2920796275138855, "learning_rate": 8.245957975021293e-05, "loss": 4.6466, "step": 35460 }, { "epoch": 0.7262506911309975, "grad_norm": 0.42375123500823975, "learning_rate": 8.240431902692615e-05, "loss": 4.5568, "step": 35465 }, { "epoch": 0.7263530809084021, "grad_norm": 0.26363077759742737, "learning_rate": 8.234906981327018e-05, "loss": 4.6527, "step": 35470 }, { "epoch": 0.7264554706858067, "grad_norm": 0.26325905323028564, "learning_rate": 8.229383211865234e-05, "loss": 4.5654, "step": 35475 }, { "epoch": 0.7265578604632114, "grad_norm": 0.27255550026893616, "learning_rate": 8.223860595247806e-05, "loss": 4.6126, "step": 35480 }, { "epoch": 0.726660250240616, "grad_norm": 0.2707035541534424, "learning_rate": 8.218339132415072e-05, "loss": 4.5827, "step": 35485 }, { "epoch": 0.7267626400180206, "grad_norm": 0.2907780706882477, "learning_rate": 8.212818824307192e-05, "loss": 4.5935, "step": 35490 }, { "epoch": 0.7268650297954252, "grad_norm": 0.27819907665252686, "learning_rate": 8.207299671864117e-05, "loss": 4.6256, "step": 35495 }, { "epoch": 0.7269674195728298, "grad_norm": 0.2932546138763428, "learning_rate": 8.201781676025589e-05, "loss": 4.6447, "step": 35500 }, { "epoch": 0.7270698093502345, "grad_norm": 0.2775144577026367, "learning_rate": 8.196264837731191e-05, "loss": 4.6417, "step": 35505 }, { "epoch": 0.7271721991276391, "grad_norm": 0.2768743634223938, "learning_rate": 8.190749157920257e-05, "loss": 4.6247, "step": 35510 }, { "epoch": 0.7272745889050437, "grad_norm": 0.28260377049446106, "learning_rate": 8.185234637531965e-05, "loss": 4.6204, "step": 35515 }, { "epoch": 0.7273769786824483, "grad_norm": 0.28319257497787476, "learning_rate": 8.179721277505283e-05, "loss": 4.6336, "step": 35520 }, { "epoch": 0.727479368459853, "grad_norm": 0.2869589924812317, "learning_rate": 8.174209078778982e-05, "loss": 4.6033, "step": 35525 }, { "epoch": 0.7275817582372576, "grad_norm": 0.27521735429763794, "learning_rate": 8.168698042291626e-05, "loss": 4.5958, "step": 35530 }, { "epoch": 0.7276841480146622, "grad_norm": 0.2832813560962677, "learning_rate": 8.163188168981587e-05, "loss": 4.6506, "step": 35535 }, { "epoch": 0.7277865377920668, "grad_norm": 0.28442350029945374, "learning_rate": 8.157679459787051e-05, "loss": 4.544, "step": 35540 }, { "epoch": 0.7278889275694714, "grad_norm": 0.2788909375667572, "learning_rate": 8.15217191564599e-05, "loss": 4.5901, "step": 35545 }, { "epoch": 0.727991317346876, "grad_norm": 0.2873976528644562, "learning_rate": 8.146665537496173e-05, "loss": 4.6138, "step": 35550 }, { "epoch": 0.7280937071242807, "grad_norm": 0.2732342481613159, "learning_rate": 8.141160326275205e-05, "loss": 4.591, "step": 35555 }, { "epoch": 0.7281960969016853, "grad_norm": 0.268839031457901, "learning_rate": 8.135656282920437e-05, "loss": 4.6524, "step": 35560 }, { "epoch": 0.7282984866790899, "grad_norm": 0.3491988778114319, "learning_rate": 8.130153408369068e-05, "loss": 4.5966, "step": 35565 }, { "epoch": 0.7284008764564945, "grad_norm": 0.7563338875770569, "learning_rate": 8.124651703558092e-05, "loss": 4.5912, "step": 35570 }, { "epoch": 0.7285032662338992, "grad_norm": 0.2689594626426697, "learning_rate": 8.119151169424271e-05, "loss": 4.6556, "step": 35575 }, { "epoch": 0.7286056560113038, "grad_norm": 0.27166950702667236, "learning_rate": 8.113651806904207e-05, "loss": 4.5771, "step": 35580 }, { "epoch": 0.7287080457887084, "grad_norm": 0.2878290116786957, "learning_rate": 8.108153616934279e-05, "loss": 4.6205, "step": 35585 }, { "epoch": 0.728810435566113, "grad_norm": 0.2817739248275757, "learning_rate": 8.10265660045067e-05, "loss": 4.5326, "step": 35590 }, { "epoch": 0.7289128253435178, "grad_norm": 0.26659125089645386, "learning_rate": 8.097160758389375e-05, "loss": 4.6098, "step": 35595 }, { "epoch": 0.7290152151209224, "grad_norm": 0.26286768913269043, "learning_rate": 8.091666091686175e-05, "loss": 4.6222, "step": 35600 }, { "epoch": 0.729117604898327, "grad_norm": 0.266060471534729, "learning_rate": 8.086172601276653e-05, "loss": 4.6103, "step": 35605 }, { "epoch": 0.7292199946757316, "grad_norm": 0.3146880865097046, "learning_rate": 8.080680288096204e-05, "loss": 4.6348, "step": 35610 }, { "epoch": 0.7293223844531362, "grad_norm": 0.2808399498462677, "learning_rate": 8.075189153080008e-05, "loss": 4.594, "step": 35615 }, { "epoch": 0.7294247742305409, "grad_norm": 0.287859171628952, "learning_rate": 8.06969919716305e-05, "loss": 4.6327, "step": 35620 }, { "epoch": 0.7295271640079455, "grad_norm": 0.27413034439086914, "learning_rate": 8.064210421280107e-05, "loss": 4.6109, "step": 35625 }, { "epoch": 0.7296295537853501, "grad_norm": 0.27877911925315857, "learning_rate": 8.058722826365776e-05, "loss": 4.5684, "step": 35630 }, { "epoch": 0.7297319435627547, "grad_norm": 0.26801979541778564, "learning_rate": 8.053236413354434e-05, "loss": 4.5709, "step": 35635 }, { "epoch": 0.7298343333401593, "grad_norm": 0.29206186532974243, "learning_rate": 8.047751183180252e-05, "loss": 4.5712, "step": 35640 }, { "epoch": 0.729936723117564, "grad_norm": 0.35441872477531433, "learning_rate": 8.042267136777231e-05, "loss": 4.5272, "step": 35645 }, { "epoch": 0.7300391128949686, "grad_norm": 0.2733302116394043, "learning_rate": 8.036784275079123e-05, "loss": 4.6058, "step": 35650 }, { "epoch": 0.7301415026723732, "grad_norm": 0.28338730335235596, "learning_rate": 8.031302599019516e-05, "loss": 4.6214, "step": 35655 }, { "epoch": 0.7302438924497778, "grad_norm": 0.2677144706249237, "learning_rate": 8.025822109531792e-05, "loss": 4.6192, "step": 35660 }, { "epoch": 0.7303462822271825, "grad_norm": 0.2829006016254425, "learning_rate": 8.020342807549113e-05, "loss": 4.6536, "step": 35665 }, { "epoch": 0.7304486720045871, "grad_norm": 0.2853788137435913, "learning_rate": 8.014864694004455e-05, "loss": 4.6486, "step": 35670 }, { "epoch": 0.7305510617819917, "grad_norm": 0.2703281044960022, "learning_rate": 8.009387769830575e-05, "loss": 4.6321, "step": 35675 }, { "epoch": 0.7306534515593963, "grad_norm": 0.3253069221973419, "learning_rate": 8.003912035960051e-05, "loss": 4.5723, "step": 35680 }, { "epoch": 0.7307558413368009, "grad_norm": 0.2917252779006958, "learning_rate": 7.998437493325241e-05, "loss": 4.6112, "step": 35685 }, { "epoch": 0.7308582311142056, "grad_norm": 0.2790091037750244, "learning_rate": 7.992964142858299e-05, "loss": 4.6069, "step": 35690 }, { "epoch": 0.7309606208916102, "grad_norm": 0.2863832712173462, "learning_rate": 7.987491985491197e-05, "loss": 4.6444, "step": 35695 }, { "epoch": 0.7310630106690148, "grad_norm": 0.27816665172576904, "learning_rate": 7.982021022155668e-05, "loss": 4.6117, "step": 35700 }, { "epoch": 0.7311654004464194, "grad_norm": 0.28824666142463684, "learning_rate": 7.97655125378327e-05, "loss": 4.6057, "step": 35705 }, { "epoch": 0.731267790223824, "grad_norm": 0.30592820048332214, "learning_rate": 7.971082681305363e-05, "loss": 4.5846, "step": 35710 }, { "epoch": 0.7313701800012287, "grad_norm": 0.27617231011390686, "learning_rate": 7.965615305653069e-05, "loss": 4.5765, "step": 35715 }, { "epoch": 0.7314725697786333, "grad_norm": 0.26955723762512207, "learning_rate": 7.960149127757342e-05, "loss": 4.6057, "step": 35720 }, { "epoch": 0.7315749595560379, "grad_norm": 0.27496469020843506, "learning_rate": 7.954684148548912e-05, "loss": 4.6161, "step": 35725 }, { "epoch": 0.7316773493334425, "grad_norm": 0.26673826575279236, "learning_rate": 7.949220368958304e-05, "loss": 4.5882, "step": 35730 }, { "epoch": 0.7317797391108471, "grad_norm": 0.27812322974205017, "learning_rate": 7.943757789915861e-05, "loss": 4.6123, "step": 35735 }, { "epoch": 0.7318821288882518, "grad_norm": 0.2739894688129425, "learning_rate": 7.938296412351692e-05, "loss": 4.6414, "step": 35740 }, { "epoch": 0.7319845186656564, "grad_norm": 0.28313854336738586, "learning_rate": 7.932836237195723e-05, "loss": 4.5572, "step": 35745 }, { "epoch": 0.732086908443061, "grad_norm": 0.2770625948905945, "learning_rate": 7.927377265377656e-05, "loss": 4.5784, "step": 35750 }, { "epoch": 0.7321892982204656, "grad_norm": 0.27078017592430115, "learning_rate": 7.921919497827012e-05, "loss": 4.6036, "step": 35755 }, { "epoch": 0.7322916879978703, "grad_norm": 0.28478506207466125, "learning_rate": 7.916462935473089e-05, "loss": 4.6117, "step": 35760 }, { "epoch": 0.7323940777752749, "grad_norm": 0.27009883522987366, "learning_rate": 7.911007579244978e-05, "loss": 4.5985, "step": 35765 }, { "epoch": 0.7324964675526795, "grad_norm": 0.2889600396156311, "learning_rate": 7.905553430071589e-05, "loss": 4.619, "step": 35770 }, { "epoch": 0.7325988573300841, "grad_norm": 0.27302753925323486, "learning_rate": 7.900100488881598e-05, "loss": 4.6127, "step": 35775 }, { "epoch": 0.7327012471074887, "grad_norm": 0.27285054326057434, "learning_rate": 7.89464875660348e-05, "loss": 4.6208, "step": 35780 }, { "epoch": 0.7328036368848934, "grad_norm": 0.2659536600112915, "learning_rate": 7.889198234165529e-05, "loss": 4.6039, "step": 35785 }, { "epoch": 0.732906026662298, "grad_norm": 0.2752051055431366, "learning_rate": 7.883748922495794e-05, "loss": 4.5993, "step": 35790 }, { "epoch": 0.7330084164397027, "grad_norm": 0.2768678069114685, "learning_rate": 7.87830082252215e-05, "loss": 4.5965, "step": 35795 }, { "epoch": 0.7331108062171073, "grad_norm": 0.2679316997528076, "learning_rate": 7.872853935172255e-05, "loss": 4.6243, "step": 35800 }, { "epoch": 0.733213195994512, "grad_norm": 0.2648327350616455, "learning_rate": 7.86740826137356e-05, "loss": 4.6027, "step": 35805 }, { "epoch": 0.7333155857719166, "grad_norm": 0.2949255108833313, "learning_rate": 7.861963802053305e-05, "loss": 4.6423, "step": 35810 }, { "epoch": 0.7334179755493212, "grad_norm": 0.29031598567962646, "learning_rate": 7.856520558138523e-05, "loss": 4.5904, "step": 35815 }, { "epoch": 0.7335203653267258, "grad_norm": 0.4511967599391937, "learning_rate": 7.851078530556055e-05, "loss": 4.6195, "step": 35820 }, { "epoch": 0.7336227551041304, "grad_norm": 0.3028342127799988, "learning_rate": 7.845637720232522e-05, "loss": 4.6299, "step": 35825 }, { "epoch": 0.7337251448815351, "grad_norm": 0.2744481861591339, "learning_rate": 7.84019812809433e-05, "loss": 4.5955, "step": 35830 }, { "epoch": 0.7338275346589397, "grad_norm": 0.2706994116306305, "learning_rate": 7.834759755067704e-05, "loss": 4.5803, "step": 35835 }, { "epoch": 0.7339299244363443, "grad_norm": 0.2765248119831085, "learning_rate": 7.829322602078627e-05, "loss": 4.5825, "step": 35840 }, { "epoch": 0.7340323142137489, "grad_norm": 0.2680410146713257, "learning_rate": 7.8238866700529e-05, "loss": 4.5614, "step": 35845 }, { "epoch": 0.7341347039911535, "grad_norm": 0.29486092925071716, "learning_rate": 7.818451959916122e-05, "loss": 4.6369, "step": 35850 }, { "epoch": 0.7342370937685582, "grad_norm": 0.3106747269630432, "learning_rate": 7.813018472593645e-05, "loss": 4.6024, "step": 35855 }, { "epoch": 0.7343394835459628, "grad_norm": 0.27558979392051697, "learning_rate": 7.807586209010655e-05, "loss": 4.5633, "step": 35860 }, { "epoch": 0.7344418733233674, "grad_norm": 0.27722612023353577, "learning_rate": 7.802155170092112e-05, "loss": 4.6287, "step": 35865 }, { "epoch": 0.734544263100772, "grad_norm": 0.2866869866847992, "learning_rate": 7.796725356762754e-05, "loss": 4.5713, "step": 35870 }, { "epoch": 0.7346466528781767, "grad_norm": 0.2874201238155365, "learning_rate": 7.791296769947144e-05, "loss": 4.6416, "step": 35875 }, { "epoch": 0.7347490426555813, "grad_norm": 0.27711090445518494, "learning_rate": 7.785869410569605e-05, "loss": 4.6211, "step": 35880 }, { "epoch": 0.7348514324329859, "grad_norm": 0.26898038387298584, "learning_rate": 7.780443279554268e-05, "loss": 4.6596, "step": 35885 }, { "epoch": 0.7349538222103905, "grad_norm": 0.2773260176181793, "learning_rate": 7.775018377825041e-05, "loss": 4.6504, "step": 35890 }, { "epoch": 0.7350562119877951, "grad_norm": 0.2722721993923187, "learning_rate": 7.76959470630564e-05, "loss": 4.588, "step": 35895 }, { "epoch": 0.7351586017651998, "grad_norm": 0.28141921758651733, "learning_rate": 7.764172265919561e-05, "loss": 4.6155, "step": 35900 }, { "epoch": 0.7352609915426044, "grad_norm": 0.27787846326828003, "learning_rate": 7.758751057590084e-05, "loss": 4.6119, "step": 35905 }, { "epoch": 0.735363381320009, "grad_norm": 0.26910316944122314, "learning_rate": 7.753331082240302e-05, "loss": 4.64, "step": 35910 }, { "epoch": 0.7354657710974136, "grad_norm": 0.26209381222724915, "learning_rate": 7.747912340793073e-05, "loss": 4.6252, "step": 35915 }, { "epoch": 0.7355681608748182, "grad_norm": 0.2831920087337494, "learning_rate": 7.742494834171055e-05, "loss": 4.6328, "step": 35920 }, { "epoch": 0.7356705506522229, "grad_norm": 0.28854113817214966, "learning_rate": 7.737078563296706e-05, "loss": 4.5855, "step": 35925 }, { "epoch": 0.7357729404296275, "grad_norm": 0.2855021059513092, "learning_rate": 7.731663529092248e-05, "loss": 4.6104, "step": 35930 }, { "epoch": 0.7358753302070321, "grad_norm": 0.3063238561153412, "learning_rate": 7.72624973247972e-05, "loss": 4.5534, "step": 35935 }, { "epoch": 0.7359777199844367, "grad_norm": 0.3179500997066498, "learning_rate": 7.72083717438093e-05, "loss": 4.6236, "step": 35940 }, { "epoch": 0.7360801097618414, "grad_norm": 0.27991044521331787, "learning_rate": 7.715425855717492e-05, "loss": 4.6413, "step": 35945 }, { "epoch": 0.736182499539246, "grad_norm": 0.27822068333625793, "learning_rate": 7.710015777410798e-05, "loss": 4.6782, "step": 35950 }, { "epoch": 0.7362848893166506, "grad_norm": 0.27126359939575195, "learning_rate": 7.704606940382024e-05, "loss": 4.662, "step": 35955 }, { "epoch": 0.7363872790940552, "grad_norm": 0.2927936017513275, "learning_rate": 7.699199345552151e-05, "loss": 4.5777, "step": 35960 }, { "epoch": 0.7364896688714598, "grad_norm": 0.27688297629356384, "learning_rate": 7.693792993841936e-05, "loss": 4.6065, "step": 35965 }, { "epoch": 0.7365920586488645, "grad_norm": 0.26905474066734314, "learning_rate": 7.688387886171924e-05, "loss": 4.6138, "step": 35970 }, { "epoch": 0.7366944484262691, "grad_norm": 0.26644816994667053, "learning_rate": 7.682984023462463e-05, "loss": 4.6613, "step": 35975 }, { "epoch": 0.7367968382036737, "grad_norm": 0.2702935039997101, "learning_rate": 7.677581406633661e-05, "loss": 4.622, "step": 35980 }, { "epoch": 0.7368992279810783, "grad_norm": 0.2773968279361725, "learning_rate": 7.672180036605437e-05, "loss": 4.5575, "step": 35985 }, { "epoch": 0.7370016177584829, "grad_norm": 0.2763274908065796, "learning_rate": 7.666779914297508e-05, "loss": 4.6339, "step": 35990 }, { "epoch": 0.7371040075358876, "grad_norm": 0.28777623176574707, "learning_rate": 7.661381040629338e-05, "loss": 4.6024, "step": 35995 }, { "epoch": 0.7372063973132923, "grad_norm": 0.3007139265537262, "learning_rate": 7.655983416520213e-05, "loss": 4.5967, "step": 36000 }, { "epoch": 0.7372063973132923, "eval_loss": 4.625242233276367, "eval_runtime": 17.131, "eval_samples_per_second": 33.974, "eval_steps_per_second": 4.261, "step": 36000 }, { "epoch": 0.7373087870906969, "grad_norm": 0.3373059332370758, "learning_rate": 7.6505870428892e-05, "loss": 4.6184, "step": 36005 }, { "epoch": 0.7374111768681015, "grad_norm": 0.28736239671707153, "learning_rate": 7.645191920655133e-05, "loss": 4.6159, "step": 36010 }, { "epoch": 0.7375135666455062, "grad_norm": 0.26335597038269043, "learning_rate": 7.639798050736667e-05, "loss": 4.6496, "step": 36015 }, { "epoch": 0.7376159564229108, "grad_norm": 0.3809347152709961, "learning_rate": 7.634405434052217e-05, "loss": 4.5739, "step": 36020 }, { "epoch": 0.7377183462003154, "grad_norm": 0.27838921546936035, "learning_rate": 7.629014071519994e-05, "loss": 4.6096, "step": 36025 }, { "epoch": 0.73782073597772, "grad_norm": 0.38840600848197937, "learning_rate": 7.623623964057987e-05, "loss": 4.5863, "step": 36030 }, { "epoch": 0.7379231257551246, "grad_norm": 0.267988920211792, "learning_rate": 7.618235112583991e-05, "loss": 4.5714, "step": 36035 }, { "epoch": 0.7380255155325293, "grad_norm": 0.26880499720573425, "learning_rate": 7.61284751801557e-05, "loss": 4.6244, "step": 36040 }, { "epoch": 0.7381279053099339, "grad_norm": 0.282042920589447, "learning_rate": 7.607461181270073e-05, "loss": 4.6137, "step": 36045 }, { "epoch": 0.7382302950873385, "grad_norm": 0.28289055824279785, "learning_rate": 7.60207610326465e-05, "loss": 4.6429, "step": 36050 }, { "epoch": 0.7383326848647431, "grad_norm": 0.2695794999599457, "learning_rate": 7.596692284916223e-05, "loss": 4.6405, "step": 36055 }, { "epoch": 0.7384350746421477, "grad_norm": 0.28726011514663696, "learning_rate": 7.5913097271415e-05, "loss": 4.6271, "step": 36060 }, { "epoch": 0.7385374644195524, "grad_norm": 0.26992538571357727, "learning_rate": 7.585928430856991e-05, "loss": 4.6412, "step": 36065 }, { "epoch": 0.738639854196957, "grad_norm": 0.2706857919692993, "learning_rate": 7.580548396978962e-05, "loss": 4.6197, "step": 36070 }, { "epoch": 0.7387422439743616, "grad_norm": 0.3034684956073761, "learning_rate": 7.575169626423491e-05, "loss": 4.6319, "step": 36075 }, { "epoch": 0.7388446337517662, "grad_norm": 0.27745088934898376, "learning_rate": 7.56979212010642e-05, "loss": 4.5837, "step": 36080 }, { "epoch": 0.7389470235291709, "grad_norm": 0.2573217451572418, "learning_rate": 7.564415878943402e-05, "loss": 4.6254, "step": 36085 }, { "epoch": 0.7390494133065755, "grad_norm": 0.28609785437583923, "learning_rate": 7.559040903849848e-05, "loss": 4.5843, "step": 36090 }, { "epoch": 0.7391518030839801, "grad_norm": 0.286507785320282, "learning_rate": 7.553667195740963e-05, "loss": 4.6006, "step": 36095 }, { "epoch": 0.7392541928613847, "grad_norm": 0.5472362041473389, "learning_rate": 7.548294755531743e-05, "loss": 4.6578, "step": 36100 }, { "epoch": 0.7393565826387893, "grad_norm": 0.2746458947658539, "learning_rate": 7.542923584136963e-05, "loss": 4.5932, "step": 36105 }, { "epoch": 0.739458972416194, "grad_norm": 0.28057464957237244, "learning_rate": 7.53755368247117e-05, "loss": 4.5992, "step": 36110 }, { "epoch": 0.7395613621935986, "grad_norm": 0.3029986321926117, "learning_rate": 7.532185051448728e-05, "loss": 4.6165, "step": 36115 }, { "epoch": 0.7396637519710032, "grad_norm": 0.2978089451789856, "learning_rate": 7.526817691983738e-05, "loss": 4.6251, "step": 36120 }, { "epoch": 0.7397661417484078, "grad_norm": 0.2777213156223297, "learning_rate": 7.521451604990127e-05, "loss": 4.581, "step": 36125 }, { "epoch": 0.7398685315258124, "grad_norm": 0.27353715896606445, "learning_rate": 7.516086791381579e-05, "loss": 4.6552, "step": 36130 }, { "epoch": 0.7399709213032171, "grad_norm": 0.28740575909614563, "learning_rate": 7.510723252071569e-05, "loss": 4.6224, "step": 36135 }, { "epoch": 0.7400733110806217, "grad_norm": 0.307476282119751, "learning_rate": 7.505360987973366e-05, "loss": 4.6363, "step": 36140 }, { "epoch": 0.7401757008580263, "grad_norm": 0.29532861709594727, "learning_rate": 7.500000000000002e-05, "loss": 4.6364, "step": 36145 }, { "epoch": 0.7402780906354309, "grad_norm": 0.3010016083717346, "learning_rate": 7.4946402890643e-05, "loss": 4.6357, "step": 36150 }, { "epoch": 0.7403804804128356, "grad_norm": 0.3083871901035309, "learning_rate": 7.489281856078879e-05, "loss": 4.6255, "step": 36155 }, { "epoch": 0.7404828701902402, "grad_norm": 0.3375720679759979, "learning_rate": 7.48392470195612e-05, "loss": 4.5509, "step": 36160 }, { "epoch": 0.7405852599676448, "grad_norm": 0.27293142676353455, "learning_rate": 7.478568827608194e-05, "loss": 4.6333, "step": 36165 }, { "epoch": 0.7406876497450494, "grad_norm": 0.27576547861099243, "learning_rate": 7.473214233947053e-05, "loss": 4.6031, "step": 36170 }, { "epoch": 0.740790039522454, "grad_norm": 0.292348176240921, "learning_rate": 7.46786092188444e-05, "loss": 4.6116, "step": 36175 }, { "epoch": 0.7408924292998587, "grad_norm": 0.28747648000717163, "learning_rate": 7.462508892331871e-05, "loss": 4.592, "step": 36180 }, { "epoch": 0.7409948190772633, "grad_norm": 0.2726137340068817, "learning_rate": 7.457158146200635e-05, "loss": 4.6099, "step": 36185 }, { "epoch": 0.7410972088546679, "grad_norm": 0.27743956446647644, "learning_rate": 7.451808684401827e-05, "loss": 4.5679, "step": 36190 }, { "epoch": 0.7411995986320725, "grad_norm": 0.273608922958374, "learning_rate": 7.446460507846303e-05, "loss": 4.6028, "step": 36195 }, { "epoch": 0.7413019884094773, "grad_norm": 0.30002596974372864, "learning_rate": 7.441113617444698e-05, "loss": 4.5459, "step": 36200 }, { "epoch": 0.7414043781868819, "grad_norm": 0.29475438594818115, "learning_rate": 7.435768014107457e-05, "loss": 4.6182, "step": 36205 }, { "epoch": 0.7415067679642865, "grad_norm": 0.26289767026901245, "learning_rate": 7.430423698744758e-05, "loss": 4.6427, "step": 36210 }, { "epoch": 0.7416091577416911, "grad_norm": 0.2837486267089844, "learning_rate": 7.425080672266606e-05, "loss": 4.5846, "step": 36215 }, { "epoch": 0.7417115475190957, "grad_norm": 0.29650506377220154, "learning_rate": 7.419738935582756e-05, "loss": 4.6243, "step": 36220 }, { "epoch": 0.7418139372965004, "grad_norm": 0.2735283374786377, "learning_rate": 7.414398489602765e-05, "loss": 4.6259, "step": 36225 }, { "epoch": 0.741916327073905, "grad_norm": 0.26285502314567566, "learning_rate": 7.409059335235955e-05, "loss": 4.5776, "step": 36230 }, { "epoch": 0.7420187168513096, "grad_norm": 0.2663193643093109, "learning_rate": 7.403721473391423e-05, "loss": 4.6158, "step": 36235 }, { "epoch": 0.7421211066287142, "grad_norm": 0.31137824058532715, "learning_rate": 7.398384904978075e-05, "loss": 4.6326, "step": 36240 }, { "epoch": 0.7422234964061188, "grad_norm": 0.2721947133541107, "learning_rate": 7.393049630904564e-05, "loss": 4.5878, "step": 36245 }, { "epoch": 0.7423258861835235, "grad_norm": 0.3060890734195709, "learning_rate": 7.387715652079335e-05, "loss": 4.6295, "step": 36250 }, { "epoch": 0.7424282759609281, "grad_norm": 0.2630886137485504, "learning_rate": 7.382382969410631e-05, "loss": 4.5788, "step": 36255 }, { "epoch": 0.7425306657383327, "grad_norm": 0.274039626121521, "learning_rate": 7.377051583806431e-05, "loss": 4.6033, "step": 36260 }, { "epoch": 0.7426330555157373, "grad_norm": 0.3136425018310547, "learning_rate": 7.371721496174538e-05, "loss": 4.6146, "step": 36265 }, { "epoch": 0.742735445293142, "grad_norm": 0.27234914898872375, "learning_rate": 7.36639270742251e-05, "loss": 4.6283, "step": 36270 }, { "epoch": 0.7428378350705466, "grad_norm": 0.27690574526786804, "learning_rate": 7.36106521845768e-05, "loss": 4.6369, "step": 36275 }, { "epoch": 0.7429402248479512, "grad_norm": 0.2764059603214264, "learning_rate": 7.355739030187184e-05, "loss": 4.6023, "step": 36280 }, { "epoch": 0.7430426146253558, "grad_norm": 0.27220767736434937, "learning_rate": 7.350414143517914e-05, "loss": 4.5813, "step": 36285 }, { "epoch": 0.7431450044027604, "grad_norm": 0.2708590626716614, "learning_rate": 7.34509055935654e-05, "loss": 4.6038, "step": 36290 }, { "epoch": 0.743247394180165, "grad_norm": 0.2739364504814148, "learning_rate": 7.339768278609531e-05, "loss": 4.6233, "step": 36295 }, { "epoch": 0.7433497839575697, "grad_norm": 0.2813643217086792, "learning_rate": 7.334447302183114e-05, "loss": 4.5869, "step": 36300 }, { "epoch": 0.7434521737349743, "grad_norm": 0.2877044081687927, "learning_rate": 7.329127630983302e-05, "loss": 4.6042, "step": 36305 }, { "epoch": 0.7435545635123789, "grad_norm": 0.3283853530883789, "learning_rate": 7.323809265915877e-05, "loss": 4.5801, "step": 36310 }, { "epoch": 0.7436569532897835, "grad_norm": 0.27724689245224, "learning_rate": 7.318492207886417e-05, "loss": 4.6033, "step": 36315 }, { "epoch": 0.7437593430671882, "grad_norm": 0.27958622574806213, "learning_rate": 7.313176457800264e-05, "loss": 4.6044, "step": 36320 }, { "epoch": 0.7438617328445928, "grad_norm": 0.27751678228378296, "learning_rate": 7.30786201656253e-05, "loss": 4.6069, "step": 36325 }, { "epoch": 0.7439641226219974, "grad_norm": 0.3063555955886841, "learning_rate": 7.302548885078128e-05, "loss": 4.5691, "step": 36330 }, { "epoch": 0.744066512399402, "grad_norm": 0.29780712723731995, "learning_rate": 7.297237064251728e-05, "loss": 4.6259, "step": 36335 }, { "epoch": 0.7441689021768066, "grad_norm": 0.2759324610233307, "learning_rate": 7.291926554987775e-05, "loss": 4.5851, "step": 36340 }, { "epoch": 0.7442712919542113, "grad_norm": 0.28593888878822327, "learning_rate": 7.286617358190519e-05, "loss": 4.5673, "step": 36345 }, { "epoch": 0.7443736817316159, "grad_norm": 0.2743288278579712, "learning_rate": 7.281309474763939e-05, "loss": 4.6147, "step": 36350 }, { "epoch": 0.7444760715090205, "grad_norm": 0.3501961827278137, "learning_rate": 7.276002905611837e-05, "loss": 4.6164, "step": 36355 }, { "epoch": 0.7445784612864251, "grad_norm": 0.3279406428337097, "learning_rate": 7.27069765163776e-05, "loss": 4.5896, "step": 36360 }, { "epoch": 0.7446808510638298, "grad_norm": 0.2953859567642212, "learning_rate": 7.265393713745052e-05, "loss": 4.6175, "step": 36365 }, { "epoch": 0.7447832408412344, "grad_norm": 0.3140943646430969, "learning_rate": 7.26009109283682e-05, "loss": 4.5764, "step": 36370 }, { "epoch": 0.744885630618639, "grad_norm": 0.2758210003376007, "learning_rate": 7.254789789815944e-05, "loss": 4.6308, "step": 36375 }, { "epoch": 0.7449880203960436, "grad_norm": 0.27534064650535583, "learning_rate": 7.249489805585096e-05, "loss": 4.628, "step": 36380 }, { "epoch": 0.7450904101734482, "grad_norm": 0.30455341935157776, "learning_rate": 7.24419114104671e-05, "loss": 4.5605, "step": 36385 }, { "epoch": 0.7451927999508529, "grad_norm": 0.27069637179374695, "learning_rate": 7.238893797102989e-05, "loss": 4.6278, "step": 36390 }, { "epoch": 0.7452951897282575, "grad_norm": 0.30609190464019775, "learning_rate": 7.233597774655942e-05, "loss": 4.6255, "step": 36395 }, { "epoch": 0.7453975795056621, "grad_norm": 0.6586396098136902, "learning_rate": 7.228303074607307e-05, "loss": 4.6865, "step": 36400 }, { "epoch": 0.7454999692830668, "grad_norm": 0.2929151654243469, "learning_rate": 7.223009697858638e-05, "loss": 4.663, "step": 36405 }, { "epoch": 0.7456023590604715, "grad_norm": 0.284393846988678, "learning_rate": 7.217717645311243e-05, "loss": 4.6215, "step": 36410 }, { "epoch": 0.7457047488378761, "grad_norm": 0.2875935435295105, "learning_rate": 7.2124269178662e-05, "loss": 4.6083, "step": 36415 }, { "epoch": 0.7458071386152807, "grad_norm": 0.2916814684867859, "learning_rate": 7.207137516424385e-05, "loss": 4.5626, "step": 36420 }, { "epoch": 0.7459095283926853, "grad_norm": 0.2665678858757019, "learning_rate": 7.201849441886426e-05, "loss": 4.6012, "step": 36425 }, { "epoch": 0.7460119181700899, "grad_norm": 0.2775857150554657, "learning_rate": 7.196562695152727e-05, "loss": 4.6326, "step": 36430 }, { "epoch": 0.7461143079474946, "grad_norm": 0.2888798415660858, "learning_rate": 7.191277277123483e-05, "loss": 4.6155, "step": 36435 }, { "epoch": 0.7462166977248992, "grad_norm": 0.2871415913105011, "learning_rate": 7.185993188698645e-05, "loss": 4.5875, "step": 36440 }, { "epoch": 0.7463190875023038, "grad_norm": 0.272702157497406, "learning_rate": 7.180710430777943e-05, "loss": 4.5732, "step": 36445 }, { "epoch": 0.7464214772797084, "grad_norm": 0.2818405032157898, "learning_rate": 7.175429004260876e-05, "loss": 4.5388, "step": 36450 }, { "epoch": 0.746523867057113, "grad_norm": 0.28034403920173645, "learning_rate": 7.170148910046734e-05, "loss": 4.6082, "step": 36455 }, { "epoch": 0.7466262568345177, "grad_norm": 0.2826538383960724, "learning_rate": 7.164870149034559e-05, "loss": 4.5642, "step": 36460 }, { "epoch": 0.7467286466119223, "grad_norm": 0.2753027379512787, "learning_rate": 7.159592722123173e-05, "loss": 4.6034, "step": 36465 }, { "epoch": 0.7468310363893269, "grad_norm": 0.27757155895233154, "learning_rate": 7.154316630211179e-05, "loss": 4.5861, "step": 36470 }, { "epoch": 0.7469334261667315, "grad_norm": 0.2987254559993744, "learning_rate": 7.149041874196945e-05, "loss": 4.6495, "step": 36475 }, { "epoch": 0.7470358159441362, "grad_norm": 0.28315460681915283, "learning_rate": 7.143768454978602e-05, "loss": 4.5983, "step": 36480 }, { "epoch": 0.7471382057215408, "grad_norm": 0.2767767310142517, "learning_rate": 7.138496373454085e-05, "loss": 4.6073, "step": 36485 }, { "epoch": 0.7472405954989454, "grad_norm": 0.28215596079826355, "learning_rate": 7.133225630521056e-05, "loss": 4.6359, "step": 36490 }, { "epoch": 0.74734298527635, "grad_norm": 0.2782031297683716, "learning_rate": 7.127956227076992e-05, "loss": 4.6096, "step": 36495 }, { "epoch": 0.7474453750537546, "grad_norm": 0.2703072428703308, "learning_rate": 7.122688164019111e-05, "loss": 4.6411, "step": 36500 }, { "epoch": 0.7475477648311593, "grad_norm": 0.2731671631336212, "learning_rate": 7.117421442244423e-05, "loss": 4.6308, "step": 36505 }, { "epoch": 0.7476501546085639, "grad_norm": 0.29174426198005676, "learning_rate": 7.112156062649701e-05, "loss": 4.6083, "step": 36510 }, { "epoch": 0.7477525443859685, "grad_norm": 0.28482604026794434, "learning_rate": 7.10689202613148e-05, "loss": 4.593, "step": 36515 }, { "epoch": 0.7478549341633731, "grad_norm": 0.2897595763206482, "learning_rate": 7.10162933358609e-05, "loss": 4.5042, "step": 36520 }, { "epoch": 0.7479573239407777, "grad_norm": 0.2659852206707001, "learning_rate": 7.096367985909615e-05, "loss": 4.5957, "step": 36525 }, { "epoch": 0.7480597137181824, "grad_norm": 0.2799854874610901, "learning_rate": 7.091107983997904e-05, "loss": 4.5992, "step": 36530 }, { "epoch": 0.748162103495587, "grad_norm": 0.2943868339061737, "learning_rate": 7.085849328746606e-05, "loss": 4.5843, "step": 36535 }, { "epoch": 0.7482644932729916, "grad_norm": 0.2765563726425171, "learning_rate": 7.080592021051096e-05, "loss": 4.5918, "step": 36540 }, { "epoch": 0.7483668830503962, "grad_norm": 0.2719048857688904, "learning_rate": 7.075336061806565e-05, "loss": 4.6418, "step": 36545 }, { "epoch": 0.7484692728278008, "grad_norm": 0.2761386036872864, "learning_rate": 7.070081451907945e-05, "loss": 4.6334, "step": 36550 }, { "epoch": 0.7485716626052055, "grad_norm": 0.3091813921928406, "learning_rate": 7.064828192249944e-05, "loss": 4.5794, "step": 36555 }, { "epoch": 0.7486740523826101, "grad_norm": 0.27586930990219116, "learning_rate": 7.059576283727054e-05, "loss": 4.6489, "step": 36560 }, { "epoch": 0.7487764421600147, "grad_norm": 0.27593696117401123, "learning_rate": 7.05432572723352e-05, "loss": 4.5955, "step": 36565 }, { "epoch": 0.7488788319374193, "grad_norm": 0.27841171622276306, "learning_rate": 7.049076523663357e-05, "loss": 4.6438, "step": 36570 }, { "epoch": 0.748981221714824, "grad_norm": 0.27219220995903015, "learning_rate": 7.04382867391037e-05, "loss": 4.6261, "step": 36575 }, { "epoch": 0.7490836114922286, "grad_norm": 0.27954012155532837, "learning_rate": 7.03858217886811e-05, "loss": 4.5801, "step": 36580 }, { "epoch": 0.7491860012696332, "grad_norm": 0.2719559073448181, "learning_rate": 7.03333703942991e-05, "loss": 4.5984, "step": 36585 }, { "epoch": 0.7492883910470378, "grad_norm": 0.345603883266449, "learning_rate": 7.02809325648886e-05, "loss": 4.5991, "step": 36590 }, { "epoch": 0.7493907808244424, "grad_norm": 0.2791408896446228, "learning_rate": 7.022850830937841e-05, "loss": 4.5987, "step": 36595 }, { "epoch": 0.7494931706018471, "grad_norm": 0.26974305510520935, "learning_rate": 7.017609763669484e-05, "loss": 4.5859, "step": 36600 }, { "epoch": 0.7495955603792518, "grad_norm": 0.3174270689487457, "learning_rate": 7.012370055576187e-05, "loss": 4.5971, "step": 36605 }, { "epoch": 0.7496979501566564, "grad_norm": 0.29159948229789734, "learning_rate": 7.007131707550143e-05, "loss": 4.6908, "step": 36610 }, { "epoch": 0.749800339934061, "grad_norm": 0.5217298865318298, "learning_rate": 7.00189472048327e-05, "loss": 4.5889, "step": 36615 }, { "epoch": 0.7499027297114657, "grad_norm": 0.2673492431640625, "learning_rate": 6.996659095267292e-05, "loss": 4.5983, "step": 36620 }, { "epoch": 0.7500051194888703, "grad_norm": 0.26948046684265137, "learning_rate": 6.991424832793696e-05, "loss": 4.6268, "step": 36625 }, { "epoch": 0.7501075092662749, "grad_norm": 0.2933151125907898, "learning_rate": 6.98619193395371e-05, "loss": 4.6163, "step": 36630 }, { "epoch": 0.7502098990436795, "grad_norm": 0.2934931814670563, "learning_rate": 6.980960399638361e-05, "loss": 4.6159, "step": 36635 }, { "epoch": 0.7503122888210841, "grad_norm": 0.2906499207019806, "learning_rate": 6.975730230738425e-05, "loss": 4.6167, "step": 36640 }, { "epoch": 0.7504146785984888, "grad_norm": 0.2920824885368347, "learning_rate": 6.970501428144457e-05, "loss": 4.5922, "step": 36645 }, { "epoch": 0.7505170683758934, "grad_norm": 0.27768781781196594, "learning_rate": 6.965273992746774e-05, "loss": 4.578, "step": 36650 }, { "epoch": 0.750619458153298, "grad_norm": 0.269756555557251, "learning_rate": 6.960047925435453e-05, "loss": 4.6133, "step": 36655 }, { "epoch": 0.7507218479307026, "grad_norm": 0.2730586528778076, "learning_rate": 6.954823227100353e-05, "loss": 4.5932, "step": 36660 }, { "epoch": 0.7508242377081072, "grad_norm": 0.28651493787765503, "learning_rate": 6.949599898631091e-05, "loss": 4.586, "step": 36665 }, { "epoch": 0.7509266274855119, "grad_norm": 0.3220368027687073, "learning_rate": 6.944377940917044e-05, "loss": 4.5902, "step": 36670 }, { "epoch": 0.7510290172629165, "grad_norm": 0.28390300273895264, "learning_rate": 6.939157354847384e-05, "loss": 4.5908, "step": 36675 }, { "epoch": 0.7511314070403211, "grad_norm": 0.29922792315483093, "learning_rate": 6.933938141311e-05, "loss": 4.6444, "step": 36680 }, { "epoch": 0.7512337968177257, "grad_norm": 0.29864510893821716, "learning_rate": 6.928720301196599e-05, "loss": 4.6025, "step": 36685 }, { "epoch": 0.7513361865951304, "grad_norm": 0.2763952314853668, "learning_rate": 6.923503835392623e-05, "loss": 4.5839, "step": 36690 }, { "epoch": 0.751438576372535, "grad_norm": 0.27281010150909424, "learning_rate": 6.918288744787284e-05, "loss": 4.6022, "step": 36695 }, { "epoch": 0.7515409661499396, "grad_norm": 0.26916012167930603, "learning_rate": 6.913075030268576e-05, "loss": 4.5863, "step": 36700 }, { "epoch": 0.7516433559273442, "grad_norm": 0.27656713128089905, "learning_rate": 6.907862692724238e-05, "loss": 4.6107, "step": 36705 }, { "epoch": 0.7517457457047488, "grad_norm": 0.3377551734447479, "learning_rate": 6.902651733041782e-05, "loss": 4.6203, "step": 36710 }, { "epoch": 0.7518481354821535, "grad_norm": 0.32195934653282166, "learning_rate": 6.897442152108496e-05, "loss": 4.5797, "step": 36715 }, { "epoch": 0.7519505252595581, "grad_norm": 0.28840330243110657, "learning_rate": 6.892233950811418e-05, "loss": 4.5722, "step": 36720 }, { "epoch": 0.7520529150369627, "grad_norm": 0.2755777835845947, "learning_rate": 6.88702713003736e-05, "loss": 4.6154, "step": 36725 }, { "epoch": 0.7521553048143673, "grad_norm": 0.2770107686519623, "learning_rate": 6.881821690672887e-05, "loss": 4.5714, "step": 36730 }, { "epoch": 0.7522576945917719, "grad_norm": 0.34963473677635193, "learning_rate": 6.876617633604352e-05, "loss": 4.6218, "step": 36735 }, { "epoch": 0.7523600843691766, "grad_norm": 0.3187248110771179, "learning_rate": 6.871414959717852e-05, "loss": 4.6303, "step": 36740 }, { "epoch": 0.7524624741465812, "grad_norm": 0.2622198462486267, "learning_rate": 6.86621366989925e-05, "loss": 4.5782, "step": 36745 }, { "epoch": 0.7525648639239858, "grad_norm": 0.27882781624794006, "learning_rate": 6.861013765034194e-05, "loss": 4.544, "step": 36750 }, { "epoch": 0.7526672537013904, "grad_norm": 0.27558428049087524, "learning_rate": 6.855815246008059e-05, "loss": 4.6296, "step": 36755 }, { "epoch": 0.752769643478795, "grad_norm": 0.27447035908699036, "learning_rate": 6.850618113706016e-05, "loss": 4.6102, "step": 36760 }, { "epoch": 0.7528720332561997, "grad_norm": 0.27361446619033813, "learning_rate": 6.845422369013e-05, "loss": 4.5727, "step": 36765 }, { "epoch": 0.7529744230336043, "grad_norm": 0.27006056904792786, "learning_rate": 6.84022801281368e-05, "loss": 4.6457, "step": 36770 }, { "epoch": 0.7530768128110089, "grad_norm": 0.30991798639297485, "learning_rate": 6.835035045992522e-05, "loss": 4.5653, "step": 36775 }, { "epoch": 0.7531792025884135, "grad_norm": 0.2698194682598114, "learning_rate": 6.829843469433732e-05, "loss": 4.6258, "step": 36780 }, { "epoch": 0.7532815923658182, "grad_norm": 0.2809673845767975, "learning_rate": 6.824653284021296e-05, "loss": 4.6027, "step": 36785 }, { "epoch": 0.7533839821432228, "grad_norm": 0.2807193398475647, "learning_rate": 6.819464490638954e-05, "loss": 4.6427, "step": 36790 }, { "epoch": 0.7534863719206274, "grad_norm": 0.29017332196235657, "learning_rate": 6.814277090170204e-05, "loss": 4.6518, "step": 36795 }, { "epoch": 0.753588761698032, "grad_norm": 0.28695717453956604, "learning_rate": 6.809091083498328e-05, "loss": 4.5811, "step": 36800 }, { "epoch": 0.7536911514754366, "grad_norm": 0.27444949746131897, "learning_rate": 6.803906471506336e-05, "loss": 4.6075, "step": 36805 }, { "epoch": 0.7537935412528414, "grad_norm": 0.2674769461154938, "learning_rate": 6.798723255077032e-05, "loss": 4.5853, "step": 36810 }, { "epoch": 0.753895931030246, "grad_norm": 0.29308798909187317, "learning_rate": 6.793541435092981e-05, "loss": 4.631, "step": 36815 }, { "epoch": 0.7539983208076506, "grad_norm": 0.2815810739994049, "learning_rate": 6.78836101243648e-05, "loss": 4.6146, "step": 36820 }, { "epoch": 0.7541007105850552, "grad_norm": 0.29573461413383484, "learning_rate": 6.783181987989621e-05, "loss": 4.6258, "step": 36825 }, { "epoch": 0.7542031003624599, "grad_norm": 0.2775028645992279, "learning_rate": 6.778004362634243e-05, "loss": 4.6229, "step": 36830 }, { "epoch": 0.7543054901398645, "grad_norm": 0.26205408573150635, "learning_rate": 6.772828137251943e-05, "loss": 4.5634, "step": 36835 }, { "epoch": 0.7544078799172691, "grad_norm": 0.26867425441741943, "learning_rate": 6.767653312724096e-05, "loss": 4.6122, "step": 36840 }, { "epoch": 0.7545102696946737, "grad_norm": 0.2727378308773041, "learning_rate": 6.762479889931823e-05, "loss": 4.5925, "step": 36845 }, { "epoch": 0.7546126594720783, "grad_norm": 0.28438353538513184, "learning_rate": 6.757307869756005e-05, "loss": 4.5865, "step": 36850 }, { "epoch": 0.754715049249483, "grad_norm": 0.2814081907272339, "learning_rate": 6.752137253077304e-05, "loss": 4.555, "step": 36855 }, { "epoch": 0.7548174390268876, "grad_norm": 0.32102739810943604, "learning_rate": 6.74696804077612e-05, "loss": 4.6096, "step": 36860 }, { "epoch": 0.7549198288042922, "grad_norm": 0.2817835211753845, "learning_rate": 6.741800233732628e-05, "loss": 4.6303, "step": 36865 }, { "epoch": 0.7550222185816968, "grad_norm": 0.2916010022163391, "learning_rate": 6.73663383282675e-05, "loss": 4.6123, "step": 36870 }, { "epoch": 0.7551246083591014, "grad_norm": 0.27132171392440796, "learning_rate": 6.731468838938193e-05, "loss": 4.5794, "step": 36875 }, { "epoch": 0.7552269981365061, "grad_norm": 0.2867998480796814, "learning_rate": 6.726305252946401e-05, "loss": 4.6391, "step": 36880 }, { "epoch": 0.7553293879139107, "grad_norm": 0.2729617953300476, "learning_rate": 6.72114307573058e-05, "loss": 4.6125, "step": 36885 }, { "epoch": 0.7554317776913153, "grad_norm": 0.2647687494754791, "learning_rate": 6.715982308169721e-05, "loss": 4.6241, "step": 36890 }, { "epoch": 0.7555341674687199, "grad_norm": 0.2753639817237854, "learning_rate": 6.710822951142535e-05, "loss": 4.5655, "step": 36895 }, { "epoch": 0.7556365572461246, "grad_norm": 0.3001912832260132, "learning_rate": 6.705665005527522e-05, "loss": 4.6188, "step": 36900 }, { "epoch": 0.7557389470235292, "grad_norm": 0.26426348090171814, "learning_rate": 6.70050847220295e-05, "loss": 4.6051, "step": 36905 }, { "epoch": 0.7558413368009338, "grad_norm": 0.2681485712528229, "learning_rate": 6.695353352046806e-05, "loss": 4.6461, "step": 36910 }, { "epoch": 0.7559437265783384, "grad_norm": 0.28377512097358704, "learning_rate": 6.690199645936876e-05, "loss": 4.6207, "step": 36915 }, { "epoch": 0.756046116355743, "grad_norm": 0.2729427218437195, "learning_rate": 6.685047354750688e-05, "loss": 4.6266, "step": 36920 }, { "epoch": 0.7561485061331477, "grad_norm": 0.2970889210700989, "learning_rate": 6.679896479365524e-05, "loss": 4.5379, "step": 36925 }, { "epoch": 0.7562508959105523, "grad_norm": 0.2982221245765686, "learning_rate": 6.674747020658443e-05, "loss": 4.6025, "step": 36930 }, { "epoch": 0.7563532856879569, "grad_norm": 0.2927866280078888, "learning_rate": 6.669598979506244e-05, "loss": 4.5632, "step": 36935 }, { "epoch": 0.7564556754653615, "grad_norm": 0.27343738079071045, "learning_rate": 6.664452356785504e-05, "loss": 4.5915, "step": 36940 }, { "epoch": 0.7565580652427661, "grad_norm": 0.283540278673172, "learning_rate": 6.659307153372528e-05, "loss": 4.6048, "step": 36945 }, { "epoch": 0.7566604550201708, "grad_norm": 0.2763478457927704, "learning_rate": 6.654163370143411e-05, "loss": 4.6093, "step": 36950 }, { "epoch": 0.7567628447975754, "grad_norm": 0.2957182228565216, "learning_rate": 6.649021007974002e-05, "loss": 4.5892, "step": 36955 }, { "epoch": 0.75686523457498, "grad_norm": 0.7029393911361694, "learning_rate": 6.643880067739879e-05, "loss": 4.5881, "step": 36960 }, { "epoch": 0.7569676243523846, "grad_norm": 0.2768975794315338, "learning_rate": 6.638740550316416e-05, "loss": 4.5847, "step": 36965 }, { "epoch": 0.7570700141297892, "grad_norm": 0.2871471643447876, "learning_rate": 6.63360245657872e-05, "loss": 4.6242, "step": 36970 }, { "epoch": 0.7571724039071939, "grad_norm": 0.30523785948753357, "learning_rate": 6.628465787401657e-05, "loss": 4.5973, "step": 36975 }, { "epoch": 0.7572747936845985, "grad_norm": 0.27099236845970154, "learning_rate": 6.623330543659869e-05, "loss": 4.5939, "step": 36980 }, { "epoch": 0.7573771834620031, "grad_norm": 0.3290117681026459, "learning_rate": 6.618196726227736e-05, "loss": 4.6173, "step": 36985 }, { "epoch": 0.7574795732394077, "grad_norm": 0.333617240190506, "learning_rate": 6.613064335979403e-05, "loss": 4.5882, "step": 36990 }, { "epoch": 0.7575819630168124, "grad_norm": 0.346046507358551, "learning_rate": 6.607933373788764e-05, "loss": 4.6062, "step": 36995 }, { "epoch": 0.757684352794217, "grad_norm": 0.30597689747810364, "learning_rate": 6.602803840529488e-05, "loss": 4.6224, "step": 37000 }, { "epoch": 0.757684352794217, "eval_loss": 4.618415832519531, "eval_runtime": 27.973, "eval_samples_per_second": 20.806, "eval_steps_per_second": 2.61, "step": 37000 }, { "epoch": 0.7577867425716216, "grad_norm": 0.2713710367679596, "learning_rate": 6.597675737074985e-05, "loss": 4.6253, "step": 37005 }, { "epoch": 0.7578891323490262, "grad_norm": 0.29117444157600403, "learning_rate": 6.592549064298417e-05, "loss": 4.6411, "step": 37010 }, { "epoch": 0.757991522126431, "grad_norm": 0.27179235219955444, "learning_rate": 6.587423823072727e-05, "loss": 4.5576, "step": 37015 }, { "epoch": 0.7580939119038356, "grad_norm": 0.2707149386405945, "learning_rate": 6.582300014270588e-05, "loss": 4.5971, "step": 37020 }, { "epoch": 0.7581963016812402, "grad_norm": 0.29081299901008606, "learning_rate": 6.577177638764439e-05, "loss": 4.5642, "step": 37025 }, { "epoch": 0.7582986914586448, "grad_norm": 0.29647204279899597, "learning_rate": 6.572056697426489e-05, "loss": 4.6166, "step": 37030 }, { "epoch": 0.7584010812360494, "grad_norm": 0.27781417965888977, "learning_rate": 6.566937191128666e-05, "loss": 4.5966, "step": 37035 }, { "epoch": 0.7585034710134541, "grad_norm": 0.3100695312023163, "learning_rate": 6.561819120742691e-05, "loss": 4.6062, "step": 37040 }, { "epoch": 0.7586058607908587, "grad_norm": 0.28153157234191895, "learning_rate": 6.556702487140037e-05, "loss": 4.5566, "step": 37045 }, { "epoch": 0.7587082505682633, "grad_norm": 0.2895149290561676, "learning_rate": 6.551587291191899e-05, "loss": 4.6071, "step": 37050 }, { "epoch": 0.7588106403456679, "grad_norm": 0.29634541273117065, "learning_rate": 6.546473533769266e-05, "loss": 4.6792, "step": 37055 }, { "epoch": 0.7589130301230725, "grad_norm": 0.28362852334976196, "learning_rate": 6.541361215742862e-05, "loss": 4.6141, "step": 37060 }, { "epoch": 0.7590154199004772, "grad_norm": 0.27192139625549316, "learning_rate": 6.536250337983163e-05, "loss": 4.5812, "step": 37065 }, { "epoch": 0.7591178096778818, "grad_norm": 0.27279242873191833, "learning_rate": 6.531140901360421e-05, "loss": 4.594, "step": 37070 }, { "epoch": 0.7592201994552864, "grad_norm": 0.2664951980113983, "learning_rate": 6.526032906744613e-05, "loss": 4.6143, "step": 37075 }, { "epoch": 0.759322589232691, "grad_norm": 0.29349449276924133, "learning_rate": 6.520926355005505e-05, "loss": 4.6169, "step": 37080 }, { "epoch": 0.7594249790100956, "grad_norm": 0.2738834619522095, "learning_rate": 6.515821247012574e-05, "loss": 4.6126, "step": 37085 }, { "epoch": 0.7595273687875003, "grad_norm": 0.2886669933795929, "learning_rate": 6.510717583635089e-05, "loss": 4.6484, "step": 37090 }, { "epoch": 0.7596297585649049, "grad_norm": 0.2747144401073456, "learning_rate": 6.50561536574207e-05, "loss": 4.6175, "step": 37095 }, { "epoch": 0.7597321483423095, "grad_norm": 0.28018927574157715, "learning_rate": 6.500514594202258e-05, "loss": 4.5505, "step": 37100 }, { "epoch": 0.7598345381197141, "grad_norm": 0.3059290945529938, "learning_rate": 6.495415269884187e-05, "loss": 4.6257, "step": 37105 }, { "epoch": 0.7599369278971188, "grad_norm": 0.28672704100608826, "learning_rate": 6.490317393656118e-05, "loss": 4.6077, "step": 37110 }, { "epoch": 0.7600393176745234, "grad_norm": 0.277348130941391, "learning_rate": 6.485220966386074e-05, "loss": 4.6029, "step": 37115 }, { "epoch": 0.760141707451928, "grad_norm": 0.3029509484767914, "learning_rate": 6.480125988941845e-05, "loss": 4.6646, "step": 37120 }, { "epoch": 0.7602440972293326, "grad_norm": 0.26892364025115967, "learning_rate": 6.475032462190949e-05, "loss": 4.6111, "step": 37125 }, { "epoch": 0.7603464870067372, "grad_norm": 0.40442508459091187, "learning_rate": 6.469940387000674e-05, "loss": 4.6424, "step": 37130 }, { "epoch": 0.7604488767841419, "grad_norm": 0.2839796245098114, "learning_rate": 6.46484976423805e-05, "loss": 4.5256, "step": 37135 }, { "epoch": 0.7605512665615465, "grad_norm": 0.2990724742412567, "learning_rate": 6.459760594769879e-05, "loss": 4.5923, "step": 37140 }, { "epoch": 0.7606536563389511, "grad_norm": 0.28737035393714905, "learning_rate": 6.454672879462693e-05, "loss": 4.59, "step": 37145 }, { "epoch": 0.7607560461163557, "grad_norm": 0.28966018557548523, "learning_rate": 6.449586619182783e-05, "loss": 4.5735, "step": 37150 }, { "epoch": 0.7608584358937603, "grad_norm": 0.29240936040878296, "learning_rate": 6.444501814796205e-05, "loss": 4.6054, "step": 37155 }, { "epoch": 0.760960825671165, "grad_norm": 0.3128106892108917, "learning_rate": 6.439418467168754e-05, "loss": 4.5458, "step": 37160 }, { "epoch": 0.7610632154485696, "grad_norm": 0.2720398008823395, "learning_rate": 6.43433657716597e-05, "loss": 4.6539, "step": 37165 }, { "epoch": 0.7611656052259742, "grad_norm": 0.26985418796539307, "learning_rate": 6.429256145653175e-05, "loss": 4.632, "step": 37170 }, { "epoch": 0.7612679950033788, "grad_norm": 0.28882190585136414, "learning_rate": 6.424177173495401e-05, "loss": 4.5902, "step": 37175 }, { "epoch": 0.7613703847807834, "grad_norm": 0.2826944589614868, "learning_rate": 6.419099661557463e-05, "loss": 4.5867, "step": 37180 }, { "epoch": 0.7614727745581881, "grad_norm": 0.26971328258514404, "learning_rate": 6.414023610703928e-05, "loss": 4.5876, "step": 37185 }, { "epoch": 0.7615751643355927, "grad_norm": 0.28219759464263916, "learning_rate": 6.408949021799084e-05, "loss": 4.6054, "step": 37190 }, { "epoch": 0.7616775541129973, "grad_norm": 0.2821265757083893, "learning_rate": 6.403875895707003e-05, "loss": 4.6131, "step": 37195 }, { "epoch": 0.7617799438904019, "grad_norm": 0.2961079478263855, "learning_rate": 6.398804233291492e-05, "loss": 4.6034, "step": 37200 }, { "epoch": 0.7618823336678066, "grad_norm": 0.28093793988227844, "learning_rate": 6.393734035416105e-05, "loss": 4.5963, "step": 37205 }, { "epoch": 0.7619847234452112, "grad_norm": 0.2887776792049408, "learning_rate": 6.388665302944163e-05, "loss": 4.587, "step": 37210 }, { "epoch": 0.7620871132226159, "grad_norm": 0.2903097867965698, "learning_rate": 6.38359803673872e-05, "loss": 4.5828, "step": 37215 }, { "epoch": 0.7621895030000205, "grad_norm": 0.27850037813186646, "learning_rate": 6.3785322376626e-05, "loss": 4.5873, "step": 37220 }, { "epoch": 0.7622918927774252, "grad_norm": 0.28699085116386414, "learning_rate": 6.373467906578347e-05, "loss": 4.5973, "step": 37225 }, { "epoch": 0.7623942825548298, "grad_norm": 0.2638085186481476, "learning_rate": 6.368405044348283e-05, "loss": 4.6501, "step": 37230 }, { "epoch": 0.7624966723322344, "grad_norm": 0.286062628030777, "learning_rate": 6.36334365183448e-05, "loss": 4.6185, "step": 37235 }, { "epoch": 0.762599062109639, "grad_norm": 0.28829097747802734, "learning_rate": 6.35828372989873e-05, "loss": 4.6196, "step": 37240 }, { "epoch": 0.7627014518870436, "grad_norm": 0.29331281781196594, "learning_rate": 6.353225279402607e-05, "loss": 4.5655, "step": 37245 }, { "epoch": 0.7628038416644483, "grad_norm": 0.28903159499168396, "learning_rate": 6.34816830120742e-05, "loss": 4.5723, "step": 37250 }, { "epoch": 0.7629062314418529, "grad_norm": 0.29470354318618774, "learning_rate": 6.343112796174223e-05, "loss": 4.5886, "step": 37255 }, { "epoch": 0.7630086212192575, "grad_norm": 0.30720871686935425, "learning_rate": 6.338058765163838e-05, "loss": 4.5834, "step": 37260 }, { "epoch": 0.7631110109966621, "grad_norm": 0.2884748876094818, "learning_rate": 6.333006209036813e-05, "loss": 4.5836, "step": 37265 }, { "epoch": 0.7632134007740667, "grad_norm": 0.28455010056495667, "learning_rate": 6.32795512865346e-05, "loss": 4.5724, "step": 37270 }, { "epoch": 0.7633157905514714, "grad_norm": 0.2831284701824188, "learning_rate": 6.322905524873827e-05, "loss": 4.5781, "step": 37275 }, { "epoch": 0.763418180328876, "grad_norm": 0.30581367015838623, "learning_rate": 6.31785739855773e-05, "loss": 4.6296, "step": 37280 }, { "epoch": 0.7635205701062806, "grad_norm": 0.3325393497943878, "learning_rate": 6.312810750564718e-05, "loss": 4.5549, "step": 37285 }, { "epoch": 0.7636229598836852, "grad_norm": 0.26691895723342896, "learning_rate": 6.307765581754085e-05, "loss": 4.6191, "step": 37290 }, { "epoch": 0.7637253496610898, "grad_norm": 0.27299752831459045, "learning_rate": 6.30272189298489e-05, "loss": 4.5532, "step": 37295 }, { "epoch": 0.7638277394384945, "grad_norm": 0.2956436276435852, "learning_rate": 6.29767968511593e-05, "loss": 4.5685, "step": 37300 }, { "epoch": 0.7639301292158991, "grad_norm": 0.3262317478656769, "learning_rate": 6.292638959005741e-05, "loss": 4.6277, "step": 37305 }, { "epoch": 0.7640325189933037, "grad_norm": 0.2778327465057373, "learning_rate": 6.287599715512632e-05, "loss": 4.5751, "step": 37310 }, { "epoch": 0.7641349087707083, "grad_norm": 0.28934070467948914, "learning_rate": 6.282561955494623e-05, "loss": 4.6262, "step": 37315 }, { "epoch": 0.764237298548113, "grad_norm": 0.27689051628112793, "learning_rate": 6.277525679809518e-05, "loss": 4.6306, "step": 37320 }, { "epoch": 0.7643396883255176, "grad_norm": 0.2804756760597229, "learning_rate": 6.272490889314847e-05, "loss": 4.5508, "step": 37325 }, { "epoch": 0.7644420781029222, "grad_norm": 0.2770512104034424, "learning_rate": 6.267457584867887e-05, "loss": 4.5737, "step": 37330 }, { "epoch": 0.7645444678803268, "grad_norm": 0.27728188037872314, "learning_rate": 6.262425767325677e-05, "loss": 4.5765, "step": 37335 }, { "epoch": 0.7646468576577314, "grad_norm": 0.2826690077781677, "learning_rate": 6.257395437544988e-05, "loss": 4.6062, "step": 37340 }, { "epoch": 0.7647492474351361, "grad_norm": 0.2677403390407562, "learning_rate": 6.252366596382338e-05, "loss": 4.5737, "step": 37345 }, { "epoch": 0.7648516372125407, "grad_norm": 0.29870080947875977, "learning_rate": 6.247339244694007e-05, "loss": 4.6348, "step": 37350 }, { "epoch": 0.7649540269899453, "grad_norm": 0.27728724479675293, "learning_rate": 6.242313383336e-05, "loss": 4.5664, "step": 37355 }, { "epoch": 0.7650564167673499, "grad_norm": 0.27288126945495605, "learning_rate": 6.237289013164093e-05, "loss": 4.6004, "step": 37360 }, { "epoch": 0.7651588065447545, "grad_norm": 0.2839893102645874, "learning_rate": 6.232266135033775e-05, "loss": 4.5854, "step": 37365 }, { "epoch": 0.7652611963221592, "grad_norm": 0.3005150258541107, "learning_rate": 6.227244749800309e-05, "loss": 4.5727, "step": 37370 }, { "epoch": 0.7653635860995638, "grad_norm": 0.2766365706920624, "learning_rate": 6.222224858318706e-05, "loss": 4.5729, "step": 37375 }, { "epoch": 0.7654659758769684, "grad_norm": 0.28654855489730835, "learning_rate": 6.217206461443692e-05, "loss": 4.5589, "step": 37380 }, { "epoch": 0.765568365654373, "grad_norm": 0.2733350694179535, "learning_rate": 6.212189560029772e-05, "loss": 4.5948, "step": 37385 }, { "epoch": 0.7656707554317776, "grad_norm": 0.27922144532203674, "learning_rate": 6.207174154931177e-05, "loss": 4.574, "step": 37390 }, { "epoch": 0.7657731452091823, "grad_norm": 0.27702051401138306, "learning_rate": 6.202160247001883e-05, "loss": 4.6048, "step": 37395 }, { "epoch": 0.7658755349865869, "grad_norm": 0.2795740067958832, "learning_rate": 6.197147837095627e-05, "loss": 4.6568, "step": 37400 }, { "epoch": 0.7659779247639915, "grad_norm": 0.28041133284568787, "learning_rate": 6.192136926065876e-05, "loss": 4.5447, "step": 37405 }, { "epoch": 0.7660803145413961, "grad_norm": 0.33503907918930054, "learning_rate": 6.187127514765846e-05, "loss": 4.6238, "step": 37410 }, { "epoch": 0.7661827043188008, "grad_norm": 0.28736916184425354, "learning_rate": 6.182119604048493e-05, "loss": 4.6131, "step": 37415 }, { "epoch": 0.7662850940962055, "grad_norm": 0.2802198827266693, "learning_rate": 6.177113194766532e-05, "loss": 4.6347, "step": 37420 }, { "epoch": 0.7663874838736101, "grad_norm": 0.2767556607723236, "learning_rate": 6.172108287772408e-05, "loss": 4.593, "step": 37425 }, { "epoch": 0.7664898736510147, "grad_norm": 0.26829272508621216, "learning_rate": 6.16710488391831e-05, "loss": 4.5967, "step": 37430 }, { "epoch": 0.7665922634284194, "grad_norm": 0.27992019057273865, "learning_rate": 6.162102984056186e-05, "loss": 4.5953, "step": 37435 }, { "epoch": 0.766694653205824, "grad_norm": 0.2766593098640442, "learning_rate": 6.157102589037715e-05, "loss": 4.5804, "step": 37440 }, { "epoch": 0.7667970429832286, "grad_norm": 0.30000054836273193, "learning_rate": 6.152103699714314e-05, "loss": 4.6185, "step": 37445 }, { "epoch": 0.7668994327606332, "grad_norm": 0.30132678151130676, "learning_rate": 6.147106316937169e-05, "loss": 4.6012, "step": 37450 }, { "epoch": 0.7670018225380378, "grad_norm": 0.271884948015213, "learning_rate": 6.142110441557174e-05, "loss": 4.5928, "step": 37455 }, { "epoch": 0.7671042123154425, "grad_norm": 0.2828238606452942, "learning_rate": 6.137116074424997e-05, "loss": 4.6395, "step": 37460 }, { "epoch": 0.7672066020928471, "grad_norm": 0.2779662013053894, "learning_rate": 6.132123216391036e-05, "loss": 4.5751, "step": 37465 }, { "epoch": 0.7673089918702517, "grad_norm": 0.26808327436447144, "learning_rate": 6.127131868305426e-05, "loss": 4.5592, "step": 37470 }, { "epoch": 0.7674113816476563, "grad_norm": 0.2726622223854065, "learning_rate": 6.122142031018062e-05, "loss": 4.6192, "step": 37475 }, { "epoch": 0.7675137714250609, "grad_norm": 0.27267372608184814, "learning_rate": 6.117153705378571e-05, "loss": 4.5802, "step": 37480 }, { "epoch": 0.7676161612024656, "grad_norm": 0.2768978476524353, "learning_rate": 6.112166892236314e-05, "loss": 4.6243, "step": 37485 }, { "epoch": 0.7677185509798702, "grad_norm": 0.28463026881217957, "learning_rate": 6.107181592440414e-05, "loss": 4.652, "step": 37490 }, { "epoch": 0.7678209407572748, "grad_norm": 0.3004951775074005, "learning_rate": 6.10219780683972e-05, "loss": 4.5509, "step": 37495 }, { "epoch": 0.7679233305346794, "grad_norm": 0.3154648542404175, "learning_rate": 6.097215536282842e-05, "loss": 4.5509, "step": 37500 }, { "epoch": 0.768025720312084, "grad_norm": 0.2721574008464813, "learning_rate": 6.0922347816181e-05, "loss": 4.5981, "step": 37505 }, { "epoch": 0.7681281100894887, "grad_norm": 0.2939707040786743, "learning_rate": 6.087255543693591e-05, "loss": 4.62, "step": 37510 }, { "epoch": 0.7682304998668933, "grad_norm": 0.27639541029930115, "learning_rate": 6.082277823357134e-05, "loss": 4.6862, "step": 37515 }, { "epoch": 0.7683328896442979, "grad_norm": 0.29966050386428833, "learning_rate": 6.077301621456288e-05, "loss": 4.6113, "step": 37520 }, { "epoch": 0.7684352794217025, "grad_norm": 0.26800772547721863, "learning_rate": 6.072326938838367e-05, "loss": 4.5855, "step": 37525 }, { "epoch": 0.7685376691991072, "grad_norm": 0.267238587141037, "learning_rate": 6.0673537763504185e-05, "loss": 4.5581, "step": 37530 }, { "epoch": 0.7686400589765118, "grad_norm": 0.2806229889392853, "learning_rate": 6.062382134839222e-05, "loss": 4.6597, "step": 37535 }, { "epoch": 0.7687424487539164, "grad_norm": 0.2705599367618561, "learning_rate": 6.05741201515132e-05, "loss": 4.5987, "step": 37540 }, { "epoch": 0.768844838531321, "grad_norm": 0.26709672808647156, "learning_rate": 6.052443418132977e-05, "loss": 4.5802, "step": 37545 }, { "epoch": 0.7689472283087256, "grad_norm": 0.2724774181842804, "learning_rate": 6.047476344630203e-05, "loss": 4.5914, "step": 37550 }, { "epoch": 0.7690496180861303, "grad_norm": 0.29947328567504883, "learning_rate": 6.042510795488748e-05, "loss": 4.6055, "step": 37555 }, { "epoch": 0.7691520078635349, "grad_norm": 0.284551739692688, "learning_rate": 6.037546771554114e-05, "loss": 4.6055, "step": 37560 }, { "epoch": 0.7692543976409395, "grad_norm": 0.31512659788131714, "learning_rate": 6.032584273671526e-05, "loss": 4.5537, "step": 37565 }, { "epoch": 0.7693567874183441, "grad_norm": 0.2858293354511261, "learning_rate": 6.027623302685953e-05, "loss": 4.67, "step": 37570 }, { "epoch": 0.7694591771957487, "grad_norm": 0.2723812758922577, "learning_rate": 6.022663859442119e-05, "loss": 4.5688, "step": 37575 }, { "epoch": 0.7695615669731534, "grad_norm": 0.274245947599411, "learning_rate": 6.017705944784472e-05, "loss": 4.5995, "step": 37580 }, { "epoch": 0.769663956750558, "grad_norm": 0.2815904915332794, "learning_rate": 6.0127495595571975e-05, "loss": 4.5514, "step": 37585 }, { "epoch": 0.7697663465279626, "grad_norm": 0.2978804409503937, "learning_rate": 6.007794704604246e-05, "loss": 4.6603, "step": 37590 }, { "epoch": 0.7698687363053672, "grad_norm": 0.2692318558692932, "learning_rate": 6.002841380769266e-05, "loss": 4.5663, "step": 37595 }, { "epoch": 0.7699711260827719, "grad_norm": 0.3634311854839325, "learning_rate": 5.9978895888956825e-05, "loss": 4.5738, "step": 37600 }, { "epoch": 0.7700735158601765, "grad_norm": 0.2816818058490753, "learning_rate": 5.992939329826644e-05, "loss": 4.5572, "step": 37605 }, { "epoch": 0.7701759056375811, "grad_norm": 0.27900898456573486, "learning_rate": 5.987990604405033e-05, "loss": 4.5772, "step": 37610 }, { "epoch": 0.7702782954149857, "grad_norm": 0.2727174758911133, "learning_rate": 5.983043413473484e-05, "loss": 4.5825, "step": 37615 }, { "epoch": 0.7703806851923904, "grad_norm": 0.3469656705856323, "learning_rate": 5.978097757874364e-05, "loss": 4.5801, "step": 37620 }, { "epoch": 0.7704830749697951, "grad_norm": 0.27901020646095276, "learning_rate": 5.9731536384497705e-05, "loss": 4.601, "step": 37625 }, { "epoch": 0.7705854647471997, "grad_norm": 0.28966450691223145, "learning_rate": 5.968211056041557e-05, "loss": 4.6858, "step": 37630 }, { "epoch": 0.7706878545246043, "grad_norm": 0.2710847854614258, "learning_rate": 5.9632700114912944e-05, "loss": 4.6181, "step": 37635 }, { "epoch": 0.7707902443020089, "grad_norm": 0.8204988241195679, "learning_rate": 5.95833050564032e-05, "loss": 4.5649, "step": 37640 }, { "epoch": 0.7708926340794136, "grad_norm": 0.2956174314022064, "learning_rate": 5.9533925393296714e-05, "loss": 4.5958, "step": 37645 }, { "epoch": 0.7709950238568182, "grad_norm": 0.2778283655643463, "learning_rate": 5.948456113400158e-05, "loss": 4.6055, "step": 37650 }, { "epoch": 0.7710974136342228, "grad_norm": 0.269569456577301, "learning_rate": 5.943521228692308e-05, "loss": 4.5891, "step": 37655 }, { "epoch": 0.7711998034116274, "grad_norm": 0.28458067774772644, "learning_rate": 5.9385878860463905e-05, "loss": 4.5533, "step": 37660 }, { "epoch": 0.771302193189032, "grad_norm": 0.3094825744628906, "learning_rate": 5.93365608630242e-05, "loss": 4.6104, "step": 37665 }, { "epoch": 0.7714045829664367, "grad_norm": 0.27909228205680847, "learning_rate": 5.928725830300143e-05, "loss": 4.6095, "step": 37670 }, { "epoch": 0.7715069727438413, "grad_norm": 0.27436867356300354, "learning_rate": 5.92379711887903e-05, "loss": 4.5679, "step": 37675 }, { "epoch": 0.7716093625212459, "grad_norm": 0.2822236120700836, "learning_rate": 5.918869952878319e-05, "loss": 4.6314, "step": 37680 }, { "epoch": 0.7717117522986505, "grad_norm": 0.293186217546463, "learning_rate": 5.913944333136955e-05, "loss": 4.55, "step": 37685 }, { "epoch": 0.7718141420760551, "grad_norm": 0.29767438769340515, "learning_rate": 5.909020260493636e-05, "loss": 4.5905, "step": 37690 }, { "epoch": 0.7719165318534598, "grad_norm": 0.2744883894920349, "learning_rate": 5.904097735786786e-05, "loss": 4.6283, "step": 37695 }, { "epoch": 0.7720189216308644, "grad_norm": 0.2894776463508606, "learning_rate": 5.8991767598545806e-05, "loss": 4.5789, "step": 37700 }, { "epoch": 0.772121311408269, "grad_norm": 0.2848042845726013, "learning_rate": 5.8942573335349194e-05, "loss": 4.5971, "step": 37705 }, { "epoch": 0.7722237011856736, "grad_norm": 0.2997800409793854, "learning_rate": 5.889339457665434e-05, "loss": 4.6336, "step": 37710 }, { "epoch": 0.7723260909630782, "grad_norm": 0.2878294587135315, "learning_rate": 5.8844231330835124e-05, "loss": 4.6161, "step": 37715 }, { "epoch": 0.7724284807404829, "grad_norm": 0.27903154492378235, "learning_rate": 5.8795083606262594e-05, "loss": 4.5961, "step": 37720 }, { "epoch": 0.7725308705178875, "grad_norm": 0.276852011680603, "learning_rate": 5.874595141130514e-05, "loss": 4.6124, "step": 37725 }, { "epoch": 0.7726332602952921, "grad_norm": 0.28702512383461, "learning_rate": 5.869683475432877e-05, "loss": 4.6696, "step": 37730 }, { "epoch": 0.7727356500726967, "grad_norm": 0.3925803601741791, "learning_rate": 5.8647733643696455e-05, "loss": 4.5229, "step": 37735 }, { "epoch": 0.7728380398501014, "grad_norm": 0.2754385769367218, "learning_rate": 5.859864808776886e-05, "loss": 4.5714, "step": 37740 }, { "epoch": 0.772940429627506, "grad_norm": 0.29182150959968567, "learning_rate": 5.8549578094903813e-05, "loss": 4.589, "step": 37745 }, { "epoch": 0.7730428194049106, "grad_norm": 0.28724372386932373, "learning_rate": 5.850052367345649e-05, "loss": 4.5774, "step": 37750 }, { "epoch": 0.7731452091823152, "grad_norm": 0.2842564582824707, "learning_rate": 5.8451484831779616e-05, "loss": 4.5768, "step": 37755 }, { "epoch": 0.7732475989597198, "grad_norm": 0.276254266500473, "learning_rate": 5.840246157822301e-05, "loss": 4.6367, "step": 37760 }, { "epoch": 0.7733499887371245, "grad_norm": 0.2792101204395294, "learning_rate": 5.835345392113391e-05, "loss": 4.5904, "step": 37765 }, { "epoch": 0.7734523785145291, "grad_norm": 0.29593244194984436, "learning_rate": 5.8304461868857074e-05, "loss": 4.6085, "step": 37770 }, { "epoch": 0.7735547682919337, "grad_norm": 0.27402353286743164, "learning_rate": 5.82554854297343e-05, "loss": 4.6298, "step": 37775 }, { "epoch": 0.7736571580693383, "grad_norm": 0.2819097638130188, "learning_rate": 5.820652461210508e-05, "loss": 4.6513, "step": 37780 }, { "epoch": 0.773759547846743, "grad_norm": 0.2651239037513733, "learning_rate": 5.815757942430584e-05, "loss": 4.5607, "step": 37785 }, { "epoch": 0.7738619376241476, "grad_norm": 0.3204714059829712, "learning_rate": 5.8108649874670706e-05, "loss": 4.6206, "step": 37790 }, { "epoch": 0.7739643274015522, "grad_norm": 0.30024945735931396, "learning_rate": 5.805973597153096e-05, "loss": 4.6077, "step": 37795 }, { "epoch": 0.7740667171789568, "grad_norm": 0.27434980869293213, "learning_rate": 5.801083772321518e-05, "loss": 4.585, "step": 37800 }, { "epoch": 0.7741691069563614, "grad_norm": 0.2731095254421234, "learning_rate": 5.796195513804948e-05, "loss": 4.5724, "step": 37805 }, { "epoch": 0.774271496733766, "grad_norm": 0.2779126465320587, "learning_rate": 5.791308822435712e-05, "loss": 4.522, "step": 37810 }, { "epoch": 0.7743738865111707, "grad_norm": 0.26512452960014343, "learning_rate": 5.7864236990458704e-05, "loss": 4.5672, "step": 37815 }, { "epoch": 0.7744762762885753, "grad_norm": 0.271696537733078, "learning_rate": 5.781540144467229e-05, "loss": 4.564, "step": 37820 }, { "epoch": 0.77457866606598, "grad_norm": 0.2706857919692993, "learning_rate": 5.776658159531318e-05, "loss": 4.5892, "step": 37825 }, { "epoch": 0.7746810558433846, "grad_norm": 0.27617236971855164, "learning_rate": 5.7717777450694e-05, "loss": 4.5907, "step": 37830 }, { "epoch": 0.7747834456207893, "grad_norm": 0.278952419757843, "learning_rate": 5.766898901912464e-05, "loss": 4.6563, "step": 37835 }, { "epoch": 0.7748858353981939, "grad_norm": 0.29718565940856934, "learning_rate": 5.762021630891251e-05, "loss": 4.577, "step": 37840 }, { "epoch": 0.7749882251755985, "grad_norm": 0.268237829208374, "learning_rate": 5.7571459328362165e-05, "loss": 4.5341, "step": 37845 }, { "epoch": 0.7750906149530031, "grad_norm": 0.2789965271949768, "learning_rate": 5.7522718085775496e-05, "loss": 4.5685, "step": 37850 }, { "epoch": 0.7751930047304078, "grad_norm": 0.2974662184715271, "learning_rate": 5.7473992589451865e-05, "loss": 4.6499, "step": 37855 }, { "epoch": 0.7752953945078124, "grad_norm": 0.2781980633735657, "learning_rate": 5.7425282847687756e-05, "loss": 4.6085, "step": 37860 }, { "epoch": 0.775397784285217, "grad_norm": 0.3048999309539795, "learning_rate": 5.737658886877704e-05, "loss": 4.5934, "step": 37865 }, { "epoch": 0.7755001740626216, "grad_norm": 0.28093934059143066, "learning_rate": 5.732791066101107e-05, "loss": 4.5681, "step": 37870 }, { "epoch": 0.7756025638400262, "grad_norm": 0.3381670415401459, "learning_rate": 5.727924823267817e-05, "loss": 4.6332, "step": 37875 }, { "epoch": 0.7757049536174309, "grad_norm": 0.26587992906570435, "learning_rate": 5.7230601592064304e-05, "loss": 4.5393, "step": 37880 }, { "epoch": 0.7758073433948355, "grad_norm": 0.2888360917568207, "learning_rate": 5.718197074745259e-05, "loss": 4.5999, "step": 37885 }, { "epoch": 0.7759097331722401, "grad_norm": 0.2756347954273224, "learning_rate": 5.713335570712342e-05, "loss": 4.5961, "step": 37890 }, { "epoch": 0.7760121229496447, "grad_norm": 0.29758137464523315, "learning_rate": 5.7084756479354673e-05, "loss": 4.6282, "step": 37895 }, { "epoch": 0.7761145127270493, "grad_norm": 0.2744895815849304, "learning_rate": 5.703617307242136e-05, "loss": 4.6118, "step": 37900 }, { "epoch": 0.776216902504454, "grad_norm": 0.2868340313434601, "learning_rate": 5.698760549459581e-05, "loss": 4.5726, "step": 37905 }, { "epoch": 0.7763192922818586, "grad_norm": 0.2860718071460724, "learning_rate": 5.69390537541478e-05, "loss": 4.5842, "step": 37910 }, { "epoch": 0.7764216820592632, "grad_norm": 0.2746017575263977, "learning_rate": 5.689051785934423e-05, "loss": 4.6116, "step": 37915 }, { "epoch": 0.7765240718366678, "grad_norm": 0.2836153209209442, "learning_rate": 5.6841997818449537e-05, "loss": 4.5946, "step": 37920 }, { "epoch": 0.7766264616140724, "grad_norm": 0.36896389722824097, "learning_rate": 5.67934936397251e-05, "loss": 4.5661, "step": 37925 }, { "epoch": 0.7767288513914771, "grad_norm": 0.28288668394088745, "learning_rate": 5.6745005331429986e-05, "loss": 4.574, "step": 37930 }, { "epoch": 0.7768312411688817, "grad_norm": 0.2924422025680542, "learning_rate": 5.669653290182029e-05, "loss": 4.6238, "step": 37935 }, { "epoch": 0.7769336309462863, "grad_norm": 0.2679120600223541, "learning_rate": 5.6648076359149476e-05, "loss": 4.557, "step": 37940 }, { "epoch": 0.7770360207236909, "grad_norm": 0.29370808601379395, "learning_rate": 5.6599635711668404e-05, "loss": 4.616, "step": 37945 }, { "epoch": 0.7771384105010956, "grad_norm": 0.2717418968677521, "learning_rate": 5.6551210967625105e-05, "loss": 4.5977, "step": 37950 }, { "epoch": 0.7772408002785002, "grad_norm": 0.26935046911239624, "learning_rate": 5.650280213526489e-05, "loss": 4.6466, "step": 37955 }, { "epoch": 0.7773431900559048, "grad_norm": 0.27534303069114685, "learning_rate": 5.6454409222830507e-05, "loss": 4.6109, "step": 37960 }, { "epoch": 0.7774455798333094, "grad_norm": 0.28345656394958496, "learning_rate": 5.640603223856185e-05, "loss": 4.5577, "step": 37965 }, { "epoch": 0.777547969610714, "grad_norm": 0.27641671895980835, "learning_rate": 5.635767119069615e-05, "loss": 4.5759, "step": 37970 }, { "epoch": 0.7776503593881187, "grad_norm": 0.2733546197414398, "learning_rate": 5.630932608746789e-05, "loss": 4.6053, "step": 37975 }, { "epoch": 0.7777527491655233, "grad_norm": 0.27268344163894653, "learning_rate": 5.626099693710894e-05, "loss": 4.5986, "step": 37980 }, { "epoch": 0.7778551389429279, "grad_norm": 0.29251590371131897, "learning_rate": 5.621268374784837e-05, "loss": 4.5748, "step": 37985 }, { "epoch": 0.7779575287203325, "grad_norm": 0.2936033010482788, "learning_rate": 5.6164386527912473e-05, "loss": 4.6025, "step": 37990 }, { "epoch": 0.7780599184977371, "grad_norm": 0.2928719222545624, "learning_rate": 5.611610528552508e-05, "loss": 4.6442, "step": 37995 }, { "epoch": 0.7781623082751418, "grad_norm": 0.3127742409706116, "learning_rate": 5.606784002890688e-05, "loss": 4.4863, "step": 38000 }, { "epoch": 0.7781623082751418, "eval_loss": 4.6109232902526855, "eval_runtime": 28.7515, "eval_samples_per_second": 20.242, "eval_steps_per_second": 2.539, "step": 38000 }, { "epoch": 0.7782646980525464, "grad_norm": 0.28875091671943665, "learning_rate": 5.60195907662762e-05, "loss": 4.5884, "step": 38005 }, { "epoch": 0.778367087829951, "grad_norm": 0.2909495234489441, "learning_rate": 5.5971357505848625e-05, "loss": 4.5766, "step": 38010 }, { "epoch": 0.7784694776073556, "grad_norm": 0.28629270195961, "learning_rate": 5.5923140255836696e-05, "loss": 4.609, "step": 38015 }, { "epoch": 0.7785718673847603, "grad_norm": 0.28371039032936096, "learning_rate": 5.5874939024450625e-05, "loss": 4.562, "step": 38020 }, { "epoch": 0.778674257162165, "grad_norm": 0.2701385021209717, "learning_rate": 5.5826753819897654e-05, "loss": 4.5909, "step": 38025 }, { "epoch": 0.7787766469395696, "grad_norm": 0.2845456302165985, "learning_rate": 5.5778584650382307e-05, "loss": 4.5905, "step": 38030 }, { "epoch": 0.7788790367169742, "grad_norm": 0.30409011244773865, "learning_rate": 5.573043152410652e-05, "loss": 4.5325, "step": 38035 }, { "epoch": 0.7789814264943788, "grad_norm": 0.3092602491378784, "learning_rate": 5.568229444926937e-05, "loss": 4.5918, "step": 38040 }, { "epoch": 0.7790838162717835, "grad_norm": 0.2709414064884186, "learning_rate": 5.563417343406718e-05, "loss": 4.5563, "step": 38045 }, { "epoch": 0.7791862060491881, "grad_norm": 0.3534092903137207, "learning_rate": 5.5586068486693705e-05, "loss": 4.6015, "step": 38050 }, { "epoch": 0.7792885958265927, "grad_norm": 0.29504942893981934, "learning_rate": 5.553797961533977e-05, "loss": 4.5999, "step": 38055 }, { "epoch": 0.7793909856039973, "grad_norm": 0.31223994493484497, "learning_rate": 5.5489906828193656e-05, "loss": 4.6144, "step": 38060 }, { "epoch": 0.779493375381402, "grad_norm": 0.3386547863483429, "learning_rate": 5.5441850133440634e-05, "loss": 4.5956, "step": 38065 }, { "epoch": 0.7795957651588066, "grad_norm": 0.27085545659065247, "learning_rate": 5.5393809539263535e-05, "loss": 4.5194, "step": 38070 }, { "epoch": 0.7796981549362112, "grad_norm": 0.5431325435638428, "learning_rate": 5.534578505384226e-05, "loss": 4.5828, "step": 38075 }, { "epoch": 0.7798005447136158, "grad_norm": 0.2880275845527649, "learning_rate": 5.529777668535398e-05, "loss": 4.5893, "step": 38080 }, { "epoch": 0.7799029344910204, "grad_norm": 0.27586859464645386, "learning_rate": 5.5249784441973244e-05, "loss": 4.5867, "step": 38085 }, { "epoch": 0.7800053242684251, "grad_norm": 0.30714014172554016, "learning_rate": 5.5201808331871736e-05, "loss": 4.5511, "step": 38090 }, { "epoch": 0.7801077140458297, "grad_norm": 0.28004249930381775, "learning_rate": 5.515384836321838e-05, "loss": 4.6185, "step": 38095 }, { "epoch": 0.7802101038232343, "grad_norm": 0.2809518277645111, "learning_rate": 5.5105904544179496e-05, "loss": 4.5796, "step": 38100 }, { "epoch": 0.7803124936006389, "grad_norm": 0.2761389911174774, "learning_rate": 5.5057976882918516e-05, "loss": 4.5321, "step": 38105 }, { "epoch": 0.7804148833780435, "grad_norm": 0.3077124059200287, "learning_rate": 5.501006538759615e-05, "loss": 4.5659, "step": 38110 }, { "epoch": 0.7805172731554482, "grad_norm": 0.2878948748111725, "learning_rate": 5.496217006637032e-05, "loss": 4.5707, "step": 38115 }, { "epoch": 0.7806196629328528, "grad_norm": 0.2937699854373932, "learning_rate": 5.491429092739636e-05, "loss": 4.5778, "step": 38120 }, { "epoch": 0.7807220527102574, "grad_norm": 0.2876644730567932, "learning_rate": 5.486642797882667e-05, "loss": 4.6051, "step": 38125 }, { "epoch": 0.780824442487662, "grad_norm": 0.31219735741615295, "learning_rate": 5.4818581228810906e-05, "loss": 4.585, "step": 38130 }, { "epoch": 0.7809268322650667, "grad_norm": 0.26896148920059204, "learning_rate": 5.477075068549617e-05, "loss": 4.6168, "step": 38135 }, { "epoch": 0.7810292220424713, "grad_norm": 0.334077924489975, "learning_rate": 5.472293635702644e-05, "loss": 4.6099, "step": 38140 }, { "epoch": 0.7811316118198759, "grad_norm": 0.2796894907951355, "learning_rate": 5.467513825154325e-05, "loss": 4.5843, "step": 38145 }, { "epoch": 0.7812340015972805, "grad_norm": 0.3577290177345276, "learning_rate": 5.462735637718535e-05, "loss": 4.6231, "step": 38150 }, { "epoch": 0.7813363913746851, "grad_norm": 0.27947860956192017, "learning_rate": 5.4579590742088466e-05, "loss": 4.6125, "step": 38155 }, { "epoch": 0.7814387811520898, "grad_norm": 0.2753649950027466, "learning_rate": 5.453184135438587e-05, "loss": 4.6266, "step": 38160 }, { "epoch": 0.7815411709294944, "grad_norm": 0.3409067988395691, "learning_rate": 5.448410822220786e-05, "loss": 4.5962, "step": 38165 }, { "epoch": 0.781643560706899, "grad_norm": 0.2706255316734314, "learning_rate": 5.443639135368203e-05, "loss": 4.5975, "step": 38170 }, { "epoch": 0.7817459504843036, "grad_norm": 0.2951542139053345, "learning_rate": 5.438869075693329e-05, "loss": 4.5826, "step": 38175 }, { "epoch": 0.7818483402617082, "grad_norm": 0.2736496031284332, "learning_rate": 5.434100644008365e-05, "loss": 4.5533, "step": 38180 }, { "epoch": 0.7819507300391129, "grad_norm": 0.270050048828125, "learning_rate": 5.4293338411252387e-05, "loss": 4.5591, "step": 38185 }, { "epoch": 0.7820531198165175, "grad_norm": 0.28652334213256836, "learning_rate": 5.424568667855598e-05, "loss": 4.5555, "step": 38190 }, { "epoch": 0.7821555095939221, "grad_norm": 0.27525264024734497, "learning_rate": 5.4198051250108214e-05, "loss": 4.6588, "step": 38195 }, { "epoch": 0.7822578993713267, "grad_norm": 0.2802639603614807, "learning_rate": 5.415043213402018e-05, "loss": 4.6168, "step": 38200 }, { "epoch": 0.7823602891487313, "grad_norm": 0.27300724387168884, "learning_rate": 5.410282933839983e-05, "loss": 4.5991, "step": 38205 }, { "epoch": 0.782462678926136, "grad_norm": 0.27656131982803345, "learning_rate": 5.405524287135275e-05, "loss": 4.5723, "step": 38210 }, { "epoch": 0.7825650687035406, "grad_norm": 0.30277910828590393, "learning_rate": 5.400767274098149e-05, "loss": 4.5386, "step": 38215 }, { "epoch": 0.7826674584809452, "grad_norm": 0.28365784883499146, "learning_rate": 5.396011895538587e-05, "loss": 4.5923, "step": 38220 }, { "epoch": 0.7827698482583498, "grad_norm": 0.2815352976322174, "learning_rate": 5.391258152266304e-05, "loss": 4.498, "step": 38225 }, { "epoch": 0.7828722380357546, "grad_norm": 0.2815379202365875, "learning_rate": 5.386506045090724e-05, "loss": 4.6176, "step": 38230 }, { "epoch": 0.7829746278131592, "grad_norm": 0.2661249339580536, "learning_rate": 5.381755574820989e-05, "loss": 4.5613, "step": 38235 }, { "epoch": 0.7830770175905638, "grad_norm": 0.30869239568710327, "learning_rate": 5.377006742265984e-05, "loss": 4.5774, "step": 38240 }, { "epoch": 0.7831794073679684, "grad_norm": 0.26999416947364807, "learning_rate": 5.3722595482342925e-05, "loss": 4.5696, "step": 38245 }, { "epoch": 0.783281797145373, "grad_norm": 0.2677248418331146, "learning_rate": 5.367513993534226e-05, "loss": 4.6001, "step": 38250 }, { "epoch": 0.7833841869227777, "grad_norm": 0.2759122848510742, "learning_rate": 5.3627700789738156e-05, "loss": 4.6219, "step": 38255 }, { "epoch": 0.7834865767001823, "grad_norm": 0.26571130752563477, "learning_rate": 5.358027805360826e-05, "loss": 4.6068, "step": 38260 }, { "epoch": 0.7835889664775869, "grad_norm": 0.2919250726699829, "learning_rate": 5.353287173502726e-05, "loss": 4.6113, "step": 38265 }, { "epoch": 0.7836913562549915, "grad_norm": 0.2928933799266815, "learning_rate": 5.348548184206705e-05, "loss": 4.598, "step": 38270 }, { "epoch": 0.7837937460323962, "grad_norm": 0.2709066867828369, "learning_rate": 5.3438108382796945e-05, "loss": 4.5881, "step": 38275 }, { "epoch": 0.7838961358098008, "grad_norm": 0.30528467893600464, "learning_rate": 5.339075136528312e-05, "loss": 4.6173, "step": 38280 }, { "epoch": 0.7839985255872054, "grad_norm": 0.2768768072128296, "learning_rate": 5.33434107975892e-05, "loss": 4.5767, "step": 38285 }, { "epoch": 0.78410091536461, "grad_norm": 0.27764803171157837, "learning_rate": 5.3296086687776084e-05, "loss": 4.6202, "step": 38290 }, { "epoch": 0.7842033051420146, "grad_norm": 0.27811387181282043, "learning_rate": 5.32487790439015e-05, "loss": 4.6258, "step": 38295 }, { "epoch": 0.7843056949194193, "grad_norm": 0.29696446657180786, "learning_rate": 5.320148787402077e-05, "loss": 4.6243, "step": 38300 }, { "epoch": 0.7844080846968239, "grad_norm": 0.29483506083488464, "learning_rate": 5.3154213186186165e-05, "loss": 4.5898, "step": 38305 }, { "epoch": 0.7845104744742285, "grad_norm": 0.2779998183250427, "learning_rate": 5.3106954988447204e-05, "loss": 4.5817, "step": 38310 }, { "epoch": 0.7846128642516331, "grad_norm": 0.2914763391017914, "learning_rate": 5.305971328885071e-05, "loss": 4.5334, "step": 38315 }, { "epoch": 0.7847152540290377, "grad_norm": 0.2841431796550751, "learning_rate": 5.301248809544053e-05, "loss": 4.5977, "step": 38320 }, { "epoch": 0.7848176438064424, "grad_norm": 0.26590022444725037, "learning_rate": 5.296527941625782e-05, "loss": 4.5799, "step": 38325 }, { "epoch": 0.784920033583847, "grad_norm": 0.280025452375412, "learning_rate": 5.2918087259340816e-05, "loss": 4.5617, "step": 38330 }, { "epoch": 0.7850224233612516, "grad_norm": 0.2960115373134613, "learning_rate": 5.2870911632725045e-05, "loss": 4.4566, "step": 38335 }, { "epoch": 0.7851248131386562, "grad_norm": 0.3256969749927521, "learning_rate": 5.2823752544443304e-05, "loss": 4.5568, "step": 38340 }, { "epoch": 0.7852272029160609, "grad_norm": 0.3043464720249176, "learning_rate": 5.277661000252523e-05, "loss": 4.5841, "step": 38345 }, { "epoch": 0.7853295926934655, "grad_norm": 0.2751842439174652, "learning_rate": 5.2729484014998006e-05, "loss": 4.5798, "step": 38350 }, { "epoch": 0.7854319824708701, "grad_norm": 0.28036338090896606, "learning_rate": 5.268237458988583e-05, "loss": 4.5693, "step": 38355 }, { "epoch": 0.7855343722482747, "grad_norm": 0.27092334628105164, "learning_rate": 5.263528173521004e-05, "loss": 4.6061, "step": 38360 }, { "epoch": 0.7856367620256793, "grad_norm": 0.33851486444473267, "learning_rate": 5.2588205458989305e-05, "loss": 4.5907, "step": 38365 }, { "epoch": 0.785739151803084, "grad_norm": 0.2805909514427185, "learning_rate": 5.2541145769239355e-05, "loss": 4.5984, "step": 38370 }, { "epoch": 0.7858415415804886, "grad_norm": 0.29815682768821716, "learning_rate": 5.249410267397309e-05, "loss": 4.5704, "step": 38375 }, { "epoch": 0.7859439313578932, "grad_norm": 0.27646151185035706, "learning_rate": 5.244707618120061e-05, "loss": 4.5422, "step": 38380 }, { "epoch": 0.7860463211352978, "grad_norm": 0.32369154691696167, "learning_rate": 5.2400066298929264e-05, "loss": 4.5315, "step": 38385 }, { "epoch": 0.7861487109127024, "grad_norm": 0.28201964497566223, "learning_rate": 5.235307303516348e-05, "loss": 4.6324, "step": 38390 }, { "epoch": 0.7862511006901071, "grad_norm": 0.27669161558151245, "learning_rate": 5.230609639790479e-05, "loss": 4.5939, "step": 38395 }, { "epoch": 0.7863534904675117, "grad_norm": 0.28544601798057556, "learning_rate": 5.225913639515212e-05, "loss": 4.6243, "step": 38400 }, { "epoch": 0.7864558802449163, "grad_norm": 0.3115270137786865, "learning_rate": 5.2212193034901365e-05, "loss": 4.5518, "step": 38405 }, { "epoch": 0.7865582700223209, "grad_norm": 0.2983291745185852, "learning_rate": 5.21652663251456e-05, "loss": 4.6791, "step": 38410 }, { "epoch": 0.7866606597997255, "grad_norm": 0.8530471324920654, "learning_rate": 5.211835627387529e-05, "loss": 4.566, "step": 38415 }, { "epoch": 0.7867630495771302, "grad_norm": 0.2956152856349945, "learning_rate": 5.207146288907765e-05, "loss": 4.5612, "step": 38420 }, { "epoch": 0.7868654393545348, "grad_norm": 0.2825814187526703, "learning_rate": 5.20245861787374e-05, "loss": 4.5784, "step": 38425 }, { "epoch": 0.7869678291319395, "grad_norm": 0.2654380798339844, "learning_rate": 5.1977726150836456e-05, "loss": 4.5618, "step": 38430 }, { "epoch": 0.7870702189093441, "grad_norm": 0.2694794833660126, "learning_rate": 5.1930882813353506e-05, "loss": 4.6147, "step": 38435 }, { "epoch": 0.7871726086867488, "grad_norm": 0.2700869143009186, "learning_rate": 5.1884056174264825e-05, "loss": 4.581, "step": 38440 }, { "epoch": 0.7872749984641534, "grad_norm": 0.2823755145072937, "learning_rate": 5.183724624154362e-05, "loss": 4.6208, "step": 38445 }, { "epoch": 0.787377388241558, "grad_norm": 0.30310189723968506, "learning_rate": 5.179045302316021e-05, "loss": 4.5524, "step": 38450 }, { "epoch": 0.7874797780189626, "grad_norm": 0.2830992043018341, "learning_rate": 5.174367652708228e-05, "loss": 4.6022, "step": 38455 }, { "epoch": 0.7875821677963672, "grad_norm": 0.27573373913764954, "learning_rate": 5.16969167612745e-05, "loss": 4.5758, "step": 38460 }, { "epoch": 0.7876845575737719, "grad_norm": 0.2764032781124115, "learning_rate": 5.1650173733698714e-05, "loss": 4.5994, "step": 38465 }, { "epoch": 0.7877869473511765, "grad_norm": 0.2729256749153137, "learning_rate": 5.160344745231391e-05, "loss": 4.5756, "step": 38470 }, { "epoch": 0.7878893371285811, "grad_norm": 0.277967244386673, "learning_rate": 5.155673792507628e-05, "loss": 4.4947, "step": 38475 }, { "epoch": 0.7879917269059857, "grad_norm": 0.28639402985572815, "learning_rate": 5.151004515993924e-05, "loss": 4.5977, "step": 38480 }, { "epoch": 0.7880941166833904, "grad_norm": 0.27312996983528137, "learning_rate": 5.1463369164853063e-05, "loss": 4.5738, "step": 38485 }, { "epoch": 0.788196506460795, "grad_norm": 0.2720889747142792, "learning_rate": 5.141670994776546e-05, "loss": 4.6397, "step": 38490 }, { "epoch": 0.7882988962381996, "grad_norm": 0.2668980360031128, "learning_rate": 5.1370067516621164e-05, "loss": 4.5876, "step": 38495 }, { "epoch": 0.7884012860156042, "grad_norm": 0.27054527401924133, "learning_rate": 5.1323441879361995e-05, "loss": 4.5783, "step": 38500 }, { "epoch": 0.7885036757930088, "grad_norm": 0.28997063636779785, "learning_rate": 5.1276833043927075e-05, "loss": 4.5805, "step": 38505 }, { "epoch": 0.7886060655704135, "grad_norm": 0.30233728885650635, "learning_rate": 5.123024101825254e-05, "loss": 4.6095, "step": 38510 }, { "epoch": 0.7887084553478181, "grad_norm": 0.4892871677875519, "learning_rate": 5.1183665810271657e-05, "loss": 4.4724, "step": 38515 }, { "epoch": 0.7888108451252227, "grad_norm": 0.27788451313972473, "learning_rate": 5.113710742791484e-05, "loss": 4.5916, "step": 38520 }, { "epoch": 0.7889132349026273, "grad_norm": 0.27443966269493103, "learning_rate": 5.1090565879109766e-05, "loss": 4.5886, "step": 38525 }, { "epoch": 0.789015624680032, "grad_norm": 0.2806726396083832, "learning_rate": 5.104404117178108e-05, "loss": 4.5716, "step": 38530 }, { "epoch": 0.7891180144574366, "grad_norm": 0.330605149269104, "learning_rate": 5.099753331385057e-05, "loss": 4.5924, "step": 38535 }, { "epoch": 0.7892204042348412, "grad_norm": 0.2898256778717041, "learning_rate": 5.095104231323732e-05, "loss": 4.5542, "step": 38540 }, { "epoch": 0.7893227940122458, "grad_norm": 0.28494715690612793, "learning_rate": 5.090456817785739e-05, "loss": 4.5786, "step": 38545 }, { "epoch": 0.7894251837896504, "grad_norm": 0.2826581299304962, "learning_rate": 5.085811091562393e-05, "loss": 4.559, "step": 38550 }, { "epoch": 0.789527573567055, "grad_norm": 0.2861868441104889, "learning_rate": 5.081167053444747e-05, "loss": 4.55, "step": 38555 }, { "epoch": 0.7896299633444597, "grad_norm": 0.27087080478668213, "learning_rate": 5.076524704223529e-05, "loss": 4.5422, "step": 38560 }, { "epoch": 0.7897323531218643, "grad_norm": 0.29426097869873047, "learning_rate": 5.0718840446892126e-05, "loss": 4.5025, "step": 38565 }, { "epoch": 0.7898347428992689, "grad_norm": 0.9471340775489807, "learning_rate": 5.067245075631968e-05, "loss": 4.4886, "step": 38570 }, { "epoch": 0.7899371326766735, "grad_norm": 0.2938225567340851, "learning_rate": 5.0626077978416746e-05, "loss": 4.578, "step": 38575 }, { "epoch": 0.7900395224540782, "grad_norm": 0.2911272644996643, "learning_rate": 5.0579722121079395e-05, "loss": 4.5978, "step": 38580 }, { "epoch": 0.7901419122314828, "grad_norm": 0.27577346563339233, "learning_rate": 5.053338319220068e-05, "loss": 4.6174, "step": 38585 }, { "epoch": 0.7902443020088874, "grad_norm": 0.31076574325561523, "learning_rate": 5.0487061199670724e-05, "loss": 4.5837, "step": 38590 }, { "epoch": 0.790346691786292, "grad_norm": 0.27288109064102173, "learning_rate": 5.0440756151376985e-05, "loss": 4.5775, "step": 38595 }, { "epoch": 0.7904490815636966, "grad_norm": 0.3731415569782257, "learning_rate": 5.039446805520385e-05, "loss": 4.6132, "step": 38600 }, { "epoch": 0.7905514713411013, "grad_norm": 0.2759488523006439, "learning_rate": 5.034819691903283e-05, "loss": 4.5818, "step": 38605 }, { "epoch": 0.7906538611185059, "grad_norm": 0.27843356132507324, "learning_rate": 5.0301942750742596e-05, "loss": 4.5705, "step": 38610 }, { "epoch": 0.7907562508959105, "grad_norm": 0.29055437445640564, "learning_rate": 5.025570555820893e-05, "loss": 4.6136, "step": 38615 }, { "epoch": 0.7908586406733151, "grad_norm": 0.2942349314689636, "learning_rate": 5.0209485349304854e-05, "loss": 4.6066, "step": 38620 }, { "epoch": 0.7909610304507197, "grad_norm": 0.28809309005737305, "learning_rate": 5.016328213190013e-05, "loss": 4.629, "step": 38625 }, { "epoch": 0.7910634202281244, "grad_norm": 0.27980896830558777, "learning_rate": 5.011709591386202e-05, "loss": 4.5645, "step": 38630 }, { "epoch": 0.7911658100055291, "grad_norm": 0.2762549817562103, "learning_rate": 5.007092670305467e-05, "loss": 4.5757, "step": 38635 }, { "epoch": 0.7912681997829337, "grad_norm": 0.2708359956741333, "learning_rate": 5.0024774507339354e-05, "loss": 4.5934, "step": 38640 }, { "epoch": 0.7913705895603383, "grad_norm": 0.29552069306373596, "learning_rate": 4.997863933457459e-05, "loss": 4.5822, "step": 38645 }, { "epoch": 0.791472979337743, "grad_norm": 0.2964145243167877, "learning_rate": 4.993252119261582e-05, "loss": 4.5853, "step": 38650 }, { "epoch": 0.7915753691151476, "grad_norm": 0.28075456619262695, "learning_rate": 4.988642008931567e-05, "loss": 4.5389, "step": 38655 }, { "epoch": 0.7916777588925522, "grad_norm": 0.2771109342575073, "learning_rate": 4.984033603252379e-05, "loss": 4.6293, "step": 38660 }, { "epoch": 0.7917801486699568, "grad_norm": 0.2782790958881378, "learning_rate": 4.9794269030087126e-05, "loss": 4.6077, "step": 38665 }, { "epoch": 0.7918825384473615, "grad_norm": 0.28569403290748596, "learning_rate": 4.974821908984948e-05, "loss": 4.6277, "step": 38670 }, { "epoch": 0.7919849282247661, "grad_norm": 0.28869330883026123, "learning_rate": 4.970218621965185e-05, "loss": 4.5884, "step": 38675 }, { "epoch": 0.7920873180021707, "grad_norm": 0.28534212708473206, "learning_rate": 4.965617042733241e-05, "loss": 4.5845, "step": 38680 }, { "epoch": 0.7921897077795753, "grad_norm": 0.28274044394493103, "learning_rate": 4.9610171720726297e-05, "loss": 4.621, "step": 38685 }, { "epoch": 0.7922920975569799, "grad_norm": 0.28722789883613586, "learning_rate": 4.956419010766575e-05, "loss": 4.5866, "step": 38690 }, { "epoch": 0.7923944873343846, "grad_norm": 0.28945234417915344, "learning_rate": 4.9518225595980274e-05, "loss": 4.5916, "step": 38695 }, { "epoch": 0.7924968771117892, "grad_norm": 0.27766236662864685, "learning_rate": 4.9472278193496115e-05, "loss": 4.6023, "step": 38700 }, { "epoch": 0.7925992668891938, "grad_norm": 0.2779901921749115, "learning_rate": 4.9426347908036985e-05, "loss": 4.5972, "step": 38705 }, { "epoch": 0.7927016566665984, "grad_norm": 0.2771102488040924, "learning_rate": 4.938043474742345e-05, "loss": 4.5655, "step": 38710 }, { "epoch": 0.792804046444003, "grad_norm": 0.26912617683410645, "learning_rate": 4.9334538719473184e-05, "loss": 4.6776, "step": 38715 }, { "epoch": 0.7929064362214077, "grad_norm": 0.2799225449562073, "learning_rate": 4.928865983200107e-05, "loss": 4.5719, "step": 38720 }, { "epoch": 0.7930088259988123, "grad_norm": 0.2933421730995178, "learning_rate": 4.924279809281893e-05, "loss": 4.5874, "step": 38725 }, { "epoch": 0.7931112157762169, "grad_norm": 0.27239277958869934, "learning_rate": 4.919695350973567e-05, "loss": 4.6052, "step": 38730 }, { "epoch": 0.7932136055536215, "grad_norm": 0.30010247230529785, "learning_rate": 4.915112609055743e-05, "loss": 4.5632, "step": 38735 }, { "epoch": 0.7933159953310261, "grad_norm": 0.4915987253189087, "learning_rate": 4.9105315843087285e-05, "loss": 4.6218, "step": 38740 }, { "epoch": 0.7934183851084308, "grad_norm": 0.2945227026939392, "learning_rate": 4.905952277512541e-05, "loss": 4.5622, "step": 38745 }, { "epoch": 0.7935207748858354, "grad_norm": 0.27611690759658813, "learning_rate": 4.9013746894469e-05, "loss": 4.5864, "step": 38750 }, { "epoch": 0.79362316466324, "grad_norm": 0.2813268303871155, "learning_rate": 4.8967988208912514e-05, "loss": 4.6249, "step": 38755 }, { "epoch": 0.7937255544406446, "grad_norm": 0.2851206362247467, "learning_rate": 4.892224672624732e-05, "loss": 4.5687, "step": 38760 }, { "epoch": 0.7938279442180493, "grad_norm": 0.29030922055244446, "learning_rate": 4.887652245426182e-05, "loss": 4.5937, "step": 38765 }, { "epoch": 0.7939303339954539, "grad_norm": 0.3038567006587982, "learning_rate": 4.883081540074169e-05, "loss": 4.5905, "step": 38770 }, { "epoch": 0.7940327237728585, "grad_norm": 0.27633538842201233, "learning_rate": 4.878512557346948e-05, "loss": 4.6112, "step": 38775 }, { "epoch": 0.7941351135502631, "grad_norm": 0.2828822135925293, "learning_rate": 4.873945298022481e-05, "loss": 4.6193, "step": 38780 }, { "epoch": 0.7942375033276677, "grad_norm": 0.2662680447101593, "learning_rate": 4.869379762878457e-05, "loss": 4.5674, "step": 38785 }, { "epoch": 0.7943398931050724, "grad_norm": 0.2869718074798584, "learning_rate": 4.8648159526922494e-05, "loss": 4.6003, "step": 38790 }, { "epoch": 0.794442282882477, "grad_norm": 0.2725258469581604, "learning_rate": 4.860253868240946e-05, "loss": 4.6024, "step": 38795 }, { "epoch": 0.7945446726598816, "grad_norm": 0.36530521512031555, "learning_rate": 4.8556935103013374e-05, "loss": 4.6455, "step": 38800 }, { "epoch": 0.7946470624372862, "grad_norm": 0.27146273851394653, "learning_rate": 4.85113487964993e-05, "loss": 4.5913, "step": 38805 }, { "epoch": 0.7947494522146908, "grad_norm": 0.2952762842178345, "learning_rate": 4.8465779770629303e-05, "loss": 4.5957, "step": 38810 }, { "epoch": 0.7948518419920955, "grad_norm": 0.278006911277771, "learning_rate": 4.8420228033162395e-05, "loss": 4.6793, "step": 38815 }, { "epoch": 0.7949542317695001, "grad_norm": 0.2903681695461273, "learning_rate": 4.837469359185486e-05, "loss": 4.5519, "step": 38820 }, { "epoch": 0.7950566215469047, "grad_norm": 0.301890105009079, "learning_rate": 4.832917645445989e-05, "loss": 4.6172, "step": 38825 }, { "epoch": 0.7951590113243093, "grad_norm": 0.2739580571651459, "learning_rate": 4.828367662872769e-05, "loss": 4.5555, "step": 38830 }, { "epoch": 0.795261401101714, "grad_norm": 0.2762344777584076, "learning_rate": 4.8238194122405767e-05, "loss": 4.5947, "step": 38835 }, { "epoch": 0.7953637908791187, "grad_norm": 0.2905920147895813, "learning_rate": 4.81927289432383e-05, "loss": 4.589, "step": 38840 }, { "epoch": 0.7954661806565233, "grad_norm": 0.29773831367492676, "learning_rate": 4.8147281098966856e-05, "loss": 4.5164, "step": 38845 }, { "epoch": 0.7955685704339279, "grad_norm": 0.27923330664634705, "learning_rate": 4.8101850597329875e-05, "loss": 4.5958, "step": 38850 }, { "epoch": 0.7956709602113325, "grad_norm": 0.2984638214111328, "learning_rate": 4.805643744606285e-05, "loss": 4.5721, "step": 38855 }, { "epoch": 0.7957733499887372, "grad_norm": 0.2867434024810791, "learning_rate": 4.801104165289843e-05, "loss": 4.6152, "step": 38860 }, { "epoch": 0.7958757397661418, "grad_norm": 0.2986920177936554, "learning_rate": 4.7965663225566195e-05, "loss": 4.593, "step": 38865 }, { "epoch": 0.7959781295435464, "grad_norm": 0.29490727186203003, "learning_rate": 4.792030217179275e-05, "loss": 4.6056, "step": 38870 }, { "epoch": 0.796080519320951, "grad_norm": 0.28355735540390015, "learning_rate": 4.787495849930191e-05, "loss": 4.5794, "step": 38875 }, { "epoch": 0.7961829090983557, "grad_norm": 0.2785748541355133, "learning_rate": 4.782963221581437e-05, "loss": 4.5824, "step": 38880 }, { "epoch": 0.7962852988757603, "grad_norm": 0.2727014720439911, "learning_rate": 4.778432332904789e-05, "loss": 4.5673, "step": 38885 }, { "epoch": 0.7963876886531649, "grad_norm": 0.2806575298309326, "learning_rate": 4.773903184671726e-05, "loss": 4.5598, "step": 38890 }, { "epoch": 0.7964900784305695, "grad_norm": 0.2848622798919678, "learning_rate": 4.7693757776534446e-05, "loss": 4.6432, "step": 38895 }, { "epoch": 0.7965924682079741, "grad_norm": 0.28101709485054016, "learning_rate": 4.764850112620826e-05, "loss": 4.6285, "step": 38900 }, { "epoch": 0.7966948579853788, "grad_norm": 0.26877561211586, "learning_rate": 4.7603261903444615e-05, "loss": 4.5875, "step": 38905 }, { "epoch": 0.7967972477627834, "grad_norm": 0.3063620626926422, "learning_rate": 4.7558040115946536e-05, "loss": 4.5648, "step": 38910 }, { "epoch": 0.796899637540188, "grad_norm": 0.28085240721702576, "learning_rate": 4.751283577141398e-05, "loss": 4.6174, "step": 38915 }, { "epoch": 0.7970020273175926, "grad_norm": 0.2840745747089386, "learning_rate": 4.7467648877543925e-05, "loss": 4.6076, "step": 38920 }, { "epoch": 0.7971044170949972, "grad_norm": 0.2984735071659088, "learning_rate": 4.742247944203051e-05, "loss": 4.6254, "step": 38925 }, { "epoch": 0.7972068068724019, "grad_norm": 0.29564350843429565, "learning_rate": 4.7377327472564755e-05, "loss": 4.5739, "step": 38930 }, { "epoch": 0.7973091966498065, "grad_norm": 0.2938145101070404, "learning_rate": 4.733219297683477e-05, "loss": 4.5883, "step": 38935 }, { "epoch": 0.7974115864272111, "grad_norm": 0.28416699171066284, "learning_rate": 4.7287075962525634e-05, "loss": 4.569, "step": 38940 }, { "epoch": 0.7975139762046157, "grad_norm": 0.48297443985939026, "learning_rate": 4.724197643731958e-05, "loss": 4.5492, "step": 38945 }, { "epoch": 0.7976163659820203, "grad_norm": 0.32836616039276123, "learning_rate": 4.719689440889576e-05, "loss": 4.5837, "step": 38950 }, { "epoch": 0.797718755759425, "grad_norm": 0.2783101797103882, "learning_rate": 4.7151829884930296e-05, "loss": 4.5946, "step": 38955 }, { "epoch": 0.7978211455368296, "grad_norm": 0.27545347809791565, "learning_rate": 4.7106782873096495e-05, "loss": 4.5771, "step": 38960 }, { "epoch": 0.7979235353142342, "grad_norm": 0.2808891832828522, "learning_rate": 4.706175338106457e-05, "loss": 4.5738, "step": 38965 }, { "epoch": 0.7980259250916388, "grad_norm": 0.2797003984451294, "learning_rate": 4.7016741416501664e-05, "loss": 4.6184, "step": 38970 }, { "epoch": 0.7981283148690435, "grad_norm": 0.2799162268638611, "learning_rate": 4.697174698707224e-05, "loss": 4.5623, "step": 38975 }, { "epoch": 0.7982307046464481, "grad_norm": 0.2852189540863037, "learning_rate": 4.692677010043737e-05, "loss": 4.5475, "step": 38980 }, { "epoch": 0.7983330944238527, "grad_norm": 0.2748004198074341, "learning_rate": 4.688181076425547e-05, "loss": 4.5872, "step": 38985 }, { "epoch": 0.7984354842012573, "grad_norm": 0.2709660828113556, "learning_rate": 4.68368689861818e-05, "loss": 4.6041, "step": 38990 }, { "epoch": 0.7985378739786619, "grad_norm": 0.29953452944755554, "learning_rate": 4.6791944773868635e-05, "loss": 4.5579, "step": 38995 }, { "epoch": 0.7986402637560666, "grad_norm": 0.2670140564441681, "learning_rate": 4.674703813496539e-05, "loss": 4.6503, "step": 39000 }, { "epoch": 0.7986402637560666, "eval_loss": 4.60409688949585, "eval_runtime": 17.1316, "eval_samples_per_second": 33.972, "eval_steps_per_second": 4.261, "step": 39000 }, { "epoch": 0.7987426535334712, "grad_norm": 0.2708469331264496, "learning_rate": 4.670214907711835e-05, "loss": 4.5807, "step": 39005 }, { "epoch": 0.7988450433108758, "grad_norm": 0.2768626809120178, "learning_rate": 4.6657277607970775e-05, "loss": 4.5605, "step": 39010 }, { "epoch": 0.7989474330882804, "grad_norm": 0.28735220432281494, "learning_rate": 4.661242373516315e-05, "loss": 4.5613, "step": 39015 }, { "epoch": 0.799049822865685, "grad_norm": 0.29037731885910034, "learning_rate": 4.6567587466332724e-05, "loss": 4.5707, "step": 39020 }, { "epoch": 0.7991522126430897, "grad_norm": 0.27839553356170654, "learning_rate": 4.652276880911387e-05, "loss": 4.6282, "step": 39025 }, { "epoch": 0.7992546024204943, "grad_norm": 0.28246358036994934, "learning_rate": 4.647796777113788e-05, "loss": 4.6188, "step": 39030 }, { "epoch": 0.7993569921978989, "grad_norm": 0.27705833315849304, "learning_rate": 4.64331843600332e-05, "loss": 4.6162, "step": 39035 }, { "epoch": 0.7994593819753036, "grad_norm": 0.2875292897224426, "learning_rate": 4.638841858342514e-05, "loss": 4.5495, "step": 39040 }, { "epoch": 0.7995617717527083, "grad_norm": 0.2829298973083496, "learning_rate": 4.634367044893598e-05, "loss": 4.5837, "step": 39045 }, { "epoch": 0.7996641615301129, "grad_norm": 0.32871323823928833, "learning_rate": 4.6298939964185164e-05, "loss": 4.5854, "step": 39050 }, { "epoch": 0.7997665513075175, "grad_norm": 0.2664298415184021, "learning_rate": 4.6254227136788986e-05, "loss": 4.6725, "step": 39055 }, { "epoch": 0.7998689410849221, "grad_norm": 0.2779654264450073, "learning_rate": 4.6209531974360705e-05, "loss": 4.6213, "step": 39060 }, { "epoch": 0.7999713308623267, "grad_norm": 0.2852494716644287, "learning_rate": 4.6164854484510764e-05, "loss": 4.6053, "step": 39065 }, { "epoch": 0.8000737206397314, "grad_norm": 0.3127801716327667, "learning_rate": 4.6120194674846426e-05, "loss": 4.5472, "step": 39070 }, { "epoch": 0.800176110417136, "grad_norm": 0.2792738676071167, "learning_rate": 4.607555255297196e-05, "loss": 4.5388, "step": 39075 }, { "epoch": 0.8002785001945406, "grad_norm": 0.2929307520389557, "learning_rate": 4.603092812648862e-05, "loss": 4.6292, "step": 39080 }, { "epoch": 0.8003808899719452, "grad_norm": 0.29168641567230225, "learning_rate": 4.5986321402994805e-05, "loss": 4.5284, "step": 39085 }, { "epoch": 0.8004832797493499, "grad_norm": 0.270711749792099, "learning_rate": 4.594173239008572e-05, "loss": 4.5815, "step": 39090 }, { "epoch": 0.8005856695267545, "grad_norm": 0.28868192434310913, "learning_rate": 4.5897161095353526e-05, "loss": 4.5659, "step": 39095 }, { "epoch": 0.8006880593041591, "grad_norm": 0.27453774213790894, "learning_rate": 4.58526075263876e-05, "loss": 4.5569, "step": 39100 }, { "epoch": 0.8007904490815637, "grad_norm": 0.29395386576652527, "learning_rate": 4.5808071690774064e-05, "loss": 4.5804, "step": 39105 }, { "epoch": 0.8008928388589683, "grad_norm": 0.27555522322654724, "learning_rate": 4.5763553596096104e-05, "loss": 4.6182, "step": 39110 }, { "epoch": 0.800995228636373, "grad_norm": 0.2816535532474518, "learning_rate": 4.5719053249934e-05, "loss": 4.5691, "step": 39115 }, { "epoch": 0.8010976184137776, "grad_norm": 0.28808650374412537, "learning_rate": 4.5674570659864717e-05, "loss": 4.6323, "step": 39120 }, { "epoch": 0.8012000081911822, "grad_norm": 0.28865253925323486, "learning_rate": 4.5630105833462526e-05, "loss": 4.5761, "step": 39125 }, { "epoch": 0.8013023979685868, "grad_norm": 0.30832570791244507, "learning_rate": 4.5585658778298485e-05, "loss": 4.6301, "step": 39130 }, { "epoch": 0.8014047877459914, "grad_norm": 0.290213018655777, "learning_rate": 4.554122950194062e-05, "loss": 4.6544, "step": 39135 }, { "epoch": 0.8015071775233961, "grad_norm": 0.2868783473968506, "learning_rate": 4.549681801195408e-05, "loss": 4.6037, "step": 39140 }, { "epoch": 0.8016095673008007, "grad_norm": 0.2926163375377655, "learning_rate": 4.545242431590081e-05, "loss": 4.6423, "step": 39145 }, { "epoch": 0.8017119570782053, "grad_norm": 0.2848108112812042, "learning_rate": 4.5408048421339796e-05, "loss": 4.5587, "step": 39150 }, { "epoch": 0.8018143468556099, "grad_norm": 0.35969194769859314, "learning_rate": 4.536369033582705e-05, "loss": 4.6215, "step": 39155 }, { "epoch": 0.8019167366330145, "grad_norm": 0.36385461688041687, "learning_rate": 4.531935006691548e-05, "loss": 4.5802, "step": 39160 }, { "epoch": 0.8020191264104192, "grad_norm": 0.2822389304637909, "learning_rate": 4.5275027622154966e-05, "loss": 4.6143, "step": 39165 }, { "epoch": 0.8021215161878238, "grad_norm": 0.2893722951412201, "learning_rate": 4.5230723009092326e-05, "loss": 4.5839, "step": 39170 }, { "epoch": 0.8022239059652284, "grad_norm": 0.28926926851272583, "learning_rate": 4.518643623527145e-05, "loss": 4.5928, "step": 39175 }, { "epoch": 0.802326295742633, "grad_norm": 0.27141159772872925, "learning_rate": 4.51421673082331e-05, "loss": 4.5484, "step": 39180 }, { "epoch": 0.8024286855200377, "grad_norm": 0.27838727831840515, "learning_rate": 4.509791623551498e-05, "loss": 4.6451, "step": 39185 }, { "epoch": 0.8025310752974423, "grad_norm": 0.2750069200992584, "learning_rate": 4.5053683024651856e-05, "loss": 4.5899, "step": 39190 }, { "epoch": 0.8026334650748469, "grad_norm": 0.30682578682899475, "learning_rate": 4.500946768317538e-05, "loss": 4.5936, "step": 39195 }, { "epoch": 0.8027358548522515, "grad_norm": 0.31645169854164124, "learning_rate": 4.496527021861411e-05, "loss": 4.5895, "step": 39200 }, { "epoch": 0.8028382446296561, "grad_norm": 0.2702699303627014, "learning_rate": 4.49210906384937e-05, "loss": 4.6033, "step": 39205 }, { "epoch": 0.8029406344070608, "grad_norm": 0.2715344727039337, "learning_rate": 4.487692895033666e-05, "loss": 4.5896, "step": 39210 }, { "epoch": 0.8030430241844654, "grad_norm": 0.27582502365112305, "learning_rate": 4.483278516166249e-05, "loss": 4.6032, "step": 39215 }, { "epoch": 0.80314541396187, "grad_norm": 0.27882251143455505, "learning_rate": 4.4788659279987535e-05, "loss": 4.6138, "step": 39220 }, { "epoch": 0.8032478037392746, "grad_norm": 0.2906031906604767, "learning_rate": 4.47445513128253e-05, "loss": 4.5029, "step": 39225 }, { "epoch": 0.8033501935166792, "grad_norm": 0.2661421000957489, "learning_rate": 4.4700461267686084e-05, "loss": 4.5761, "step": 39230 }, { "epoch": 0.8034525832940839, "grad_norm": 0.2729162871837616, "learning_rate": 4.4656389152077107e-05, "loss": 4.5717, "step": 39235 }, { "epoch": 0.8035549730714885, "grad_norm": 0.28579139709472656, "learning_rate": 4.461233497350278e-05, "loss": 4.6001, "step": 39240 }, { "epoch": 0.8036573628488932, "grad_norm": 0.2768882215023041, "learning_rate": 4.456829873946405e-05, "loss": 4.5897, "step": 39245 }, { "epoch": 0.8037597526262978, "grad_norm": 0.29882630705833435, "learning_rate": 4.452428045745915e-05, "loss": 4.5495, "step": 39250 }, { "epoch": 0.8038621424037025, "grad_norm": 0.284458726644516, "learning_rate": 4.448028013498325e-05, "loss": 4.5268, "step": 39255 }, { "epoch": 0.8039645321811071, "grad_norm": 0.27446749806404114, "learning_rate": 4.4436297779528174e-05, "loss": 4.5373, "step": 39260 }, { "epoch": 0.8040669219585117, "grad_norm": 0.27560141682624817, "learning_rate": 4.439233339858301e-05, "loss": 4.6377, "step": 39265 }, { "epoch": 0.8041693117359163, "grad_norm": 0.33306679129600525, "learning_rate": 4.43483869996336e-05, "loss": 4.5824, "step": 39270 }, { "epoch": 0.804271701513321, "grad_norm": 0.2933988869190216, "learning_rate": 4.43044585901627e-05, "loss": 4.5762, "step": 39275 }, { "epoch": 0.8043740912907256, "grad_norm": 0.28053733706474304, "learning_rate": 4.4260548177650204e-05, "loss": 4.5857, "step": 39280 }, { "epoch": 0.8044764810681302, "grad_norm": 0.32424768805503845, "learning_rate": 4.4216655769572757e-05, "loss": 4.554, "step": 39285 }, { "epoch": 0.8045788708455348, "grad_norm": 0.2807777523994446, "learning_rate": 4.417278137340392e-05, "loss": 4.6263, "step": 39290 }, { "epoch": 0.8046812606229394, "grad_norm": 0.29028016328811646, "learning_rate": 4.412892499661439e-05, "loss": 4.6218, "step": 39295 }, { "epoch": 0.804783650400344, "grad_norm": 0.2970632314682007, "learning_rate": 4.408508664667159e-05, "loss": 4.5771, "step": 39300 }, { "epoch": 0.8048860401777487, "grad_norm": 0.2803395390510559, "learning_rate": 4.4041266331039986e-05, "loss": 4.6243, "step": 39305 }, { "epoch": 0.8049884299551533, "grad_norm": 0.2990550696849823, "learning_rate": 4.3997464057180845e-05, "loss": 4.5418, "step": 39310 }, { "epoch": 0.8050908197325579, "grad_norm": 0.2797633409500122, "learning_rate": 4.395367983255259e-05, "loss": 4.5788, "step": 39315 }, { "epoch": 0.8051932095099625, "grad_norm": 0.26845577359199524, "learning_rate": 4.3909913664610355e-05, "loss": 4.562, "step": 39320 }, { "epoch": 0.8052955992873672, "grad_norm": 0.27030444145202637, "learning_rate": 4.386616556080625e-05, "loss": 4.5199, "step": 39325 }, { "epoch": 0.8053979890647718, "grad_norm": 0.2664375901222229, "learning_rate": 4.382243552858944e-05, "loss": 4.495, "step": 39330 }, { "epoch": 0.8055003788421764, "grad_norm": 0.31012603640556335, "learning_rate": 4.377872357540583e-05, "loss": 4.6348, "step": 39335 }, { "epoch": 0.805602768619581, "grad_norm": 0.2901231050491333, "learning_rate": 4.3735029708698325e-05, "loss": 4.6027, "step": 39340 }, { "epoch": 0.8057051583969856, "grad_norm": 0.2725827991962433, "learning_rate": 4.369135393590682e-05, "loss": 4.5934, "step": 39345 }, { "epoch": 0.8058075481743903, "grad_norm": 0.2755502164363861, "learning_rate": 4.364769626446803e-05, "loss": 4.6253, "step": 39350 }, { "epoch": 0.8059099379517949, "grad_norm": 0.3057175278663635, "learning_rate": 4.360405670181559e-05, "loss": 4.5737, "step": 39355 }, { "epoch": 0.8060123277291995, "grad_norm": 0.2964734137058258, "learning_rate": 4.356043525538008e-05, "loss": 4.5851, "step": 39360 }, { "epoch": 0.8061147175066041, "grad_norm": 0.2873198091983795, "learning_rate": 4.3516831932589056e-05, "loss": 4.6087, "step": 39365 }, { "epoch": 0.8062171072840087, "grad_norm": 0.2713853716850281, "learning_rate": 4.347324674086691e-05, "loss": 4.5689, "step": 39370 }, { "epoch": 0.8063194970614134, "grad_norm": 0.27605798840522766, "learning_rate": 4.34296796876349e-05, "loss": 4.605, "step": 39375 }, { "epoch": 0.806421886838818, "grad_norm": 0.27485141158103943, "learning_rate": 4.33861307803114e-05, "loss": 4.5899, "step": 39380 }, { "epoch": 0.8065242766162226, "grad_norm": 0.28419703245162964, "learning_rate": 4.3342600026311385e-05, "loss": 4.5744, "step": 39385 }, { "epoch": 0.8066266663936272, "grad_norm": 0.27581843733787537, "learning_rate": 4.329908743304699e-05, "loss": 4.5795, "step": 39390 }, { "epoch": 0.8067290561710319, "grad_norm": 0.2819613814353943, "learning_rate": 4.3255593007927273e-05, "loss": 4.6254, "step": 39395 }, { "epoch": 0.8068314459484365, "grad_norm": 0.3055405020713806, "learning_rate": 4.321211675835793e-05, "loss": 4.5892, "step": 39400 }, { "epoch": 0.8069338357258411, "grad_norm": 0.32575732469558716, "learning_rate": 4.316865869174184e-05, "loss": 4.5819, "step": 39405 }, { "epoch": 0.8070362255032457, "grad_norm": 0.3296785354614258, "learning_rate": 4.312521881547868e-05, "loss": 4.5235, "step": 39410 }, { "epoch": 0.8071386152806503, "grad_norm": 0.30570748448371887, "learning_rate": 4.308179713696497e-05, "loss": 4.5757, "step": 39415 }, { "epoch": 0.807241005058055, "grad_norm": 0.2826966345310211, "learning_rate": 4.303839366359426e-05, "loss": 4.5665, "step": 39420 }, { "epoch": 0.8073433948354596, "grad_norm": 0.2718033790588379, "learning_rate": 4.2995008402756915e-05, "loss": 4.578, "step": 39425 }, { "epoch": 0.8074457846128642, "grad_norm": 0.2811250686645508, "learning_rate": 4.295164136184022e-05, "loss": 4.6362, "step": 39430 }, { "epoch": 0.8075481743902688, "grad_norm": 0.3076514005661011, "learning_rate": 4.290829254822828e-05, "loss": 4.5803, "step": 39435 }, { "epoch": 0.8076505641676734, "grad_norm": 0.2890389859676361, "learning_rate": 4.286496196930228e-05, "loss": 4.5506, "step": 39440 }, { "epoch": 0.8077529539450782, "grad_norm": 0.28918203711509705, "learning_rate": 4.282164963244014e-05, "loss": 4.5758, "step": 39445 }, { "epoch": 0.8078553437224828, "grad_norm": 0.31487032771110535, "learning_rate": 4.2778355545016696e-05, "loss": 4.6039, "step": 39450 }, { "epoch": 0.8079577334998874, "grad_norm": 0.27409568428993225, "learning_rate": 4.273507971440376e-05, "loss": 4.5724, "step": 39455 }, { "epoch": 0.808060123277292, "grad_norm": 0.297097772359848, "learning_rate": 4.2691822147969985e-05, "loss": 4.5531, "step": 39460 }, { "epoch": 0.8081625130546967, "grad_norm": 0.2799051105976105, "learning_rate": 4.264858285308083e-05, "loss": 4.5839, "step": 39465 }, { "epoch": 0.8082649028321013, "grad_norm": 0.2736682891845703, "learning_rate": 4.260536183709882e-05, "loss": 4.5616, "step": 39470 }, { "epoch": 0.8083672926095059, "grad_norm": 0.282229483127594, "learning_rate": 4.256215910738322e-05, "loss": 4.5735, "step": 39475 }, { "epoch": 0.8084696823869105, "grad_norm": 0.2849828600883484, "learning_rate": 4.251897467129019e-05, "loss": 4.5599, "step": 39480 }, { "epoch": 0.8085720721643151, "grad_norm": 0.36352962255477905, "learning_rate": 4.247580853617291e-05, "loss": 4.5819, "step": 39485 }, { "epoch": 0.8086744619417198, "grad_norm": 0.29873931407928467, "learning_rate": 4.243266070938132e-05, "loss": 4.5764, "step": 39490 }, { "epoch": 0.8087768517191244, "grad_norm": 0.28227362036705017, "learning_rate": 4.238953119826225e-05, "loss": 4.5845, "step": 39495 }, { "epoch": 0.808879241496529, "grad_norm": 0.3054857552051544, "learning_rate": 4.234642001015939e-05, "loss": 4.6181, "step": 39500 }, { "epoch": 0.8089816312739336, "grad_norm": 0.2842195928096771, "learning_rate": 4.230332715241346e-05, "loss": 4.6409, "step": 39505 }, { "epoch": 0.8090840210513383, "grad_norm": 0.2869653105735779, "learning_rate": 4.226025263236192e-05, "loss": 4.5387, "step": 39510 }, { "epoch": 0.8091864108287429, "grad_norm": 0.2856428921222687, "learning_rate": 4.221719645733905e-05, "loss": 4.5624, "step": 39515 }, { "epoch": 0.8092888006061475, "grad_norm": 0.2736533284187317, "learning_rate": 4.217415863467627e-05, "loss": 4.6077, "step": 39520 }, { "epoch": 0.8093911903835521, "grad_norm": 0.284796804189682, "learning_rate": 4.213113917170152e-05, "loss": 4.6018, "step": 39525 }, { "epoch": 0.8094935801609567, "grad_norm": 0.27927646040916443, "learning_rate": 4.208813807573989e-05, "loss": 4.569, "step": 39530 }, { "epoch": 0.8095959699383614, "grad_norm": 0.2721383571624756, "learning_rate": 4.20451553541133e-05, "loss": 4.5113, "step": 39535 }, { "epoch": 0.809698359715766, "grad_norm": 0.2773188054561615, "learning_rate": 4.2002191014140366e-05, "loss": 4.582, "step": 39540 }, { "epoch": 0.8098007494931706, "grad_norm": 0.33200955390930176, "learning_rate": 4.19592450631368e-05, "loss": 4.5949, "step": 39545 }, { "epoch": 0.8099031392705752, "grad_norm": 0.28897106647491455, "learning_rate": 4.1916317508415044e-05, "loss": 4.4903, "step": 39550 }, { "epoch": 0.8100055290479798, "grad_norm": 0.2980147898197174, "learning_rate": 4.1873408357284406e-05, "loss": 4.5857, "step": 39555 }, { "epoch": 0.8101079188253845, "grad_norm": 0.28293874859809875, "learning_rate": 4.1830517617051175e-05, "loss": 4.5404, "step": 39560 }, { "epoch": 0.8102103086027891, "grad_norm": 0.28592419624328613, "learning_rate": 4.178764529501839e-05, "loss": 4.615, "step": 39565 }, { "epoch": 0.8103126983801937, "grad_norm": 0.2894458770751953, "learning_rate": 4.1744791398486016e-05, "loss": 4.5859, "step": 39570 }, { "epoch": 0.8104150881575983, "grad_norm": 0.2806173861026764, "learning_rate": 4.170195593475078e-05, "loss": 4.6081, "step": 39575 }, { "epoch": 0.810517477935003, "grad_norm": 0.35036706924438477, "learning_rate": 4.165913891110646e-05, "loss": 4.6226, "step": 39580 }, { "epoch": 0.8106198677124076, "grad_norm": 0.28919950127601624, "learning_rate": 4.161634033484352e-05, "loss": 4.6167, "step": 39585 }, { "epoch": 0.8107222574898122, "grad_norm": 0.2950119376182556, "learning_rate": 4.157356021324932e-05, "loss": 4.6162, "step": 39590 }, { "epoch": 0.8108246472672168, "grad_norm": 0.2698840796947479, "learning_rate": 4.1530798553608205e-05, "loss": 4.6037, "step": 39595 }, { "epoch": 0.8109270370446214, "grad_norm": 0.3377280831336975, "learning_rate": 4.14880553632012e-05, "loss": 4.5657, "step": 39600 }, { "epoch": 0.8110294268220261, "grad_norm": 0.2888646125793457, "learning_rate": 4.1445330649306214e-05, "loss": 4.5684, "step": 39605 }, { "epoch": 0.8111318165994307, "grad_norm": 0.2981690764427185, "learning_rate": 4.140262441919818e-05, "loss": 4.5822, "step": 39610 }, { "epoch": 0.8112342063768353, "grad_norm": 0.2968234717845917, "learning_rate": 4.1359936680148675e-05, "loss": 4.586, "step": 39615 }, { "epoch": 0.8113365961542399, "grad_norm": 0.3319108486175537, "learning_rate": 4.131726743942623e-05, "loss": 4.559, "step": 39620 }, { "epoch": 0.8114389859316445, "grad_norm": 0.28827497363090515, "learning_rate": 4.127461670429617e-05, "loss": 4.6009, "step": 39625 }, { "epoch": 0.8115413757090492, "grad_norm": 0.26755833625793457, "learning_rate": 4.12319844820208e-05, "loss": 4.5762, "step": 39630 }, { "epoch": 0.8116437654864538, "grad_norm": 0.27354004979133606, "learning_rate": 4.1189370779859115e-05, "loss": 4.5482, "step": 39635 }, { "epoch": 0.8117461552638584, "grad_norm": 0.2695753872394562, "learning_rate": 4.1146775605067004e-05, "loss": 4.56, "step": 39640 }, { "epoch": 0.811848545041263, "grad_norm": 0.28312042355537415, "learning_rate": 4.1104198964897283e-05, "loss": 4.5647, "step": 39645 }, { "epoch": 0.8119509348186678, "grad_norm": 0.27489081025123596, "learning_rate": 4.106164086659953e-05, "loss": 4.6066, "step": 39650 }, { "epoch": 0.8120533245960724, "grad_norm": 0.2789163291454315, "learning_rate": 4.1019101317420106e-05, "loss": 4.616, "step": 39655 }, { "epoch": 0.812155714373477, "grad_norm": 0.2964979410171509, "learning_rate": 4.097658032460246e-05, "loss": 4.63, "step": 39660 }, { "epoch": 0.8122581041508816, "grad_norm": 0.29779788851737976, "learning_rate": 4.0934077895386516e-05, "loss": 4.594, "step": 39665 }, { "epoch": 0.8123604939282862, "grad_norm": 0.271930068731308, "learning_rate": 4.0891594037009326e-05, "loss": 4.5532, "step": 39670 }, { "epoch": 0.8124628837056909, "grad_norm": 0.272431343793869, "learning_rate": 4.084912875670478e-05, "loss": 4.6512, "step": 39675 }, { "epoch": 0.8125652734830955, "grad_norm": 0.2750728726387024, "learning_rate": 4.0806682061703336e-05, "loss": 4.5432, "step": 39680 }, { "epoch": 0.8126676632605001, "grad_norm": 0.2824062705039978, "learning_rate": 4.076425395923259e-05, "loss": 4.584, "step": 39685 }, { "epoch": 0.8127700530379047, "grad_norm": 0.28390562534332275, "learning_rate": 4.072184445651681e-05, "loss": 4.5837, "step": 39690 }, { "epoch": 0.8128724428153093, "grad_norm": 0.27793005108833313, "learning_rate": 4.067945356077709e-05, "loss": 4.5361, "step": 39695 }, { "epoch": 0.812974832592714, "grad_norm": 0.30021142959594727, "learning_rate": 4.063708127923149e-05, "loss": 4.6323, "step": 39700 }, { "epoch": 0.8130772223701186, "grad_norm": 1.442164659500122, "learning_rate": 4.0594727619094755e-05, "loss": 4.5769, "step": 39705 }, { "epoch": 0.8131796121475232, "grad_norm": 0.29034852981567383, "learning_rate": 4.055239258757852e-05, "loss": 4.5869, "step": 39710 }, { "epoch": 0.8132820019249278, "grad_norm": 0.2804703116416931, "learning_rate": 4.051007619189119e-05, "loss": 4.6193, "step": 39715 }, { "epoch": 0.8133843917023325, "grad_norm": 0.28716444969177246, "learning_rate": 4.046777843923817e-05, "loss": 4.5762, "step": 39720 }, { "epoch": 0.8134867814797371, "grad_norm": 0.2703050673007965, "learning_rate": 4.0425499336821484e-05, "loss": 4.5967, "step": 39725 }, { "epoch": 0.8135891712571417, "grad_norm": 0.2838965356349945, "learning_rate": 4.038323889184004e-05, "loss": 4.5966, "step": 39730 }, { "epoch": 0.8136915610345463, "grad_norm": 0.28783389925956726, "learning_rate": 4.034099711148968e-05, "loss": 4.5273, "step": 39735 }, { "epoch": 0.8137939508119509, "grad_norm": 0.2864513695240021, "learning_rate": 4.0298774002962945e-05, "loss": 4.5948, "step": 39740 }, { "epoch": 0.8138963405893556, "grad_norm": 0.3301258087158203, "learning_rate": 4.0256569573449185e-05, "loss": 4.6003, "step": 39745 }, { "epoch": 0.8139987303667602, "grad_norm": 0.2918735444545746, "learning_rate": 4.0214383830134734e-05, "loss": 4.5969, "step": 39750 }, { "epoch": 0.8141011201441648, "grad_norm": 0.2782702147960663, "learning_rate": 4.0172216780202534e-05, "loss": 4.6031, "step": 39755 }, { "epoch": 0.8142035099215694, "grad_norm": 0.29124337434768677, "learning_rate": 4.013006843083249e-05, "loss": 4.5927, "step": 39760 }, { "epoch": 0.814305899698974, "grad_norm": 0.2667020559310913, "learning_rate": 4.008793878920119e-05, "loss": 4.5262, "step": 39765 }, { "epoch": 0.8144082894763787, "grad_norm": 0.28633174300193787, "learning_rate": 4.0045827862482234e-05, "loss": 4.4901, "step": 39770 }, { "epoch": 0.8145106792537833, "grad_norm": 0.27973079681396484, "learning_rate": 4.000373565784586e-05, "loss": 4.566, "step": 39775 }, { "epoch": 0.8146130690311879, "grad_norm": 0.27879223227500916, "learning_rate": 3.996166218245915e-05, "loss": 4.5679, "step": 39780 }, { "epoch": 0.8147154588085925, "grad_norm": 0.300350159406662, "learning_rate": 3.9919607443486084e-05, "loss": 4.6046, "step": 39785 }, { "epoch": 0.8148178485859972, "grad_norm": 0.27650022506713867, "learning_rate": 3.9877571448087395e-05, "loss": 4.5557, "step": 39790 }, { "epoch": 0.8149202383634018, "grad_norm": 0.2842326760292053, "learning_rate": 3.983555420342054e-05, "loss": 4.5814, "step": 39795 }, { "epoch": 0.8150226281408064, "grad_norm": 0.2776239216327667, "learning_rate": 3.979355571664002e-05, "loss": 4.6139, "step": 39800 }, { "epoch": 0.815125017918211, "grad_norm": 0.28626564145088196, "learning_rate": 3.975157599489682e-05, "loss": 4.5889, "step": 39805 }, { "epoch": 0.8152274076956156, "grad_norm": 0.2799535095691681, "learning_rate": 3.970961504533899e-05, "loss": 4.5765, "step": 39810 }, { "epoch": 0.8153297974730203, "grad_norm": 0.30687370896339417, "learning_rate": 3.9667672875111286e-05, "loss": 4.5923, "step": 39815 }, { "epoch": 0.8154321872504249, "grad_norm": 0.29484468698501587, "learning_rate": 3.962574949135521e-05, "loss": 4.5662, "step": 39820 }, { "epoch": 0.8155345770278295, "grad_norm": 0.33060598373413086, "learning_rate": 3.9583844901209214e-05, "loss": 4.5596, "step": 39825 }, { "epoch": 0.8156369668052341, "grad_norm": 0.2842893600463867, "learning_rate": 3.9541959111808433e-05, "loss": 4.5925, "step": 39830 }, { "epoch": 0.8157393565826387, "grad_norm": 0.2787351906299591, "learning_rate": 3.95000921302848e-05, "loss": 4.5817, "step": 39835 }, { "epoch": 0.8158417463600434, "grad_norm": 0.3185434639453888, "learning_rate": 3.94582439637671e-05, "loss": 4.5792, "step": 39840 }, { "epoch": 0.815944136137448, "grad_norm": 0.27078425884246826, "learning_rate": 3.941641461938094e-05, "loss": 4.5839, "step": 39845 }, { "epoch": 0.8160465259148527, "grad_norm": 0.2745194733142853, "learning_rate": 3.937460410424859e-05, "loss": 4.5662, "step": 39850 }, { "epoch": 0.8161489156922573, "grad_norm": 0.2862422466278076, "learning_rate": 3.933281242548921e-05, "loss": 4.629, "step": 39855 }, { "epoch": 0.816251305469662, "grad_norm": 0.26588085293769836, "learning_rate": 3.929103959021879e-05, "loss": 4.5757, "step": 39860 }, { "epoch": 0.8163536952470666, "grad_norm": 0.29439637064933777, "learning_rate": 3.9249285605550044e-05, "loss": 4.5791, "step": 39865 }, { "epoch": 0.8164560850244712, "grad_norm": 0.2926248610019684, "learning_rate": 3.9207550478592416e-05, "loss": 4.6052, "step": 39870 }, { "epoch": 0.8165584748018758, "grad_norm": 0.27602916955947876, "learning_rate": 3.916583421645235e-05, "loss": 4.5794, "step": 39875 }, { "epoch": 0.8166608645792804, "grad_norm": 0.28453734517097473, "learning_rate": 3.912413682623287e-05, "loss": 4.5508, "step": 39880 }, { "epoch": 0.8167632543566851, "grad_norm": 0.2766074538230896, "learning_rate": 3.908245831503383e-05, "loss": 4.5613, "step": 39885 }, { "epoch": 0.8168656441340897, "grad_norm": 0.27316877245903015, "learning_rate": 3.904079868995197e-05, "loss": 4.5547, "step": 39890 }, { "epoch": 0.8169680339114943, "grad_norm": 0.29508209228515625, "learning_rate": 3.899915795808072e-05, "loss": 4.6335, "step": 39895 }, { "epoch": 0.8170704236888989, "grad_norm": 0.28743311762809753, "learning_rate": 3.895753612651032e-05, "loss": 4.5782, "step": 39900 }, { "epoch": 0.8171728134663035, "grad_norm": 0.3072720468044281, "learning_rate": 3.891593320232772e-05, "loss": 4.6246, "step": 39905 }, { "epoch": 0.8172752032437082, "grad_norm": 0.2878606915473938, "learning_rate": 3.8874349192616826e-05, "loss": 4.6083, "step": 39910 }, { "epoch": 0.8173775930211128, "grad_norm": 0.27286794781684875, "learning_rate": 3.883278410445819e-05, "loss": 4.6387, "step": 39915 }, { "epoch": 0.8174799827985174, "grad_norm": 0.27541738748550415, "learning_rate": 3.879123794492909e-05, "loss": 4.5475, "step": 39920 }, { "epoch": 0.817582372575922, "grad_norm": 3.955681800842285, "learning_rate": 3.874971072110377e-05, "loss": 4.5906, "step": 39925 }, { "epoch": 0.8176847623533267, "grad_norm": 0.2810393273830414, "learning_rate": 3.870820244005311e-05, "loss": 4.5912, "step": 39930 }, { "epoch": 0.8177871521307313, "grad_norm": 0.318101167678833, "learning_rate": 3.866671310884472e-05, "loss": 4.5904, "step": 39935 }, { "epoch": 0.8178895419081359, "grad_norm": 0.30847010016441345, "learning_rate": 3.86252427345432e-05, "loss": 4.6128, "step": 39940 }, { "epoch": 0.8179919316855405, "grad_norm": 0.2958615720272064, "learning_rate": 3.858379132420962e-05, "loss": 4.5609, "step": 39945 }, { "epoch": 0.8180943214629451, "grad_norm": 0.27650511264801025, "learning_rate": 3.8542358884902093e-05, "loss": 4.6257, "step": 39950 }, { "epoch": 0.8181967112403498, "grad_norm": 0.27521902322769165, "learning_rate": 3.850094542367537e-05, "loss": 4.6032, "step": 39955 }, { "epoch": 0.8182991010177544, "grad_norm": 0.29434481263160706, "learning_rate": 3.8459550947580925e-05, "loss": 4.577, "step": 39960 }, { "epoch": 0.818401490795159, "grad_norm": 0.3031090795993805, "learning_rate": 3.841817546366716e-05, "loss": 4.6189, "step": 39965 }, { "epoch": 0.8185038805725636, "grad_norm": 0.2746244966983795, "learning_rate": 3.837681897897912e-05, "loss": 4.6039, "step": 39970 }, { "epoch": 0.8186062703499682, "grad_norm": 0.28014394640922546, "learning_rate": 3.833548150055858e-05, "loss": 4.5713, "step": 39975 }, { "epoch": 0.8187086601273729, "grad_norm": 0.27805647253990173, "learning_rate": 3.829416303544423e-05, "loss": 4.5126, "step": 39980 }, { "epoch": 0.8188110499047775, "grad_norm": 0.3021852672100067, "learning_rate": 3.82528635906714e-05, "loss": 4.6194, "step": 39985 }, { "epoch": 0.8189134396821821, "grad_norm": 0.2743503451347351, "learning_rate": 3.8211583173272216e-05, "loss": 4.5727, "step": 39990 }, { "epoch": 0.8190158294595867, "grad_norm": 0.28668925166130066, "learning_rate": 3.817032179027551e-05, "loss": 4.598, "step": 39995 }, { "epoch": 0.8191182192369914, "grad_norm": 0.2746945023536682, "learning_rate": 3.812907944870703e-05, "loss": 4.6093, "step": 40000 }, { "epoch": 0.8191182192369914, "eval_loss": 4.598438262939453, "eval_runtime": 17.1327, "eval_samples_per_second": 33.97, "eval_steps_per_second": 4.261, "step": 40000 }, { "epoch": 0.819220609014396, "grad_norm": 0.3382457196712494, "learning_rate": 3.8087856155589135e-05, "loss": 4.6227, "step": 40005 }, { "epoch": 0.8193229987918006, "grad_norm": 0.2767484486103058, "learning_rate": 3.8046651917940923e-05, "loss": 4.5871, "step": 40010 }, { "epoch": 0.8194253885692052, "grad_norm": 0.30373379588127136, "learning_rate": 3.800546674277842e-05, "loss": 4.4867, "step": 40015 }, { "epoch": 0.8195277783466098, "grad_norm": 0.28333204984664917, "learning_rate": 3.796430063711423e-05, "loss": 4.5592, "step": 40020 }, { "epoch": 0.8196301681240145, "grad_norm": 0.3391062915325165, "learning_rate": 3.792315360795774e-05, "loss": 4.553, "step": 40025 }, { "epoch": 0.8197325579014191, "grad_norm": 0.8202936053276062, "learning_rate": 3.7882025662315223e-05, "loss": 4.6518, "step": 40030 }, { "epoch": 0.8198349476788237, "grad_norm": 0.2817802131175995, "learning_rate": 3.784091680718954e-05, "loss": 4.5637, "step": 40035 }, { "epoch": 0.8199373374562283, "grad_norm": 0.2807997465133667, "learning_rate": 3.779982704958038e-05, "loss": 4.5998, "step": 40040 }, { "epoch": 0.8200397272336329, "grad_norm": 0.28021693229675293, "learning_rate": 3.7758756396484116e-05, "loss": 4.5725, "step": 40045 }, { "epoch": 0.8201421170110376, "grad_norm": 0.296805202960968, "learning_rate": 3.771770485489401e-05, "loss": 4.6079, "step": 40050 }, { "epoch": 0.8202445067884423, "grad_norm": 0.2737254500389099, "learning_rate": 3.767667243179991e-05, "loss": 4.6077, "step": 40055 }, { "epoch": 0.8203468965658469, "grad_norm": 0.3445216417312622, "learning_rate": 3.763565913418848e-05, "loss": 4.5476, "step": 40060 }, { "epoch": 0.8204492863432515, "grad_norm": 0.2809492349624634, "learning_rate": 3.759466496904316e-05, "loss": 4.6055, "step": 40065 }, { "epoch": 0.8205516761206562, "grad_norm": 0.2805745601654053, "learning_rate": 3.7553689943344086e-05, "loss": 4.5754, "step": 40070 }, { "epoch": 0.8206540658980608, "grad_norm": 0.27043336629867554, "learning_rate": 3.751273406406807e-05, "loss": 4.6063, "step": 40075 }, { "epoch": 0.8207564556754654, "grad_norm": 0.28540995717048645, "learning_rate": 3.747179733818892e-05, "loss": 4.5984, "step": 40080 }, { "epoch": 0.82085884545287, "grad_norm": 0.2703198492527008, "learning_rate": 3.743087977267678e-05, "loss": 4.6234, "step": 40085 }, { "epoch": 0.8209612352302746, "grad_norm": 0.28417128324508667, "learning_rate": 3.73899813744989e-05, "loss": 4.5485, "step": 40090 }, { "epoch": 0.8210636250076793, "grad_norm": 0.2963154911994934, "learning_rate": 3.734910215061907e-05, "loss": 4.5689, "step": 40095 }, { "epoch": 0.8211660147850839, "grad_norm": 0.2891373634338379, "learning_rate": 3.730824210799784e-05, "loss": 4.5853, "step": 40100 }, { "epoch": 0.8212684045624885, "grad_norm": 0.28095632791519165, "learning_rate": 3.726740125359257e-05, "loss": 4.6735, "step": 40105 }, { "epoch": 0.8213707943398931, "grad_norm": 0.2856515645980835, "learning_rate": 3.722657959435731e-05, "loss": 4.6088, "step": 40110 }, { "epoch": 0.8214731841172978, "grad_norm": 0.28138381242752075, "learning_rate": 3.718577713724274e-05, "loss": 4.6019, "step": 40115 }, { "epoch": 0.8215755738947024, "grad_norm": 0.2722416818141937, "learning_rate": 3.714499388919647e-05, "loss": 4.5733, "step": 40120 }, { "epoch": 0.821677963672107, "grad_norm": 0.2986077070236206, "learning_rate": 3.7104229857162705e-05, "loss": 4.6036, "step": 40125 }, { "epoch": 0.8217803534495116, "grad_norm": 0.2788534164428711, "learning_rate": 3.706348504808236e-05, "loss": 4.6201, "step": 40130 }, { "epoch": 0.8218827432269162, "grad_norm": 0.2822936177253723, "learning_rate": 3.7022759468893135e-05, "loss": 4.58, "step": 40135 }, { "epoch": 0.8219851330043209, "grad_norm": 0.28002282977104187, "learning_rate": 3.6982053126529495e-05, "loss": 4.6483, "step": 40140 }, { "epoch": 0.8220875227817255, "grad_norm": 0.28126972913742065, "learning_rate": 3.6941366027922535e-05, "loss": 4.5892, "step": 40145 }, { "epoch": 0.8221899125591301, "grad_norm": 0.2815869450569153, "learning_rate": 3.690069818000009e-05, "loss": 4.5944, "step": 40150 }, { "epoch": 0.8222923023365347, "grad_norm": 0.27708324790000916, "learning_rate": 3.6860049589686804e-05, "loss": 4.5795, "step": 40155 }, { "epoch": 0.8223946921139393, "grad_norm": 0.28458738327026367, "learning_rate": 3.6819420263903974e-05, "loss": 4.5768, "step": 40160 }, { "epoch": 0.822497081891344, "grad_norm": 0.27662721276283264, "learning_rate": 3.6778810209569546e-05, "loss": 4.6496, "step": 40165 }, { "epoch": 0.8225994716687486, "grad_norm": 0.2832746207714081, "learning_rate": 3.6738219433598373e-05, "loss": 4.6161, "step": 40170 }, { "epoch": 0.8227018614461532, "grad_norm": 0.28657278418540955, "learning_rate": 3.6697647942901865e-05, "loss": 4.5579, "step": 40175 }, { "epoch": 0.8228042512235578, "grad_norm": 0.30123379826545715, "learning_rate": 3.6657095744388185e-05, "loss": 4.5708, "step": 40180 }, { "epoch": 0.8229066410009624, "grad_norm": 0.2788209021091461, "learning_rate": 3.661656284496219e-05, "loss": 4.5561, "step": 40185 }, { "epoch": 0.8230090307783671, "grad_norm": 0.2665553092956543, "learning_rate": 3.657604925152558e-05, "loss": 4.5606, "step": 40190 }, { "epoch": 0.8231114205557717, "grad_norm": 0.2775776982307434, "learning_rate": 3.6535554970976626e-05, "loss": 4.59, "step": 40195 }, { "epoch": 0.8232138103331763, "grad_norm": 0.28505730628967285, "learning_rate": 3.64950800102103e-05, "loss": 4.5835, "step": 40200 }, { "epoch": 0.8233162001105809, "grad_norm": 0.28103506565093994, "learning_rate": 3.645462437611845e-05, "loss": 4.6265, "step": 40205 }, { "epoch": 0.8234185898879856, "grad_norm": 0.282204270362854, "learning_rate": 3.641418807558946e-05, "loss": 4.5734, "step": 40210 }, { "epoch": 0.8235209796653902, "grad_norm": 0.278434693813324, "learning_rate": 3.637377111550847e-05, "loss": 4.5609, "step": 40215 }, { "epoch": 0.8236233694427948, "grad_norm": 0.28934016823768616, "learning_rate": 3.633337350275745e-05, "loss": 4.5745, "step": 40220 }, { "epoch": 0.8237257592201994, "grad_norm": 0.27980026602745056, "learning_rate": 3.6292995244214804e-05, "loss": 4.6031, "step": 40225 }, { "epoch": 0.823828148997604, "grad_norm": 0.29846495389938354, "learning_rate": 3.6252636346755954e-05, "loss": 4.5965, "step": 40230 }, { "epoch": 0.8239305387750087, "grad_norm": 0.29432082176208496, "learning_rate": 3.6212296817252795e-05, "loss": 4.5886, "step": 40235 }, { "epoch": 0.8240329285524133, "grad_norm": 0.2681111693382263, "learning_rate": 3.6171976662574e-05, "loss": 4.5495, "step": 40240 }, { "epoch": 0.8241353183298179, "grad_norm": 0.27589935064315796, "learning_rate": 3.613167588958502e-05, "loss": 4.527, "step": 40245 }, { "epoch": 0.8242377081072225, "grad_norm": 0.3165608048439026, "learning_rate": 3.609139450514791e-05, "loss": 4.5943, "step": 40250 }, { "epoch": 0.8243400978846273, "grad_norm": 0.2817896604537964, "learning_rate": 3.605113251612137e-05, "loss": 4.5478, "step": 40255 }, { "epoch": 0.8244424876620319, "grad_norm": 0.294752299785614, "learning_rate": 3.6010889929361e-05, "loss": 4.5586, "step": 40260 }, { "epoch": 0.8245448774394365, "grad_norm": 0.28963619470596313, "learning_rate": 3.59706667517189e-05, "loss": 4.5754, "step": 40265 }, { "epoch": 0.8246472672168411, "grad_norm": 0.2832202911376953, "learning_rate": 3.593046299004397e-05, "loss": 4.6675, "step": 40270 }, { "epoch": 0.8247496569942457, "grad_norm": 0.291576087474823, "learning_rate": 3.58902786511817e-05, "loss": 4.5963, "step": 40275 }, { "epoch": 0.8248520467716504, "grad_norm": 0.2860012650489807, "learning_rate": 3.5850113741974454e-05, "loss": 4.5391, "step": 40280 }, { "epoch": 0.824954436549055, "grad_norm": 0.29083725810050964, "learning_rate": 3.580996826926113e-05, "loss": 4.5844, "step": 40285 }, { "epoch": 0.8250568263264596, "grad_norm": 0.26927658915519714, "learning_rate": 3.5769842239877304e-05, "loss": 4.6215, "step": 40290 }, { "epoch": 0.8251592161038642, "grad_norm": 0.27664250135421753, "learning_rate": 3.572973566065541e-05, "loss": 4.5786, "step": 40295 }, { "epoch": 0.8252616058812688, "grad_norm": 0.28033411502838135, "learning_rate": 3.568964853842442e-05, "loss": 4.5612, "step": 40300 }, { "epoch": 0.8253639956586735, "grad_norm": 0.2839049696922302, "learning_rate": 3.564958088000997e-05, "loss": 4.5493, "step": 40305 }, { "epoch": 0.8254663854360781, "grad_norm": 0.3106590509414673, "learning_rate": 3.560953269223457e-05, "loss": 4.3958, "step": 40310 }, { "epoch": 0.8255687752134827, "grad_norm": 0.2771055996417999, "learning_rate": 3.556950398191722e-05, "loss": 4.6082, "step": 40315 }, { "epoch": 0.8256711649908873, "grad_norm": 0.28918710350990295, "learning_rate": 3.552949475587368e-05, "loss": 4.5915, "step": 40320 }, { "epoch": 0.825773554768292, "grad_norm": 0.28916171193122864, "learning_rate": 3.548950502091637e-05, "loss": 4.5727, "step": 40325 }, { "epoch": 0.8258759445456966, "grad_norm": 0.27592727541923523, "learning_rate": 3.544953478385445e-05, "loss": 4.5593, "step": 40330 }, { "epoch": 0.8259783343231012, "grad_norm": 0.27757173776626587, "learning_rate": 3.5409584051493725e-05, "loss": 4.5804, "step": 40335 }, { "epoch": 0.8260807241005058, "grad_norm": 0.27432388067245483, "learning_rate": 3.536965283063661e-05, "loss": 4.5884, "step": 40340 }, { "epoch": 0.8261831138779104, "grad_norm": 0.28754088282585144, "learning_rate": 3.532974112808233e-05, "loss": 4.5547, "step": 40345 }, { "epoch": 0.8262855036553151, "grad_norm": 0.3026539981365204, "learning_rate": 3.528984895062672e-05, "loss": 4.6021, "step": 40350 }, { "epoch": 0.8263878934327197, "grad_norm": 0.28035303950309753, "learning_rate": 3.5249976305062186e-05, "loss": 4.5682, "step": 40355 }, { "epoch": 0.8264902832101243, "grad_norm": 0.270597368478775, "learning_rate": 3.521012319817809e-05, "loss": 4.5731, "step": 40360 }, { "epoch": 0.8265926729875289, "grad_norm": 0.28226950764656067, "learning_rate": 3.517028963676008e-05, "loss": 4.5483, "step": 40365 }, { "epoch": 0.8266950627649335, "grad_norm": 0.28886494040489197, "learning_rate": 3.513047562759085e-05, "loss": 4.5326, "step": 40370 }, { "epoch": 0.8267974525423382, "grad_norm": 0.29873228073120117, "learning_rate": 3.509068117744953e-05, "loss": 4.5491, "step": 40375 }, { "epoch": 0.8268998423197428, "grad_norm": 0.2729114890098572, "learning_rate": 3.505090629311194e-05, "loss": 4.6089, "step": 40380 }, { "epoch": 0.8270022320971474, "grad_norm": 0.33923688530921936, "learning_rate": 3.5011150981350713e-05, "loss": 4.5897, "step": 40385 }, { "epoch": 0.827104621874552, "grad_norm": 0.3152208626270294, "learning_rate": 3.497141524893502e-05, "loss": 4.6187, "step": 40390 }, { "epoch": 0.8272070116519566, "grad_norm": 0.28245314955711365, "learning_rate": 3.4931699102630686e-05, "loss": 4.5578, "step": 40395 }, { "epoch": 0.8273094014293613, "grad_norm": 0.29945963621139526, "learning_rate": 3.4892002549200316e-05, "loss": 4.6068, "step": 40400 }, { "epoch": 0.8274117912067659, "grad_norm": 0.2750268280506134, "learning_rate": 3.4852325595403074e-05, "loss": 4.566, "step": 40405 }, { "epoch": 0.8275141809841705, "grad_norm": 0.31208157539367676, "learning_rate": 3.481266824799481e-05, "loss": 4.4874, "step": 40410 }, { "epoch": 0.8276165707615751, "grad_norm": 0.30593588948249817, "learning_rate": 3.477303051372805e-05, "loss": 4.5458, "step": 40415 }, { "epoch": 0.8277189605389798, "grad_norm": 0.3171047270298004, "learning_rate": 3.473341239935201e-05, "loss": 4.6186, "step": 40420 }, { "epoch": 0.8278213503163844, "grad_norm": 0.279586523771286, "learning_rate": 3.46938139116125e-05, "loss": 4.5988, "step": 40425 }, { "epoch": 0.827923740093789, "grad_norm": 0.2898420989513397, "learning_rate": 3.4654235057252014e-05, "loss": 4.6204, "step": 40430 }, { "epoch": 0.8280261298711936, "grad_norm": 0.28708988428115845, "learning_rate": 3.4614675843009796e-05, "loss": 4.576, "step": 40435 }, { "epoch": 0.8281285196485982, "grad_norm": 0.29046791791915894, "learning_rate": 3.4575136275621525e-05, "loss": 4.5745, "step": 40440 }, { "epoch": 0.8282309094260029, "grad_norm": 0.2747562825679779, "learning_rate": 3.4535616361819724e-05, "loss": 4.6188, "step": 40445 }, { "epoch": 0.8283332992034075, "grad_norm": 0.29619061946868896, "learning_rate": 3.449611610833358e-05, "loss": 4.6506, "step": 40450 }, { "epoch": 0.8284356889808121, "grad_norm": 0.30195343494415283, "learning_rate": 3.445663552188882e-05, "loss": 4.5114, "step": 40455 }, { "epoch": 0.8285380787582168, "grad_norm": 0.29632195830345154, "learning_rate": 3.4417174609207856e-05, "loss": 4.5947, "step": 40460 }, { "epoch": 0.8286404685356215, "grad_norm": 0.2991366684436798, "learning_rate": 3.437773337700976e-05, "loss": 4.5803, "step": 40465 }, { "epoch": 0.8287428583130261, "grad_norm": 0.2884520888328552, "learning_rate": 3.4338311832010296e-05, "loss": 4.5416, "step": 40470 }, { "epoch": 0.8288452480904307, "grad_norm": 0.27997058629989624, "learning_rate": 3.429890998092182e-05, "loss": 4.5782, "step": 40475 }, { "epoch": 0.8289476378678353, "grad_norm": 0.2756667137145996, "learning_rate": 3.4259527830453294e-05, "loss": 4.5538, "step": 40480 }, { "epoch": 0.8290500276452399, "grad_norm": 0.2803933620452881, "learning_rate": 3.422016538731049e-05, "loss": 4.5603, "step": 40485 }, { "epoch": 0.8291524174226446, "grad_norm": 0.2907135784626007, "learning_rate": 3.418082265819566e-05, "loss": 4.5366, "step": 40490 }, { "epoch": 0.8292548072000492, "grad_norm": 0.48615598678588867, "learning_rate": 3.414149964980772e-05, "loss": 4.5828, "step": 40495 }, { "epoch": 0.8293571969774538, "grad_norm": 0.27886176109313965, "learning_rate": 3.410219636884239e-05, "loss": 4.5782, "step": 40500 }, { "epoch": 0.8294595867548584, "grad_norm": 0.28133606910705566, "learning_rate": 3.4062912821991734e-05, "loss": 4.577, "step": 40505 }, { "epoch": 0.829561976532263, "grad_norm": 0.2881931960582733, "learning_rate": 3.4023649015944756e-05, "loss": 4.5106, "step": 40510 }, { "epoch": 0.8296643663096677, "grad_norm": 0.2737469971179962, "learning_rate": 3.398440495738692e-05, "loss": 4.5582, "step": 40515 }, { "epoch": 0.8297667560870723, "grad_norm": 0.2830897271633148, "learning_rate": 3.3945180653000366e-05, "loss": 4.5933, "step": 40520 }, { "epoch": 0.8298691458644769, "grad_norm": 0.2840888798236847, "learning_rate": 3.3905976109463936e-05, "loss": 4.5815, "step": 40525 }, { "epoch": 0.8299715356418815, "grad_norm": 0.293977290391922, "learning_rate": 3.386679133345302e-05, "loss": 4.5645, "step": 40530 }, { "epoch": 0.8300739254192862, "grad_norm": 0.2870458960533142, "learning_rate": 3.3827626331639635e-05, "loss": 4.583, "step": 40535 }, { "epoch": 0.8301763151966908, "grad_norm": 0.284534752368927, "learning_rate": 3.378848111069256e-05, "loss": 4.5753, "step": 40540 }, { "epoch": 0.8302787049740954, "grad_norm": 0.2755497097969055, "learning_rate": 3.3749355677277085e-05, "loss": 4.5738, "step": 40545 }, { "epoch": 0.8303810947515, "grad_norm": 0.2797011435031891, "learning_rate": 3.371025003805514e-05, "loss": 4.6116, "step": 40550 }, { "epoch": 0.8304834845289046, "grad_norm": 0.2987256944179535, "learning_rate": 3.367116419968527e-05, "loss": 4.5602, "step": 40555 }, { "epoch": 0.8305858743063093, "grad_norm": 0.27878180146217346, "learning_rate": 3.363209816882279e-05, "loss": 4.5695, "step": 40560 }, { "epoch": 0.8306882640837139, "grad_norm": 0.2940962612628937, "learning_rate": 3.359305195211949e-05, "loss": 4.6228, "step": 40565 }, { "epoch": 0.8307906538611185, "grad_norm": 0.280834436416626, "learning_rate": 3.355402555622378e-05, "loss": 4.5861, "step": 40570 }, { "epoch": 0.8308930436385231, "grad_norm": 0.2941542863845825, "learning_rate": 3.3515018987780884e-05, "loss": 4.5604, "step": 40575 }, { "epoch": 0.8309954334159277, "grad_norm": 0.2844966650009155, "learning_rate": 3.3476032253432356e-05, "loss": 4.5558, "step": 40580 }, { "epoch": 0.8310978231933324, "grad_norm": 0.3021025061607361, "learning_rate": 3.343706535981662e-05, "loss": 4.5832, "step": 40585 }, { "epoch": 0.831200212970737, "grad_norm": 0.27956724166870117, "learning_rate": 3.3398118313568647e-05, "loss": 4.5957, "step": 40590 }, { "epoch": 0.8313026027481416, "grad_norm": 0.30196988582611084, "learning_rate": 3.335919112132e-05, "loss": 4.5439, "step": 40595 }, { "epoch": 0.8314049925255462, "grad_norm": 0.28697851300239563, "learning_rate": 3.332028378969888e-05, "loss": 4.5951, "step": 40600 }, { "epoch": 0.8315073823029508, "grad_norm": 0.303102046251297, "learning_rate": 3.328139632533006e-05, "loss": 4.537, "step": 40605 }, { "epoch": 0.8316097720803555, "grad_norm": 0.2824726998806, "learning_rate": 3.324252873483505e-05, "loss": 4.5667, "step": 40610 }, { "epoch": 0.8317121618577601, "grad_norm": 0.2867865562438965, "learning_rate": 3.320368102483185e-05, "loss": 4.5588, "step": 40615 }, { "epoch": 0.8318145516351647, "grad_norm": 0.29350194334983826, "learning_rate": 3.3164853201935104e-05, "loss": 4.6302, "step": 40620 }, { "epoch": 0.8319169414125693, "grad_norm": 0.28072482347488403, "learning_rate": 3.3126045272756194e-05, "loss": 4.5966, "step": 40625 }, { "epoch": 0.832019331189974, "grad_norm": 0.31003057956695557, "learning_rate": 3.308725724390286e-05, "loss": 4.5576, "step": 40630 }, { "epoch": 0.8321217209673786, "grad_norm": 0.3015013337135315, "learning_rate": 3.3048489121979694e-05, "loss": 4.6012, "step": 40635 }, { "epoch": 0.8322241107447832, "grad_norm": 0.2813338339328766, "learning_rate": 3.300974091358788e-05, "loss": 4.6031, "step": 40640 }, { "epoch": 0.8323265005221878, "grad_norm": 0.2762916386127472, "learning_rate": 3.2971012625324987e-05, "loss": 4.5773, "step": 40645 }, { "epoch": 0.8324288902995924, "grad_norm": 0.28494009375572205, "learning_rate": 3.293230426378546e-05, "loss": 4.6118, "step": 40650 }, { "epoch": 0.8325312800769971, "grad_norm": 0.27528828382492065, "learning_rate": 3.2893615835560205e-05, "loss": 4.5894, "step": 40655 }, { "epoch": 0.8326336698544017, "grad_norm": 0.2866365909576416, "learning_rate": 3.285494734723671e-05, "loss": 4.5539, "step": 40660 }, { "epoch": 0.8327360596318064, "grad_norm": 0.2842416763305664, "learning_rate": 3.281629880539921e-05, "loss": 4.593, "step": 40665 }, { "epoch": 0.832838449409211, "grad_norm": 0.2805674076080322, "learning_rate": 3.277767021662844e-05, "loss": 4.5866, "step": 40670 }, { "epoch": 0.8329408391866157, "grad_norm": 0.27999147772789, "learning_rate": 3.273906158750168e-05, "loss": 4.6231, "step": 40675 }, { "epoch": 0.8330432289640203, "grad_norm": 0.2912527918815613, "learning_rate": 3.270047292459298e-05, "loss": 4.5316, "step": 40680 }, { "epoch": 0.8331456187414249, "grad_norm": 0.2727186977863312, "learning_rate": 3.2661904234472866e-05, "loss": 4.5816, "step": 40685 }, { "epoch": 0.8332480085188295, "grad_norm": 0.27656301856040955, "learning_rate": 3.262335552370849e-05, "loss": 4.5678, "step": 40690 }, { "epoch": 0.8333503982962341, "grad_norm": 0.27214816212654114, "learning_rate": 3.258482679886356e-05, "loss": 4.596, "step": 40695 }, { "epoch": 0.8334527880736388, "grad_norm": 0.30073586106300354, "learning_rate": 3.2546318066498523e-05, "loss": 4.5546, "step": 40700 }, { "epoch": 0.8335551778510434, "grad_norm": 0.282539963722229, "learning_rate": 3.250782933317025e-05, "loss": 4.6335, "step": 40705 }, { "epoch": 0.833657567628448, "grad_norm": 0.28320568799972534, "learning_rate": 3.246936060543228e-05, "loss": 4.6191, "step": 40710 }, { "epoch": 0.8337599574058526, "grad_norm": 0.2768017649650574, "learning_rate": 3.243091188983485e-05, "loss": 4.5991, "step": 40715 }, { "epoch": 0.8338623471832572, "grad_norm": 0.27154484391212463, "learning_rate": 3.2392483192924546e-05, "loss": 4.6122, "step": 40720 }, { "epoch": 0.8339647369606619, "grad_norm": 0.2970268130302429, "learning_rate": 3.2354074521244734e-05, "loss": 4.5865, "step": 40725 }, { "epoch": 0.8340671267380665, "grad_norm": 0.27358826994895935, "learning_rate": 3.2315685881335395e-05, "loss": 4.5937, "step": 40730 }, { "epoch": 0.8341695165154711, "grad_norm": 0.31513655185699463, "learning_rate": 3.227731727973297e-05, "loss": 4.5818, "step": 40735 }, { "epoch": 0.8342719062928757, "grad_norm": 0.2948060929775238, "learning_rate": 3.223896872297055e-05, "loss": 4.6303, "step": 40740 }, { "epoch": 0.8343742960702804, "grad_norm": 0.29236993193626404, "learning_rate": 3.220064021757777e-05, "loss": 4.5999, "step": 40745 }, { "epoch": 0.834476685847685, "grad_norm": 0.2867458760738373, "learning_rate": 3.216233177008097e-05, "loss": 4.5985, "step": 40750 }, { "epoch": 0.8345790756250896, "grad_norm": 0.2854442000389099, "learning_rate": 3.2124043387002954e-05, "loss": 4.6083, "step": 40755 }, { "epoch": 0.8346814654024942, "grad_norm": 0.2821553647518158, "learning_rate": 3.2085775074863105e-05, "loss": 4.5944, "step": 40760 }, { "epoch": 0.8347838551798988, "grad_norm": 0.2813735604286194, "learning_rate": 3.204752684017756e-05, "loss": 4.5255, "step": 40765 }, { "epoch": 0.8348862449573035, "grad_norm": 0.2835007905960083, "learning_rate": 3.2009298689458744e-05, "loss": 4.558, "step": 40770 }, { "epoch": 0.8349886347347081, "grad_norm": 0.28022801876068115, "learning_rate": 3.197109062921591e-05, "loss": 4.5757, "step": 40775 }, { "epoch": 0.8350910245121127, "grad_norm": 0.2893160283565521, "learning_rate": 3.1932902665954894e-05, "loss": 4.5803, "step": 40780 }, { "epoch": 0.8351934142895173, "grad_norm": 0.2787182033061981, "learning_rate": 3.189473480617786e-05, "loss": 4.6025, "step": 40785 }, { "epoch": 0.8352958040669219, "grad_norm": 0.27996236085891724, "learning_rate": 3.1856587056383846e-05, "loss": 4.5655, "step": 40790 }, { "epoch": 0.8353981938443266, "grad_norm": 0.27595436573028564, "learning_rate": 3.1818459423068266e-05, "loss": 4.5905, "step": 40795 }, { "epoch": 0.8355005836217312, "grad_norm": 0.2880684733390808, "learning_rate": 3.1780351912723155e-05, "loss": 4.6407, "step": 40800 }, { "epoch": 0.8356029733991358, "grad_norm": 0.2865196764469147, "learning_rate": 3.174226453183725e-05, "loss": 4.5965, "step": 40805 }, { "epoch": 0.8357053631765404, "grad_norm": 0.27217257022857666, "learning_rate": 3.170419728689565e-05, "loss": 4.5921, "step": 40810 }, { "epoch": 0.835807752953945, "grad_norm": 0.26777732372283936, "learning_rate": 3.166615018438018e-05, "loss": 4.5762, "step": 40815 }, { "epoch": 0.8359101427313497, "grad_norm": 0.2771703600883484, "learning_rate": 3.162812323076914e-05, "loss": 4.5757, "step": 40820 }, { "epoch": 0.8360125325087543, "grad_norm": 0.2957483232021332, "learning_rate": 3.1590116432537495e-05, "loss": 4.5597, "step": 40825 }, { "epoch": 0.8361149222861589, "grad_norm": 0.2982672154903412, "learning_rate": 3.1552129796156725e-05, "loss": 4.6019, "step": 40830 }, { "epoch": 0.8362173120635635, "grad_norm": 0.28175172209739685, "learning_rate": 3.15141633280948e-05, "loss": 4.5474, "step": 40835 }, { "epoch": 0.8363197018409682, "grad_norm": 0.2830298840999603, "learning_rate": 3.147621703481644e-05, "loss": 4.6469, "step": 40840 }, { "epoch": 0.8364220916183728, "grad_norm": 0.40411484241485596, "learning_rate": 3.1438290922782764e-05, "loss": 4.6078, "step": 40845 }, { "epoch": 0.8365244813957774, "grad_norm": 0.2745141088962555, "learning_rate": 3.1400384998451506e-05, "loss": 4.5984, "step": 40850 }, { "epoch": 0.836626871173182, "grad_norm": 0.28638893365859985, "learning_rate": 3.136249926827707e-05, "loss": 4.5967, "step": 40855 }, { "epoch": 0.8367292609505866, "grad_norm": 0.30151182413101196, "learning_rate": 3.1324633738710164e-05, "loss": 4.5752, "step": 40860 }, { "epoch": 0.8368316507279914, "grad_norm": 0.276595801115036, "learning_rate": 3.12867884161983e-05, "loss": 4.552, "step": 40865 }, { "epoch": 0.836934040505396, "grad_norm": 0.2695423364639282, "learning_rate": 3.124896330718551e-05, "loss": 4.5706, "step": 40870 }, { "epoch": 0.8370364302828006, "grad_norm": 0.27079302072525024, "learning_rate": 3.121115841811229e-05, "loss": 4.5541, "step": 40875 }, { "epoch": 0.8371388200602052, "grad_norm": 0.2798822522163391, "learning_rate": 3.117337375541575e-05, "loss": 4.5892, "step": 40880 }, { "epoch": 0.8372412098376099, "grad_norm": 0.28381332755088806, "learning_rate": 3.11356093255295e-05, "loss": 4.5229, "step": 40885 }, { "epoch": 0.8373435996150145, "grad_norm": 0.31425201892852783, "learning_rate": 3.1097865134883834e-05, "loss": 4.5647, "step": 40890 }, { "epoch": 0.8374459893924191, "grad_norm": 0.30924391746520996, "learning_rate": 3.1060141189905474e-05, "loss": 4.5454, "step": 40895 }, { "epoch": 0.8375483791698237, "grad_norm": 0.27807021141052246, "learning_rate": 3.102243749701772e-05, "loss": 4.601, "step": 40900 }, { "epoch": 0.8376507689472283, "grad_norm": 0.3153919279575348, "learning_rate": 3.0984754062640556e-05, "loss": 4.52, "step": 40905 }, { "epoch": 0.837753158724633, "grad_norm": 0.2784497141838074, "learning_rate": 3.094709089319024e-05, "loss": 4.5393, "step": 40910 }, { "epoch": 0.8378555485020376, "grad_norm": 0.28070005774497986, "learning_rate": 3.090944799507982e-05, "loss": 4.5848, "step": 40915 }, { "epoch": 0.8379579382794422, "grad_norm": 0.34741532802581787, "learning_rate": 3.087182537471889e-05, "loss": 4.6055, "step": 40920 }, { "epoch": 0.8380603280568468, "grad_norm": 0.2964925467967987, "learning_rate": 3.083422303851337e-05, "loss": 4.5471, "step": 40925 }, { "epoch": 0.8381627178342514, "grad_norm": 0.2837379276752472, "learning_rate": 3.0796640992866e-05, "loss": 4.6261, "step": 40930 }, { "epoch": 0.8382651076116561, "grad_norm": 0.2933388650417328, "learning_rate": 3.075907924417587e-05, "loss": 4.5777, "step": 40935 }, { "epoch": 0.8383674973890607, "grad_norm": 0.27644672989845276, "learning_rate": 3.072153779883868e-05, "loss": 4.5548, "step": 40940 }, { "epoch": 0.8384698871664653, "grad_norm": 0.279789000749588, "learning_rate": 3.068401666324672e-05, "loss": 4.5693, "step": 40945 }, { "epoch": 0.8385722769438699, "grad_norm": 0.28126752376556396, "learning_rate": 3.0646515843788763e-05, "loss": 4.5984, "step": 40950 }, { "epoch": 0.8386746667212746, "grad_norm": 0.28015634417533875, "learning_rate": 3.060903534685013e-05, "loss": 4.5702, "step": 40955 }, { "epoch": 0.8387770564986792, "grad_norm": 0.2922801077365875, "learning_rate": 3.057157517881263e-05, "loss": 4.6126, "step": 40960 }, { "epoch": 0.8388794462760838, "grad_norm": 0.27999451756477356, "learning_rate": 3.0534135346054774e-05, "loss": 4.5665, "step": 40965 }, { "epoch": 0.8389818360534884, "grad_norm": 0.2854362428188324, "learning_rate": 3.049671585495146e-05, "loss": 4.5704, "step": 40970 }, { "epoch": 0.839084225830893, "grad_norm": 0.27575409412384033, "learning_rate": 3.045931671187411e-05, "loss": 4.5876, "step": 40975 }, { "epoch": 0.8391866156082977, "grad_norm": 0.31625238060951233, "learning_rate": 3.042193792319083e-05, "loss": 4.6041, "step": 40980 }, { "epoch": 0.8392890053857023, "grad_norm": 0.27901962399482727, "learning_rate": 3.0384579495266147e-05, "loss": 4.5466, "step": 40985 }, { "epoch": 0.8393913951631069, "grad_norm": 0.29163533449172974, "learning_rate": 3.034724143446109e-05, "loss": 4.5922, "step": 40990 }, { "epoch": 0.8394937849405115, "grad_norm": 0.29959243535995483, "learning_rate": 3.0309923747133386e-05, "loss": 4.5846, "step": 40995 }, { "epoch": 0.8395961747179161, "grad_norm": 0.2757844626903534, "learning_rate": 3.027262643963702e-05, "loss": 4.552, "step": 41000 }, { "epoch": 0.8395961747179161, "eval_loss": 4.593181133270264, "eval_runtime": 17.1222, "eval_samples_per_second": 33.991, "eval_steps_per_second": 4.263, "step": 41000 }, { "epoch": 0.8396985644953208, "grad_norm": 0.28662437200546265, "learning_rate": 3.02353495183228e-05, "loss": 4.6171, "step": 41005 }, { "epoch": 0.8398009542727254, "grad_norm": 0.2877452075481415, "learning_rate": 3.0198092989537858e-05, "loss": 4.5878, "step": 41010 }, { "epoch": 0.83990334405013, "grad_norm": 0.28487837314605713, "learning_rate": 3.0160856859625977e-05, "loss": 4.5274, "step": 41015 }, { "epoch": 0.8400057338275346, "grad_norm": 0.2777809798717499, "learning_rate": 3.0123641134927395e-05, "loss": 4.5628, "step": 41020 }, { "epoch": 0.8401081236049392, "grad_norm": 0.2962719798088074, "learning_rate": 3.0086445821778863e-05, "loss": 4.5668, "step": 41025 }, { "epoch": 0.8402105133823439, "grad_norm": 0.31961870193481445, "learning_rate": 3.0049270926513754e-05, "loss": 4.5609, "step": 41030 }, { "epoch": 0.8403129031597485, "grad_norm": 0.27768102288246155, "learning_rate": 3.001211645546186e-05, "loss": 4.6029, "step": 41035 }, { "epoch": 0.8404152929371531, "grad_norm": 0.2697317600250244, "learning_rate": 2.9999999999999997e-05, "loss": 4.5762, "step": 41040 }, { "epoch": 0.8405176827145577, "grad_norm": 0.28922295570373535, "learning_rate": 2.9999999999999997e-05, "loss": 4.5458, "step": 41045 }, { "epoch": 0.8406200724919624, "grad_norm": 0.2773067355155945, "learning_rate": 2.9999999999999997e-05, "loss": 4.6026, "step": 41050 }, { "epoch": 0.840722462269367, "grad_norm": 0.27529552578926086, "learning_rate": 2.9999999999999997e-05, "loss": 4.6204, "step": 41055 }, { "epoch": 0.8408248520467716, "grad_norm": 0.2800615131855011, "learning_rate": 2.9999999999999997e-05, "loss": 4.5814, "step": 41060 }, { "epoch": 0.8409272418241762, "grad_norm": 0.2822313606739044, "learning_rate": 2.9999999999999997e-05, "loss": 4.5486, "step": 41065 }, { "epoch": 0.841029631601581, "grad_norm": 0.27693304419517517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5574, "step": 41070 }, { "epoch": 0.8411320213789856, "grad_norm": 0.28031983971595764, "learning_rate": 2.9999999999999997e-05, "loss": 4.5619, "step": 41075 }, { "epoch": 0.8412344111563902, "grad_norm": 0.29966622591018677, "learning_rate": 2.9999999999999997e-05, "loss": 4.5669, "step": 41080 }, { "epoch": 0.8413368009337948, "grad_norm": 0.2828528583049774, "learning_rate": 2.9999999999999997e-05, "loss": 4.6466, "step": 41085 }, { "epoch": 0.8414391907111994, "grad_norm": 0.28675830364227295, "learning_rate": 2.9999999999999997e-05, "loss": 4.5915, "step": 41090 }, { "epoch": 0.8415415804886041, "grad_norm": 0.2805325984954834, "learning_rate": 2.9999999999999997e-05, "loss": 4.5625, "step": 41095 }, { "epoch": 0.8416439702660087, "grad_norm": 0.2904501259326935, "learning_rate": 2.9999999999999997e-05, "loss": 4.5674, "step": 41100 }, { "epoch": 0.8417463600434133, "grad_norm": 0.28026166558265686, "learning_rate": 2.9999999999999997e-05, "loss": 4.601, "step": 41105 }, { "epoch": 0.8418487498208179, "grad_norm": 0.2790822684764862, "learning_rate": 2.9999999999999997e-05, "loss": 4.6097, "step": 41110 }, { "epoch": 0.8419511395982225, "grad_norm": 0.27744176983833313, "learning_rate": 2.9999999999999997e-05, "loss": 4.5876, "step": 41115 }, { "epoch": 0.8420535293756272, "grad_norm": 0.3159765899181366, "learning_rate": 2.9999999999999997e-05, "loss": 4.5175, "step": 41120 }, { "epoch": 0.8421559191530318, "grad_norm": 0.27997395396232605, "learning_rate": 2.9999999999999997e-05, "loss": 4.5976, "step": 41125 }, { "epoch": 0.8422583089304364, "grad_norm": 0.28126683831214905, "learning_rate": 2.9999999999999997e-05, "loss": 4.6137, "step": 41130 }, { "epoch": 0.842360698707841, "grad_norm": 0.28357672691345215, "learning_rate": 2.9999999999999997e-05, "loss": 4.586, "step": 41135 }, { "epoch": 0.8424630884852456, "grad_norm": 0.2679639160633087, "learning_rate": 2.9999999999999997e-05, "loss": 4.5666, "step": 41140 }, { "epoch": 0.8425654782626503, "grad_norm": 0.2936253249645233, "learning_rate": 2.9999999999999997e-05, "loss": 4.5809, "step": 41145 }, { "epoch": 0.8426678680400549, "grad_norm": 0.297695130109787, "learning_rate": 2.9999999999999997e-05, "loss": 4.5803, "step": 41150 }, { "epoch": 0.8427702578174595, "grad_norm": 0.2765631675720215, "learning_rate": 2.9999999999999997e-05, "loss": 4.5503, "step": 41155 }, { "epoch": 0.8428726475948641, "grad_norm": 0.318791925907135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5819, "step": 41160 }, { "epoch": 0.8429750373722688, "grad_norm": 0.29362010955810547, "learning_rate": 2.9999999999999997e-05, "loss": 4.5931, "step": 41165 }, { "epoch": 0.8430774271496734, "grad_norm": 0.2787388563156128, "learning_rate": 2.9999999999999997e-05, "loss": 4.5473, "step": 41170 }, { "epoch": 0.843179816927078, "grad_norm": 0.3351594805717468, "learning_rate": 2.9999999999999997e-05, "loss": 4.577, "step": 41175 }, { "epoch": 0.8432822067044826, "grad_norm": 0.3098401725292206, "learning_rate": 2.9999999999999997e-05, "loss": 4.5853, "step": 41180 }, { "epoch": 0.8433845964818872, "grad_norm": 0.302744060754776, "learning_rate": 2.9999999999999997e-05, "loss": 4.5882, "step": 41185 }, { "epoch": 0.8434869862592919, "grad_norm": 0.27730438113212585, "learning_rate": 2.9999999999999997e-05, "loss": 4.6287, "step": 41190 }, { "epoch": 0.8435893760366965, "grad_norm": 0.2852722406387329, "learning_rate": 2.9999999999999997e-05, "loss": 4.5574, "step": 41195 }, { "epoch": 0.8436917658141011, "grad_norm": 0.2758505344390869, "learning_rate": 2.9999999999999997e-05, "loss": 4.568, "step": 41200 }, { "epoch": 0.8437941555915057, "grad_norm": 0.28374865651130676, "learning_rate": 2.9999999999999997e-05, "loss": 4.5608, "step": 41205 }, { "epoch": 0.8438965453689103, "grad_norm": 0.2999465763568878, "learning_rate": 2.9999999999999997e-05, "loss": 4.5649, "step": 41210 }, { "epoch": 0.843998935146315, "grad_norm": 0.28205832839012146, "learning_rate": 2.9999999999999997e-05, "loss": 4.5096, "step": 41215 }, { "epoch": 0.8441013249237196, "grad_norm": 0.27388060092926025, "learning_rate": 2.9999999999999997e-05, "loss": 4.5995, "step": 41220 }, { "epoch": 0.8442037147011242, "grad_norm": 0.2652662694454193, "learning_rate": 2.9999999999999997e-05, "loss": 4.453, "step": 41225 }, { "epoch": 0.8443061044785288, "grad_norm": 0.30840641260147095, "learning_rate": 2.9999999999999997e-05, "loss": 4.526, "step": 41230 }, { "epoch": 0.8444084942559335, "grad_norm": 0.2902662754058838, "learning_rate": 2.9999999999999997e-05, "loss": 4.5417, "step": 41235 }, { "epoch": 0.8445108840333381, "grad_norm": 0.27841314673423767, "learning_rate": 2.9999999999999997e-05, "loss": 4.5865, "step": 41240 }, { "epoch": 0.8446132738107427, "grad_norm": 0.2912626266479492, "learning_rate": 2.9999999999999997e-05, "loss": 4.6389, "step": 41245 }, { "epoch": 0.8447156635881473, "grad_norm": 0.2787330150604248, "learning_rate": 2.9999999999999997e-05, "loss": 4.5385, "step": 41250 }, { "epoch": 0.8448180533655519, "grad_norm": 0.2781839072704315, "learning_rate": 2.9999999999999997e-05, "loss": 4.6031, "step": 41255 }, { "epoch": 0.8449204431429566, "grad_norm": 0.3461420238018036, "learning_rate": 2.9999999999999997e-05, "loss": 4.5803, "step": 41260 }, { "epoch": 0.8450228329203612, "grad_norm": 0.282930850982666, "learning_rate": 2.9999999999999997e-05, "loss": 4.5916, "step": 41265 }, { "epoch": 0.8451252226977659, "grad_norm": 0.2960691452026367, "learning_rate": 2.9999999999999997e-05, "loss": 4.5519, "step": 41270 }, { "epoch": 0.8452276124751705, "grad_norm": 0.28614863753318787, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 41275 }, { "epoch": 0.8453300022525752, "grad_norm": 0.2875410318374634, "learning_rate": 2.9999999999999997e-05, "loss": 4.5659, "step": 41280 }, { "epoch": 0.8454323920299798, "grad_norm": 0.6949137449264526, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 41285 }, { "epoch": 0.8455347818073844, "grad_norm": 0.2943025529384613, "learning_rate": 2.9999999999999997e-05, "loss": 4.6048, "step": 41290 }, { "epoch": 0.845637171584789, "grad_norm": 0.2923896312713623, "learning_rate": 2.9999999999999997e-05, "loss": 4.585, "step": 41295 }, { "epoch": 0.8457395613621936, "grad_norm": 0.30523452162742615, "learning_rate": 2.9999999999999997e-05, "loss": 4.5578, "step": 41300 }, { "epoch": 0.8458419511395983, "grad_norm": 0.2870691418647766, "learning_rate": 2.9999999999999997e-05, "loss": 4.6085, "step": 41305 }, { "epoch": 0.8459443409170029, "grad_norm": 0.2939387559890747, "learning_rate": 2.9999999999999997e-05, "loss": 4.5408, "step": 41310 }, { "epoch": 0.8460467306944075, "grad_norm": 0.26975417137145996, "learning_rate": 2.9999999999999997e-05, "loss": 4.5448, "step": 41315 }, { "epoch": 0.8461491204718121, "grad_norm": 0.2728458344936371, "learning_rate": 2.9999999999999997e-05, "loss": 4.6098, "step": 41320 }, { "epoch": 0.8462515102492167, "grad_norm": 0.27852603793144226, "learning_rate": 2.9999999999999997e-05, "loss": 4.5419, "step": 41325 }, { "epoch": 0.8463539000266214, "grad_norm": 0.2822387218475342, "learning_rate": 2.9999999999999997e-05, "loss": 4.5798, "step": 41330 }, { "epoch": 0.846456289804026, "grad_norm": 0.28445008397102356, "learning_rate": 2.9999999999999997e-05, "loss": 4.5386, "step": 41335 }, { "epoch": 0.8465586795814306, "grad_norm": 0.2744186520576477, "learning_rate": 2.9999999999999997e-05, "loss": 4.5464, "step": 41340 }, { "epoch": 0.8466610693588352, "grad_norm": 0.2751639783382416, "learning_rate": 2.9999999999999997e-05, "loss": 4.5761, "step": 41345 }, { "epoch": 0.8467634591362398, "grad_norm": 0.31351083517074585, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 41350 }, { "epoch": 0.8468658489136445, "grad_norm": 0.286384642124176, "learning_rate": 2.9999999999999997e-05, "loss": 4.5635, "step": 41355 }, { "epoch": 0.8469682386910491, "grad_norm": 0.28469738364219666, "learning_rate": 2.9999999999999997e-05, "loss": 4.6158, "step": 41360 }, { "epoch": 0.8470706284684537, "grad_norm": 0.2857683300971985, "learning_rate": 2.9999999999999997e-05, "loss": 4.5918, "step": 41365 }, { "epoch": 0.8471730182458583, "grad_norm": 0.3154352307319641, "learning_rate": 2.9999999999999997e-05, "loss": 4.5592, "step": 41370 }, { "epoch": 0.847275408023263, "grad_norm": 0.2844149172306061, "learning_rate": 2.9999999999999997e-05, "loss": 4.5721, "step": 41375 }, { "epoch": 0.8473777978006676, "grad_norm": 0.2833268642425537, "learning_rate": 2.9999999999999997e-05, "loss": 4.5631, "step": 41380 }, { "epoch": 0.8474801875780722, "grad_norm": 0.3229454457759857, "learning_rate": 2.9999999999999997e-05, "loss": 4.6033, "step": 41385 }, { "epoch": 0.8475825773554768, "grad_norm": 0.28634709119796753, "learning_rate": 2.9999999999999997e-05, "loss": 4.6129, "step": 41390 }, { "epoch": 0.8476849671328814, "grad_norm": 0.3008575141429901, "learning_rate": 2.9999999999999997e-05, "loss": 4.5326, "step": 41395 }, { "epoch": 0.8477873569102861, "grad_norm": 0.332033634185791, "learning_rate": 2.9999999999999997e-05, "loss": 4.5979, "step": 41400 }, { "epoch": 0.8478897466876907, "grad_norm": 0.2948870360851288, "learning_rate": 2.9999999999999997e-05, "loss": 4.5302, "step": 41405 }, { "epoch": 0.8479921364650953, "grad_norm": 0.2760053873062134, "learning_rate": 2.9999999999999997e-05, "loss": 4.6197, "step": 41410 }, { "epoch": 0.8480945262424999, "grad_norm": 0.2818705141544342, "learning_rate": 2.9999999999999997e-05, "loss": 4.5919, "step": 41415 }, { "epoch": 0.8481969160199045, "grad_norm": 0.30404141545295715, "learning_rate": 2.9999999999999997e-05, "loss": 4.5394, "step": 41420 }, { "epoch": 0.8482993057973092, "grad_norm": 0.2797127962112427, "learning_rate": 2.9999999999999997e-05, "loss": 4.4302, "step": 41425 }, { "epoch": 0.8484016955747138, "grad_norm": 0.29585498571395874, "learning_rate": 2.9999999999999997e-05, "loss": 4.5359, "step": 41430 }, { "epoch": 0.8485040853521184, "grad_norm": 0.2758622467517853, "learning_rate": 2.9999999999999997e-05, "loss": 4.5738, "step": 41435 }, { "epoch": 0.848606475129523, "grad_norm": 0.27927926182746887, "learning_rate": 2.9999999999999997e-05, "loss": 4.5562, "step": 41440 }, { "epoch": 0.8487088649069277, "grad_norm": 0.28380414843559265, "learning_rate": 2.9999999999999997e-05, "loss": 4.5316, "step": 41445 }, { "epoch": 0.8488112546843323, "grad_norm": 0.2843684256076813, "learning_rate": 2.9999999999999997e-05, "loss": 4.6323, "step": 41450 }, { "epoch": 0.8489136444617369, "grad_norm": 0.27560290694236755, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 41455 }, { "epoch": 0.8490160342391415, "grad_norm": 0.2876559793949127, "learning_rate": 2.9999999999999997e-05, "loss": 4.5247, "step": 41460 }, { "epoch": 0.8491184240165461, "grad_norm": 0.31301239132881165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5129, "step": 41465 }, { "epoch": 0.8492208137939508, "grad_norm": 0.2861793637275696, "learning_rate": 2.9999999999999997e-05, "loss": 4.587, "step": 41470 }, { "epoch": 0.8493232035713555, "grad_norm": 0.28453120589256287, "learning_rate": 2.9999999999999997e-05, "loss": 4.5889, "step": 41475 }, { "epoch": 0.8494255933487601, "grad_norm": 0.3102613091468811, "learning_rate": 2.9999999999999997e-05, "loss": 4.6361, "step": 41480 }, { "epoch": 0.8495279831261647, "grad_norm": 0.28760915994644165, "learning_rate": 2.9999999999999997e-05, "loss": 4.6013, "step": 41485 }, { "epoch": 0.8496303729035694, "grad_norm": 0.28489428758621216, "learning_rate": 2.9999999999999997e-05, "loss": 4.5335, "step": 41490 }, { "epoch": 0.849732762680974, "grad_norm": 0.28293073177337646, "learning_rate": 2.9999999999999997e-05, "loss": 4.5785, "step": 41495 }, { "epoch": 0.8498351524583786, "grad_norm": 0.277610719203949, "learning_rate": 2.9999999999999997e-05, "loss": 4.5531, "step": 41500 }, { "epoch": 0.8499375422357832, "grad_norm": 0.2881927788257599, "learning_rate": 2.9999999999999997e-05, "loss": 4.6054, "step": 41505 }, { "epoch": 0.8500399320131878, "grad_norm": 0.28182360529899597, "learning_rate": 2.9999999999999997e-05, "loss": 4.597, "step": 41510 }, { "epoch": 0.8501423217905925, "grad_norm": 0.29651278257369995, "learning_rate": 2.9999999999999997e-05, "loss": 4.5848, "step": 41515 }, { "epoch": 0.8502447115679971, "grad_norm": 0.2813228368759155, "learning_rate": 2.9999999999999997e-05, "loss": 4.5891, "step": 41520 }, { "epoch": 0.8503471013454017, "grad_norm": 0.27777281403541565, "learning_rate": 2.9999999999999997e-05, "loss": 4.6135, "step": 41525 }, { "epoch": 0.8504494911228063, "grad_norm": 0.26647210121154785, "learning_rate": 2.9999999999999997e-05, "loss": 4.6196, "step": 41530 }, { "epoch": 0.8505518809002109, "grad_norm": 0.2821933627128601, "learning_rate": 2.9999999999999997e-05, "loss": 4.5883, "step": 41535 }, { "epoch": 0.8506542706776156, "grad_norm": 0.2846284806728363, "learning_rate": 2.9999999999999997e-05, "loss": 4.5561, "step": 41540 }, { "epoch": 0.8507566604550202, "grad_norm": 0.2869037985801697, "learning_rate": 2.9999999999999997e-05, "loss": 4.6337, "step": 41545 }, { "epoch": 0.8508590502324248, "grad_norm": 0.2855621874332428, "learning_rate": 2.9999999999999997e-05, "loss": 4.5948, "step": 41550 }, { "epoch": 0.8509614400098294, "grad_norm": 0.2860376834869385, "learning_rate": 2.9999999999999997e-05, "loss": 4.5101, "step": 41555 }, { "epoch": 0.851063829787234, "grad_norm": 0.3692401051521301, "learning_rate": 2.9999999999999997e-05, "loss": 4.5895, "step": 41560 }, { "epoch": 0.8511662195646387, "grad_norm": 0.29354408383369446, "learning_rate": 2.9999999999999997e-05, "loss": 4.5458, "step": 41565 }, { "epoch": 0.8512686093420433, "grad_norm": 0.2744632065296173, "learning_rate": 2.9999999999999997e-05, "loss": 4.6093, "step": 41570 }, { "epoch": 0.8513709991194479, "grad_norm": 0.27942997217178345, "learning_rate": 2.9999999999999997e-05, "loss": 4.5687, "step": 41575 }, { "epoch": 0.8514733888968525, "grad_norm": 0.3143038749694824, "learning_rate": 2.9999999999999997e-05, "loss": 4.6032, "step": 41580 }, { "epoch": 0.8515757786742572, "grad_norm": 0.2794727087020874, "learning_rate": 2.9999999999999997e-05, "loss": 4.5768, "step": 41585 }, { "epoch": 0.8516781684516618, "grad_norm": 0.2896329164505005, "learning_rate": 2.9999999999999997e-05, "loss": 4.5505, "step": 41590 }, { "epoch": 0.8517805582290664, "grad_norm": 0.29437339305877686, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 41595 }, { "epoch": 0.851882948006471, "grad_norm": 0.30380910634994507, "learning_rate": 2.9999999999999997e-05, "loss": 4.4836, "step": 41600 }, { "epoch": 0.8519853377838756, "grad_norm": 0.28047603368759155, "learning_rate": 2.9999999999999997e-05, "loss": 4.6065, "step": 41605 }, { "epoch": 0.8520877275612803, "grad_norm": 0.29028409719467163, "learning_rate": 2.9999999999999997e-05, "loss": 4.5723, "step": 41610 }, { "epoch": 0.8521901173386849, "grad_norm": 0.27676650881767273, "learning_rate": 2.9999999999999997e-05, "loss": 4.6149, "step": 41615 }, { "epoch": 0.8522925071160895, "grad_norm": 0.28937408328056335, "learning_rate": 2.9999999999999997e-05, "loss": 4.5606, "step": 41620 }, { "epoch": 0.8523948968934941, "grad_norm": 0.2824362516403198, "learning_rate": 2.9999999999999997e-05, "loss": 4.5846, "step": 41625 }, { "epoch": 0.8524972866708987, "grad_norm": 0.2866837978363037, "learning_rate": 2.9999999999999997e-05, "loss": 4.6187, "step": 41630 }, { "epoch": 0.8525996764483034, "grad_norm": 0.28847113251686096, "learning_rate": 2.9999999999999997e-05, "loss": 4.5751, "step": 41635 }, { "epoch": 0.852702066225708, "grad_norm": 0.28669634461402893, "learning_rate": 2.9999999999999997e-05, "loss": 4.5576, "step": 41640 }, { "epoch": 0.8528044560031126, "grad_norm": 0.2918095290660858, "learning_rate": 2.9999999999999997e-05, "loss": 4.5455, "step": 41645 }, { "epoch": 0.8529068457805172, "grad_norm": 0.27304890751838684, "learning_rate": 2.9999999999999997e-05, "loss": 4.6047, "step": 41650 }, { "epoch": 0.8530092355579219, "grad_norm": 0.2830185294151306, "learning_rate": 2.9999999999999997e-05, "loss": 4.5711, "step": 41655 }, { "epoch": 0.8531116253353265, "grad_norm": 0.2840002775192261, "learning_rate": 2.9999999999999997e-05, "loss": 4.548, "step": 41660 }, { "epoch": 0.8532140151127311, "grad_norm": 0.29644858837127686, "learning_rate": 2.9999999999999997e-05, "loss": 4.6533, "step": 41665 }, { "epoch": 0.8533164048901357, "grad_norm": 0.27836039662361145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5634, "step": 41670 }, { "epoch": 0.8534187946675404, "grad_norm": 0.28981637954711914, "learning_rate": 2.9999999999999997e-05, "loss": 4.5704, "step": 41675 }, { "epoch": 0.8535211844449451, "grad_norm": 0.2824649512767792, "learning_rate": 2.9999999999999997e-05, "loss": 4.5704, "step": 41680 }, { "epoch": 0.8536235742223497, "grad_norm": 0.27254143357276917, "learning_rate": 2.9999999999999997e-05, "loss": 4.528, "step": 41685 }, { "epoch": 0.8537259639997543, "grad_norm": 0.32417652010917664, "learning_rate": 2.9999999999999997e-05, "loss": 4.596, "step": 41690 }, { "epoch": 0.8538283537771589, "grad_norm": 0.27936333417892456, "learning_rate": 2.9999999999999997e-05, "loss": 4.5343, "step": 41695 }, { "epoch": 0.8539307435545636, "grad_norm": 0.302987664937973, "learning_rate": 2.9999999999999997e-05, "loss": 4.5512, "step": 41700 }, { "epoch": 0.8540331333319682, "grad_norm": 0.28835564851760864, "learning_rate": 2.9999999999999997e-05, "loss": 4.6041, "step": 41705 }, { "epoch": 0.8541355231093728, "grad_norm": 0.29047226905822754, "learning_rate": 2.9999999999999997e-05, "loss": 4.5264, "step": 41710 }, { "epoch": 0.8542379128867774, "grad_norm": 0.2981569170951843, "learning_rate": 2.9999999999999997e-05, "loss": 4.5758, "step": 41715 }, { "epoch": 0.854340302664182, "grad_norm": 0.330504834651947, "learning_rate": 2.9999999999999997e-05, "loss": 4.5811, "step": 41720 }, { "epoch": 0.8544426924415867, "grad_norm": 0.28930309414863586, "learning_rate": 2.9999999999999997e-05, "loss": 4.5309, "step": 41725 }, { "epoch": 0.8545450822189913, "grad_norm": 0.2907178997993469, "learning_rate": 2.9999999999999997e-05, "loss": 4.5726, "step": 41730 }, { "epoch": 0.8546474719963959, "grad_norm": 0.3091176152229309, "learning_rate": 2.9999999999999997e-05, "loss": 4.5667, "step": 41735 }, { "epoch": 0.8547498617738005, "grad_norm": 0.8943814635276794, "learning_rate": 2.9999999999999997e-05, "loss": 4.5795, "step": 41740 }, { "epoch": 0.8548522515512051, "grad_norm": 0.28994032740592957, "learning_rate": 2.9999999999999997e-05, "loss": 4.5899, "step": 41745 }, { "epoch": 0.8549546413286098, "grad_norm": 0.28370314836502075, "learning_rate": 2.9999999999999997e-05, "loss": 4.5931, "step": 41750 }, { "epoch": 0.8550570311060144, "grad_norm": 0.31428468227386475, "learning_rate": 2.9999999999999997e-05, "loss": 4.6194, "step": 41755 }, { "epoch": 0.855159420883419, "grad_norm": 0.28025707602500916, "learning_rate": 2.9999999999999997e-05, "loss": 4.6054, "step": 41760 }, { "epoch": 0.8552618106608236, "grad_norm": 0.2866079807281494, "learning_rate": 2.9999999999999997e-05, "loss": 4.6121, "step": 41765 }, { "epoch": 0.8553642004382283, "grad_norm": 0.29860225319862366, "learning_rate": 2.9999999999999997e-05, "loss": 4.5499, "step": 41770 }, { "epoch": 0.8554665902156329, "grad_norm": 0.2892419695854187, "learning_rate": 2.9999999999999997e-05, "loss": 4.5718, "step": 41775 }, { "epoch": 0.8555689799930375, "grad_norm": 0.28532499074935913, "learning_rate": 2.9999999999999997e-05, "loss": 4.5262, "step": 41780 }, { "epoch": 0.8556713697704421, "grad_norm": 0.2803010940551758, "learning_rate": 2.9999999999999997e-05, "loss": 4.5912, "step": 41785 }, { "epoch": 0.8557737595478467, "grad_norm": 0.2792489230632782, "learning_rate": 2.9999999999999997e-05, "loss": 4.5509, "step": 41790 }, { "epoch": 0.8558761493252514, "grad_norm": 0.2824473977088928, "learning_rate": 2.9999999999999997e-05, "loss": 4.5703, "step": 41795 }, { "epoch": 0.855978539102656, "grad_norm": 0.28354400396347046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5343, "step": 41800 }, { "epoch": 0.8560809288800606, "grad_norm": 0.2932973802089691, "learning_rate": 2.9999999999999997e-05, "loss": 4.5303, "step": 41805 }, { "epoch": 0.8561833186574652, "grad_norm": 0.2828560471534729, "learning_rate": 2.9999999999999997e-05, "loss": 4.5479, "step": 41810 }, { "epoch": 0.8562857084348698, "grad_norm": 0.2938644587993622, "learning_rate": 2.9999999999999997e-05, "loss": 4.5818, "step": 41815 }, { "epoch": 0.8563880982122745, "grad_norm": 0.2887813448905945, "learning_rate": 2.9999999999999997e-05, "loss": 4.5911, "step": 41820 }, { "epoch": 0.8564904879896791, "grad_norm": 0.29226839542388916, "learning_rate": 2.9999999999999997e-05, "loss": 4.6075, "step": 41825 }, { "epoch": 0.8565928777670837, "grad_norm": 0.2809842824935913, "learning_rate": 2.9999999999999997e-05, "loss": 4.5584, "step": 41830 }, { "epoch": 0.8566952675444883, "grad_norm": 0.28355011343955994, "learning_rate": 2.9999999999999997e-05, "loss": 4.5645, "step": 41835 }, { "epoch": 0.856797657321893, "grad_norm": 0.2816533148288727, "learning_rate": 2.9999999999999997e-05, "loss": 4.5817, "step": 41840 }, { "epoch": 0.8569000470992976, "grad_norm": 0.2742515206336975, "learning_rate": 2.9999999999999997e-05, "loss": 4.5904, "step": 41845 }, { "epoch": 0.8570024368767022, "grad_norm": 0.27726268768310547, "learning_rate": 2.9999999999999997e-05, "loss": 4.5249, "step": 41850 }, { "epoch": 0.8571048266541068, "grad_norm": 0.2813457250595093, "learning_rate": 2.9999999999999997e-05, "loss": 4.5373, "step": 41855 }, { "epoch": 0.8572072164315114, "grad_norm": 0.32697102427482605, "learning_rate": 2.9999999999999997e-05, "loss": 4.5479, "step": 41860 }, { "epoch": 0.857309606208916, "grad_norm": 0.29770803451538086, "learning_rate": 2.9999999999999997e-05, "loss": 4.5563, "step": 41865 }, { "epoch": 0.8574119959863207, "grad_norm": 0.27651724219322205, "learning_rate": 2.9999999999999997e-05, "loss": 4.5707, "step": 41870 }, { "epoch": 0.8575143857637253, "grad_norm": 0.28034499287605286, "learning_rate": 2.9999999999999997e-05, "loss": 4.5569, "step": 41875 }, { "epoch": 0.85761677554113, "grad_norm": 0.2771777808666229, "learning_rate": 2.9999999999999997e-05, "loss": 4.5803, "step": 41880 }, { "epoch": 0.8577191653185346, "grad_norm": 0.2932230532169342, "learning_rate": 2.9999999999999997e-05, "loss": 4.5385, "step": 41885 }, { "epoch": 0.8578215550959393, "grad_norm": 0.32938718795776367, "learning_rate": 2.9999999999999997e-05, "loss": 4.6359, "step": 41890 }, { "epoch": 0.8579239448733439, "grad_norm": 0.28788164258003235, "learning_rate": 2.9999999999999997e-05, "loss": 4.5867, "step": 41895 }, { "epoch": 0.8580263346507485, "grad_norm": 0.3202422559261322, "learning_rate": 2.9999999999999997e-05, "loss": 4.5261, "step": 41900 }, { "epoch": 0.8581287244281531, "grad_norm": 0.286275178194046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5838, "step": 41905 }, { "epoch": 0.8582311142055578, "grad_norm": 0.29635751247406006, "learning_rate": 2.9999999999999997e-05, "loss": 4.5943, "step": 41910 }, { "epoch": 0.8583335039829624, "grad_norm": 0.29215869307518005, "learning_rate": 2.9999999999999997e-05, "loss": 4.6463, "step": 41915 }, { "epoch": 0.858435893760367, "grad_norm": 0.28455254435539246, "learning_rate": 2.9999999999999997e-05, "loss": 4.5548, "step": 41920 }, { "epoch": 0.8585382835377716, "grad_norm": 0.2889324724674225, "learning_rate": 2.9999999999999997e-05, "loss": 4.5618, "step": 41925 }, { "epoch": 0.8586406733151762, "grad_norm": 0.2818892002105713, "learning_rate": 2.9999999999999997e-05, "loss": 4.5916, "step": 41930 }, { "epoch": 0.8587430630925809, "grad_norm": 0.2804669737815857, "learning_rate": 2.9999999999999997e-05, "loss": 4.6042, "step": 41935 }, { "epoch": 0.8588454528699855, "grad_norm": 0.28407761454582214, "learning_rate": 2.9999999999999997e-05, "loss": 4.5983, "step": 41940 }, { "epoch": 0.8589478426473901, "grad_norm": 0.30998361110687256, "learning_rate": 2.9999999999999997e-05, "loss": 4.5394, "step": 41945 }, { "epoch": 0.8590502324247947, "grad_norm": 0.2851797342300415, "learning_rate": 2.9999999999999997e-05, "loss": 4.5569, "step": 41950 }, { "epoch": 0.8591526222021993, "grad_norm": 0.4206336438655853, "learning_rate": 2.9999999999999997e-05, "loss": 4.5316, "step": 41955 }, { "epoch": 0.859255011979604, "grad_norm": 0.29635483026504517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5399, "step": 41960 }, { "epoch": 0.8593574017570086, "grad_norm": 0.2879798114299774, "learning_rate": 2.9999999999999997e-05, "loss": 4.5464, "step": 41965 }, { "epoch": 0.8594597915344132, "grad_norm": 0.3061964511871338, "learning_rate": 2.9999999999999997e-05, "loss": 4.5961, "step": 41970 }, { "epoch": 0.8595621813118178, "grad_norm": 0.29285728931427, "learning_rate": 2.9999999999999997e-05, "loss": 4.5452, "step": 41975 }, { "epoch": 0.8596645710892225, "grad_norm": 0.2791544198989868, "learning_rate": 2.9999999999999997e-05, "loss": 4.59, "step": 41980 }, { "epoch": 0.8597669608666271, "grad_norm": 0.28416672348976135, "learning_rate": 2.9999999999999997e-05, "loss": 4.6065, "step": 41985 }, { "epoch": 0.8598693506440317, "grad_norm": 0.28662338852882385, "learning_rate": 2.9999999999999997e-05, "loss": 4.5532, "step": 41990 }, { "epoch": 0.8599717404214363, "grad_norm": 0.29011231660842896, "learning_rate": 2.9999999999999997e-05, "loss": 4.6009, "step": 41995 }, { "epoch": 0.8600741301988409, "grad_norm": 0.29358547925949097, "learning_rate": 2.9999999999999997e-05, "loss": 4.5478, "step": 42000 }, { "epoch": 0.8600741301988409, "eval_loss": 4.590585231781006, "eval_runtime": 17.1325, "eval_samples_per_second": 33.971, "eval_steps_per_second": 4.261, "step": 42000 }, { "epoch": 0.8601765199762456, "grad_norm": 0.2929130792617798, "learning_rate": 2.9999999999999997e-05, "loss": 4.5863, "step": 42005 }, { "epoch": 0.8602789097536502, "grad_norm": 0.29090362787246704, "learning_rate": 2.9999999999999997e-05, "loss": 4.5614, "step": 42010 }, { "epoch": 0.8603812995310548, "grad_norm": 0.27907586097717285, "learning_rate": 2.9999999999999997e-05, "loss": 4.5621, "step": 42015 }, { "epoch": 0.8604836893084594, "grad_norm": 0.282423734664917, "learning_rate": 2.9999999999999997e-05, "loss": 4.549, "step": 42020 }, { "epoch": 0.860586079085864, "grad_norm": 0.28453391790390015, "learning_rate": 2.9999999999999997e-05, "loss": 4.536, "step": 42025 }, { "epoch": 0.8606884688632687, "grad_norm": 0.3082663416862488, "learning_rate": 2.9999999999999997e-05, "loss": 4.5478, "step": 42030 }, { "epoch": 0.8607908586406733, "grad_norm": 0.2766706049442291, "learning_rate": 2.9999999999999997e-05, "loss": 4.5169, "step": 42035 }, { "epoch": 0.8608932484180779, "grad_norm": 0.28100666403770447, "learning_rate": 2.9999999999999997e-05, "loss": 4.543, "step": 42040 }, { "epoch": 0.8609956381954825, "grad_norm": 0.30033186078071594, "learning_rate": 2.9999999999999997e-05, "loss": 4.6223, "step": 42045 }, { "epoch": 0.8610980279728871, "grad_norm": 0.2801840603351593, "learning_rate": 2.9999999999999997e-05, "loss": 4.5235, "step": 42050 }, { "epoch": 0.8612004177502918, "grad_norm": 0.27475932240486145, "learning_rate": 2.9999999999999997e-05, "loss": 4.581, "step": 42055 }, { "epoch": 0.8613028075276964, "grad_norm": 0.28085681796073914, "learning_rate": 2.9999999999999997e-05, "loss": 4.5854, "step": 42060 }, { "epoch": 0.861405197305101, "grad_norm": 0.2858322262763977, "learning_rate": 2.9999999999999997e-05, "loss": 4.6135, "step": 42065 }, { "epoch": 0.8615075870825056, "grad_norm": 0.29247337579727173, "learning_rate": 2.9999999999999997e-05, "loss": 4.5652, "step": 42070 }, { "epoch": 0.8616099768599103, "grad_norm": 0.28009599447250366, "learning_rate": 2.9999999999999997e-05, "loss": 4.5821, "step": 42075 }, { "epoch": 0.861712366637315, "grad_norm": 0.2810896337032318, "learning_rate": 2.9999999999999997e-05, "loss": 4.578, "step": 42080 }, { "epoch": 0.8618147564147196, "grad_norm": 0.3237174153327942, "learning_rate": 2.9999999999999997e-05, "loss": 4.5869, "step": 42085 }, { "epoch": 0.8619171461921242, "grad_norm": 0.2805618643760681, "learning_rate": 2.9999999999999997e-05, "loss": 4.6044, "step": 42090 }, { "epoch": 0.8620195359695288, "grad_norm": 0.2970050871372223, "learning_rate": 2.9999999999999997e-05, "loss": 4.5761, "step": 42095 }, { "epoch": 0.8621219257469335, "grad_norm": 0.29210126399993896, "learning_rate": 2.9999999999999997e-05, "loss": 4.5839, "step": 42100 }, { "epoch": 0.8622243155243381, "grad_norm": 0.27998438477516174, "learning_rate": 2.9999999999999997e-05, "loss": 4.5538, "step": 42105 }, { "epoch": 0.8623267053017427, "grad_norm": 0.2854512333869934, "learning_rate": 2.9999999999999997e-05, "loss": 4.6154, "step": 42110 }, { "epoch": 0.8624290950791473, "grad_norm": 0.36673498153686523, "learning_rate": 2.9999999999999997e-05, "loss": 4.5629, "step": 42115 }, { "epoch": 0.862531484856552, "grad_norm": 0.4028867781162262, "learning_rate": 2.9999999999999997e-05, "loss": 4.6183, "step": 42120 }, { "epoch": 0.8626338746339566, "grad_norm": 0.29421451687812805, "learning_rate": 2.9999999999999997e-05, "loss": 4.5932, "step": 42125 }, { "epoch": 0.8627362644113612, "grad_norm": 0.27536529302597046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5751, "step": 42130 }, { "epoch": 0.8628386541887658, "grad_norm": 0.28098100423812866, "learning_rate": 2.9999999999999997e-05, "loss": 4.6018, "step": 42135 }, { "epoch": 0.8629410439661704, "grad_norm": 0.27294838428497314, "learning_rate": 2.9999999999999997e-05, "loss": 4.6355, "step": 42140 }, { "epoch": 0.8630434337435751, "grad_norm": 0.273320734500885, "learning_rate": 2.9999999999999997e-05, "loss": 4.595, "step": 42145 }, { "epoch": 0.8631458235209797, "grad_norm": 0.31179067492485046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5569, "step": 42150 }, { "epoch": 0.8632482132983843, "grad_norm": 0.2889743745326996, "learning_rate": 2.9999999999999997e-05, "loss": 4.4923, "step": 42155 }, { "epoch": 0.8633506030757889, "grad_norm": 0.27962151169776917, "learning_rate": 2.9999999999999997e-05, "loss": 4.5543, "step": 42160 }, { "epoch": 0.8634529928531935, "grad_norm": 0.2781238555908203, "learning_rate": 2.9999999999999997e-05, "loss": 4.5395, "step": 42165 }, { "epoch": 0.8635553826305982, "grad_norm": 0.27796876430511475, "learning_rate": 2.9999999999999997e-05, "loss": 4.5305, "step": 42170 }, { "epoch": 0.8636577724080028, "grad_norm": 0.2779185473918915, "learning_rate": 2.9999999999999997e-05, "loss": 4.572, "step": 42175 }, { "epoch": 0.8637601621854074, "grad_norm": 0.3074818253517151, "learning_rate": 2.9999999999999997e-05, "loss": 4.5777, "step": 42180 }, { "epoch": 0.863862551962812, "grad_norm": 0.2755137085914612, "learning_rate": 2.9999999999999997e-05, "loss": 4.5879, "step": 42185 }, { "epoch": 0.8639649417402167, "grad_norm": 0.3061232566833496, "learning_rate": 2.9999999999999997e-05, "loss": 4.57, "step": 42190 }, { "epoch": 0.8640673315176213, "grad_norm": 0.27670255303382874, "learning_rate": 2.9999999999999997e-05, "loss": 4.6003, "step": 42195 }, { "epoch": 0.8641697212950259, "grad_norm": 0.32295048236846924, "learning_rate": 2.9999999999999997e-05, "loss": 4.578, "step": 42200 }, { "epoch": 0.8642721110724305, "grad_norm": 0.2795508801937103, "learning_rate": 2.9999999999999997e-05, "loss": 4.5444, "step": 42205 }, { "epoch": 0.8643745008498351, "grad_norm": 0.28401806950569153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5526, "step": 42210 }, { "epoch": 0.8644768906272398, "grad_norm": 0.29551583528518677, "learning_rate": 2.9999999999999997e-05, "loss": 4.5639, "step": 42215 }, { "epoch": 0.8645792804046444, "grad_norm": 0.2854187786579132, "learning_rate": 2.9999999999999997e-05, "loss": 4.6054, "step": 42220 }, { "epoch": 0.864681670182049, "grad_norm": 0.2859326899051666, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 42225 }, { "epoch": 0.8647840599594536, "grad_norm": 0.2843199372291565, "learning_rate": 2.9999999999999997e-05, "loss": 4.5481, "step": 42230 }, { "epoch": 0.8648864497368582, "grad_norm": 0.2788691818714142, "learning_rate": 2.9999999999999997e-05, "loss": 4.6117, "step": 42235 }, { "epoch": 0.8649888395142629, "grad_norm": 0.29678836464881897, "learning_rate": 2.9999999999999997e-05, "loss": 4.5059, "step": 42240 }, { "epoch": 0.8650912292916675, "grad_norm": 0.277492880821228, "learning_rate": 2.9999999999999997e-05, "loss": 4.5295, "step": 42245 }, { "epoch": 0.8651936190690721, "grad_norm": 0.2821527421474457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5915, "step": 42250 }, { "epoch": 0.8652960088464767, "grad_norm": 0.28887367248535156, "learning_rate": 2.9999999999999997e-05, "loss": 4.5833, "step": 42255 }, { "epoch": 0.8653983986238813, "grad_norm": 0.28567564487457275, "learning_rate": 2.9999999999999997e-05, "loss": 4.5263, "step": 42260 }, { "epoch": 0.865500788401286, "grad_norm": 0.2992493212223053, "learning_rate": 2.9999999999999997e-05, "loss": 4.5533, "step": 42265 }, { "epoch": 0.8656031781786906, "grad_norm": 0.28368014097213745, "learning_rate": 2.9999999999999997e-05, "loss": 4.6059, "step": 42270 }, { "epoch": 0.8657055679560952, "grad_norm": 0.27183762192726135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5358, "step": 42275 }, { "epoch": 0.8658079577334998, "grad_norm": 0.565661609172821, "learning_rate": 2.9999999999999997e-05, "loss": 4.5022, "step": 42280 }, { "epoch": 0.8659103475109046, "grad_norm": 0.28837451338768005, "learning_rate": 2.9999999999999997e-05, "loss": 4.6116, "step": 42285 }, { "epoch": 0.8660127372883092, "grad_norm": 0.2835419476032257, "learning_rate": 2.9999999999999997e-05, "loss": 4.4694, "step": 42290 }, { "epoch": 0.8661151270657138, "grad_norm": 1.099507451057434, "learning_rate": 2.9999999999999997e-05, "loss": 4.6911, "step": 42295 }, { "epoch": 0.8662175168431184, "grad_norm": 0.2809179425239563, "learning_rate": 2.9999999999999997e-05, "loss": 4.5912, "step": 42300 }, { "epoch": 0.866319906620523, "grad_norm": 0.2871781587600708, "learning_rate": 2.9999999999999997e-05, "loss": 4.5526, "step": 42305 }, { "epoch": 0.8664222963979277, "grad_norm": 0.3089900016784668, "learning_rate": 2.9999999999999997e-05, "loss": 4.5706, "step": 42310 }, { "epoch": 0.8665246861753323, "grad_norm": 0.2921340763568878, "learning_rate": 2.9999999999999997e-05, "loss": 4.5614, "step": 42315 }, { "epoch": 0.8666270759527369, "grad_norm": 0.28606513142585754, "learning_rate": 2.9999999999999997e-05, "loss": 4.5368, "step": 42320 }, { "epoch": 0.8667294657301415, "grad_norm": 0.2628849744796753, "learning_rate": 2.9999999999999997e-05, "loss": 4.5403, "step": 42325 }, { "epoch": 0.8668318555075462, "grad_norm": 0.2877076268196106, "learning_rate": 2.9999999999999997e-05, "loss": 4.5371, "step": 42330 }, { "epoch": 0.8669342452849508, "grad_norm": 0.2788994014263153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5858, "step": 42335 }, { "epoch": 0.8670366350623554, "grad_norm": 0.27799710631370544, "learning_rate": 2.9999999999999997e-05, "loss": 4.566, "step": 42340 }, { "epoch": 0.86713902483976, "grad_norm": 0.4452689588069916, "learning_rate": 2.9999999999999997e-05, "loss": 4.6125, "step": 42345 }, { "epoch": 0.8672414146171646, "grad_norm": 0.30955836176872253, "learning_rate": 2.9999999999999997e-05, "loss": 4.555, "step": 42350 }, { "epoch": 0.8673438043945693, "grad_norm": 0.30161789059638977, "learning_rate": 2.9999999999999997e-05, "loss": 4.6057, "step": 42355 }, { "epoch": 0.8674461941719739, "grad_norm": 0.3116542100906372, "learning_rate": 2.9999999999999997e-05, "loss": 4.545, "step": 42360 }, { "epoch": 0.8675485839493785, "grad_norm": 0.283415287733078, "learning_rate": 2.9999999999999997e-05, "loss": 4.556, "step": 42365 }, { "epoch": 0.8676509737267831, "grad_norm": 0.3659295439720154, "learning_rate": 2.9999999999999997e-05, "loss": 4.5269, "step": 42370 }, { "epoch": 0.8677533635041877, "grad_norm": 0.3008820116519928, "learning_rate": 2.9999999999999997e-05, "loss": 4.5487, "step": 42375 }, { "epoch": 0.8678557532815924, "grad_norm": 0.29011234641075134, "learning_rate": 2.9999999999999997e-05, "loss": 4.6369, "step": 42380 }, { "epoch": 0.867958143058997, "grad_norm": 0.29220259189605713, "learning_rate": 2.9999999999999997e-05, "loss": 4.6019, "step": 42385 }, { "epoch": 0.8680605328364016, "grad_norm": 0.33202919363975525, "learning_rate": 2.9999999999999997e-05, "loss": 4.5812, "step": 42390 }, { "epoch": 0.8681629226138062, "grad_norm": 0.28137901425361633, "learning_rate": 2.9999999999999997e-05, "loss": 4.5701, "step": 42395 }, { "epoch": 0.8682653123912109, "grad_norm": 0.2826639711856842, "learning_rate": 2.9999999999999997e-05, "loss": 4.6094, "step": 42400 }, { "epoch": 0.8683677021686155, "grad_norm": 0.27954402565956116, "learning_rate": 2.9999999999999997e-05, "loss": 4.5638, "step": 42405 }, { "epoch": 0.8684700919460201, "grad_norm": 0.29488644003868103, "learning_rate": 2.9999999999999997e-05, "loss": 4.5684, "step": 42410 }, { "epoch": 0.8685724817234247, "grad_norm": 0.30218660831451416, "learning_rate": 2.9999999999999997e-05, "loss": 4.5535, "step": 42415 }, { "epoch": 0.8686748715008293, "grad_norm": 0.28542274236679077, "learning_rate": 2.9999999999999997e-05, "loss": 4.5591, "step": 42420 }, { "epoch": 0.868777261278234, "grad_norm": 0.289728045463562, "learning_rate": 2.9999999999999997e-05, "loss": 4.5724, "step": 42425 }, { "epoch": 0.8688796510556386, "grad_norm": 0.2904987037181854, "learning_rate": 2.9999999999999997e-05, "loss": 4.5675, "step": 42430 }, { "epoch": 0.8689820408330432, "grad_norm": 0.28901374340057373, "learning_rate": 2.9999999999999997e-05, "loss": 4.5979, "step": 42435 }, { "epoch": 0.8690844306104478, "grad_norm": 0.30808329582214355, "learning_rate": 2.9999999999999997e-05, "loss": 4.5183, "step": 42440 }, { "epoch": 0.8691868203878524, "grad_norm": 0.2810181975364685, "learning_rate": 2.9999999999999997e-05, "loss": 4.5318, "step": 42445 }, { "epoch": 0.8692892101652571, "grad_norm": 0.28387823700904846, "learning_rate": 2.9999999999999997e-05, "loss": 4.5458, "step": 42450 }, { "epoch": 0.8693915999426617, "grad_norm": 0.2959236800670624, "learning_rate": 2.9999999999999997e-05, "loss": 4.5628, "step": 42455 }, { "epoch": 0.8694939897200663, "grad_norm": 0.33226150274276733, "learning_rate": 2.9999999999999997e-05, "loss": 4.5292, "step": 42460 }, { "epoch": 0.8695963794974709, "grad_norm": 0.2962782382965088, "learning_rate": 2.9999999999999997e-05, "loss": 4.5998, "step": 42465 }, { "epoch": 0.8696987692748755, "grad_norm": 0.3628597557544708, "learning_rate": 2.9999999999999997e-05, "loss": 4.633, "step": 42470 }, { "epoch": 0.8698011590522802, "grad_norm": 0.30368393659591675, "learning_rate": 2.9999999999999997e-05, "loss": 4.5538, "step": 42475 }, { "epoch": 0.8699035488296848, "grad_norm": 0.29624828696250916, "learning_rate": 2.9999999999999997e-05, "loss": 4.5782, "step": 42480 }, { "epoch": 0.8700059386070894, "grad_norm": 0.2826397716999054, "learning_rate": 2.9999999999999997e-05, "loss": 4.5451, "step": 42485 }, { "epoch": 0.8701083283844941, "grad_norm": 0.2871544063091278, "learning_rate": 2.9999999999999997e-05, "loss": 4.5634, "step": 42490 }, { "epoch": 0.8702107181618988, "grad_norm": 0.28926998376846313, "learning_rate": 2.9999999999999997e-05, "loss": 4.5313, "step": 42495 }, { "epoch": 0.8703131079393034, "grad_norm": 0.2764125466346741, "learning_rate": 2.9999999999999997e-05, "loss": 4.5853, "step": 42500 }, { "epoch": 0.870415497716708, "grad_norm": 0.2877795398235321, "learning_rate": 2.9999999999999997e-05, "loss": 4.5678, "step": 42505 }, { "epoch": 0.8705178874941126, "grad_norm": 0.3008129298686981, "learning_rate": 2.9999999999999997e-05, "loss": 4.5419, "step": 42510 }, { "epoch": 0.8706202772715173, "grad_norm": 0.2770102322101593, "learning_rate": 2.9999999999999997e-05, "loss": 4.5813, "step": 42515 }, { "epoch": 0.8707226670489219, "grad_norm": 0.2859014570713043, "learning_rate": 2.9999999999999997e-05, "loss": 4.6064, "step": 42520 }, { "epoch": 0.8708250568263265, "grad_norm": 0.3001577854156494, "learning_rate": 2.9999999999999997e-05, "loss": 4.5527, "step": 42525 }, { "epoch": 0.8709274466037311, "grad_norm": 0.28082939982414246, "learning_rate": 2.9999999999999997e-05, "loss": 4.5822, "step": 42530 }, { "epoch": 0.8710298363811357, "grad_norm": 0.2857016921043396, "learning_rate": 2.9999999999999997e-05, "loss": 4.5967, "step": 42535 }, { "epoch": 0.8711322261585404, "grad_norm": 0.6677747368812561, "learning_rate": 2.9999999999999997e-05, "loss": 4.5275, "step": 42540 }, { "epoch": 0.871234615935945, "grad_norm": 0.29151827096939087, "learning_rate": 2.9999999999999997e-05, "loss": 4.5613, "step": 42545 }, { "epoch": 0.8713370057133496, "grad_norm": 0.29731205105781555, "learning_rate": 2.9999999999999997e-05, "loss": 4.5673, "step": 42550 }, { "epoch": 0.8714393954907542, "grad_norm": 0.4542264938354492, "learning_rate": 2.9999999999999997e-05, "loss": 4.5017, "step": 42555 }, { "epoch": 0.8715417852681588, "grad_norm": 0.33812379837036133, "learning_rate": 2.9999999999999997e-05, "loss": 4.4928, "step": 42560 }, { "epoch": 0.8716441750455635, "grad_norm": 0.30377838015556335, "learning_rate": 2.9999999999999997e-05, "loss": 4.5837, "step": 42565 }, { "epoch": 0.8717465648229681, "grad_norm": 0.2798381447792053, "learning_rate": 2.9999999999999997e-05, "loss": 4.6044, "step": 42570 }, { "epoch": 0.8718489546003727, "grad_norm": 0.2912675440311432, "learning_rate": 2.9999999999999997e-05, "loss": 4.5703, "step": 42575 }, { "epoch": 0.8719513443777773, "grad_norm": 0.2935977578163147, "learning_rate": 2.9999999999999997e-05, "loss": 4.5622, "step": 42580 }, { "epoch": 0.872053734155182, "grad_norm": 0.2936408519744873, "learning_rate": 2.9999999999999997e-05, "loss": 4.5041, "step": 42585 }, { "epoch": 0.8721561239325866, "grad_norm": 0.39489099383354187, "learning_rate": 2.9999999999999997e-05, "loss": 4.5734, "step": 42590 }, { "epoch": 0.8722585137099912, "grad_norm": 0.2872966229915619, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 42595 }, { "epoch": 0.8723609034873958, "grad_norm": 0.29036393761634827, "learning_rate": 2.9999999999999997e-05, "loss": 4.5485, "step": 42600 }, { "epoch": 0.8724632932648004, "grad_norm": 0.28508612513542175, "learning_rate": 2.9999999999999997e-05, "loss": 4.5078, "step": 42605 }, { "epoch": 0.872565683042205, "grad_norm": 0.3030988276004791, "learning_rate": 2.9999999999999997e-05, "loss": 4.4962, "step": 42610 }, { "epoch": 0.8726680728196097, "grad_norm": 0.2963111102581024, "learning_rate": 2.9999999999999997e-05, "loss": 4.5934, "step": 42615 }, { "epoch": 0.8727704625970143, "grad_norm": 0.28452345728874207, "learning_rate": 2.9999999999999997e-05, "loss": 4.5454, "step": 42620 }, { "epoch": 0.8728728523744189, "grad_norm": 0.29631686210632324, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 42625 }, { "epoch": 0.8729752421518235, "grad_norm": 0.2943074703216553, "learning_rate": 2.9999999999999997e-05, "loss": 4.5386, "step": 42630 }, { "epoch": 0.8730776319292282, "grad_norm": 0.28320398926734924, "learning_rate": 2.9999999999999997e-05, "loss": 4.553, "step": 42635 }, { "epoch": 0.8731800217066328, "grad_norm": 0.29807671904563904, "learning_rate": 2.9999999999999997e-05, "loss": 4.5019, "step": 42640 }, { "epoch": 0.8732824114840374, "grad_norm": 0.2764509916305542, "learning_rate": 2.9999999999999997e-05, "loss": 4.5759, "step": 42645 }, { "epoch": 0.873384801261442, "grad_norm": 0.30844801664352417, "learning_rate": 2.9999999999999997e-05, "loss": 4.6115, "step": 42650 }, { "epoch": 0.8734871910388466, "grad_norm": 0.2824011445045471, "learning_rate": 2.9999999999999997e-05, "loss": 4.6278, "step": 42655 }, { "epoch": 0.8735895808162513, "grad_norm": 0.2877044677734375, "learning_rate": 2.9999999999999997e-05, "loss": 4.5788, "step": 42660 }, { "epoch": 0.8736919705936559, "grad_norm": 0.27674928307533264, "learning_rate": 2.9999999999999997e-05, "loss": 4.5951, "step": 42665 }, { "epoch": 0.8737943603710605, "grad_norm": 0.2907717823982239, "learning_rate": 2.9999999999999997e-05, "loss": 4.4565, "step": 42670 }, { "epoch": 0.8738967501484651, "grad_norm": 0.3075560927391052, "learning_rate": 2.9999999999999997e-05, "loss": 4.5205, "step": 42675 }, { "epoch": 0.8739991399258698, "grad_norm": 0.29234230518341064, "learning_rate": 2.9999999999999997e-05, "loss": 4.5986, "step": 42680 }, { "epoch": 0.8741015297032744, "grad_norm": 0.2991107702255249, "learning_rate": 2.9999999999999997e-05, "loss": 4.6258, "step": 42685 }, { "epoch": 0.8742039194806791, "grad_norm": 0.2882688343524933, "learning_rate": 2.9999999999999997e-05, "loss": 4.5573, "step": 42690 }, { "epoch": 0.8743063092580837, "grad_norm": 0.286185622215271, "learning_rate": 2.9999999999999997e-05, "loss": 4.5586, "step": 42695 }, { "epoch": 0.8744086990354883, "grad_norm": 0.3102838397026062, "learning_rate": 2.9999999999999997e-05, "loss": 4.62, "step": 42700 }, { "epoch": 0.874511088812893, "grad_norm": 0.2933298945426941, "learning_rate": 2.9999999999999997e-05, "loss": 4.6344, "step": 42705 }, { "epoch": 0.8746134785902976, "grad_norm": 0.3276117146015167, "learning_rate": 2.9999999999999997e-05, "loss": 4.5119, "step": 42710 }, { "epoch": 0.8747158683677022, "grad_norm": 0.33533158898353577, "learning_rate": 2.9999999999999997e-05, "loss": 4.5592, "step": 42715 }, { "epoch": 0.8748182581451068, "grad_norm": 0.28675273060798645, "learning_rate": 2.9999999999999997e-05, "loss": 4.6168, "step": 42720 }, { "epoch": 0.8749206479225115, "grad_norm": 0.279441237449646, "learning_rate": 2.9999999999999997e-05, "loss": 4.5934, "step": 42725 }, { "epoch": 0.8750230376999161, "grad_norm": 0.3046245574951172, "learning_rate": 2.9999999999999997e-05, "loss": 4.5823, "step": 42730 }, { "epoch": 0.8751254274773207, "grad_norm": 0.30070775747299194, "learning_rate": 2.9999999999999997e-05, "loss": 4.5309, "step": 42735 }, { "epoch": 0.8752278172547253, "grad_norm": 0.27930042147636414, "learning_rate": 2.9999999999999997e-05, "loss": 4.5464, "step": 42740 }, { "epoch": 0.8753302070321299, "grad_norm": 0.2897210121154785, "learning_rate": 2.9999999999999997e-05, "loss": 4.6005, "step": 42745 }, { "epoch": 0.8754325968095346, "grad_norm": 0.3091546893119812, "learning_rate": 2.9999999999999997e-05, "loss": 4.5816, "step": 42750 }, { "epoch": 0.8755349865869392, "grad_norm": 0.3109922409057617, "learning_rate": 2.9999999999999997e-05, "loss": 4.5946, "step": 42755 }, { "epoch": 0.8756373763643438, "grad_norm": 0.29840153455734253, "learning_rate": 2.9999999999999997e-05, "loss": 4.6148, "step": 42760 }, { "epoch": 0.8757397661417484, "grad_norm": 0.2970777750015259, "learning_rate": 2.9999999999999997e-05, "loss": 4.5784, "step": 42765 }, { "epoch": 0.875842155919153, "grad_norm": 0.2896730899810791, "learning_rate": 2.9999999999999997e-05, "loss": 4.5952, "step": 42770 }, { "epoch": 0.8759445456965577, "grad_norm": 0.29766690731048584, "learning_rate": 2.9999999999999997e-05, "loss": 4.5608, "step": 42775 }, { "epoch": 0.8760469354739623, "grad_norm": 0.2844490110874176, "learning_rate": 2.9999999999999997e-05, "loss": 4.5695, "step": 42780 }, { "epoch": 0.8761493252513669, "grad_norm": 0.2948978543281555, "learning_rate": 2.9999999999999997e-05, "loss": 4.573, "step": 42785 }, { "epoch": 0.8762517150287715, "grad_norm": 0.3014620840549469, "learning_rate": 2.9999999999999997e-05, "loss": 4.6025, "step": 42790 }, { "epoch": 0.8763541048061761, "grad_norm": 0.2828783094882965, "learning_rate": 2.9999999999999997e-05, "loss": 4.5536, "step": 42795 }, { "epoch": 0.8764564945835808, "grad_norm": 0.2940693795681, "learning_rate": 2.9999999999999997e-05, "loss": 4.5767, "step": 42800 }, { "epoch": 0.8765588843609854, "grad_norm": 0.29585936665534973, "learning_rate": 2.9999999999999997e-05, "loss": 4.5989, "step": 42805 }, { "epoch": 0.87666127413839, "grad_norm": 0.2852897346019745, "learning_rate": 2.9999999999999997e-05, "loss": 4.5354, "step": 42810 }, { "epoch": 0.8767636639157946, "grad_norm": 0.28615662455558777, "learning_rate": 2.9999999999999997e-05, "loss": 4.5227, "step": 42815 }, { "epoch": 0.8768660536931993, "grad_norm": 0.29597118496894836, "learning_rate": 2.9999999999999997e-05, "loss": 4.5613, "step": 42820 }, { "epoch": 0.8769684434706039, "grad_norm": 0.28099918365478516, "learning_rate": 2.9999999999999997e-05, "loss": 4.5753, "step": 42825 }, { "epoch": 0.8770708332480085, "grad_norm": 0.29338228702545166, "learning_rate": 2.9999999999999997e-05, "loss": 4.5536, "step": 42830 }, { "epoch": 0.8771732230254131, "grad_norm": 0.28326815366744995, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 42835 }, { "epoch": 0.8772756128028177, "grad_norm": 0.2854740023612976, "learning_rate": 2.9999999999999997e-05, "loss": 4.6029, "step": 42840 }, { "epoch": 0.8773780025802224, "grad_norm": 0.2841622531414032, "learning_rate": 2.9999999999999997e-05, "loss": 4.595, "step": 42845 }, { "epoch": 0.877480392357627, "grad_norm": 0.28582340478897095, "learning_rate": 2.9999999999999997e-05, "loss": 4.5594, "step": 42850 }, { "epoch": 0.8775827821350316, "grad_norm": 0.3178538382053375, "learning_rate": 2.9999999999999997e-05, "loss": 4.58, "step": 42855 }, { "epoch": 0.8776851719124362, "grad_norm": 0.28321757912635803, "learning_rate": 2.9999999999999997e-05, "loss": 4.6036, "step": 42860 }, { "epoch": 0.8777875616898408, "grad_norm": 0.28525102138519287, "learning_rate": 2.9999999999999997e-05, "loss": 4.5574, "step": 42865 }, { "epoch": 0.8778899514672455, "grad_norm": 0.29060178995132446, "learning_rate": 2.9999999999999997e-05, "loss": 4.5759, "step": 42870 }, { "epoch": 0.8779923412446501, "grad_norm": 0.30368462204933167, "learning_rate": 2.9999999999999997e-05, "loss": 4.5436, "step": 42875 }, { "epoch": 0.8780947310220547, "grad_norm": 0.30066636204719543, "learning_rate": 2.9999999999999997e-05, "loss": 4.559, "step": 42880 }, { "epoch": 0.8781971207994593, "grad_norm": 0.41088446974754333, "learning_rate": 2.9999999999999997e-05, "loss": 4.5113, "step": 42885 }, { "epoch": 0.878299510576864, "grad_norm": 0.3057084083557129, "learning_rate": 2.9999999999999997e-05, "loss": 4.5543, "step": 42890 }, { "epoch": 0.8784019003542687, "grad_norm": 0.2873361110687256, "learning_rate": 2.9999999999999997e-05, "loss": 4.5912, "step": 42895 }, { "epoch": 0.8785042901316733, "grad_norm": 0.2893359065055847, "learning_rate": 2.9999999999999997e-05, "loss": 4.6089, "step": 42900 }, { "epoch": 0.8786066799090779, "grad_norm": 0.30454859137535095, "learning_rate": 2.9999999999999997e-05, "loss": 4.5792, "step": 42905 }, { "epoch": 0.8787090696864825, "grad_norm": 0.3234281539916992, "learning_rate": 2.9999999999999997e-05, "loss": 4.6122, "step": 42910 }, { "epoch": 0.8788114594638872, "grad_norm": 0.307363897562027, "learning_rate": 2.9999999999999997e-05, "loss": 4.5924, "step": 42915 }, { "epoch": 0.8789138492412918, "grad_norm": 0.2866811752319336, "learning_rate": 2.9999999999999997e-05, "loss": 4.5603, "step": 42920 }, { "epoch": 0.8790162390186964, "grad_norm": 0.29870229959487915, "learning_rate": 2.9999999999999997e-05, "loss": 4.5697, "step": 42925 }, { "epoch": 0.879118628796101, "grad_norm": 0.29656752943992615, "learning_rate": 2.9999999999999997e-05, "loss": 4.5812, "step": 42930 }, { "epoch": 0.8792210185735057, "grad_norm": 0.30561700463294983, "learning_rate": 2.9999999999999997e-05, "loss": 4.5504, "step": 42935 }, { "epoch": 0.8793234083509103, "grad_norm": 0.32332366704940796, "learning_rate": 2.9999999999999997e-05, "loss": 4.5786, "step": 42940 }, { "epoch": 0.8794257981283149, "grad_norm": 0.28673917055130005, "learning_rate": 2.9999999999999997e-05, "loss": 4.5428, "step": 42945 }, { "epoch": 0.8795281879057195, "grad_norm": 0.28213217854499817, "learning_rate": 2.9999999999999997e-05, "loss": 4.5288, "step": 42950 }, { "epoch": 0.8796305776831241, "grad_norm": 0.29143473505973816, "learning_rate": 2.9999999999999997e-05, "loss": 4.5779, "step": 42955 }, { "epoch": 0.8797329674605288, "grad_norm": 0.2868902087211609, "learning_rate": 2.9999999999999997e-05, "loss": 4.5497, "step": 42960 }, { "epoch": 0.8798353572379334, "grad_norm": 0.3348364233970642, "learning_rate": 2.9999999999999997e-05, "loss": 4.5077, "step": 42965 }, { "epoch": 0.879937747015338, "grad_norm": 0.27767911553382874, "learning_rate": 2.9999999999999997e-05, "loss": 4.4875, "step": 42970 }, { "epoch": 0.8800401367927426, "grad_norm": 0.28155142068862915, "learning_rate": 2.9999999999999997e-05, "loss": 4.5819, "step": 42975 }, { "epoch": 0.8801425265701472, "grad_norm": 0.27674272656440735, "learning_rate": 2.9999999999999997e-05, "loss": 4.5774, "step": 42980 }, { "epoch": 0.8802449163475519, "grad_norm": 0.29031631350517273, "learning_rate": 2.9999999999999997e-05, "loss": 4.5194, "step": 42985 }, { "epoch": 0.8803473061249565, "grad_norm": 0.28039461374282837, "learning_rate": 2.9999999999999997e-05, "loss": 4.6035, "step": 42990 }, { "epoch": 0.8804496959023611, "grad_norm": 0.29909491539001465, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 42995 }, { "epoch": 0.8805520856797657, "grad_norm": 0.2924463748931885, "learning_rate": 2.9999999999999997e-05, "loss": 4.5262, "step": 43000 }, { "epoch": 0.8805520856797657, "eval_loss": 4.588317394256592, "eval_runtime": 17.1268, "eval_samples_per_second": 33.982, "eval_steps_per_second": 4.262, "step": 43000 }, { "epoch": 0.8806544754571703, "grad_norm": 0.29849857091903687, "learning_rate": 2.9999999999999997e-05, "loss": 4.5815, "step": 43005 }, { "epoch": 0.880756865234575, "grad_norm": 0.30793818831443787, "learning_rate": 2.9999999999999997e-05, "loss": 4.5941, "step": 43010 }, { "epoch": 0.8808592550119796, "grad_norm": 0.28552597761154175, "learning_rate": 2.9999999999999997e-05, "loss": 4.5661, "step": 43015 }, { "epoch": 0.8809616447893842, "grad_norm": 0.2882366180419922, "learning_rate": 2.9999999999999997e-05, "loss": 4.5673, "step": 43020 }, { "epoch": 0.8810640345667888, "grad_norm": 0.295576810836792, "learning_rate": 2.9999999999999997e-05, "loss": 4.568, "step": 43025 }, { "epoch": 0.8811664243441935, "grad_norm": 0.28542160987854004, "learning_rate": 2.9999999999999997e-05, "loss": 4.5625, "step": 43030 }, { "epoch": 0.8812688141215981, "grad_norm": 0.8199988603591919, "learning_rate": 2.9999999999999997e-05, "loss": 4.6262, "step": 43035 }, { "epoch": 0.8813712038990027, "grad_norm": 0.30435624718666077, "learning_rate": 2.9999999999999997e-05, "loss": 4.5742, "step": 43040 }, { "epoch": 0.8814735936764073, "grad_norm": 0.29068076610565186, "learning_rate": 2.9999999999999997e-05, "loss": 4.5668, "step": 43045 }, { "epoch": 0.8815759834538119, "grad_norm": 0.3287653625011444, "learning_rate": 2.9999999999999997e-05, "loss": 4.5645, "step": 43050 }, { "epoch": 0.8816783732312166, "grad_norm": 0.2941044270992279, "learning_rate": 2.9999999999999997e-05, "loss": 4.5611, "step": 43055 }, { "epoch": 0.8817807630086212, "grad_norm": 0.27592143416404724, "learning_rate": 2.9999999999999997e-05, "loss": 4.5598, "step": 43060 }, { "epoch": 0.8818831527860258, "grad_norm": 0.304882675409317, "learning_rate": 2.9999999999999997e-05, "loss": 4.5648, "step": 43065 }, { "epoch": 0.8819855425634304, "grad_norm": 0.29764097929000854, "learning_rate": 2.9999999999999997e-05, "loss": 4.5748, "step": 43070 }, { "epoch": 0.882087932340835, "grad_norm": 0.28902390599250793, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 43075 }, { "epoch": 0.8821903221182397, "grad_norm": 0.27456074953079224, "learning_rate": 2.9999999999999997e-05, "loss": 4.543, "step": 43080 }, { "epoch": 0.8822927118956443, "grad_norm": 0.2862781882286072, "learning_rate": 2.9999999999999997e-05, "loss": 4.5668, "step": 43085 }, { "epoch": 0.8823951016730489, "grad_norm": 0.27993398904800415, "learning_rate": 2.9999999999999997e-05, "loss": 4.6255, "step": 43090 }, { "epoch": 0.8824974914504536, "grad_norm": 0.29803958535194397, "learning_rate": 2.9999999999999997e-05, "loss": 4.5789, "step": 43095 }, { "epoch": 0.8825998812278583, "grad_norm": 0.28480470180511475, "learning_rate": 2.9999999999999997e-05, "loss": 4.5901, "step": 43100 }, { "epoch": 0.8827022710052629, "grad_norm": 0.29084596037864685, "learning_rate": 2.9999999999999997e-05, "loss": 4.5822, "step": 43105 }, { "epoch": 0.8828046607826675, "grad_norm": 0.28622016310691833, "learning_rate": 2.9999999999999997e-05, "loss": 4.5038, "step": 43110 }, { "epoch": 0.8829070505600721, "grad_norm": 0.2887270450592041, "learning_rate": 2.9999999999999997e-05, "loss": 4.5189, "step": 43115 }, { "epoch": 0.8830094403374767, "grad_norm": 0.2953267991542816, "learning_rate": 2.9999999999999997e-05, "loss": 4.5724, "step": 43120 }, { "epoch": 0.8831118301148814, "grad_norm": 0.28226035833358765, "learning_rate": 2.9999999999999997e-05, "loss": 4.5794, "step": 43125 }, { "epoch": 0.883214219892286, "grad_norm": 0.29001089930534363, "learning_rate": 2.9999999999999997e-05, "loss": 4.7161, "step": 43130 }, { "epoch": 0.8833166096696906, "grad_norm": 0.3066718876361847, "learning_rate": 2.9999999999999997e-05, "loss": 4.5183, "step": 43135 }, { "epoch": 0.8834189994470952, "grad_norm": 0.2722647786140442, "learning_rate": 2.9999999999999997e-05, "loss": 4.5257, "step": 43140 }, { "epoch": 0.8835213892244999, "grad_norm": 0.3023661971092224, "learning_rate": 2.9999999999999997e-05, "loss": 4.5622, "step": 43145 }, { "epoch": 0.8836237790019045, "grad_norm": 0.347561776638031, "learning_rate": 2.9999999999999997e-05, "loss": 4.5337, "step": 43150 }, { "epoch": 0.8837261687793091, "grad_norm": 0.2886524200439453, "learning_rate": 2.9999999999999997e-05, "loss": 4.5653, "step": 43155 }, { "epoch": 0.8838285585567137, "grad_norm": 0.2908353805541992, "learning_rate": 2.9999999999999997e-05, "loss": 4.5652, "step": 43160 }, { "epoch": 0.8839309483341183, "grad_norm": 0.299456924200058, "learning_rate": 2.9999999999999997e-05, "loss": 4.5631, "step": 43165 }, { "epoch": 0.884033338111523, "grad_norm": 0.2918829321861267, "learning_rate": 2.9999999999999997e-05, "loss": 4.5485, "step": 43170 }, { "epoch": 0.8841357278889276, "grad_norm": 0.3255084455013275, "learning_rate": 2.9999999999999997e-05, "loss": 4.5075, "step": 43175 }, { "epoch": 0.8842381176663322, "grad_norm": 0.29269468784332275, "learning_rate": 2.9999999999999997e-05, "loss": 4.6193, "step": 43180 }, { "epoch": 0.8843405074437368, "grad_norm": 0.30426040291786194, "learning_rate": 2.9999999999999997e-05, "loss": 4.5508, "step": 43185 }, { "epoch": 0.8844428972211414, "grad_norm": 0.3047192394733429, "learning_rate": 2.9999999999999997e-05, "loss": 4.5607, "step": 43190 }, { "epoch": 0.8845452869985461, "grad_norm": 0.30153313279151917, "learning_rate": 2.9999999999999997e-05, "loss": 4.5032, "step": 43195 }, { "epoch": 0.8846476767759507, "grad_norm": 0.2783660292625427, "learning_rate": 2.9999999999999997e-05, "loss": 4.5909, "step": 43200 }, { "epoch": 0.8847500665533553, "grad_norm": 0.3032394349575043, "learning_rate": 2.9999999999999997e-05, "loss": 4.5839, "step": 43205 }, { "epoch": 0.8848524563307599, "grad_norm": 0.28079283237457275, "learning_rate": 2.9999999999999997e-05, "loss": 4.596, "step": 43210 }, { "epoch": 0.8849548461081646, "grad_norm": 0.3117208778858185, "learning_rate": 2.9999999999999997e-05, "loss": 4.5706, "step": 43215 }, { "epoch": 0.8850572358855692, "grad_norm": 0.29995861649513245, "learning_rate": 2.9999999999999997e-05, "loss": 4.5058, "step": 43220 }, { "epoch": 0.8851596256629738, "grad_norm": 0.29243379831314087, "learning_rate": 2.9999999999999997e-05, "loss": 4.5553, "step": 43225 }, { "epoch": 0.8852620154403784, "grad_norm": 0.28652480244636536, "learning_rate": 2.9999999999999997e-05, "loss": 4.5708, "step": 43230 }, { "epoch": 0.885364405217783, "grad_norm": 0.2879696488380432, "learning_rate": 2.9999999999999997e-05, "loss": 4.5917, "step": 43235 }, { "epoch": 0.8854667949951877, "grad_norm": 0.3314764201641083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5605, "step": 43240 }, { "epoch": 0.8855691847725923, "grad_norm": 0.3095373213291168, "learning_rate": 2.9999999999999997e-05, "loss": 4.5641, "step": 43245 }, { "epoch": 0.8856715745499969, "grad_norm": 0.30055418610572815, "learning_rate": 2.9999999999999997e-05, "loss": 4.5265, "step": 43250 }, { "epoch": 0.8857739643274015, "grad_norm": 0.3154790997505188, "learning_rate": 2.9999999999999997e-05, "loss": 4.6022, "step": 43255 }, { "epoch": 0.8858763541048061, "grad_norm": 0.28868356347084045, "learning_rate": 2.9999999999999997e-05, "loss": 4.5839, "step": 43260 }, { "epoch": 0.8859787438822108, "grad_norm": 0.2998608648777008, "learning_rate": 2.9999999999999997e-05, "loss": 4.5308, "step": 43265 }, { "epoch": 0.8860811336596154, "grad_norm": 0.2835061550140381, "learning_rate": 2.9999999999999997e-05, "loss": 4.5232, "step": 43270 }, { "epoch": 0.88618352343702, "grad_norm": 0.2820570468902588, "learning_rate": 2.9999999999999997e-05, "loss": 4.6076, "step": 43275 }, { "epoch": 0.8862859132144246, "grad_norm": 0.28567567467689514, "learning_rate": 2.9999999999999997e-05, "loss": 4.6502, "step": 43280 }, { "epoch": 0.8863883029918292, "grad_norm": 0.2970583438873291, "learning_rate": 2.9999999999999997e-05, "loss": 4.5814, "step": 43285 }, { "epoch": 0.8864906927692339, "grad_norm": 0.2917497456073761, "learning_rate": 2.9999999999999997e-05, "loss": 4.5495, "step": 43290 }, { "epoch": 0.8865930825466385, "grad_norm": 0.30360355973243713, "learning_rate": 2.9999999999999997e-05, "loss": 4.5851, "step": 43295 }, { "epoch": 0.8866954723240432, "grad_norm": 0.2801843583583832, "learning_rate": 2.9999999999999997e-05, "loss": 4.5801, "step": 43300 }, { "epoch": 0.8867978621014478, "grad_norm": 0.3223794102668762, "learning_rate": 2.9999999999999997e-05, "loss": 4.555, "step": 43305 }, { "epoch": 0.8869002518788525, "grad_norm": 0.3030499219894409, "learning_rate": 2.9999999999999997e-05, "loss": 4.6037, "step": 43310 }, { "epoch": 0.8870026416562571, "grad_norm": 0.2885530889034271, "learning_rate": 2.9999999999999997e-05, "loss": 4.5635, "step": 43315 }, { "epoch": 0.8871050314336617, "grad_norm": 0.3388378620147705, "learning_rate": 2.9999999999999997e-05, "loss": 4.5617, "step": 43320 }, { "epoch": 0.8872074212110663, "grad_norm": 0.28313079476356506, "learning_rate": 2.9999999999999997e-05, "loss": 4.5497, "step": 43325 }, { "epoch": 0.887309810988471, "grad_norm": 0.27532637119293213, "learning_rate": 2.9999999999999997e-05, "loss": 4.5404, "step": 43330 }, { "epoch": 0.8874122007658756, "grad_norm": 0.2831867039203644, "learning_rate": 2.9999999999999997e-05, "loss": 4.5259, "step": 43335 }, { "epoch": 0.8875145905432802, "grad_norm": 0.28463199734687805, "learning_rate": 2.9999999999999997e-05, "loss": 4.5789, "step": 43340 }, { "epoch": 0.8876169803206848, "grad_norm": 0.3134804666042328, "learning_rate": 2.9999999999999997e-05, "loss": 4.5373, "step": 43345 }, { "epoch": 0.8877193700980894, "grad_norm": 0.3044542074203491, "learning_rate": 2.9999999999999997e-05, "loss": 4.5625, "step": 43350 }, { "epoch": 0.887821759875494, "grad_norm": 0.2829276919364929, "learning_rate": 2.9999999999999997e-05, "loss": 4.5932, "step": 43355 }, { "epoch": 0.8879241496528987, "grad_norm": 0.2904761731624603, "learning_rate": 2.9999999999999997e-05, "loss": 4.5612, "step": 43360 }, { "epoch": 0.8880265394303033, "grad_norm": 0.28154832124710083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5256, "step": 43365 }, { "epoch": 0.8881289292077079, "grad_norm": 0.2937324345111847, "learning_rate": 2.9999999999999997e-05, "loss": 4.5595, "step": 43370 }, { "epoch": 0.8882313189851125, "grad_norm": 0.30040445923805237, "learning_rate": 2.9999999999999997e-05, "loss": 4.5505, "step": 43375 }, { "epoch": 0.8883337087625172, "grad_norm": 0.29687491059303284, "learning_rate": 2.9999999999999997e-05, "loss": 4.622, "step": 43380 }, { "epoch": 0.8884360985399218, "grad_norm": 0.3083922863006592, "learning_rate": 2.9999999999999997e-05, "loss": 4.5899, "step": 43385 }, { "epoch": 0.8885384883173264, "grad_norm": 0.2865968942642212, "learning_rate": 2.9999999999999997e-05, "loss": 4.6159, "step": 43390 }, { "epoch": 0.888640878094731, "grad_norm": 0.2884815037250519, "learning_rate": 2.9999999999999997e-05, "loss": 4.5902, "step": 43395 }, { "epoch": 0.8887432678721356, "grad_norm": 0.28484782576560974, "learning_rate": 2.9999999999999997e-05, "loss": 4.635, "step": 43400 }, { "epoch": 0.8888456576495403, "grad_norm": 0.2970082461833954, "learning_rate": 2.9999999999999997e-05, "loss": 4.5632, "step": 43405 }, { "epoch": 0.8889480474269449, "grad_norm": 0.2938386797904968, "learning_rate": 2.9999999999999997e-05, "loss": 4.5672, "step": 43410 }, { "epoch": 0.8890504372043495, "grad_norm": 0.28469425439834595, "learning_rate": 2.9999999999999997e-05, "loss": 4.5242, "step": 43415 }, { "epoch": 0.8891528269817541, "grad_norm": 0.364021360874176, "learning_rate": 2.9999999999999997e-05, "loss": 4.5575, "step": 43420 }, { "epoch": 0.8892552167591588, "grad_norm": 0.2875620126724243, "learning_rate": 2.9999999999999997e-05, "loss": 4.5952, "step": 43425 }, { "epoch": 0.8893576065365634, "grad_norm": 0.29636284708976746, "learning_rate": 2.9999999999999997e-05, "loss": 4.5787, "step": 43430 }, { "epoch": 0.889459996313968, "grad_norm": 0.2957055866718292, "learning_rate": 2.9999999999999997e-05, "loss": 4.6116, "step": 43435 }, { "epoch": 0.8895623860913726, "grad_norm": 0.2801075279712677, "learning_rate": 2.9999999999999997e-05, "loss": 4.5848, "step": 43440 }, { "epoch": 0.8896647758687772, "grad_norm": 0.3111990988254547, "learning_rate": 2.9999999999999997e-05, "loss": 4.6566, "step": 43445 }, { "epoch": 0.8897671656461819, "grad_norm": 0.295200914144516, "learning_rate": 2.9999999999999997e-05, "loss": 4.5115, "step": 43450 }, { "epoch": 0.8898695554235865, "grad_norm": 0.2847495675086975, "learning_rate": 2.9999999999999997e-05, "loss": 4.4961, "step": 43455 }, { "epoch": 0.8899719452009911, "grad_norm": 0.3003501892089844, "learning_rate": 2.9999999999999997e-05, "loss": 4.6224, "step": 43460 }, { "epoch": 0.8900743349783957, "grad_norm": 0.29068243503570557, "learning_rate": 2.9999999999999997e-05, "loss": 4.5787, "step": 43465 }, { "epoch": 0.8901767247558003, "grad_norm": 0.28233209252357483, "learning_rate": 2.9999999999999997e-05, "loss": 4.58, "step": 43470 }, { "epoch": 0.890279114533205, "grad_norm": 0.2820115387439728, "learning_rate": 2.9999999999999997e-05, "loss": 4.5867, "step": 43475 }, { "epoch": 0.8903815043106096, "grad_norm": 0.28251320123672485, "learning_rate": 2.9999999999999997e-05, "loss": 4.4561, "step": 43480 }, { "epoch": 0.8904838940880142, "grad_norm": 0.27587589621543884, "learning_rate": 2.9999999999999997e-05, "loss": 4.5677, "step": 43485 }, { "epoch": 0.8905862838654188, "grad_norm": 0.2913191020488739, "learning_rate": 2.9999999999999997e-05, "loss": 4.5786, "step": 43490 }, { "epoch": 0.8906886736428234, "grad_norm": 0.29503539204597473, "learning_rate": 2.9999999999999997e-05, "loss": 4.587, "step": 43495 }, { "epoch": 0.8907910634202282, "grad_norm": 0.3059171736240387, "learning_rate": 2.9999999999999997e-05, "loss": 4.5826, "step": 43500 }, { "epoch": 0.8908934531976328, "grad_norm": 0.29003840684890747, "learning_rate": 2.9999999999999997e-05, "loss": 4.6057, "step": 43505 }, { "epoch": 0.8909958429750374, "grad_norm": 0.29289379715919495, "learning_rate": 2.9999999999999997e-05, "loss": 4.5023, "step": 43510 }, { "epoch": 0.891098232752442, "grad_norm": 0.29439595341682434, "learning_rate": 2.9999999999999997e-05, "loss": 4.5855, "step": 43515 }, { "epoch": 0.8912006225298467, "grad_norm": 0.2912348508834839, "learning_rate": 2.9999999999999997e-05, "loss": 4.5522, "step": 43520 }, { "epoch": 0.8913030123072513, "grad_norm": 0.27954715490341187, "learning_rate": 2.9999999999999997e-05, "loss": 4.6042, "step": 43525 }, { "epoch": 0.8914054020846559, "grad_norm": 0.28138938546180725, "learning_rate": 2.9999999999999997e-05, "loss": 4.5829, "step": 43530 }, { "epoch": 0.8915077918620605, "grad_norm": 0.28235381841659546, "learning_rate": 2.9999999999999997e-05, "loss": 4.549, "step": 43535 }, { "epoch": 0.8916101816394651, "grad_norm": 0.8361275792121887, "learning_rate": 2.9999999999999997e-05, "loss": 4.572, "step": 43540 }, { "epoch": 0.8917125714168698, "grad_norm": 0.29305335879325867, "learning_rate": 2.9999999999999997e-05, "loss": 4.6044, "step": 43545 }, { "epoch": 0.8918149611942744, "grad_norm": 0.2782655358314514, "learning_rate": 2.9999999999999997e-05, "loss": 4.5625, "step": 43550 }, { "epoch": 0.891917350971679, "grad_norm": 0.2837987542152405, "learning_rate": 2.9999999999999997e-05, "loss": 4.5642, "step": 43555 }, { "epoch": 0.8920197407490836, "grad_norm": 0.2914084196090698, "learning_rate": 2.9999999999999997e-05, "loss": 4.5829, "step": 43560 }, { "epoch": 0.8921221305264883, "grad_norm": 0.28752848505973816, "learning_rate": 2.9999999999999997e-05, "loss": 4.5515, "step": 43565 }, { "epoch": 0.8922245203038929, "grad_norm": 0.30271926522254944, "learning_rate": 2.9999999999999997e-05, "loss": 4.5679, "step": 43570 }, { "epoch": 0.8923269100812975, "grad_norm": 0.27412354946136475, "learning_rate": 2.9999999999999997e-05, "loss": 4.5685, "step": 43575 }, { "epoch": 0.8924292998587021, "grad_norm": 0.284602552652359, "learning_rate": 2.9999999999999997e-05, "loss": 4.6077, "step": 43580 }, { "epoch": 0.8925316896361067, "grad_norm": 0.3156874179840088, "learning_rate": 2.9999999999999997e-05, "loss": 4.5639, "step": 43585 }, { "epoch": 0.8926340794135114, "grad_norm": 0.3039139211177826, "learning_rate": 2.9999999999999997e-05, "loss": 4.5724, "step": 43590 }, { "epoch": 0.892736469190916, "grad_norm": 0.2927379608154297, "learning_rate": 2.9999999999999997e-05, "loss": 4.6056, "step": 43595 }, { "epoch": 0.8928388589683206, "grad_norm": 0.2834281623363495, "learning_rate": 2.9999999999999997e-05, "loss": 4.5665, "step": 43600 }, { "epoch": 0.8929412487457252, "grad_norm": 0.2961020767688751, "learning_rate": 2.9999999999999997e-05, "loss": 4.5478, "step": 43605 }, { "epoch": 0.8930436385231298, "grad_norm": 0.28263378143310547, "learning_rate": 2.9999999999999997e-05, "loss": 4.551, "step": 43610 }, { "epoch": 0.8931460283005345, "grad_norm": 0.2814967930316925, "learning_rate": 2.9999999999999997e-05, "loss": 4.5584, "step": 43615 }, { "epoch": 0.8932484180779391, "grad_norm": 0.2874368131160736, "learning_rate": 2.9999999999999997e-05, "loss": 4.5555, "step": 43620 }, { "epoch": 0.8933508078553437, "grad_norm": 0.3158700168132782, "learning_rate": 2.9999999999999997e-05, "loss": 4.5803, "step": 43625 }, { "epoch": 0.8934531976327483, "grad_norm": 0.3642716109752655, "learning_rate": 2.9999999999999997e-05, "loss": 4.6043, "step": 43630 }, { "epoch": 0.893555587410153, "grad_norm": 0.29824936389923096, "learning_rate": 2.9999999999999997e-05, "loss": 4.5823, "step": 43635 }, { "epoch": 0.8936579771875576, "grad_norm": 0.3525303304195404, "learning_rate": 2.9999999999999997e-05, "loss": 4.5454, "step": 43640 }, { "epoch": 0.8937603669649622, "grad_norm": 0.2897406816482544, "learning_rate": 2.9999999999999997e-05, "loss": 4.5495, "step": 43645 }, { "epoch": 0.8938627567423668, "grad_norm": 0.3116193413734436, "learning_rate": 2.9999999999999997e-05, "loss": 4.5517, "step": 43650 }, { "epoch": 0.8939651465197714, "grad_norm": 0.29880356788635254, "learning_rate": 2.9999999999999997e-05, "loss": 4.5423, "step": 43655 }, { "epoch": 0.8940675362971761, "grad_norm": 0.2957082688808441, "learning_rate": 2.9999999999999997e-05, "loss": 4.555, "step": 43660 }, { "epoch": 0.8941699260745807, "grad_norm": 0.28723859786987305, "learning_rate": 2.9999999999999997e-05, "loss": 4.5485, "step": 43665 }, { "epoch": 0.8942723158519853, "grad_norm": 0.2814950942993164, "learning_rate": 2.9999999999999997e-05, "loss": 4.5718, "step": 43670 }, { "epoch": 0.8943747056293899, "grad_norm": 0.4035419225692749, "learning_rate": 2.9999999999999997e-05, "loss": 4.5666, "step": 43675 }, { "epoch": 0.8944770954067945, "grad_norm": 0.2928883731365204, "learning_rate": 2.9999999999999997e-05, "loss": 4.5292, "step": 43680 }, { "epoch": 0.8945794851841992, "grad_norm": 0.3120344281196594, "learning_rate": 2.9999999999999997e-05, "loss": 4.5156, "step": 43685 }, { "epoch": 0.8946818749616038, "grad_norm": 0.29340067505836487, "learning_rate": 2.9999999999999997e-05, "loss": 4.5921, "step": 43690 }, { "epoch": 0.8947842647390084, "grad_norm": 0.286952942609787, "learning_rate": 2.9999999999999997e-05, "loss": 4.5884, "step": 43695 }, { "epoch": 0.894886654516413, "grad_norm": 0.2862982749938965, "learning_rate": 2.9999999999999997e-05, "loss": 4.6195, "step": 43700 }, { "epoch": 0.8949890442938178, "grad_norm": 0.33828821778297424, "learning_rate": 2.9999999999999997e-05, "loss": 4.5335, "step": 43705 }, { "epoch": 0.8950914340712224, "grad_norm": 0.3000277280807495, "learning_rate": 2.9999999999999997e-05, "loss": 4.5795, "step": 43710 }, { "epoch": 0.895193823848627, "grad_norm": 0.2789219915866852, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 43715 }, { "epoch": 0.8952962136260316, "grad_norm": 0.2951725423336029, "learning_rate": 2.9999999999999997e-05, "loss": 4.5604, "step": 43720 }, { "epoch": 0.8953986034034362, "grad_norm": 0.31653720140457153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5404, "step": 43725 }, { "epoch": 0.8955009931808409, "grad_norm": 0.2837054133415222, "learning_rate": 2.9999999999999997e-05, "loss": 4.5111, "step": 43730 }, { "epoch": 0.8956033829582455, "grad_norm": 0.287752628326416, "learning_rate": 2.9999999999999997e-05, "loss": 4.5963, "step": 43735 }, { "epoch": 0.8957057727356501, "grad_norm": 0.30167561769485474, "learning_rate": 2.9999999999999997e-05, "loss": 4.5534, "step": 43740 }, { "epoch": 0.8958081625130547, "grad_norm": 0.2922893762588501, "learning_rate": 2.9999999999999997e-05, "loss": 4.5727, "step": 43745 }, { "epoch": 0.8959105522904593, "grad_norm": 0.3056415021419525, "learning_rate": 2.9999999999999997e-05, "loss": 4.6226, "step": 43750 }, { "epoch": 0.896012942067864, "grad_norm": 0.27673646807670593, "learning_rate": 2.9999999999999997e-05, "loss": 4.5942, "step": 43755 }, { "epoch": 0.8961153318452686, "grad_norm": 0.29115694761276245, "learning_rate": 2.9999999999999997e-05, "loss": 4.594, "step": 43760 }, { "epoch": 0.8962177216226732, "grad_norm": 0.2822927236557007, "learning_rate": 2.9999999999999997e-05, "loss": 4.6041, "step": 43765 }, { "epoch": 0.8963201114000778, "grad_norm": 0.28792211413383484, "learning_rate": 2.9999999999999997e-05, "loss": 4.5621, "step": 43770 }, { "epoch": 0.8964225011774825, "grad_norm": 0.27710363268852234, "learning_rate": 2.9999999999999997e-05, "loss": 4.5044, "step": 43775 }, { "epoch": 0.8965248909548871, "grad_norm": 0.2837262749671936, "learning_rate": 2.9999999999999997e-05, "loss": 4.5753, "step": 43780 }, { "epoch": 0.8966272807322917, "grad_norm": 0.28804516792297363, "learning_rate": 2.9999999999999997e-05, "loss": 4.5626, "step": 43785 }, { "epoch": 0.8967296705096963, "grad_norm": 0.3009595572948456, "learning_rate": 2.9999999999999997e-05, "loss": 4.6036, "step": 43790 }, { "epoch": 0.8968320602871009, "grad_norm": 0.29668760299682617, "learning_rate": 2.9999999999999997e-05, "loss": 4.5692, "step": 43795 }, { "epoch": 0.8969344500645056, "grad_norm": 0.28489741683006287, "learning_rate": 2.9999999999999997e-05, "loss": 4.5532, "step": 43800 }, { "epoch": 0.8970368398419102, "grad_norm": 0.2997526526451111, "learning_rate": 2.9999999999999997e-05, "loss": 4.5727, "step": 43805 }, { "epoch": 0.8971392296193148, "grad_norm": 0.3091844618320465, "learning_rate": 2.9999999999999997e-05, "loss": 4.5011, "step": 43810 }, { "epoch": 0.8972416193967194, "grad_norm": 0.36383694410324097, "learning_rate": 2.9999999999999997e-05, "loss": 4.573, "step": 43815 }, { "epoch": 0.897344009174124, "grad_norm": 0.28524136543273926, "learning_rate": 2.9999999999999997e-05, "loss": 4.5614, "step": 43820 }, { "epoch": 0.8974463989515287, "grad_norm": 0.2953522503376007, "learning_rate": 2.9999999999999997e-05, "loss": 4.6166, "step": 43825 }, { "epoch": 0.8975487887289333, "grad_norm": 0.28813883662223816, "learning_rate": 2.9999999999999997e-05, "loss": 4.589, "step": 43830 }, { "epoch": 0.8976511785063379, "grad_norm": 0.28438302874565125, "learning_rate": 2.9999999999999997e-05, "loss": 4.595, "step": 43835 }, { "epoch": 0.8977535682837425, "grad_norm": 0.2952619194984436, "learning_rate": 2.9999999999999997e-05, "loss": 4.5704, "step": 43840 }, { "epoch": 0.8978559580611472, "grad_norm": 0.31650814414024353, "learning_rate": 2.9999999999999997e-05, "loss": 4.5711, "step": 43845 }, { "epoch": 0.8979583478385518, "grad_norm": 0.29960745573043823, "learning_rate": 2.9999999999999997e-05, "loss": 4.5576, "step": 43850 }, { "epoch": 0.8980607376159564, "grad_norm": 0.2964019179344177, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 43855 }, { "epoch": 0.898163127393361, "grad_norm": 0.29202255606651306, "learning_rate": 2.9999999999999997e-05, "loss": 4.5914, "step": 43860 }, { "epoch": 0.8982655171707656, "grad_norm": 0.28947746753692627, "learning_rate": 2.9999999999999997e-05, "loss": 4.5512, "step": 43865 }, { "epoch": 0.8983679069481703, "grad_norm": 0.28201887011528015, "learning_rate": 2.9999999999999997e-05, "loss": 4.5778, "step": 43870 }, { "epoch": 0.8984702967255749, "grad_norm": 0.28833359479904175, "learning_rate": 2.9999999999999997e-05, "loss": 4.5778, "step": 43875 }, { "epoch": 0.8985726865029795, "grad_norm": 0.31362542510032654, "learning_rate": 2.9999999999999997e-05, "loss": 4.5621, "step": 43880 }, { "epoch": 0.8986750762803841, "grad_norm": 0.31593671441078186, "learning_rate": 2.9999999999999997e-05, "loss": 4.5782, "step": 43885 }, { "epoch": 0.8987774660577887, "grad_norm": 0.3144204616546631, "learning_rate": 2.9999999999999997e-05, "loss": 4.5675, "step": 43890 }, { "epoch": 0.8988798558351934, "grad_norm": 0.2929304540157318, "learning_rate": 2.9999999999999997e-05, "loss": 4.5404, "step": 43895 }, { "epoch": 0.898982245612598, "grad_norm": 0.3831968903541565, "learning_rate": 2.9999999999999997e-05, "loss": 4.5263, "step": 43900 }, { "epoch": 0.8990846353900026, "grad_norm": 0.2977750599384308, "learning_rate": 2.9999999999999997e-05, "loss": 4.5645, "step": 43905 }, { "epoch": 0.8991870251674073, "grad_norm": 0.30486249923706055, "learning_rate": 2.9999999999999997e-05, "loss": 4.5733, "step": 43910 }, { "epoch": 0.899289414944812, "grad_norm": 0.2831956446170807, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 43915 }, { "epoch": 0.8993918047222166, "grad_norm": 0.29849597811698914, "learning_rate": 2.9999999999999997e-05, "loss": 4.5467, "step": 43920 }, { "epoch": 0.8994941944996212, "grad_norm": 1.5628610849380493, "learning_rate": 2.9999999999999997e-05, "loss": 4.5232, "step": 43925 }, { "epoch": 0.8995965842770258, "grad_norm": 0.2881596088409424, "learning_rate": 2.9999999999999997e-05, "loss": 4.5868, "step": 43930 }, { "epoch": 0.8996989740544304, "grad_norm": 0.30083224177360535, "learning_rate": 2.9999999999999997e-05, "loss": 4.5806, "step": 43935 }, { "epoch": 0.8998013638318351, "grad_norm": 0.3666914999485016, "learning_rate": 2.9999999999999997e-05, "loss": 4.5509, "step": 43940 }, { "epoch": 0.8999037536092397, "grad_norm": 0.3085542619228363, "learning_rate": 2.9999999999999997e-05, "loss": 4.5729, "step": 43945 }, { "epoch": 0.9000061433866443, "grad_norm": 0.3142866790294647, "learning_rate": 2.9999999999999997e-05, "loss": 4.5119, "step": 43950 }, { "epoch": 0.9001085331640489, "grad_norm": 0.29957863688468933, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 43955 }, { "epoch": 0.9002109229414536, "grad_norm": 0.2904525399208069, "learning_rate": 2.9999999999999997e-05, "loss": 4.5366, "step": 43960 }, { "epoch": 0.9003133127188582, "grad_norm": 0.3167794644832611, "learning_rate": 2.9999999999999997e-05, "loss": 4.5689, "step": 43965 }, { "epoch": 0.9004157024962628, "grad_norm": 0.28093332052230835, "learning_rate": 2.9999999999999997e-05, "loss": 4.5652, "step": 43970 }, { "epoch": 0.9005180922736674, "grad_norm": 0.288825660943985, "learning_rate": 2.9999999999999997e-05, "loss": 4.6403, "step": 43975 }, { "epoch": 0.900620482051072, "grad_norm": 0.2897427976131439, "learning_rate": 2.9999999999999997e-05, "loss": 4.5656, "step": 43980 }, { "epoch": 0.9007228718284767, "grad_norm": 0.2949637472629547, "learning_rate": 2.9999999999999997e-05, "loss": 4.564, "step": 43985 }, { "epoch": 0.9008252616058813, "grad_norm": 0.29060909152030945, "learning_rate": 2.9999999999999997e-05, "loss": 4.5662, "step": 43990 }, { "epoch": 0.9009276513832859, "grad_norm": 0.2961573302745819, "learning_rate": 2.9999999999999997e-05, "loss": 4.5381, "step": 43995 }, { "epoch": 0.9010300411606905, "grad_norm": 0.2899945080280304, "learning_rate": 2.9999999999999997e-05, "loss": 4.5682, "step": 44000 }, { "epoch": 0.9010300411606905, "eval_loss": 4.586471080780029, "eval_runtime": 17.1248, "eval_samples_per_second": 33.986, "eval_steps_per_second": 4.263, "step": 44000 }, { "epoch": 0.9011324309380951, "grad_norm": 0.2913335859775543, "learning_rate": 2.9999999999999997e-05, "loss": 4.571, "step": 44005 }, { "epoch": 0.9012348207154998, "grad_norm": 0.29117926955223083, "learning_rate": 2.9999999999999997e-05, "loss": 4.4865, "step": 44010 }, { "epoch": 0.9013372104929044, "grad_norm": 0.2909631133079529, "learning_rate": 2.9999999999999997e-05, "loss": 4.561, "step": 44015 }, { "epoch": 0.901439600270309, "grad_norm": 0.2952113449573517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5701, "step": 44020 }, { "epoch": 0.9015419900477136, "grad_norm": 0.31003907322883606, "learning_rate": 2.9999999999999997e-05, "loss": 4.5731, "step": 44025 }, { "epoch": 0.9016443798251182, "grad_norm": 0.29012972116470337, "learning_rate": 2.9999999999999997e-05, "loss": 4.5748, "step": 44030 }, { "epoch": 0.9017467696025229, "grad_norm": 0.2859537899494171, "learning_rate": 2.9999999999999997e-05, "loss": 4.5642, "step": 44035 }, { "epoch": 0.9018491593799275, "grad_norm": 0.32267364859580994, "learning_rate": 2.9999999999999997e-05, "loss": 4.611, "step": 44040 }, { "epoch": 0.9019515491573321, "grad_norm": 0.31911522150039673, "learning_rate": 2.9999999999999997e-05, "loss": 4.5812, "step": 44045 }, { "epoch": 0.9020539389347367, "grad_norm": 0.29209789633750916, "learning_rate": 2.9999999999999997e-05, "loss": 4.6097, "step": 44050 }, { "epoch": 0.9021563287121414, "grad_norm": 0.28505492210388184, "learning_rate": 2.9999999999999997e-05, "loss": 4.6094, "step": 44055 }, { "epoch": 0.902258718489546, "grad_norm": 0.3423404097557068, "learning_rate": 2.9999999999999997e-05, "loss": 4.5438, "step": 44060 }, { "epoch": 0.9023611082669506, "grad_norm": 0.29405495524406433, "learning_rate": 2.9999999999999997e-05, "loss": 4.6121, "step": 44065 }, { "epoch": 0.9024634980443552, "grad_norm": 0.3212548792362213, "learning_rate": 2.9999999999999997e-05, "loss": 4.615, "step": 44070 }, { "epoch": 0.9025658878217598, "grad_norm": 0.2923244535923004, "learning_rate": 2.9999999999999997e-05, "loss": 4.579, "step": 44075 }, { "epoch": 0.9026682775991645, "grad_norm": 0.2975698709487915, "learning_rate": 2.9999999999999997e-05, "loss": 4.6111, "step": 44080 }, { "epoch": 0.9027706673765691, "grad_norm": 0.3004777431488037, "learning_rate": 2.9999999999999997e-05, "loss": 4.5691, "step": 44085 }, { "epoch": 0.9028730571539737, "grad_norm": 0.2993500828742981, "learning_rate": 2.9999999999999997e-05, "loss": 4.6132, "step": 44090 }, { "epoch": 0.9029754469313783, "grad_norm": 0.2951490879058838, "learning_rate": 2.9999999999999997e-05, "loss": 4.6079, "step": 44095 }, { "epoch": 0.9030778367087829, "grad_norm": 0.29161161184310913, "learning_rate": 2.9999999999999997e-05, "loss": 4.5427, "step": 44100 }, { "epoch": 0.9031802264861876, "grad_norm": 0.2915920317173004, "learning_rate": 2.9999999999999997e-05, "loss": 4.5934, "step": 44105 }, { "epoch": 0.9032826162635923, "grad_norm": 0.2810094654560089, "learning_rate": 2.9999999999999997e-05, "loss": 4.5154, "step": 44110 }, { "epoch": 0.9033850060409969, "grad_norm": 0.29250696301460266, "learning_rate": 2.9999999999999997e-05, "loss": 4.5773, "step": 44115 }, { "epoch": 0.9034873958184015, "grad_norm": 0.29555365443229675, "learning_rate": 2.9999999999999997e-05, "loss": 4.558, "step": 44120 }, { "epoch": 0.9035897855958062, "grad_norm": 0.2818824350833893, "learning_rate": 2.9999999999999997e-05, "loss": 4.5882, "step": 44125 }, { "epoch": 0.9036921753732108, "grad_norm": 0.2955513000488281, "learning_rate": 2.9999999999999997e-05, "loss": 4.5514, "step": 44130 }, { "epoch": 0.9037945651506154, "grad_norm": 0.29016879200935364, "learning_rate": 2.9999999999999997e-05, "loss": 4.5746, "step": 44135 }, { "epoch": 0.90389695492802, "grad_norm": 0.30701562762260437, "learning_rate": 2.9999999999999997e-05, "loss": 4.5606, "step": 44140 }, { "epoch": 0.9039993447054246, "grad_norm": 0.30402061343193054, "learning_rate": 2.9999999999999997e-05, "loss": 4.5813, "step": 44145 }, { "epoch": 0.9041017344828293, "grad_norm": 0.3075377643108368, "learning_rate": 2.9999999999999997e-05, "loss": 4.5684, "step": 44150 }, { "epoch": 0.9042041242602339, "grad_norm": 0.28911861777305603, "learning_rate": 2.9999999999999997e-05, "loss": 4.6001, "step": 44155 }, { "epoch": 0.9043065140376385, "grad_norm": 0.2968486547470093, "learning_rate": 2.9999999999999997e-05, "loss": 4.5897, "step": 44160 }, { "epoch": 0.9044089038150431, "grad_norm": 0.29539307951927185, "learning_rate": 2.9999999999999997e-05, "loss": 4.5797, "step": 44165 }, { "epoch": 0.9045112935924478, "grad_norm": 0.287286639213562, "learning_rate": 2.9999999999999997e-05, "loss": 4.5892, "step": 44170 }, { "epoch": 0.9046136833698524, "grad_norm": 0.296395868062973, "learning_rate": 2.9999999999999997e-05, "loss": 4.5772, "step": 44175 }, { "epoch": 0.904716073147257, "grad_norm": 0.29325658082962036, "learning_rate": 2.9999999999999997e-05, "loss": 4.5902, "step": 44180 }, { "epoch": 0.9048184629246616, "grad_norm": 0.2802959978580475, "learning_rate": 2.9999999999999997e-05, "loss": 4.5655, "step": 44185 }, { "epoch": 0.9049208527020662, "grad_norm": 0.2855745851993561, "learning_rate": 2.9999999999999997e-05, "loss": 4.576, "step": 44190 }, { "epoch": 0.9050232424794709, "grad_norm": 0.31628891825675964, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 44195 }, { "epoch": 0.9051256322568755, "grad_norm": 0.29802003502845764, "learning_rate": 2.9999999999999997e-05, "loss": 4.4967, "step": 44200 }, { "epoch": 0.9052280220342801, "grad_norm": 0.2871008813381195, "learning_rate": 2.9999999999999997e-05, "loss": 4.5601, "step": 44205 }, { "epoch": 0.9053304118116847, "grad_norm": 0.3069486916065216, "learning_rate": 2.9999999999999997e-05, "loss": 4.5297, "step": 44210 }, { "epoch": 0.9054328015890893, "grad_norm": 0.30202245712280273, "learning_rate": 2.9999999999999997e-05, "loss": 4.6304, "step": 44215 }, { "epoch": 0.905535191366494, "grad_norm": 0.2802700400352478, "learning_rate": 2.9999999999999997e-05, "loss": 4.538, "step": 44220 }, { "epoch": 0.9056375811438986, "grad_norm": 0.2830926477909088, "learning_rate": 2.9999999999999997e-05, "loss": 4.5441, "step": 44225 }, { "epoch": 0.9057399709213032, "grad_norm": 0.2824036478996277, "learning_rate": 2.9999999999999997e-05, "loss": 4.5481, "step": 44230 }, { "epoch": 0.9058423606987078, "grad_norm": 0.2798176109790802, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 44235 }, { "epoch": 0.9059447504761124, "grad_norm": 0.29679057002067566, "learning_rate": 2.9999999999999997e-05, "loss": 4.6041, "step": 44240 }, { "epoch": 0.9060471402535171, "grad_norm": 0.30267518758773804, "learning_rate": 2.9999999999999997e-05, "loss": 4.5215, "step": 44245 }, { "epoch": 0.9061495300309217, "grad_norm": 0.2759597897529602, "learning_rate": 2.9999999999999997e-05, "loss": 4.5094, "step": 44250 }, { "epoch": 0.9062519198083263, "grad_norm": 0.2725431025028229, "learning_rate": 2.9999999999999997e-05, "loss": 4.5103, "step": 44255 }, { "epoch": 0.9063543095857309, "grad_norm": 0.3026810586452484, "learning_rate": 2.9999999999999997e-05, "loss": 4.592, "step": 44260 }, { "epoch": 0.9064566993631356, "grad_norm": 0.2777859568595886, "learning_rate": 2.9999999999999997e-05, "loss": 4.5875, "step": 44265 }, { "epoch": 0.9065590891405402, "grad_norm": 0.318081796169281, "learning_rate": 2.9999999999999997e-05, "loss": 4.5309, "step": 44270 }, { "epoch": 0.9066614789179448, "grad_norm": 0.2968437671661377, "learning_rate": 2.9999999999999997e-05, "loss": 4.6267, "step": 44275 }, { "epoch": 0.9067638686953494, "grad_norm": 0.30005747079849243, "learning_rate": 2.9999999999999997e-05, "loss": 4.5652, "step": 44280 }, { "epoch": 0.906866258472754, "grad_norm": 0.2993433177471161, "learning_rate": 2.9999999999999997e-05, "loss": 4.587, "step": 44285 }, { "epoch": 0.9069686482501587, "grad_norm": 0.30756816267967224, "learning_rate": 2.9999999999999997e-05, "loss": 4.5544, "step": 44290 }, { "epoch": 0.9070710380275633, "grad_norm": 0.29073426127433777, "learning_rate": 2.9999999999999997e-05, "loss": 4.6004, "step": 44295 }, { "epoch": 0.9071734278049679, "grad_norm": 0.3133041560649872, "learning_rate": 2.9999999999999997e-05, "loss": 4.5797, "step": 44300 }, { "epoch": 0.9072758175823725, "grad_norm": 0.28762057423591614, "learning_rate": 2.9999999999999997e-05, "loss": 4.584, "step": 44305 }, { "epoch": 0.9073782073597771, "grad_norm": 0.31225842237472534, "learning_rate": 2.9999999999999997e-05, "loss": 4.5958, "step": 44310 }, { "epoch": 0.9074805971371819, "grad_norm": 0.29675835371017456, "learning_rate": 2.9999999999999997e-05, "loss": 4.587, "step": 44315 }, { "epoch": 0.9075829869145865, "grad_norm": 0.2963467836380005, "learning_rate": 2.9999999999999997e-05, "loss": 4.578, "step": 44320 }, { "epoch": 0.9076853766919911, "grad_norm": 0.28992506861686707, "learning_rate": 2.9999999999999997e-05, "loss": 4.5554, "step": 44325 }, { "epoch": 0.9077877664693957, "grad_norm": 0.3521689474582672, "learning_rate": 2.9999999999999997e-05, "loss": 4.5715, "step": 44330 }, { "epoch": 0.9078901562468004, "grad_norm": 0.29133668541908264, "learning_rate": 2.9999999999999997e-05, "loss": 4.5617, "step": 44335 }, { "epoch": 0.907992546024205, "grad_norm": 0.2901957631111145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5878, "step": 44340 }, { "epoch": 0.9080949358016096, "grad_norm": 0.28737345337867737, "learning_rate": 2.9999999999999997e-05, "loss": 4.5934, "step": 44345 }, { "epoch": 0.9081973255790142, "grad_norm": 0.30114200711250305, "learning_rate": 2.9999999999999997e-05, "loss": 4.5603, "step": 44350 }, { "epoch": 0.9082997153564188, "grad_norm": 0.28318148851394653, "learning_rate": 2.9999999999999997e-05, "loss": 4.6067, "step": 44355 }, { "epoch": 0.9084021051338235, "grad_norm": 0.29417601227760315, "learning_rate": 2.9999999999999997e-05, "loss": 4.569, "step": 44360 }, { "epoch": 0.9085044949112281, "grad_norm": 0.3162309229373932, "learning_rate": 2.9999999999999997e-05, "loss": 4.5249, "step": 44365 }, { "epoch": 0.9086068846886327, "grad_norm": 0.29203394055366516, "learning_rate": 2.9999999999999997e-05, "loss": 4.5939, "step": 44370 }, { "epoch": 0.9087092744660373, "grad_norm": 0.2932101786136627, "learning_rate": 2.9999999999999997e-05, "loss": 4.5592, "step": 44375 }, { "epoch": 0.908811664243442, "grad_norm": 0.30883273482322693, "learning_rate": 2.9999999999999997e-05, "loss": 4.5787, "step": 44380 }, { "epoch": 0.9089140540208466, "grad_norm": 0.29501113295555115, "learning_rate": 2.9999999999999997e-05, "loss": 4.5564, "step": 44385 }, { "epoch": 0.9090164437982512, "grad_norm": 0.2937309145927429, "learning_rate": 2.9999999999999997e-05, "loss": 4.5598, "step": 44390 }, { "epoch": 0.9091188335756558, "grad_norm": 0.2975621223449707, "learning_rate": 2.9999999999999997e-05, "loss": 4.5995, "step": 44395 }, { "epoch": 0.9092212233530604, "grad_norm": 0.30689218640327454, "learning_rate": 2.9999999999999997e-05, "loss": 4.5718, "step": 44400 }, { "epoch": 0.9093236131304651, "grad_norm": 0.2986644208431244, "learning_rate": 2.9999999999999997e-05, "loss": 4.5933, "step": 44405 }, { "epoch": 0.9094260029078697, "grad_norm": 0.3601891100406647, "learning_rate": 2.9999999999999997e-05, "loss": 4.5599, "step": 44410 }, { "epoch": 0.9095283926852743, "grad_norm": 0.2950226068496704, "learning_rate": 2.9999999999999997e-05, "loss": 4.526, "step": 44415 }, { "epoch": 0.9096307824626789, "grad_norm": 0.2891264259815216, "learning_rate": 2.9999999999999997e-05, "loss": 4.5576, "step": 44420 }, { "epoch": 0.9097331722400835, "grad_norm": 0.2955225706100464, "learning_rate": 2.9999999999999997e-05, "loss": 4.5698, "step": 44425 }, { "epoch": 0.9098355620174882, "grad_norm": 0.29458922147750854, "learning_rate": 2.9999999999999997e-05, "loss": 4.5549, "step": 44430 }, { "epoch": 0.9099379517948928, "grad_norm": 0.29744821786880493, "learning_rate": 2.9999999999999997e-05, "loss": 4.5999, "step": 44435 }, { "epoch": 0.9100403415722974, "grad_norm": 0.2866936922073364, "learning_rate": 2.9999999999999997e-05, "loss": 4.2777, "step": 44440 }, { "epoch": 0.910142731349702, "grad_norm": 0.28462445735931396, "learning_rate": 2.9999999999999997e-05, "loss": 4.5301, "step": 44445 }, { "epoch": 0.9102451211271066, "grad_norm": 0.30194762349128723, "learning_rate": 2.9999999999999997e-05, "loss": 4.5634, "step": 44450 }, { "epoch": 0.9103475109045113, "grad_norm": 0.32271724939346313, "learning_rate": 2.9999999999999997e-05, "loss": 4.5497, "step": 44455 }, { "epoch": 0.9104499006819159, "grad_norm": 0.2883274555206299, "learning_rate": 2.9999999999999997e-05, "loss": 4.5857, "step": 44460 }, { "epoch": 0.9105522904593205, "grad_norm": 0.29806938767433167, "learning_rate": 2.9999999999999997e-05, "loss": 4.5851, "step": 44465 }, { "epoch": 0.9106546802367251, "grad_norm": 0.29009348154067993, "learning_rate": 2.9999999999999997e-05, "loss": 4.5907, "step": 44470 }, { "epoch": 0.9107570700141298, "grad_norm": 0.28038156032562256, "learning_rate": 2.9999999999999997e-05, "loss": 4.5614, "step": 44475 }, { "epoch": 0.9108594597915344, "grad_norm": 0.29314082860946655, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 44480 }, { "epoch": 0.910961849568939, "grad_norm": 0.3111684024333954, "learning_rate": 2.9999999999999997e-05, "loss": 4.6112, "step": 44485 }, { "epoch": 0.9110642393463436, "grad_norm": 0.2904896140098572, "learning_rate": 2.9999999999999997e-05, "loss": 4.576, "step": 44490 }, { "epoch": 0.9111666291237482, "grad_norm": 0.30856579542160034, "learning_rate": 2.9999999999999997e-05, "loss": 4.4689, "step": 44495 }, { "epoch": 0.9112690189011529, "grad_norm": 0.28661075234413147, "learning_rate": 2.9999999999999997e-05, "loss": 4.5718, "step": 44500 }, { "epoch": 0.9113714086785575, "grad_norm": 0.3105640709400177, "learning_rate": 2.9999999999999997e-05, "loss": 4.5617, "step": 44505 }, { "epoch": 0.9114737984559621, "grad_norm": 0.28419485688209534, "learning_rate": 2.9999999999999997e-05, "loss": 4.5509, "step": 44510 }, { "epoch": 0.9115761882333668, "grad_norm": 0.2916661202907562, "learning_rate": 2.9999999999999997e-05, "loss": 4.6011, "step": 44515 }, { "epoch": 0.9116785780107715, "grad_norm": 0.2916611135005951, "learning_rate": 2.9999999999999997e-05, "loss": 4.5583, "step": 44520 }, { "epoch": 0.9117809677881761, "grad_norm": 0.3471600115299225, "learning_rate": 2.9999999999999997e-05, "loss": 4.636, "step": 44525 }, { "epoch": 0.9118833575655807, "grad_norm": 0.29736122488975525, "learning_rate": 2.9999999999999997e-05, "loss": 4.5856, "step": 44530 }, { "epoch": 0.9119857473429853, "grad_norm": 0.3281717896461487, "learning_rate": 2.9999999999999997e-05, "loss": 4.5828, "step": 44535 }, { "epoch": 0.9120881371203899, "grad_norm": 0.2958275079727173, "learning_rate": 2.9999999999999997e-05, "loss": 4.5746, "step": 44540 }, { "epoch": 0.9121905268977946, "grad_norm": 0.29329514503479004, "learning_rate": 2.9999999999999997e-05, "loss": 4.5412, "step": 44545 }, { "epoch": 0.9122929166751992, "grad_norm": 0.2900955080986023, "learning_rate": 2.9999999999999997e-05, "loss": 4.5821, "step": 44550 }, { "epoch": 0.9123953064526038, "grad_norm": 0.4106867015361786, "learning_rate": 2.9999999999999997e-05, "loss": 4.6349, "step": 44555 }, { "epoch": 0.9124976962300084, "grad_norm": 0.3019821047782898, "learning_rate": 2.9999999999999997e-05, "loss": 4.5892, "step": 44560 }, { "epoch": 0.912600086007413, "grad_norm": 0.2903803586959839, "learning_rate": 2.9999999999999997e-05, "loss": 4.5456, "step": 44565 }, { "epoch": 0.9127024757848177, "grad_norm": 0.28803229331970215, "learning_rate": 2.9999999999999997e-05, "loss": 4.5925, "step": 44570 }, { "epoch": 0.9128048655622223, "grad_norm": 0.27864280343055725, "learning_rate": 2.9999999999999997e-05, "loss": 4.584, "step": 44575 }, { "epoch": 0.9129072553396269, "grad_norm": 0.28240007162094116, "learning_rate": 2.9999999999999997e-05, "loss": 4.5872, "step": 44580 }, { "epoch": 0.9130096451170315, "grad_norm": 0.28655725717544556, "learning_rate": 2.9999999999999997e-05, "loss": 4.5561, "step": 44585 }, { "epoch": 0.9131120348944362, "grad_norm": 0.3613794147968292, "learning_rate": 2.9999999999999997e-05, "loss": 4.5624, "step": 44590 }, { "epoch": 0.9132144246718408, "grad_norm": 0.27940067648887634, "learning_rate": 2.9999999999999997e-05, "loss": 4.6067, "step": 44595 }, { "epoch": 0.9133168144492454, "grad_norm": 0.291115403175354, "learning_rate": 2.9999999999999997e-05, "loss": 4.605, "step": 44600 }, { "epoch": 0.91341920422665, "grad_norm": 0.30663806200027466, "learning_rate": 2.9999999999999997e-05, "loss": 4.6237, "step": 44605 }, { "epoch": 0.9135215940040546, "grad_norm": 0.292237251996994, "learning_rate": 2.9999999999999997e-05, "loss": 4.6276, "step": 44610 }, { "epoch": 0.9136239837814593, "grad_norm": 0.3186366558074951, "learning_rate": 2.9999999999999997e-05, "loss": 4.5705, "step": 44615 }, { "epoch": 0.9137263735588639, "grad_norm": 0.2985791265964508, "learning_rate": 2.9999999999999997e-05, "loss": 4.5121, "step": 44620 }, { "epoch": 0.9138287633362685, "grad_norm": 0.3115338981151581, "learning_rate": 2.9999999999999997e-05, "loss": 4.5789, "step": 44625 }, { "epoch": 0.9139311531136731, "grad_norm": 0.2941564917564392, "learning_rate": 2.9999999999999997e-05, "loss": 4.5967, "step": 44630 }, { "epoch": 0.9140335428910777, "grad_norm": 0.29198572039604187, "learning_rate": 2.9999999999999997e-05, "loss": 4.5936, "step": 44635 }, { "epoch": 0.9141359326684824, "grad_norm": 0.28676462173461914, "learning_rate": 2.9999999999999997e-05, "loss": 4.5669, "step": 44640 }, { "epoch": 0.914238322445887, "grad_norm": 0.3021875321865082, "learning_rate": 2.9999999999999997e-05, "loss": 4.5505, "step": 44645 }, { "epoch": 0.9143407122232916, "grad_norm": 0.3352340757846832, "learning_rate": 2.9999999999999997e-05, "loss": 4.581, "step": 44650 }, { "epoch": 0.9144431020006962, "grad_norm": 0.28663456439971924, "learning_rate": 2.9999999999999997e-05, "loss": 4.5722, "step": 44655 }, { "epoch": 0.9145454917781008, "grad_norm": 0.29421597719192505, "learning_rate": 2.9999999999999997e-05, "loss": 4.5843, "step": 44660 }, { "epoch": 0.9146478815555055, "grad_norm": 0.2880072593688965, "learning_rate": 2.9999999999999997e-05, "loss": 4.5498, "step": 44665 }, { "epoch": 0.9147502713329101, "grad_norm": 0.28296324610710144, "learning_rate": 2.9999999999999997e-05, "loss": 4.5583, "step": 44670 }, { "epoch": 0.9148526611103147, "grad_norm": 0.2951556444168091, "learning_rate": 2.9999999999999997e-05, "loss": 4.5616, "step": 44675 }, { "epoch": 0.9149550508877193, "grad_norm": 0.27845102548599243, "learning_rate": 2.9999999999999997e-05, "loss": 4.5802, "step": 44680 }, { "epoch": 0.915057440665124, "grad_norm": 0.30597835779190063, "learning_rate": 2.9999999999999997e-05, "loss": 4.5567, "step": 44685 }, { "epoch": 0.9151598304425286, "grad_norm": 0.3044925034046173, "learning_rate": 2.9999999999999997e-05, "loss": 4.5787, "step": 44690 }, { "epoch": 0.9152622202199332, "grad_norm": 0.2863486111164093, "learning_rate": 2.9999999999999997e-05, "loss": 4.5216, "step": 44695 }, { "epoch": 0.9153646099973378, "grad_norm": 0.29713574051856995, "learning_rate": 2.9999999999999997e-05, "loss": 4.5699, "step": 44700 }, { "epoch": 0.9154669997747424, "grad_norm": 0.3571939766407013, "learning_rate": 2.9999999999999997e-05, "loss": 4.5255, "step": 44705 }, { "epoch": 0.9155693895521471, "grad_norm": 0.2816871702671051, "learning_rate": 2.9999999999999997e-05, "loss": 4.6053, "step": 44710 }, { "epoch": 0.9156717793295517, "grad_norm": 0.2798614203929901, "learning_rate": 2.9999999999999997e-05, "loss": 4.4954, "step": 44715 }, { "epoch": 0.9157741691069564, "grad_norm": 0.302320271730423, "learning_rate": 2.9999999999999997e-05, "loss": 4.5727, "step": 44720 }, { "epoch": 0.915876558884361, "grad_norm": 0.2995151877403259, "learning_rate": 2.9999999999999997e-05, "loss": 4.5294, "step": 44725 }, { "epoch": 0.9159789486617657, "grad_norm": 0.28221645951271057, "learning_rate": 2.9999999999999997e-05, "loss": 4.5591, "step": 44730 }, { "epoch": 0.9160813384391703, "grad_norm": 0.29791709780693054, "learning_rate": 2.9999999999999997e-05, "loss": 4.5143, "step": 44735 }, { "epoch": 0.9161837282165749, "grad_norm": 0.29077425599098206, "learning_rate": 2.9999999999999997e-05, "loss": 4.5638, "step": 44740 }, { "epoch": 0.9162861179939795, "grad_norm": 0.28820061683654785, "learning_rate": 2.9999999999999997e-05, "loss": 4.5873, "step": 44745 }, { "epoch": 0.9163885077713841, "grad_norm": 0.2995607256889343, "learning_rate": 2.9999999999999997e-05, "loss": 4.5914, "step": 44750 }, { "epoch": 0.9164908975487888, "grad_norm": 0.29834872484207153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5233, "step": 44755 }, { "epoch": 0.9165932873261934, "grad_norm": 0.28764039278030396, "learning_rate": 2.9999999999999997e-05, "loss": 4.5648, "step": 44760 }, { "epoch": 0.916695677103598, "grad_norm": 0.32084712386131287, "learning_rate": 2.9999999999999997e-05, "loss": 4.5085, "step": 44765 }, { "epoch": 0.9167980668810026, "grad_norm": 0.3009834289550781, "learning_rate": 2.9999999999999997e-05, "loss": 4.5689, "step": 44770 }, { "epoch": 0.9169004566584072, "grad_norm": 0.2824249863624573, "learning_rate": 2.9999999999999997e-05, "loss": 4.504, "step": 44775 }, { "epoch": 0.9170028464358119, "grad_norm": 0.2883903980255127, "learning_rate": 2.9999999999999997e-05, "loss": 4.5494, "step": 44780 }, { "epoch": 0.9171052362132165, "grad_norm": 0.3404322862625122, "learning_rate": 2.9999999999999997e-05, "loss": 4.5699, "step": 44785 }, { "epoch": 0.9172076259906211, "grad_norm": 0.31625035405158997, "learning_rate": 2.9999999999999997e-05, "loss": 4.5463, "step": 44790 }, { "epoch": 0.9173100157680257, "grad_norm": 0.2832892835140228, "learning_rate": 2.9999999999999997e-05, "loss": 4.5663, "step": 44795 }, { "epoch": 0.9174124055454304, "grad_norm": 0.38594958186149597, "learning_rate": 2.9999999999999997e-05, "loss": 4.5514, "step": 44800 }, { "epoch": 0.917514795322835, "grad_norm": 0.29314351081848145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5338, "step": 44805 }, { "epoch": 0.9176171851002396, "grad_norm": 0.28442466259002686, "learning_rate": 2.9999999999999997e-05, "loss": 4.5978, "step": 44810 }, { "epoch": 0.9177195748776442, "grad_norm": 0.3025929033756256, "learning_rate": 2.9999999999999997e-05, "loss": 4.5993, "step": 44815 }, { "epoch": 0.9178219646550488, "grad_norm": 0.29419243335723877, "learning_rate": 2.9999999999999997e-05, "loss": 4.5366, "step": 44820 }, { "epoch": 0.9179243544324535, "grad_norm": 0.2948008179664612, "learning_rate": 2.9999999999999997e-05, "loss": 4.6115, "step": 44825 }, { "epoch": 0.9180267442098581, "grad_norm": 0.2893778085708618, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 44830 }, { "epoch": 0.9181291339872627, "grad_norm": 0.3166704475879669, "learning_rate": 2.9999999999999997e-05, "loss": 4.5779, "step": 44835 }, { "epoch": 0.9182315237646673, "grad_norm": 0.2853911519050598, "learning_rate": 2.9999999999999997e-05, "loss": 4.6075, "step": 44840 }, { "epoch": 0.918333913542072, "grad_norm": 0.3583363890647888, "learning_rate": 2.9999999999999997e-05, "loss": 4.5763, "step": 44845 }, { "epoch": 0.9184363033194766, "grad_norm": 0.2975505292415619, "learning_rate": 2.9999999999999997e-05, "loss": 4.5851, "step": 44850 }, { "epoch": 0.9185386930968812, "grad_norm": 0.2748933434486389, "learning_rate": 2.9999999999999997e-05, "loss": 4.551, "step": 44855 }, { "epoch": 0.9186410828742858, "grad_norm": 0.29540279507637024, "learning_rate": 2.9999999999999997e-05, "loss": 4.5765, "step": 44860 }, { "epoch": 0.9187434726516904, "grad_norm": 0.28963378071784973, "learning_rate": 2.9999999999999997e-05, "loss": 4.5538, "step": 44865 }, { "epoch": 0.918845862429095, "grad_norm": 0.329802006483078, "learning_rate": 2.9999999999999997e-05, "loss": 4.5686, "step": 44870 }, { "epoch": 0.9189482522064997, "grad_norm": 0.29359158873558044, "learning_rate": 2.9999999999999997e-05, "loss": 4.5927, "step": 44875 }, { "epoch": 0.9190506419839043, "grad_norm": 0.28925612568855286, "learning_rate": 2.9999999999999997e-05, "loss": 4.5471, "step": 44880 }, { "epoch": 0.9191530317613089, "grad_norm": 0.2977502644062042, "learning_rate": 2.9999999999999997e-05, "loss": 4.5916, "step": 44885 }, { "epoch": 0.9192554215387135, "grad_norm": 0.3059592843055725, "learning_rate": 2.9999999999999997e-05, "loss": 4.5924, "step": 44890 }, { "epoch": 0.9193578113161182, "grad_norm": 0.31695204973220825, "learning_rate": 2.9999999999999997e-05, "loss": 4.5645, "step": 44895 }, { "epoch": 0.9194602010935228, "grad_norm": 0.3907400369644165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5567, "step": 44900 }, { "epoch": 0.9195625908709274, "grad_norm": 0.2951764762401581, "learning_rate": 2.9999999999999997e-05, "loss": 4.6061, "step": 44905 }, { "epoch": 0.919664980648332, "grad_norm": 0.29211193323135376, "learning_rate": 2.9999999999999997e-05, "loss": 4.6376, "step": 44910 }, { "epoch": 0.9197673704257366, "grad_norm": 0.30369940400123596, "learning_rate": 2.9999999999999997e-05, "loss": 4.567, "step": 44915 }, { "epoch": 0.9198697602031414, "grad_norm": 0.292265385389328, "learning_rate": 2.9999999999999997e-05, "loss": 4.5784, "step": 44920 }, { "epoch": 0.919972149980546, "grad_norm": 0.44335368275642395, "learning_rate": 2.9999999999999997e-05, "loss": 4.5753, "step": 44925 }, { "epoch": 0.9200745397579506, "grad_norm": 0.3001779615879059, "learning_rate": 2.9999999999999997e-05, "loss": 4.5473, "step": 44930 }, { "epoch": 0.9201769295353552, "grad_norm": 0.2842656970024109, "learning_rate": 2.9999999999999997e-05, "loss": 4.5376, "step": 44935 }, { "epoch": 0.9202793193127599, "grad_norm": 0.29240307211875916, "learning_rate": 2.9999999999999997e-05, "loss": 4.5703, "step": 44940 }, { "epoch": 0.9203817090901645, "grad_norm": 0.28944623470306396, "learning_rate": 2.9999999999999997e-05, "loss": 4.567, "step": 44945 }, { "epoch": 0.9204840988675691, "grad_norm": 0.2853572964668274, "learning_rate": 2.9999999999999997e-05, "loss": 4.6032, "step": 44950 }, { "epoch": 0.9205864886449737, "grad_norm": 0.2876571714878082, "learning_rate": 2.9999999999999997e-05, "loss": 4.5727, "step": 44955 }, { "epoch": 0.9206888784223783, "grad_norm": 0.2950356900691986, "learning_rate": 2.9999999999999997e-05, "loss": 4.5612, "step": 44960 }, { "epoch": 0.920791268199783, "grad_norm": 0.3180888891220093, "learning_rate": 2.9999999999999997e-05, "loss": 4.5039, "step": 44965 }, { "epoch": 0.9208936579771876, "grad_norm": 0.2985819876194, "learning_rate": 2.9999999999999997e-05, "loss": 4.5599, "step": 44970 }, { "epoch": 0.9209960477545922, "grad_norm": 0.3413555324077606, "learning_rate": 2.9999999999999997e-05, "loss": 4.5591, "step": 44975 }, { "epoch": 0.9210984375319968, "grad_norm": 0.6849969029426575, "learning_rate": 2.9999999999999997e-05, "loss": 4.5641, "step": 44980 }, { "epoch": 0.9212008273094014, "grad_norm": 0.3084063231945038, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 44985 }, { "epoch": 0.9213032170868061, "grad_norm": 0.3066835105419159, "learning_rate": 2.9999999999999997e-05, "loss": 4.5276, "step": 44990 }, { "epoch": 0.9214056068642107, "grad_norm": 0.3001596927642822, "learning_rate": 2.9999999999999997e-05, "loss": 4.5656, "step": 44995 }, { "epoch": 0.9215079966416153, "grad_norm": 0.2873099148273468, "learning_rate": 2.9999999999999997e-05, "loss": 4.5462, "step": 45000 }, { "epoch": 0.9215079966416153, "eval_loss": 4.58529806137085, "eval_runtime": 17.1325, "eval_samples_per_second": 33.971, "eval_steps_per_second": 4.261, "step": 45000 }, { "epoch": 0.9216103864190199, "grad_norm": 0.2973378896713257, "learning_rate": 2.9999999999999997e-05, "loss": 4.5837, "step": 45005 }, { "epoch": 0.9217127761964246, "grad_norm": 0.2963358759880066, "learning_rate": 2.9999999999999997e-05, "loss": 4.5435, "step": 45010 }, { "epoch": 0.9218151659738292, "grad_norm": 0.32997384667396545, "learning_rate": 2.9999999999999997e-05, "loss": 4.5147, "step": 45015 }, { "epoch": 0.9219175557512338, "grad_norm": 0.2832203507423401, "learning_rate": 2.9999999999999997e-05, "loss": 4.5975, "step": 45020 }, { "epoch": 0.9220199455286384, "grad_norm": 0.3515530228614807, "learning_rate": 2.9999999999999997e-05, "loss": 4.643, "step": 45025 }, { "epoch": 0.922122335306043, "grad_norm": 0.3126188814640045, "learning_rate": 2.9999999999999997e-05, "loss": 4.6006, "step": 45030 }, { "epoch": 0.9222247250834477, "grad_norm": 0.3004818260669708, "learning_rate": 2.9999999999999997e-05, "loss": 4.6157, "step": 45035 }, { "epoch": 0.9223271148608523, "grad_norm": 0.2943071126937866, "learning_rate": 2.9999999999999997e-05, "loss": 4.5681, "step": 45040 }, { "epoch": 0.9224295046382569, "grad_norm": 0.2926907241344452, "learning_rate": 2.9999999999999997e-05, "loss": 4.5578, "step": 45045 }, { "epoch": 0.9225318944156615, "grad_norm": 0.2933685779571533, "learning_rate": 2.9999999999999997e-05, "loss": 4.5802, "step": 45050 }, { "epoch": 0.9226342841930661, "grad_norm": 0.29561647772789, "learning_rate": 2.9999999999999997e-05, "loss": 4.5653, "step": 45055 }, { "epoch": 0.9227366739704708, "grad_norm": 0.28530168533325195, "learning_rate": 2.9999999999999997e-05, "loss": 4.501, "step": 45060 }, { "epoch": 0.9228390637478754, "grad_norm": 0.30260658264160156, "learning_rate": 2.9999999999999997e-05, "loss": 4.5781, "step": 45065 }, { "epoch": 0.92294145352528, "grad_norm": 0.29369866847991943, "learning_rate": 2.9999999999999997e-05, "loss": 4.5609, "step": 45070 }, { "epoch": 0.9230438433026846, "grad_norm": 0.2968003451824188, "learning_rate": 2.9999999999999997e-05, "loss": 4.5517, "step": 45075 }, { "epoch": 0.9231462330800893, "grad_norm": 0.2818485498428345, "learning_rate": 2.9999999999999997e-05, "loss": 4.5691, "step": 45080 }, { "epoch": 0.9232486228574939, "grad_norm": 0.29828980565071106, "learning_rate": 2.9999999999999997e-05, "loss": 4.5619, "step": 45085 }, { "epoch": 0.9233510126348985, "grad_norm": 0.289543092250824, "learning_rate": 2.9999999999999997e-05, "loss": 4.589, "step": 45090 }, { "epoch": 0.9234534024123031, "grad_norm": 0.290693461894989, "learning_rate": 2.9999999999999997e-05, "loss": 4.5699, "step": 45095 }, { "epoch": 0.9235557921897077, "grad_norm": 0.3078167736530304, "learning_rate": 2.9999999999999997e-05, "loss": 4.6009, "step": 45100 }, { "epoch": 0.9236581819671124, "grad_norm": 0.28769147396087646, "learning_rate": 2.9999999999999997e-05, "loss": 4.5726, "step": 45105 }, { "epoch": 0.923760571744517, "grad_norm": 0.2848482131958008, "learning_rate": 2.9999999999999997e-05, "loss": 4.5611, "step": 45110 }, { "epoch": 0.9238629615219216, "grad_norm": 0.3099733293056488, "learning_rate": 2.9999999999999997e-05, "loss": 4.5544, "step": 45115 }, { "epoch": 0.9239653512993262, "grad_norm": 0.4073297083377838, "learning_rate": 2.9999999999999997e-05, "loss": 4.6083, "step": 45120 }, { "epoch": 0.924067741076731, "grad_norm": 0.3098052144050598, "learning_rate": 2.9999999999999997e-05, "loss": 4.5774, "step": 45125 }, { "epoch": 0.9241701308541356, "grad_norm": 0.29929396510124207, "learning_rate": 2.9999999999999997e-05, "loss": 4.5612, "step": 45130 }, { "epoch": 0.9242725206315402, "grad_norm": 0.3185664713382721, "learning_rate": 2.9999999999999997e-05, "loss": 4.6396, "step": 45135 }, { "epoch": 0.9243749104089448, "grad_norm": 0.29102253913879395, "learning_rate": 2.9999999999999997e-05, "loss": 4.5724, "step": 45140 }, { "epoch": 0.9244773001863494, "grad_norm": 0.2837943732738495, "learning_rate": 2.9999999999999997e-05, "loss": 4.5406, "step": 45145 }, { "epoch": 0.9245796899637541, "grad_norm": 0.2911560833454132, "learning_rate": 2.9999999999999997e-05, "loss": 4.6176, "step": 45150 }, { "epoch": 0.9246820797411587, "grad_norm": 0.2981114089488983, "learning_rate": 2.9999999999999997e-05, "loss": 4.5812, "step": 45155 }, { "epoch": 0.9247844695185633, "grad_norm": 0.30821493268013, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 45160 }, { "epoch": 0.9248868592959679, "grad_norm": 0.28796255588531494, "learning_rate": 2.9999999999999997e-05, "loss": 4.5772, "step": 45165 }, { "epoch": 0.9249892490733725, "grad_norm": 0.28619998693466187, "learning_rate": 2.9999999999999997e-05, "loss": 4.6008, "step": 45170 }, { "epoch": 0.9250916388507772, "grad_norm": 0.3595195412635803, "learning_rate": 2.9999999999999997e-05, "loss": 4.5806, "step": 45175 }, { "epoch": 0.9251940286281818, "grad_norm": 0.29030317068099976, "learning_rate": 2.9999999999999997e-05, "loss": 4.5276, "step": 45180 }, { "epoch": 0.9252964184055864, "grad_norm": 0.3279653489589691, "learning_rate": 2.9999999999999997e-05, "loss": 4.5719, "step": 45185 }, { "epoch": 0.925398808182991, "grad_norm": 0.2800334393978119, "learning_rate": 2.9999999999999997e-05, "loss": 4.5404, "step": 45190 }, { "epoch": 0.9255011979603956, "grad_norm": 0.2963065505027771, "learning_rate": 2.9999999999999997e-05, "loss": 4.5914, "step": 45195 }, { "epoch": 0.9256035877378003, "grad_norm": 0.29062843322753906, "learning_rate": 2.9999999999999997e-05, "loss": 4.5451, "step": 45200 }, { "epoch": 0.9257059775152049, "grad_norm": 0.2946456968784332, "learning_rate": 2.9999999999999997e-05, "loss": 4.5691, "step": 45205 }, { "epoch": 0.9258083672926095, "grad_norm": 0.30529794096946716, "learning_rate": 2.9999999999999997e-05, "loss": 4.5734, "step": 45210 }, { "epoch": 0.9259107570700141, "grad_norm": 0.287738174200058, "learning_rate": 2.9999999999999997e-05, "loss": 4.553, "step": 45215 }, { "epoch": 0.9260131468474188, "grad_norm": 0.29744094610214233, "learning_rate": 2.9999999999999997e-05, "loss": 4.5858, "step": 45220 }, { "epoch": 0.9261155366248234, "grad_norm": 0.3122273087501526, "learning_rate": 2.9999999999999997e-05, "loss": 4.5747, "step": 45225 }, { "epoch": 0.926217926402228, "grad_norm": 0.2894415259361267, "learning_rate": 2.9999999999999997e-05, "loss": 4.6024, "step": 45230 }, { "epoch": 0.9263203161796326, "grad_norm": 0.29068198800086975, "learning_rate": 2.9999999999999997e-05, "loss": 4.5874, "step": 45235 }, { "epoch": 0.9264227059570372, "grad_norm": 0.29694217443466187, "learning_rate": 2.9999999999999997e-05, "loss": 4.5712, "step": 45240 }, { "epoch": 0.9265250957344419, "grad_norm": 0.28193652629852295, "learning_rate": 2.9999999999999997e-05, "loss": 4.5896, "step": 45245 }, { "epoch": 0.9266274855118465, "grad_norm": 0.29648712277412415, "learning_rate": 2.9999999999999997e-05, "loss": 4.6129, "step": 45250 }, { "epoch": 0.9267298752892511, "grad_norm": 0.2974899709224701, "learning_rate": 2.9999999999999997e-05, "loss": 4.5979, "step": 45255 }, { "epoch": 0.9268322650666557, "grad_norm": 0.3110671937465668, "learning_rate": 2.9999999999999997e-05, "loss": 4.5784, "step": 45260 }, { "epoch": 0.9269346548440603, "grad_norm": 0.3000732362270355, "learning_rate": 2.9999999999999997e-05, "loss": 4.5646, "step": 45265 }, { "epoch": 0.927037044621465, "grad_norm": 0.28198617696762085, "learning_rate": 2.9999999999999997e-05, "loss": 4.5945, "step": 45270 }, { "epoch": 0.9271394343988696, "grad_norm": 0.28437891602516174, "learning_rate": 2.9999999999999997e-05, "loss": 4.574, "step": 45275 }, { "epoch": 0.9272418241762742, "grad_norm": 0.2919110953807831, "learning_rate": 2.9999999999999997e-05, "loss": 4.5663, "step": 45280 }, { "epoch": 0.9273442139536788, "grad_norm": 0.2821648418903351, "learning_rate": 2.9999999999999997e-05, "loss": 4.6059, "step": 45285 }, { "epoch": 0.9274466037310835, "grad_norm": 0.27970513701438904, "learning_rate": 2.9999999999999997e-05, "loss": 4.5912, "step": 45290 }, { "epoch": 0.9275489935084881, "grad_norm": 0.28932279348373413, "learning_rate": 2.9999999999999997e-05, "loss": 4.5852, "step": 45295 }, { "epoch": 0.9276513832858927, "grad_norm": 0.28567251563072205, "learning_rate": 2.9999999999999997e-05, "loss": 4.5788, "step": 45300 }, { "epoch": 0.9277537730632973, "grad_norm": 0.297882080078125, "learning_rate": 2.9999999999999997e-05, "loss": 4.5693, "step": 45305 }, { "epoch": 0.9278561628407019, "grad_norm": 0.2826113700866699, "learning_rate": 2.9999999999999997e-05, "loss": 4.5739, "step": 45310 }, { "epoch": 0.9279585526181066, "grad_norm": 0.2921213209629059, "learning_rate": 2.9999999999999997e-05, "loss": 4.5849, "step": 45315 }, { "epoch": 0.9280609423955112, "grad_norm": 0.29327523708343506, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 45320 }, { "epoch": 0.9281633321729159, "grad_norm": 0.3010656535625458, "learning_rate": 2.9999999999999997e-05, "loss": 4.5771, "step": 45325 }, { "epoch": 0.9282657219503205, "grad_norm": 0.28777647018432617, "learning_rate": 2.9999999999999997e-05, "loss": 4.5675, "step": 45330 }, { "epoch": 0.9283681117277252, "grad_norm": 0.29613229632377625, "learning_rate": 2.9999999999999997e-05, "loss": 4.5744, "step": 45335 }, { "epoch": 0.9284705015051298, "grad_norm": 0.3020172715187073, "learning_rate": 2.9999999999999997e-05, "loss": 4.5726, "step": 45340 }, { "epoch": 0.9285728912825344, "grad_norm": 0.31469112634658813, "learning_rate": 2.9999999999999997e-05, "loss": 4.5326, "step": 45345 }, { "epoch": 0.928675281059939, "grad_norm": 0.29810044169425964, "learning_rate": 2.9999999999999997e-05, "loss": 4.568, "step": 45350 }, { "epoch": 0.9287776708373436, "grad_norm": 0.28736141324043274, "learning_rate": 2.9999999999999997e-05, "loss": 4.5611, "step": 45355 }, { "epoch": 0.9288800606147483, "grad_norm": 0.3133838176727295, "learning_rate": 2.9999999999999997e-05, "loss": 4.5245, "step": 45360 }, { "epoch": 0.9289824503921529, "grad_norm": 0.3475625514984131, "learning_rate": 2.9999999999999997e-05, "loss": 4.5655, "step": 45365 }, { "epoch": 0.9290848401695575, "grad_norm": 0.30536213517189026, "learning_rate": 2.9999999999999997e-05, "loss": 4.5825, "step": 45370 }, { "epoch": 0.9291872299469621, "grad_norm": 0.29482272267341614, "learning_rate": 2.9999999999999997e-05, "loss": 4.6484, "step": 45375 }, { "epoch": 0.9292896197243667, "grad_norm": 0.3003271818161011, "learning_rate": 2.9999999999999997e-05, "loss": 4.6236, "step": 45380 }, { "epoch": 0.9293920095017714, "grad_norm": 0.31987905502319336, "learning_rate": 2.9999999999999997e-05, "loss": 4.5396, "step": 45385 }, { "epoch": 0.929494399279176, "grad_norm": 0.3040323555469513, "learning_rate": 2.9999999999999997e-05, "loss": 4.5648, "step": 45390 }, { "epoch": 0.9295967890565806, "grad_norm": 0.2893180847167969, "learning_rate": 2.9999999999999997e-05, "loss": 4.5472, "step": 45395 }, { "epoch": 0.9296991788339852, "grad_norm": 0.29543089866638184, "learning_rate": 2.9999999999999997e-05, "loss": 4.536, "step": 45400 }, { "epoch": 0.9298015686113899, "grad_norm": 0.307453453540802, "learning_rate": 2.9999999999999997e-05, "loss": 4.5594, "step": 45405 }, { "epoch": 0.9299039583887945, "grad_norm": 0.3002399206161499, "learning_rate": 2.9999999999999997e-05, "loss": 4.5713, "step": 45410 }, { "epoch": 0.9300063481661991, "grad_norm": 0.29182058572769165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5828, "step": 45415 }, { "epoch": 0.9301087379436037, "grad_norm": 0.3115497827529907, "learning_rate": 2.9999999999999997e-05, "loss": 4.5522, "step": 45420 }, { "epoch": 0.9302111277210083, "grad_norm": 0.303641676902771, "learning_rate": 2.9999999999999997e-05, "loss": 4.5316, "step": 45425 }, { "epoch": 0.930313517498413, "grad_norm": 0.29586124420166016, "learning_rate": 2.9999999999999997e-05, "loss": 4.5839, "step": 45430 }, { "epoch": 0.9304159072758176, "grad_norm": 0.29437705874443054, "learning_rate": 2.9999999999999997e-05, "loss": 4.6212, "step": 45435 }, { "epoch": 0.9305182970532222, "grad_norm": 0.29417547583580017, "learning_rate": 2.9999999999999997e-05, "loss": 4.5583, "step": 45440 }, { "epoch": 0.9306206868306268, "grad_norm": 0.30192485451698303, "learning_rate": 2.9999999999999997e-05, "loss": 4.5494, "step": 45445 }, { "epoch": 0.9307230766080314, "grad_norm": 0.3112773001194, "learning_rate": 2.9999999999999997e-05, "loss": 4.5531, "step": 45450 }, { "epoch": 0.9308254663854361, "grad_norm": 0.2834312915802002, "learning_rate": 2.9999999999999997e-05, "loss": 4.5793, "step": 45455 }, { "epoch": 0.9309278561628407, "grad_norm": 0.2876500189304352, "learning_rate": 2.9999999999999997e-05, "loss": 4.5761, "step": 45460 }, { "epoch": 0.9310302459402453, "grad_norm": 0.2978384494781494, "learning_rate": 2.9999999999999997e-05, "loss": 4.5379, "step": 45465 }, { "epoch": 0.9311326357176499, "grad_norm": 0.28771841526031494, "learning_rate": 2.9999999999999997e-05, "loss": 4.5884, "step": 45470 }, { "epoch": 0.9312350254950545, "grad_norm": 0.2919498383998871, "learning_rate": 2.9999999999999997e-05, "loss": 4.5736, "step": 45475 }, { "epoch": 0.9313374152724592, "grad_norm": 0.2844051122665405, "learning_rate": 2.9999999999999997e-05, "loss": 4.5499, "step": 45480 }, { "epoch": 0.9314398050498638, "grad_norm": 0.2960943281650543, "learning_rate": 2.9999999999999997e-05, "loss": 4.5758, "step": 45485 }, { "epoch": 0.9315421948272684, "grad_norm": 0.2949587106704712, "learning_rate": 2.9999999999999997e-05, "loss": 4.5149, "step": 45490 }, { "epoch": 0.931644584604673, "grad_norm": 0.30094870924949646, "learning_rate": 2.9999999999999997e-05, "loss": 4.6182, "step": 45495 }, { "epoch": 0.9317469743820777, "grad_norm": 0.31304439902305603, "learning_rate": 2.9999999999999997e-05, "loss": 4.5844, "step": 45500 }, { "epoch": 0.9318493641594823, "grad_norm": 0.29755863547325134, "learning_rate": 2.9999999999999997e-05, "loss": 4.572, "step": 45505 }, { "epoch": 0.9319517539368869, "grad_norm": 0.3069724142551422, "learning_rate": 2.9999999999999997e-05, "loss": 4.5895, "step": 45510 }, { "epoch": 0.9320541437142915, "grad_norm": 0.2978571653366089, "learning_rate": 2.9999999999999997e-05, "loss": 4.5374, "step": 45515 }, { "epoch": 0.9321565334916961, "grad_norm": 0.2921341359615326, "learning_rate": 2.9999999999999997e-05, "loss": 4.4502, "step": 45520 }, { "epoch": 0.9322589232691008, "grad_norm": 0.28570982813835144, "learning_rate": 2.9999999999999997e-05, "loss": 4.497, "step": 45525 }, { "epoch": 0.9323613130465055, "grad_norm": 0.2988256812095642, "learning_rate": 2.9999999999999997e-05, "loss": 4.6021, "step": 45530 }, { "epoch": 0.9324637028239101, "grad_norm": 0.28905680775642395, "learning_rate": 2.9999999999999997e-05, "loss": 4.6271, "step": 45535 }, { "epoch": 0.9325660926013147, "grad_norm": 0.29460835456848145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5773, "step": 45540 }, { "epoch": 0.9326684823787194, "grad_norm": 0.2926810383796692, "learning_rate": 2.9999999999999997e-05, "loss": 4.5584, "step": 45545 }, { "epoch": 0.932770872156124, "grad_norm": 0.3007657527923584, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 45550 }, { "epoch": 0.9328732619335286, "grad_norm": 0.28152361512184143, "learning_rate": 2.9999999999999997e-05, "loss": 4.5201, "step": 45555 }, { "epoch": 0.9329756517109332, "grad_norm": 0.2795524299144745, "learning_rate": 2.9999999999999997e-05, "loss": 4.5287, "step": 45560 }, { "epoch": 0.9330780414883378, "grad_norm": 0.3183031678199768, "learning_rate": 2.9999999999999997e-05, "loss": 4.6047, "step": 45565 }, { "epoch": 0.9331804312657425, "grad_norm": 0.3116456866264343, "learning_rate": 2.9999999999999997e-05, "loss": 4.5388, "step": 45570 }, { "epoch": 0.9332828210431471, "grad_norm": 0.3290521800518036, "learning_rate": 2.9999999999999997e-05, "loss": 4.557, "step": 45575 }, { "epoch": 0.9333852108205517, "grad_norm": 0.30255621671676636, "learning_rate": 2.9999999999999997e-05, "loss": 4.6539, "step": 45580 }, { "epoch": 0.9334876005979563, "grad_norm": 0.2903241515159607, "learning_rate": 2.9999999999999997e-05, "loss": 4.5398, "step": 45585 }, { "epoch": 0.933589990375361, "grad_norm": 0.3150928318500519, "learning_rate": 2.9999999999999997e-05, "loss": 4.577, "step": 45590 }, { "epoch": 0.9336923801527656, "grad_norm": 0.3061113655567169, "learning_rate": 2.9999999999999997e-05, "loss": 4.5662, "step": 45595 }, { "epoch": 0.9337947699301702, "grad_norm": 0.2998338043689728, "learning_rate": 2.9999999999999997e-05, "loss": 4.527, "step": 45600 }, { "epoch": 0.9338971597075748, "grad_norm": 0.3258943259716034, "learning_rate": 2.9999999999999997e-05, "loss": 4.6084, "step": 45605 }, { "epoch": 0.9339995494849794, "grad_norm": 0.2946352958679199, "learning_rate": 2.9999999999999997e-05, "loss": 4.5985, "step": 45610 }, { "epoch": 0.934101939262384, "grad_norm": 0.2906305193901062, "learning_rate": 2.9999999999999997e-05, "loss": 4.5695, "step": 45615 }, { "epoch": 0.9342043290397887, "grad_norm": 0.309285968542099, "learning_rate": 2.9999999999999997e-05, "loss": 4.5823, "step": 45620 }, { "epoch": 0.9343067188171933, "grad_norm": 0.2940753102302551, "learning_rate": 2.9999999999999997e-05, "loss": 4.5745, "step": 45625 }, { "epoch": 0.9344091085945979, "grad_norm": 0.29665306210517883, "learning_rate": 2.9999999999999997e-05, "loss": 4.5608, "step": 45630 }, { "epoch": 0.9345114983720025, "grad_norm": 0.28580552339553833, "learning_rate": 2.9999999999999997e-05, "loss": 4.5819, "step": 45635 }, { "epoch": 0.9346138881494072, "grad_norm": 0.30722177028656006, "learning_rate": 2.9999999999999997e-05, "loss": 4.5603, "step": 45640 }, { "epoch": 0.9347162779268118, "grad_norm": 0.2918175160884857, "learning_rate": 2.9999999999999997e-05, "loss": 4.6299, "step": 45645 }, { "epoch": 0.9348186677042164, "grad_norm": 0.2978382110595703, "learning_rate": 2.9999999999999997e-05, "loss": 4.5489, "step": 45650 }, { "epoch": 0.934921057481621, "grad_norm": 0.36765149235725403, "learning_rate": 2.9999999999999997e-05, "loss": 4.6238, "step": 45655 }, { "epoch": 0.9350234472590256, "grad_norm": 0.29678916931152344, "learning_rate": 2.9999999999999997e-05, "loss": 4.5749, "step": 45660 }, { "epoch": 0.9351258370364303, "grad_norm": 0.3510517179965973, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 45665 }, { "epoch": 0.9352282268138349, "grad_norm": 0.3230513334274292, "learning_rate": 2.9999999999999997e-05, "loss": 4.554, "step": 45670 }, { "epoch": 0.9353306165912395, "grad_norm": 0.3000127375125885, "learning_rate": 2.9999999999999997e-05, "loss": 4.573, "step": 45675 }, { "epoch": 0.9354330063686441, "grad_norm": 0.30339938402175903, "learning_rate": 2.9999999999999997e-05, "loss": 4.5225, "step": 45680 }, { "epoch": 0.9355353961460487, "grad_norm": 0.32276660203933716, "learning_rate": 2.9999999999999997e-05, "loss": 4.5064, "step": 45685 }, { "epoch": 0.9356377859234534, "grad_norm": 0.3067030906677246, "learning_rate": 2.9999999999999997e-05, "loss": 4.5368, "step": 45690 }, { "epoch": 0.935740175700858, "grad_norm": 0.2950015962123871, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 45695 }, { "epoch": 0.9358425654782626, "grad_norm": 0.2833322286605835, "learning_rate": 2.9999999999999997e-05, "loss": 4.6006, "step": 45700 }, { "epoch": 0.9359449552556672, "grad_norm": 0.3211897909641266, "learning_rate": 2.9999999999999997e-05, "loss": 4.528, "step": 45705 }, { "epoch": 0.9360473450330719, "grad_norm": 0.3024519383907318, "learning_rate": 2.9999999999999997e-05, "loss": 4.5693, "step": 45710 }, { "epoch": 0.9361497348104765, "grad_norm": 0.2962220311164856, "learning_rate": 2.9999999999999997e-05, "loss": 4.5626, "step": 45715 }, { "epoch": 0.9362521245878811, "grad_norm": 0.29894790053367615, "learning_rate": 2.9999999999999997e-05, "loss": 4.5809, "step": 45720 }, { "epoch": 0.9363545143652857, "grad_norm": 0.2957574725151062, "learning_rate": 2.9999999999999997e-05, "loss": 4.6018, "step": 45725 }, { "epoch": 0.9364569041426903, "grad_norm": 0.30089783668518066, "learning_rate": 2.9999999999999997e-05, "loss": 4.5585, "step": 45730 }, { "epoch": 0.9365592939200951, "grad_norm": 0.2974870204925537, "learning_rate": 2.9999999999999997e-05, "loss": 4.5591, "step": 45735 }, { "epoch": 0.9366616836974997, "grad_norm": 0.29053524136543274, "learning_rate": 2.9999999999999997e-05, "loss": 4.4857, "step": 45740 }, { "epoch": 0.9367640734749043, "grad_norm": 0.29384881258010864, "learning_rate": 2.9999999999999997e-05, "loss": 4.5834, "step": 45745 }, { "epoch": 0.9368664632523089, "grad_norm": 0.29201626777648926, "learning_rate": 2.9999999999999997e-05, "loss": 4.5219, "step": 45750 }, { "epoch": 0.9369688530297136, "grad_norm": 0.2854151129722595, "learning_rate": 2.9999999999999997e-05, "loss": 4.561, "step": 45755 }, { "epoch": 0.9370712428071182, "grad_norm": 0.2902468740940094, "learning_rate": 2.9999999999999997e-05, "loss": 4.5425, "step": 45760 }, { "epoch": 0.9371736325845228, "grad_norm": 0.2796638011932373, "learning_rate": 2.9999999999999997e-05, "loss": 4.5831, "step": 45765 }, { "epoch": 0.9372760223619274, "grad_norm": 0.2874743938446045, "learning_rate": 2.9999999999999997e-05, "loss": 4.5176, "step": 45770 }, { "epoch": 0.937378412139332, "grad_norm": 0.2998960018157959, "learning_rate": 2.9999999999999997e-05, "loss": 4.5575, "step": 45775 }, { "epoch": 0.9374808019167367, "grad_norm": 0.30951982736587524, "learning_rate": 2.9999999999999997e-05, "loss": 4.597, "step": 45780 }, { "epoch": 0.9375831916941413, "grad_norm": 0.2959824502468109, "learning_rate": 2.9999999999999997e-05, "loss": 4.5971, "step": 45785 }, { "epoch": 0.9376855814715459, "grad_norm": 0.3059476613998413, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 45790 }, { "epoch": 0.9377879712489505, "grad_norm": 0.30178895592689514, "learning_rate": 2.9999999999999997e-05, "loss": 4.5713, "step": 45795 }, { "epoch": 0.9378903610263551, "grad_norm": 0.285638689994812, "learning_rate": 2.9999999999999997e-05, "loss": 4.6093, "step": 45800 }, { "epoch": 0.9379927508037598, "grad_norm": 0.3198152184486389, "learning_rate": 2.9999999999999997e-05, "loss": 4.5846, "step": 45805 }, { "epoch": 0.9380951405811644, "grad_norm": 0.30595332384109497, "learning_rate": 2.9999999999999997e-05, "loss": 4.5842, "step": 45810 }, { "epoch": 0.938197530358569, "grad_norm": 0.293392151594162, "learning_rate": 2.9999999999999997e-05, "loss": 4.5139, "step": 45815 }, { "epoch": 0.9382999201359736, "grad_norm": 0.433502733707428, "learning_rate": 2.9999999999999997e-05, "loss": 4.6002, "step": 45820 }, { "epoch": 0.9384023099133783, "grad_norm": 0.2840341031551361, "learning_rate": 2.9999999999999997e-05, "loss": 4.5529, "step": 45825 }, { "epoch": 0.9385046996907829, "grad_norm": 0.295367032289505, "learning_rate": 2.9999999999999997e-05, "loss": 4.5641, "step": 45830 }, { "epoch": 0.9386070894681875, "grad_norm": 0.28799545764923096, "learning_rate": 2.9999999999999997e-05, "loss": 4.5524, "step": 45835 }, { "epoch": 0.9387094792455921, "grad_norm": 0.2966258227825165, "learning_rate": 2.9999999999999997e-05, "loss": 4.6065, "step": 45840 }, { "epoch": 0.9388118690229967, "grad_norm": 0.29658636450767517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5539, "step": 45845 }, { "epoch": 0.9389142588004014, "grad_norm": 0.30021369457244873, "learning_rate": 2.9999999999999997e-05, "loss": 4.5022, "step": 45850 }, { "epoch": 0.939016648577806, "grad_norm": 0.38482242822647095, "learning_rate": 2.9999999999999997e-05, "loss": 4.5447, "step": 45855 }, { "epoch": 0.9391190383552106, "grad_norm": 0.2887100577354431, "learning_rate": 2.9999999999999997e-05, "loss": 4.5726, "step": 45860 }, { "epoch": 0.9392214281326152, "grad_norm": 0.29952242970466614, "learning_rate": 2.9999999999999997e-05, "loss": 4.532, "step": 45865 }, { "epoch": 0.9393238179100198, "grad_norm": 0.3206510543823242, "learning_rate": 2.9999999999999997e-05, "loss": 4.557, "step": 45870 }, { "epoch": 0.9394262076874245, "grad_norm": 0.3024630546569824, "learning_rate": 2.9999999999999997e-05, "loss": 4.5641, "step": 45875 }, { "epoch": 0.9395285974648291, "grad_norm": 0.3287646770477295, "learning_rate": 2.9999999999999997e-05, "loss": 4.6189, "step": 45880 }, { "epoch": 0.9396309872422337, "grad_norm": 0.30085039138793945, "learning_rate": 2.9999999999999997e-05, "loss": 4.5405, "step": 45885 }, { "epoch": 0.9397333770196383, "grad_norm": 0.3252756595611572, "learning_rate": 2.9999999999999997e-05, "loss": 4.5482, "step": 45890 }, { "epoch": 0.939835766797043, "grad_norm": 0.29676443338394165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5869, "step": 45895 }, { "epoch": 0.9399381565744476, "grad_norm": 0.3045036196708679, "learning_rate": 2.9999999999999997e-05, "loss": 4.5434, "step": 45900 }, { "epoch": 0.9400405463518522, "grad_norm": 0.28315460681915283, "learning_rate": 2.9999999999999997e-05, "loss": 4.5877, "step": 45905 }, { "epoch": 0.9401429361292568, "grad_norm": 0.3238038420677185, "learning_rate": 2.9999999999999997e-05, "loss": 4.638, "step": 45910 }, { "epoch": 0.9402453259066614, "grad_norm": 0.32335028052330017, "learning_rate": 2.9999999999999997e-05, "loss": 4.5703, "step": 45915 }, { "epoch": 0.940347715684066, "grad_norm": 0.28923776745796204, "learning_rate": 2.9999999999999997e-05, "loss": 4.5442, "step": 45920 }, { "epoch": 0.9404501054614707, "grad_norm": 0.2948213517665863, "learning_rate": 2.9999999999999997e-05, "loss": 4.5963, "step": 45925 }, { "epoch": 0.9405524952388753, "grad_norm": 0.30786654353141785, "learning_rate": 2.9999999999999997e-05, "loss": 4.5651, "step": 45930 }, { "epoch": 0.94065488501628, "grad_norm": 0.2849527597427368, "learning_rate": 2.9999999999999997e-05, "loss": 4.5857, "step": 45935 }, { "epoch": 0.9407572747936847, "grad_norm": 0.3040143549442291, "learning_rate": 2.9999999999999997e-05, "loss": 4.5946, "step": 45940 }, { "epoch": 0.9408596645710893, "grad_norm": 0.3182886242866516, "learning_rate": 2.9999999999999997e-05, "loss": 4.596, "step": 45945 }, { "epoch": 0.9409620543484939, "grad_norm": 0.288325697183609, "learning_rate": 2.9999999999999997e-05, "loss": 4.5533, "step": 45950 }, { "epoch": 0.9410644441258985, "grad_norm": 0.3175342082977295, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 45955 }, { "epoch": 0.9411668339033031, "grad_norm": 0.28953540325164795, "learning_rate": 2.9999999999999997e-05, "loss": 4.5448, "step": 45960 }, { "epoch": 0.9412692236807078, "grad_norm": 0.30405816435813904, "learning_rate": 2.9999999999999997e-05, "loss": 4.58, "step": 45965 }, { "epoch": 0.9413716134581124, "grad_norm": 0.2976589500904083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5448, "step": 45970 }, { "epoch": 0.941474003235517, "grad_norm": 0.3012731075286865, "learning_rate": 2.9999999999999997e-05, "loss": 4.5902, "step": 45975 }, { "epoch": 0.9415763930129216, "grad_norm": 0.29623860120773315, "learning_rate": 2.9999999999999997e-05, "loss": 4.5609, "step": 45980 }, { "epoch": 0.9416787827903262, "grad_norm": 0.31744128465652466, "learning_rate": 2.9999999999999997e-05, "loss": 4.5766, "step": 45985 }, { "epoch": 0.9417811725677309, "grad_norm": 0.30372464656829834, "learning_rate": 2.9999999999999997e-05, "loss": 4.4996, "step": 45990 }, { "epoch": 0.9418835623451355, "grad_norm": 0.29425516724586487, "learning_rate": 2.9999999999999997e-05, "loss": 4.5925, "step": 45995 }, { "epoch": 0.9419859521225401, "grad_norm": 0.3015255928039551, "learning_rate": 2.9999999999999997e-05, "loss": 4.614, "step": 46000 }, { "epoch": 0.9419859521225401, "eval_loss": 4.583684921264648, "eval_runtime": 17.1326, "eval_samples_per_second": 33.97, "eval_steps_per_second": 4.261, "step": 46000 }, { "epoch": 0.9420883418999447, "grad_norm": 0.3247185945510864, "learning_rate": 2.9999999999999997e-05, "loss": 4.5689, "step": 46005 }, { "epoch": 0.9421907316773493, "grad_norm": 0.31341496109962463, "learning_rate": 2.9999999999999997e-05, "loss": 4.5098, "step": 46010 }, { "epoch": 0.942293121454754, "grad_norm": 0.3268587589263916, "learning_rate": 2.9999999999999997e-05, "loss": 4.6441, "step": 46015 }, { "epoch": 0.9423955112321586, "grad_norm": 0.30855366587638855, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 46020 }, { "epoch": 0.9424979010095632, "grad_norm": 0.2909298837184906, "learning_rate": 2.9999999999999997e-05, "loss": 4.5305, "step": 46025 }, { "epoch": 0.9426002907869678, "grad_norm": 0.34957823157310486, "learning_rate": 2.9999999999999997e-05, "loss": 4.5302, "step": 46030 }, { "epoch": 0.9427026805643725, "grad_norm": 0.3359202742576599, "learning_rate": 2.9999999999999997e-05, "loss": 4.5863, "step": 46035 }, { "epoch": 0.9428050703417771, "grad_norm": 0.290401428937912, "learning_rate": 2.9999999999999997e-05, "loss": 4.5969, "step": 46040 }, { "epoch": 0.9429074601191817, "grad_norm": 0.3260645270347595, "learning_rate": 2.9999999999999997e-05, "loss": 4.5317, "step": 46045 }, { "epoch": 0.9430098498965863, "grad_norm": 0.3104172646999359, "learning_rate": 2.9999999999999997e-05, "loss": 4.5733, "step": 46050 }, { "epoch": 0.9431122396739909, "grad_norm": 0.29455071687698364, "learning_rate": 2.9999999999999997e-05, "loss": 4.5827, "step": 46055 }, { "epoch": 0.9432146294513956, "grad_norm": 0.3253496289253235, "learning_rate": 2.9999999999999997e-05, "loss": 4.5354, "step": 46060 }, { "epoch": 0.9433170192288002, "grad_norm": 0.2910245656967163, "learning_rate": 2.9999999999999997e-05, "loss": 4.5558, "step": 46065 }, { "epoch": 0.9434194090062048, "grad_norm": 0.28741756081581116, "learning_rate": 2.9999999999999997e-05, "loss": 4.5549, "step": 46070 }, { "epoch": 0.9435217987836094, "grad_norm": 0.323881059885025, "learning_rate": 2.9999999999999997e-05, "loss": 4.5806, "step": 46075 }, { "epoch": 0.943624188561014, "grad_norm": 0.3421439528465271, "learning_rate": 2.9999999999999997e-05, "loss": 4.59, "step": 46080 }, { "epoch": 0.9437265783384187, "grad_norm": 0.2928997576236725, "learning_rate": 2.9999999999999997e-05, "loss": 4.575, "step": 46085 }, { "epoch": 0.9438289681158233, "grad_norm": 0.29498398303985596, "learning_rate": 2.9999999999999997e-05, "loss": 4.6085, "step": 46090 }, { "epoch": 0.9439313578932279, "grad_norm": 0.2876294255256653, "learning_rate": 2.9999999999999997e-05, "loss": 4.5466, "step": 46095 }, { "epoch": 0.9440337476706325, "grad_norm": 0.28106021881103516, "learning_rate": 2.9999999999999997e-05, "loss": 4.5525, "step": 46100 }, { "epoch": 0.9441361374480371, "grad_norm": 0.2980225384235382, "learning_rate": 2.9999999999999997e-05, "loss": 4.5594, "step": 46105 }, { "epoch": 0.9442385272254418, "grad_norm": 0.29388806223869324, "learning_rate": 2.9999999999999997e-05, "loss": 4.4186, "step": 46110 }, { "epoch": 0.9443409170028464, "grad_norm": 0.3009973168373108, "learning_rate": 2.9999999999999997e-05, "loss": 4.5734, "step": 46115 }, { "epoch": 0.944443306780251, "grad_norm": 0.2896288335323334, "learning_rate": 2.9999999999999997e-05, "loss": 4.5197, "step": 46120 }, { "epoch": 0.9445456965576556, "grad_norm": 0.2991890609264374, "learning_rate": 2.9999999999999997e-05, "loss": 4.4904, "step": 46125 }, { "epoch": 0.9446480863350603, "grad_norm": 0.28481045365333557, "learning_rate": 2.9999999999999997e-05, "loss": 4.5626, "step": 46130 }, { "epoch": 0.9447504761124649, "grad_norm": 0.30722033977508545, "learning_rate": 2.9999999999999997e-05, "loss": 4.5853, "step": 46135 }, { "epoch": 0.9448528658898696, "grad_norm": 0.2884630262851715, "learning_rate": 2.9999999999999997e-05, "loss": 4.5902, "step": 46140 }, { "epoch": 0.9449552556672742, "grad_norm": 0.2827579975128174, "learning_rate": 2.9999999999999997e-05, "loss": 4.5697, "step": 46145 }, { "epoch": 0.9450576454446789, "grad_norm": 0.30568844079971313, "learning_rate": 2.9999999999999997e-05, "loss": 4.5744, "step": 46150 }, { "epoch": 0.9451600352220835, "grad_norm": 0.3058178126811981, "learning_rate": 2.9999999999999997e-05, "loss": 4.5761, "step": 46155 }, { "epoch": 0.9452624249994881, "grad_norm": 0.30323293805122375, "learning_rate": 2.9999999999999997e-05, "loss": 4.5446, "step": 46160 }, { "epoch": 0.9453648147768927, "grad_norm": 0.297662615776062, "learning_rate": 2.9999999999999997e-05, "loss": 4.5679, "step": 46165 }, { "epoch": 0.9454672045542973, "grad_norm": 0.2870453894138336, "learning_rate": 2.9999999999999997e-05, "loss": 4.555, "step": 46170 }, { "epoch": 0.945569594331702, "grad_norm": 0.28932371735572815, "learning_rate": 2.9999999999999997e-05, "loss": 4.5938, "step": 46175 }, { "epoch": 0.9456719841091066, "grad_norm": 0.30089452862739563, "learning_rate": 2.9999999999999997e-05, "loss": 4.6057, "step": 46180 }, { "epoch": 0.9457743738865112, "grad_norm": 0.32060471177101135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5608, "step": 46185 }, { "epoch": 0.9458767636639158, "grad_norm": 0.28784120082855225, "learning_rate": 2.9999999999999997e-05, "loss": 4.5696, "step": 46190 }, { "epoch": 0.9459791534413204, "grad_norm": 0.2971588671207428, "learning_rate": 2.9999999999999997e-05, "loss": 4.6289, "step": 46195 }, { "epoch": 0.9460815432187251, "grad_norm": 0.3049401044845581, "learning_rate": 2.9999999999999997e-05, "loss": 4.6121, "step": 46200 }, { "epoch": 0.9461839329961297, "grad_norm": 0.3207658529281616, "learning_rate": 2.9999999999999997e-05, "loss": 4.5777, "step": 46205 }, { "epoch": 0.9462863227735343, "grad_norm": 0.2914637327194214, "learning_rate": 2.9999999999999997e-05, "loss": 4.5864, "step": 46210 }, { "epoch": 0.9463887125509389, "grad_norm": 0.2939217686653137, "learning_rate": 2.9999999999999997e-05, "loss": 4.5182, "step": 46215 }, { "epoch": 0.9464911023283435, "grad_norm": 0.2861398458480835, "learning_rate": 2.9999999999999997e-05, "loss": 4.5119, "step": 46220 }, { "epoch": 0.9465934921057482, "grad_norm": 0.30429330468177795, "learning_rate": 2.9999999999999997e-05, "loss": 4.5959, "step": 46225 }, { "epoch": 0.9466958818831528, "grad_norm": 0.35886383056640625, "learning_rate": 2.9999999999999997e-05, "loss": 4.5833, "step": 46230 }, { "epoch": 0.9467982716605574, "grad_norm": 0.291823148727417, "learning_rate": 2.9999999999999997e-05, "loss": 4.515, "step": 46235 }, { "epoch": 0.946900661437962, "grad_norm": 0.2943173944950104, "learning_rate": 2.9999999999999997e-05, "loss": 4.5744, "step": 46240 }, { "epoch": 0.9470030512153667, "grad_norm": 0.4758606553077698, "learning_rate": 2.9999999999999997e-05, "loss": 4.547, "step": 46245 }, { "epoch": 0.9471054409927713, "grad_norm": 0.372763067483902, "learning_rate": 2.9999999999999997e-05, "loss": 4.4849, "step": 46250 }, { "epoch": 0.9472078307701759, "grad_norm": 0.28862863779067993, "learning_rate": 2.9999999999999997e-05, "loss": 4.4833, "step": 46255 }, { "epoch": 0.9473102205475805, "grad_norm": 0.2902795970439911, "learning_rate": 2.9999999999999997e-05, "loss": 4.5262, "step": 46260 }, { "epoch": 0.9474126103249851, "grad_norm": 0.2977970540523529, "learning_rate": 2.9999999999999997e-05, "loss": 4.5321, "step": 46265 }, { "epoch": 0.9475150001023898, "grad_norm": 0.4046977162361145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5846, "step": 46270 }, { "epoch": 0.9476173898797944, "grad_norm": 0.29380640387535095, "learning_rate": 2.9999999999999997e-05, "loss": 4.5819, "step": 46275 }, { "epoch": 0.947719779657199, "grad_norm": 0.3022974729537964, "learning_rate": 2.9999999999999997e-05, "loss": 4.5197, "step": 46280 }, { "epoch": 0.9478221694346036, "grad_norm": 0.29053810238838196, "learning_rate": 2.9999999999999997e-05, "loss": 4.5742, "step": 46285 }, { "epoch": 0.9479245592120082, "grad_norm": 0.27897557616233826, "learning_rate": 2.9999999999999997e-05, "loss": 4.5777, "step": 46290 }, { "epoch": 0.9480269489894129, "grad_norm": 0.28883323073387146, "learning_rate": 2.9999999999999997e-05, "loss": 4.5788, "step": 46295 }, { "epoch": 0.9481293387668175, "grad_norm": 0.31002146005630493, "learning_rate": 2.9999999999999997e-05, "loss": 4.5711, "step": 46300 }, { "epoch": 0.9482317285442221, "grad_norm": 0.3166657090187073, "learning_rate": 2.9999999999999997e-05, "loss": 4.5532, "step": 46305 }, { "epoch": 0.9483341183216267, "grad_norm": 0.3122865855693817, "learning_rate": 2.9999999999999997e-05, "loss": 4.5489, "step": 46310 }, { "epoch": 0.9484365080990314, "grad_norm": 0.2848620116710663, "learning_rate": 2.9999999999999997e-05, "loss": 4.5409, "step": 46315 }, { "epoch": 0.948538897876436, "grad_norm": 0.3065323531627655, "learning_rate": 2.9999999999999997e-05, "loss": 4.5402, "step": 46320 }, { "epoch": 0.9486412876538406, "grad_norm": 0.2915879786014557, "learning_rate": 2.9999999999999997e-05, "loss": 4.5876, "step": 46325 }, { "epoch": 0.9487436774312452, "grad_norm": 0.3044731616973877, "learning_rate": 2.9999999999999997e-05, "loss": 4.4693, "step": 46330 }, { "epoch": 0.9488460672086498, "grad_norm": 0.33324384689331055, "learning_rate": 2.9999999999999997e-05, "loss": 4.5919, "step": 46335 }, { "epoch": 0.9489484569860546, "grad_norm": 0.2969648838043213, "learning_rate": 2.9999999999999997e-05, "loss": 4.5418, "step": 46340 }, { "epoch": 0.9490508467634592, "grad_norm": 0.2988201081752777, "learning_rate": 2.9999999999999997e-05, "loss": 4.5901, "step": 46345 }, { "epoch": 0.9491532365408638, "grad_norm": 0.3024013936519623, "learning_rate": 2.9999999999999997e-05, "loss": 4.605, "step": 46350 }, { "epoch": 0.9492556263182684, "grad_norm": 0.302666574716568, "learning_rate": 2.9999999999999997e-05, "loss": 4.5439, "step": 46355 }, { "epoch": 0.949358016095673, "grad_norm": 0.3279913663864136, "learning_rate": 2.9999999999999997e-05, "loss": 4.5416, "step": 46360 }, { "epoch": 0.9494604058730777, "grad_norm": 0.3035995066165924, "learning_rate": 2.9999999999999997e-05, "loss": 4.5195, "step": 46365 }, { "epoch": 0.9495627956504823, "grad_norm": 0.30347394943237305, "learning_rate": 2.9999999999999997e-05, "loss": 4.5328, "step": 46370 }, { "epoch": 0.9496651854278869, "grad_norm": 0.3168480098247528, "learning_rate": 2.9999999999999997e-05, "loss": 4.484, "step": 46375 }, { "epoch": 0.9497675752052915, "grad_norm": 0.30056774616241455, "learning_rate": 2.9999999999999997e-05, "loss": 4.5872, "step": 46380 }, { "epoch": 0.9498699649826962, "grad_norm": 0.29635190963745117, "learning_rate": 2.9999999999999997e-05, "loss": 4.5843, "step": 46385 }, { "epoch": 0.9499723547601008, "grad_norm": 0.28613218665122986, "learning_rate": 2.9999999999999997e-05, "loss": 4.5818, "step": 46390 }, { "epoch": 0.9500747445375054, "grad_norm": 0.35271692276000977, "learning_rate": 2.9999999999999997e-05, "loss": 4.5918, "step": 46395 }, { "epoch": 0.95017713431491, "grad_norm": 0.2879866659641266, "learning_rate": 2.9999999999999997e-05, "loss": 4.5044, "step": 46400 }, { "epoch": 0.9502795240923146, "grad_norm": 0.2910770773887634, "learning_rate": 2.9999999999999997e-05, "loss": 4.553, "step": 46405 }, { "epoch": 0.9503819138697193, "grad_norm": 0.2987903356552124, "learning_rate": 2.9999999999999997e-05, "loss": 4.5389, "step": 46410 }, { "epoch": 0.9504843036471239, "grad_norm": 0.28610172867774963, "learning_rate": 2.9999999999999997e-05, "loss": 4.5486, "step": 46415 }, { "epoch": 0.9505866934245285, "grad_norm": 0.3817905783653259, "learning_rate": 2.9999999999999997e-05, "loss": 4.5868, "step": 46420 }, { "epoch": 0.9506890832019331, "grad_norm": 0.31912466883659363, "learning_rate": 2.9999999999999997e-05, "loss": 4.5107, "step": 46425 }, { "epoch": 0.9507914729793377, "grad_norm": 0.2898521423339844, "learning_rate": 2.9999999999999997e-05, "loss": 4.6252, "step": 46430 }, { "epoch": 0.9508938627567424, "grad_norm": 0.29013702273368835, "learning_rate": 2.9999999999999997e-05, "loss": 4.5944, "step": 46435 }, { "epoch": 0.950996252534147, "grad_norm": 0.29249125719070435, "learning_rate": 2.9999999999999997e-05, "loss": 4.5531, "step": 46440 }, { "epoch": 0.9510986423115516, "grad_norm": 0.284308522939682, "learning_rate": 2.9999999999999997e-05, "loss": 4.5684, "step": 46445 }, { "epoch": 0.9512010320889562, "grad_norm": 0.30925360321998596, "learning_rate": 2.9999999999999997e-05, "loss": 4.5651, "step": 46450 }, { "epoch": 0.9513034218663609, "grad_norm": 0.29362016916275024, "learning_rate": 2.9999999999999997e-05, "loss": 4.5901, "step": 46455 }, { "epoch": 0.9514058116437655, "grad_norm": 0.3067672848701477, "learning_rate": 2.9999999999999997e-05, "loss": 4.5692, "step": 46460 }, { "epoch": 0.9515082014211701, "grad_norm": 0.29780399799346924, "learning_rate": 2.9999999999999997e-05, "loss": 4.5946, "step": 46465 }, { "epoch": 0.9516105911985747, "grad_norm": 0.3147890865802765, "learning_rate": 2.9999999999999997e-05, "loss": 4.55, "step": 46470 }, { "epoch": 0.9517129809759793, "grad_norm": 0.2933990955352783, "learning_rate": 2.9999999999999997e-05, "loss": 4.5795, "step": 46475 }, { "epoch": 0.951815370753384, "grad_norm": 0.3006732165813446, "learning_rate": 2.9999999999999997e-05, "loss": 4.5222, "step": 46480 }, { "epoch": 0.9519177605307886, "grad_norm": 0.31373563408851624, "learning_rate": 2.9999999999999997e-05, "loss": 4.5303, "step": 46485 }, { "epoch": 0.9520201503081932, "grad_norm": 0.3008566200733185, "learning_rate": 2.9999999999999997e-05, "loss": 4.5521, "step": 46490 }, { "epoch": 0.9521225400855978, "grad_norm": 0.2952241003513336, "learning_rate": 2.9999999999999997e-05, "loss": 4.5618, "step": 46495 }, { "epoch": 0.9522249298630024, "grad_norm": 0.29259973764419556, "learning_rate": 2.9999999999999997e-05, "loss": 4.5506, "step": 46500 }, { "epoch": 0.9523273196404071, "grad_norm": 0.29110756516456604, "learning_rate": 2.9999999999999997e-05, "loss": 4.5649, "step": 46505 }, { "epoch": 0.9524297094178117, "grad_norm": 0.2845367193222046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5585, "step": 46510 }, { "epoch": 0.9525320991952163, "grad_norm": 0.28852665424346924, "learning_rate": 2.9999999999999997e-05, "loss": 4.612, "step": 46515 }, { "epoch": 0.9526344889726209, "grad_norm": 0.291458398103714, "learning_rate": 2.9999999999999997e-05, "loss": 4.6069, "step": 46520 }, { "epoch": 0.9527368787500256, "grad_norm": 0.3163106441497803, "learning_rate": 2.9999999999999997e-05, "loss": 4.558, "step": 46525 }, { "epoch": 0.9528392685274302, "grad_norm": 0.3323710560798645, "learning_rate": 2.9999999999999997e-05, "loss": 4.5538, "step": 46530 }, { "epoch": 0.9529416583048348, "grad_norm": 0.3018573224544525, "learning_rate": 2.9999999999999997e-05, "loss": 4.5715, "step": 46535 }, { "epoch": 0.9530440480822394, "grad_norm": 0.28915610909461975, "learning_rate": 2.9999999999999997e-05, "loss": 4.5374, "step": 46540 }, { "epoch": 0.9531464378596441, "grad_norm": 0.3006097972393036, "learning_rate": 2.9999999999999997e-05, "loss": 4.5792, "step": 46545 }, { "epoch": 0.9532488276370488, "grad_norm": 0.28617480397224426, "learning_rate": 2.9999999999999997e-05, "loss": 4.5852, "step": 46550 }, { "epoch": 0.9533512174144534, "grad_norm": 0.2876349091529846, "learning_rate": 2.9999999999999997e-05, "loss": 4.558, "step": 46555 }, { "epoch": 0.953453607191858, "grad_norm": 0.29858532547950745, "learning_rate": 2.9999999999999997e-05, "loss": 4.5807, "step": 46560 }, { "epoch": 0.9535559969692626, "grad_norm": 0.31634581089019775, "learning_rate": 2.9999999999999997e-05, "loss": 4.5456, "step": 46565 }, { "epoch": 0.9536583867466673, "grad_norm": 0.6948842406272888, "learning_rate": 2.9999999999999997e-05, "loss": 4.5797, "step": 46570 }, { "epoch": 0.9537607765240719, "grad_norm": 0.29457876086235046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5399, "step": 46575 }, { "epoch": 0.9538631663014765, "grad_norm": 0.29184049367904663, "learning_rate": 2.9999999999999997e-05, "loss": 4.535, "step": 46580 }, { "epoch": 0.9539655560788811, "grad_norm": 0.29152342677116394, "learning_rate": 2.9999999999999997e-05, "loss": 4.6023, "step": 46585 }, { "epoch": 0.9540679458562857, "grad_norm": 0.3121853172779083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5459, "step": 46590 }, { "epoch": 0.9541703356336904, "grad_norm": 0.29832789301872253, "learning_rate": 2.9999999999999997e-05, "loss": 4.573, "step": 46595 }, { "epoch": 0.954272725411095, "grad_norm": 0.29351404309272766, "learning_rate": 2.9999999999999997e-05, "loss": 4.5808, "step": 46600 }, { "epoch": 0.9543751151884996, "grad_norm": 0.29937365651130676, "learning_rate": 2.9999999999999997e-05, "loss": 4.5569, "step": 46605 }, { "epoch": 0.9544775049659042, "grad_norm": 0.2994340658187866, "learning_rate": 2.9999999999999997e-05, "loss": 4.5266, "step": 46610 }, { "epoch": 0.9545798947433088, "grad_norm": 0.29143932461738586, "learning_rate": 2.9999999999999997e-05, "loss": 4.5655, "step": 46615 }, { "epoch": 0.9546822845207135, "grad_norm": 0.28990811109542847, "learning_rate": 2.9999999999999997e-05, "loss": 4.5287, "step": 46620 }, { "epoch": 0.9547846742981181, "grad_norm": 0.286898672580719, "learning_rate": 2.9999999999999997e-05, "loss": 4.5411, "step": 46625 }, { "epoch": 0.9548870640755227, "grad_norm": 0.28920093178749084, "learning_rate": 2.9999999999999997e-05, "loss": 4.5699, "step": 46630 }, { "epoch": 0.9549894538529273, "grad_norm": 0.30701369047164917, "learning_rate": 2.9999999999999997e-05, "loss": 4.5505, "step": 46635 }, { "epoch": 0.955091843630332, "grad_norm": 0.2999051511287689, "learning_rate": 2.9999999999999997e-05, "loss": 4.6291, "step": 46640 }, { "epoch": 0.9551942334077366, "grad_norm": 0.30438244342803955, "learning_rate": 2.9999999999999997e-05, "loss": 4.5471, "step": 46645 }, { "epoch": 0.9552966231851412, "grad_norm": 0.30867093801498413, "learning_rate": 2.9999999999999997e-05, "loss": 4.5316, "step": 46650 }, { "epoch": 0.9553990129625458, "grad_norm": 0.29062962532043457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5951, "step": 46655 }, { "epoch": 0.9555014027399504, "grad_norm": 0.2844584286212921, "learning_rate": 2.9999999999999997e-05, "loss": 4.5841, "step": 46660 }, { "epoch": 0.955603792517355, "grad_norm": 0.28820452094078064, "learning_rate": 2.9999999999999997e-05, "loss": 4.5852, "step": 46665 }, { "epoch": 0.9557061822947597, "grad_norm": 0.3631156086921692, "learning_rate": 2.9999999999999997e-05, "loss": 4.5461, "step": 46670 }, { "epoch": 0.9558085720721643, "grad_norm": 0.3036566972732544, "learning_rate": 2.9999999999999997e-05, "loss": 4.5925, "step": 46675 }, { "epoch": 0.9559109618495689, "grad_norm": 0.30913856625556946, "learning_rate": 2.9999999999999997e-05, "loss": 4.5955, "step": 46680 }, { "epoch": 0.9560133516269735, "grad_norm": 0.3142358660697937, "learning_rate": 2.9999999999999997e-05, "loss": 4.5883, "step": 46685 }, { "epoch": 0.9561157414043782, "grad_norm": 0.2955227196216583, "learning_rate": 2.9999999999999997e-05, "loss": 4.607, "step": 46690 }, { "epoch": 0.9562181311817828, "grad_norm": 0.3719719648361206, "learning_rate": 2.9999999999999997e-05, "loss": 4.5709, "step": 46695 }, { "epoch": 0.9563205209591874, "grad_norm": 0.2889150381088257, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 46700 }, { "epoch": 0.956422910736592, "grad_norm": 0.3480525314807892, "learning_rate": 2.9999999999999997e-05, "loss": 4.5391, "step": 46705 }, { "epoch": 0.9565253005139966, "grad_norm": 0.2765401303768158, "learning_rate": 2.9999999999999997e-05, "loss": 4.4554, "step": 46710 }, { "epoch": 0.9566276902914013, "grad_norm": 0.2994229793548584, "learning_rate": 2.9999999999999997e-05, "loss": 4.6014, "step": 46715 }, { "epoch": 0.9567300800688059, "grad_norm": 0.2966138422489166, "learning_rate": 2.9999999999999997e-05, "loss": 4.557, "step": 46720 }, { "epoch": 0.9568324698462105, "grad_norm": 0.3034304678440094, "learning_rate": 2.9999999999999997e-05, "loss": 4.5691, "step": 46725 }, { "epoch": 0.9569348596236151, "grad_norm": 0.30969443917274475, "learning_rate": 2.9999999999999997e-05, "loss": 4.5806, "step": 46730 }, { "epoch": 0.9570372494010198, "grad_norm": 0.3021763265132904, "learning_rate": 2.9999999999999997e-05, "loss": 4.6384, "step": 46735 }, { "epoch": 0.9571396391784244, "grad_norm": 0.29978522658348083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5576, "step": 46740 }, { "epoch": 0.9572420289558291, "grad_norm": 0.2940644919872284, "learning_rate": 2.9999999999999997e-05, "loss": 4.548, "step": 46745 }, { "epoch": 0.9573444187332337, "grad_norm": 0.2972068190574646, "learning_rate": 2.9999999999999997e-05, "loss": 4.5625, "step": 46750 }, { "epoch": 0.9574468085106383, "grad_norm": 0.30514392256736755, "learning_rate": 2.9999999999999997e-05, "loss": 4.51, "step": 46755 }, { "epoch": 0.957549198288043, "grad_norm": 0.31049951910972595, "learning_rate": 2.9999999999999997e-05, "loss": 4.6043, "step": 46760 }, { "epoch": 0.9576515880654476, "grad_norm": 0.28459200263023376, "learning_rate": 2.9999999999999997e-05, "loss": 4.5769, "step": 46765 }, { "epoch": 0.9577539778428522, "grad_norm": 0.31559404730796814, "learning_rate": 2.9999999999999997e-05, "loss": 4.5461, "step": 46770 }, { "epoch": 0.9578563676202568, "grad_norm": 0.30458056926727295, "learning_rate": 2.9999999999999997e-05, "loss": 4.6753, "step": 46775 }, { "epoch": 0.9579587573976615, "grad_norm": 0.3196863532066345, "learning_rate": 2.9999999999999997e-05, "loss": 4.6074, "step": 46780 }, { "epoch": 0.9580611471750661, "grad_norm": 0.3843401372432709, "learning_rate": 2.9999999999999997e-05, "loss": 4.5677, "step": 46785 }, { "epoch": 0.9581635369524707, "grad_norm": 0.29096993803977966, "learning_rate": 2.9999999999999997e-05, "loss": 4.5799, "step": 46790 }, { "epoch": 0.9582659267298753, "grad_norm": 0.28865140676498413, "learning_rate": 2.9999999999999997e-05, "loss": 4.5081, "step": 46795 }, { "epoch": 0.9583683165072799, "grad_norm": 0.29790523648262024, "learning_rate": 2.9999999999999997e-05, "loss": 4.6181, "step": 46800 }, { "epoch": 0.9584707062846846, "grad_norm": 0.29815900325775146, "learning_rate": 2.9999999999999997e-05, "loss": 4.533, "step": 46805 }, { "epoch": 0.9585730960620892, "grad_norm": 0.28969037532806396, "learning_rate": 2.9999999999999997e-05, "loss": 4.5034, "step": 46810 }, { "epoch": 0.9586754858394938, "grad_norm": 0.2895985245704651, "learning_rate": 2.9999999999999997e-05, "loss": 4.553, "step": 46815 }, { "epoch": 0.9587778756168984, "grad_norm": 0.2950066328048706, "learning_rate": 2.9999999999999997e-05, "loss": 4.5655, "step": 46820 }, { "epoch": 0.958880265394303, "grad_norm": 0.30235326290130615, "learning_rate": 2.9999999999999997e-05, "loss": 4.5666, "step": 46825 }, { "epoch": 0.9589826551717077, "grad_norm": 0.2880280613899231, "learning_rate": 2.9999999999999997e-05, "loss": 4.5212, "step": 46830 }, { "epoch": 0.9590850449491123, "grad_norm": 1.0736048221588135, "learning_rate": 2.9999999999999997e-05, "loss": 4.6092, "step": 46835 }, { "epoch": 0.9591874347265169, "grad_norm": 0.3134042024612427, "learning_rate": 2.9999999999999997e-05, "loss": 4.4997, "step": 46840 }, { "epoch": 0.9592898245039215, "grad_norm": 0.32664966583251953, "learning_rate": 2.9999999999999997e-05, "loss": 4.5354, "step": 46845 }, { "epoch": 0.9593922142813261, "grad_norm": 0.2921341061592102, "learning_rate": 2.9999999999999997e-05, "loss": 4.5767, "step": 46850 }, { "epoch": 0.9594946040587308, "grad_norm": 0.29374033212661743, "learning_rate": 2.9999999999999997e-05, "loss": 4.571, "step": 46855 }, { "epoch": 0.9595969938361354, "grad_norm": 0.3189680874347687, "learning_rate": 2.9999999999999997e-05, "loss": 4.6013, "step": 46860 }, { "epoch": 0.95969938361354, "grad_norm": 0.29350003600120544, "learning_rate": 2.9999999999999997e-05, "loss": 4.5769, "step": 46865 }, { "epoch": 0.9598017733909446, "grad_norm": 0.2955159544944763, "learning_rate": 2.9999999999999997e-05, "loss": 4.5107, "step": 46870 }, { "epoch": 0.9599041631683493, "grad_norm": 0.2950364053249359, "learning_rate": 2.9999999999999997e-05, "loss": 4.5467, "step": 46875 }, { "epoch": 0.9600065529457539, "grad_norm": 0.2971443235874176, "learning_rate": 2.9999999999999997e-05, "loss": 4.5136, "step": 46880 }, { "epoch": 0.9601089427231585, "grad_norm": 0.30741390585899353, "learning_rate": 2.9999999999999997e-05, "loss": 4.536, "step": 46885 }, { "epoch": 0.9602113325005631, "grad_norm": 0.286006897687912, "learning_rate": 2.9999999999999997e-05, "loss": 4.5613, "step": 46890 }, { "epoch": 0.9603137222779677, "grad_norm": 0.3020000755786896, "learning_rate": 2.9999999999999997e-05, "loss": 4.5845, "step": 46895 }, { "epoch": 0.9604161120553724, "grad_norm": 0.2949473261833191, "learning_rate": 2.9999999999999997e-05, "loss": 4.5681, "step": 46900 }, { "epoch": 0.960518501832777, "grad_norm": 0.2953351140022278, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 46905 }, { "epoch": 0.9606208916101816, "grad_norm": 0.34834355115890503, "learning_rate": 2.9999999999999997e-05, "loss": 4.5961, "step": 46910 }, { "epoch": 0.9607232813875862, "grad_norm": 0.3003140091896057, "learning_rate": 2.9999999999999997e-05, "loss": 4.5965, "step": 46915 }, { "epoch": 0.9608256711649908, "grad_norm": 0.2925507426261902, "learning_rate": 2.9999999999999997e-05, "loss": 4.5687, "step": 46920 }, { "epoch": 0.9609280609423955, "grad_norm": 0.2905387580394745, "learning_rate": 2.9999999999999997e-05, "loss": 4.5204, "step": 46925 }, { "epoch": 0.9610304507198001, "grad_norm": 0.3015080988407135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5774, "step": 46930 }, { "epoch": 0.9611328404972047, "grad_norm": 0.29733699560165405, "learning_rate": 2.9999999999999997e-05, "loss": 4.5868, "step": 46935 }, { "epoch": 0.9612352302746093, "grad_norm": 0.2915636897087097, "learning_rate": 2.9999999999999997e-05, "loss": 4.5334, "step": 46940 }, { "epoch": 0.961337620052014, "grad_norm": 0.30577629804611206, "learning_rate": 2.9999999999999997e-05, "loss": 4.547, "step": 46945 }, { "epoch": 0.9614400098294187, "grad_norm": 0.31387439370155334, "learning_rate": 2.9999999999999997e-05, "loss": 4.6124, "step": 46950 }, { "epoch": 0.9615423996068233, "grad_norm": 0.39872995018959045, "learning_rate": 2.9999999999999997e-05, "loss": 4.5962, "step": 46955 }, { "epoch": 0.9616447893842279, "grad_norm": 0.32546061277389526, "learning_rate": 2.9999999999999997e-05, "loss": 4.4662, "step": 46960 }, { "epoch": 0.9617471791616325, "grad_norm": 0.3042566478252411, "learning_rate": 2.9999999999999997e-05, "loss": 4.5145, "step": 46965 }, { "epoch": 0.9618495689390372, "grad_norm": 0.29946258664131165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5425, "step": 46970 }, { "epoch": 0.9619519587164418, "grad_norm": 0.29231250286102295, "learning_rate": 2.9999999999999997e-05, "loss": 4.6074, "step": 46975 }, { "epoch": 0.9620543484938464, "grad_norm": 0.29093030095100403, "learning_rate": 2.9999999999999997e-05, "loss": 4.5394, "step": 46980 }, { "epoch": 0.962156738271251, "grad_norm": 0.3020989000797272, "learning_rate": 2.9999999999999997e-05, "loss": 4.5101, "step": 46985 }, { "epoch": 0.9622591280486557, "grad_norm": 0.30913466215133667, "learning_rate": 2.9999999999999997e-05, "loss": 4.5595, "step": 46990 }, { "epoch": 0.9623615178260603, "grad_norm": 0.2933889925479889, "learning_rate": 2.9999999999999997e-05, "loss": 4.5866, "step": 46995 }, { "epoch": 0.9624639076034649, "grad_norm": 0.29119619727134705, "learning_rate": 2.9999999999999997e-05, "loss": 4.5282, "step": 47000 }, { "epoch": 0.9624639076034649, "eval_loss": 4.581810474395752, "eval_runtime": 17.1294, "eval_samples_per_second": 33.977, "eval_steps_per_second": 4.262, "step": 47000 }, { "epoch": 0.9625662973808695, "grad_norm": 0.2967391610145569, "learning_rate": 2.9999999999999997e-05, "loss": 4.5758, "step": 47005 }, { "epoch": 0.9626686871582741, "grad_norm": 0.2947872579097748, "learning_rate": 2.9999999999999997e-05, "loss": 4.5727, "step": 47010 }, { "epoch": 0.9627710769356788, "grad_norm": 0.29944783449172974, "learning_rate": 2.9999999999999997e-05, "loss": 4.5765, "step": 47015 }, { "epoch": 0.9628734667130834, "grad_norm": 0.3540767729282379, "learning_rate": 2.9999999999999997e-05, "loss": 4.5526, "step": 47020 }, { "epoch": 0.962975856490488, "grad_norm": 0.2941175699234009, "learning_rate": 2.9999999999999997e-05, "loss": 4.5368, "step": 47025 }, { "epoch": 0.9630782462678926, "grad_norm": 0.29408174753189087, "learning_rate": 2.9999999999999997e-05, "loss": 4.5939, "step": 47030 }, { "epoch": 0.9631806360452972, "grad_norm": 0.3078875243663788, "learning_rate": 2.9999999999999997e-05, "loss": 4.5505, "step": 47035 }, { "epoch": 0.9632830258227019, "grad_norm": 0.36331120133399963, "learning_rate": 2.9999999999999997e-05, "loss": 4.5808, "step": 47040 }, { "epoch": 0.9633854156001065, "grad_norm": 0.3171165883541107, "learning_rate": 2.9999999999999997e-05, "loss": 4.5744, "step": 47045 }, { "epoch": 0.9634878053775111, "grad_norm": 0.29271119832992554, "learning_rate": 2.9999999999999997e-05, "loss": 4.5762, "step": 47050 }, { "epoch": 0.9635901951549157, "grad_norm": 0.31483906507492065, "learning_rate": 2.9999999999999997e-05, "loss": 4.6228, "step": 47055 }, { "epoch": 0.9636925849323204, "grad_norm": 0.3047357499599457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5675, "step": 47060 }, { "epoch": 0.963794974709725, "grad_norm": 0.2942754328250885, "learning_rate": 2.9999999999999997e-05, "loss": 4.5684, "step": 47065 }, { "epoch": 0.9638973644871296, "grad_norm": 0.2990550994873047, "learning_rate": 2.9999999999999997e-05, "loss": 4.6079, "step": 47070 }, { "epoch": 0.9639997542645342, "grad_norm": 0.2954290211200714, "learning_rate": 2.9999999999999997e-05, "loss": 4.5651, "step": 47075 }, { "epoch": 0.9641021440419388, "grad_norm": 0.2928082048892975, "learning_rate": 2.9999999999999997e-05, "loss": 4.5688, "step": 47080 }, { "epoch": 0.9642045338193435, "grad_norm": 0.3556124269962311, "learning_rate": 2.9999999999999997e-05, "loss": 4.57, "step": 47085 }, { "epoch": 0.9643069235967481, "grad_norm": 0.31088879704475403, "learning_rate": 2.9999999999999997e-05, "loss": 4.5054, "step": 47090 }, { "epoch": 0.9644093133741527, "grad_norm": 0.3035901188850403, "learning_rate": 2.9999999999999997e-05, "loss": 4.5674, "step": 47095 }, { "epoch": 0.9645117031515573, "grad_norm": 0.2908082604408264, "learning_rate": 2.9999999999999997e-05, "loss": 4.5235, "step": 47100 }, { "epoch": 0.9646140929289619, "grad_norm": 0.303617924451828, "learning_rate": 2.9999999999999997e-05, "loss": 4.5973, "step": 47105 }, { "epoch": 0.9647164827063666, "grad_norm": 0.2881530821323395, "learning_rate": 2.9999999999999997e-05, "loss": 4.5752, "step": 47110 }, { "epoch": 0.9648188724837712, "grad_norm": 0.29704049229621887, "learning_rate": 2.9999999999999997e-05, "loss": 4.5137, "step": 47115 }, { "epoch": 0.9649212622611758, "grad_norm": 0.29342731833457947, "learning_rate": 2.9999999999999997e-05, "loss": 4.5698, "step": 47120 }, { "epoch": 0.9650236520385804, "grad_norm": 0.29932162165641785, "learning_rate": 2.9999999999999997e-05, "loss": 4.6056, "step": 47125 }, { "epoch": 0.965126041815985, "grad_norm": 0.29408302903175354, "learning_rate": 2.9999999999999997e-05, "loss": 4.5952, "step": 47130 }, { "epoch": 0.9652284315933897, "grad_norm": 0.30963706970214844, "learning_rate": 2.9999999999999997e-05, "loss": 4.5404, "step": 47135 }, { "epoch": 0.9653308213707943, "grad_norm": 0.31054189801216125, "learning_rate": 2.9999999999999997e-05, "loss": 4.587, "step": 47140 }, { "epoch": 0.9654332111481989, "grad_norm": 0.3025176227092743, "learning_rate": 2.9999999999999997e-05, "loss": 4.5372, "step": 47145 }, { "epoch": 0.9655356009256036, "grad_norm": 0.32492348551750183, "learning_rate": 2.9999999999999997e-05, "loss": 4.5947, "step": 47150 }, { "epoch": 0.9656379907030083, "grad_norm": 0.30467331409454346, "learning_rate": 2.9999999999999997e-05, "loss": 4.5264, "step": 47155 }, { "epoch": 0.9657403804804129, "grad_norm": 0.8612712621688843, "learning_rate": 2.9999999999999997e-05, "loss": 4.546, "step": 47160 }, { "epoch": 0.9658427702578175, "grad_norm": 0.2954406440258026, "learning_rate": 2.9999999999999997e-05, "loss": 4.5654, "step": 47165 }, { "epoch": 0.9659451600352221, "grad_norm": 0.29305946826934814, "learning_rate": 2.9999999999999997e-05, "loss": 4.6023, "step": 47170 }, { "epoch": 0.9660475498126267, "grad_norm": 0.30675962567329407, "learning_rate": 2.9999999999999997e-05, "loss": 4.5784, "step": 47175 }, { "epoch": 0.9661499395900314, "grad_norm": 0.29173755645751953, "learning_rate": 2.9999999999999997e-05, "loss": 4.6345, "step": 47180 }, { "epoch": 0.966252329367436, "grad_norm": 0.35791778564453125, "learning_rate": 2.9999999999999997e-05, "loss": 4.5232, "step": 47185 }, { "epoch": 0.9663547191448406, "grad_norm": 0.30453601479530334, "learning_rate": 2.9999999999999997e-05, "loss": 4.4895, "step": 47190 }, { "epoch": 0.9664571089222452, "grad_norm": 0.2907402217388153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5722, "step": 47195 }, { "epoch": 0.9665594986996499, "grad_norm": 0.2840563654899597, "learning_rate": 2.9999999999999997e-05, "loss": 4.5382, "step": 47200 }, { "epoch": 0.9666618884770545, "grad_norm": 0.3044736087322235, "learning_rate": 2.9999999999999997e-05, "loss": 4.559, "step": 47205 }, { "epoch": 0.9667642782544591, "grad_norm": 0.3105946183204651, "learning_rate": 2.9999999999999997e-05, "loss": 4.5034, "step": 47210 }, { "epoch": 0.9668666680318637, "grad_norm": 0.3129786550998688, "learning_rate": 2.9999999999999997e-05, "loss": 4.4847, "step": 47215 }, { "epoch": 0.9669690578092683, "grad_norm": 0.29703766107559204, "learning_rate": 2.9999999999999997e-05, "loss": 4.614, "step": 47220 }, { "epoch": 0.967071447586673, "grad_norm": 0.30712464451789856, "learning_rate": 2.9999999999999997e-05, "loss": 4.5898, "step": 47225 }, { "epoch": 0.9671738373640776, "grad_norm": 0.2927931845188141, "learning_rate": 2.9999999999999997e-05, "loss": 4.5368, "step": 47230 }, { "epoch": 0.9672762271414822, "grad_norm": 0.29242026805877686, "learning_rate": 2.9999999999999997e-05, "loss": 4.5323, "step": 47235 }, { "epoch": 0.9673786169188868, "grad_norm": 0.29771849513053894, "learning_rate": 2.9999999999999997e-05, "loss": 4.5597, "step": 47240 }, { "epoch": 0.9674810066962914, "grad_norm": 0.30348271131515503, "learning_rate": 2.9999999999999997e-05, "loss": 4.5753, "step": 47245 }, { "epoch": 0.9675833964736961, "grad_norm": 0.3018856942653656, "learning_rate": 2.9999999999999997e-05, "loss": 4.5624, "step": 47250 }, { "epoch": 0.9676857862511007, "grad_norm": 0.3198246955871582, "learning_rate": 2.9999999999999997e-05, "loss": 4.5444, "step": 47255 }, { "epoch": 0.9677881760285053, "grad_norm": 0.30506032705307007, "learning_rate": 2.9999999999999997e-05, "loss": 4.6412, "step": 47260 }, { "epoch": 0.9678905658059099, "grad_norm": 0.3034096956253052, "learning_rate": 2.9999999999999997e-05, "loss": 4.5816, "step": 47265 }, { "epoch": 0.9679929555833146, "grad_norm": 0.30574262142181396, "learning_rate": 2.9999999999999997e-05, "loss": 4.5733, "step": 47270 }, { "epoch": 0.9680953453607192, "grad_norm": 0.3164641857147217, "learning_rate": 2.9999999999999997e-05, "loss": 4.5484, "step": 47275 }, { "epoch": 0.9681977351381238, "grad_norm": 0.29358360171318054, "learning_rate": 2.9999999999999997e-05, "loss": 4.5942, "step": 47280 }, { "epoch": 0.9683001249155284, "grad_norm": 0.28766998648643494, "learning_rate": 2.9999999999999997e-05, "loss": 4.5644, "step": 47285 }, { "epoch": 0.968402514692933, "grad_norm": 0.29751890897750854, "learning_rate": 2.9999999999999997e-05, "loss": 4.5859, "step": 47290 }, { "epoch": 0.9685049044703377, "grad_norm": 0.29219794273376465, "learning_rate": 2.9999999999999997e-05, "loss": 4.5898, "step": 47295 }, { "epoch": 0.9686072942477423, "grad_norm": 0.3122159540653229, "learning_rate": 2.9999999999999997e-05, "loss": 4.5746, "step": 47300 }, { "epoch": 0.9687096840251469, "grad_norm": 0.28809303045272827, "learning_rate": 2.9999999999999997e-05, "loss": 4.5835, "step": 47305 }, { "epoch": 0.9688120738025515, "grad_norm": 0.2955372631549835, "learning_rate": 2.9999999999999997e-05, "loss": 4.5614, "step": 47310 }, { "epoch": 0.9689144635799561, "grad_norm": 0.3172515034675598, "learning_rate": 2.9999999999999997e-05, "loss": 4.6072, "step": 47315 }, { "epoch": 0.9690168533573608, "grad_norm": 0.32738354802131653, "learning_rate": 2.9999999999999997e-05, "loss": 4.5829, "step": 47320 }, { "epoch": 0.9691192431347654, "grad_norm": 0.3401321470737457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5463, "step": 47325 }, { "epoch": 0.96922163291217, "grad_norm": 0.30369314551353455, "learning_rate": 2.9999999999999997e-05, "loss": 4.595, "step": 47330 }, { "epoch": 0.9693240226895746, "grad_norm": 0.30510661005973816, "learning_rate": 2.9999999999999997e-05, "loss": 4.5918, "step": 47335 }, { "epoch": 0.9694264124669792, "grad_norm": 0.30570051074028015, "learning_rate": 2.9999999999999997e-05, "loss": 4.591, "step": 47340 }, { "epoch": 0.9695288022443839, "grad_norm": 0.31844812631607056, "learning_rate": 2.9999999999999997e-05, "loss": 4.6063, "step": 47345 }, { "epoch": 0.9696311920217885, "grad_norm": 0.3160530924797058, "learning_rate": 2.9999999999999997e-05, "loss": 4.584, "step": 47350 }, { "epoch": 0.9697335817991932, "grad_norm": 0.29038333892822266, "learning_rate": 2.9999999999999997e-05, "loss": 4.5466, "step": 47355 }, { "epoch": 0.9698359715765978, "grad_norm": 0.3079160749912262, "learning_rate": 2.9999999999999997e-05, "loss": 4.486, "step": 47360 }, { "epoch": 0.9699383613540025, "grad_norm": 0.2997553050518036, "learning_rate": 2.9999999999999997e-05, "loss": 4.6374, "step": 47365 }, { "epoch": 0.9700407511314071, "grad_norm": 0.31009504199028015, "learning_rate": 2.9999999999999997e-05, "loss": 4.5786, "step": 47370 }, { "epoch": 0.9701431409088117, "grad_norm": 0.2981540858745575, "learning_rate": 2.9999999999999997e-05, "loss": 4.5433, "step": 47375 }, { "epoch": 0.9702455306862163, "grad_norm": 0.3187463581562042, "learning_rate": 2.9999999999999997e-05, "loss": 4.5122, "step": 47380 }, { "epoch": 0.970347920463621, "grad_norm": 0.29447510838508606, "learning_rate": 2.9999999999999997e-05, "loss": 4.5587, "step": 47385 }, { "epoch": 0.9704503102410256, "grad_norm": 0.29934313893318176, "learning_rate": 2.9999999999999997e-05, "loss": 4.5436, "step": 47390 }, { "epoch": 0.9705527000184302, "grad_norm": 0.284311980009079, "learning_rate": 2.9999999999999997e-05, "loss": 4.5495, "step": 47395 }, { "epoch": 0.9706550897958348, "grad_norm": 0.3030093014240265, "learning_rate": 2.9999999999999997e-05, "loss": 4.5605, "step": 47400 }, { "epoch": 0.9707574795732394, "grad_norm": 0.3019144833087921, "learning_rate": 2.9999999999999997e-05, "loss": 4.5776, "step": 47405 }, { "epoch": 0.9708598693506441, "grad_norm": 0.32203003764152527, "learning_rate": 2.9999999999999997e-05, "loss": 4.5775, "step": 47410 }, { "epoch": 0.9709622591280487, "grad_norm": 0.3082998991012573, "learning_rate": 2.9999999999999997e-05, "loss": 4.5634, "step": 47415 }, { "epoch": 0.9710646489054533, "grad_norm": 0.3060787320137024, "learning_rate": 2.9999999999999997e-05, "loss": 4.5531, "step": 47420 }, { "epoch": 0.9711670386828579, "grad_norm": 0.29228317737579346, "learning_rate": 2.9999999999999997e-05, "loss": 4.5748, "step": 47425 }, { "epoch": 0.9712694284602625, "grad_norm": 0.31799471378326416, "learning_rate": 2.9999999999999997e-05, "loss": 4.572, "step": 47430 }, { "epoch": 0.9713718182376672, "grad_norm": 0.3584194481372833, "learning_rate": 2.9999999999999997e-05, "loss": 4.5999, "step": 47435 }, { "epoch": 0.9714742080150718, "grad_norm": 0.31477686762809753, "learning_rate": 2.9999999999999997e-05, "loss": 4.5323, "step": 47440 }, { "epoch": 0.9715765977924764, "grad_norm": 0.30608507990837097, "learning_rate": 2.9999999999999997e-05, "loss": 4.5314, "step": 47445 }, { "epoch": 0.971678987569881, "grad_norm": 0.3086124062538147, "learning_rate": 2.9999999999999997e-05, "loss": 4.5698, "step": 47450 }, { "epoch": 0.9717813773472856, "grad_norm": 0.3038252294063568, "learning_rate": 2.9999999999999997e-05, "loss": 4.5568, "step": 47455 }, { "epoch": 0.9718837671246903, "grad_norm": 0.3878996670246124, "learning_rate": 2.9999999999999997e-05, "loss": 4.5692, "step": 47460 }, { "epoch": 0.9719861569020949, "grad_norm": 0.30116572976112366, "learning_rate": 2.9999999999999997e-05, "loss": 4.5706, "step": 47465 }, { "epoch": 0.9720885466794995, "grad_norm": 0.2941262722015381, "learning_rate": 2.9999999999999997e-05, "loss": 4.5973, "step": 47470 }, { "epoch": 0.9721909364569041, "grad_norm": 0.3013308346271515, "learning_rate": 2.9999999999999997e-05, "loss": 4.527, "step": 47475 }, { "epoch": 0.9722933262343088, "grad_norm": 0.32226625084877014, "learning_rate": 2.9999999999999997e-05, "loss": 4.6295, "step": 47480 }, { "epoch": 0.9723957160117134, "grad_norm": 0.294238805770874, "learning_rate": 2.9999999999999997e-05, "loss": 4.5321, "step": 47485 }, { "epoch": 0.972498105789118, "grad_norm": 0.30070731043815613, "learning_rate": 2.9999999999999997e-05, "loss": 4.5867, "step": 47490 }, { "epoch": 0.9726004955665226, "grad_norm": 0.32017284631729126, "learning_rate": 2.9999999999999997e-05, "loss": 4.5652, "step": 47495 }, { "epoch": 0.9727028853439272, "grad_norm": 0.3248218894004822, "learning_rate": 2.9999999999999997e-05, "loss": 4.5568, "step": 47500 }, { "epoch": 0.9728052751213319, "grad_norm": 0.3047269582748413, "learning_rate": 2.9999999999999997e-05, "loss": 4.6889, "step": 47505 }, { "epoch": 0.9729076648987365, "grad_norm": 0.29684582352638245, "learning_rate": 2.9999999999999997e-05, "loss": 4.5553, "step": 47510 }, { "epoch": 0.9730100546761411, "grad_norm": 0.3022248148918152, "learning_rate": 2.9999999999999997e-05, "loss": 4.5786, "step": 47515 }, { "epoch": 0.9731124444535457, "grad_norm": 0.296001672744751, "learning_rate": 2.9999999999999997e-05, "loss": 4.5842, "step": 47520 }, { "epoch": 0.9732148342309503, "grad_norm": 0.3527770936489105, "learning_rate": 2.9999999999999997e-05, "loss": 4.5337, "step": 47525 }, { "epoch": 0.973317224008355, "grad_norm": 0.31018128991127014, "learning_rate": 2.9999999999999997e-05, "loss": 4.5359, "step": 47530 }, { "epoch": 0.9734196137857596, "grad_norm": 0.29156750440597534, "learning_rate": 2.9999999999999997e-05, "loss": 4.5901, "step": 47535 }, { "epoch": 0.9735220035631642, "grad_norm": 0.3041020631790161, "learning_rate": 2.9999999999999997e-05, "loss": 4.5851, "step": 47540 }, { "epoch": 0.9736243933405688, "grad_norm": 0.30444443225860596, "learning_rate": 2.9999999999999997e-05, "loss": 4.6148, "step": 47545 }, { "epoch": 0.9737267831179734, "grad_norm": 0.29533252120018005, "learning_rate": 2.9999999999999997e-05, "loss": 4.5348, "step": 47550 }, { "epoch": 0.9738291728953781, "grad_norm": 0.2916995882987976, "learning_rate": 2.9999999999999997e-05, "loss": 4.6002, "step": 47555 }, { "epoch": 0.9739315626727828, "grad_norm": 0.2841915190219879, "learning_rate": 2.9999999999999997e-05, "loss": 4.5402, "step": 47560 }, { "epoch": 0.9740339524501874, "grad_norm": 0.2930445075035095, "learning_rate": 2.9999999999999997e-05, "loss": 4.5564, "step": 47565 }, { "epoch": 0.974136342227592, "grad_norm": 0.29968494176864624, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 47570 }, { "epoch": 0.9742387320049967, "grad_norm": 0.30154654383659363, "learning_rate": 2.9999999999999997e-05, "loss": 4.6124, "step": 47575 }, { "epoch": 0.9743411217824013, "grad_norm": 0.3050684630870819, "learning_rate": 2.9999999999999997e-05, "loss": 4.4887, "step": 47580 }, { "epoch": 0.9744435115598059, "grad_norm": 0.3294299840927124, "learning_rate": 2.9999999999999997e-05, "loss": 4.5553, "step": 47585 }, { "epoch": 0.9745459013372105, "grad_norm": 0.29346132278442383, "learning_rate": 2.9999999999999997e-05, "loss": 4.5657, "step": 47590 }, { "epoch": 0.9746482911146152, "grad_norm": 0.30177071690559387, "learning_rate": 2.9999999999999997e-05, "loss": 4.5472, "step": 47595 }, { "epoch": 0.9747506808920198, "grad_norm": 0.29101189970970154, "learning_rate": 2.9999999999999997e-05, "loss": 4.5339, "step": 47600 }, { "epoch": 0.9748530706694244, "grad_norm": 0.2959204316139221, "learning_rate": 2.9999999999999997e-05, "loss": 4.5085, "step": 47605 }, { "epoch": 0.974955460446829, "grad_norm": 0.2905808091163635, "learning_rate": 2.9999999999999997e-05, "loss": 4.5638, "step": 47610 }, { "epoch": 0.9750578502242336, "grad_norm": 0.31405702233314514, "learning_rate": 2.9999999999999997e-05, "loss": 4.5099, "step": 47615 }, { "epoch": 0.9751602400016383, "grad_norm": 0.30506718158721924, "learning_rate": 2.9999999999999997e-05, "loss": 4.5412, "step": 47620 }, { "epoch": 0.9752626297790429, "grad_norm": 0.3007020056247711, "learning_rate": 2.9999999999999997e-05, "loss": 4.5642, "step": 47625 }, { "epoch": 0.9753650195564475, "grad_norm": 0.4340749979019165, "learning_rate": 2.9999999999999997e-05, "loss": 4.5624, "step": 47630 }, { "epoch": 0.9754674093338521, "grad_norm": 0.29517611861228943, "learning_rate": 2.9999999999999997e-05, "loss": 4.5556, "step": 47635 }, { "epoch": 0.9755697991112567, "grad_norm": 0.3016403615474701, "learning_rate": 2.9999999999999997e-05, "loss": 4.5887, "step": 47640 }, { "epoch": 0.9756721888886614, "grad_norm": 0.29522985219955444, "learning_rate": 2.9999999999999997e-05, "loss": 4.5627, "step": 47645 }, { "epoch": 0.975774578666066, "grad_norm": 0.2997603118419647, "learning_rate": 2.9999999999999997e-05, "loss": 4.5753, "step": 47650 }, { "epoch": 0.9758769684434706, "grad_norm": 0.29548588395118713, "learning_rate": 2.9999999999999997e-05, "loss": 4.5231, "step": 47655 }, { "epoch": 0.9759793582208752, "grad_norm": 0.2996750473976135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5269, "step": 47660 }, { "epoch": 0.9760817479982798, "grad_norm": 0.5803531408309937, "learning_rate": 2.9999999999999997e-05, "loss": 4.6113, "step": 47665 }, { "epoch": 0.9761841377756845, "grad_norm": 0.3171710968017578, "learning_rate": 2.9999999999999997e-05, "loss": 4.505, "step": 47670 }, { "epoch": 0.9762865275530891, "grad_norm": 0.31973206996917725, "learning_rate": 2.9999999999999997e-05, "loss": 4.5346, "step": 47675 }, { "epoch": 0.9763889173304937, "grad_norm": 0.2989446222782135, "learning_rate": 2.9999999999999997e-05, "loss": 4.5907, "step": 47680 }, { "epoch": 0.9764913071078983, "grad_norm": 0.30542537569999695, "learning_rate": 2.9999999999999997e-05, "loss": 4.5917, "step": 47685 }, { "epoch": 0.976593696885303, "grad_norm": 0.2987099587917328, "learning_rate": 2.9999999999999997e-05, "loss": 4.5065, "step": 47690 }, { "epoch": 0.9766960866627076, "grad_norm": 0.7516525983810425, "learning_rate": 2.9999999999999997e-05, "loss": 4.5825, "step": 47695 }, { "epoch": 0.9767984764401122, "grad_norm": 0.31141313910484314, "learning_rate": 2.9999999999999997e-05, "loss": 4.5883, "step": 47700 }, { "epoch": 0.9769008662175168, "grad_norm": 0.3016454875469208, "learning_rate": 2.9999999999999997e-05, "loss": 4.5647, "step": 47705 }, { "epoch": 0.9770032559949214, "grad_norm": 0.2989034354686737, "learning_rate": 2.9999999999999997e-05, "loss": 4.5689, "step": 47710 }, { "epoch": 0.9771056457723261, "grad_norm": 0.30508163571357727, "learning_rate": 2.9999999999999997e-05, "loss": 4.5473, "step": 47715 }, { "epoch": 0.9772080355497307, "grad_norm": 0.2942759394645691, "learning_rate": 2.9999999999999997e-05, "loss": 4.5663, "step": 47720 }, { "epoch": 0.9773104253271353, "grad_norm": 0.29524552822113037, "learning_rate": 2.9999999999999997e-05, "loss": 4.486, "step": 47725 }, { "epoch": 0.9774128151045399, "grad_norm": 0.29157382249832153, "learning_rate": 2.9999999999999997e-05, "loss": 4.5848, "step": 47730 }, { "epoch": 0.9775152048819445, "grad_norm": 0.30245131254196167, "learning_rate": 2.9999999999999997e-05, "loss": 4.5711, "step": 47735 }, { "epoch": 0.9776175946593492, "grad_norm": 0.2983751893043518, "learning_rate": 2.9999999999999997e-05, "loss": 4.549, "step": 47740 }, { "epoch": 0.9777199844367538, "grad_norm": 0.3162797689437866, "learning_rate": 2.9999999999999997e-05, "loss": 4.5891, "step": 47745 }, { "epoch": 0.9778223742141584, "grad_norm": 0.2861679792404175, "learning_rate": 2.9999999999999997e-05, "loss": 4.5444, "step": 47750 }, { "epoch": 0.977924763991563, "grad_norm": 0.3068501353263855, "learning_rate": 2.9999999999999997e-05, "loss": 4.5623, "step": 47755 }, { "epoch": 0.9780271537689678, "grad_norm": 0.2970331907272339, "learning_rate": 2.9999999999999997e-05, "loss": 4.5897, "step": 47760 }, { "epoch": 0.9781295435463724, "grad_norm": 0.28139594197273254, "learning_rate": 2.9999999999999997e-05, "loss": 4.5435, "step": 47765 }, { "epoch": 0.978231933323777, "grad_norm": 0.31797337532043457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5054, "step": 47770 }, { "epoch": 0.9783343231011816, "grad_norm": 0.29420650005340576, "learning_rate": 2.9999999999999997e-05, "loss": 4.5333, "step": 47775 }, { "epoch": 0.9784367128785862, "grad_norm": 0.29384753108024597, "learning_rate": 2.9999999999999997e-05, "loss": 4.5278, "step": 47780 }, { "epoch": 0.9785391026559909, "grad_norm": 0.30456167459487915, "learning_rate": 2.9999999999999997e-05, "loss": 4.5511, "step": 47785 }, { "epoch": 0.9786414924333955, "grad_norm": 0.2956148386001587, "learning_rate": 2.9999999999999997e-05, "loss": 4.5611, "step": 47790 }, { "epoch": 0.9787438822108001, "grad_norm": 0.2943970561027527, "learning_rate": 2.9999999999999997e-05, "loss": 4.6011, "step": 47795 }, { "epoch": 0.9788462719882047, "grad_norm": 0.2998138666152954, "learning_rate": 2.9999999999999997e-05, "loss": 4.5352, "step": 47800 }, { "epoch": 0.9789486617656094, "grad_norm": 0.31079331040382385, "learning_rate": 2.9999999999999997e-05, "loss": 4.3662, "step": 47805 }, { "epoch": 0.979051051543014, "grad_norm": 0.2993094325065613, "learning_rate": 2.9999999999999997e-05, "loss": 4.5344, "step": 47810 }, { "epoch": 0.9791534413204186, "grad_norm": 0.30916672945022583, "learning_rate": 2.9999999999999997e-05, "loss": 4.547, "step": 47815 }, { "epoch": 0.9792558310978232, "grad_norm": 0.3025127351284027, "learning_rate": 2.9999999999999997e-05, "loss": 4.597, "step": 47820 }, { "epoch": 0.9793582208752278, "grad_norm": 0.2969529628753662, "learning_rate": 2.9999999999999997e-05, "loss": 4.5992, "step": 47825 }, { "epoch": 0.9794606106526325, "grad_norm": 0.30451536178588867, "learning_rate": 2.9999999999999997e-05, "loss": 4.6101, "step": 47830 }, { "epoch": 0.9795630004300371, "grad_norm": 0.28235021233558655, "learning_rate": 2.9999999999999997e-05, "loss": 4.6024, "step": 47835 }, { "epoch": 0.9796653902074417, "grad_norm": 0.2974168658256531, "learning_rate": 2.9999999999999997e-05, "loss": 4.5382, "step": 47840 }, { "epoch": 0.9797677799848463, "grad_norm": 0.3060206174850464, "learning_rate": 2.9999999999999997e-05, "loss": 4.5494, "step": 47845 }, { "epoch": 0.9798701697622509, "grad_norm": 0.3062215745449066, "learning_rate": 2.9999999999999997e-05, "loss": 4.528, "step": 47850 }, { "epoch": 0.9799725595396556, "grad_norm": 0.3104513883590698, "learning_rate": 2.9999999999999997e-05, "loss": 4.5892, "step": 47855 }, { "epoch": 0.9800749493170602, "grad_norm": 0.2982814610004425, "learning_rate": 2.9999999999999997e-05, "loss": 4.5664, "step": 47860 }, { "epoch": 0.9801773390944648, "grad_norm": 0.29843899607658386, "learning_rate": 2.9999999999999997e-05, "loss": 4.5298, "step": 47865 }, { "epoch": 0.9802797288718694, "grad_norm": 0.29165926575660706, "learning_rate": 2.9999999999999997e-05, "loss": 4.5778, "step": 47870 }, { "epoch": 0.980382118649274, "grad_norm": 0.292339563369751, "learning_rate": 2.9999999999999997e-05, "loss": 4.548, "step": 47875 }, { "epoch": 0.9804845084266787, "grad_norm": 0.30853620171546936, "learning_rate": 2.9999999999999997e-05, "loss": 4.553, "step": 47880 }, { "epoch": 0.9805868982040833, "grad_norm": 0.3019552528858185, "learning_rate": 2.9999999999999997e-05, "loss": 4.5485, "step": 47885 }, { "epoch": 0.9806892879814879, "grad_norm": 0.2836818993091583, "learning_rate": 2.9999999999999997e-05, "loss": 4.5667, "step": 47890 }, { "epoch": 0.9807916777588925, "grad_norm": 0.3047296702861786, "learning_rate": 2.9999999999999997e-05, "loss": 4.6309, "step": 47895 }, { "epoch": 0.9808940675362972, "grad_norm": 0.3227206766605377, "learning_rate": 2.9999999999999997e-05, "loss": 4.6047, "step": 47900 }, { "epoch": 0.9809964573137018, "grad_norm": 0.3000172972679138, "learning_rate": 2.9999999999999997e-05, "loss": 4.5939, "step": 47905 }, { "epoch": 0.9810988470911064, "grad_norm": 0.31060072779655457, "learning_rate": 2.9999999999999997e-05, "loss": 4.5729, "step": 47910 }, { "epoch": 0.981201236868511, "grad_norm": 0.30266073346138, "learning_rate": 2.9999999999999997e-05, "loss": 4.5825, "step": 47915 }, { "epoch": 0.9813036266459156, "grad_norm": 0.28458890318870544, "learning_rate": 2.9999999999999997e-05, "loss": 4.5837, "step": 47920 }, { "epoch": 0.9814060164233203, "grad_norm": 0.4968605637550354, "learning_rate": 2.9999999999999997e-05, "loss": 4.5642, "step": 47925 }, { "epoch": 0.9815084062007249, "grad_norm": 0.29144537448883057, "learning_rate": 2.9999999999999997e-05, "loss": 4.5405, "step": 47930 }, { "epoch": 0.9816107959781295, "grad_norm": 0.2946797311306, "learning_rate": 2.9999999999999997e-05, "loss": 4.5616, "step": 47935 }, { "epoch": 0.9817131857555341, "grad_norm": 0.3165966272354126, "learning_rate": 2.9999999999999997e-05, "loss": 4.5244, "step": 47940 }, { "epoch": 0.9818155755329387, "grad_norm": 0.29605773091316223, "learning_rate": 2.9999999999999997e-05, "loss": 4.5738, "step": 47945 }, { "epoch": 0.9819179653103434, "grad_norm": 0.36024630069732666, "learning_rate": 2.9999999999999997e-05, "loss": 4.5367, "step": 47950 }, { "epoch": 0.982020355087748, "grad_norm": 0.30009180307388306, "learning_rate": 2.9999999999999997e-05, "loss": 4.6108, "step": 47955 }, { "epoch": 0.9821227448651526, "grad_norm": 0.3001779317855835, "learning_rate": 2.9999999999999997e-05, "loss": 4.6437, "step": 47960 }, { "epoch": 0.9822251346425573, "grad_norm": 0.3078416883945465, "learning_rate": 2.9999999999999997e-05, "loss": 4.5431, "step": 47965 }, { "epoch": 0.982327524419962, "grad_norm": 0.29466357827186584, "learning_rate": 2.9999999999999997e-05, "loss": 4.5391, "step": 47970 }, { "epoch": 0.9824299141973666, "grad_norm": 0.3046253025531769, "learning_rate": 2.9999999999999997e-05, "loss": 4.5418, "step": 47975 }, { "epoch": 0.9825323039747712, "grad_norm": 0.31107428669929504, "learning_rate": 2.9999999999999997e-05, "loss": 4.6064, "step": 47980 }, { "epoch": 0.9826346937521758, "grad_norm": 0.30249983072280884, "learning_rate": 2.9999999999999997e-05, "loss": 4.5771, "step": 47985 }, { "epoch": 0.9827370835295804, "grad_norm": 0.29450955986976624, "learning_rate": 2.9999999999999997e-05, "loss": 4.5659, "step": 47990 }, { "epoch": 0.9828394733069851, "grad_norm": 0.2966823875904083, "learning_rate": 2.9999999999999997e-05, "loss": 4.5811, "step": 47995 }, { "epoch": 0.9829418630843897, "grad_norm": 0.3255231976509094, "learning_rate": 2.9999999999999997e-05, "loss": 4.5493, "step": 48000 }, { "epoch": 0.9829418630843897, "eval_loss": 4.58038854598999, "eval_runtime": 17.1261, "eval_samples_per_second": 33.983, "eval_steps_per_second": 4.263, "step": 48000 }, { "epoch": 0.9830442528617943, "grad_norm": 0.29559794068336487, "learning_rate": 2.9999999999999997e-05, "loss": 4.5788, "step": 48005 }, { "epoch": 0.9831466426391989, "grad_norm": 0.2873050570487976, "learning_rate": 2.9999999999999997e-05, "loss": 4.6047, "step": 48010 }, { "epoch": 0.9832490324166036, "grad_norm": 0.3836257755756378, "learning_rate": 2.9999999999999997e-05, "loss": 4.5261, "step": 48015 }, { "epoch": 0.9833514221940082, "grad_norm": 0.2874748110771179, "learning_rate": 2.9999999999999997e-05, "loss": 4.6257, "step": 48020 }, { "epoch": 0.9834538119714128, "grad_norm": 0.2962822914123535, "learning_rate": 2.9999999999999997e-05, "loss": 4.5635, "step": 48025 }, { "epoch": 0.9835562017488174, "grad_norm": 0.31296682357788086, "learning_rate": 2.9999999999999997e-05, "loss": 4.5616, "step": 48030 }, { "epoch": 0.983658591526222, "grad_norm": 0.3060377240180969, "learning_rate": 2.9999999999999997e-05, "loss": 4.4271, "step": 48035 }, { "epoch": 0.9837609813036267, "grad_norm": 0.3045669198036194, "learning_rate": 2.9999999999999997e-05, "loss": 4.5513, "step": 48040 }, { "epoch": 0.9838633710810313, "grad_norm": 0.2947128117084503, "learning_rate": 2.9999999999999997e-05, "loss": 4.5827, "step": 48045 }, { "epoch": 0.9839657608584359, "grad_norm": 0.34637144207954407, "learning_rate": 2.9999999999999997e-05, "loss": 4.5234, "step": 48050 }, { "epoch": 0.9840681506358405, "grad_norm": 0.3139941990375519, "learning_rate": 2.9999999999999997e-05, "loss": 4.4869, "step": 48055 }, { "epoch": 0.9841705404132451, "grad_norm": 0.30256378650665283, "learning_rate": 2.9999999999999997e-05, "loss": 4.602, "step": 48060 }, { "epoch": 0.9842729301906498, "grad_norm": 0.3016190826892853, "learning_rate": 2.9999999999999997e-05, "loss": 4.4978, "step": 48065 }, { "epoch": 0.9843753199680544, "grad_norm": 0.30201447010040283, "learning_rate": 2.9999999999999997e-05, "loss": 4.5377, "step": 48070 }, { "epoch": 0.984477709745459, "grad_norm": 0.2965356111526489, "learning_rate": 2.9999999999999997e-05, "loss": 4.5965, "step": 48075 }, { "epoch": 0.9845800995228636, "grad_norm": 0.32482680678367615, "learning_rate": 2.9999999999999997e-05, "loss": 4.5602, "step": 48080 }, { "epoch": 0.9846824893002682, "grad_norm": 0.29524749517440796, "learning_rate": 2.9999999999999997e-05, "loss": 4.5763, "step": 48085 }, { "epoch": 0.9847848790776729, "grad_norm": 0.2916395366191864, "learning_rate": 2.9999999999999997e-05, "loss": 4.5982, "step": 48090 }, { "epoch": 0.9848872688550775, "grad_norm": 0.30824756622314453, "learning_rate": 2.9999999999999997e-05, "loss": 4.5612, "step": 48095 }, { "epoch": 0.9849896586324821, "grad_norm": 0.3042619228363037, "learning_rate": 2.9999999999999997e-05, "loss": 4.5865, "step": 48100 }, { "epoch": 0.9850920484098867, "grad_norm": 0.3027566373348236, "learning_rate": 2.9999999999999997e-05, "loss": 4.5936, "step": 48105 }, { "epoch": 0.9851944381872914, "grad_norm": 0.33060187101364136, "learning_rate": 2.9999999999999997e-05, "loss": 4.4758, "step": 48110 }, { "epoch": 0.985296827964696, "grad_norm": 0.29017114639282227, "learning_rate": 2.9999999999999997e-05, "loss": 4.5754, "step": 48115 }, { "epoch": 0.9853992177421006, "grad_norm": 0.3315523862838745, "learning_rate": 2.9999999999999997e-05, "loss": 4.5919, "step": 48120 }, { "epoch": 0.9855016075195052, "grad_norm": 0.3086949288845062, "learning_rate": 2.9999999999999997e-05, "loss": 4.6245, "step": 48125 }, { "epoch": 0.9856039972969098, "grad_norm": 0.3112105429172516, "learning_rate": 2.9999999999999997e-05, "loss": 4.5972, "step": 48130 }, { "epoch": 0.9857063870743145, "grad_norm": 0.3045254647731781, "learning_rate": 2.9999999999999997e-05, "loss": 4.53, "step": 48135 }, { "epoch": 0.9858087768517191, "grad_norm": 0.29396480321884155, "learning_rate": 2.9999999999999997e-05, "loss": 4.5782, "step": 48140 }, { "epoch": 0.9859111666291237, "grad_norm": 0.29180625081062317, "learning_rate": 2.9999999999999997e-05, "loss": 4.5637, "step": 48145 }, { "epoch": 0.9860135564065283, "grad_norm": 0.29800793528556824, "learning_rate": 2.9999999999999997e-05, "loss": 4.5683, "step": 48150 }, { "epoch": 0.986115946183933, "grad_norm": 0.3194294273853302, "learning_rate": 2.9999999999999997e-05, "loss": 4.5066, "step": 48155 }, { "epoch": 0.9862183359613376, "grad_norm": 0.2990261912345886, "learning_rate": 2.9999999999999997e-05, "loss": 4.6123, "step": 48160 }, { "epoch": 0.9863207257387423, "grad_norm": 0.3095386326313019, "learning_rate": 2.9999999999999997e-05, "loss": 4.6088, "step": 48165 }, { "epoch": 0.9864231155161469, "grad_norm": 0.29120439291000366, "learning_rate": 2.9999999999999997e-05, "loss": 4.5251, "step": 48170 }, { "epoch": 0.9865255052935515, "grad_norm": 0.32928574085235596, "learning_rate": 2.9999999999999997e-05, "loss": 4.4909, "step": 48175 }, { "epoch": 0.9866278950709562, "grad_norm": 0.2973651587963104, "learning_rate": 2.9999999999999997e-05, "loss": 4.526, "step": 48180 }, { "epoch": 0.9867302848483608, "grad_norm": 0.41812607645988464, "learning_rate": 2.9999999999999997e-05, "loss": 4.5729, "step": 48185 }, { "epoch": 0.9868326746257654, "grad_norm": 0.30254292488098145, "learning_rate": 2.9999999999999997e-05, "loss": 4.5442, "step": 48190 }, { "epoch": 0.98693506440317, "grad_norm": 0.29239410161972046, "learning_rate": 2.9999999999999997e-05, "loss": 4.5999, "step": 48195 }, { "epoch": 0.9870374541805746, "grad_norm": 0.288949579000473, "learning_rate": 2.9999999999999997e-05, "loss": 4.5229, "step": 48200 }, { "epoch": 0.9871398439579793, "grad_norm": 0.2900617718696594, "learning_rate": 2.9999999999999997e-05, "loss": 4.5424, "step": 48205 }, { "epoch": 0.9872422337353839, "grad_norm": 0.31114351749420166, "learning_rate": 2.9999999999999997e-05, "loss": 4.5506, "step": 48210 }, { "epoch": 0.9873446235127885, "grad_norm": 0.29957515001296997, "learning_rate": 2.9999999999999997e-05, "loss": 4.5503, "step": 48215 }, { "epoch": 0.9874470132901931, "grad_norm": 0.2843896150588989, "learning_rate": 2.9999999999999997e-05, "loss": 4.5402, "step": 48220 }, { "epoch": 0.9875494030675978, "grad_norm": 0.3023429811000824, "learning_rate": 2.9999999999999997e-05, "loss": 4.5835, "step": 48225 }, { "epoch": 0.9876517928450024, "grad_norm": 0.29364222288131714, "learning_rate": 2.9999999999999997e-05, "loss": 4.5936, "step": 48230 }, { "epoch": 0.987754182622407, "grad_norm": 0.2951924800872803, "learning_rate": 2.9999999999999997e-05, "loss": 4.5296, "step": 48235 }, { "epoch": 0.9878565723998116, "grad_norm": 0.3002241551876068, "learning_rate": 2.9999999999999997e-05, "loss": 4.6014, "step": 48240 }, { "epoch": 0.9879589621772162, "grad_norm": 0.3228725492954254, "learning_rate": 2.9999999999999997e-05, "loss": 4.5675, "step": 48245 }, { "epoch": 0.9880613519546209, "grad_norm": 0.2973325550556183, "learning_rate": 2.9999999999999997e-05, "loss": 4.5701, "step": 48250 }, { "epoch": 0.9881637417320255, "grad_norm": 0.3340296149253845, "learning_rate": 2.9999999999999997e-05, "loss": 4.5867, "step": 48255 }, { "epoch": 0.9882661315094301, "grad_norm": 0.29857170581817627, "learning_rate": 2.9999999999999997e-05, "loss": 4.5791, "step": 48260 }, { "epoch": 0.9883685212868347, "grad_norm": 0.31115448474884033, "learning_rate": 2.9999999999999997e-05, "loss": 4.5804, "step": 48265 }, { "epoch": 0.9884709110642393, "grad_norm": 0.33123666048049927, "learning_rate": 2.9999999999999997e-05, "loss": 4.5663, "step": 48270 }, { "epoch": 0.988573300841644, "grad_norm": 0.30342525243759155, "learning_rate": 2.9999999999999997e-05, "loss": 4.5603, "step": 48275 }, { "epoch": 0.9886756906190486, "grad_norm": 0.34847524762153625, "learning_rate": 2.9999999999999997e-05, "loss": 4.5266, "step": 48280 }, { "epoch": 0.9887780803964532, "grad_norm": 0.29856789112091064, "learning_rate": 2.9999999999999997e-05, "loss": 4.5256, "step": 48285 }, { "epoch": 0.9888804701738578, "grad_norm": 0.2932136654853821, "learning_rate": 2.9999999999999997e-05, "loss": 4.5389, "step": 48290 }, { "epoch": 0.9889828599512624, "grad_norm": 0.3099711835384369, "learning_rate": 2.9999999999999997e-05, "loss": 4.5411, "step": 48295 }, { "epoch": 0.9890852497286671, "grad_norm": 0.2868886888027191, "learning_rate": 2.9999999999999997e-05, "loss": 4.5768, "step": 48300 }, { "epoch": 0.9891876395060717, "grad_norm": 0.2987963855266571, "learning_rate": 2.9999999999999997e-05, "loss": 4.5785, "step": 48305 }, { "epoch": 0.9892900292834763, "grad_norm": 0.306930810213089, "learning_rate": 2.9999999999999997e-05, "loss": 4.5523, "step": 48310 }, { "epoch": 0.9893924190608809, "grad_norm": 0.3057742714881897, "learning_rate": 2.9999999999999997e-05, "loss": 4.5812, "step": 48315 }, { "epoch": 0.9894948088382856, "grad_norm": 0.29376351833343506, "learning_rate": 2.9999999999999997e-05, "loss": 4.5839, "step": 48320 }, { "epoch": 0.9895971986156902, "grad_norm": 0.3674677610397339, "learning_rate": 2.9999999999999997e-05, "loss": 4.5878, "step": 48325 }, { "epoch": 0.9896995883930948, "grad_norm": 0.3014540374279022, "learning_rate": 2.9999999999999997e-05, "loss": 4.5718, "step": 48330 }, { "epoch": 0.9898019781704994, "grad_norm": 0.5672804713249207, "learning_rate": 2.9999999999999997e-05, "loss": 4.5069, "step": 48335 }, { "epoch": 0.989904367947904, "grad_norm": 0.29727086424827576, "learning_rate": 2.9999999999999997e-05, "loss": 4.5248, "step": 48340 }, { "epoch": 0.9900067577253087, "grad_norm": 0.29575619101524353, "learning_rate": 2.9999999999999997e-05, "loss": 4.6139, "step": 48345 }, { "epoch": 0.9901091475027133, "grad_norm": 0.30974772572517395, "learning_rate": 2.9999999999999997e-05, "loss": 4.5381, "step": 48350 }, { "epoch": 0.9902115372801179, "grad_norm": 0.29965290427207947, "learning_rate": 2.9999999999999997e-05, "loss": 4.6347, "step": 48355 }, { "epoch": 0.9903139270575225, "grad_norm": 0.33263689279556274, "learning_rate": 2.9999999999999997e-05, "loss": 4.5864, "step": 48360 }, { "epoch": 0.9904163168349271, "grad_norm": 0.2891058325767517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5297, "step": 48365 }, { "epoch": 0.9905187066123319, "grad_norm": 0.31554582715034485, "learning_rate": 2.9999999999999997e-05, "loss": 4.5698, "step": 48370 }, { "epoch": 0.9906210963897365, "grad_norm": 0.2902148962020874, "learning_rate": 2.9999999999999997e-05, "loss": 4.5561, "step": 48375 }, { "epoch": 0.9907234861671411, "grad_norm": 0.2999386787414551, "learning_rate": 2.9999999999999997e-05, "loss": 4.5651, "step": 48380 }, { "epoch": 0.9908258759445457, "grad_norm": 0.318154901266098, "learning_rate": 2.9999999999999997e-05, "loss": 4.5657, "step": 48385 }, { "epoch": 0.9909282657219504, "grad_norm": 0.29872027039527893, "learning_rate": 2.9999999999999997e-05, "loss": 4.6069, "step": 48390 }, { "epoch": 0.991030655499355, "grad_norm": 0.30033451318740845, "learning_rate": 2.9999999999999997e-05, "loss": 4.596, "step": 48395 }, { "epoch": 0.9911330452767596, "grad_norm": 0.28618279099464417, "learning_rate": 2.9999999999999997e-05, "loss": 4.6149, "step": 48400 }, { "epoch": 0.9912354350541642, "grad_norm": 0.3052704334259033, "learning_rate": 2.9999999999999997e-05, "loss": 4.5962, "step": 48405 }, { "epoch": 0.9913378248315688, "grad_norm": 0.30052441358566284, "learning_rate": 2.9999999999999997e-05, "loss": 4.5889, "step": 48410 }, { "epoch": 0.9914402146089735, "grad_norm": 0.2918172776699066, "learning_rate": 2.9999999999999997e-05, "loss": 4.5396, "step": 48415 }, { "epoch": 0.9915426043863781, "grad_norm": 0.32806017994880676, "learning_rate": 2.9999999999999997e-05, "loss": 4.5853, "step": 48420 }, { "epoch": 0.9916449941637827, "grad_norm": 0.3079558312892914, "learning_rate": 2.9999999999999997e-05, "loss": 4.6285, "step": 48425 }, { "epoch": 0.9917473839411873, "grad_norm": 0.7797358632087708, "learning_rate": 2.9999999999999997e-05, "loss": 4.5935, "step": 48430 }, { "epoch": 0.991849773718592, "grad_norm": 0.30594924092292786, "learning_rate": 2.9999999999999997e-05, "loss": 4.5901, "step": 48435 }, { "epoch": 0.9919521634959966, "grad_norm": 0.299163818359375, "learning_rate": 2.9999999999999997e-05, "loss": 4.5498, "step": 48440 }, { "epoch": 0.9920545532734012, "grad_norm": 0.2934512197971344, "learning_rate": 2.9999999999999997e-05, "loss": 4.5368, "step": 48445 }, { "epoch": 0.9921569430508058, "grad_norm": 0.42306414246559143, "learning_rate": 2.9999999999999997e-05, "loss": 4.5747, "step": 48450 }, { "epoch": 0.9922593328282104, "grad_norm": 0.3244866132736206, "learning_rate": 2.9999999999999997e-05, "loss": 4.571, "step": 48455 }, { "epoch": 0.9923617226056151, "grad_norm": 0.2958906888961792, "learning_rate": 2.9999999999999997e-05, "loss": 4.5693, "step": 48460 }, { "epoch": 0.9924641123830197, "grad_norm": 0.46438702940940857, "learning_rate": 2.9999999999999997e-05, "loss": 4.6219, "step": 48465 }, { "epoch": 0.9925665021604243, "grad_norm": 0.2966323792934418, "learning_rate": 2.9999999999999997e-05, "loss": 4.5287, "step": 48470 }, { "epoch": 0.9926688919378289, "grad_norm": 0.36979418992996216, "learning_rate": 2.9999999999999997e-05, "loss": 4.5192, "step": 48475 }, { "epoch": 0.9927712817152335, "grad_norm": 0.29866376519203186, "learning_rate": 2.9999999999999997e-05, "loss": 4.5937, "step": 48480 }, { "epoch": 0.9928736714926382, "grad_norm": 0.29135310649871826, "learning_rate": 2.9999999999999997e-05, "loss": 4.5857, "step": 48485 }, { "epoch": 0.9929760612700428, "grad_norm": 0.32972216606140137, "learning_rate": 2.9999999999999997e-05, "loss": 4.5914, "step": 48490 }, { "epoch": 0.9930784510474474, "grad_norm": 0.30958205461502075, "learning_rate": 2.9999999999999997e-05, "loss": 4.589, "step": 48495 }, { "epoch": 0.993180840824852, "grad_norm": 0.29018616676330566, "learning_rate": 2.9999999999999997e-05, "loss": 4.5243, "step": 48500 }, { "epoch": 0.9932832306022567, "grad_norm": 0.3043172359466553, "learning_rate": 2.9999999999999997e-05, "loss": 4.5926, "step": 48505 }, { "epoch": 0.9933856203796613, "grad_norm": 0.3271157741546631, "learning_rate": 2.9999999999999997e-05, "loss": 4.5279, "step": 48510 }, { "epoch": 0.9934880101570659, "grad_norm": 0.30671265721321106, "learning_rate": 2.9999999999999997e-05, "loss": 4.541, "step": 48515 }, { "epoch": 0.9935903999344705, "grad_norm": 0.2887033224105835, "learning_rate": 2.9999999999999997e-05, "loss": 4.5568, "step": 48520 }, { "epoch": 0.9936927897118751, "grad_norm": 0.30974188446998596, "learning_rate": 2.9999999999999997e-05, "loss": 4.5086, "step": 48525 }, { "epoch": 0.9937951794892798, "grad_norm": 0.29903706908226013, "learning_rate": 2.9999999999999997e-05, "loss": 4.5542, "step": 48530 }, { "epoch": 0.9938975692666844, "grad_norm": 0.3158770501613617, "learning_rate": 2.9999999999999997e-05, "loss": 4.5431, "step": 48535 }, { "epoch": 0.993999959044089, "grad_norm": 0.2940845489501953, "learning_rate": 2.9999999999999997e-05, "loss": 4.5795, "step": 48540 }, { "epoch": 0.9941023488214936, "grad_norm": 0.2977944612503052, "learning_rate": 2.9999999999999997e-05, "loss": 4.5854, "step": 48545 }, { "epoch": 0.9942047385988982, "grad_norm": 0.29025569558143616, "learning_rate": 2.9999999999999997e-05, "loss": 4.5308, "step": 48550 }, { "epoch": 0.9943071283763029, "grad_norm": 0.3226446807384491, "learning_rate": 2.9999999999999997e-05, "loss": 4.5761, "step": 48555 }, { "epoch": 0.9944095181537075, "grad_norm": 0.2970184087753296, "learning_rate": 2.9999999999999997e-05, "loss": 4.5451, "step": 48560 }, { "epoch": 0.9945119079311121, "grad_norm": 0.2929586172103882, "learning_rate": 2.9999999999999997e-05, "loss": 4.5765, "step": 48565 }, { "epoch": 0.9946142977085168, "grad_norm": 0.3146069049835205, "learning_rate": 2.9999999999999997e-05, "loss": 4.5987, "step": 48570 }, { "epoch": 0.9947166874859215, "grad_norm": 0.2933782935142517, "learning_rate": 2.9999999999999997e-05, "loss": 4.5457, "step": 48575 }, { "epoch": 0.9948190772633261, "grad_norm": 0.30704832077026367, "learning_rate": 2.9999999999999997e-05, "loss": 4.5707, "step": 48580 }, { "epoch": 0.9949214670407307, "grad_norm": 0.3101729154586792, "learning_rate": 2.9999999999999997e-05, "loss": 4.5618, "step": 48585 }, { "epoch": 0.9950238568181353, "grad_norm": 0.2928634285926819, "learning_rate": 2.9999999999999997e-05, "loss": 4.5858, "step": 48590 }, { "epoch": 0.9951262465955399, "grad_norm": 0.3027421832084656, "learning_rate": 2.9999999999999997e-05, "loss": 4.5273, "step": 48595 }, { "epoch": 0.9952286363729446, "grad_norm": 0.3238886594772339, "learning_rate": 2.9999999999999997e-05, "loss": 4.5646, "step": 48600 }, { "epoch": 0.9953310261503492, "grad_norm": 0.30195388197898865, "learning_rate": 2.9999999999999997e-05, "loss": 4.5403, "step": 48605 }, { "epoch": 0.9954334159277538, "grad_norm": 0.31386324763298035, "learning_rate": 2.9999999999999997e-05, "loss": 4.5859, "step": 48610 }, { "epoch": 0.9955358057051584, "grad_norm": 0.32625406980514526, "learning_rate": 2.9999999999999997e-05, "loss": 4.5813, "step": 48615 }, { "epoch": 0.995638195482563, "grad_norm": 0.3073594868183136, "learning_rate": 2.9999999999999997e-05, "loss": 4.5636, "step": 48620 }, { "epoch": 0.9957405852599677, "grad_norm": 0.30962932109832764, "learning_rate": 2.9999999999999997e-05, "loss": 4.5483, "step": 48625 }, { "epoch": 0.9958429750373723, "grad_norm": 0.3184210956096649, "learning_rate": 2.9999999999999997e-05, "loss": 4.5602, "step": 48630 }, { "epoch": 0.9959453648147769, "grad_norm": 0.29708781838417053, "learning_rate": 2.9999999999999997e-05, "loss": 4.5435, "step": 48635 }, { "epoch": 0.9960477545921815, "grad_norm": 0.3089878559112549, "learning_rate": 2.9999999999999997e-05, "loss": 4.5665, "step": 48640 }, { "epoch": 0.9961501443695862, "grad_norm": 0.30662375688552856, "learning_rate": 2.9999999999999997e-05, "loss": 4.5029, "step": 48645 }, { "epoch": 0.9962525341469908, "grad_norm": 0.2912473976612091, "learning_rate": 2.9999999999999997e-05, "loss": 4.5023, "step": 48650 }, { "epoch": 0.9963549239243954, "grad_norm": 0.29344433546066284, "learning_rate": 2.9999999999999997e-05, "loss": 4.5046, "step": 48655 }, { "epoch": 0.9964573137018, "grad_norm": 0.3064979016780853, "learning_rate": 2.9999999999999997e-05, "loss": 4.5615, "step": 48660 }, { "epoch": 0.9965597034792046, "grad_norm": 0.31767401099205017, "learning_rate": 2.9999999999999997e-05, "loss": 4.5763, "step": 48665 }, { "epoch": 0.9966620932566093, "grad_norm": 0.3110029101371765, "learning_rate": 2.9999999999999997e-05, "loss": 4.5906, "step": 48670 }, { "epoch": 0.9967644830340139, "grad_norm": 0.2901157736778259, "learning_rate": 2.9999999999999997e-05, "loss": 4.5029, "step": 48675 }, { "epoch": 0.9968668728114185, "grad_norm": 0.2966289818286896, "learning_rate": 2.9999999999999997e-05, "loss": 4.5575, "step": 48680 }, { "epoch": 0.9969692625888231, "grad_norm": 0.34209704399108887, "learning_rate": 2.9999999999999997e-05, "loss": 4.5737, "step": 48685 }, { "epoch": 0.9970716523662277, "grad_norm": 0.31367409229278564, "learning_rate": 2.9999999999999997e-05, "loss": 4.6228, "step": 48690 }, { "epoch": 0.9971740421436324, "grad_norm": 0.29235970973968506, "learning_rate": 2.9999999999999997e-05, "loss": 4.6071, "step": 48695 }, { "epoch": 0.997276431921037, "grad_norm": 0.3141907751560211, "learning_rate": 2.9999999999999997e-05, "loss": 4.5596, "step": 48700 }, { "epoch": 0.9973788216984416, "grad_norm": 0.3179910480976105, "learning_rate": 2.9999999999999997e-05, "loss": 4.4919, "step": 48705 }, { "epoch": 0.9974812114758462, "grad_norm": 0.30955952405929565, "learning_rate": 2.9999999999999997e-05, "loss": 4.5668, "step": 48710 }, { "epoch": 0.9975836012532509, "grad_norm": 0.2875402271747589, "learning_rate": 2.9999999999999997e-05, "loss": 4.5615, "step": 48715 }, { "epoch": 0.9976859910306555, "grad_norm": 0.316728413105011, "learning_rate": 2.9999999999999997e-05, "loss": 4.6114, "step": 48720 }, { "epoch": 0.9977883808080601, "grad_norm": 0.2882649004459381, "learning_rate": 2.9999999999999997e-05, "loss": 4.5686, "step": 48725 }, { "epoch": 0.9978907705854647, "grad_norm": 0.31529128551483154, "learning_rate": 2.9999999999999997e-05, "loss": 4.5824, "step": 48730 }, { "epoch": 0.9979931603628693, "grad_norm": 0.29442816972732544, "learning_rate": 2.9999999999999997e-05, "loss": 4.5153, "step": 48735 }, { "epoch": 0.998095550140274, "grad_norm": 0.28840017318725586, "learning_rate": 2.9999999999999997e-05, "loss": 4.5885, "step": 48740 }, { "epoch": 0.9981979399176786, "grad_norm": 0.29690900444984436, "learning_rate": 2.9999999999999997e-05, "loss": 4.6002, "step": 48745 }, { "epoch": 0.9983003296950832, "grad_norm": 0.29731130599975586, "learning_rate": 2.9999999999999997e-05, "loss": 4.5881, "step": 48750 }, { "epoch": 0.9984027194724878, "grad_norm": 0.3224312663078308, "learning_rate": 2.9999999999999997e-05, "loss": 4.5788, "step": 48755 }, { "epoch": 0.9985051092498924, "grad_norm": 0.302958607673645, "learning_rate": 2.9999999999999997e-05, "loss": 4.5155, "step": 48760 }, { "epoch": 0.9986074990272971, "grad_norm": 0.29821866750717163, "learning_rate": 2.9999999999999997e-05, "loss": 4.5768, "step": 48765 }, { "epoch": 0.9987098888047017, "grad_norm": 0.29861757159233093, "learning_rate": 2.9999999999999997e-05, "loss": 4.6109, "step": 48770 }, { "epoch": 0.9988122785821064, "grad_norm": 0.31388548016548157, "learning_rate": 2.9999999999999997e-05, "loss": 4.5713, "step": 48775 }, { "epoch": 0.998914668359511, "grad_norm": 0.3197086751461029, "learning_rate": 2.9999999999999997e-05, "loss": 4.5466, "step": 48780 }, { "epoch": 0.9990170581369157, "grad_norm": 0.3147348165512085, "learning_rate": 2.9999999999999997e-05, "loss": 4.558, "step": 48785 }, { "epoch": 0.9991194479143203, "grad_norm": 0.29904741048812866, "learning_rate": 2.9999999999999997e-05, "loss": 4.6132, "step": 48790 }, { "epoch": 0.9992218376917249, "grad_norm": 0.2876797318458557, "learning_rate": 2.9999999999999997e-05, "loss": 4.5397, "step": 48795 }, { "epoch": 0.9993242274691295, "grad_norm": 0.3055194616317749, "learning_rate": 2.9999999999999997e-05, "loss": 4.5512, "step": 48800 }, { "epoch": 0.9994266172465341, "grad_norm": 0.2971906363964081, "learning_rate": 2.9999999999999997e-05, "loss": 4.5158, "step": 48805 }, { "epoch": 0.9995290070239388, "grad_norm": 0.30163276195526123, "learning_rate": 2.9999999999999997e-05, "loss": 4.6007, "step": 48810 }, { "epoch": 0.9996313968013434, "grad_norm": 0.30284538865089417, "learning_rate": 2.9999999999999997e-05, "loss": 4.5838, "step": 48815 }, { "epoch": 0.999733786578748, "grad_norm": 0.2849685847759247, "learning_rate": 2.9999999999999997e-05, "loss": 4.5669, "step": 48820 }, { "epoch": 0.9998361763561526, "grad_norm": 0.29327917098999023, "learning_rate": 2.9999999999999997e-05, "loss": 4.5324, "step": 48825 }, { "epoch": 0.9999385661335572, "grad_norm": 0.2930566370487213, "learning_rate": 2.9999999999999997e-05, "loss": 4.6017, "step": 48830 } ], "logging_steps": 5, "max_steps": 48833, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }