{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004, "grad_norm": 8.119571685791016, "learning_rate": 0.0, "loss": 3.4677, "step": 1 }, { "epoch": 0.004, "grad_norm": 8.098031997680664, "learning_rate": 9e-06, "loss": 4.4682, "step": 10 }, { "epoch": 0.008, "grad_norm": 8.463135719299316, "learning_rate": 1.9e-05, "loss": 4.6132, "step": 20 }, { "epoch": 0.012, "grad_norm": 7.452148914337158, "learning_rate": 2.9e-05, "loss": 4.223, "step": 30 }, { "epoch": 0.016, "grad_norm": 5.572236061096191, "learning_rate": 3.9000000000000006e-05, "loss": 4.4111, "step": 40 }, { "epoch": 0.02, "grad_norm": 7.39447021484375, "learning_rate": 4.9e-05, "loss": 4.0543, "step": 50 }, { "epoch": 0.024, "grad_norm": 9.954078674316406, "learning_rate": 4.999833521640187e-05, "loss": 4.5007, "step": 60 }, { "epoch": 0.028, "grad_norm": 5.994736194610596, "learning_rate": 4.9992580693557054e-05, "loss": 4.6204, "step": 70 }, { "epoch": 0.032, "grad_norm": 7.18951940536499, "learning_rate": 4.998271682453017e-05, "loss": 4.2467, "step": 80 }, { "epoch": 0.036, "grad_norm": 9.591761589050293, "learning_rate": 4.996874523116464e-05, "loss": 4.4063, "step": 90 }, { "epoch": 0.04, "grad_norm": 7.566534519195557, "learning_rate": 4.995066821070679e-05, "loss": 4.0773, "step": 100 }, { "epoch": 0.044, "grad_norm": 5.793056488037109, "learning_rate": 4.9928488735428105e-05, "loss": 4.063, "step": 110 }, { "epoch": 0.048, "grad_norm": 7.917051792144775, "learning_rate": 4.990221045213652e-05, "loss": 4.2533, "step": 120 }, { "epoch": 0.052, "grad_norm": 7.804363250732422, "learning_rate": 4.987183768157686e-05, "loss": 4.0497, "step": 130 }, { "epoch": 0.056, "grad_norm": 7.605108261108398, "learning_rate": 4.983737541772033e-05, "loss": 4.4334, "step": 140 }, { "epoch": 0.06, "grad_norm": 9.323838233947754, "learning_rate": 4.979882932694346e-05, "loss": 4.0412, "step": 150 }, { "epoch": 0.064, "grad_norm": 9.219818115234375, "learning_rate": 4.9756205747096385e-05, "loss": 3.9774, "step": 160 }, { "epoch": 0.068, "grad_norm": 8.375937461853027, "learning_rate": 4.9709511686460775e-05, "loss": 4.0021, "step": 170 }, { "epoch": 0.072, "grad_norm": 8.215436935424805, "learning_rate": 4.96587548225975e-05, "loss": 4.4227, "step": 180 }, { "epoch": 0.076, "grad_norm": 9.561295509338379, "learning_rate": 4.960394350108429e-05, "loss": 4.1091, "step": 190 }, { "epoch": 0.08, "grad_norm": 9.014528274536133, "learning_rate": 4.954508673414351e-05, "loss": 3.9428, "step": 200 }, { "epoch": 0.084, "grad_norm": 7.69431209564209, "learning_rate": 4.948219419916037e-05, "loss": 4.368, "step": 210 }, { "epoch": 0.088, "grad_norm": 11.925583839416504, "learning_rate": 4.941527623709172e-05, "loss": 3.6757, "step": 220 }, { "epoch": 0.092, "grad_norm": 8.191117286682129, "learning_rate": 4.934434385076576e-05, "loss": 4.1905, "step": 230 }, { "epoch": 0.096, "grad_norm": 7.863613128662109, "learning_rate": 4.926940870307296e-05, "loss": 4.0099, "step": 240 }, { "epoch": 0.1, "grad_norm": 10.665002822875977, "learning_rate": 4.9190483115048375e-05, "loss": 3.9059, "step": 250 }, { "epoch": 0.104, "grad_norm": 10.385906219482422, "learning_rate": 4.910758006384583e-05, "loss": 3.9221, "step": 260 }, { "epoch": 0.108, "grad_norm": 8.544922828674316, "learning_rate": 4.9020713180604126e-05, "loss": 3.9398, "step": 270 }, { "epoch": 0.112, "grad_norm": 7.989080429077148, "learning_rate": 4.892989674820585e-05, "loss": 3.7757, "step": 280 }, { "epoch": 0.116, "grad_norm": 6.576107025146484, "learning_rate": 4.8835145698928856e-05, "loss": 3.5309, "step": 290 }, { "epoch": 0.12, "grad_norm": 9.80089282989502, "learning_rate": 4.873647561199115e-05, "loss": 4.1776, "step": 300 }, { "epoch": 0.124, "grad_norm": 15.050427436828613, "learning_rate": 4.863390271098922e-05, "loss": 3.5808, "step": 310 }, { "epoch": 0.128, "grad_norm": 8.734102249145508, "learning_rate": 4.852744386123061e-05, "loss": 3.9796, "step": 320 }, { "epoch": 0.132, "grad_norm": 8.711186408996582, "learning_rate": 4.84171165669608e-05, "loss": 4.2317, "step": 330 }, { "epoch": 0.136, "grad_norm": 6.751059055328369, "learning_rate": 4.8302938968485144e-05, "loss": 3.7145, "step": 340 }, { "epoch": 0.14, "grad_norm": 10.623860359191895, "learning_rate": 4.8184929839186196e-05, "loss": 3.9616, "step": 350 }, { "epoch": 0.144, "grad_norm": 7.5071330070495605, "learning_rate": 4.806310858243694e-05, "loss": 4.0164, "step": 360 }, { "epoch": 0.148, "grad_norm": 8.607765197753906, "learning_rate": 4.793749522841042e-05, "loss": 4.4924, "step": 370 }, { "epoch": 0.152, "grad_norm": 8.406026840209961, "learning_rate": 4.780811043078636e-05, "loss": 3.4254, "step": 380 }, { "epoch": 0.156, "grad_norm": 9.387131690979004, "learning_rate": 4.767497546335519e-05, "loss": 3.9158, "step": 390 }, { "epoch": 0.16, "grad_norm": 7.5071258544921875, "learning_rate": 4.753811221652017e-05, "loss": 4.1042, "step": 400 }, { "epoch": 0.164, "grad_norm": 6.716228008270264, "learning_rate": 4.739754319369814e-05, "loss": 3.8632, "step": 410 }, { "epoch": 0.168, "grad_norm": 9.47385311126709, "learning_rate": 4.7253291507619404e-05, "loss": 3.7837, "step": 420 }, { "epoch": 0.172, "grad_norm": 7.547070026397705, "learning_rate": 4.710538087652748e-05, "loss": 4.0398, "step": 430 }, { "epoch": 0.176, "grad_norm": 13.61339282989502, "learning_rate": 4.695383562027933e-05, "loss": 3.7789, "step": 440 }, { "epoch": 0.18, "grad_norm": 6.434921741485596, "learning_rate": 4.679868065634656e-05, "loss": 3.9506, "step": 450 }, { "epoch": 0.184, "grad_norm": 7.865591049194336, "learning_rate": 4.663994149571849e-05, "loss": 3.4036, "step": 460 }, { "epoch": 0.188, "grad_norm": 9.311790466308594, "learning_rate": 4.647764423870751e-05, "loss": 4.1299, "step": 470 }, { "epoch": 0.192, "grad_norm": 8.277907371520996, "learning_rate": 4.631181557065761e-05, "loss": 4.0614, "step": 480 }, { "epoch": 0.196, "grad_norm": 12.398967742919922, "learning_rate": 4.614248275755676e-05, "loss": 3.7492, "step": 490 }, { "epoch": 0.2, "grad_norm": 7.308017730712891, "learning_rate": 4.5969673641553685e-05, "loss": 4.1606, "step": 500 }, { "epoch": 0.204, "grad_norm": 6.510436058044434, "learning_rate": 4.579341663638004e-05, "loss": 3.5708, "step": 510 }, { "epoch": 0.208, "grad_norm": 10.914970397949219, "learning_rate": 4.5613740722678525e-05, "loss": 3.4741, "step": 520 }, { "epoch": 0.212, "grad_norm": 8.786978721618652, "learning_rate": 4.5430675443237817e-05, "loss": 3.6204, "step": 530 }, { "epoch": 0.216, "grad_norm": 10.566540718078613, "learning_rate": 4.524425089813507e-05, "loss": 3.9298, "step": 540 }, { "epoch": 0.22, "grad_norm": 8.051084518432617, "learning_rate": 4.505449773978677e-05, "loss": 3.7783, "step": 550 }, { "epoch": 0.224, "grad_norm": 11.182727813720703, "learning_rate": 4.4861447167908824e-05, "loss": 3.8174, "step": 560 }, { "epoch": 0.228, "grad_norm": 11.375614166259766, "learning_rate": 4.466513092438653e-05, "loss": 4.0511, "step": 570 }, { "epoch": 0.232, "grad_norm": 8.66441535949707, "learning_rate": 4.446558128805561e-05, "loss": 3.7058, "step": 580 }, { "epoch": 0.236, "grad_norm": 5.743879795074463, "learning_rate": 4.426283106939474e-05, "loss": 3.817, "step": 590 }, { "epoch": 0.24, "grad_norm": 10.231585502624512, "learning_rate": 4.4056913605130804e-05, "loss": 3.9779, "step": 600 }, { "epoch": 0.244, "grad_norm": 5.9834465980529785, "learning_rate": 4.3847862752757604e-05, "loss": 3.4466, "step": 610 }, { "epoch": 0.248, "grad_norm": 9.162353515625, "learning_rate": 4.363571288496888e-05, "loss": 3.576, "step": 620 }, { "epoch": 0.252, "grad_norm": 10.162070274353027, "learning_rate": 4.342049888400669e-05, "loss": 4.084, "step": 630 }, { "epoch": 0.256, "grad_norm": 12.520784378051758, "learning_rate": 4.3202256135925956e-05, "loss": 3.6774, "step": 640 }, { "epoch": 0.26, "grad_norm": 10.249221801757812, "learning_rate": 4.298102052477621e-05, "loss": 3.9724, "step": 650 }, { "epoch": 0.264, "grad_norm": 10.406034469604492, "learning_rate": 4.2756828426701426e-05, "loss": 3.9906, "step": 660 }, { "epoch": 0.268, "grad_norm": 15.699187278747559, "learning_rate": 4.2529716703959024e-05, "loss": 3.2696, "step": 670 }, { "epoch": 0.272, "grad_norm": 10.240876197814941, "learning_rate": 4.229972269885877e-05, "loss": 3.2456, "step": 680 }, { "epoch": 0.276, "grad_norm": 11.525603294372559, "learning_rate": 4.206688422762295e-05, "loss": 3.6349, "step": 690 }, { "epoch": 0.28, "grad_norm": 6.479814052581787, "learning_rate": 4.1831239574168493e-05, "loss": 3.5024, "step": 700 }, { "epoch": 0.284, "grad_norm": 10.296248435974121, "learning_rate": 4.159282748381218e-05, "loss": 4.0567, "step": 710 }, { "epoch": 0.288, "grad_norm": 13.293269157409668, "learning_rate": 4.135168715690015e-05, "loss": 3.9591, "step": 720 }, { "epoch": 0.292, "grad_norm": 7.214468479156494, "learning_rate": 4.110785824236236e-05, "loss": 3.8723, "step": 730 }, { "epoch": 0.296, "grad_norm": 8.655447006225586, "learning_rate": 4.086138083119347e-05, "loss": 3.7503, "step": 740 }, { "epoch": 0.3, "grad_norm": 12.488017082214355, "learning_rate": 4.061229544986095e-05, "loss": 3.6059, "step": 750 }, { "epoch": 0.304, "grad_norm": 10.392841339111328, "learning_rate": 4.036064305364162e-05, "loss": 3.7607, "step": 760 }, { "epoch": 0.308, "grad_norm": 13.796865463256836, "learning_rate": 4.010646501988769e-05, "loss": 3.3188, "step": 770 }, { "epoch": 0.312, "grad_norm": 6.374794006347656, "learning_rate": 3.9849803141223324e-05, "loss": 3.3962, "step": 780 }, { "epoch": 0.316, "grad_norm": 9.044532775878906, "learning_rate": 3.9590699618673086e-05, "loss": 3.9154, "step": 790 }, { "epoch": 0.32, "grad_norm": 8.740546226501465, "learning_rate": 3.932919705472306e-05, "loss": 3.4457, "step": 800 }, { "epoch": 0.324, "grad_norm": 10.944662094116211, "learning_rate": 3.906533844631604e-05, "loss": 3.6514, "step": 810 }, { "epoch": 0.328, "grad_norm": 8.953042984008789, "learning_rate": 3.879916717778191e-05, "loss": 3.705, "step": 820 }, { "epoch": 0.332, "grad_norm": 10.540362358093262, "learning_rate": 3.8530727013704215e-05, "loss": 3.4666, "step": 830 }, { "epoch": 0.336, "grad_norm": 8.946858406066895, "learning_rate": 3.826006209172433e-05, "loss": 3.8688, "step": 840 }, { "epoch": 0.34, "grad_norm": 8.028446197509766, "learning_rate": 3.7987216915284184e-05, "loss": 3.6068, "step": 850 }, { "epoch": 0.344, "grad_norm": 13.014655113220215, "learning_rate": 3.771223634630892e-05, "loss": 3.883, "step": 860 }, { "epoch": 0.348, "grad_norm": 7.317591667175293, "learning_rate": 3.743516559783055e-05, "loss": 3.8452, "step": 870 }, { "epoch": 0.352, "grad_norm": 9.003655433654785, "learning_rate": 3.7156050226553956e-05, "loss": 3.5083, "step": 880 }, { "epoch": 0.356, "grad_norm": 8.790939331054688, "learning_rate": 3.687493612536628e-05, "loss": 3.6303, "step": 890 }, { "epoch": 0.36, "grad_norm": 9.35024642944336, "learning_rate": 3.659186951579111e-05, "loss": 3.3183, "step": 900 }, { "epoch": 0.364, "grad_norm": 12.418292045593262, "learning_rate": 3.630689694038866e-05, "loss": 3.6162, "step": 910 }, { "epoch": 0.368, "grad_norm": 9.97085952758789, "learning_rate": 3.6020065255103056e-05, "loss": 3.6587, "step": 920 }, { "epoch": 0.372, "grad_norm": 11.682862281799316, "learning_rate": 3.573142162155819e-05, "loss": 3.679, "step": 930 }, { "epoch": 0.376, "grad_norm": 10.91349983215332, "learning_rate": 3.544101349930328e-05, "loss": 3.5703, "step": 940 }, { "epoch": 0.38, "grad_norm": 7.593992710113525, "learning_rate": 3.514888863800944e-05, "loss": 3.0866, "step": 950 }, { "epoch": 0.384, "grad_norm": 7.078611850738525, "learning_rate": 3.485509506961856e-05, "loss": 3.5236, "step": 960 }, { "epoch": 0.388, "grad_norm": 4.66752290725708, "learning_rate": 3.4559681100445756e-05, "loss": 3.0979, "step": 970 }, { "epoch": 0.392, "grad_norm": 11.089188575744629, "learning_rate": 3.4262695303236724e-05, "loss": 3.5252, "step": 980 }, { "epoch": 0.396, "grad_norm": 9.009184837341309, "learning_rate": 3.396418650918127e-05, "loss": 3.7062, "step": 990 }, { "epoch": 0.4, "grad_norm": 7.165460109710693, "learning_rate": 3.366420379988441e-05, "loss": 3.4182, "step": 1000 }, { "epoch": 0.404, "grad_norm": 13.719085693359375, "learning_rate": 3.336279649929614e-05, "loss": 3.6603, "step": 1010 }, { "epoch": 0.408, "grad_norm": 10.594961166381836, "learning_rate": 3.306001416560156e-05, "loss": 3.824, "step": 1020 }, { "epoch": 0.412, "grad_norm": 9.565075874328613, "learning_rate": 3.275590658307234e-05, "loss": 3.074, "step": 1030 }, { "epoch": 0.416, "grad_norm": 11.031000137329102, "learning_rate": 3.245052375388107e-05, "loss": 3.3561, "step": 1040 }, { "epoch": 0.42, "grad_norm": 8.683501243591309, "learning_rate": 3.214391588987976e-05, "loss": 3.4976, "step": 1050 }, { "epoch": 0.424, "grad_norm": 7.569673538208008, "learning_rate": 3.1836133404343885e-05, "loss": 3.3982, "step": 1060 }, { "epoch": 0.428, "grad_norm": 9.724939346313477, "learning_rate": 3.1527226903683286e-05, "loss": 3.1605, "step": 1070 }, { "epoch": 0.432, "grad_norm": 11.795547485351562, "learning_rate": 3.121724717912138e-05, "loss": 3.4858, "step": 1080 }, { "epoch": 0.436, "grad_norm": 10.01028823852539, "learning_rate": 3.090624519834383e-05, "loss": 3.5917, "step": 1090 }, { "epoch": 0.44, "grad_norm": 10.159195899963379, "learning_rate": 3.0594272097118436e-05, "loss": 3.5127, "step": 1100 }, { "epoch": 0.444, "grad_norm": 12.02109432220459, "learning_rate": 3.028137917088716e-05, "loss": 3.7095, "step": 1110 }, { "epoch": 0.448, "grad_norm": 11.922860145568848, "learning_rate": 2.9967617866331997e-05, "loss": 3.0155, "step": 1120 }, { "epoch": 0.452, "grad_norm": 7.406614780426025, "learning_rate": 2.9653039772916052e-05, "loss": 3.5601, "step": 1130 }, { "epoch": 0.456, "grad_norm": 9.041807174682617, "learning_rate": 2.9337696614400977e-05, "loss": 3.4362, "step": 1140 }, { "epoch": 0.46, "grad_norm": 7.662649631500244, "learning_rate": 2.902164024034246e-05, "loss": 3.2583, "step": 1150 }, { "epoch": 0.464, "grad_norm": 10.045381546020508, "learning_rate": 2.8704922617564983e-05, "loss": 3.5937, "step": 1160 }, { "epoch": 0.468, "grad_norm": 10.429932594299316, "learning_rate": 2.8387595821617275e-05, "loss": 3.5703, "step": 1170 }, { "epoch": 0.472, "grad_norm": 13.951080322265625, "learning_rate": 2.8069712028209927e-05, "loss": 3.2037, "step": 1180 }, { "epoch": 0.476, "grad_norm": 9.768102645874023, "learning_rate": 2.7751323504636544e-05, "loss": 3.2948, "step": 1190 }, { "epoch": 0.48, "grad_norm": 16.445524215698242, "learning_rate": 2.7432482601179794e-05, "loss": 3.7049, "step": 1200 }, { "epoch": 0.484, "grad_norm": 10.077542304992676, "learning_rate": 2.711324174250382e-05, "loss": 3.7272, "step": 1210 }, { "epoch": 0.488, "grad_norm": 10.981230735778809, "learning_rate": 2.6793653419034482e-05, "loss": 3.3686, "step": 1220 }, { "epoch": 0.492, "grad_norm": 8.846978187561035, "learning_rate": 2.6473770178328715e-05, "loss": 3.7523, "step": 1230 }, { "epoch": 0.496, "grad_norm": 13.945764541625977, "learning_rate": 2.6153644616434526e-05, "loss": 3.5152, "step": 1240 }, { "epoch": 0.5, "grad_norm": 10.375041961669922, "learning_rate": 2.583332936924299e-05, "loss": 3.4198, "step": 1250 }, { "epoch": 0.504, "grad_norm": 7.862137794494629, "learning_rate": 2.5512877103833783e-05, "loss": 3.4253, "step": 1260 }, { "epoch": 0.508, "grad_norm": 9.651905059814453, "learning_rate": 2.519234050981543e-05, "loss": 2.9916, "step": 1270 }, { "epoch": 0.512, "grad_norm": 8.323561668395996, "learning_rate": 2.4871772290662044e-05, "loss": 3.0336, "step": 1280 }, { "epoch": 0.516, "grad_norm": 7.276916980743408, "learning_rate": 2.4551225155047573e-05, "loss": 3.3251, "step": 1290 }, { "epoch": 0.52, "grad_norm": 9.36464786529541, "learning_rate": 2.423075180817938e-05, "loss": 3.0858, "step": 1300 }, { "epoch": 0.524, "grad_norm": 9.869660377502441, "learning_rate": 2.391040494313229e-05, "loss": 3.2847, "step": 1310 }, { "epoch": 0.528, "grad_norm": 8.658061981201172, "learning_rate": 2.3590237232184644e-05, "loss": 3.1331, "step": 1320 }, { "epoch": 0.532, "grad_norm": 8.946754455566406, "learning_rate": 2.3270301318157792e-05, "loss": 3.4923, "step": 1330 }, { "epoch": 0.536, "grad_norm": 10.488960266113281, "learning_rate": 2.2950649805760438e-05, "loss": 3.2958, "step": 1340 }, { "epoch": 0.54, "grad_norm": 12.32264518737793, "learning_rate": 2.263133525293918e-05, "loss": 2.9298, "step": 1350 }, { "epoch": 0.544, "grad_norm": 14.110706329345703, "learning_rate": 2.2312410162236883e-05, "loss": 3.2753, "step": 1360 }, { "epoch": 0.548, "grad_norm": 11.187686920166016, "learning_rate": 2.1993926972159972e-05, "loss": 3.4152, "step": 1370 }, { "epoch": 0.552, "grad_norm": 10.895075798034668, "learning_rate": 2.1675938048556446e-05, "loss": 3.4019, "step": 1380 }, { "epoch": 0.556, "grad_norm": 5.504537105560303, "learning_rate": 2.1358495676005664e-05, "loss": 3.167, "step": 1390 }, { "epoch": 0.56, "grad_norm": 8.452468872070312, "learning_rate": 2.1041652049221648e-05, "loss": 3.0729, "step": 1400 }, { "epoch": 0.564, "grad_norm": 11.04509449005127, "learning_rate": 2.0725459264471047e-05, "loss": 3.642, "step": 1410 }, { "epoch": 0.568, "grad_norm": 8.009263038635254, "learning_rate": 2.0409969311007335e-05, "loss": 3.1349, "step": 1420 }, { "epoch": 0.572, "grad_norm": 8.250015258789062, "learning_rate": 2.009523406252263e-05, "loss": 3.4037, "step": 1430 }, { "epoch": 0.576, "grad_norm": 6.933814525604248, "learning_rate": 1.9781305268618417e-05, "loss": 3.2761, "step": 1440 }, { "epoch": 0.58, "grad_norm": 8.798672676086426, "learning_rate": 1.9468234546296844e-05, "loss": 3.2963, "step": 1450 }, { "epoch": 0.584, "grad_norm": 8.615999221801758, "learning_rate": 1.9156073371473618e-05, "loss": 3.3487, "step": 1460 }, { "epoch": 0.588, "grad_norm": 6.798926830291748, "learning_rate": 1.8844873070514272e-05, "loss": 3.2746, "step": 1470 }, { "epoch": 0.592, "grad_norm": 8.364091873168945, "learning_rate": 1.8534684811794893e-05, "loss": 3.071, "step": 1480 }, { "epoch": 0.596, "grad_norm": 6.177745342254639, "learning_rate": 1.822555959728892e-05, "loss": 2.8733, "step": 1490 }, { "epoch": 0.6, "grad_norm": 7.9252238273620605, "learning_rate": 1.7917548254181273e-05, "loss": 3.0836, "step": 1500 }, { "epoch": 0.604, "grad_norm": 10.669748306274414, "learning_rate": 1.7610701426511128e-05, "loss": 3.587, "step": 1510 }, { "epoch": 0.608, "grad_norm": 6.509505271911621, "learning_rate": 1.7305069566845046e-05, "loss": 2.8579, "step": 1520 }, { "epoch": 0.612, "grad_norm": 8.801206588745117, "learning_rate": 1.7000702927981254e-05, "loss": 3.5055, "step": 1530 }, { "epoch": 0.616, "grad_norm": 13.360625267028809, "learning_rate": 1.669765155468708e-05, "loss": 3.007, "step": 1540 }, { "epoch": 0.62, "grad_norm": 9.038350105285645, "learning_rate": 1.6395965275470393e-05, "loss": 3.546, "step": 1550 }, { "epoch": 0.624, "grad_norm": 12.468111038208008, "learning_rate": 1.6095693694386697e-05, "loss": 3.046, "step": 1560 }, { "epoch": 0.628, "grad_norm": 9.378480911254883, "learning_rate": 1.5796886182883053e-05, "loss": 2.9804, "step": 1570 }, { "epoch": 0.632, "grad_norm": 8.186980247497559, "learning_rate": 1.549959187168038e-05, "loss": 3.1672, "step": 1580 }, { "epoch": 0.636, "grad_norm": 13.096222877502441, "learning_rate": 1.520385964269519e-05, "loss": 3.0177, "step": 1590 }, { "epoch": 0.64, "grad_norm": 9.109463691711426, "learning_rate": 1.4909738121002276e-05, "loss": 3.101, "step": 1600 }, { "epoch": 0.644, "grad_norm": 8.423794746398926, "learning_rate": 1.4617275666839725e-05, "loss": 2.8508, "step": 1610 }, { "epoch": 0.648, "grad_norm": 6.3356122970581055, "learning_rate": 1.4326520367657314e-05, "loss": 3.3239, "step": 1620 }, { "epoch": 0.652, "grad_norm": 8.81240177154541, "learning_rate": 1.4037520030209934e-05, "loss": 3.2261, "step": 1630 }, { "epoch": 0.656, "grad_norm": 7.250948905944824, "learning_rate": 1.3750322172696972e-05, "loss": 3.1138, "step": 1640 }, { "epoch": 0.66, "grad_norm": 9.962249755859375, "learning_rate": 1.3464974016949342e-05, "loss": 3.2969, "step": 1650 }, { "epoch": 0.664, "grad_norm": 7.954286575317383, "learning_rate": 1.3181522480665098e-05, "loss": 2.7313, "step": 1660 }, { "epoch": 0.668, "grad_norm": 11.184345245361328, "learning_rate": 1.2900014169695082e-05, "loss": 3.2666, "step": 1670 }, { "epoch": 0.672, "grad_norm": 8.145426750183105, "learning_rate": 1.262049537037992e-05, "loss": 2.7759, "step": 1680 }, { "epoch": 0.676, "grad_norm": 12.047683715820312, "learning_rate": 1.2343012041939469e-05, "loss": 2.9462, "step": 1690 }, { "epoch": 0.68, "grad_norm": 11.436731338500977, "learning_rate": 1.2067609808916086e-05, "loss": 3.362, "step": 1700 }, { "epoch": 0.684, "grad_norm": 7.729074954986572, "learning_rate": 1.1794333953672893e-05, "loss": 3.4444, "step": 1710 }, { "epoch": 0.688, "grad_norm": 11.743609428405762, "learning_rate": 1.1523229408948394e-05, "loss": 2.9723, "step": 1720 }, { "epoch": 0.692, "grad_norm": 7.753131866455078, "learning_rate": 1.1254340750468445e-05, "loss": 3.0701, "step": 1730 }, { "epoch": 0.696, "grad_norm": 10.646190643310547, "learning_rate": 1.0987712189617049e-05, "loss": 3.3374, "step": 1740 }, { "epoch": 0.7, "grad_norm": 8.74120044708252, "learning_rate": 1.0723387566166979e-05, "loss": 3.0917, "step": 1750 }, { "epoch": 0.704, "grad_norm": 9.45445728302002, "learning_rate": 1.0461410341071528e-05, "loss": 3.2809, "step": 1760 }, { "epoch": 0.708, "grad_norm": 11.984269142150879, "learning_rate": 1.0201823589318554e-05, "loss": 3.256, "step": 1770 }, { "epoch": 0.712, "grad_norm": 6.780118465423584, "learning_rate": 9.944669992847946e-06, "loss": 3.0955, "step": 1780 }, { "epoch": 0.716, "grad_norm": 10.487933158874512, "learning_rate": 9.689991833533804e-06, "loss": 3.1214, "step": 1790 }, { "epoch": 0.72, "grad_norm": 7.70168399810791, "learning_rate": 9.437830986232265e-06, "loss": 3.052, "step": 1800 }, { "epoch": 0.724, "grad_norm": 13.816009521484375, "learning_rate": 9.188228911896412e-06, "loss": 3.4094, "step": 1810 }, { "epoch": 0.728, "grad_norm": 8.344259262084961, "learning_rate": 8.94122665075909e-06, "loss": 3.0472, "step": 1820 }, { "epoch": 0.732, "grad_norm": 19.413257598876953, "learning_rate": 8.696864815584995e-06, "loss": 2.6052, "step": 1830 }, { "epoch": 0.736, "grad_norm": 10.31498908996582, "learning_rate": 8.455183584993009e-06, "loss": 3.0981, "step": 1840 }, { "epoch": 0.74, "grad_norm": 11.46462345123291, "learning_rate": 8.2162226968499e-06, "loss": 3.1952, "step": 1850 }, { "epoch": 0.744, "grad_norm": 9.817370414733887, "learning_rate": 7.980021441736576e-06, "loss": 2.9148, "step": 1860 }, { "epoch": 0.748, "grad_norm": 12.085224151611328, "learning_rate": 7.746618656487748e-06, "loss": 3.1418, "step": 1870 }, { "epoch": 0.752, "grad_norm": 13.42601490020752, "learning_rate": 7.516052717806346e-06, "loss": 3.0495, "step": 1880 }, { "epoch": 0.756, "grad_norm": 10.328361511230469, "learning_rate": 7.288361535953472e-06, "loss": 3.2537, "step": 1890 }, { "epoch": 0.76, "grad_norm": 8.320837020874023, "learning_rate": 7.06358254851513e-06, "loss": 3.2002, "step": 1900 }, { "epoch": 0.764, "grad_norm": 12.367525100708008, "learning_rate": 6.841752714246588e-06, "loss": 3.415, "step": 1910 }, { "epoch": 0.768, "grad_norm": 8.72415828704834, "learning_rate": 6.622908506995581e-06, "loss": 2.7481, "step": 1920 }, { "epoch": 0.772, "grad_norm": 9.888436317443848, "learning_rate": 6.407085909705157e-06, "loss": 3.4815, "step": 1930 }, { "epoch": 0.776, "grad_norm": 7.541075706481934, "learning_rate": 6.194320408497245e-06, "loss": 3.4048, "step": 1940 }, { "epoch": 0.78, "grad_norm": 11.171248435974121, "learning_rate": 5.98464698683798e-06, "loss": 3.5409, "step": 1950 }, { "epoch": 0.784, "grad_norm": 9.28205394744873, "learning_rate": 5.778100119785587e-06, "loss": 3.1082, "step": 1960 }, { "epoch": 0.788, "grad_norm": 8.433388710021973, "learning_rate": 5.5747137683219404e-06, "loss": 2.9565, "step": 1970 }, { "epoch": 0.792, "grad_norm": 14.938470840454102, "learning_rate": 5.374521373768549e-06, "loss": 3.2282, "step": 1980 }, { "epoch": 0.796, "grad_norm": 9.903738975524902, "learning_rate": 5.177555852288119e-06, "loss": 2.9652, "step": 1990 }, { "epoch": 0.8, "grad_norm": 13.002461433410645, "learning_rate": 4.983849589472348e-06, "loss": 3.221, "step": 2000 }, { "epoch": 0.804, "grad_norm": 12.107378005981445, "learning_rate": 4.793434435016986e-06, "loss": 3.1341, "step": 2010 }, { "epoch": 0.808, "grad_norm": 11.94257640838623, "learning_rate": 4.606341697485087e-06, "loss": 3.318, "step": 2020 }, { "epoch": 0.812, "grad_norm": 10.116772651672363, "learning_rate": 4.422602139159091e-06, "loss": 3.2286, "step": 2030 }, { "epoch": 0.816, "grad_norm": 10.068933486938477, "learning_rate": 4.242245970982883e-06, "loss": 3.306, "step": 2040 }, { "epoch": 0.82, "grad_norm": 10.280326843261719, "learning_rate": 4.065302847594369e-06, "loss": 3.005, "step": 2050 }, { "epoch": 0.824, "grad_norm": 10.214073181152344, "learning_rate": 3.891801862449629e-06, "loss": 2.9953, "step": 2060 }, { "epoch": 0.828, "grad_norm": 12.787151336669922, "learning_rate": 3.721771543039254e-06, "loss": 2.9877, "step": 2070 }, { "epoch": 0.832, "grad_norm": 7.119079113006592, "learning_rate": 3.5552398461978277e-06, "loss": 3.0851, "step": 2080 }, { "epoch": 0.836, "grad_norm": 6.1061177253723145, "learning_rate": 3.3922341535071483e-06, "loss": 2.9198, "step": 2090 }, { "epoch": 0.84, "grad_norm": 9.866963386535645, "learning_rate": 3.23278126679408e-06, "loss": 2.9846, "step": 2100 }, { "epoch": 0.844, "grad_norm": 9.084943771362305, "learning_rate": 3.0769074037237583e-06, "loss": 2.9903, "step": 2110 }, { "epoch": 0.848, "grad_norm": 6.5540595054626465, "learning_rate": 2.9246381934887684e-06, "loss": 3.2851, "step": 2120 }, { "epoch": 0.852, "grad_norm": 7.740701675415039, "learning_rate": 2.7759986725951703e-06, "loss": 2.9797, "step": 2130 }, { "epoch": 0.856, "grad_norm": 10.074856758117676, "learning_rate": 2.6310132807458894e-06, "loss": 3.1325, "step": 2140 }, { "epoch": 0.86, "grad_norm": 10.44127368927002, "learning_rate": 2.4897058568223137e-06, "loss": 3.0159, "step": 2150 }, { "epoch": 0.864, "grad_norm": 9.894632339477539, "learning_rate": 2.3520996349645995e-06, "loss": 2.8015, "step": 2160 }, { "epoch": 0.868, "grad_norm": 9.043245315551758, "learning_rate": 2.218217240751491e-06, "loss": 3.4477, "step": 2170 }, { "epoch": 0.872, "grad_norm": 9.901315689086914, "learning_rate": 2.088080687480151e-06, "loss": 3.3157, "step": 2180 }, { "epoch": 0.876, "grad_norm": 8.202696800231934, "learning_rate": 1.961711372546657e-06, "loss": 2.9467, "step": 2190 }, { "epoch": 0.88, "grad_norm": 8.691917419433594, "learning_rate": 1.8391300739278139e-06, "loss": 2.9079, "step": 2200 }, { "epoch": 0.884, "grad_norm": 13.363630294799805, "learning_rate": 1.7203569467647674e-06, "loss": 3.2583, "step": 2210 }, { "epoch": 0.888, "grad_norm": 11.732659339904785, "learning_rate": 1.6054115200490493e-06, "loss": 3.0431, "step": 2220 }, { "epoch": 0.892, "grad_norm": 7.8193230628967285, "learning_rate": 1.4943126934115536e-06, "loss": 3.1155, "step": 2230 }, { "epoch": 0.896, "grad_norm": 6.232199192047119, "learning_rate": 1.3870787340150376e-06, "loss": 3.2006, "step": 2240 }, { "epoch": 0.9, "grad_norm": 5.650846004486084, "learning_rate": 1.2837272735505668e-06, "loss": 2.8882, "step": 2250 }, { "epoch": 0.904, "grad_norm": 7.191598892211914, "learning_rate": 1.1842753053384559e-06, "loss": 3.0833, "step": 2260 }, { "epoch": 0.908, "grad_norm": 8.854833602905273, "learning_rate": 1.0887391815342124e-06, "loss": 3.3196, "step": 2270 }, { "epoch": 0.912, "grad_norm": 13.160386085510254, "learning_rate": 9.971346104398455e-07, "loss": 3.564, "step": 2280 }, { "epoch": 0.916, "grad_norm": 8.540671348571777, "learning_rate": 9.09476653921082e-07, "loss": 3.1383, "step": 2290 }, { "epoch": 0.92, "grad_norm": 12.331473350524902, "learning_rate": 8.257797249308419e-07, "loss": 3.259, "step": 2300 }, { "epoch": 0.924, "grad_norm": 7.576813697814941, "learning_rate": 7.460575851394341e-07, "loss": 2.8659, "step": 2310 }, { "epoch": 0.928, "grad_norm": 6.937955379486084, "learning_rate": 6.703233426718136e-07, "loss": 2.9416, "step": 2320 }, { "epoch": 0.932, "grad_norm": 7.9867777824401855, "learning_rate": 5.985894499523193e-07, "loss": 3.0008, "step": 2330 }, { "epoch": 0.936, "grad_norm": 10.474209785461426, "learning_rate": 5.308677016572145e-07, "loss": 3.6042, "step": 2340 }, { "epoch": 0.94, "grad_norm": 6.954331398010254, "learning_rate": 4.6716923277536627e-07, "loss": 2.696, "step": 2350 }, { "epoch": 0.944, "grad_norm": 6.555063247680664, "learning_rate": 4.075045167774072e-07, "loss": 3.2311, "step": 2360 }, { "epoch": 0.948, "grad_norm": 7.122920513153076, "learning_rate": 3.518833638936514e-07, "loss": 3.1349, "step": 2370 }, { "epoch": 0.952, "grad_norm": 10.269899368286133, "learning_rate": 3.003149195010907e-07, "loss": 2.9381, "step": 2380 }, { "epoch": 0.956, "grad_norm": 8.958882331848145, "learning_rate": 2.528076626196585e-07, "loss": 3.0804, "step": 2390 }, { "epoch": 0.96, "grad_norm": 11.036646842956543, "learning_rate": 2.0936940451811437e-07, "loss": 3.0191, "step": 2400 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.128334475132928e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }