{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9976947902259106, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030736130321192563, "grad_norm": 2.8125, "learning_rate": 1.9972341733251385e-05, "loss": 1.2571, "step": 10 }, { "epoch": 0.0061472260642385125, "grad_norm": 2.53125, "learning_rate": 1.994161032575292e-05, "loss": 0.6897, "step": 20 }, { "epoch": 0.009220839096357769, "grad_norm": 2.25, "learning_rate": 1.9910878918254458e-05, "loss": 0.7244, "step": 30 }, { "epoch": 0.012294452128477025, "grad_norm": 1.984375, "learning_rate": 1.9880147510755993e-05, "loss": 0.662, "step": 40 }, { "epoch": 0.015368065160596281, "grad_norm": 1.7890625, "learning_rate": 1.984941610325753e-05, "loss": 0.6253, "step": 50 }, { "epoch": 0.018441678192715538, "grad_norm": 2.15625, "learning_rate": 1.9818684695759067e-05, "loss": 0.6154, "step": 60 }, { "epoch": 0.021515291224834792, "grad_norm": 1.796875, "learning_rate": 1.9787953288260605e-05, "loss": 0.5772, "step": 70 }, { "epoch": 0.02458890425695405, "grad_norm": 1.6484375, "learning_rate": 1.975722188076214e-05, "loss": 0.6075, "step": 80 }, { "epoch": 0.027662517289073305, "grad_norm": 2.140625, "learning_rate": 1.972649047326368e-05, "loss": 0.5716, "step": 90 }, { "epoch": 0.030736130321192563, "grad_norm": 1.8828125, "learning_rate": 1.9695759065765213e-05, "loss": 0.6216, "step": 100 }, { "epoch": 0.03380974335331182, "grad_norm": 1.59375, "learning_rate": 1.9665027658266752e-05, "loss": 0.5748, "step": 110 }, { "epoch": 0.036883356385431075, "grad_norm": 2.359375, "learning_rate": 1.9634296250768287e-05, "loss": 0.6002, "step": 120 }, { "epoch": 0.03995696941755033, "grad_norm": 2.3125, "learning_rate": 1.9603564843269825e-05, "loss": 0.603, "step": 130 }, { "epoch": 0.043030582449669584, "grad_norm": 1.90625, "learning_rate": 1.957283343577136e-05, "loss": 0.5885, "step": 140 }, { "epoch": 0.046104195481788846, "grad_norm": 1.8203125, "learning_rate": 1.9542102028272895e-05, "loss": 0.5882, "step": 150 }, { "epoch": 0.0491778085139081, "grad_norm": 2.0, "learning_rate": 1.9511370620774434e-05, "loss": 0.5882, "step": 160 }, { "epoch": 0.052251421546027355, "grad_norm": 1.7109375, "learning_rate": 1.948063921327597e-05, "loss": 0.617, "step": 170 }, { "epoch": 0.05532503457814661, "grad_norm": 1.921875, "learning_rate": 1.9449907805777507e-05, "loss": 0.5883, "step": 180 }, { "epoch": 0.05839864761026587, "grad_norm": 2.234375, "learning_rate": 1.9419176398279042e-05, "loss": 0.5665, "step": 190 }, { "epoch": 0.061472260642385125, "grad_norm": 1.9140625, "learning_rate": 1.938844499078058e-05, "loss": 0.5579, "step": 200 }, { "epoch": 0.06454587367450439, "grad_norm": 1.6875, "learning_rate": 1.9357713583282115e-05, "loss": 0.5114, "step": 210 }, { "epoch": 0.06761948670662364, "grad_norm": 1.96875, "learning_rate": 1.9326982175783654e-05, "loss": 0.5922, "step": 220 }, { "epoch": 0.0706930997387429, "grad_norm": 1.9375, "learning_rate": 1.929625076828519e-05, "loss": 0.5241, "step": 230 }, { "epoch": 0.07376671277086215, "grad_norm": 1.859375, "learning_rate": 1.9265519360786727e-05, "loss": 0.5573, "step": 240 }, { "epoch": 0.0768403258029814, "grad_norm": 1.8984375, "learning_rate": 1.9234787953288262e-05, "loss": 0.5565, "step": 250 }, { "epoch": 0.07991393883510066, "grad_norm": 1.9765625, "learning_rate": 1.92040565457898e-05, "loss": 0.5994, "step": 260 }, { "epoch": 0.08298755186721991, "grad_norm": 1.8515625, "learning_rate": 1.9173325138291336e-05, "loss": 0.5496, "step": 270 }, { "epoch": 0.08606116489933917, "grad_norm": 1.3671875, "learning_rate": 1.914259373079287e-05, "loss": 0.4966, "step": 280 }, { "epoch": 0.08913477793145842, "grad_norm": 1.9765625, "learning_rate": 1.911186232329441e-05, "loss": 0.5337, "step": 290 }, { "epoch": 0.09220839096357769, "grad_norm": 2.421875, "learning_rate": 1.9081130915795944e-05, "loss": 0.5477, "step": 300 }, { "epoch": 0.09528200399569695, "grad_norm": 1.921875, "learning_rate": 1.9050399508297482e-05, "loss": 0.5003, "step": 310 }, { "epoch": 0.0983556170278162, "grad_norm": 1.9296875, "learning_rate": 1.9019668100799017e-05, "loss": 0.5316, "step": 320 }, { "epoch": 0.10142923005993545, "grad_norm": 1.921875, "learning_rate": 1.8988936693300556e-05, "loss": 0.5299, "step": 330 }, { "epoch": 0.10450284309205471, "grad_norm": 2.140625, "learning_rate": 1.895820528580209e-05, "loss": 0.5126, "step": 340 }, { "epoch": 0.10757645612417396, "grad_norm": 1.890625, "learning_rate": 1.892747387830363e-05, "loss": 0.5374, "step": 350 }, { "epoch": 0.11065006915629322, "grad_norm": 1.65625, "learning_rate": 1.8896742470805164e-05, "loss": 0.5385, "step": 360 }, { "epoch": 0.11372368218841247, "grad_norm": 1.953125, "learning_rate": 1.8866011063306702e-05, "loss": 0.5072, "step": 370 }, { "epoch": 0.11679729522053174, "grad_norm": 1.4453125, "learning_rate": 1.8835279655808237e-05, "loss": 0.5379, "step": 380 }, { "epoch": 0.119870908252651, "grad_norm": 1.9140625, "learning_rate": 1.8804548248309776e-05, "loss": 0.5164, "step": 390 }, { "epoch": 0.12294452128477025, "grad_norm": 2.390625, "learning_rate": 1.877381684081131e-05, "loss": 0.5303, "step": 400 }, { "epoch": 0.1260181343168895, "grad_norm": 1.9296875, "learning_rate": 1.8743085433312846e-05, "loss": 0.5223, "step": 410 }, { "epoch": 0.12909174734900877, "grad_norm": 2.0625, "learning_rate": 1.8712354025814384e-05, "loss": 0.5457, "step": 420 }, { "epoch": 0.13216536038112803, "grad_norm": 2.234375, "learning_rate": 1.868162261831592e-05, "loss": 0.5, "step": 430 }, { "epoch": 0.13523897341324728, "grad_norm": 2.015625, "learning_rate": 1.8650891210817458e-05, "loss": 0.525, "step": 440 }, { "epoch": 0.13831258644536654, "grad_norm": 1.546875, "learning_rate": 1.8620159803318993e-05, "loss": 0.4843, "step": 450 }, { "epoch": 0.1413861994774858, "grad_norm": 1.9765625, "learning_rate": 1.858942839582053e-05, "loss": 0.5126, "step": 460 }, { "epoch": 0.14445981250960505, "grad_norm": 2.09375, "learning_rate": 1.8558696988322066e-05, "loss": 0.5159, "step": 470 }, { "epoch": 0.1475334255417243, "grad_norm": 1.9921875, "learning_rate": 1.8527965580823604e-05, "loss": 0.5117, "step": 480 }, { "epoch": 0.15060703857384355, "grad_norm": 1.9609375, "learning_rate": 1.849723417332514e-05, "loss": 0.4941, "step": 490 }, { "epoch": 0.1536806516059628, "grad_norm": 1.78125, "learning_rate": 1.8466502765826678e-05, "loss": 0.5206, "step": 500 }, { "epoch": 0.15675426463808206, "grad_norm": 2.328125, "learning_rate": 1.8435771358328213e-05, "loss": 0.5227, "step": 510 }, { "epoch": 0.15982787767020132, "grad_norm": 1.890625, "learning_rate": 1.840503995082975e-05, "loss": 0.4969, "step": 520 }, { "epoch": 0.16290149070232057, "grad_norm": 1.9765625, "learning_rate": 1.8374308543331286e-05, "loss": 0.5237, "step": 530 }, { "epoch": 0.16597510373443983, "grad_norm": 2.078125, "learning_rate": 1.834357713583282e-05, "loss": 0.5529, "step": 540 }, { "epoch": 0.16904871676655908, "grad_norm": 2.15625, "learning_rate": 1.831284572833436e-05, "loss": 0.535, "step": 550 }, { "epoch": 0.17212232979867834, "grad_norm": 2.28125, "learning_rate": 1.8282114320835895e-05, "loss": 0.495, "step": 560 }, { "epoch": 0.1751959428307976, "grad_norm": 1.96875, "learning_rate": 1.8251382913337433e-05, "loss": 0.4784, "step": 570 }, { "epoch": 0.17826955586291685, "grad_norm": 1.90625, "learning_rate": 1.8220651505838968e-05, "loss": 0.5158, "step": 580 }, { "epoch": 0.18134316889503613, "grad_norm": 1.5859375, "learning_rate": 1.8189920098340506e-05, "loss": 0.5033, "step": 590 }, { "epoch": 0.18441678192715538, "grad_norm": 1.765625, "learning_rate": 1.815918869084204e-05, "loss": 0.5106, "step": 600 }, { "epoch": 0.18749039495927464, "grad_norm": 2.203125, "learning_rate": 1.812845728334358e-05, "loss": 0.5103, "step": 610 }, { "epoch": 0.1905640079913939, "grad_norm": 1.5390625, "learning_rate": 1.8097725875845115e-05, "loss": 0.5091, "step": 620 }, { "epoch": 0.19363762102351315, "grad_norm": 1.7578125, "learning_rate": 1.8066994468346653e-05, "loss": 0.485, "step": 630 }, { "epoch": 0.1967112340556324, "grad_norm": 2.03125, "learning_rate": 1.8036263060848188e-05, "loss": 0.4814, "step": 640 }, { "epoch": 0.19978484708775165, "grad_norm": 1.9765625, "learning_rate": 1.8005531653349727e-05, "loss": 0.4873, "step": 650 }, { "epoch": 0.2028584601198709, "grad_norm": 1.9921875, "learning_rate": 1.797480024585126e-05, "loss": 0.4744, "step": 660 }, { "epoch": 0.20593207315199016, "grad_norm": 1.890625, "learning_rate": 1.7944068838352797e-05, "loss": 0.5105, "step": 670 }, { "epoch": 0.20900568618410942, "grad_norm": 2.0625, "learning_rate": 1.7913337430854335e-05, "loss": 0.496, "step": 680 }, { "epoch": 0.21207929921622867, "grad_norm": 2.1875, "learning_rate": 1.788260602335587e-05, "loss": 0.4867, "step": 690 }, { "epoch": 0.21515291224834793, "grad_norm": 1.8984375, "learning_rate": 1.785187461585741e-05, "loss": 0.4406, "step": 700 }, { "epoch": 0.21822652528046718, "grad_norm": 1.71875, "learning_rate": 1.7821143208358943e-05, "loss": 0.4459, "step": 710 }, { "epoch": 0.22130013831258644, "grad_norm": 1.9609375, "learning_rate": 1.7790411800860482e-05, "loss": 0.4765, "step": 720 }, { "epoch": 0.2243737513447057, "grad_norm": 2.28125, "learning_rate": 1.7759680393362017e-05, "loss": 0.4879, "step": 730 }, { "epoch": 0.22744736437682495, "grad_norm": 2.171875, "learning_rate": 1.7728948985863555e-05, "loss": 0.4889, "step": 740 }, { "epoch": 0.23052097740894423, "grad_norm": 1.9921875, "learning_rate": 1.769821757836509e-05, "loss": 0.4491, "step": 750 }, { "epoch": 0.23359459044106348, "grad_norm": 1.7890625, "learning_rate": 1.766748617086663e-05, "loss": 0.4794, "step": 760 }, { "epoch": 0.23666820347318274, "grad_norm": 2.171875, "learning_rate": 1.7636754763368163e-05, "loss": 0.4935, "step": 770 }, { "epoch": 0.239741816505302, "grad_norm": 1.6953125, "learning_rate": 1.7606023355869702e-05, "loss": 0.4491, "step": 780 }, { "epoch": 0.24281542953742125, "grad_norm": 1.8984375, "learning_rate": 1.7575291948371237e-05, "loss": 0.4424, "step": 790 }, { "epoch": 0.2458890425695405, "grad_norm": 2.03125, "learning_rate": 1.7544560540872772e-05, "loss": 0.4756, "step": 800 }, { "epoch": 0.24896265560165975, "grad_norm": 2.296875, "learning_rate": 1.751382913337431e-05, "loss": 0.4743, "step": 810 }, { "epoch": 0.252036268633779, "grad_norm": 1.8046875, "learning_rate": 1.7483097725875845e-05, "loss": 0.4598, "step": 820 }, { "epoch": 0.25510988166589826, "grad_norm": 2.03125, "learning_rate": 1.7452366318377384e-05, "loss": 0.4849, "step": 830 }, { "epoch": 0.25818349469801755, "grad_norm": 1.984375, "learning_rate": 1.742163491087892e-05, "loss": 0.4555, "step": 840 }, { "epoch": 0.2612571077301368, "grad_norm": 1.859375, "learning_rate": 1.7390903503380457e-05, "loss": 0.4487, "step": 850 }, { "epoch": 0.26433072076225606, "grad_norm": 2.453125, "learning_rate": 1.7360172095881992e-05, "loss": 0.4816, "step": 860 }, { "epoch": 0.2674043337943753, "grad_norm": 2.078125, "learning_rate": 1.732944068838353e-05, "loss": 0.4694, "step": 870 }, { "epoch": 0.27047794682649456, "grad_norm": 1.8828125, "learning_rate": 1.7298709280885065e-05, "loss": 0.4826, "step": 880 }, { "epoch": 0.2735515598586138, "grad_norm": 1.890625, "learning_rate": 1.7267977873386604e-05, "loss": 0.4397, "step": 890 }, { "epoch": 0.2766251728907331, "grad_norm": 2.046875, "learning_rate": 1.723724646588814e-05, "loss": 0.4297, "step": 900 }, { "epoch": 0.2796987859228523, "grad_norm": 1.859375, "learning_rate": 1.7206515058389677e-05, "loss": 0.4964, "step": 910 }, { "epoch": 0.2827723989549716, "grad_norm": 1.4921875, "learning_rate": 1.7175783650891212e-05, "loss": 0.4641, "step": 920 }, { "epoch": 0.2858460119870908, "grad_norm": 1.4765625, "learning_rate": 1.7145052243392747e-05, "loss": 0.4507, "step": 930 }, { "epoch": 0.2889196250192101, "grad_norm": 1.9921875, "learning_rate": 1.7114320835894286e-05, "loss": 0.4675, "step": 940 }, { "epoch": 0.2919932380513293, "grad_norm": 1.703125, "learning_rate": 1.708358942839582e-05, "loss": 0.4329, "step": 950 }, { "epoch": 0.2950668510834486, "grad_norm": 1.859375, "learning_rate": 1.705285802089736e-05, "loss": 0.4666, "step": 960 }, { "epoch": 0.2981404641155678, "grad_norm": 2.0625, "learning_rate": 1.7022126613398894e-05, "loss": 0.445, "step": 970 }, { "epoch": 0.3012140771476871, "grad_norm": 1.8125, "learning_rate": 1.6991395205900432e-05, "loss": 0.4755, "step": 980 }, { "epoch": 0.30428769017980634, "grad_norm": 1.71875, "learning_rate": 1.6960663798401967e-05, "loss": 0.4107, "step": 990 }, { "epoch": 0.3073613032119256, "grad_norm": 2.03125, "learning_rate": 1.6929932390903506e-05, "loss": 0.4595, "step": 1000 }, { "epoch": 0.3104349162440449, "grad_norm": 2.0625, "learning_rate": 1.689920098340504e-05, "loss": 0.4523, "step": 1010 }, { "epoch": 0.3135085292761641, "grad_norm": 1.984375, "learning_rate": 1.686846957590658e-05, "loss": 0.4645, "step": 1020 }, { "epoch": 0.3165821423082834, "grad_norm": 2.28125, "learning_rate": 1.6837738168408114e-05, "loss": 0.4895, "step": 1030 }, { "epoch": 0.31965575534040264, "grad_norm": 1.984375, "learning_rate": 1.6807006760909653e-05, "loss": 0.4519, "step": 1040 }, { "epoch": 0.3227293683725219, "grad_norm": 2.21875, "learning_rate": 1.6776275353411188e-05, "loss": 0.4727, "step": 1050 }, { "epoch": 0.32580298140464115, "grad_norm": 1.8203125, "learning_rate": 1.6745543945912723e-05, "loss": 0.442, "step": 1060 }, { "epoch": 0.3288765944367604, "grad_norm": 2.234375, "learning_rate": 1.671481253841426e-05, "loss": 0.4645, "step": 1070 }, { "epoch": 0.33195020746887965, "grad_norm": 2.015625, "learning_rate": 1.6684081130915796e-05, "loss": 0.4463, "step": 1080 }, { "epoch": 0.33502382050099894, "grad_norm": 1.9296875, "learning_rate": 1.6653349723417334e-05, "loss": 0.4446, "step": 1090 }, { "epoch": 0.33809743353311816, "grad_norm": 1.734375, "learning_rate": 1.662261831591887e-05, "loss": 0.4594, "step": 1100 }, { "epoch": 0.34117104656523745, "grad_norm": 1.9765625, "learning_rate": 1.6591886908420408e-05, "loss": 0.4874, "step": 1110 }, { "epoch": 0.3442446595973567, "grad_norm": 1.921875, "learning_rate": 1.6561155500921943e-05, "loss": 0.4757, "step": 1120 }, { "epoch": 0.34731827262947595, "grad_norm": 2.203125, "learning_rate": 1.653042409342348e-05, "loss": 0.4756, "step": 1130 }, { "epoch": 0.3503918856615952, "grad_norm": 2.25, "learning_rate": 1.6499692685925016e-05, "loss": 0.4271, "step": 1140 }, { "epoch": 0.35346549869371446, "grad_norm": 2.3125, "learning_rate": 1.6468961278426554e-05, "loss": 0.4882, "step": 1150 }, { "epoch": 0.3565391117258337, "grad_norm": 2.078125, "learning_rate": 1.643822987092809e-05, "loss": 0.4381, "step": 1160 }, { "epoch": 0.359612724757953, "grad_norm": 1.796875, "learning_rate": 1.6407498463429628e-05, "loss": 0.4276, "step": 1170 }, { "epoch": 0.36268633779007226, "grad_norm": 1.8046875, "learning_rate": 1.6376767055931163e-05, "loss": 0.4088, "step": 1180 }, { "epoch": 0.3657599508221915, "grad_norm": 1.7109375, "learning_rate": 1.6346035648432698e-05, "loss": 0.4368, "step": 1190 }, { "epoch": 0.36883356385431076, "grad_norm": 1.625, "learning_rate": 1.6315304240934236e-05, "loss": 0.4488, "step": 1200 }, { "epoch": 0.37190717688643, "grad_norm": 2.15625, "learning_rate": 1.628457283343577e-05, "loss": 0.4213, "step": 1210 }, { "epoch": 0.3749807899185493, "grad_norm": 2.078125, "learning_rate": 1.625384142593731e-05, "loss": 0.4275, "step": 1220 }, { "epoch": 0.3780544029506685, "grad_norm": 1.8359375, "learning_rate": 1.6223110018438845e-05, "loss": 0.4721, "step": 1230 }, { "epoch": 0.3811280159827878, "grad_norm": 1.9140625, "learning_rate": 1.6192378610940383e-05, "loss": 0.3997, "step": 1240 }, { "epoch": 0.384201629014907, "grad_norm": 2.015625, "learning_rate": 1.6161647203441918e-05, "loss": 0.433, "step": 1250 }, { "epoch": 0.3872752420470263, "grad_norm": 2.359375, "learning_rate": 1.6130915795943456e-05, "loss": 0.4386, "step": 1260 }, { "epoch": 0.3903488550791455, "grad_norm": 2.046875, "learning_rate": 1.610018438844499e-05, "loss": 0.4366, "step": 1270 }, { "epoch": 0.3934224681112648, "grad_norm": 1.8046875, "learning_rate": 1.606945298094653e-05, "loss": 0.41, "step": 1280 }, { "epoch": 0.396496081143384, "grad_norm": 2.203125, "learning_rate": 1.6038721573448065e-05, "loss": 0.4296, "step": 1290 }, { "epoch": 0.3995696941755033, "grad_norm": 2.03125, "learning_rate": 1.6007990165949603e-05, "loss": 0.4292, "step": 1300 }, { "epoch": 0.40264330720762254, "grad_norm": 1.875, "learning_rate": 1.5977258758451138e-05, "loss": 0.4056, "step": 1310 }, { "epoch": 0.4057169202397418, "grad_norm": 1.90625, "learning_rate": 1.5946527350952673e-05, "loss": 0.4191, "step": 1320 }, { "epoch": 0.40879053327186105, "grad_norm": 1.7265625, "learning_rate": 1.591579594345421e-05, "loss": 0.4153, "step": 1330 }, { "epoch": 0.4118641463039803, "grad_norm": 2.40625, "learning_rate": 1.5885064535955747e-05, "loss": 0.4415, "step": 1340 }, { "epoch": 0.4149377593360996, "grad_norm": 2.359375, "learning_rate": 1.5854333128457285e-05, "loss": 0.4004, "step": 1350 }, { "epoch": 0.41801137236821884, "grad_norm": 1.78125, "learning_rate": 1.582360172095882e-05, "loss": 0.4418, "step": 1360 }, { "epoch": 0.4210849854003381, "grad_norm": 2.34375, "learning_rate": 1.579287031346036e-05, "loss": 0.4311, "step": 1370 }, { "epoch": 0.42415859843245735, "grad_norm": 2.015625, "learning_rate": 1.5762138905961893e-05, "loss": 0.4112, "step": 1380 }, { "epoch": 0.42723221146457663, "grad_norm": 2.25, "learning_rate": 1.5731407498463432e-05, "loss": 0.4352, "step": 1390 }, { "epoch": 0.43030582449669585, "grad_norm": 1.9453125, "learning_rate": 1.5700676090964967e-05, "loss": 0.4617, "step": 1400 }, { "epoch": 0.43337943752881514, "grad_norm": 2.15625, "learning_rate": 1.5669944683466505e-05, "loss": 0.3976, "step": 1410 }, { "epoch": 0.43645305056093436, "grad_norm": 2.203125, "learning_rate": 1.563921327596804e-05, "loss": 0.432, "step": 1420 }, { "epoch": 0.43952666359305365, "grad_norm": 1.8203125, "learning_rate": 1.560848186846958e-05, "loss": 0.3597, "step": 1430 }, { "epoch": 0.4426002766251729, "grad_norm": 1.6015625, "learning_rate": 1.5577750460971114e-05, "loss": 0.4091, "step": 1440 }, { "epoch": 0.44567388965729215, "grad_norm": 2.125, "learning_rate": 1.554701905347265e-05, "loss": 0.4275, "step": 1450 }, { "epoch": 0.4487475026894114, "grad_norm": 1.9375, "learning_rate": 1.5516287645974187e-05, "loss": 0.4125, "step": 1460 }, { "epoch": 0.45182111572153066, "grad_norm": 1.8203125, "learning_rate": 1.5485556238475722e-05, "loss": 0.3917, "step": 1470 }, { "epoch": 0.4548947287536499, "grad_norm": 1.8515625, "learning_rate": 1.545482483097726e-05, "loss": 0.364, "step": 1480 }, { "epoch": 0.4579683417857692, "grad_norm": 1.625, "learning_rate": 1.5424093423478795e-05, "loss": 0.4297, "step": 1490 }, { "epoch": 0.46104195481788846, "grad_norm": 2.109375, "learning_rate": 1.5393362015980334e-05, "loss": 0.4274, "step": 1500 }, { "epoch": 0.4641155678500077, "grad_norm": 1.9453125, "learning_rate": 1.536263060848187e-05, "loss": 0.3786, "step": 1510 }, { "epoch": 0.46718918088212696, "grad_norm": 2.53125, "learning_rate": 1.5331899200983407e-05, "loss": 0.4062, "step": 1520 }, { "epoch": 0.4702627939142462, "grad_norm": 2.359375, "learning_rate": 1.5301167793484942e-05, "loss": 0.3928, "step": 1530 }, { "epoch": 0.4733364069463655, "grad_norm": 2.375, "learning_rate": 1.527043638598648e-05, "loss": 0.4124, "step": 1540 }, { "epoch": 0.4764100199784847, "grad_norm": 1.8125, "learning_rate": 1.5239704978488017e-05, "loss": 0.4111, "step": 1550 }, { "epoch": 0.479483633010604, "grad_norm": 1.546875, "learning_rate": 1.5208973570989554e-05, "loss": 0.385, "step": 1560 }, { "epoch": 0.4825572460427232, "grad_norm": 1.953125, "learning_rate": 1.5178242163491089e-05, "loss": 0.4165, "step": 1570 }, { "epoch": 0.4856308590748425, "grad_norm": 1.7265625, "learning_rate": 1.5147510755992626e-05, "loss": 0.3929, "step": 1580 }, { "epoch": 0.4887044721069617, "grad_norm": 2.46875, "learning_rate": 1.5116779348494162e-05, "loss": 0.4247, "step": 1590 }, { "epoch": 0.491778085139081, "grad_norm": 2.09375, "learning_rate": 1.5086047940995699e-05, "loss": 0.4078, "step": 1600 }, { "epoch": 0.4948516981712002, "grad_norm": 1.5546875, "learning_rate": 1.5055316533497236e-05, "loss": 0.3808, "step": 1610 }, { "epoch": 0.4979253112033195, "grad_norm": 1.9609375, "learning_rate": 1.5024585125998772e-05, "loss": 0.4193, "step": 1620 }, { "epoch": 0.5009989242354388, "grad_norm": 1.9921875, "learning_rate": 1.4993853718500309e-05, "loss": 0.3915, "step": 1630 }, { "epoch": 0.504072537267558, "grad_norm": 2.109375, "learning_rate": 1.4963122311001846e-05, "loss": 0.4306, "step": 1640 }, { "epoch": 0.5071461502996772, "grad_norm": 2.171875, "learning_rate": 1.4932390903503382e-05, "loss": 0.4278, "step": 1650 }, { "epoch": 0.5102197633317965, "grad_norm": 2.3125, "learning_rate": 1.490165949600492e-05, "loss": 0.4045, "step": 1660 }, { "epoch": 0.5132933763639158, "grad_norm": 1.828125, "learning_rate": 1.4870928088506456e-05, "loss": 0.3882, "step": 1670 }, { "epoch": 0.5163669893960351, "grad_norm": 2.140625, "learning_rate": 1.4840196681007993e-05, "loss": 0.4204, "step": 1680 }, { "epoch": 0.5194406024281543, "grad_norm": 2.046875, "learning_rate": 1.480946527350953e-05, "loss": 0.362, "step": 1690 }, { "epoch": 0.5225142154602735, "grad_norm": 2.328125, "learning_rate": 1.4778733866011064e-05, "loss": 0.3536, "step": 1700 }, { "epoch": 0.5255878284923928, "grad_norm": 1.953125, "learning_rate": 1.4748002458512601e-05, "loss": 0.3586, "step": 1710 }, { "epoch": 0.5286614415245121, "grad_norm": 1.7890625, "learning_rate": 1.4717271051014138e-05, "loss": 0.3937, "step": 1720 }, { "epoch": 0.5317350545566313, "grad_norm": 2.3125, "learning_rate": 1.4686539643515674e-05, "loss": 0.3939, "step": 1730 }, { "epoch": 0.5348086675887506, "grad_norm": 1.9375, "learning_rate": 1.4655808236017211e-05, "loss": 0.3851, "step": 1740 }, { "epoch": 0.5378822806208698, "grad_norm": 1.8984375, "learning_rate": 1.4625076828518748e-05, "loss": 0.429, "step": 1750 }, { "epoch": 0.5409558936529891, "grad_norm": 2.078125, "learning_rate": 1.4594345421020284e-05, "loss": 0.3902, "step": 1760 }, { "epoch": 0.5440295066851083, "grad_norm": 2.140625, "learning_rate": 1.4563614013521821e-05, "loss": 0.3783, "step": 1770 }, { "epoch": 0.5471031197172276, "grad_norm": 1.9296875, "learning_rate": 1.4532882606023358e-05, "loss": 0.4017, "step": 1780 }, { "epoch": 0.5501767327493469, "grad_norm": 2.359375, "learning_rate": 1.4502151198524894e-05, "loss": 0.3858, "step": 1790 }, { "epoch": 0.5532503457814661, "grad_norm": 2.375, "learning_rate": 1.4471419791026431e-05, "loss": 0.3847, "step": 1800 }, { "epoch": 0.5563239588135853, "grad_norm": 1.8671875, "learning_rate": 1.4440688383527968e-05, "loss": 0.4029, "step": 1810 }, { "epoch": 0.5593975718457046, "grad_norm": 1.8828125, "learning_rate": 1.4409956976029505e-05, "loss": 0.3941, "step": 1820 }, { "epoch": 0.5624711848778239, "grad_norm": 2.46875, "learning_rate": 1.437922556853104e-05, "loss": 0.3994, "step": 1830 }, { "epoch": 0.5655447979099432, "grad_norm": 2.21875, "learning_rate": 1.4348494161032576e-05, "loss": 0.3863, "step": 1840 }, { "epoch": 0.5686184109420624, "grad_norm": 1.9296875, "learning_rate": 1.4317762753534113e-05, "loss": 0.3733, "step": 1850 }, { "epoch": 0.5716920239741816, "grad_norm": 1.828125, "learning_rate": 1.428703134603565e-05, "loss": 0.367, "step": 1860 }, { "epoch": 0.5747656370063009, "grad_norm": 1.9375, "learning_rate": 1.4256299938537186e-05, "loss": 0.3405, "step": 1870 }, { "epoch": 0.5778392500384202, "grad_norm": 1.9453125, "learning_rate": 1.4225568531038723e-05, "loss": 0.39, "step": 1880 }, { "epoch": 0.5809128630705395, "grad_norm": 1.6328125, "learning_rate": 1.419483712354026e-05, "loss": 0.3968, "step": 1890 }, { "epoch": 0.5839864761026586, "grad_norm": 2.421875, "learning_rate": 1.4164105716041796e-05, "loss": 0.3926, "step": 1900 }, { "epoch": 0.5870600891347779, "grad_norm": 2.203125, "learning_rate": 1.4133374308543333e-05, "loss": 0.3772, "step": 1910 }, { "epoch": 0.5901337021668972, "grad_norm": 1.8359375, "learning_rate": 1.410264290104487e-05, "loss": 0.4271, "step": 1920 }, { "epoch": 0.5932073151990165, "grad_norm": 2.96875, "learning_rate": 1.4071911493546407e-05, "loss": 0.3683, "step": 1930 }, { "epoch": 0.5962809282311357, "grad_norm": 2.0, "learning_rate": 1.4041180086047943e-05, "loss": 0.3764, "step": 1940 }, { "epoch": 0.5993545412632549, "grad_norm": 1.8515625, "learning_rate": 1.401044867854948e-05, "loss": 0.4047, "step": 1950 }, { "epoch": 0.6024281542953742, "grad_norm": 2.0625, "learning_rate": 1.3979717271051015e-05, "loss": 0.3684, "step": 1960 }, { "epoch": 0.6055017673274935, "grad_norm": 2.5625, "learning_rate": 1.3948985863552552e-05, "loss": 0.4148, "step": 1970 }, { "epoch": 0.6085753803596127, "grad_norm": 2.078125, "learning_rate": 1.3918254456054088e-05, "loss": 0.3858, "step": 1980 }, { "epoch": 0.611648993391732, "grad_norm": 2.515625, "learning_rate": 1.3887523048555625e-05, "loss": 0.3473, "step": 1990 }, { "epoch": 0.6147226064238512, "grad_norm": 1.875, "learning_rate": 1.3856791641057162e-05, "loss": 0.3727, "step": 2000 }, { "epoch": 0.6177962194559705, "grad_norm": 2.40625, "learning_rate": 1.3826060233558698e-05, "loss": 0.3615, "step": 2010 }, { "epoch": 0.6208698324880898, "grad_norm": 1.9453125, "learning_rate": 1.3795328826060235e-05, "loss": 0.39, "step": 2020 }, { "epoch": 0.623943445520209, "grad_norm": 1.7109375, "learning_rate": 1.3764597418561772e-05, "loss": 0.3736, "step": 2030 }, { "epoch": 0.6270170585523283, "grad_norm": 1.828125, "learning_rate": 1.3733866011063308e-05, "loss": 0.3375, "step": 2040 }, { "epoch": 0.6300906715844475, "grad_norm": 1.9765625, "learning_rate": 1.3703134603564845e-05, "loss": 0.3694, "step": 2050 }, { "epoch": 0.6331642846165668, "grad_norm": 1.9765625, "learning_rate": 1.3672403196066382e-05, "loss": 0.3941, "step": 2060 }, { "epoch": 0.636237897648686, "grad_norm": 1.640625, "learning_rate": 1.3641671788567919e-05, "loss": 0.333, "step": 2070 }, { "epoch": 0.6393115106808053, "grad_norm": 1.8125, "learning_rate": 1.3610940381069455e-05, "loss": 0.3883, "step": 2080 }, { "epoch": 0.6423851237129246, "grad_norm": 1.8515625, "learning_rate": 1.358020897357099e-05, "loss": 0.3817, "step": 2090 }, { "epoch": 0.6454587367450438, "grad_norm": 2.515625, "learning_rate": 1.3549477566072527e-05, "loss": 0.37, "step": 2100 }, { "epoch": 0.648532349777163, "grad_norm": 2.046875, "learning_rate": 1.3518746158574064e-05, "loss": 0.372, "step": 2110 }, { "epoch": 0.6516059628092823, "grad_norm": 3.109375, "learning_rate": 1.34880147510756e-05, "loss": 0.388, "step": 2120 }, { "epoch": 0.6546795758414016, "grad_norm": 2.140625, "learning_rate": 1.3457283343577137e-05, "loss": 0.3619, "step": 2130 }, { "epoch": 0.6577531888735209, "grad_norm": 2.265625, "learning_rate": 1.3426551936078674e-05, "loss": 0.408, "step": 2140 }, { "epoch": 0.66082680190564, "grad_norm": 2.453125, "learning_rate": 1.339582052858021e-05, "loss": 0.3902, "step": 2150 }, { "epoch": 0.6639004149377593, "grad_norm": 2.234375, "learning_rate": 1.3365089121081747e-05, "loss": 0.3731, "step": 2160 }, { "epoch": 0.6669740279698786, "grad_norm": 2.015625, "learning_rate": 1.3334357713583284e-05, "loss": 0.3645, "step": 2170 }, { "epoch": 0.6700476410019979, "grad_norm": 2.3125, "learning_rate": 1.330362630608482e-05, "loss": 0.3926, "step": 2180 }, { "epoch": 0.6731212540341172, "grad_norm": 2.109375, "learning_rate": 1.3272894898586357e-05, "loss": 0.3805, "step": 2190 }, { "epoch": 0.6761948670662363, "grad_norm": 1.8359375, "learning_rate": 1.3242163491087894e-05, "loss": 0.3481, "step": 2200 }, { "epoch": 0.6792684800983556, "grad_norm": 2.109375, "learning_rate": 1.321143208358943e-05, "loss": 0.3531, "step": 2210 }, { "epoch": 0.6823420931304749, "grad_norm": 1.9765625, "learning_rate": 1.3180700676090966e-05, "loss": 0.3419, "step": 2220 }, { "epoch": 0.6854157061625942, "grad_norm": 1.609375, "learning_rate": 1.3149969268592502e-05, "loss": 0.3551, "step": 2230 }, { "epoch": 0.6884893191947133, "grad_norm": 2.28125, "learning_rate": 1.3119237861094039e-05, "loss": 0.3668, "step": 2240 }, { "epoch": 0.6915629322268326, "grad_norm": 2.46875, "learning_rate": 1.3088506453595576e-05, "loss": 0.3915, "step": 2250 }, { "epoch": 0.6946365452589519, "grad_norm": 2.484375, "learning_rate": 1.3057775046097112e-05, "loss": 0.3845, "step": 2260 }, { "epoch": 0.6977101582910712, "grad_norm": 2.578125, "learning_rate": 1.3027043638598649e-05, "loss": 0.3735, "step": 2270 }, { "epoch": 0.7007837713231904, "grad_norm": 2.25, "learning_rate": 1.2996312231100186e-05, "loss": 0.355, "step": 2280 }, { "epoch": 0.7038573843553096, "grad_norm": 2.109375, "learning_rate": 1.2965580823601722e-05, "loss": 0.3618, "step": 2290 }, { "epoch": 0.7069309973874289, "grad_norm": 2.546875, "learning_rate": 1.293484941610326e-05, "loss": 0.3698, "step": 2300 }, { "epoch": 0.7100046104195482, "grad_norm": 2.546875, "learning_rate": 1.2904118008604796e-05, "loss": 0.359, "step": 2310 }, { "epoch": 0.7130782234516674, "grad_norm": 2.15625, "learning_rate": 1.2873386601106333e-05, "loss": 0.3443, "step": 2320 }, { "epoch": 0.7161518364837867, "grad_norm": 2.65625, "learning_rate": 1.284265519360787e-05, "loss": 0.3849, "step": 2330 }, { "epoch": 0.719225449515906, "grad_norm": 2.84375, "learning_rate": 1.2811923786109406e-05, "loss": 0.4124, "step": 2340 }, { "epoch": 0.7222990625480252, "grad_norm": 2.140625, "learning_rate": 1.2781192378610941e-05, "loss": 0.3298, "step": 2350 }, { "epoch": 0.7253726755801445, "grad_norm": 2.515625, "learning_rate": 1.2750460971112478e-05, "loss": 0.37, "step": 2360 }, { "epoch": 0.7284462886122637, "grad_norm": 2.0625, "learning_rate": 1.2719729563614014e-05, "loss": 0.388, "step": 2370 }, { "epoch": 0.731519901644383, "grad_norm": 1.796875, "learning_rate": 1.2688998156115551e-05, "loss": 0.3375, "step": 2380 }, { "epoch": 0.7345935146765022, "grad_norm": 2.40625, "learning_rate": 1.2658266748617088e-05, "loss": 0.3783, "step": 2390 }, { "epoch": 0.7376671277086215, "grad_norm": 1.828125, "learning_rate": 1.2627535341118624e-05, "loss": 0.3779, "step": 2400 }, { "epoch": 0.7407407407407407, "grad_norm": 1.8203125, "learning_rate": 1.2596803933620161e-05, "loss": 0.3391, "step": 2410 }, { "epoch": 0.74381435377286, "grad_norm": 2.5625, "learning_rate": 1.2566072526121698e-05, "loss": 0.3817, "step": 2420 }, { "epoch": 0.7468879668049793, "grad_norm": 2.453125, "learning_rate": 1.2535341118623235e-05, "loss": 0.3502, "step": 2430 }, { "epoch": 0.7499615798370985, "grad_norm": 2.359375, "learning_rate": 1.2504609711124771e-05, "loss": 0.3552, "step": 2440 }, { "epoch": 0.7530351928692177, "grad_norm": 2.09375, "learning_rate": 1.2473878303626308e-05, "loss": 0.3718, "step": 2450 }, { "epoch": 0.756108805901337, "grad_norm": 2.03125, "learning_rate": 1.2443146896127845e-05, "loss": 0.3423, "step": 2460 }, { "epoch": 0.7591824189334563, "grad_norm": 2.109375, "learning_rate": 1.2412415488629381e-05, "loss": 0.3806, "step": 2470 }, { "epoch": 0.7622560319655756, "grad_norm": 2.15625, "learning_rate": 1.2381684081130916e-05, "loss": 0.353, "step": 2480 }, { "epoch": 0.7653296449976947, "grad_norm": 2.046875, "learning_rate": 1.2350952673632453e-05, "loss": 0.3523, "step": 2490 }, { "epoch": 0.768403258029814, "grad_norm": 2.109375, "learning_rate": 1.232022126613399e-05, "loss": 0.3781, "step": 2500 }, { "epoch": 0.7714768710619333, "grad_norm": 2.21875, "learning_rate": 1.2289489858635526e-05, "loss": 0.3533, "step": 2510 }, { "epoch": 0.7745504840940526, "grad_norm": 1.703125, "learning_rate": 1.2258758451137063e-05, "loss": 0.346, "step": 2520 }, { "epoch": 0.7776240971261719, "grad_norm": 2.21875, "learning_rate": 1.22280270436386e-05, "loss": 0.3564, "step": 2530 }, { "epoch": 0.780697710158291, "grad_norm": 2.015625, "learning_rate": 1.2197295636140136e-05, "loss": 0.3397, "step": 2540 }, { "epoch": 0.7837713231904103, "grad_norm": 1.7421875, "learning_rate": 1.2166564228641673e-05, "loss": 0.3456, "step": 2550 }, { "epoch": 0.7868449362225296, "grad_norm": 2.109375, "learning_rate": 1.213583282114321e-05, "loss": 0.3214, "step": 2560 }, { "epoch": 0.7899185492546489, "grad_norm": 2.046875, "learning_rate": 1.2105101413644747e-05, "loss": 0.3448, "step": 2570 }, { "epoch": 0.792992162286768, "grad_norm": 2.171875, "learning_rate": 1.2074370006146283e-05, "loss": 0.3729, "step": 2580 }, { "epoch": 0.7960657753188873, "grad_norm": 2.390625, "learning_rate": 1.204363859864782e-05, "loss": 0.3235, "step": 2590 }, { "epoch": 0.7991393883510066, "grad_norm": 2.796875, "learning_rate": 1.2012907191149357e-05, "loss": 0.3423, "step": 2600 }, { "epoch": 0.8022130013831259, "grad_norm": 2.28125, "learning_rate": 1.1982175783650892e-05, "loss": 0.3604, "step": 2610 }, { "epoch": 0.8052866144152451, "grad_norm": 1.9765625, "learning_rate": 1.1951444376152428e-05, "loss": 0.3702, "step": 2620 }, { "epoch": 0.8083602274473644, "grad_norm": 1.9921875, "learning_rate": 1.1920712968653965e-05, "loss": 0.3613, "step": 2630 }, { "epoch": 0.8114338404794836, "grad_norm": 2.296875, "learning_rate": 1.1889981561155502e-05, "loss": 0.3198, "step": 2640 }, { "epoch": 0.8145074535116029, "grad_norm": 2.625, "learning_rate": 1.1859250153657038e-05, "loss": 0.3552, "step": 2650 }, { "epoch": 0.8175810665437221, "grad_norm": 2.234375, "learning_rate": 1.1828518746158575e-05, "loss": 0.3481, "step": 2660 }, { "epoch": 0.8206546795758414, "grad_norm": 1.8359375, "learning_rate": 1.1797787338660112e-05, "loss": 0.3437, "step": 2670 }, { "epoch": 0.8237282926079607, "grad_norm": 2.0, "learning_rate": 1.1767055931161648e-05, "loss": 0.3552, "step": 2680 }, { "epoch": 0.8268019056400799, "grad_norm": 2.34375, "learning_rate": 1.1736324523663185e-05, "loss": 0.3293, "step": 2690 }, { "epoch": 0.8298755186721992, "grad_norm": 2.203125, "learning_rate": 1.1705593116164722e-05, "loss": 0.329, "step": 2700 }, { "epoch": 0.8329491317043184, "grad_norm": 1.8046875, "learning_rate": 1.1674861708666259e-05, "loss": 0.338, "step": 2710 }, { "epoch": 0.8360227447364377, "grad_norm": 2.140625, "learning_rate": 1.1644130301167795e-05, "loss": 0.3558, "step": 2720 }, { "epoch": 0.839096357768557, "grad_norm": 2.875, "learning_rate": 1.1613398893669332e-05, "loss": 0.3521, "step": 2730 }, { "epoch": 0.8421699708006762, "grad_norm": 2.359375, "learning_rate": 1.1582667486170867e-05, "loss": 0.381, "step": 2740 }, { "epoch": 0.8452435838327954, "grad_norm": 2.046875, "learning_rate": 1.1551936078672404e-05, "loss": 0.351, "step": 2750 }, { "epoch": 0.8483171968649147, "grad_norm": 1.9375, "learning_rate": 1.152120467117394e-05, "loss": 0.3478, "step": 2760 }, { "epoch": 0.851390809897034, "grad_norm": 2.25, "learning_rate": 1.1490473263675477e-05, "loss": 0.3434, "step": 2770 }, { "epoch": 0.8544644229291533, "grad_norm": 1.9375, "learning_rate": 1.1459741856177014e-05, "loss": 0.3457, "step": 2780 }, { "epoch": 0.8575380359612724, "grad_norm": 1.96875, "learning_rate": 1.142901044867855e-05, "loss": 0.3267, "step": 2790 }, { "epoch": 0.8606116489933917, "grad_norm": 3.171875, "learning_rate": 1.1398279041180087e-05, "loss": 0.3545, "step": 2800 }, { "epoch": 0.863685262025511, "grad_norm": 1.7734375, "learning_rate": 1.1367547633681624e-05, "loss": 0.3394, "step": 2810 }, { "epoch": 0.8667588750576303, "grad_norm": 2.421875, "learning_rate": 1.133681622618316e-05, "loss": 0.3388, "step": 2820 }, { "epoch": 0.8698324880897496, "grad_norm": 2.265625, "learning_rate": 1.1306084818684697e-05, "loss": 0.3326, "step": 2830 }, { "epoch": 0.8729061011218687, "grad_norm": 2.328125, "learning_rate": 1.1275353411186234e-05, "loss": 0.2943, "step": 2840 }, { "epoch": 0.875979714153988, "grad_norm": 2.765625, "learning_rate": 1.124462200368777e-05, "loss": 0.3485, "step": 2850 }, { "epoch": 0.8790533271861073, "grad_norm": 2.640625, "learning_rate": 1.1213890596189307e-05, "loss": 0.3435, "step": 2860 }, { "epoch": 0.8821269402182266, "grad_norm": 1.90625, "learning_rate": 1.1183159188690842e-05, "loss": 0.336, "step": 2870 }, { "epoch": 0.8852005532503457, "grad_norm": 2.234375, "learning_rate": 1.1152427781192379e-05, "loss": 0.3299, "step": 2880 }, { "epoch": 0.888274166282465, "grad_norm": 2.359375, "learning_rate": 1.1121696373693916e-05, "loss": 0.3236, "step": 2890 }, { "epoch": 0.8913477793145843, "grad_norm": 2.6875, "learning_rate": 1.1090964966195452e-05, "loss": 0.3486, "step": 2900 }, { "epoch": 0.8944213923467036, "grad_norm": 2.46875, "learning_rate": 1.1060233558696989e-05, "loss": 0.361, "step": 2910 }, { "epoch": 0.8974950053788228, "grad_norm": 2.15625, "learning_rate": 1.1029502151198526e-05, "loss": 0.3271, "step": 2920 }, { "epoch": 0.900568618410942, "grad_norm": 2.71875, "learning_rate": 1.0998770743700062e-05, "loss": 0.3285, "step": 2930 }, { "epoch": 0.9036422314430613, "grad_norm": 2.015625, "learning_rate": 1.09680393362016e-05, "loss": 0.3021, "step": 2940 }, { "epoch": 0.9067158444751806, "grad_norm": 2.515625, "learning_rate": 1.0937307928703136e-05, "loss": 0.3023, "step": 2950 }, { "epoch": 0.9097894575072998, "grad_norm": 2.203125, "learning_rate": 1.0906576521204673e-05, "loss": 0.351, "step": 2960 }, { "epoch": 0.9128630705394191, "grad_norm": 2.515625, "learning_rate": 1.087584511370621e-05, "loss": 0.3478, "step": 2970 }, { "epoch": 0.9159366835715383, "grad_norm": 2.09375, "learning_rate": 1.0845113706207746e-05, "loss": 0.3674, "step": 2980 }, { "epoch": 0.9190102966036576, "grad_norm": 2.25, "learning_rate": 1.0814382298709283e-05, "loss": 0.3331, "step": 2990 }, { "epoch": 0.9220839096357769, "grad_norm": 2.109375, "learning_rate": 1.0783650891210818e-05, "loss": 0.3487, "step": 3000 }, { "epoch": 0.9251575226678961, "grad_norm": 1.984375, "learning_rate": 1.0752919483712354e-05, "loss": 0.3305, "step": 3010 }, { "epoch": 0.9282311357000154, "grad_norm": 2.640625, "learning_rate": 1.0722188076213891e-05, "loss": 0.3359, "step": 3020 }, { "epoch": 0.9313047487321346, "grad_norm": 2.15625, "learning_rate": 1.0691456668715428e-05, "loss": 0.3223, "step": 3030 }, { "epoch": 0.9343783617642539, "grad_norm": 2.515625, "learning_rate": 1.0660725261216964e-05, "loss": 0.3388, "step": 3040 }, { "epoch": 0.9374519747963731, "grad_norm": 2.15625, "learning_rate": 1.0629993853718501e-05, "loss": 0.3115, "step": 3050 }, { "epoch": 0.9405255878284924, "grad_norm": 2.421875, "learning_rate": 1.0599262446220038e-05, "loss": 0.3434, "step": 3060 }, { "epoch": 0.9435992008606117, "grad_norm": 1.859375, "learning_rate": 1.0568531038721575e-05, "loss": 0.333, "step": 3070 }, { "epoch": 0.946672813892731, "grad_norm": 2.28125, "learning_rate": 1.0537799631223111e-05, "loss": 0.3097, "step": 3080 }, { "epoch": 0.9497464269248501, "grad_norm": 1.78125, "learning_rate": 1.0507068223724648e-05, "loss": 0.3393, "step": 3090 }, { "epoch": 0.9528200399569694, "grad_norm": 2.296875, "learning_rate": 1.0476336816226185e-05, "loss": 0.32, "step": 3100 }, { "epoch": 0.9558936529890887, "grad_norm": 2.359375, "learning_rate": 1.0445605408727721e-05, "loss": 0.3458, "step": 3110 }, { "epoch": 0.958967266021208, "grad_norm": 2.578125, "learning_rate": 1.0414874001229258e-05, "loss": 0.3408, "step": 3120 }, { "epoch": 0.9620408790533271, "grad_norm": 2.3125, "learning_rate": 1.0384142593730793e-05, "loss": 0.3059, "step": 3130 }, { "epoch": 0.9651144920854464, "grad_norm": 2.625, "learning_rate": 1.035341118623233e-05, "loss": 0.3436, "step": 3140 }, { "epoch": 0.9681881051175657, "grad_norm": 2.03125, "learning_rate": 1.0322679778733866e-05, "loss": 0.3455, "step": 3150 }, { "epoch": 0.971261718149685, "grad_norm": 2.21875, "learning_rate": 1.0291948371235403e-05, "loss": 0.3236, "step": 3160 }, { "epoch": 0.9743353311818043, "grad_norm": 2.203125, "learning_rate": 1.026121696373694e-05, "loss": 0.3414, "step": 3170 }, { "epoch": 0.9774089442139234, "grad_norm": 2.390625, "learning_rate": 1.0230485556238476e-05, "loss": 0.3181, "step": 3180 }, { "epoch": 0.9804825572460427, "grad_norm": 2.40625, "learning_rate": 1.0199754148740013e-05, "loss": 0.3312, "step": 3190 }, { "epoch": 0.983556170278162, "grad_norm": 2.28125, "learning_rate": 1.016902274124155e-05, "loss": 0.3483, "step": 3200 }, { "epoch": 0.9866297833102813, "grad_norm": 1.84375, "learning_rate": 1.0138291333743087e-05, "loss": 0.3294, "step": 3210 }, { "epoch": 0.9897033963424005, "grad_norm": 1.953125, "learning_rate": 1.0107559926244623e-05, "loss": 0.3232, "step": 3220 }, { "epoch": 0.9927770093745197, "grad_norm": 2.34375, "learning_rate": 1.007682851874616e-05, "loss": 0.2939, "step": 3230 }, { "epoch": 0.995850622406639, "grad_norm": 1.9453125, "learning_rate": 1.0046097111247697e-05, "loss": 0.3432, "step": 3240 }, { "epoch": 0.9989242354387583, "grad_norm": 2.3125, "learning_rate": 1.0015365703749233e-05, "loss": 0.3194, "step": 3250 }, { "epoch": 1.0018441678192715, "grad_norm": 3.0, "learning_rate": 9.98463429625077e-06, "loss": 0.2676, "step": 3260 }, { "epoch": 1.0049177808513907, "grad_norm": 2.734375, "learning_rate": 9.953902888752307e-06, "loss": 0.2694, "step": 3270 }, { "epoch": 1.0079913938835101, "grad_norm": 1.875, "learning_rate": 9.923171481253843e-06, "loss": 0.2422, "step": 3280 }, { "epoch": 1.0110650069156293, "grad_norm": 1.8515625, "learning_rate": 9.892440073755378e-06, "loss": 0.2634, "step": 3290 }, { "epoch": 1.0141386199477487, "grad_norm": 2.0625, "learning_rate": 9.861708666256915e-06, "loss": 0.252, "step": 3300 }, { "epoch": 1.0172122329798678, "grad_norm": 2.484375, "learning_rate": 9.830977258758452e-06, "loss": 0.2702, "step": 3310 }, { "epoch": 1.020285846011987, "grad_norm": 2.1875, "learning_rate": 9.800245851259988e-06, "loss": 0.2959, "step": 3320 }, { "epoch": 1.0233594590441064, "grad_norm": 1.890625, "learning_rate": 9.769514443761525e-06, "loss": 0.2707, "step": 3330 }, { "epoch": 1.0264330720762256, "grad_norm": 2.125, "learning_rate": 9.738783036263062e-06, "loss": 0.2567, "step": 3340 }, { "epoch": 1.0295066851083448, "grad_norm": 2.40625, "learning_rate": 9.708051628764599e-06, "loss": 0.2623, "step": 3350 }, { "epoch": 1.0325802981404641, "grad_norm": 2.171875, "learning_rate": 9.677320221266134e-06, "loss": 0.2846, "step": 3360 }, { "epoch": 1.0356539111725833, "grad_norm": 2.296875, "learning_rate": 9.64658881376767e-06, "loss": 0.2979, "step": 3370 }, { "epoch": 1.0387275242047027, "grad_norm": 2.140625, "learning_rate": 9.615857406269209e-06, "loss": 0.2692, "step": 3380 }, { "epoch": 1.0418011372368219, "grad_norm": 2.203125, "learning_rate": 9.585125998770745e-06, "loss": 0.2733, "step": 3390 }, { "epoch": 1.044874750268941, "grad_norm": 2.375, "learning_rate": 9.554394591272282e-06, "loss": 0.2625, "step": 3400 }, { "epoch": 1.0479483633010604, "grad_norm": 2.0625, "learning_rate": 9.523663183773819e-06, "loss": 0.283, "step": 3410 }, { "epoch": 1.0510219763331796, "grad_norm": 2.171875, "learning_rate": 9.492931776275354e-06, "loss": 0.2823, "step": 3420 }, { "epoch": 1.054095589365299, "grad_norm": 2.25, "learning_rate": 9.46220036877689e-06, "loss": 0.2728, "step": 3430 }, { "epoch": 1.0571692023974182, "grad_norm": 1.8203125, "learning_rate": 9.431468961278427e-06, "loss": 0.2437, "step": 3440 }, { "epoch": 1.0602428154295374, "grad_norm": 2.375, "learning_rate": 9.400737553779964e-06, "loss": 0.2473, "step": 3450 }, { "epoch": 1.0633164284616567, "grad_norm": 1.7421875, "learning_rate": 9.3700061462815e-06, "loss": 0.27, "step": 3460 }, { "epoch": 1.066390041493776, "grad_norm": 2.65625, "learning_rate": 9.339274738783037e-06, "loss": 0.293, "step": 3470 }, { "epoch": 1.069463654525895, "grad_norm": 3.109375, "learning_rate": 9.308543331284574e-06, "loss": 0.2806, "step": 3480 }, { "epoch": 1.0725372675580145, "grad_norm": 2.546875, "learning_rate": 9.277811923786109e-06, "loss": 0.2824, "step": 3490 }, { "epoch": 1.0756108805901337, "grad_norm": 3.0625, "learning_rate": 9.247080516287647e-06, "loss": 0.2685, "step": 3500 }, { "epoch": 1.078684493622253, "grad_norm": 2.15625, "learning_rate": 9.216349108789184e-06, "loss": 0.2478, "step": 3510 }, { "epoch": 1.0817581066543722, "grad_norm": 2.140625, "learning_rate": 9.18561770129072e-06, "loss": 0.2639, "step": 3520 }, { "epoch": 1.0848317196864914, "grad_norm": 2.609375, "learning_rate": 9.154886293792257e-06, "loss": 0.2966, "step": 3530 }, { "epoch": 1.0879053327186108, "grad_norm": 2.078125, "learning_rate": 9.124154886293794e-06, "loss": 0.2538, "step": 3540 }, { "epoch": 1.09097894575073, "grad_norm": 2.0625, "learning_rate": 9.093423478795329e-06, "loss": 0.283, "step": 3550 }, { "epoch": 1.0940525587828493, "grad_norm": 2.578125, "learning_rate": 9.062692071296866e-06, "loss": 0.295, "step": 3560 }, { "epoch": 1.0971261718149685, "grad_norm": 2.046875, "learning_rate": 9.031960663798402e-06, "loss": 0.2694, "step": 3570 }, { "epoch": 1.1001997848470877, "grad_norm": 2.328125, "learning_rate": 9.00122925629994e-06, "loss": 0.275, "step": 3580 }, { "epoch": 1.103273397879207, "grad_norm": 2.640625, "learning_rate": 8.970497848801476e-06, "loss": 0.3061, "step": 3590 }, { "epoch": 1.1063470109113263, "grad_norm": 2.53125, "learning_rate": 8.939766441303013e-06, "loss": 0.2823, "step": 3600 }, { "epoch": 1.1094206239434454, "grad_norm": 2.296875, "learning_rate": 8.90903503380455e-06, "loss": 0.2522, "step": 3610 }, { "epoch": 1.1124942369755648, "grad_norm": 2.078125, "learning_rate": 8.878303626306086e-06, "loss": 0.2925, "step": 3620 }, { "epoch": 1.115567850007684, "grad_norm": 2.765625, "learning_rate": 8.847572218807623e-06, "loss": 0.2762, "step": 3630 }, { "epoch": 1.1186414630398034, "grad_norm": 2.65625, "learning_rate": 8.81684081130916e-06, "loss": 0.2761, "step": 3640 }, { "epoch": 1.1217150760719226, "grad_norm": 2.296875, "learning_rate": 8.786109403810696e-06, "loss": 0.273, "step": 3650 }, { "epoch": 1.1247886891040417, "grad_norm": 2.328125, "learning_rate": 8.755377996312233e-06, "loss": 0.2805, "step": 3660 }, { "epoch": 1.1278623021361611, "grad_norm": 2.703125, "learning_rate": 8.72464658881377e-06, "loss": 0.2725, "step": 3670 }, { "epoch": 1.1309359151682803, "grad_norm": 2.4375, "learning_rate": 8.693915181315304e-06, "loss": 0.2676, "step": 3680 }, { "epoch": 1.1340095282003997, "grad_norm": 2.359375, "learning_rate": 8.663183773816841e-06, "loss": 0.2852, "step": 3690 }, { "epoch": 1.1370831412325189, "grad_norm": 2.765625, "learning_rate": 8.632452366318378e-06, "loss": 0.2739, "step": 3700 }, { "epoch": 1.140156754264638, "grad_norm": 2.078125, "learning_rate": 8.601720958819915e-06, "loss": 0.2742, "step": 3710 }, { "epoch": 1.1432303672967574, "grad_norm": 2.640625, "learning_rate": 8.570989551321451e-06, "loss": 0.2766, "step": 3720 }, { "epoch": 1.1463039803288766, "grad_norm": 2.375, "learning_rate": 8.540258143822988e-06, "loss": 0.2897, "step": 3730 }, { "epoch": 1.1493775933609958, "grad_norm": 2.0625, "learning_rate": 8.509526736324525e-06, "loss": 0.2664, "step": 3740 }, { "epoch": 1.1524512063931152, "grad_norm": 2.359375, "learning_rate": 8.478795328826061e-06, "loss": 0.2723, "step": 3750 }, { "epoch": 1.1555248194252343, "grad_norm": 1.765625, "learning_rate": 8.448063921327598e-06, "loss": 0.2647, "step": 3760 }, { "epoch": 1.1585984324573535, "grad_norm": 2.59375, "learning_rate": 8.417332513829135e-06, "loss": 0.3002, "step": 3770 }, { "epoch": 1.161672045489473, "grad_norm": 1.796875, "learning_rate": 8.386601106330671e-06, "loss": 0.2566, "step": 3780 }, { "epoch": 1.164745658521592, "grad_norm": 2.328125, "learning_rate": 8.355869698832208e-06, "loss": 0.2624, "step": 3790 }, { "epoch": 1.1678192715537115, "grad_norm": 2.140625, "learning_rate": 8.325138291333745e-06, "loss": 0.2831, "step": 3800 }, { "epoch": 1.1708928845858306, "grad_norm": 2.171875, "learning_rate": 8.29440688383528e-06, "loss": 0.2856, "step": 3810 }, { "epoch": 1.17396649761795, "grad_norm": 2.21875, "learning_rate": 8.263675476336816e-06, "loss": 0.2612, "step": 3820 }, { "epoch": 1.1770401106500692, "grad_norm": 2.296875, "learning_rate": 8.232944068838353e-06, "loss": 0.2484, "step": 3830 }, { "epoch": 1.1801137236821884, "grad_norm": 1.828125, "learning_rate": 8.20221266133989e-06, "loss": 0.2772, "step": 3840 }, { "epoch": 1.1831873367143078, "grad_norm": 2.375, "learning_rate": 8.171481253841427e-06, "loss": 0.2308, "step": 3850 }, { "epoch": 1.186260949746427, "grad_norm": 2.203125, "learning_rate": 8.140749846342963e-06, "loss": 0.2431, "step": 3860 }, { "epoch": 1.189334562778546, "grad_norm": 2.5625, "learning_rate": 8.1100184388445e-06, "loss": 0.262, "step": 3870 }, { "epoch": 1.1924081758106655, "grad_norm": 2.578125, "learning_rate": 8.079287031346037e-06, "loss": 0.2678, "step": 3880 }, { "epoch": 1.1954817888427847, "grad_norm": 2.75, "learning_rate": 8.048555623847573e-06, "loss": 0.2382, "step": 3890 }, { "epoch": 1.1985554018749038, "grad_norm": 2.46875, "learning_rate": 8.01782421634911e-06, "loss": 0.2762, "step": 3900 }, { "epoch": 1.2016290149070232, "grad_norm": 2.0, "learning_rate": 7.987092808850647e-06, "loss": 0.2571, "step": 3910 }, { "epoch": 1.2047026279391424, "grad_norm": 2.734375, "learning_rate": 7.956361401352183e-06, "loss": 0.314, "step": 3920 }, { "epoch": 1.2077762409712618, "grad_norm": 2.3125, "learning_rate": 7.92562999385372e-06, "loss": 0.2412, "step": 3930 }, { "epoch": 1.210849854003381, "grad_norm": 2.515625, "learning_rate": 7.894898586355255e-06, "loss": 0.2633, "step": 3940 }, { "epoch": 1.2139234670355001, "grad_norm": 2.75, "learning_rate": 7.864167178856792e-06, "loss": 0.2563, "step": 3950 }, { "epoch": 1.2169970800676195, "grad_norm": 2.125, "learning_rate": 7.833435771358328e-06, "loss": 0.2917, "step": 3960 }, { "epoch": 1.2200706930997387, "grad_norm": 2.453125, "learning_rate": 7.802704363859865e-06, "loss": 0.2385, "step": 3970 }, { "epoch": 1.223144306131858, "grad_norm": 1.984375, "learning_rate": 7.771972956361402e-06, "loss": 0.265, "step": 3980 }, { "epoch": 1.2262179191639773, "grad_norm": 2.75, "learning_rate": 7.741241548862939e-06, "loss": 0.2519, "step": 3990 }, { "epoch": 1.2292915321960964, "grad_norm": 2.0625, "learning_rate": 7.710510141364475e-06, "loss": 0.2523, "step": 4000 }, { "epoch": 1.2323651452282158, "grad_norm": 3.03125, "learning_rate": 7.679778733866012e-06, "loss": 0.2485, "step": 4010 }, { "epoch": 1.235438758260335, "grad_norm": 2.484375, "learning_rate": 7.649047326367549e-06, "loss": 0.2591, "step": 4020 }, { "epoch": 1.2385123712924542, "grad_norm": 2.265625, "learning_rate": 7.6183159188690845e-06, "loss": 0.2765, "step": 4030 }, { "epoch": 1.2415859843245736, "grad_norm": 2.921875, "learning_rate": 7.587584511370621e-06, "loss": 0.2541, "step": 4040 }, { "epoch": 1.2446595973566927, "grad_norm": 2.484375, "learning_rate": 7.556853103872158e-06, "loss": 0.2941, "step": 4050 }, { "epoch": 1.2477332103888121, "grad_norm": 2.265625, "learning_rate": 7.526121696373695e-06, "loss": 0.2644, "step": 4060 }, { "epoch": 1.2508068234209313, "grad_norm": 1.859375, "learning_rate": 7.4953902888752304e-06, "loss": 0.2743, "step": 4070 }, { "epoch": 1.2538804364530507, "grad_norm": 2.359375, "learning_rate": 7.464658881376767e-06, "loss": 0.2657, "step": 4080 }, { "epoch": 1.2569540494851699, "grad_norm": 2.578125, "learning_rate": 7.433927473878304e-06, "loss": 0.2711, "step": 4090 }, { "epoch": 1.260027662517289, "grad_norm": 2.828125, "learning_rate": 7.4031960663798405e-06, "loss": 0.2613, "step": 4100 }, { "epoch": 1.2631012755494084, "grad_norm": 2.65625, "learning_rate": 7.372464658881377e-06, "loss": 0.2861, "step": 4110 }, { "epoch": 1.2661748885815276, "grad_norm": 2.125, "learning_rate": 7.341733251382914e-06, "loss": 0.2608, "step": 4120 }, { "epoch": 1.2692485016136468, "grad_norm": 2.90625, "learning_rate": 7.311001843884451e-06, "loss": 0.268, "step": 4130 }, { "epoch": 1.2723221146457662, "grad_norm": 2.78125, "learning_rate": 7.2802704363859865e-06, "loss": 0.2485, "step": 4140 }, { "epoch": 1.2753957276778853, "grad_norm": 1.7890625, "learning_rate": 7.249539028887523e-06, "loss": 0.2528, "step": 4150 }, { "epoch": 1.2784693407100045, "grad_norm": 2.15625, "learning_rate": 7.21880762138906e-06, "loss": 0.2659, "step": 4160 }, { "epoch": 1.281542953742124, "grad_norm": 2.359375, "learning_rate": 7.1880762138905965e-06, "loss": 0.2727, "step": 4170 }, { "epoch": 1.284616566774243, "grad_norm": 2.21875, "learning_rate": 7.157344806392133e-06, "loss": 0.277, "step": 4180 }, { "epoch": 1.2876901798063622, "grad_norm": 3.0, "learning_rate": 7.12661339889367e-06, "loss": 0.2812, "step": 4190 }, { "epoch": 1.2907637928384816, "grad_norm": 2.34375, "learning_rate": 7.095881991395206e-06, "loss": 0.2952, "step": 4200 }, { "epoch": 1.2938374058706008, "grad_norm": 2.28125, "learning_rate": 7.0651505838967425e-06, "loss": 0.2447, "step": 4210 }, { "epoch": 1.2969110189027202, "grad_norm": 2.78125, "learning_rate": 7.034419176398279e-06, "loss": 0.2678, "step": 4220 }, { "epoch": 1.2999846319348394, "grad_norm": 1.8359375, "learning_rate": 7.003687768899816e-06, "loss": 0.2617, "step": 4230 }, { "epoch": 1.3030582449669588, "grad_norm": 2.140625, "learning_rate": 6.9729563614013526e-06, "loss": 0.2657, "step": 4240 }, { "epoch": 1.306131857999078, "grad_norm": 2.65625, "learning_rate": 6.942224953902889e-06, "loss": 0.28, "step": 4250 }, { "epoch": 1.309205471031197, "grad_norm": 2.5, "learning_rate": 6.911493546404427e-06, "loss": 0.2695, "step": 4260 }, { "epoch": 1.3122790840633165, "grad_norm": 2.515625, "learning_rate": 6.880762138905962e-06, "loss": 0.2686, "step": 4270 }, { "epoch": 1.3153526970954357, "grad_norm": 2.3125, "learning_rate": 6.8500307314074985e-06, "loss": 0.2472, "step": 4280 }, { "epoch": 1.3184263101275548, "grad_norm": 3.21875, "learning_rate": 6.819299323909035e-06, "loss": 0.2498, "step": 4290 }, { "epoch": 1.3214999231596742, "grad_norm": 3.03125, "learning_rate": 6.788567916410572e-06, "loss": 0.2758, "step": 4300 }, { "epoch": 1.3245735361917934, "grad_norm": 2.484375, "learning_rate": 6.7578365089121086e-06, "loss": 0.2772, "step": 4310 }, { "epoch": 1.3276471492239126, "grad_norm": 3.078125, "learning_rate": 6.727105101413646e-06, "loss": 0.2693, "step": 4320 }, { "epoch": 1.330720762256032, "grad_norm": 2.140625, "learning_rate": 6.696373693915181e-06, "loss": 0.275, "step": 4330 }, { "epoch": 1.3337943752881511, "grad_norm": 2.625, "learning_rate": 6.665642286416718e-06, "loss": 0.2852, "step": 4340 }, { "epoch": 1.3368679883202705, "grad_norm": 2.59375, "learning_rate": 6.6349108789182545e-06, "loss": 0.2865, "step": 4350 }, { "epoch": 1.3399416013523897, "grad_norm": 2.296875, "learning_rate": 6.604179471419791e-06, "loss": 0.2714, "step": 4360 }, { "epoch": 1.343015214384509, "grad_norm": 2.515625, "learning_rate": 6.573448063921328e-06, "loss": 0.2551, "step": 4370 }, { "epoch": 1.3460888274166283, "grad_norm": 1.984375, "learning_rate": 6.5427166564228654e-06, "loss": 0.255, "step": 4380 }, { "epoch": 1.3491624404487474, "grad_norm": 2.328125, "learning_rate": 6.511985248924402e-06, "loss": 0.236, "step": 4390 }, { "epoch": 1.3522360534808668, "grad_norm": 1.9453125, "learning_rate": 6.481253841425937e-06, "loss": 0.2599, "step": 4400 }, { "epoch": 1.355309666512986, "grad_norm": 2.546875, "learning_rate": 6.450522433927474e-06, "loss": 0.2586, "step": 4410 }, { "epoch": 1.3583832795451052, "grad_norm": 1.8515625, "learning_rate": 6.4197910264290105e-06, "loss": 0.2531, "step": 4420 }, { "epoch": 1.3614568925772246, "grad_norm": 2.171875, "learning_rate": 6.389059618930547e-06, "loss": 0.2543, "step": 4430 }, { "epoch": 1.3645305056093437, "grad_norm": 2.6875, "learning_rate": 6.358328211432085e-06, "loss": 0.2641, "step": 4440 }, { "epoch": 1.367604118641463, "grad_norm": 2.328125, "learning_rate": 6.3275968039336215e-06, "loss": 0.2631, "step": 4450 }, { "epoch": 1.3706777316735823, "grad_norm": 2.5625, "learning_rate": 6.2968653964351565e-06, "loss": 0.2782, "step": 4460 }, { "epoch": 1.3737513447057015, "grad_norm": 2.765625, "learning_rate": 6.266133988936693e-06, "loss": 0.2614, "step": 4470 }, { "epoch": 1.3768249577378209, "grad_norm": 1.984375, "learning_rate": 6.23540258143823e-06, "loss": 0.2633, "step": 4480 }, { "epoch": 1.37989857076994, "grad_norm": 2.515625, "learning_rate": 6.2046711739397665e-06, "loss": 0.2587, "step": 4490 }, { "epoch": 1.3829721838020594, "grad_norm": 2.015625, "learning_rate": 6.173939766441303e-06, "loss": 0.2503, "step": 4500 }, { "epoch": 1.3860457968341786, "grad_norm": 2.0, "learning_rate": 6.143208358942841e-06, "loss": 0.2426, "step": 4510 }, { "epoch": 1.3891194098662978, "grad_norm": 2.890625, "learning_rate": 6.1124769514443775e-06, "loss": 0.2734, "step": 4520 }, { "epoch": 1.3921930228984172, "grad_norm": 2.484375, "learning_rate": 6.0817455439459125e-06, "loss": 0.2667, "step": 4530 }, { "epoch": 1.3952666359305363, "grad_norm": 2.53125, "learning_rate": 6.051014136447449e-06, "loss": 0.2595, "step": 4540 }, { "epoch": 1.3983402489626555, "grad_norm": 2.015625, "learning_rate": 6.020282728948986e-06, "loss": 0.2494, "step": 4550 }, { "epoch": 1.401413861994775, "grad_norm": 2.453125, "learning_rate": 5.9895513214505226e-06, "loss": 0.207, "step": 4560 }, { "epoch": 1.404487475026894, "grad_norm": 2.375, "learning_rate": 5.95881991395206e-06, "loss": 0.2547, "step": 4570 }, { "epoch": 1.4075610880590133, "grad_norm": 2.421875, "learning_rate": 5.928088506453597e-06, "loss": 0.2823, "step": 4580 }, { "epoch": 1.4106347010911326, "grad_norm": 2.4375, "learning_rate": 5.897357098955132e-06, "loss": 0.2654, "step": 4590 }, { "epoch": 1.4137083141232518, "grad_norm": 2.359375, "learning_rate": 5.8666256914566685e-06, "loss": 0.2591, "step": 4600 }, { "epoch": 1.4167819271553712, "grad_norm": 2.671875, "learning_rate": 5.835894283958205e-06, "loss": 0.2856, "step": 4610 }, { "epoch": 1.4198555401874904, "grad_norm": 2.3125, "learning_rate": 5.805162876459742e-06, "loss": 0.2865, "step": 4620 }, { "epoch": 1.4229291532196098, "grad_norm": 2.28125, "learning_rate": 5.7744314689612794e-06, "loss": 0.2241, "step": 4630 }, { "epoch": 1.426002766251729, "grad_norm": 2.578125, "learning_rate": 5.743700061462816e-06, "loss": 0.2939, "step": 4640 }, { "epoch": 1.4290763792838481, "grad_norm": 2.375, "learning_rate": 5.712968653964353e-06, "loss": 0.2837, "step": 4650 }, { "epoch": 1.4321499923159675, "grad_norm": 1.796875, "learning_rate": 5.682237246465888e-06, "loss": 0.2509, "step": 4660 }, { "epoch": 1.4352236053480867, "grad_norm": 2.296875, "learning_rate": 5.6515058389674245e-06, "loss": 0.2812, "step": 4670 }, { "epoch": 1.4382972183802059, "grad_norm": 2.28125, "learning_rate": 5.620774431468961e-06, "loss": 0.2667, "step": 4680 }, { "epoch": 1.4413708314123252, "grad_norm": 1.3125, "learning_rate": 5.590043023970499e-06, "loss": 0.2411, "step": 4690 }, { "epoch": 1.4444444444444444, "grad_norm": 2.234375, "learning_rate": 5.5593116164720354e-06, "loss": 0.2485, "step": 4700 }, { "epoch": 1.4475180574765636, "grad_norm": 2.609375, "learning_rate": 5.528580208973572e-06, "loss": 0.2606, "step": 4710 }, { "epoch": 1.450591670508683, "grad_norm": 2.8125, "learning_rate": 5.497848801475107e-06, "loss": 0.2536, "step": 4720 }, { "epoch": 1.4536652835408022, "grad_norm": 2.578125, "learning_rate": 5.467117393976644e-06, "loss": 0.2716, "step": 4730 }, { "epoch": 1.4567388965729215, "grad_norm": 1.8671875, "learning_rate": 5.4363859864781805e-06, "loss": 0.2297, "step": 4740 }, { "epoch": 1.4598125096050407, "grad_norm": 2.078125, "learning_rate": 5.405654578979718e-06, "loss": 0.2937, "step": 4750 }, { "epoch": 1.4628861226371601, "grad_norm": 2.375, "learning_rate": 5.374923171481255e-06, "loss": 0.2927, "step": 4760 }, { "epoch": 1.4659597356692793, "grad_norm": 2.421875, "learning_rate": 5.3441917639827915e-06, "loss": 0.2757, "step": 4770 }, { "epoch": 1.4690333487013985, "grad_norm": 1.9765625, "learning_rate": 5.313460356484328e-06, "loss": 0.2751, "step": 4780 }, { "epoch": 1.4721069617335178, "grad_norm": 2.6875, "learning_rate": 5.282728948985863e-06, "loss": 0.2831, "step": 4790 }, { "epoch": 1.475180574765637, "grad_norm": 2.078125, "learning_rate": 5.2519975414874e-06, "loss": 0.2273, "step": 4800 }, { "epoch": 1.4782541877977562, "grad_norm": 2.046875, "learning_rate": 5.221266133988937e-06, "loss": 0.265, "step": 4810 }, { "epoch": 1.4813278008298756, "grad_norm": 2.515625, "learning_rate": 5.190534726490474e-06, "loss": 0.2605, "step": 4820 }, { "epoch": 1.4844014138619948, "grad_norm": 2.25, "learning_rate": 5.159803318992011e-06, "loss": 0.2595, "step": 4830 }, { "epoch": 1.487475026894114, "grad_norm": 2.25, "learning_rate": 5.1290719114935475e-06, "loss": 0.2491, "step": 4840 }, { "epoch": 1.4905486399262333, "grad_norm": 1.9921875, "learning_rate": 5.0983405039950825e-06, "loss": 0.2739, "step": 4850 }, { "epoch": 1.4936222529583525, "grad_norm": 2.71875, "learning_rate": 5.067609096496619e-06, "loss": 0.2642, "step": 4860 }, { "epoch": 1.4966958659904717, "grad_norm": 2.59375, "learning_rate": 5.036877688998157e-06, "loss": 0.2876, "step": 4870 }, { "epoch": 1.499769479022591, "grad_norm": 2.34375, "learning_rate": 5.006146281499693e-06, "loss": 0.2167, "step": 4880 }, { "epoch": 1.5028430920547104, "grad_norm": 1.953125, "learning_rate": 4.97541487400123e-06, "loss": 0.2668, "step": 4890 }, { "epoch": 1.5059167050868296, "grad_norm": 2.703125, "learning_rate": 4.944683466502766e-06, "loss": 0.274, "step": 4900 }, { "epoch": 1.5089903181189488, "grad_norm": 2.125, "learning_rate": 4.913952059004303e-06, "loss": 0.2559, "step": 4910 }, { "epoch": 1.5120639311510682, "grad_norm": 2.28125, "learning_rate": 4.883220651505839e-06, "loss": 0.253, "step": 4920 }, { "epoch": 1.5151375441831874, "grad_norm": 2.484375, "learning_rate": 4.852489244007376e-06, "loss": 0.2538, "step": 4930 }, { "epoch": 1.5182111572153065, "grad_norm": 2.75, "learning_rate": 4.821757836508913e-06, "loss": 0.2763, "step": 4940 }, { "epoch": 1.521284770247426, "grad_norm": 2.203125, "learning_rate": 4.7910264290104494e-06, "loss": 0.2621, "step": 4950 }, { "epoch": 1.524358383279545, "grad_norm": 2.21875, "learning_rate": 4.760295021511986e-06, "loss": 0.269, "step": 4960 }, { "epoch": 1.5274319963116643, "grad_norm": 2.265625, "learning_rate": 4.729563614013522e-06, "loss": 0.2617, "step": 4970 }, { "epoch": 1.5305056093437837, "grad_norm": 2.359375, "learning_rate": 4.698832206515059e-06, "loss": 0.2691, "step": 4980 }, { "epoch": 1.5335792223759028, "grad_norm": 2.703125, "learning_rate": 4.668100799016595e-06, "loss": 0.2592, "step": 4990 }, { "epoch": 1.536652835408022, "grad_norm": 2.890625, "learning_rate": 4.637369391518132e-06, "loss": 0.2441, "step": 5000 }, { "epoch": 1.5397264484401414, "grad_norm": 2.140625, "learning_rate": 4.606637984019669e-06, "loss": 0.2212, "step": 5010 }, { "epoch": 1.5428000614722608, "grad_norm": 2.75, "learning_rate": 4.5759065765212054e-06, "loss": 0.2572, "step": 5020 }, { "epoch": 1.5458736745043797, "grad_norm": 2.5625, "learning_rate": 4.545175169022741e-06, "loss": 0.2554, "step": 5030 }, { "epoch": 1.5489472875364991, "grad_norm": 2.75, "learning_rate": 4.514443761524278e-06, "loss": 0.2633, "step": 5040 }, { "epoch": 1.5520209005686185, "grad_norm": 3.5625, "learning_rate": 4.483712354025815e-06, "loss": 0.2433, "step": 5050 }, { "epoch": 1.5550945136007377, "grad_norm": 2.796875, "learning_rate": 4.452980946527351e-06, "loss": 0.284, "step": 5060 }, { "epoch": 1.5581681266328569, "grad_norm": 3.53125, "learning_rate": 4.422249539028888e-06, "loss": 0.2655, "step": 5070 }, { "epoch": 1.5612417396649763, "grad_norm": 2.171875, "learning_rate": 4.391518131530425e-06, "loss": 0.2448, "step": 5080 }, { "epoch": 1.5643153526970954, "grad_norm": 2.65625, "learning_rate": 4.3607867240319615e-06, "loss": 0.2536, "step": 5090 }, { "epoch": 1.5673889657292146, "grad_norm": 2.3125, "learning_rate": 4.330055316533497e-06, "loss": 0.2619, "step": 5100 }, { "epoch": 1.570462578761334, "grad_norm": 2.265625, "learning_rate": 4.299323909035034e-06, "loss": 0.2762, "step": 5110 }, { "epoch": 1.5735361917934532, "grad_norm": 2.296875, "learning_rate": 4.268592501536571e-06, "loss": 0.2555, "step": 5120 }, { "epoch": 1.5766098048255723, "grad_norm": 2.046875, "learning_rate": 4.237861094038107e-06, "loss": 0.2457, "step": 5130 }, { "epoch": 1.5796834178576917, "grad_norm": 2.578125, "learning_rate": 4.207129686539644e-06, "loss": 0.27, "step": 5140 }, { "epoch": 1.5827570308898111, "grad_norm": 2.1875, "learning_rate": 4.176398279041181e-06, "loss": 0.2525, "step": 5150 }, { "epoch": 1.58583064392193, "grad_norm": 2.390625, "learning_rate": 4.145666871542717e-06, "loss": 0.2788, "step": 5160 }, { "epoch": 1.5889042569540495, "grad_norm": 2.46875, "learning_rate": 4.114935464044253e-06, "loss": 0.2732, "step": 5170 }, { "epoch": 1.5919778699861689, "grad_norm": 2.359375, "learning_rate": 4.08420405654579e-06, "loss": 0.2415, "step": 5180 }, { "epoch": 1.595051483018288, "grad_norm": 2.5625, "learning_rate": 4.053472649047327e-06, "loss": 0.2461, "step": 5190 }, { "epoch": 1.5981250960504072, "grad_norm": 2.46875, "learning_rate": 4.022741241548863e-06, "loss": 0.2554, "step": 5200 }, { "epoch": 1.6011987090825266, "grad_norm": 2.5, "learning_rate": 3.9920098340504e-06, "loss": 0.2824, "step": 5210 }, { "epoch": 1.6042723221146458, "grad_norm": 2.328125, "learning_rate": 3.961278426551937e-06, "loss": 0.2289, "step": 5220 }, { "epoch": 1.607345935146765, "grad_norm": 2.453125, "learning_rate": 3.930547019053473e-06, "loss": 0.2515, "step": 5230 }, { "epoch": 1.6104195481788843, "grad_norm": 2.703125, "learning_rate": 3.899815611555009e-06, "loss": 0.2636, "step": 5240 }, { "epoch": 1.6134931612110035, "grad_norm": 2.375, "learning_rate": 3.869084204056546e-06, "loss": 0.2656, "step": 5250 }, { "epoch": 1.6165667742431227, "grad_norm": 2.4375, "learning_rate": 3.838352796558083e-06, "loss": 0.2758, "step": 5260 }, { "epoch": 1.619640387275242, "grad_norm": 2.375, "learning_rate": 3.807621389059619e-06, "loss": 0.2656, "step": 5270 }, { "epoch": 1.6227140003073615, "grad_norm": 2.453125, "learning_rate": 3.776889981561156e-06, "loss": 0.2808, "step": 5280 }, { "epoch": 1.6257876133394804, "grad_norm": 2.140625, "learning_rate": 3.746158574062692e-06, "loss": 0.2827, "step": 5290 }, { "epoch": 1.6288612263715998, "grad_norm": 2.265625, "learning_rate": 3.7154271665642287e-06, "loss": 0.2532, "step": 5300 }, { "epoch": 1.6319348394037192, "grad_norm": 2.90625, "learning_rate": 3.6846957590657658e-06, "loss": 0.2599, "step": 5310 }, { "epoch": 1.6350084524358384, "grad_norm": 2.625, "learning_rate": 3.6539643515673025e-06, "loss": 0.2759, "step": 5320 }, { "epoch": 1.6380820654679575, "grad_norm": 2.640625, "learning_rate": 3.6232329440688383e-06, "loss": 0.262, "step": 5330 }, { "epoch": 1.641155678500077, "grad_norm": 2.40625, "learning_rate": 3.5925015365703754e-06, "loss": 0.2804, "step": 5340 }, { "epoch": 1.644229291532196, "grad_norm": 2.15625, "learning_rate": 3.561770129071912e-06, "loss": 0.2631, "step": 5350 }, { "epoch": 1.6473029045643153, "grad_norm": 2.28125, "learning_rate": 3.531038721573448e-06, "loss": 0.2603, "step": 5360 }, { "epoch": 1.6503765175964347, "grad_norm": 1.9921875, "learning_rate": 3.500307314074985e-06, "loss": 0.234, "step": 5370 }, { "epoch": 1.6534501306285538, "grad_norm": 1.859375, "learning_rate": 3.469575906576522e-06, "loss": 0.2589, "step": 5380 }, { "epoch": 1.656523743660673, "grad_norm": 2.71875, "learning_rate": 3.4388444990780576e-06, "loss": 0.2643, "step": 5390 }, { "epoch": 1.6595973566927924, "grad_norm": 2.015625, "learning_rate": 3.4081130915795948e-06, "loss": 0.252, "step": 5400 }, { "epoch": 1.6626709697249118, "grad_norm": 2.34375, "learning_rate": 3.3773816840811315e-06, "loss": 0.2721, "step": 5410 }, { "epoch": 1.6657445827570307, "grad_norm": 2.515625, "learning_rate": 3.3466502765826673e-06, "loss": 0.2749, "step": 5420 }, { "epoch": 1.6688181957891501, "grad_norm": 2.59375, "learning_rate": 3.3159188690842044e-06, "loss": 0.2675, "step": 5430 }, { "epoch": 1.6718918088212695, "grad_norm": 2.765625, "learning_rate": 3.285187461585741e-06, "loss": 0.2348, "step": 5440 }, { "epoch": 1.6749654218533887, "grad_norm": 2.25, "learning_rate": 3.254456054087278e-06, "loss": 0.2552, "step": 5450 }, { "epoch": 1.6780390348855079, "grad_norm": 2.140625, "learning_rate": 3.223724646588814e-06, "loss": 0.306, "step": 5460 }, { "epoch": 1.6811126479176273, "grad_norm": 2.328125, "learning_rate": 3.1929932390903508e-06, "loss": 0.2652, "step": 5470 }, { "epoch": 1.6841862609497464, "grad_norm": 2.15625, "learning_rate": 3.1622618315918875e-06, "loss": 0.2572, "step": 5480 }, { "epoch": 1.6872598739818656, "grad_norm": 2.125, "learning_rate": 3.1315304240934238e-06, "loss": 0.2807, "step": 5490 }, { "epoch": 1.690333487013985, "grad_norm": 2.578125, "learning_rate": 3.1007990165949604e-06, "loss": 0.2612, "step": 5500 }, { "epoch": 1.6934071000461042, "grad_norm": 2.234375, "learning_rate": 3.070067609096497e-06, "loss": 0.2599, "step": 5510 }, { "epoch": 1.6964807130782233, "grad_norm": 2.375, "learning_rate": 3.0393362015980334e-06, "loss": 0.2675, "step": 5520 }, { "epoch": 1.6995543261103427, "grad_norm": 1.9140625, "learning_rate": 3.00860479409957e-06, "loss": 0.2352, "step": 5530 }, { "epoch": 1.702627939142462, "grad_norm": 1.875, "learning_rate": 2.977873386601107e-06, "loss": 0.262, "step": 5540 }, { "epoch": 1.705701552174581, "grad_norm": 2.53125, "learning_rate": 2.947141979102643e-06, "loss": 0.2622, "step": 5550 }, { "epoch": 1.7087751652067005, "grad_norm": 2.796875, "learning_rate": 2.9164105716041798e-06, "loss": 0.2577, "step": 5560 }, { "epoch": 1.7118487782388199, "grad_norm": 2.25, "learning_rate": 2.8856791641057165e-06, "loss": 0.2688, "step": 5570 }, { "epoch": 1.714922391270939, "grad_norm": 3.0, "learning_rate": 2.854947756607253e-06, "loss": 0.2794, "step": 5580 }, { "epoch": 1.7179960043030582, "grad_norm": 2.328125, "learning_rate": 2.8242163491087894e-06, "loss": 0.2451, "step": 5590 }, { "epoch": 1.7210696173351776, "grad_norm": 2.46875, "learning_rate": 2.793484941610326e-06, "loss": 0.2478, "step": 5600 }, { "epoch": 1.7241432303672968, "grad_norm": 2.25, "learning_rate": 2.762753534111863e-06, "loss": 0.2533, "step": 5610 }, { "epoch": 1.727216843399416, "grad_norm": 2.359375, "learning_rate": 2.732022126613399e-06, "loss": 0.2803, "step": 5620 }, { "epoch": 1.7302904564315353, "grad_norm": 2.4375, "learning_rate": 2.7012907191149358e-06, "loss": 0.236, "step": 5630 }, { "epoch": 1.7333640694636545, "grad_norm": 2.765625, "learning_rate": 2.6705593116164725e-06, "loss": 0.2735, "step": 5640 }, { "epoch": 1.7364376824957737, "grad_norm": 2.328125, "learning_rate": 2.6398279041180088e-06, "loss": 0.2486, "step": 5650 }, { "epoch": 1.739511295527893, "grad_norm": 2.46875, "learning_rate": 2.6090964966195454e-06, "loss": 0.2673, "step": 5660 }, { "epoch": 1.7425849085600122, "grad_norm": 2.171875, "learning_rate": 2.578365089121082e-06, "loss": 0.2445, "step": 5670 }, { "epoch": 1.7456585215921314, "grad_norm": 2.828125, "learning_rate": 2.5476336816226184e-06, "loss": 0.2702, "step": 5680 }, { "epoch": 1.7487321346242508, "grad_norm": 2.3125, "learning_rate": 2.516902274124155e-06, "loss": 0.2502, "step": 5690 }, { "epoch": 1.7518057476563702, "grad_norm": 2.296875, "learning_rate": 2.486170866625692e-06, "loss": 0.2615, "step": 5700 }, { "epoch": 1.7548793606884892, "grad_norm": 1.8828125, "learning_rate": 2.455439459127228e-06, "loss": 0.2696, "step": 5710 }, { "epoch": 1.7579529737206085, "grad_norm": 2.3125, "learning_rate": 2.4247080516287648e-06, "loss": 0.2489, "step": 5720 }, { "epoch": 1.761026586752728, "grad_norm": 2.84375, "learning_rate": 2.3939766441303015e-06, "loss": 0.2448, "step": 5730 }, { "epoch": 1.7641001997848471, "grad_norm": 2.875, "learning_rate": 2.3632452366318377e-06, "loss": 0.2512, "step": 5740 }, { "epoch": 1.7671738128169663, "grad_norm": 2.125, "learning_rate": 2.3325138291333744e-06, "loss": 0.2481, "step": 5750 }, { "epoch": 1.7702474258490857, "grad_norm": 2.90625, "learning_rate": 2.301782421634911e-06, "loss": 0.2504, "step": 5760 }, { "epoch": 1.7733210388812048, "grad_norm": 2.515625, "learning_rate": 2.2710510141364474e-06, "loss": 0.2581, "step": 5770 }, { "epoch": 1.776394651913324, "grad_norm": 2.40625, "learning_rate": 2.240319606637984e-06, "loss": 0.2627, "step": 5780 }, { "epoch": 1.7794682649454434, "grad_norm": 2.40625, "learning_rate": 2.2095881991395208e-06, "loss": 0.25, "step": 5790 }, { "epoch": 1.7825418779775626, "grad_norm": 2.390625, "learning_rate": 2.1788567916410575e-06, "loss": 0.2664, "step": 5800 }, { "epoch": 1.7856154910096818, "grad_norm": 2.671875, "learning_rate": 2.1481253841425938e-06, "loss": 0.2627, "step": 5810 }, { "epoch": 1.7886891040418011, "grad_norm": 2.359375, "learning_rate": 2.1173939766441304e-06, "loss": 0.2092, "step": 5820 }, { "epoch": 1.7917627170739205, "grad_norm": 2.390625, "learning_rate": 2.086662569145667e-06, "loss": 0.2603, "step": 5830 }, { "epoch": 1.7948363301060395, "grad_norm": 1.8984375, "learning_rate": 2.0559311616472034e-06, "loss": 0.2244, "step": 5840 }, { "epoch": 1.7979099431381589, "grad_norm": 2.078125, "learning_rate": 2.02519975414874e-06, "loss": 0.2542, "step": 5850 }, { "epoch": 1.8009835561702783, "grad_norm": 1.953125, "learning_rate": 1.994468346650277e-06, "loss": 0.2478, "step": 5860 }, { "epoch": 1.8040571692023974, "grad_norm": 2.515625, "learning_rate": 1.963736939151813e-06, "loss": 0.2253, "step": 5870 }, { "epoch": 1.8071307822345166, "grad_norm": 1.9296875, "learning_rate": 1.9330055316533498e-06, "loss": 0.2727, "step": 5880 }, { "epoch": 1.810204395266636, "grad_norm": 2.8125, "learning_rate": 1.9022741241548865e-06, "loss": 0.273, "step": 5890 }, { "epoch": 1.8132780082987552, "grad_norm": 1.96875, "learning_rate": 1.871542716656423e-06, "loss": 0.2747, "step": 5900 }, { "epoch": 1.8163516213308744, "grad_norm": 2.09375, "learning_rate": 1.8408113091579596e-06, "loss": 0.2624, "step": 5910 }, { "epoch": 1.8194252343629937, "grad_norm": 2.71875, "learning_rate": 1.8100799016594961e-06, "loss": 0.2392, "step": 5920 }, { "epoch": 1.822498847395113, "grad_norm": 2.375, "learning_rate": 1.7793484941610328e-06, "loss": 0.2444, "step": 5930 }, { "epoch": 1.825572460427232, "grad_norm": 2.484375, "learning_rate": 1.7486170866625693e-06, "loss": 0.2719, "step": 5940 }, { "epoch": 1.8286460734593515, "grad_norm": 2.671875, "learning_rate": 1.7178856791641058e-06, "loss": 0.2604, "step": 5950 }, { "epoch": 1.8317196864914709, "grad_norm": 2.46875, "learning_rate": 1.6871542716656425e-06, "loss": 0.2628, "step": 5960 }, { "epoch": 1.8347932995235898, "grad_norm": 2.140625, "learning_rate": 1.656422864167179e-06, "loss": 0.2842, "step": 5970 }, { "epoch": 1.8378669125557092, "grad_norm": 2.859375, "learning_rate": 1.6256914566687157e-06, "loss": 0.2726, "step": 5980 }, { "epoch": 1.8409405255878286, "grad_norm": 2.21875, "learning_rate": 1.5949600491702521e-06, "loss": 0.2798, "step": 5990 }, { "epoch": 1.8440141386199478, "grad_norm": 2.5, "learning_rate": 1.5642286416717886e-06, "loss": 0.2865, "step": 6000 }, { "epoch": 1.847087751652067, "grad_norm": 2.71875, "learning_rate": 1.5334972341733253e-06, "loss": 0.2594, "step": 6010 }, { "epoch": 1.8501613646841863, "grad_norm": 2.703125, "learning_rate": 1.5027658266748618e-06, "loss": 0.2585, "step": 6020 }, { "epoch": 1.8532349777163055, "grad_norm": 2.8125, "learning_rate": 1.4720344191763983e-06, "loss": 0.2758, "step": 6030 }, { "epoch": 1.8563085907484247, "grad_norm": 2.34375, "learning_rate": 1.441303011677935e-06, "loss": 0.2722, "step": 6040 }, { "epoch": 1.859382203780544, "grad_norm": 2.1875, "learning_rate": 1.4105716041794715e-06, "loss": 0.2812, "step": 6050 }, { "epoch": 1.8624558168126633, "grad_norm": 2.796875, "learning_rate": 1.3798401966810082e-06, "loss": 0.2263, "step": 6060 }, { "epoch": 1.8655294298447824, "grad_norm": 2.21875, "learning_rate": 1.3491087891825446e-06, "loss": 0.2759, "step": 6070 }, { "epoch": 1.8686030428769018, "grad_norm": 2.59375, "learning_rate": 1.3183773816840811e-06, "loss": 0.2584, "step": 6080 }, { "epoch": 1.8716766559090212, "grad_norm": 2.484375, "learning_rate": 1.2876459741856178e-06, "loss": 0.2728, "step": 6090 }, { "epoch": 1.8747502689411402, "grad_norm": 2.296875, "learning_rate": 1.2569145666871543e-06, "loss": 0.2466, "step": 6100 }, { "epoch": 1.8778238819732596, "grad_norm": 2.6875, "learning_rate": 1.226183159188691e-06, "loss": 0.2428, "step": 6110 }, { "epoch": 1.880897495005379, "grad_norm": 2.90625, "learning_rate": 1.1954517516902275e-06, "loss": 0.2383, "step": 6120 }, { "epoch": 1.8839711080374981, "grad_norm": 2.515625, "learning_rate": 1.1647203441917642e-06, "loss": 0.2902, "step": 6130 }, { "epoch": 1.8870447210696173, "grad_norm": 2.796875, "learning_rate": 1.1339889366933007e-06, "loss": 0.2328, "step": 6140 }, { "epoch": 1.8901183341017367, "grad_norm": 2.203125, "learning_rate": 1.1032575291948371e-06, "loss": 0.2504, "step": 6150 }, { "epoch": 1.8931919471338559, "grad_norm": 2.25, "learning_rate": 1.0725261216963738e-06, "loss": 0.2605, "step": 6160 }, { "epoch": 1.896265560165975, "grad_norm": 2.625, "learning_rate": 1.0417947141979103e-06, "loss": 0.2615, "step": 6170 }, { "epoch": 1.8993391731980944, "grad_norm": 3.03125, "learning_rate": 1.0110633066994468e-06, "loss": 0.2613, "step": 6180 }, { "epoch": 1.9024127862302136, "grad_norm": 2.828125, "learning_rate": 9.803318992009835e-07, "loss": 0.2761, "step": 6190 }, { "epoch": 1.9054863992623328, "grad_norm": 2.078125, "learning_rate": 9.496004917025201e-07, "loss": 0.2589, "step": 6200 }, { "epoch": 1.9085600122944522, "grad_norm": 2.671875, "learning_rate": 9.188690842040567e-07, "loss": 0.2762, "step": 6210 }, { "epoch": 1.9116336253265716, "grad_norm": 2.40625, "learning_rate": 8.881376767055933e-07, "loss": 0.2326, "step": 6220 }, { "epoch": 1.9147072383586905, "grad_norm": 2.484375, "learning_rate": 8.574062692071297e-07, "loss": 0.2404, "step": 6230 }, { "epoch": 1.91778085139081, "grad_norm": 2.609375, "learning_rate": 8.266748617086663e-07, "loss": 0.2708, "step": 6240 }, { "epoch": 1.9208544644229293, "grad_norm": 1.796875, "learning_rate": 7.959434542102029e-07, "loss": 0.2478, "step": 6250 }, { "epoch": 1.9239280774550485, "grad_norm": 2.078125, "learning_rate": 7.652120467117395e-07, "loss": 0.2615, "step": 6260 }, { "epoch": 1.9270016904871676, "grad_norm": 2.640625, "learning_rate": 7.34480639213276e-07, "loss": 0.2484, "step": 6270 }, { "epoch": 1.930075303519287, "grad_norm": 2.296875, "learning_rate": 7.037492317148126e-07, "loss": 0.2866, "step": 6280 }, { "epoch": 1.9331489165514062, "grad_norm": 2.328125, "learning_rate": 6.730178242163492e-07, "loss": 0.2632, "step": 6290 }, { "epoch": 1.9362225295835254, "grad_norm": 2.8125, "learning_rate": 6.422864167178858e-07, "loss": 0.2576, "step": 6300 }, { "epoch": 1.9392961426156448, "grad_norm": 2.125, "learning_rate": 6.115550092194224e-07, "loss": 0.2428, "step": 6310 }, { "epoch": 1.942369755647764, "grad_norm": 2.1875, "learning_rate": 5.808236017209588e-07, "loss": 0.2435, "step": 6320 }, { "epoch": 1.945443368679883, "grad_norm": 1.8515625, "learning_rate": 5.500921942224954e-07, "loss": 0.2537, "step": 6330 }, { "epoch": 1.9485169817120025, "grad_norm": 2.09375, "learning_rate": 5.19360786724032e-07, "loss": 0.2701, "step": 6340 }, { "epoch": 1.9515905947441217, "grad_norm": 2.515625, "learning_rate": 4.886293792255686e-07, "loss": 0.1977, "step": 6350 }, { "epoch": 1.9546642077762408, "grad_norm": 2.78125, "learning_rate": 4.5789797172710514e-07, "loss": 0.2635, "step": 6360 }, { "epoch": 1.9577378208083602, "grad_norm": 2.265625, "learning_rate": 4.271665642286417e-07, "loss": 0.2385, "step": 6370 }, { "epoch": 1.9608114338404796, "grad_norm": 2.546875, "learning_rate": 3.964351567301783e-07, "loss": 0.2657, "step": 6380 }, { "epoch": 1.9638850468725988, "grad_norm": 2.234375, "learning_rate": 3.657037492317148e-07, "loss": 0.2301, "step": 6390 }, { "epoch": 1.966958659904718, "grad_norm": 2.328125, "learning_rate": 3.3497234173325144e-07, "loss": 0.2379, "step": 6400 }, { "epoch": 1.9700322729368374, "grad_norm": 2.203125, "learning_rate": 3.04240934234788e-07, "loss": 0.2598, "step": 6410 }, { "epoch": 1.9731058859689565, "grad_norm": 2.546875, "learning_rate": 2.7350952673632457e-07, "loss": 0.253, "step": 6420 }, { "epoch": 1.9761794990010757, "grad_norm": 2.375, "learning_rate": 2.427781192378611e-07, "loss": 0.2614, "step": 6430 }, { "epoch": 1.979253112033195, "grad_norm": 1.8984375, "learning_rate": 2.120467117393977e-07, "loss": 0.2474, "step": 6440 }, { "epoch": 1.9823267250653143, "grad_norm": 1.8203125, "learning_rate": 1.8131530424093426e-07, "loss": 0.2825, "step": 6450 }, { "epoch": 1.9854003380974334, "grad_norm": 1.9921875, "learning_rate": 1.5058389674247082e-07, "loss": 0.2705, "step": 6460 }, { "epoch": 1.9884739511295528, "grad_norm": 1.8203125, "learning_rate": 1.1985248924400738e-07, "loss": 0.2716, "step": 6470 }, { "epoch": 1.991547564161672, "grad_norm": 2.25, "learning_rate": 8.912108174554396e-08, "loss": 0.2911, "step": 6480 }, { "epoch": 1.9946211771937912, "grad_norm": 2.40625, "learning_rate": 5.838967424708052e-08, "loss": 0.2304, "step": 6490 }, { "epoch": 1.9976947902259106, "grad_norm": 2.15625, "learning_rate": 2.7658266748617086e-08, "loss": 0.2298, "step": 6500 } ], "logging_steps": 10, "max_steps": 6508, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.59613886778096e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }