diff --git "a/plbart_ia3_official_1e-05/checkpoint-88308/trainer_state.json" "b/plbart_ia3_official_1e-05/checkpoint-88308/trainer_state.json" new file mode 100644--- /dev/null +++ "b/plbart_ia3_official_1e-05/checkpoint-88308/trainer_state.json" @@ -0,0 +1,123792 @@ +{ + "best_metric": 0.004714649665768724, + "best_model_checkpoint": "./results-cc/plbart/plbart_ia3_official_1e-05/checkpoint-88308", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 88308, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": 5.588343143463135, + "learning_rate": 9.999745209947004e-06, + "loss": 13.7496, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 4.147919654846191, + "learning_rate": 9.999320559858677e-06, + "loss": 13.4472, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 7.245415687561035, + "learning_rate": 9.99889590977035e-06, + "loss": 14.2089, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 5.466311931610107, + "learning_rate": 9.998471259682023e-06, + "loss": 13.5719, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 5.576236724853516, + "learning_rate": 9.998046609593695e-06, + "loss": 13.7265, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 5.830343246459961, + "learning_rate": 9.997621959505368e-06, + "loss": 13.8259, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": NaN, + "learning_rate": 9.997282239434707e-06, + "loss": 13.7542, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 5.09084415435791, + "learning_rate": 9.99685758934638e-06, + "loss": 13.7037, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 5.64856481552124, + "learning_rate": 9.996432939258051e-06, + "loss": 14.416, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 6.461962699890137, + "learning_rate": 9.996008289169725e-06, + "loss": 13.5441, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 5.128623962402344, + "learning_rate": 9.995583639081398e-06, + "loss": 13.4717, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 4.8374714851379395, + "learning_rate": 9.99515898899307e-06, + "loss": 13.585, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 5.24623966217041, + "learning_rate": 9.994734338904744e-06, + "loss": 13.8324, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 6.42460823059082, + "learning_rate": 9.994309688816417e-06, + "loss": 13.714, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 4.493538856506348, + "learning_rate": 9.993885038728088e-06, + "loss": 13.1869, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 4.684764385223389, + "learning_rate": 9.993460388639762e-06, + "loss": 13.2432, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 4.926094055175781, + "learning_rate": 9.993035738551435e-06, + "loss": 13.4791, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 5.749914646148682, + "learning_rate": 9.992611088463106e-06, + "loss": 13.3926, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 4.970900058746338, + "learning_rate": 9.99218643837478e-06, + "loss": 13.8674, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 5.175647258758545, + "learning_rate": 9.991761788286452e-06, + "loss": 13.5887, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 6.891249656677246, + "learning_rate": 9.991337138198125e-06, + "loss": 13.6452, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 4.981839179992676, + "learning_rate": 9.990912488109799e-06, + "loss": 13.7139, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 4.64729118347168, + "learning_rate": 9.99048783802147e-06, + "loss": 13.5354, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 4.944385051727295, + "learning_rate": 9.990063187933145e-06, + "loss": 13.4863, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 5.334909915924072, + "learning_rate": 9.989638537844817e-06, + "loss": 13.3032, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 4.8724684715271, + "learning_rate": 9.989213887756489e-06, + "loss": 13.3933, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 5.676644802093506, + "learning_rate": 9.988789237668163e-06, + "loss": 13.2966, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 5.262624740600586, + "learning_rate": 9.988364587579836e-06, + "loss": 13.3534, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 6.7558183670043945, + "learning_rate": 9.987939937491507e-06, + "loss": 13.3375, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 5.502933979034424, + "learning_rate": 9.987515287403181e-06, + "loss": 13.8999, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 4.300497531890869, + "learning_rate": 9.987090637314854e-06, + "loss": 12.7697, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 4.235837936401367, + "learning_rate": 9.986665987226525e-06, + "loss": 12.832, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 7.5555033683776855, + "learning_rate": 9.9862413371382e-06, + "loss": 12.6879, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 4.813079357147217, + "learning_rate": 9.985816687049871e-06, + "loss": 13.1028, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 4.587883472442627, + "learning_rate": 9.985392036961544e-06, + "loss": 13.0054, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 4.992321491241455, + "learning_rate": 9.984967386873218e-06, + "loss": 13.3979, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 6.254053115844727, + "learning_rate": 9.98454273678489e-06, + "loss": 13.2297, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 4.7956695556640625, + "learning_rate": 9.984118086696562e-06, + "loss": 12.9473, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 5.777144908905029, + "learning_rate": 9.983693436608237e-06, + "loss": 13.6165, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 6.089964866638184, + "learning_rate": 9.983268786519908e-06, + "loss": 13.1998, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 4.855130672454834, + "learning_rate": 9.98284413643158e-06, + "loss": 13.3092, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 5.239603042602539, + "learning_rate": 9.982419486343255e-06, + "loss": 13.1853, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 4.772551536560059, + "learning_rate": 9.981994836254926e-06, + "loss": 13.2721, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 4.601176738739014, + "learning_rate": 9.981570186166599e-06, + "loss": 13.2459, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 4.2293572425842285, + "learning_rate": 9.981145536078274e-06, + "loss": 12.7858, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 5.177498817443848, + "learning_rate": 9.980720885989945e-06, + "loss": 13.3869, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 5.383495807647705, + "learning_rate": 9.980296235901617e-06, + "loss": 13.3399, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 4.6775336265563965, + "learning_rate": 9.979871585813292e-06, + "loss": 12.4871, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 5.113440036773682, + "learning_rate": 9.979446935724963e-06, + "loss": 13.0117, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 5.332059860229492, + "learning_rate": 9.979022285636636e-06, + "loss": 13.3675, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 4.548219680786133, + "learning_rate": 9.978597635548309e-06, + "loss": 12.8393, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 5.225782871246338, + "learning_rate": 9.978172985459981e-06, + "loss": 13.1291, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 4.921344757080078, + "learning_rate": 9.977748335371654e-06, + "loss": 13.4252, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 4.333355903625488, + "learning_rate": 9.977323685283327e-06, + "loss": 12.9303, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 5.195517063140869, + "learning_rate": 9.976899035195e-06, + "loss": 12.5713, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 4.513863563537598, + "learning_rate": 9.976474385106673e-06, + "loss": 12.5133, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 4.881851673126221, + "learning_rate": 9.976049735018345e-06, + "loss": 12.6769, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 4.719494342803955, + "learning_rate": 9.975625084930018e-06, + "loss": 12.1796, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 5.196517467498779, + "learning_rate": 9.975200434841691e-06, + "loss": 12.6711, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 4.041914463043213, + "learning_rate": 9.974775784753364e-06, + "loss": 12.3575, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 4.807907581329346, + "learning_rate": 9.974351134665037e-06, + "loss": 13.0571, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 5.007236957550049, + "learning_rate": 9.97392648457671e-06, + "loss": 12.7422, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 4.651087760925293, + "learning_rate": 9.973501834488382e-06, + "loss": 12.9706, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 5.573067665100098, + "learning_rate": 9.973077184400055e-06, + "loss": 13.188, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 4.055250644683838, + "learning_rate": 9.972652534311728e-06, + "loss": 11.9324, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 4.174074172973633, + "learning_rate": 9.9722278842234e-06, + "loss": 12.3218, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 4.95819091796875, + "learning_rate": 9.971803234135073e-06, + "loss": 12.9804, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 5.362131595611572, + "learning_rate": 9.971378584046746e-06, + "loss": 12.2636, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 3.8360021114349365, + "learning_rate": 9.970953933958419e-06, + "loss": 12.632, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 4.776843547821045, + "learning_rate": 9.970529283870092e-06, + "loss": 12.1899, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 5.973950386047363, + "learning_rate": 9.970104633781765e-06, + "loss": 12.87, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 5.632570266723633, + "learning_rate": 9.969679983693437e-06, + "loss": 13.1513, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 4.927257537841797, + "learning_rate": 9.96925533360511e-06, + "loss": 12.7617, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 5.725837230682373, + "learning_rate": 9.968830683516783e-06, + "loss": 12.7412, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 5.346511363983154, + "learning_rate": 9.968406033428456e-06, + "loss": 12.5258, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 4.331769943237305, + "learning_rate": 9.967981383340129e-06, + "loss": 12.1927, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 4.845086097717285, + "learning_rate": 9.967556733251801e-06, + "loss": 12.6231, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 4.607351779937744, + "learning_rate": 9.967132083163474e-06, + "loss": 11.9247, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 4.286142826080322, + "learning_rate": 9.966707433075147e-06, + "loss": 11.6822, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 4.375635147094727, + "learning_rate": 9.96628278298682e-06, + "loss": 12.5628, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 5.469222068786621, + "learning_rate": 9.965858132898493e-06, + "loss": 13.0576, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 5.219028472900391, + "learning_rate": 9.965433482810165e-06, + "loss": 12.7196, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 4.886843204498291, + "learning_rate": 9.965008832721838e-06, + "loss": 12.7005, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 4.5192999839782715, + "learning_rate": 9.964584182633511e-06, + "loss": 12.1678, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 4.591263294219971, + "learning_rate": 9.964159532545184e-06, + "loss": 12.5201, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 5.271311283111572, + "learning_rate": 9.963734882456857e-06, + "loss": 12.549, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 4.696244716644287, + "learning_rate": 9.96331023236853e-06, + "loss": 12.4418, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 4.772137641906738, + "learning_rate": 9.962885582280202e-06, + "loss": 12.4259, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 4.994176387786865, + "learning_rate": 9.962460932191873e-06, + "loss": 12.1508, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 5.625973224639893, + "learning_rate": 9.962036282103548e-06, + "loss": 12.1754, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 4.958076000213623, + "learning_rate": 9.96161163201522e-06, + "loss": 12.2915, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 4.714076519012451, + "learning_rate": 9.961186981926893e-06, + "loss": 12.7744, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 4.0131330490112305, + "learning_rate": 9.960762331838566e-06, + "loss": 12.3622, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 4.50294303894043, + "learning_rate": 9.960337681750239e-06, + "loss": 12.3557, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 6.086542129516602, + "learning_rate": 9.959913031661912e-06, + "loss": 12.5268, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 4.0648345947265625, + "learning_rate": 9.959488381573585e-06, + "loss": 11.963, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 4.732232570648193, + "learning_rate": 9.959063731485257e-06, + "loss": 12.2066, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 4.095287322998047, + "learning_rate": 9.95863908139693e-06, + "loss": 12.0899, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 4.746736526489258, + "learning_rate": 9.958214431308603e-06, + "loss": 12.3664, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 4.950629234313965, + "learning_rate": 9.957789781220276e-06, + "loss": 12.2196, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 4.879374027252197, + "learning_rate": 9.957365131131949e-06, + "loss": 12.4864, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 4.764768123626709, + "learning_rate": 9.956940481043621e-06, + "loss": 12.2219, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 4.716470241546631, + "learning_rate": 9.956515830955293e-06, + "loss": 12.2587, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 4.447343826293945, + "learning_rate": 9.956091180866967e-06, + "loss": 12.0682, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 4.581253528594971, + "learning_rate": 9.95566653077864e-06, + "loss": 11.6338, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 4.533254146575928, + "learning_rate": 9.955241880690311e-06, + "loss": 11.5974, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 4.841196060180664, + "learning_rate": 9.954817230601985e-06, + "loss": 11.5303, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 6.136174201965332, + "learning_rate": 9.954392580513658e-06, + "loss": 11.9672, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 6.753835201263428, + "learning_rate": 9.95396793042533e-06, + "loss": 11.7558, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 6.889477729797363, + "learning_rate": 9.953543280337004e-06, + "loss": 12.325, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 3.9408977031707764, + "learning_rate": 9.953118630248677e-06, + "loss": 12.0247, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 4.361989974975586, + "learning_rate": 9.952693980160348e-06, + "loss": 12.1175, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 4.777275562286377, + "learning_rate": 9.952269330072022e-06, + "loss": 11.9433, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 4.865354537963867, + "learning_rate": 9.951844679983695e-06, + "loss": 12.0638, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 4.985920429229736, + "learning_rate": 9.951420029895366e-06, + "loss": 12.3396, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 4.311769962310791, + "learning_rate": 9.95099537980704e-06, + "loss": 11.3956, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 4.61562967300415, + "learning_rate": 9.950570729718713e-06, + "loss": 12.2139, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 4.141115665435791, + "learning_rate": 9.950146079630385e-06, + "loss": 11.8013, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 4.904603481292725, + "learning_rate": 9.949721429542059e-06, + "loss": 11.7134, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 4.289965629577637, + "learning_rate": 9.94929677945373e-06, + "loss": 11.8991, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 4.066554546356201, + "learning_rate": 9.948872129365403e-06, + "loss": 11.4872, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 4.582818031311035, + "learning_rate": 9.948447479277077e-06, + "loss": 12.0206, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 4.082798004150391, + "learning_rate": 9.948022829188749e-06, + "loss": 11.9315, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 3.584038019180298, + "learning_rate": 9.947598179100421e-06, + "loss": 11.7022, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 3.939055919647217, + "learning_rate": 9.947173529012096e-06, + "loss": 11.7144, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 3.6859183311462402, + "learning_rate": 9.946748878923767e-06, + "loss": 11.2011, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 4.360085964202881, + "learning_rate": 9.94632422883544e-06, + "loss": 11.7106, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 5.084804058074951, + "learning_rate": 9.945899578747114e-06, + "loss": 11.845, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 4.708499431610107, + "learning_rate": 9.945474928658785e-06, + "loss": 12.1678, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 5.710316181182861, + "learning_rate": 9.945050278570458e-06, + "loss": 11.8811, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 4.12416410446167, + "learning_rate": 9.944625628482133e-06, + "loss": 11.8495, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 3.6814351081848145, + "learning_rate": 9.944200978393804e-06, + "loss": 11.3873, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 3.45304274559021, + "learning_rate": 9.943776328305477e-06, + "loss": 11.4563, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 4.203713417053223, + "learning_rate": 9.94335167821715e-06, + "loss": 11.9139, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 4.808511734008789, + "learning_rate": 9.942927028128822e-06, + "loss": 11.6328, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 3.6378672122955322, + "learning_rate": 9.942502378040495e-06, + "loss": 11.2486, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 4.917055130004883, + "learning_rate": 9.942077727952168e-06, + "loss": 11.6679, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 5.144744873046875, + "learning_rate": 9.94165307786384e-06, + "loss": 11.7269, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 3.682931900024414, + "learning_rate": 9.941228427775513e-06, + "loss": 11.6567, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 3.8717918395996094, + "learning_rate": 9.940803777687186e-06, + "loss": 11.9262, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 3.4032795429229736, + "learning_rate": 9.940379127598859e-06, + "loss": 11.1397, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 3.7126970291137695, + "learning_rate": 9.939954477510532e-06, + "loss": 11.6387, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 4.0513176918029785, + "learning_rate": 9.939529827422205e-06, + "loss": 11.1948, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 3.722806692123413, + "learning_rate": 9.939105177333877e-06, + "loss": 11.5323, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 3.6919822692871094, + "learning_rate": 9.93868052724555e-06, + "loss": 11.3934, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 4.5993876457214355, + "learning_rate": 9.938255877157223e-06, + "loss": 11.5668, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 3.8000190258026123, + "learning_rate": 9.937831227068896e-06, + "loss": 10.9727, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 4.173575401306152, + "learning_rate": 9.937406576980569e-06, + "loss": 12.0251, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 5.022716045379639, + "learning_rate": 9.936981926892241e-06, + "loss": 11.2092, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 3.6002378463745117, + "learning_rate": 9.936557276803914e-06, + "loss": 11.4168, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 4.45355749130249, + "learning_rate": 9.936132626715587e-06, + "loss": 11.3669, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 4.008460998535156, + "learning_rate": 9.93570797662726e-06, + "loss": 11.603, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 4.443114280700684, + "learning_rate": 9.935283326538933e-06, + "loss": 10.9654, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 3.507004737854004, + "learning_rate": 9.934858676450605e-06, + "loss": 11.5796, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 4.270377159118652, + "learning_rate": 9.934434026362278e-06, + "loss": 11.3872, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 4.543367385864258, + "learning_rate": 9.934009376273951e-06, + "loss": 11.675, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 3.776069402694702, + "learning_rate": 9.933584726185624e-06, + "loss": 11.4306, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 3.8853657245635986, + "learning_rate": 9.933160076097297e-06, + "loss": 11.2476, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 4.130936145782471, + "learning_rate": 9.93273542600897e-06, + "loss": 10.8699, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 3.6768412590026855, + "learning_rate": 9.932310775920642e-06, + "loss": 11.358, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 4.582154750823975, + "learning_rate": 9.931886125832315e-06, + "loss": 11.5391, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 3.740185022354126, + "learning_rate": 9.931461475743988e-06, + "loss": 11.684, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 3.019994020462036, + "learning_rate": 9.93103682565566e-06, + "loss": 11.4011, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 3.9848835468292236, + "learning_rate": 9.930612175567333e-06, + "loss": 11.2354, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 4.225237846374512, + "learning_rate": 9.930187525479006e-06, + "loss": 11.1701, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 4.221912384033203, + "learning_rate": 9.929762875390679e-06, + "loss": 11.9287, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 4.423780918121338, + "learning_rate": 9.929338225302352e-06, + "loss": 11.1862, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 3.9596996307373047, + "learning_rate": 9.928913575214025e-06, + "loss": 11.6167, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 3.8374569416046143, + "learning_rate": 9.928488925125697e-06, + "loss": 11.3889, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 6.121049404144287, + "learning_rate": 9.92806427503737e-06, + "loss": 11.8664, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 3.0539777278900146, + "learning_rate": 9.927639624949043e-06, + "loss": 10.933, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 4.217175483703613, + "learning_rate": 9.927214974860716e-06, + "loss": 10.8839, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 4.015830039978027, + "learning_rate": 9.926790324772389e-06, + "loss": 11.3181, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 3.736560821533203, + "learning_rate": 9.926365674684061e-06, + "loss": 10.9291, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 4.5897650718688965, + "learning_rate": 9.925941024595734e-06, + "loss": 11.3592, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 3.9770307540893555, + "learning_rate": 9.925516374507407e-06, + "loss": 11.1638, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 4.609964847564697, + "learning_rate": 9.92509172441908e-06, + "loss": 10.897, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 3.4169058799743652, + "learning_rate": 9.924667074330753e-06, + "loss": 11.0693, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 4.332650184631348, + "learning_rate": 9.924242424242425e-06, + "loss": 11.4244, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 3.8448293209075928, + "learning_rate": 9.923817774154098e-06, + "loss": 11.0141, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 3.59759783744812, + "learning_rate": 9.923393124065771e-06, + "loss": 11.0629, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 6.085134029388428, + "learning_rate": 9.922968473977444e-06, + "loss": 11.1196, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 3.6650919914245605, + "learning_rate": 9.922543823889117e-06, + "loss": 11.0609, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 3.7142527103424072, + "learning_rate": 9.92211917380079e-06, + "loss": 11.1312, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 3.790339469909668, + "learning_rate": 9.921694523712462e-06, + "loss": 11.5378, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 4.385564804077148, + "learning_rate": 9.921269873624133e-06, + "loss": 11.4133, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 3.4253742694854736, + "learning_rate": 9.920845223535808e-06, + "loss": 11.0478, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 3.548251152038574, + "learning_rate": 9.92042057344748e-06, + "loss": 10.6844, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 4.508519172668457, + "learning_rate": 9.919995923359152e-06, + "loss": 10.8018, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 3.2482380867004395, + "learning_rate": 9.919571273270826e-06, + "loss": 10.9887, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 3.4730100631713867, + "learning_rate": 9.919146623182499e-06, + "loss": 10.9644, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 4.085874080657959, + "learning_rate": 9.91872197309417e-06, + "loss": 11.0076, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 3.246004104614258, + "learning_rate": 9.918297323005845e-06, + "loss": 10.4921, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 3.983152389526367, + "learning_rate": 9.917872672917517e-06, + "loss": 10.4578, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 3.518188953399658, + "learning_rate": 9.917448022829189e-06, + "loss": 11.055, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 4.120105266571045, + "learning_rate": 9.917023372740863e-06, + "loss": 10.9429, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 4.088332653045654, + "learning_rate": 9.916598722652536e-06, + "loss": 11.1066, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 4.3657426834106445, + "learning_rate": 9.916174072564207e-06, + "loss": 11.2144, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 3.589780807495117, + "learning_rate": 9.915749422475881e-06, + "loss": 10.7426, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 4.472056865692139, + "learning_rate": 9.915324772387554e-06, + "loss": 11.1684, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 3.110936164855957, + "learning_rate": 9.914900122299225e-06, + "loss": 10.7808, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 3.432605743408203, + "learning_rate": 9.9144754722109e-06, + "loss": 10.6192, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 3.199779748916626, + "learning_rate": 9.914050822122571e-06, + "loss": 11.067, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 3.4214870929718018, + "learning_rate": 9.913626172034244e-06, + "loss": 11.1751, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 3.8639068603515625, + "learning_rate": 9.913201521945918e-06, + "loss": 10.9217, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 3.325946569442749, + "learning_rate": 9.91277687185759e-06, + "loss": 10.6299, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 4.516903877258301, + "learning_rate": 9.912352221769262e-06, + "loss": 10.6845, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 5.226884841918945, + "learning_rate": 9.911927571680937e-06, + "loss": 10.2927, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 3.5261731147766113, + "learning_rate": 9.911502921592608e-06, + "loss": 10.8673, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 3.200565814971924, + "learning_rate": 9.91107827150428e-06, + "loss": 10.9006, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 3.215090036392212, + "learning_rate": 9.910653621415955e-06, + "loss": 10.4052, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 4.199523448944092, + "learning_rate": 9.910228971327626e-06, + "loss": 10.7281, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 3.726140022277832, + "learning_rate": 9.909804321239299e-06, + "loss": 10.2398, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 4.110472202301025, + "learning_rate": 9.909379671150973e-06, + "loss": 11.2545, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 3.1856770515441895, + "learning_rate": 9.908955021062645e-06, + "loss": 10.4995, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 4.205803394317627, + "learning_rate": 9.908530370974317e-06, + "loss": 10.5848, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 2.805187940597534, + "learning_rate": 9.90810572088599e-06, + "loss": 10.4519, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 2.9936251640319824, + "learning_rate": 9.907681070797663e-06, + "loss": 10.6216, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 3.089982509613037, + "learning_rate": 9.907256420709336e-06, + "loss": 10.528, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 3.2330338954925537, + "learning_rate": 9.906831770621009e-06, + "loss": 10.3658, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 4.016029357910156, + "learning_rate": 9.906407120532681e-06, + "loss": 11.0903, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 3.4805471897125244, + "learning_rate": 9.905982470444354e-06, + "loss": 10.6717, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 4.715586185455322, + "learning_rate": 9.905557820356027e-06, + "loss": 10.9074, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 3.7419660091400146, + "learning_rate": 9.9051331702677e-06, + "loss": 10.3919, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 4.672693729400635, + "learning_rate": 9.904708520179373e-06, + "loss": 10.6706, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 3.751079559326172, + "learning_rate": 9.904283870091045e-06, + "loss": 10.7657, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 3.617222785949707, + "learning_rate": 9.903859220002718e-06, + "loss": 10.6463, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 3.9197537899017334, + "learning_rate": 9.903434569914393e-06, + "loss": 10.3002, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 3.8868045806884766, + "learning_rate": 9.903009919826064e-06, + "loss": 10.7225, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 3.9348435401916504, + "learning_rate": 9.902585269737737e-06, + "loss": 10.7851, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 4.426255702972412, + "learning_rate": 9.902160619649411e-06, + "loss": 10.7343, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 3.9117493629455566, + "learning_rate": 9.901735969561082e-06, + "loss": 10.4509, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 3.092104196548462, + "learning_rate": 9.901311319472755e-06, + "loss": 10.6097, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 3.9392311573028564, + "learning_rate": 9.900886669384428e-06, + "loss": 10.6384, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 3.5129520893096924, + "learning_rate": 9.9004620192961e-06, + "loss": 10.4116, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 4.161727428436279, + "learning_rate": 9.900037369207773e-06, + "loss": 10.9099, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 4.077610492706299, + "learning_rate": 9.899612719119446e-06, + "loss": 10.5298, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 3.9664700031280518, + "learning_rate": 9.899188069031119e-06, + "loss": 10.2809, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 4.092404842376709, + "learning_rate": 9.898763418942792e-06, + "loss": 10.5557, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 3.0606191158294678, + "learning_rate": 9.898338768854465e-06, + "loss": 10.8798, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 2.756366014480591, + "learning_rate": 9.897914118766137e-06, + "loss": 10.4873, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 3.4908173084259033, + "learning_rate": 9.89748946867781e-06, + "loss": 10.422, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 3.842383623123169, + "learning_rate": 9.897064818589483e-06, + "loss": 10.3019, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 4.042928695678711, + "learning_rate": 9.896640168501156e-06, + "loss": 10.612, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 3.850269317626953, + "learning_rate": 9.896215518412829e-06, + "loss": 10.5407, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 4.130558013916016, + "learning_rate": 9.895790868324501e-06, + "loss": 10.982, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 3.470337390899658, + "learning_rate": 9.895366218236174e-06, + "loss": 10.1258, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 4.799595832824707, + "learning_rate": 9.894941568147847e-06, + "loss": 10.3792, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 3.5654797554016113, + "learning_rate": 9.89451691805952e-06, + "loss": 10.3019, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 4.141574859619141, + "learning_rate": 9.894092267971193e-06, + "loss": 10.5225, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 3.2112061977386475, + "learning_rate": 9.893667617882865e-06, + "loss": 10.6137, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 4.031147480010986, + "learning_rate": 9.893242967794538e-06, + "loss": 10.7001, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 3.0918054580688477, + "learning_rate": 9.892818317706211e-06, + "loss": 10.4656, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 2.819690227508545, + "learning_rate": 9.892393667617884e-06, + "loss": 10.4929, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 3.5951499938964844, + "learning_rate": 9.891969017529557e-06, + "loss": 10.3594, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 3.327256917953491, + "learning_rate": 9.89154436744123e-06, + "loss": 10.7585, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 3.6384871006011963, + "learning_rate": 9.891119717352902e-06, + "loss": 10.8084, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 3.1058852672576904, + "learning_rate": 9.890695067264575e-06, + "loss": 10.4928, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 3.194603681564331, + "learning_rate": 9.890270417176248e-06, + "loss": 10.3146, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 3.56955623626709, + "learning_rate": 9.88984576708792e-06, + "loss": 10.5167, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 2.9359374046325684, + "learning_rate": 9.889421116999593e-06, + "loss": 10.1109, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 3.623802900314331, + "learning_rate": 9.888996466911266e-06, + "loss": 10.3382, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 3.6544861793518066, + "learning_rate": 9.888571816822939e-06, + "loss": 10.3957, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 3.3310234546661377, + "learning_rate": 9.888147166734612e-06, + "loss": 10.0309, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 2.7472524642944336, + "learning_rate": 9.887722516646285e-06, + "loss": 9.8601, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 3.0760622024536133, + "learning_rate": 9.887297866557957e-06, + "loss": 10.2606, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 3.3211588859558105, + "learning_rate": 9.88687321646963e-06, + "loss": 10.2672, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 3.430612802505493, + "learning_rate": 9.886448566381303e-06, + "loss": 10.0152, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 3.839916706085205, + "learning_rate": 9.886023916292976e-06, + "loss": 10.1574, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 3.0542473793029785, + "learning_rate": 9.885599266204649e-06, + "loss": 10.1716, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 3.00162935256958, + "learning_rate": 9.885174616116321e-06, + "loss": 10.2432, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 3.2442831993103027, + "learning_rate": 9.884749966027993e-06, + "loss": 10.115, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 3.123339891433716, + "learning_rate": 9.884325315939667e-06, + "loss": 10.1362, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 2.681227445602417, + "learning_rate": 9.88390066585134e-06, + "loss": 10.2548, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 3.1401607990264893, + "learning_rate": 9.883476015763011e-06, + "loss": 10.4723, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 3.525358200073242, + "learning_rate": 9.883051365674685e-06, + "loss": 10.2865, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 3.117112159729004, + "learning_rate": 9.882626715586358e-06, + "loss": 10.2297, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 2.1166462898254395, + "learning_rate": 9.88220206549803e-06, + "loss": 9.8285, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 3.556422710418701, + "learning_rate": 9.881777415409704e-06, + "loss": 10.4254, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 4.661703586578369, + "learning_rate": 9.881352765321377e-06, + "loss": 10.5403, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 3.033308982849121, + "learning_rate": 9.880928115233048e-06, + "loss": 9.8934, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 4.054826259613037, + "learning_rate": 9.880503465144722e-06, + "loss": 9.8472, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 3.0008296966552734, + "learning_rate": 9.880078815056395e-06, + "loss": 9.9206, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 3.4594268798828125, + "learning_rate": 9.879654164968066e-06, + "loss": 10.3127, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 2.8132004737854004, + "learning_rate": 9.87922951487974e-06, + "loss": 10.1061, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 2.8023738861083984, + "learning_rate": 9.878804864791412e-06, + "loss": 10.0265, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 3.064344882965088, + "learning_rate": 9.878380214703085e-06, + "loss": 9.7137, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 2.3047196865081787, + "learning_rate": 9.877955564614759e-06, + "loss": 10.0767, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 4.502571105957031, + "learning_rate": 9.87753091452643e-06, + "loss": 9.9463, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 3.3912034034729004, + "learning_rate": 9.877106264438103e-06, + "loss": 9.892, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 2.868283271789551, + "learning_rate": 9.876681614349777e-06, + "loss": 9.8563, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 3.0830602645874023, + "learning_rate": 9.876256964261449e-06, + "loss": 10.3391, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 2.8210480213165283, + "learning_rate": 9.875832314173121e-06, + "loss": 9.9405, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 2.897245168685913, + "learning_rate": 9.875407664084796e-06, + "loss": 10.3533, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 3.2384109497070312, + "learning_rate": 9.874983013996467e-06, + "loss": 9.8348, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 3.4479856491088867, + "learning_rate": 9.874558363908141e-06, + "loss": 10.2203, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 2.549337863922119, + "learning_rate": 9.874133713819814e-06, + "loss": 9.9428, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 2.610278844833374, + "learning_rate": 9.873709063731485e-06, + "loss": 10.0157, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 2.9252281188964844, + "learning_rate": 9.87328441364316e-06, + "loss": 9.8026, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 2.805912733078003, + "learning_rate": 9.872859763554831e-06, + "loss": 9.8972, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 4.0254225730896, + "learning_rate": 9.872435113466504e-06, + "loss": 10.2735, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 3.0049328804016113, + "learning_rate": 9.872010463378178e-06, + "loss": 10.014, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 3.0524725914001465, + "learning_rate": 9.87158581328985e-06, + "loss": 9.9087, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 2.5243537425994873, + "learning_rate": 9.871161163201522e-06, + "loss": 9.8251, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 3.6610143184661865, + "learning_rate": 9.870736513113197e-06, + "loss": 10.3064, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 3.2295422554016113, + "learning_rate": 9.870311863024868e-06, + "loss": 10.0749, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 2.392803430557251, + "learning_rate": 9.86988721293654e-06, + "loss": 10.2803, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 2.853811740875244, + "learning_rate": 9.869462562848215e-06, + "loss": 10.0463, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 3.1025733947753906, + "learning_rate": 9.869037912759886e-06, + "loss": 10.0924, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 2.467848777770996, + "learning_rate": 9.868613262671559e-06, + "loss": 9.97, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 3.2755346298217773, + "learning_rate": 9.868188612583233e-06, + "loss": 9.4879, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 3.170074462890625, + "learning_rate": 9.867763962494905e-06, + "loss": 9.9486, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 4.715015888214111, + "learning_rate": 9.867339312406577e-06, + "loss": 10.2258, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 3.239492654800415, + "learning_rate": 9.866914662318252e-06, + "loss": 10.0961, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 2.6898839473724365, + "learning_rate": 9.866490012229923e-06, + "loss": 9.7066, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 2.98099946975708, + "learning_rate": 9.866065362141596e-06, + "loss": 9.6631, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 3.301753282546997, + "learning_rate": 9.865640712053269e-06, + "loss": 9.7401, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 2.1544759273529053, + "learning_rate": 9.865216061964941e-06, + "loss": 9.625, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 3.433104991912842, + "learning_rate": 9.864791411876614e-06, + "loss": 10.07, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 2.5662500858306885, + "learning_rate": 9.864366761788287e-06, + "loss": 10.0033, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 2.952796459197998, + "learning_rate": 9.86394211169996e-06, + "loss": 10.3958, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 3.1527857780456543, + "learning_rate": 9.863517461611633e-06, + "loss": 10.1572, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 3.0845062732696533, + "learning_rate": 9.863092811523305e-06, + "loss": 9.7755, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 2.4434990882873535, + "learning_rate": 9.862668161434978e-06, + "loss": 9.7703, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 3.0153822898864746, + "learning_rate": 9.862243511346651e-06, + "loss": 9.9148, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 3.103381872177124, + "learning_rate": 9.861818861258324e-06, + "loss": 9.6807, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 4.0696306228637695, + "learning_rate": 9.861394211169997e-06, + "loss": 9.7487, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 2.6394429206848145, + "learning_rate": 9.86096956108167e-06, + "loss": 10.1014, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 3.6575095653533936, + "learning_rate": 9.860544910993342e-06, + "loss": 9.8626, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 2.9135568141937256, + "learning_rate": 9.860120260905015e-06, + "loss": 9.659, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 2.9166674613952637, + "learning_rate": 9.859695610816688e-06, + "loss": 9.7207, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 2.4438717365264893, + "learning_rate": 9.85927096072836e-06, + "loss": 9.6772, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 2.605783462524414, + "learning_rate": 9.858846310640033e-06, + "loss": 9.6626, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 2.9607794284820557, + "learning_rate": 9.858421660551706e-06, + "loss": 10.0553, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 3.2105133533477783, + "learning_rate": 9.857997010463379e-06, + "loss": 9.7769, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 3.2479217052459717, + "learning_rate": 9.857572360375052e-06, + "loss": 9.7415, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 3.527750253677368, + "learning_rate": 9.857147710286725e-06, + "loss": 9.8287, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 2.737656354904175, + "learning_rate": 9.856723060198397e-06, + "loss": 9.5942, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 2.5910556316375732, + "learning_rate": 9.85629841011007e-06, + "loss": 9.6279, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 2.292987585067749, + "learning_rate": 9.855873760021743e-06, + "loss": 9.6648, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 3.2566699981689453, + "learning_rate": 9.855449109933416e-06, + "loss": 9.5426, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 2.763524055480957, + "learning_rate": 9.855024459845089e-06, + "loss": 9.8343, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 3.953828811645508, + "learning_rate": 9.854599809756761e-06, + "loss": 9.7369, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 3.2535922527313232, + "learning_rate": 9.854175159668434e-06, + "loss": 9.8909, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 2.524836778640747, + "learning_rate": 9.853750509580107e-06, + "loss": 9.5688, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 3.0376033782958984, + "learning_rate": 9.85332585949178e-06, + "loss": 9.2484, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 3.700544595718384, + "learning_rate": 9.852901209403453e-06, + "loss": 9.854, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 2.4292690753936768, + "learning_rate": 9.852476559315125e-06, + "loss": 9.5691, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 2.668112277984619, + "learning_rate": 9.852051909226798e-06, + "loss": 9.7767, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 3.032670736312866, + "learning_rate": 9.851627259138471e-06, + "loss": 9.7686, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 2.473573923110962, + "learning_rate": 9.851202609050144e-06, + "loss": 9.528, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 2.5362930297851562, + "learning_rate": 9.850777958961817e-06, + "loss": 9.9757, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 2.3830721378326416, + "learning_rate": 9.85035330887349e-06, + "loss": 9.5698, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 2.710089683532715, + "learning_rate": 9.849928658785162e-06, + "loss": 9.6926, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 3.441169500350952, + "learning_rate": 9.849504008696833e-06, + "loss": 10.0427, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 3.157487154006958, + "learning_rate": 9.849079358608508e-06, + "loss": 10.0281, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 2.3788135051727295, + "learning_rate": 9.84865470852018e-06, + "loss": 9.4917, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 3.9062323570251465, + "learning_rate": 9.848230058431852e-06, + "loss": 9.5612, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 2.3985517024993896, + "learning_rate": 9.847805408343526e-06, + "loss": 9.1766, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 2.5487124919891357, + "learning_rate": 9.847380758255199e-06, + "loss": 9.5359, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 2.5190515518188477, + "learning_rate": 9.84695610816687e-06, + "loss": 9.5892, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 2.560927152633667, + "learning_rate": 9.846531458078545e-06, + "loss": 9.4689, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 2.6873908042907715, + "learning_rate": 9.846106807990217e-06, + "loss": 9.4163, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 2.6609954833984375, + "learning_rate": 9.84568215790189e-06, + "loss": 9.1393, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 2.392719030380249, + "learning_rate": 9.845257507813563e-06, + "loss": 9.6318, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 2.829751491546631, + "learning_rate": 9.844832857725236e-06, + "loss": 9.4349, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 2.1500048637390137, + "learning_rate": 9.844408207636909e-06, + "loss": 9.4554, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 2.6588003635406494, + "learning_rate": 9.843983557548581e-06, + "loss": 9.3882, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 2.4659011363983154, + "learning_rate": 9.843558907460252e-06, + "loss": 9.3157, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 2.3587076663970947, + "learning_rate": 9.843134257371927e-06, + "loss": 9.6598, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 2.8033955097198486, + "learning_rate": 9.8427096072836e-06, + "loss": 9.5965, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 2.8880929946899414, + "learning_rate": 9.842284957195271e-06, + "loss": 9.5426, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 3.3066604137420654, + "learning_rate": 9.841860307106945e-06, + "loss": 9.8663, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 2.7250053882598877, + "learning_rate": 9.841435657018618e-06, + "loss": 9.31, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 2.1773014068603516, + "learning_rate": 9.84101100693029e-06, + "loss": 9.2144, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 2.1924028396606445, + "learning_rate": 9.840586356841964e-06, + "loss": 9.0335, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 2.9055986404418945, + "learning_rate": 9.840161706753637e-06, + "loss": 9.6001, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 2.3276126384735107, + "learning_rate": 9.839737056665308e-06, + "loss": 8.9822, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 3.100273609161377, + "learning_rate": 9.839312406576982e-06, + "loss": 9.5672, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 2.6492257118225098, + "learning_rate": 9.838887756488655e-06, + "loss": 9.4226, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 2.213219165802002, + "learning_rate": 9.838463106400326e-06, + "loss": 8.969, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 2.614039182662964, + "learning_rate": 9.838038456312e-06, + "loss": 9.3873, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 2.4979841709136963, + "learning_rate": 9.837613806223673e-06, + "loss": 9.3107, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 2.910792350769043, + "learning_rate": 9.837189156135345e-06, + "loss": 9.0443, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 2.482065200805664, + "learning_rate": 9.836764506047019e-06, + "loss": 9.621, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 2.544682741165161, + "learning_rate": 9.83633985595869e-06, + "loss": 9.453, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 2.700113296508789, + "learning_rate": 9.835915205870363e-06, + "loss": 9.3077, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 2.506549835205078, + "learning_rate": 9.835490555782037e-06, + "loss": 9.3021, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 2.2118008136749268, + "learning_rate": 9.835065905693709e-06, + "loss": 9.3218, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 2.8125016689300537, + "learning_rate": 9.834641255605381e-06, + "loss": 9.406, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 2.94927978515625, + "learning_rate": 9.834216605517056e-06, + "loss": 9.0082, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 2.7701761722564697, + "learning_rate": 9.833791955428727e-06, + "loss": 9.3248, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 2.226536512374878, + "learning_rate": 9.8333673053404e-06, + "loss": 9.0585, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 2.844184160232544, + "learning_rate": 9.832942655252074e-06, + "loss": 9.4656, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 2.6900718212127686, + "learning_rate": 9.832518005163745e-06, + "loss": 9.5309, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 2.772333860397339, + "learning_rate": 9.832093355075418e-06, + "loss": 9.0695, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 2.9446702003479004, + "learning_rate": 9.831668704987093e-06, + "loss": 9.5074, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 2.7873246669769287, + "learning_rate": 9.831244054898764e-06, + "loss": 9.4987, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 2.2451040744781494, + "learning_rate": 9.830819404810437e-06, + "loss": 9.2709, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 2.741299867630005, + "learning_rate": 9.83039475472211e-06, + "loss": 9.1972, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 2.522662878036499, + "learning_rate": 9.829970104633782e-06, + "loss": 9.0727, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 2.517920732498169, + "learning_rate": 9.829545454545455e-06, + "loss": 9.4522, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 2.3984923362731934, + "learning_rate": 9.829120804457128e-06, + "loss": 9.3177, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 3.9839718341827393, + "learning_rate": 9.8286961543688e-06, + "loss": 9.3716, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 2.1438040733337402, + "learning_rate": 9.828271504280473e-06, + "loss": 9.1479, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 2.833765745162964, + "learning_rate": 9.827846854192146e-06, + "loss": 9.5455, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 2.5863118171691895, + "learning_rate": 9.827422204103819e-06, + "loss": 9.3158, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 2.304546356201172, + "learning_rate": 9.826997554015492e-06, + "loss": 9.5387, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 2.505924701690674, + "learning_rate": 9.826572903927165e-06, + "loss": 9.2481, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 3.0584144592285156, + "learning_rate": 9.826148253838837e-06, + "loss": 9.2724, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 2.8603334426879883, + "learning_rate": 9.82572360375051e-06, + "loss": 9.3572, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 2.5567243099212646, + "learning_rate": 9.825298953662183e-06, + "loss": 9.2462, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 2.4096240997314453, + "learning_rate": 9.824874303573856e-06, + "loss": 9.4315, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 2.407817840576172, + "learning_rate": 9.824449653485529e-06, + "loss": 9.1779, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 2.267364501953125, + "learning_rate": 9.824025003397201e-06, + "loss": 9.2401, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 2.4690568447113037, + "learning_rate": 9.823600353308874e-06, + "loss": 9.2789, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 3.1811797618865967, + "learning_rate": 9.823175703220547e-06, + "loss": 9.3068, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 3.1249217987060547, + "learning_rate": 9.82275105313222e-06, + "loss": 9.2545, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 3.2506136894226074, + "learning_rate": 9.822326403043893e-06, + "loss": 9.3049, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 3.1442368030548096, + "learning_rate": 9.821901752955565e-06, + "loss": 9.1819, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 2.6402382850646973, + "learning_rate": 9.821477102867238e-06, + "loss": 9.4587, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 1.6315408945083618, + "learning_rate": 9.821052452778911e-06, + "loss": 8.9839, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 2.2782793045043945, + "learning_rate": 9.820627802690584e-06, + "loss": 8.9202, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 2.5533783435821533, + "learning_rate": 9.820203152602257e-06, + "loss": 9.1182, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 2.4279963970184326, + "learning_rate": 9.81977850251393e-06, + "loss": 9.1079, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 2.9317145347595215, + "learning_rate": 9.819353852425602e-06, + "loss": 9.0948, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 2.632890462875366, + "learning_rate": 9.818929202337275e-06, + "loss": 9.121, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 3.0679681301116943, + "learning_rate": 9.818504552248948e-06, + "loss": 9.0813, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 1.946466326713562, + "learning_rate": 9.81807990216062e-06, + "loss": 9.1518, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 2.3588016033172607, + "learning_rate": 9.817655252072293e-06, + "loss": 9.3004, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 2.2681314945220947, + "learning_rate": 9.817230601983966e-06, + "loss": 9.2708, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 2.2911934852600098, + "learning_rate": 9.816805951895639e-06, + "loss": 9.1347, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 2.5878427028656006, + "learning_rate": 9.816381301807312e-06, + "loss": 9.2091, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 2.8931870460510254, + "learning_rate": 9.815956651718985e-06, + "loss": 9.1548, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 2.9680614471435547, + "learning_rate": 9.815532001630657e-06, + "loss": 9.4279, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 2.432217836380005, + "learning_rate": 9.81510735154233e-06, + "loss": 9.2072, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 2.3003506660461426, + "learning_rate": 9.814682701454003e-06, + "loss": 9.157, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 2.299881935119629, + "learning_rate": 9.814258051365676e-06, + "loss": 9.068, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 2.988687753677368, + "learning_rate": 9.813833401277349e-06, + "loss": 9.1189, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 3.3672237396240234, + "learning_rate": 9.813408751189021e-06, + "loss": 8.908, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 3.2114617824554443, + "learning_rate": 9.812984101100694e-06, + "loss": 8.9507, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 2.120476007461548, + "learning_rate": 9.812559451012367e-06, + "loss": 9.1993, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 2.3364646434783936, + "learning_rate": 9.81213480092404e-06, + "loss": 8.9082, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 2.1824216842651367, + "learning_rate": 9.811710150835713e-06, + "loss": 9.2246, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 2.888725757598877, + "learning_rate": 9.811285500747385e-06, + "loss": 9.2773, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 2.12292742729187, + "learning_rate": 9.810860850659058e-06, + "loss": 9.0401, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 2.3783812522888184, + "learning_rate": 9.810436200570731e-06, + "loss": 9.1571, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 2.3046329021453857, + "learning_rate": 9.810011550482404e-06, + "loss": 9.0585, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 2.459852457046509, + "learning_rate": 9.809586900394077e-06, + "loss": 8.954, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 2.6354761123657227, + "learning_rate": 9.80916225030575e-06, + "loss": 8.9271, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 2.4873898029327393, + "learning_rate": 9.808737600217422e-06, + "loss": 8.9993, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 2.1513266563415527, + "learning_rate": 9.808312950129095e-06, + "loss": 8.9839, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 2.457712411880493, + "learning_rate": 9.807888300040768e-06, + "loss": 8.9666, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 3.166149854660034, + "learning_rate": 9.80746364995244e-06, + "loss": 9.1945, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 2.5910208225250244, + "learning_rate": 9.807038999864112e-06, + "loss": 9.0844, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 2.2371439933776855, + "learning_rate": 9.806614349775786e-06, + "loss": 9.0025, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 2.6949455738067627, + "learning_rate": 9.806189699687459e-06, + "loss": 8.9506, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 2.169642686843872, + "learning_rate": 9.80576504959913e-06, + "loss": 8.7243, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 2.499776840209961, + "learning_rate": 9.805340399510805e-06, + "loss": 8.9899, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 3.1645681858062744, + "learning_rate": 9.804915749422477e-06, + "loss": 8.7358, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 2.5277211666107178, + "learning_rate": 9.804491099334148e-06, + "loss": 9.0008, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 2.2956814765930176, + "learning_rate": 9.804066449245823e-06, + "loss": 9.1133, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 2.2163357734680176, + "learning_rate": 9.803641799157496e-06, + "loss": 8.9325, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 2.3176417350769043, + "learning_rate": 9.803217149069167e-06, + "loss": 8.9466, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 2.6389002799987793, + "learning_rate": 9.802792498980841e-06, + "loss": 8.7977, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 2.240600824356079, + "learning_rate": 9.802367848892514e-06, + "loss": 8.8306, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 1.8003758192062378, + "learning_rate": 9.801943198804185e-06, + "loss": 8.9201, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 1.9709049463272095, + "learning_rate": 9.80151854871586e-06, + "loss": 9.195, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 2.3576693534851074, + "learning_rate": 9.801093898627531e-06, + "loss": 9.155, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 2.532987594604492, + "learning_rate": 9.800669248539204e-06, + "loss": 9.0454, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 3.8833861351013184, + "learning_rate": 9.800244598450878e-06, + "loss": 8.9467, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 2.342780828475952, + "learning_rate": 9.79981994836255e-06, + "loss": 8.8719, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 2.2839772701263428, + "learning_rate": 9.799395298274222e-06, + "loss": 8.6715, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 1.822191834449768, + "learning_rate": 9.798970648185897e-06, + "loss": 8.7338, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 1.8220857381820679, + "learning_rate": 9.798545998097568e-06, + "loss": 8.8928, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 2.244391679763794, + "learning_rate": 9.79812134800924e-06, + "loss": 8.936, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 1.9182313680648804, + "learning_rate": 9.797696697920915e-06, + "loss": 8.8861, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 2.097604274749756, + "learning_rate": 9.797272047832586e-06, + "loss": 8.8493, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 2.9872677326202393, + "learning_rate": 9.796847397744259e-06, + "loss": 8.9092, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 2.20143985748291, + "learning_rate": 9.796422747655933e-06, + "loss": 9.1501, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 2.9597771167755127, + "learning_rate": 9.795998097567604e-06, + "loss": 8.8, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 2.465460777282715, + "learning_rate": 9.795573447479277e-06, + "loss": 9.2979, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 1.9476176500320435, + "learning_rate": 9.79514879739095e-06, + "loss": 8.8477, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 2.164438486099243, + "learning_rate": 9.794724147302623e-06, + "loss": 8.7838, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 2.4051549434661865, + "learning_rate": 9.794299497214296e-06, + "loss": 8.7227, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 2.6053380966186523, + "learning_rate": 9.793874847125968e-06, + "loss": 9.0016, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 1.9169187545776367, + "learning_rate": 9.793450197037641e-06, + "loss": 8.8537, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 2.7082459926605225, + "learning_rate": 9.793025546949314e-06, + "loss": 8.9009, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 2.0989041328430176, + "learning_rate": 9.792600896860987e-06, + "loss": 8.9919, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 2.0418624877929688, + "learning_rate": 9.79217624677266e-06, + "loss": 9.1033, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 2.4617371559143066, + "learning_rate": 9.791751596684332e-06, + "loss": 8.7392, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 3.2250306606292725, + "learning_rate": 9.791326946596005e-06, + "loss": 9.0514, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 2.3818888664245605, + "learning_rate": 9.790902296507678e-06, + "loss": 8.7365, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 2.1752381324768066, + "learning_rate": 9.790477646419351e-06, + "loss": 8.9734, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 2.331453323364258, + "learning_rate": 9.790052996331024e-06, + "loss": 8.92, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 2.817750930786133, + "learning_rate": 9.789628346242696e-06, + "loss": 9.0676, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 1.8822293281555176, + "learning_rate": 9.78920369615437e-06, + "loss": 8.928, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 2.292208433151245, + "learning_rate": 9.788779046066042e-06, + "loss": 8.9273, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 1.9444451332092285, + "learning_rate": 9.788354395977715e-06, + "loss": 8.796, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 2.1063649654388428, + "learning_rate": 9.787929745889388e-06, + "loss": 9.1288, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 2.67866849899292, + "learning_rate": 9.78750509580106e-06, + "loss": 9.192, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 1.9954450130462646, + "learning_rate": 9.787080445712733e-06, + "loss": 8.7272, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 2.6508545875549316, + "learning_rate": 9.786655795624406e-06, + "loss": 8.9942, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 2.2483158111572266, + "learning_rate": 9.786231145536079e-06, + "loss": 9.1329, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 2.563673257827759, + "learning_rate": 9.785806495447752e-06, + "loss": 9.0129, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 1.8709678649902344, + "learning_rate": 9.785381845359425e-06, + "loss": 8.686, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 2.22244930267334, + "learning_rate": 9.784957195271097e-06, + "loss": 8.8399, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 2.5534183979034424, + "learning_rate": 9.78453254518277e-06, + "loss": 8.4202, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 2.5758323669433594, + "learning_rate": 9.784107895094443e-06, + "loss": 8.8874, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 1.9469996690750122, + "learning_rate": 9.783683245006116e-06, + "loss": 8.9468, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 2.0789144039154053, + "learning_rate": 9.783258594917789e-06, + "loss": 8.7658, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 1.9352399110794067, + "learning_rate": 9.782833944829461e-06, + "loss": 8.5787, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 2.0238914489746094, + "learning_rate": 9.782409294741134e-06, + "loss": 8.8867, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 2.442403793334961, + "learning_rate": 9.781984644652807e-06, + "loss": 8.8113, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 1.8237483501434326, + "learning_rate": 9.78155999456448e-06, + "loss": 8.691, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 4.165390491485596, + "learning_rate": 9.781135344476153e-06, + "loss": 8.9901, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 1.88339102268219, + "learning_rate": 9.780710694387825e-06, + "loss": 8.5485, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 2.5714333057403564, + "learning_rate": 9.780286044299498e-06, + "loss": 8.8604, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 2.0200302600860596, + "learning_rate": 9.779861394211171e-06, + "loss": 8.7145, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 2.1377828121185303, + "learning_rate": 9.779436744122844e-06, + "loss": 8.3963, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 1.7296333312988281, + "learning_rate": 9.779012094034517e-06, + "loss": 8.7174, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 2.598283529281616, + "learning_rate": 9.77858744394619e-06, + "loss": 8.8558, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 2.4188003540039062, + "learning_rate": 9.778162793857862e-06, + "loss": 8.6182, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 2.381463050842285, + "learning_rate": 9.777738143769535e-06, + "loss": 8.6803, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 2.2587475776672363, + "learning_rate": 9.777313493681208e-06, + "loss": 8.4848, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 2.7249844074249268, + "learning_rate": 9.77688884359288e-06, + "loss": 8.8477, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 3.5100860595703125, + "learning_rate": 9.776464193504553e-06, + "loss": 8.353, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 2.983100414276123, + "learning_rate": 9.776039543416226e-06, + "loss": 8.5827, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 1.7096928358078003, + "learning_rate": 9.775614893327899e-06, + "loss": 8.5299, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 1.9482812881469727, + "learning_rate": 9.775190243239572e-06, + "loss": 8.6369, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 1.9604891538619995, + "learning_rate": 9.774765593151245e-06, + "loss": 8.7965, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 2.1914756298065186, + "learning_rate": 9.774340943062917e-06, + "loss": 8.4844, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 1.9606603384017944, + "learning_rate": 9.77391629297459e-06, + "loss": 8.6857, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 4.228400230407715, + "learning_rate": 9.773491642886263e-06, + "loss": 8.5692, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 2.448622465133667, + "learning_rate": 9.773066992797936e-06, + "loss": 8.4076, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 2.2305023670196533, + "learning_rate": 9.772642342709609e-06, + "loss": 8.406, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 2.1376354694366455, + "learning_rate": 9.772217692621281e-06, + "loss": 8.3827, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 2.092820882797241, + "learning_rate": 9.771793042532952e-06, + "loss": 8.6429, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 1.9364889860153198, + "learning_rate": 9.771368392444627e-06, + "loss": 8.5022, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 2.055783748626709, + "learning_rate": 9.7709437423563e-06, + "loss": 8.8123, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 2.0905041694641113, + "learning_rate": 9.77051909226797e-06, + "loss": 8.6171, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 2.231294870376587, + "learning_rate": 9.770094442179645e-06, + "loss": 8.5143, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 2.453437328338623, + "learning_rate": 9.769669792091318e-06, + "loss": 8.68, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 2.1206796169281006, + "learning_rate": 9.76924514200299e-06, + "loss": 8.563, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 2.0405209064483643, + "learning_rate": 9.768820491914664e-06, + "loss": 8.4939, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 2.0680458545684814, + "learning_rate": 9.768395841826337e-06, + "loss": 8.6925, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 2.1710407733917236, + "learning_rate": 9.767971191738008e-06, + "loss": 8.8439, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 1.989499568939209, + "learning_rate": 9.767546541649682e-06, + "loss": 8.1296, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 2.4298298358917236, + "learning_rate": 9.767121891561355e-06, + "loss": 8.5826, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 2.1440534591674805, + "learning_rate": 9.766697241473026e-06, + "loss": 8.5361, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 2.400078535079956, + "learning_rate": 9.7662725913847e-06, + "loss": 8.7798, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 2.1058239936828613, + "learning_rate": 9.765847941296372e-06, + "loss": 8.5393, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 2.0503876209259033, + "learning_rate": 9.765423291208044e-06, + "loss": 8.6881, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 2.574594736099243, + "learning_rate": 9.764998641119719e-06, + "loss": 8.7254, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 2.5699124336242676, + "learning_rate": 9.76457399103139e-06, + "loss": 8.6507, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 2.368260622024536, + "learning_rate": 9.764149340943063e-06, + "loss": 8.5769, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 1.6611465215682983, + "learning_rate": 9.763724690854737e-06, + "loss": 8.4152, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 1.765701413154602, + "learning_rate": 9.763300040766408e-06, + "loss": 8.6126, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 2.0505242347717285, + "learning_rate": 9.762875390678081e-06, + "loss": 8.7065, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 2.3915822505950928, + "learning_rate": 9.762450740589756e-06, + "loss": 8.7454, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 1.9571397304534912, + "learning_rate": 9.762026090501427e-06, + "loss": 8.4276, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 2.380415201187134, + "learning_rate": 9.7616014404131e-06, + "loss": 7.8542, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 2.459878444671631, + "learning_rate": 9.761176790324774e-06, + "loss": 8.6246, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 2.2050139904022217, + "learning_rate": 9.760752140236445e-06, + "loss": 8.1229, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 1.496591567993164, + "learning_rate": 9.760327490148118e-06, + "loss": 8.3951, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 1.7068730592727661, + "learning_rate": 9.759902840059793e-06, + "loss": 8.4489, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 1.9305721521377563, + "learning_rate": 9.759478189971464e-06, + "loss": 8.4595, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 2.5356597900390625, + "learning_rate": 9.759053539883138e-06, + "loss": 8.8855, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 2.8519797325134277, + "learning_rate": 9.75862888979481e-06, + "loss": 8.7338, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 2.4148335456848145, + "learning_rate": 9.758204239706482e-06, + "loss": 8.4966, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 2.757280111312866, + "learning_rate": 9.757779589618157e-06, + "loss": 8.679, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 3.254831314086914, + "learning_rate": 9.757354939529828e-06, + "loss": 8.3008, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 1.8815112113952637, + "learning_rate": 9.7569302894415e-06, + "loss": 8.3559, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 1.6743916273117065, + "learning_rate": 9.756505639353175e-06, + "loss": 8.3662, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 2.1011719703674316, + "learning_rate": 9.756080989264846e-06, + "loss": 8.2978, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 1.8835898637771606, + "learning_rate": 9.755656339176519e-06, + "loss": 8.6552, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 1.8329274654388428, + "learning_rate": 9.755231689088193e-06, + "loss": 8.3236, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 3.781890869140625, + "learning_rate": 9.754807038999864e-06, + "loss": 8.2925, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 2.098569869995117, + "learning_rate": 9.754382388911537e-06, + "loss": 8.5834, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 2.3272578716278076, + "learning_rate": 9.753957738823212e-06, + "loss": 8.3192, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 2.004063129425049, + "learning_rate": 9.753533088734883e-06, + "loss": 8.228, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 1.848136067390442, + "learning_rate": 9.753108438646556e-06, + "loss": 8.4478, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 2.713777542114258, + "learning_rate": 9.752683788558228e-06, + "loss": 8.3313, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 1.800822377204895, + "learning_rate": 9.752259138469901e-06, + "loss": 8.1135, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 1.8280500173568726, + "learning_rate": 9.751834488381574e-06, + "loss": 8.3598, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 2.183372735977173, + "learning_rate": 9.751409838293247e-06, + "loss": 8.3833, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 1.7836872339248657, + "learning_rate": 9.75098518820492e-06, + "loss": 8.5396, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 2.6022579669952393, + "learning_rate": 9.750560538116592e-06, + "loss": 8.4047, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 1.8535873889923096, + "learning_rate": 9.750135888028265e-06, + "loss": 8.4069, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 1.3834894895553589, + "learning_rate": 9.749711237939938e-06, + "loss": 8.3799, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 2.0396013259887695, + "learning_rate": 9.749286587851611e-06, + "loss": 8.5745, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 1.875942587852478, + "learning_rate": 9.748861937763284e-06, + "loss": 8.4902, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 2.113250255584717, + "learning_rate": 9.748437287674956e-06, + "loss": 8.3464, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 1.6741125583648682, + "learning_rate": 9.74801263758663e-06, + "loss": 8.3546, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 2.1586899757385254, + "learning_rate": 9.747587987498302e-06, + "loss": 8.4027, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 2.150252342224121, + "learning_rate": 9.747163337409975e-06, + "loss": 8.3115, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 1.6684975624084473, + "learning_rate": 9.746738687321648e-06, + "loss": 8.2233, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 1.7795578241348267, + "learning_rate": 9.74631403723332e-06, + "loss": 8.1419, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 1.6564139127731323, + "learning_rate": 9.745889387144993e-06, + "loss": 8.2636, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 1.933114767074585, + "learning_rate": 9.745464737056666e-06, + "loss": 8.3888, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 2.228128671646118, + "learning_rate": 9.745040086968339e-06, + "loss": 8.4692, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 1.7550139427185059, + "learning_rate": 9.744615436880012e-06, + "loss": 8.3669, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 1.8640563488006592, + "learning_rate": 9.744190786791684e-06, + "loss": 8.5689, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 2.010150671005249, + "learning_rate": 9.743766136703357e-06, + "loss": 8.4896, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 1.8784284591674805, + "learning_rate": 9.74334148661503e-06, + "loss": 8.3715, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 1.9530155658721924, + "learning_rate": 9.742916836526703e-06, + "loss": 8.485, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 1.4664827585220337, + "learning_rate": 9.742492186438376e-06, + "loss": 8.2459, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 2.0936169624328613, + "learning_rate": 9.742067536350048e-06, + "loss": 8.394, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 2.102576494216919, + "learning_rate": 9.741642886261721e-06, + "loss": 8.3299, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 2.188882827758789, + "learning_rate": 9.741218236173394e-06, + "loss": 8.5945, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 2.023571491241455, + "learning_rate": 9.740793586085067e-06, + "loss": 8.189, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 1.9881174564361572, + "learning_rate": 9.74036893599674e-06, + "loss": 8.6538, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 4.365421295166016, + "learning_rate": 9.739944285908412e-06, + "loss": 8.477, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 2.3085222244262695, + "learning_rate": 9.739519635820085e-06, + "loss": 8.6569, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 2.124807357788086, + "learning_rate": 9.739094985731758e-06, + "loss": 8.2513, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 1.8732200860977173, + "learning_rate": 9.738670335643431e-06, + "loss": 8.2895, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 2.2237300872802734, + "learning_rate": 9.738245685555104e-06, + "loss": 8.2654, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 2.1989927291870117, + "learning_rate": 9.737821035466776e-06, + "loss": 8.7208, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 1.8464378118515015, + "learning_rate": 9.73739638537845e-06, + "loss": 8.3656, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 1.7817602157592773, + "learning_rate": 9.736971735290122e-06, + "loss": 8.3125, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 1.892408847808838, + "learning_rate": 9.736547085201793e-06, + "loss": 8.316, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 1.7000806331634521, + "learning_rate": 9.736122435113468e-06, + "loss": 8.5031, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 2.052206039428711, + "learning_rate": 9.73569778502514e-06, + "loss": 8.545, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 1.9103138446807861, + "learning_rate": 9.735273134936812e-06, + "loss": 8.3271, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 2.1280200481414795, + "learning_rate": 9.734848484848486e-06, + "loss": 8.0598, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 1.8781410455703735, + "learning_rate": 9.734423834760159e-06, + "loss": 8.4289, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 2.0789809226989746, + "learning_rate": 9.73399918467183e-06, + "loss": 8.3299, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 1.9288660287857056, + "learning_rate": 9.733574534583505e-06, + "loss": 8.4338, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 2.0185232162475586, + "learning_rate": 9.733149884495177e-06, + "loss": 8.4253, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 1.7563859224319458, + "learning_rate": 9.732725234406848e-06, + "loss": 7.996, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 1.763025164604187, + "learning_rate": 9.732300584318523e-06, + "loss": 8.2878, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 2.0214953422546387, + "learning_rate": 9.731875934230196e-06, + "loss": 8.3817, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 1.9048465490341187, + "learning_rate": 9.731451284141867e-06, + "loss": 8.0893, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 1.8526486158370972, + "learning_rate": 9.731026634053541e-06, + "loss": 8.3312, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 1.5993146896362305, + "learning_rate": 9.730601983965214e-06, + "loss": 8.3283, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 1.633880615234375, + "learning_rate": 9.730177333876887e-06, + "loss": 8.2038, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 2.159825086593628, + "learning_rate": 9.72975268378856e-06, + "loss": 8.3098, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 1.9780476093292236, + "learning_rate": 9.72932803370023e-06, + "loss": 8.365, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 1.6488490104675293, + "learning_rate": 9.728903383611905e-06, + "loss": 8.2241, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 1.5985373258590698, + "learning_rate": 9.728478733523578e-06, + "loss": 8.3254, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 1.8874716758728027, + "learning_rate": 9.72805408343525e-06, + "loss": 8.7923, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 2.152174949645996, + "learning_rate": 9.727629433346924e-06, + "loss": 8.1518, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 1.5608922243118286, + "learning_rate": 9.727204783258597e-06, + "loss": 8.0375, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 2.117875099182129, + "learning_rate": 9.726780133170268e-06, + "loss": 8.2622, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 1.6264911890029907, + "learning_rate": 9.726355483081942e-06, + "loss": 8.1492, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 2.0583395957946777, + "learning_rate": 9.725930832993615e-06, + "loss": 8.2882, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 2.7058372497558594, + "learning_rate": 9.725506182905286e-06, + "loss": 8.1985, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 1.7835265398025513, + "learning_rate": 9.72508153281696e-06, + "loss": 8.0816, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 1.8824657201766968, + "learning_rate": 9.724656882728633e-06, + "loss": 8.247, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 1.6934734582901, + "learning_rate": 9.724232232640304e-06, + "loss": 8.4321, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 1.903273105621338, + "learning_rate": 9.723807582551979e-06, + "loss": 8.2784, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 2.0810861587524414, + "learning_rate": 9.72338293246365e-06, + "loss": 8.1638, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 1.9789711236953735, + "learning_rate": 9.722958282375323e-06, + "loss": 8.259, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 1.4445555210113525, + "learning_rate": 9.722533632286997e-06, + "loss": 8.2635, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 1.9484059810638428, + "learning_rate": 9.722108982198668e-06, + "loss": 8.3518, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 1.6151180267333984, + "learning_rate": 9.721684332110341e-06, + "loss": 7.9592, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 1.6080671548843384, + "learning_rate": 9.721259682022016e-06, + "loss": 8.2752, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 2.145977735519409, + "learning_rate": 9.720835031933687e-06, + "loss": 8.3307, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 2.4139761924743652, + "learning_rate": 9.72041038184536e-06, + "loss": 8.1152, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 2.9978995323181152, + "learning_rate": 9.719985731757034e-06, + "loss": 8.1329, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 3.2721569538116455, + "learning_rate": 9.719561081668705e-06, + "loss": 8.5309, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 1.8700071573257446, + "learning_rate": 9.719136431580378e-06, + "loss": 7.9461, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 1.8225265741348267, + "learning_rate": 9.718711781492053e-06, + "loss": 8.0673, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 1.6704471111297607, + "learning_rate": 9.718287131403724e-06, + "loss": 8.066, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 1.6646416187286377, + "learning_rate": 9.717862481315396e-06, + "loss": 7.9705, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 3.127112627029419, + "learning_rate": 9.71743783122707e-06, + "loss": 8.5433, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 1.5120998620986938, + "learning_rate": 9.717013181138742e-06, + "loss": 8.1362, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 2.0904717445373535, + "learning_rate": 9.716588531050415e-06, + "loss": 7.8492, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 1.5632755756378174, + "learning_rate": 9.716163880962088e-06, + "loss": 8.2815, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 2.1681833267211914, + "learning_rate": 9.71573923087376e-06, + "loss": 8.3322, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 1.6717790365219116, + "learning_rate": 9.715314580785433e-06, + "loss": 7.8429, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 1.4922358989715576, + "learning_rate": 9.714889930697106e-06, + "loss": 8.1321, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 2.018345832824707, + "learning_rate": 9.714465280608779e-06, + "loss": 8.1306, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 2.049833297729492, + "learning_rate": 9.714040630520452e-06, + "loss": 8.3222, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 1.9591184854507446, + "learning_rate": 9.713615980432124e-06, + "loss": 8.2282, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 1.5382254123687744, + "learning_rate": 9.713191330343797e-06, + "loss": 8.1065, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 1.5967490673065186, + "learning_rate": 9.71276668025547e-06, + "loss": 7.8932, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 1.8861844539642334, + "learning_rate": 9.712342030167143e-06, + "loss": 8.0731, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 2.010568380355835, + "learning_rate": 9.711917380078816e-06, + "loss": 8.3083, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 2.0535953044891357, + "learning_rate": 9.711492729990488e-06, + "loss": 8.1902, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 1.7264593839645386, + "learning_rate": 9.711068079902161e-06, + "loss": 8.0172, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 1.9365153312683105, + "learning_rate": 9.710643429813834e-06, + "loss": 7.9974, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 1.956925868988037, + "learning_rate": 9.710218779725507e-06, + "loss": 8.0907, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 1.9848542213439941, + "learning_rate": 9.70979412963718e-06, + "loss": 8.4199, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 1.6359902620315552, + "learning_rate": 9.709369479548852e-06, + "loss": 8.1954, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 1.8433490991592407, + "learning_rate": 9.708944829460525e-06, + "loss": 8.2794, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 2.246016025543213, + "learning_rate": 9.708520179372198e-06, + "loss": 8.2572, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 2.0351147651672363, + "learning_rate": 9.708095529283871e-06, + "loss": 8.1948, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 2.216580867767334, + "learning_rate": 9.707670879195544e-06, + "loss": 8.2661, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 2.038313627243042, + "learning_rate": 9.707246229107216e-06, + "loss": 8.0632, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 1.83799147605896, + "learning_rate": 9.70682157901889e-06, + "loss": 8.0816, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 1.6049362421035767, + "learning_rate": 9.706396928930562e-06, + "loss": 8.0705, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 2.192065477371216, + "learning_rate": 9.705972278842235e-06, + "loss": 8.3939, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 1.5709937810897827, + "learning_rate": 9.705547628753908e-06, + "loss": 8.0445, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 2.640047073364258, + "learning_rate": 9.70512297866558e-06, + "loss": 8.2761, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 1.6289962530136108, + "learning_rate": 9.704698328577253e-06, + "loss": 8.16, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 1.7711145877838135, + "learning_rate": 9.704273678488926e-06, + "loss": 7.9928, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 1.797109842300415, + "learning_rate": 9.703849028400599e-06, + "loss": 7.8549, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 1.9044466018676758, + "learning_rate": 9.703424378312272e-06, + "loss": 7.9864, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 2.209481954574585, + "learning_rate": 9.702999728223944e-06, + "loss": 8.0779, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 2.235274076461792, + "learning_rate": 9.702575078135617e-06, + "loss": 8.2968, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 1.9905833005905151, + "learning_rate": 9.70215042804729e-06, + "loss": 8.2558, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 4.608081340789795, + "learning_rate": 9.701725777958963e-06, + "loss": 8.4226, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 1.8394711017608643, + "learning_rate": 9.701301127870636e-06, + "loss": 8.0148, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 1.6227418184280396, + "learning_rate": 9.700876477782308e-06, + "loss": 8.0814, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 1.5509939193725586, + "learning_rate": 9.700451827693981e-06, + "loss": 8.2769, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 2.381481409072876, + "learning_rate": 9.700027177605654e-06, + "loss": 8.0254, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 1.6435312032699585, + "learning_rate": 9.699602527517327e-06, + "loss": 8.0174, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 1.3890719413757324, + "learning_rate": 9.699177877429e-06, + "loss": 8.1249, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 2.228792667388916, + "learning_rate": 9.698753227340672e-06, + "loss": 8.1817, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 1.8183283805847168, + "learning_rate": 9.698328577252345e-06, + "loss": 8.3589, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 1.7336645126342773, + "learning_rate": 9.697903927164018e-06, + "loss": 8.0069, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 1.6706063747406006, + "learning_rate": 9.697479277075691e-06, + "loss": 8.0516, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 1.8942409753799438, + "learning_rate": 9.697054626987364e-06, + "loss": 7.9161, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 2.1556968688964844, + "learning_rate": 9.696629976899036e-06, + "loss": 8.0073, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 1.905173420906067, + "learning_rate": 9.69620532681071e-06, + "loss": 8.4872, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 2.582798480987549, + "learning_rate": 9.695780676722382e-06, + "loss": 8.2398, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 1.8031612634658813, + "learning_rate": 9.695356026634055e-06, + "loss": 8.0448, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 1.7692409753799438, + "learning_rate": 9.694931376545728e-06, + "loss": 8.2149, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 1.7830904722213745, + "learning_rate": 9.6945067264574e-06, + "loss": 7.9047, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 1.9386130571365356, + "learning_rate": 9.694082076369072e-06, + "loss": 8.4047, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 1.559373378753662, + "learning_rate": 9.693657426280746e-06, + "loss": 8.0203, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 1.6317503452301025, + "learning_rate": 9.693232776192419e-06, + "loss": 8.1442, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 1.964117407798767, + "learning_rate": 9.69280812610409e-06, + "loss": 7.8518, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 1.4454035758972168, + "learning_rate": 9.692383476015764e-06, + "loss": 7.8505, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 1.7038803100585938, + "learning_rate": 9.691958825927437e-06, + "loss": 7.9079, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 1.6210466623306274, + "learning_rate": 9.691534175839108e-06, + "loss": 8.0282, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 1.8805921077728271, + "learning_rate": 9.691109525750783e-06, + "loss": 8.1621, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 1.578094482421875, + "learning_rate": 9.690684875662456e-06, + "loss": 7.8631, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 1.6649115085601807, + "learning_rate": 9.690260225574127e-06, + "loss": 8.1061, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 1.8443374633789062, + "learning_rate": 9.689835575485801e-06, + "loss": 7.6698, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 1.628286361694336, + "learning_rate": 9.689410925397474e-06, + "loss": 7.9134, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 1.7300853729248047, + "learning_rate": 9.688986275309145e-06, + "loss": 7.9197, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 1.6985721588134766, + "learning_rate": 9.68856162522082e-06, + "loss": 8.0671, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 1.349033236503601, + "learning_rate": 9.68813697513249e-06, + "loss": 8.0278, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 1.6125155687332153, + "learning_rate": 9.687712325044164e-06, + "loss": 7.977, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 1.9823821783065796, + "learning_rate": 9.687287674955838e-06, + "loss": 7.9707, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 1.3984802961349487, + "learning_rate": 9.68686302486751e-06, + "loss": 7.9634, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 1.6475436687469482, + "learning_rate": 9.686438374779182e-06, + "loss": 7.903, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 1.4367499351501465, + "learning_rate": 9.686013724690856e-06, + "loss": 7.9856, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 1.5150820016860962, + "learning_rate": 9.685589074602528e-06, + "loss": 7.9109, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 1.5941588878631592, + "learning_rate": 9.6851644245142e-06, + "loss": 7.9755, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 1.7604492902755737, + "learning_rate": 9.684739774425875e-06, + "loss": 7.8859, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 1.4070253372192383, + "learning_rate": 9.684315124337546e-06, + "loss": 7.8071, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 1.5699213743209839, + "learning_rate": 9.683890474249219e-06, + "loss": 7.7037, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 1.800492763519287, + "learning_rate": 9.683465824160893e-06, + "loss": 8.103, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 1.7605360746383667, + "learning_rate": 9.683041174072564e-06, + "loss": 8.0341, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 1.7984868288040161, + "learning_rate": 9.682616523984237e-06, + "loss": 7.9835, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 1.489402413368225, + "learning_rate": 9.682191873895912e-06, + "loss": 7.7456, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 1.9971833229064941, + "learning_rate": 9.681767223807583e-06, + "loss": 7.7865, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 2.6241440773010254, + "learning_rate": 9.681342573719256e-06, + "loss": 8.0625, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 1.6215327978134155, + "learning_rate": 9.680917923630928e-06, + "loss": 7.7773, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 1.679215669631958, + "learning_rate": 9.680493273542601e-06, + "loss": 7.9485, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 1.796700358390808, + "learning_rate": 9.680068623454274e-06, + "loss": 7.5876, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 1.5699052810668945, + "learning_rate": 9.679643973365947e-06, + "loss": 7.8945, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 1.6011594533920288, + "learning_rate": 9.67921932327762e-06, + "loss": 8.0397, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 1.860802173614502, + "learning_rate": 9.678794673189292e-06, + "loss": 7.9878, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 1.4940128326416016, + "learning_rate": 9.678370023100965e-06, + "loss": 8.0095, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 1.723378300666809, + "learning_rate": 9.677945373012638e-06, + "loss": 8.0025, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 1.6612991094589233, + "learning_rate": 9.67752072292431e-06, + "loss": 7.8815, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 1.6254154443740845, + "learning_rate": 9.677096072835984e-06, + "loss": 7.907, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 1.7833635807037354, + "learning_rate": 9.676671422747656e-06, + "loss": 7.7968, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 1.4211536645889282, + "learning_rate": 9.67624677265933e-06, + "loss": 8.0775, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 1.3587783575057983, + "learning_rate": 9.675822122571002e-06, + "loss": 7.7667, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 1.8006142377853394, + "learning_rate": 9.675397472482675e-06, + "loss": 8.0769, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 2.0792760848999023, + "learning_rate": 9.674972822394348e-06, + "loss": 7.9328, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 1.8029084205627441, + "learning_rate": 9.67454817230602e-06, + "loss": 7.9973, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 1.7160025835037231, + "learning_rate": 9.674123522217693e-06, + "loss": 7.6927, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 1.4841077327728271, + "learning_rate": 9.673698872129366e-06, + "loss": 7.9615, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 2.188424825668335, + "learning_rate": 9.673274222041039e-06, + "loss": 7.9323, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 1.6702497005462646, + "learning_rate": 9.672849571952712e-06, + "loss": 7.9701, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 1.3686248064041138, + "learning_rate": 9.672424921864384e-06, + "loss": 7.9324, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 3.0005366802215576, + "learning_rate": 9.672000271776057e-06, + "loss": 8.0299, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 1.5062764883041382, + "learning_rate": 9.67157562168773e-06, + "loss": 7.7056, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 1.5591791868209839, + "learning_rate": 9.671150971599403e-06, + "loss": 8.0667, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 2.183457851409912, + "learning_rate": 9.670726321511076e-06, + "loss": 7.9613, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 1.5056397914886475, + "learning_rate": 9.670301671422748e-06, + "loss": 7.7649, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 1.9562219381332397, + "learning_rate": 9.669877021334421e-06, + "loss": 7.8727, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 1.5266690254211426, + "learning_rate": 9.669452371246094e-06, + "loss": 7.8443, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 1.9272682666778564, + "learning_rate": 9.669027721157767e-06, + "loss": 7.8077, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 1.9370181560516357, + "learning_rate": 9.66860307106944e-06, + "loss": 8.0247, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 1.4018436670303345, + "learning_rate": 9.668178420981112e-06, + "loss": 7.6242, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 1.567516565322876, + "learning_rate": 9.667753770892785e-06, + "loss": 7.8664, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 2.2065811157226562, + "learning_rate": 9.667329120804458e-06, + "loss": 7.9672, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 1.4228607416152954, + "learning_rate": 9.66690447071613e-06, + "loss": 7.9756, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 1.6753971576690674, + "learning_rate": 9.666479820627804e-06, + "loss": 7.9176, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 1.4208810329437256, + "learning_rate": 9.666055170539476e-06, + "loss": 8.0863, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 1.4897748231887817, + "learning_rate": 9.66563052045115e-06, + "loss": 8.1713, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 1.4231878519058228, + "learning_rate": 9.665205870362822e-06, + "loss": 7.7302, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 1.8147635459899902, + "learning_rate": 9.664781220274495e-06, + "loss": 8.2211, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 1.4185136556625366, + "learning_rate": 9.664356570186168e-06, + "loss": 7.7523, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 1.609837532043457, + "learning_rate": 9.66393192009784e-06, + "loss": 7.8036, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 1.7108650207519531, + "learning_rate": 9.663507270009513e-06, + "loss": 7.8764, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 1.8300257921218872, + "learning_rate": 9.663082619921186e-06, + "loss": 7.785, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 1.3818646669387817, + "learning_rate": 9.662657969832859e-06, + "loss": 7.8804, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 1.3443918228149414, + "learning_rate": 9.662233319744532e-06, + "loss": 7.6742, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 1.5431832075119019, + "learning_rate": 9.661808669656204e-06, + "loss": 8.0105, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 1.2372424602508545, + "learning_rate": 9.661384019567877e-06, + "loss": 7.8179, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 1.8563395738601685, + "learning_rate": 9.66095936947955e-06, + "loss": 7.92, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 1.5817902088165283, + "learning_rate": 9.660534719391223e-06, + "loss": 7.9204, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 1.5252302885055542, + "learning_rate": 9.660110069302896e-06, + "loss": 7.6374, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 2.0048201084136963, + "learning_rate": 9.659685419214568e-06, + "loss": 7.5959, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 1.4259779453277588, + "learning_rate": 9.659260769126241e-06, + "loss": 7.9453, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 2.067317008972168, + "learning_rate": 9.658836119037912e-06, + "loss": 7.9776, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 1.7814877033233643, + "learning_rate": 9.658411468949587e-06, + "loss": 7.9912, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 1.5007516145706177, + "learning_rate": 9.65798681886126e-06, + "loss": 7.7182, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 1.5059844255447388, + "learning_rate": 9.65756216877293e-06, + "loss": 7.8667, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 1.790657639503479, + "learning_rate": 9.657137518684605e-06, + "loss": 7.6049, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 1.706992745399475, + "learning_rate": 9.656712868596278e-06, + "loss": 7.9458, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 1.3728920221328735, + "learning_rate": 9.65628821850795e-06, + "loss": 7.7995, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 1.3591402769088745, + "learning_rate": 9.655863568419624e-06, + "loss": 7.8529, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 1.4771960973739624, + "learning_rate": 9.655438918331296e-06, + "loss": 7.8848, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 1.4850475788116455, + "learning_rate": 9.655014268242968e-06, + "loss": 7.5248, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 1.7069720029830933, + "learning_rate": 9.654589618154642e-06, + "loss": 7.7127, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 1.4488298892974854, + "learning_rate": 9.654164968066315e-06, + "loss": 7.9848, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 1.2563635110855103, + "learning_rate": 9.653740317977986e-06, + "loss": 7.7126, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 1.5969164371490479, + "learning_rate": 9.65331566788966e-06, + "loss": 7.6737, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 1.7851451635360718, + "learning_rate": 9.652891017801333e-06, + "loss": 7.8006, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 1.6866488456726074, + "learning_rate": 9.652466367713004e-06, + "loss": 7.9242, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 1.9010202884674072, + "learning_rate": 9.652041717624679e-06, + "loss": 7.7106, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 1.478929042816162, + "learning_rate": 9.65161706753635e-06, + "loss": 7.6413, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 1.4760854244232178, + "learning_rate": 9.651192417448023e-06, + "loss": 7.8949, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 1.7153033018112183, + "learning_rate": 9.650767767359697e-06, + "loss": 7.7533, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 1.6382431983947754, + "learning_rate": 9.650343117271368e-06, + "loss": 7.7177, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 1.3453532457351685, + "learning_rate": 9.649918467183041e-06, + "loss": 7.6371, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 1.529593825340271, + "learning_rate": 9.649493817094716e-06, + "loss": 7.7625, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 1.3357956409454346, + "learning_rate": 9.649069167006387e-06, + "loss": 7.6727, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 1.3229249715805054, + "learning_rate": 9.64864451691806e-06, + "loss": 7.9103, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 2.105422019958496, + "learning_rate": 9.648219866829734e-06, + "loss": 7.8427, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 1.9474483728408813, + "learning_rate": 9.647795216741405e-06, + "loss": 8.0419, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 1.2449572086334229, + "learning_rate": 9.647370566653078e-06, + "loss": 7.9128, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 1.4645432233810425, + "learning_rate": 9.646945916564752e-06, + "loss": 7.8116, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 1.3964130878448486, + "learning_rate": 9.646521266476424e-06, + "loss": 7.854, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 1.4077709913253784, + "learning_rate": 9.646096616388096e-06, + "loss": 7.6371, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 1.4445412158966064, + "learning_rate": 9.64567196629977e-06, + "loss": 7.723, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 2.2198169231414795, + "learning_rate": 9.645247316211442e-06, + "loss": 7.5947, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 1.6391876935958862, + "learning_rate": 9.644822666123115e-06, + "loss": 7.6991, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 1.866833209991455, + "learning_rate": 9.644398016034788e-06, + "loss": 7.8216, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 2.3788323402404785, + "learning_rate": 9.64397336594646e-06, + "loss": 7.8694, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 1.5158735513687134, + "learning_rate": 9.643548715858135e-06, + "loss": 7.6707, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 1.4841891527175903, + "learning_rate": 9.643124065769806e-06, + "loss": 7.8056, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 1.3326330184936523, + "learning_rate": 9.642699415681479e-06, + "loss": 7.7566, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 1.3501598834991455, + "learning_rate": 9.642274765593153e-06, + "loss": 7.6968, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 1.555914044380188, + "learning_rate": 9.641850115504824e-06, + "loss": 8.067, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 1.5668944120407104, + "learning_rate": 9.641425465416497e-06, + "loss": 7.5519, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 1.7123619318008423, + "learning_rate": 9.641000815328172e-06, + "loss": 7.7214, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 1.395336389541626, + "learning_rate": 9.640576165239843e-06, + "loss": 7.6357, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 1.6019988059997559, + "learning_rate": 9.640151515151516e-06, + "loss": 7.911, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 1.330839991569519, + "learning_rate": 9.639726865063188e-06, + "loss": 7.906, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 1.43523108959198, + "learning_rate": 9.639302214974861e-06, + "loss": 7.5565, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 1.5311310291290283, + "learning_rate": 9.638877564886534e-06, + "loss": 7.4994, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 1.4778732061386108, + "learning_rate": 9.638452914798207e-06, + "loss": 7.8232, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 1.5241563320159912, + "learning_rate": 9.63802826470988e-06, + "loss": 7.7176, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 1.1821743249893188, + "learning_rate": 9.637603614621552e-06, + "loss": 7.8133, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 1.8482180833816528, + "learning_rate": 9.637178964533225e-06, + "loss": 7.7347, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 1.1922824382781982, + "learning_rate": 9.636754314444898e-06, + "loss": 7.5635, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 1.3064136505126953, + "learning_rate": 9.63632966435657e-06, + "loss": 7.7457, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 1.1870638132095337, + "learning_rate": 9.635905014268244e-06, + "loss": 7.7358, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 1.3843802213668823, + "learning_rate": 9.635480364179916e-06, + "loss": 7.6568, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 1.0192164182662964, + "learning_rate": 9.63505571409159e-06, + "loss": 7.3967, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 1.4916598796844482, + "learning_rate": 9.634631064003262e-06, + "loss": 7.3345, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 1.2498506307601929, + "learning_rate": 9.634206413914935e-06, + "loss": 7.6548, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 1.4606589078903198, + "learning_rate": 9.633781763826608e-06, + "loss": 7.6742, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 1.528498888015747, + "learning_rate": 9.63335711373828e-06, + "loss": 7.7716, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 1.0994668006896973, + "learning_rate": 9.632932463649953e-06, + "loss": 7.5556, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 1.2977185249328613, + "learning_rate": 9.632507813561626e-06, + "loss": 7.5512, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 1.1432348489761353, + "learning_rate": 9.632083163473299e-06, + "loss": 7.712, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 1.6045879125595093, + "learning_rate": 9.631658513384972e-06, + "loss": 7.5775, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 1.33564293384552, + "learning_rate": 9.631233863296644e-06, + "loss": 7.3867, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 1.391176462173462, + "learning_rate": 9.630809213208317e-06, + "loss": 7.5213, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 1.2768720388412476, + "learning_rate": 9.63038456311999e-06, + "loss": 7.6423, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 1.6274430751800537, + "learning_rate": 9.629959913031663e-06, + "loss": 7.763, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 1.2973769903182983, + "learning_rate": 9.629535262943336e-06, + "loss": 7.6537, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 1.4032009840011597, + "learning_rate": 9.629110612855008e-06, + "loss": 7.6228, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 1.276227355003357, + "learning_rate": 9.628685962766681e-06, + "loss": 7.6368, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 1.2703239917755127, + "learning_rate": 9.628261312678354e-06, + "loss": 7.6601, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 1.238783359527588, + "learning_rate": 9.627836662590027e-06, + "loss": 7.7822, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 1.4424924850463867, + "learning_rate": 9.6274120125017e-06, + "loss": 7.7609, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 1.3772697448730469, + "learning_rate": 9.626987362413372e-06, + "loss": 7.6441, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 1.621362328529358, + "learning_rate": 9.626562712325045e-06, + "loss": 7.5859, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 1.776361107826233, + "learning_rate": 9.626138062236718e-06, + "loss": 7.5819, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 1.3549398183822632, + "learning_rate": 9.62571341214839e-06, + "loss": 7.9544, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 1.5463109016418457, + "learning_rate": 9.625288762060064e-06, + "loss": 7.6492, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 1.548681378364563, + "learning_rate": 9.624864111971736e-06, + "loss": 7.5522, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 1.032016634941101, + "learning_rate": 9.62443946188341e-06, + "loss": 7.6988, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 1.8639155626296997, + "learning_rate": 9.624014811795082e-06, + "loss": 7.6486, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 1.163831114768982, + "learning_rate": 9.623590161706755e-06, + "loss": 7.7754, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 1.7026983499526978, + "learning_rate": 9.623165511618428e-06, + "loss": 7.3901, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 1.3792121410369873, + "learning_rate": 9.6227408615301e-06, + "loss": 7.5095, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 1.4855046272277832, + "learning_rate": 9.622316211441772e-06, + "loss": 7.572, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 1.8751827478408813, + "learning_rate": 9.621891561353446e-06, + "loss": 7.6468, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.8145192861557007, + "learning_rate": 9.621466911265119e-06, + "loss": 7.6476, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 1.4857734441757202, + "learning_rate": 9.62104226117679e-06, + "loss": 7.5282, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 2.1511542797088623, + "learning_rate": 9.620617611088464e-06, + "loss": 7.5435, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 1.322142481803894, + "learning_rate": 9.620192961000137e-06, + "loss": 7.623, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 1.7737032175064087, + "learning_rate": 9.619768310911808e-06, + "loss": 7.9866, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 1.1355233192443848, + "learning_rate": 9.619343660823483e-06, + "loss": 7.743, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 1.2715108394622803, + "learning_rate": 9.618919010735156e-06, + "loss": 7.7504, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 1.6316419839859009, + "learning_rate": 9.618494360646827e-06, + "loss": 7.5368, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 1.2130463123321533, + "learning_rate": 9.618069710558501e-06, + "loss": 7.5449, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 1.3764592409133911, + "learning_rate": 9.617645060470174e-06, + "loss": 7.515, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 1.6078612804412842, + "learning_rate": 9.617220410381845e-06, + "loss": 7.5002, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 1.2442744970321655, + "learning_rate": 9.61679576029352e-06, + "loss": 7.6896, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 1.3061271905899048, + "learning_rate": 9.61637111020519e-06, + "loss": 7.7415, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 1.682708501815796, + "learning_rate": 9.615946460116864e-06, + "loss": 7.6159, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 1.6862084865570068, + "learning_rate": 9.615521810028538e-06, + "loss": 7.5949, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 1.1882232427597046, + "learning_rate": 9.615097159940209e-06, + "loss": 7.5997, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 1.6707614660263062, + "learning_rate": 9.614672509851884e-06, + "loss": 7.4272, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 1.4477314949035645, + "learning_rate": 9.614247859763556e-06, + "loss": 7.1856, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 1.3789488077163696, + "learning_rate": 9.613823209675228e-06, + "loss": 7.5557, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 1.4662619829177856, + "learning_rate": 9.613398559586902e-06, + "loss": 7.7022, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 1.2134242057800293, + "learning_rate": 9.612973909498575e-06, + "loss": 7.5808, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 1.3938546180725098, + "learning_rate": 9.612549259410246e-06, + "loss": 7.6427, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 1.2590209245681763, + "learning_rate": 9.61212460932192e-06, + "loss": 7.8353, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 1.2456141710281372, + "learning_rate": 9.611699959233593e-06, + "loss": 7.5591, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 1.2458726167678833, + "learning_rate": 9.611275309145264e-06, + "loss": 7.7012, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 1.4189165830612183, + "learning_rate": 9.610850659056939e-06, + "loss": 7.393, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 2.081804037094116, + "learning_rate": 9.61042600896861e-06, + "loss": 7.6624, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 1.2666360139846802, + "learning_rate": 9.610001358880283e-06, + "loss": 7.63, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 2.1545302867889404, + "learning_rate": 9.609576708791957e-06, + "loss": 7.6597, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 1.3855570554733276, + "learning_rate": 9.609152058703628e-06, + "loss": 7.5444, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 1.6135544776916504, + "learning_rate": 9.608727408615301e-06, + "loss": 7.5393, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 1.2931530475616455, + "learning_rate": 9.608302758526976e-06, + "loss": 7.5146, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 1.482391119003296, + "learning_rate": 9.607878108438647e-06, + "loss": 7.7587, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 1.6187175512313843, + "learning_rate": 9.60745345835032e-06, + "loss": 7.6421, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 1.0160502195358276, + "learning_rate": 9.607028808261994e-06, + "loss": 7.2451, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 1.5996487140655518, + "learning_rate": 9.606604158173665e-06, + "loss": 7.6333, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 1.4650472402572632, + "learning_rate": 9.606179508085338e-06, + "loss": 7.6046, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 2.8187129497528076, + "learning_rate": 9.605754857997012e-06, + "loss": 7.6763, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 1.0858076810836792, + "learning_rate": 9.605330207908684e-06, + "loss": 7.5636, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 1.2703900337219238, + "learning_rate": 9.604905557820356e-06, + "loss": 7.4756, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 1.6231516599655151, + "learning_rate": 9.604480907732031e-06, + "loss": 7.5085, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 1.0328954458236694, + "learning_rate": 9.604056257643702e-06, + "loss": 7.4052, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 1.228196620941162, + "learning_rate": 9.603631607555375e-06, + "loss": 7.3937, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 1.2110131978988647, + "learning_rate": 9.603206957467048e-06, + "loss": 7.6943, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 1.289746642112732, + "learning_rate": 9.60278230737872e-06, + "loss": 7.6914, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 1.7252827882766724, + "learning_rate": 9.602357657290393e-06, + "loss": 7.6235, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 1.2159883975982666, + "learning_rate": 9.601933007202066e-06, + "loss": 7.4101, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 1.5185797214508057, + "learning_rate": 9.601508357113739e-06, + "loss": 7.6134, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 1.1065338850021362, + "learning_rate": 9.601083707025412e-06, + "loss": 7.6766, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 1.1433777809143066, + "learning_rate": 9.600659056937084e-06, + "loss": 7.5709, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 1.343485951423645, + "learning_rate": 9.600234406848757e-06, + "loss": 7.5648, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 1.5320122241973877, + "learning_rate": 9.59980975676043e-06, + "loss": 7.6673, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 1.104996919631958, + "learning_rate": 9.599385106672103e-06, + "loss": 7.6931, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 1.813625454902649, + "learning_rate": 9.598960456583776e-06, + "loss": 7.628, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 1.2048423290252686, + "learning_rate": 9.598535806495448e-06, + "loss": 7.4711, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 1.0684188604354858, + "learning_rate": 9.598111156407121e-06, + "loss": 7.378, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 1.4968490600585938, + "learning_rate": 9.597686506318794e-06, + "loss": 7.3979, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 1.0921329259872437, + "learning_rate": 9.597261856230467e-06, + "loss": 7.4401, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 1.4298936128616333, + "learning_rate": 9.59683720614214e-06, + "loss": 7.4226, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 1.2362232208251953, + "learning_rate": 9.596412556053812e-06, + "loss": 7.434, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 0.969854474067688, + "learning_rate": 9.595987905965485e-06, + "loss": 7.375, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 1.29863440990448, + "learning_rate": 9.595563255877158e-06, + "loss": 7.6851, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 1.7302130460739136, + "learning_rate": 9.59513860578883e-06, + "loss": 7.676, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 1.1664716005325317, + "learning_rate": 9.594713955700504e-06, + "loss": 7.4986, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 1.3701177835464478, + "learning_rate": 9.594289305612176e-06, + "loss": 7.5178, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 1.1793434619903564, + "learning_rate": 9.59386465552385e-06, + "loss": 7.4428, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 1.4162105321884155, + "learning_rate": 9.593440005435522e-06, + "loss": 7.4091, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 1.2314858436584473, + "learning_rate": 9.593015355347195e-06, + "loss": 7.3221, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 1.2618643045425415, + "learning_rate": 9.592590705258868e-06, + "loss": 7.1773, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 1.3771189451217651, + "learning_rate": 9.59216605517054e-06, + "loss": 7.248, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.591741405082213e-06, + "loss": 7.623, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 1.431275486946106, + "learning_rate": 9.591316754993886e-06, + "loss": 7.1424, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 1.4563020467758179, + "learning_rate": 9.590892104905559e-06, + "loss": 7.6239, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 1.0713160037994385, + "learning_rate": 9.590467454817232e-06, + "loss": 7.396, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 1.224117398262024, + "learning_rate": 9.590042804728904e-06, + "loss": 7.4525, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 1.2163647413253784, + "learning_rate": 9.589618154640577e-06, + "loss": 7.5136, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 1.345991849899292, + "learning_rate": 9.58919350455225e-06, + "loss": 7.5047, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 1.966314673423767, + "learning_rate": 9.588768854463923e-06, + "loss": 7.5249, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 1.2280223369598389, + "learning_rate": 9.588344204375596e-06, + "loss": 7.5975, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 2.274409055709839, + "learning_rate": 9.587919554287268e-06, + "loss": 7.6436, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 1.1692484617233276, + "learning_rate": 9.587494904198941e-06, + "loss": 7.2415, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 1.1142812967300415, + "learning_rate": 9.587070254110612e-06, + "loss": 7.5088, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 1.2577345371246338, + "learning_rate": 9.586645604022287e-06, + "loss": 7.6296, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 1.2115339040756226, + "learning_rate": 9.58622095393396e-06, + "loss": 7.1602, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 1.4795914888381958, + "learning_rate": 9.585796303845632e-06, + "loss": 7.4581, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 1.1620038747787476, + "learning_rate": 9.585371653757305e-06, + "loss": 7.3403, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 1.199143886566162, + "learning_rate": 9.584947003668978e-06, + "loss": 7.2547, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 1.3171460628509521, + "learning_rate": 9.58452235358065e-06, + "loss": 7.7719, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 1.3158966302871704, + "learning_rate": 9.584097703492324e-06, + "loss": 7.4626, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 1.6291760206222534, + "learning_rate": 9.583673053403996e-06, + "loss": 7.6012, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 1.321362853050232, + "learning_rate": 9.58324840331567e-06, + "loss": 7.4215, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 1.3115805387496948, + "learning_rate": 9.582823753227342e-06, + "loss": 7.6325, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 1.1136987209320068, + "learning_rate": 9.582399103139015e-06, + "loss": 7.5722, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 1.2715038061141968, + "learning_rate": 9.581974453050688e-06, + "loss": 7.5969, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 1.5174721479415894, + "learning_rate": 9.58154980296236e-06, + "loss": 7.7186, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 1.3483880758285522, + "learning_rate": 9.581125152874032e-06, + "loss": 7.5081, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 1.3291321992874146, + "learning_rate": 9.580700502785706e-06, + "loss": 7.1953, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 1.124162197113037, + "learning_rate": 9.580275852697379e-06, + "loss": 7.6172, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 1.4410741329193115, + "learning_rate": 9.57985120260905e-06, + "loss": 7.4852, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 0.9831736087799072, + "learning_rate": 9.579426552520724e-06, + "loss": 7.3097, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 1.9982539415359497, + "learning_rate": 9.579001902432397e-06, + "loss": 7.6877, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 1.3467497825622559, + "learning_rate": 9.578577252344068e-06, + "loss": 7.2838, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 1.3324596881866455, + "learning_rate": 9.578152602255743e-06, + "loss": 7.0393, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 1.225204586982727, + "learning_rate": 9.577727952167416e-06, + "loss": 7.471, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 1.0643913745880127, + "learning_rate": 9.577303302079087e-06, + "loss": 7.4405, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 1.4514341354370117, + "learning_rate": 9.576878651990761e-06, + "loss": 7.3913, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 1.2244254350662231, + "learning_rate": 9.576454001902434e-06, + "loss": 7.5379, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 1.493311882019043, + "learning_rate": 9.576029351814105e-06, + "loss": 7.407, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 1.3579236268997192, + "learning_rate": 9.57560470172578e-06, + "loss": 7.488, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.575180051637452e-06, + "loss": 7.6384, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 1.3800066709518433, + "learning_rate": 9.574755401549124e-06, + "loss": 7.4287, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 1.3403029441833496, + "learning_rate": 9.574330751460798e-06, + "loss": 7.3363, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 1.361847162246704, + "learning_rate": 9.573906101372469e-06, + "loss": 7.4936, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 1.1145708560943604, + "learning_rate": 9.573481451284142e-06, + "loss": 7.4945, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 1.3940767049789429, + "learning_rate": 9.573056801195816e-06, + "loss": 7.4414, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 1.1958765983581543, + "learning_rate": 9.572632151107488e-06, + "loss": 7.5632, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 1.1961469650268555, + "learning_rate": 9.57220750101916e-06, + "loss": 7.2265, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 1.161600112915039, + "learning_rate": 9.571782850930835e-06, + "loss": 7.4662, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 1.5003905296325684, + "learning_rate": 9.571358200842506e-06, + "loss": 7.5116, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 1.010978102684021, + "learning_rate": 9.570933550754179e-06, + "loss": 7.333, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 1.121873378753662, + "learning_rate": 9.570508900665853e-06, + "loss": 7.2318, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 1.2438092231750488, + "learning_rate": 9.570084250577524e-06, + "loss": 7.1619, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 1.303676962852478, + "learning_rate": 9.569659600489197e-06, + "loss": 7.3621, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 1.2670460939407349, + "learning_rate": 9.569234950400872e-06, + "loss": 7.353, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 0.967865526676178, + "learning_rate": 9.568810300312543e-06, + "loss": 7.3195, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 1.6943622827529907, + "learning_rate": 9.568385650224216e-06, + "loss": 7.3526, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 1.3840936422348022, + "learning_rate": 9.567961000135888e-06, + "loss": 7.4323, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 1.7878801822662354, + "learning_rate": 9.567536350047561e-06, + "loss": 7.3715, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 1.0391308069229126, + "learning_rate": 9.567111699959234e-06, + "loss": 7.4451, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 1.2489497661590576, + "learning_rate": 9.566687049870907e-06, + "loss": 7.2971, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 1.0140299797058105, + "learning_rate": 9.56626239978258e-06, + "loss": 7.5294, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 1.1607557535171509, + "learning_rate": 9.565837749694252e-06, + "loss": 7.3935, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 1.2958191633224487, + "learning_rate": 9.565413099605925e-06, + "loss": 7.287, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 1.1276158094406128, + "learning_rate": 9.564988449517598e-06, + "loss": 7.2742, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 1.1953548192977905, + "learning_rate": 9.56456379942927e-06, + "loss": 7.1352, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 1.1012563705444336, + "learning_rate": 9.564139149340944e-06, + "loss": 7.3207, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 1.2536355257034302, + "learning_rate": 9.563714499252616e-06, + "loss": 7.2047, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 1.0012165307998657, + "learning_rate": 9.563289849164289e-06, + "loss": 7.1411, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 1.8531949520111084, + "learning_rate": 9.562865199075962e-06, + "loss": 7.6668, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 1.1611249446868896, + "learning_rate": 9.562440548987635e-06, + "loss": 7.2131, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 1.269422173500061, + "learning_rate": 9.562015898899308e-06, + "loss": 7.3751, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 0.8757149577140808, + "learning_rate": 9.56159124881098e-06, + "loss": 7.479, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 1.3466209173202515, + "learning_rate": 9.561166598722653e-06, + "loss": 7.3405, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 1.20992910861969, + "learning_rate": 9.560741948634326e-06, + "loss": 7.278, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 1.2432438135147095, + "learning_rate": 9.560317298545999e-06, + "loss": 7.3028, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 1.2688630819320679, + "learning_rate": 9.559892648457672e-06, + "loss": 7.4551, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 1.3420954942703247, + "learning_rate": 9.559467998369344e-06, + "loss": 7.441, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 1.1211988925933838, + "learning_rate": 9.559043348281017e-06, + "loss": 7.3065, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 1.031619668006897, + "learning_rate": 9.55861869819269e-06, + "loss": 7.3563, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 1.213963270187378, + "learning_rate": 9.558194048104363e-06, + "loss": 7.4431, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 1.8332287073135376, + "learning_rate": 9.557769398016036e-06, + "loss": 7.4009, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 1.2374143600463867, + "learning_rate": 9.557344747927708e-06, + "loss": 7.313, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 1.2319973707199097, + "learning_rate": 9.556920097839381e-06, + "loss": 7.3393, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 1.2003912925720215, + "learning_rate": 9.556495447751054e-06, + "loss": 7.1592, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 1.12804114818573, + "learning_rate": 9.556070797662727e-06, + "loss": 7.4808, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 1.3445720672607422, + "learning_rate": 9.5556461475744e-06, + "loss": 7.1586, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 1.1448252201080322, + "learning_rate": 9.555221497486072e-06, + "loss": 7.3465, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 1.2098517417907715, + "learning_rate": 9.554796847397745e-06, + "loss": 7.2902, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 1.2606232166290283, + "learning_rate": 9.554372197309418e-06, + "loss": 7.5607, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 1.301750898361206, + "learning_rate": 9.55394754722109e-06, + "loss": 7.3728, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 0.993430495262146, + "learning_rate": 9.553522897132764e-06, + "loss": 7.3345, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 1.056207299232483, + "learning_rate": 9.553098247044436e-06, + "loss": 7.0251, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 1.1407129764556885, + "learning_rate": 9.55267359695611e-06, + "loss": 7.269, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 0.9822043180465698, + "learning_rate": 9.552248946867782e-06, + "loss": 7.2726, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 1.2222548723220825, + "learning_rate": 9.551824296779455e-06, + "loss": 7.3847, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 1.111000418663025, + "learning_rate": 9.551399646691128e-06, + "loss": 7.4112, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 1.3948721885681152, + "learning_rate": 9.5509749966028e-06, + "loss": 7.3627, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 1.321800708770752, + "learning_rate": 9.550550346514473e-06, + "loss": 7.2808, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 1.6836682558059692, + "learning_rate": 9.550125696426146e-06, + "loss": 7.2685, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 0.9692202806472778, + "learning_rate": 9.549701046337819e-06, + "loss": 7.3833, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 1.2989288568496704, + "learning_rate": 9.549276396249492e-06, + "loss": 7.4838, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 1.7783026695251465, + "learning_rate": 9.548851746161164e-06, + "loss": 7.6425, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 1.311322808265686, + "learning_rate": 9.548427096072837e-06, + "loss": 7.1209, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 1.456925630569458, + "learning_rate": 9.54800244598451e-06, + "loss": 7.4885, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 1.707478404045105, + "learning_rate": 9.547577795896183e-06, + "loss": 7.3679, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 1.2551711797714233, + "learning_rate": 9.547153145807856e-06, + "loss": 7.5123, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 1.0625113248825073, + "learning_rate": 9.546728495719528e-06, + "loss": 7.1548, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 1.2070060968399048, + "learning_rate": 9.546303845631201e-06, + "loss": 7.3117, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 1.0518494844436646, + "learning_rate": 9.545879195542872e-06, + "loss": 7.3036, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 1.4446951150894165, + "learning_rate": 9.545454545454547e-06, + "loss": 7.1107, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 1.01797616481781, + "learning_rate": 9.54502989536622e-06, + "loss": 7.1598, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 1.710965633392334, + "learning_rate": 9.54460524527789e-06, + "loss": 7.56, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 1.3303159475326538, + "learning_rate": 9.544180595189565e-06, + "loss": 7.197, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 1.0526362657546997, + "learning_rate": 9.543755945101238e-06, + "loss": 7.1636, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 1.6635013818740845, + "learning_rate": 9.543331295012909e-06, + "loss": 7.1199, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 0.8602310419082642, + "learning_rate": 9.542906644924584e-06, + "loss": 7.3228, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 1.1290918588638306, + "learning_rate": 9.542481994836256e-06, + "loss": 7.1787, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 0.9420107007026672, + "learning_rate": 9.542057344747927e-06, + "loss": 7.2811, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 1.5882043838500977, + "learning_rate": 9.541632694659602e-06, + "loss": 7.2306, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 1.5327876806259155, + "learning_rate": 9.541208044571275e-06, + "loss": 7.1318, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 1.1644059419631958, + "learning_rate": 9.540783394482946e-06, + "loss": 7.3483, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 0.8405628204345703, + "learning_rate": 9.54035874439462e-06, + "loss": 6.8896, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 1.4642516374588013, + "learning_rate": 9.539934094306293e-06, + "loss": 7.3716, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 1.1444036960601807, + "learning_rate": 9.539509444217964e-06, + "loss": 7.0326, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 0.9684827923774719, + "learning_rate": 9.539084794129639e-06, + "loss": 7.0764, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 1.2783466577529907, + "learning_rate": 9.53866014404131e-06, + "loss": 7.4077, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 1.0315830707550049, + "learning_rate": 9.538235493952983e-06, + "loss": 7.0184, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 1.121238350868225, + "learning_rate": 9.537810843864657e-06, + "loss": 7.1418, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 1.2503337860107422, + "learning_rate": 9.537386193776328e-06, + "loss": 7.2045, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 1.1864569187164307, + "learning_rate": 9.536961543688001e-06, + "loss": 7.3383, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 1.3663928508758545, + "learning_rate": 9.536536893599676e-06, + "loss": 7.2157, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 1.1199800968170166, + "learning_rate": 9.536112243511347e-06, + "loss": 7.1881, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 1.293368935585022, + "learning_rate": 9.53568759342302e-06, + "loss": 7.3182, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 1.1258602142333984, + "learning_rate": 9.535262943334694e-06, + "loss": 7.3474, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 1.0312446355819702, + "learning_rate": 9.534838293246365e-06, + "loss": 7.1741, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 1.4382026195526123, + "learning_rate": 9.534413643158038e-06, + "loss": 7.4105, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 1.085905909538269, + "learning_rate": 9.533988993069712e-06, + "loss": 7.276, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 1.3694627285003662, + "learning_rate": 9.533564342981384e-06, + "loss": 7.2859, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 1.1991710662841797, + "learning_rate": 9.533139692893056e-06, + "loss": 7.3648, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 1.1170839071273804, + "learning_rate": 9.532715042804729e-06, + "loss": 7.3727, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 1.045008897781372, + "learning_rate": 9.532290392716402e-06, + "loss": 7.3763, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 1.3781782388687134, + "learning_rate": 9.531865742628075e-06, + "loss": 7.3503, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 0.977583110332489, + "learning_rate": 9.531441092539748e-06, + "loss": 7.1202, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 1.1080034971237183, + "learning_rate": 9.53101644245142e-06, + "loss": 6.9418, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 1.313391089439392, + "learning_rate": 9.530591792363093e-06, + "loss": 6.8897, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 0.9265292286872864, + "learning_rate": 9.530167142274766e-06, + "loss": 7.234, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 1.0927367210388184, + "learning_rate": 9.529742492186439e-06, + "loss": 7.3557, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 1.1829965114593506, + "learning_rate": 9.529317842098112e-06, + "loss": 7.4733, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 0.9814586043357849, + "learning_rate": 9.528893192009784e-06, + "loss": 7.3199, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 0.9074739813804626, + "learning_rate": 9.528468541921457e-06, + "loss": 7.1656, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 1.2673373222351074, + "learning_rate": 9.528043891833132e-06, + "loss": 7.5273, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 1.2241482734680176, + "learning_rate": 9.527619241744803e-06, + "loss": 7.2806, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 0.9892963767051697, + "learning_rate": 9.527194591656476e-06, + "loss": 7.1204, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 1.317311406135559, + "learning_rate": 9.526769941568148e-06, + "loss": 7.1986, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 0.9860427975654602, + "learning_rate": 9.526345291479821e-06, + "loss": 6.9937, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 1.4406852722167969, + "learning_rate": 9.525920641391494e-06, + "loss": 7.1934, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 1.1962993144989014, + "learning_rate": 9.525495991303167e-06, + "loss": 7.0667, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 1.2438690662384033, + "learning_rate": 9.52507134121484e-06, + "loss": 7.1597, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 1.419825553894043, + "learning_rate": 9.524646691126512e-06, + "loss": 7.2211, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 1.071311593055725, + "learning_rate": 9.524222041038185e-06, + "loss": 7.2624, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 1.0137948989868164, + "learning_rate": 9.523797390949858e-06, + "loss": 7.1936, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 1.156599521636963, + "learning_rate": 9.52337274086153e-06, + "loss": 7.3566, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 0.942391574382782, + "learning_rate": 9.522948090773204e-06, + "loss": 7.2926, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 1.074296474456787, + "learning_rate": 9.522523440684876e-06, + "loss": 7.1612, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 1.3120936155319214, + "learning_rate": 9.522098790596549e-06, + "loss": 7.2371, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 1.3015021085739136, + "learning_rate": 9.521674140508222e-06, + "loss": 7.3531, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 1.1863629817962646, + "learning_rate": 9.521249490419895e-06, + "loss": 7.163, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 1.0688556432724, + "learning_rate": 9.520824840331568e-06, + "loss": 7.1314, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 0.98948734998703, + "learning_rate": 9.52040019024324e-06, + "loss": 7.0425, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 1.174883246421814, + "learning_rate": 9.519975540154913e-06, + "loss": 7.1671, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 1.1549357175827026, + "learning_rate": 9.519550890066586e-06, + "loss": 7.2904, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 1.5308055877685547, + "learning_rate": 9.519126239978259e-06, + "loss": 7.1512, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 1.3643089532852173, + "learning_rate": 9.518701589889932e-06, + "loss": 7.1148, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 0.9959084391593933, + "learning_rate": 9.518276939801604e-06, + "loss": 7.1522, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 1.123969554901123, + "learning_rate": 9.517852289713277e-06, + "loss": 6.9243, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 0.9675835967063904, + "learning_rate": 9.51742763962495e-06, + "loss": 6.9129, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 1.1987221240997314, + "learning_rate": 9.517002989536623e-06, + "loss": 7.0823, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 1.1132768392562866, + "learning_rate": 9.516578339448296e-06, + "loss": 7.0024, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 1.0880979299545288, + "learning_rate": 9.516153689359968e-06, + "loss": 7.0554, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 0.9191100597381592, + "learning_rate": 9.515729039271641e-06, + "loss": 7.1254, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 0.9297699928283691, + "learning_rate": 9.515304389183314e-06, + "loss": 7.1651, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 0.9137892723083496, + "learning_rate": 9.514879739094987e-06, + "loss": 7.4242, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 1.4167314767837524, + "learning_rate": 9.51445508900666e-06, + "loss": 7.2226, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 1.0583386421203613, + "learning_rate": 9.514030438918332e-06, + "loss": 7.1842, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 1.1724671125411987, + "learning_rate": 9.513605788830005e-06, + "loss": 7.124, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 1.3348591327667236, + "learning_rate": 9.513181138741678e-06, + "loss": 7.3151, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 1.0466469526290894, + "learning_rate": 9.51275648865335e-06, + "loss": 7.3636, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 1.282662034034729, + "learning_rate": 9.512331838565024e-06, + "loss": 7.1534, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 1.3631551265716553, + "learning_rate": 9.511907188476696e-06, + "loss": 7.0181, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 0.8994230031967163, + "learning_rate": 9.511482538388369e-06, + "loss": 7.1349, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 1.2012819051742554, + "learning_rate": 9.511057888300042e-06, + "loss": 7.2377, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 1.108229398727417, + "learning_rate": 9.510633238211715e-06, + "loss": 7.1901, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 1.0396685600280762, + "learning_rate": 9.510208588123388e-06, + "loss": 7.3906, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 1.1145473718643188, + "learning_rate": 9.50978393803506e-06, + "loss": 7.1405, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 1.1773136854171753, + "learning_rate": 9.509359287946731e-06, + "loss": 7.1604, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 1.1064716577529907, + "learning_rate": 9.508934637858406e-06, + "loss": 7.2052, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 1.2980811595916748, + "learning_rate": 9.508509987770079e-06, + "loss": 7.2269, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 0.9173927307128906, + "learning_rate": 9.50808533768175e-06, + "loss": 7.073, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 1.1991550922393799, + "learning_rate": 9.507660687593424e-06, + "loss": 7.2784, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 1.4972875118255615, + "learning_rate": 9.507236037505097e-06, + "loss": 7.4369, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 1.1932111978530884, + "learning_rate": 9.506811387416768e-06, + "loss": 7.0737, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 1.0212924480438232, + "learning_rate": 9.506386737328443e-06, + "loss": 6.7813, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 1.005353331565857, + "learning_rate": 9.505962087240116e-06, + "loss": 7.3339, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 1.028649926185608, + "learning_rate": 9.505537437151787e-06, + "loss": 7.2072, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 1.051651120185852, + "learning_rate": 9.505112787063461e-06, + "loss": 7.3663, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 1.0965793132781982, + "learning_rate": 9.504688136975134e-06, + "loss": 7.3386, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 1.2467221021652222, + "learning_rate": 9.504263486886805e-06, + "loss": 7.1118, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 1.0550509691238403, + "learning_rate": 9.50383883679848e-06, + "loss": 6.8584, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 1.1440951824188232, + "learning_rate": 9.50341418671015e-06, + "loss": 6.9969, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 1.0417051315307617, + "learning_rate": 9.502989536621823e-06, + "loss": 7.148, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 1.3437097072601318, + "learning_rate": 9.502564886533498e-06, + "loss": 7.0785, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 0.9693278074264526, + "learning_rate": 9.502140236445169e-06, + "loss": 6.9518, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 1.057731032371521, + "learning_rate": 9.501715586356842e-06, + "loss": 6.982, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 0.7926727533340454, + "learning_rate": 9.501290936268516e-06, + "loss": 6.8515, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 1.0944507122039795, + "learning_rate": 9.500866286180187e-06, + "loss": 7.2861, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 1.0467087030410767, + "learning_rate": 9.50044163609186e-06, + "loss": 7.1706, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 0.9234235286712646, + "learning_rate": 9.500016986003535e-06, + "loss": 7.158, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 1.482506275177002, + "learning_rate": 9.499592335915206e-06, + "loss": 6.9561, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 0.9408546686172485, + "learning_rate": 9.49916768582688e-06, + "loss": 7.2118, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 1.003036379814148, + "learning_rate": 9.498743035738553e-06, + "loss": 7.1635, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 0.8950259685516357, + "learning_rate": 9.498318385650224e-06, + "loss": 7.0599, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 0.906271755695343, + "learning_rate": 9.497893735561899e-06, + "loss": 6.9147, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.9998205900192261, + "learning_rate": 9.49746908547357e-06, + "loss": 6.968, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 1.2797770500183105, + "learning_rate": 9.497044435385243e-06, + "loss": 7.0208, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 0.8571717143058777, + "learning_rate": 9.496619785296917e-06, + "loss": 7.1105, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 0.8847532868385315, + "learning_rate": 9.496195135208588e-06, + "loss": 7.192, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.9208801984786987, + "learning_rate": 9.495770485120261e-06, + "loss": 7.0457, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 0.9773485660552979, + "learning_rate": 9.495345835031936e-06, + "loss": 7.3149, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 1.2985477447509766, + "learning_rate": 9.494921184943607e-06, + "loss": 7.2595, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 1.2775092124938965, + "learning_rate": 9.49449653485528e-06, + "loss": 7.3211, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 1.4441479444503784, + "learning_rate": 9.494071884766954e-06, + "loss": 7.1237, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 1.136715292930603, + "learning_rate": 9.493647234678625e-06, + "loss": 7.1602, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 0.9682796001434326, + "learning_rate": 9.493222584590298e-06, + "loss": 7.3635, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 0.9240173101425171, + "learning_rate": 9.492797934501972e-06, + "loss": 7.0107, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 0.8105183243751526, + "learning_rate": 9.492373284413643e-06, + "loss": 7.0454, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 0.9453456997871399, + "learning_rate": 9.491948634325316e-06, + "loss": 7.1084, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 1.1496661901474, + "learning_rate": 9.49152398423699e-06, + "loss": 7.4134, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 1.0155354738235474, + "learning_rate": 9.491099334148662e-06, + "loss": 7.2588, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 1.2182565927505493, + "learning_rate": 9.490674684060335e-06, + "loss": 6.9242, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 0.7630282640457153, + "learning_rate": 9.490250033972007e-06, + "loss": 7.2641, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 0.8422251343727112, + "learning_rate": 9.48982538388368e-06, + "loss": 7.0428, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 1.0205700397491455, + "learning_rate": 9.489400733795353e-06, + "loss": 7.1783, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 0.9248514771461487, + "learning_rate": 9.488976083707026e-06, + "loss": 7.0116, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 0.9137230515480042, + "learning_rate": 9.488551433618699e-06, + "loss": 7.03, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 1.0392686128616333, + "learning_rate": 9.488126783530371e-06, + "loss": 7.0934, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 1.1267544031143188, + "learning_rate": 9.487702133442044e-06, + "loss": 7.1834, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 1.3940680027008057, + "learning_rate": 9.487277483353717e-06, + "loss": 7.1648, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 1.1078941822052002, + "learning_rate": 9.48685283326539e-06, + "loss": 7.062, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 1.0429941415786743, + "learning_rate": 9.486428183177063e-06, + "loss": 7.137, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 0.8603450059890747, + "learning_rate": 9.486003533088735e-06, + "loss": 7.2113, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 0.8819674849510193, + "learning_rate": 9.485578883000408e-06, + "loss": 6.9737, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 1.1130350828170776, + "learning_rate": 9.485154232912081e-06, + "loss": 7.2123, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 0.9767665266990662, + "learning_rate": 9.484729582823754e-06, + "loss": 7.1427, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 1.219112515449524, + "learning_rate": 9.484304932735427e-06, + "loss": 6.7836, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.8666915893554688, + "learning_rate": 9.4838802826471e-06, + "loss": 7.0666, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 1.53532874584198, + "learning_rate": 9.483455632558772e-06, + "loss": 7.1596, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 0.8915999531745911, + "learning_rate": 9.483030982470445e-06, + "loss": 7.2992, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 0.9171149730682373, + "learning_rate": 9.482606332382118e-06, + "loss": 7.0273, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 0.8702678680419922, + "learning_rate": 9.482266612311455e-06, + "loss": 7.0387, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 1.1009182929992676, + "learning_rate": 9.48184196222313e-06, + "loss": 6.8604, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 1.17258620262146, + "learning_rate": 9.481417312134802e-06, + "loss": 6.918, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 1.1784557104110718, + "learning_rate": 9.480992662046474e-06, + "loss": 6.9823, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 1.169632911682129, + "learning_rate": 9.480568011958148e-06, + "loss": 7.341, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 1.103583812713623, + "learning_rate": 9.48014336186982e-06, + "loss": 7.1418, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 0.8504982590675354, + "learning_rate": 9.479718711781492e-06, + "loss": 6.8208, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 1.2303040027618408, + "learning_rate": 9.479294061693166e-06, + "loss": 7.0753, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 1.1429755687713623, + "learning_rate": 9.47886941160484e-06, + "loss": 7.0926, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 1.8517396450042725, + "learning_rate": 9.47844476151651e-06, + "loss": 7.0009, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 0.9018865823745728, + "learning_rate": 9.478020111428185e-06, + "loss": 6.9662, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 1.20879065990448, + "learning_rate": 9.477595461339858e-06, + "loss": 7.1788, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.8970786333084106, + "learning_rate": 9.477170811251529e-06, + "loss": 6.9998, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 0.918951690196991, + "learning_rate": 9.476746161163203e-06, + "loss": 7.2165, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 0.8602477312088013, + "learning_rate": 9.476321511074876e-06, + "loss": 7.1064, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 1.0329922437667847, + "learning_rate": 9.475896860986547e-06, + "loss": 7.1243, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 1.1206920146942139, + "learning_rate": 9.475472210898222e-06, + "loss": 6.889, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 1.0798910856246948, + "learning_rate": 9.475047560809893e-06, + "loss": 6.9971, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 0.9241163730621338, + "learning_rate": 9.474622910721566e-06, + "loss": 7.0615, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 1.0887022018432617, + "learning_rate": 9.47419826063324e-06, + "loss": 7.027, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 0.92713862657547, + "learning_rate": 9.473773610544911e-06, + "loss": 6.9932, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 0.9353451728820801, + "learning_rate": 9.473348960456584e-06, + "loss": 6.8898, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 0.9619958996772766, + "learning_rate": 9.472924310368258e-06, + "loss": 6.999, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 0.863460123538971, + "learning_rate": 9.47249966027993e-06, + "loss": 6.9689, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 0.8515422344207764, + "learning_rate": 9.472075010191602e-06, + "loss": 6.9178, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 1.1933566331863403, + "learning_rate": 9.471650360103277e-06, + "loss": 7.145, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 0.990936279296875, + "learning_rate": 9.471225710014948e-06, + "loss": 7.0211, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 1.4157167673110962, + "learning_rate": 9.47080105992662e-06, + "loss": 7.1478, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.9909043908119202, + "learning_rate": 9.470376409838295e-06, + "loss": 6.9939, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 1.085897445678711, + "learning_rate": 9.469951759749966e-06, + "loss": 7.082, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 1.0903548002243042, + "learning_rate": 9.469527109661639e-06, + "loss": 6.896, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 0.9738759994506836, + "learning_rate": 9.469102459573312e-06, + "loss": 7.0177, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 0.909224808216095, + "learning_rate": 9.468677809484985e-06, + "loss": 7.0388, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 0.9810065627098083, + "learning_rate": 9.468253159396658e-06, + "loss": 7.0466, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 0.8595856428146362, + "learning_rate": 9.46782850930833e-06, + "loss": 7.1434, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 0.8824175000190735, + "learning_rate": 9.467403859220003e-06, + "loss": 6.9148, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 1.076076626777649, + "learning_rate": 9.466979209131676e-06, + "loss": 6.9624, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 1.1642382144927979, + "learning_rate": 9.466554559043349e-06, + "loss": 7.0447, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 1.0411481857299805, + "learning_rate": 9.466129908955022e-06, + "loss": 7.0483, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 0.903594970703125, + "learning_rate": 9.465705258866694e-06, + "loss": 7.0662, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 0.7276753783226013, + "learning_rate": 9.465280608778367e-06, + "loss": 6.9305, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 1.0119266510009766, + "learning_rate": 9.46485595869004e-06, + "loss": 6.6774, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 1.3340345621109009, + "learning_rate": 9.464431308601713e-06, + "loss": 7.1332, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 1.0572330951690674, + "learning_rate": 9.464006658513386e-06, + "loss": 6.9297, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.8724763989448547, + "learning_rate": 9.463582008425058e-06, + "loss": 7.0449, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 0.938838005065918, + "learning_rate": 9.463157358336731e-06, + "loss": 7.1444, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 1.267940640449524, + "learning_rate": 9.462732708248404e-06, + "loss": 7.1273, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 0.9566308259963989, + "learning_rate": 9.462308058160077e-06, + "loss": 6.9935, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 1.1679370403289795, + "learning_rate": 9.46188340807175e-06, + "loss": 7.1059, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 0.9410686492919922, + "learning_rate": 9.461458757983422e-06, + "loss": 6.9536, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 0.8753842711448669, + "learning_rate": 9.461034107895095e-06, + "loss": 7.0528, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 1.1074210405349731, + "learning_rate": 9.460609457806768e-06, + "loss": 6.9708, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 0.7825161218643188, + "learning_rate": 9.46018480771844e-06, + "loss": 6.8772, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 0.9924049973487854, + "learning_rate": 9.459760157630114e-06, + "loss": 6.9317, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 1.065462589263916, + "learning_rate": 9.459335507541786e-06, + "loss": 7.0334, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 0.9140529632568359, + "learning_rate": 9.45891085745346e-06, + "loss": 6.8327, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 1.3456588983535767, + "learning_rate": 9.458486207365132e-06, + "loss": 7.3135, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 1.4509061574935913, + "learning_rate": 9.458061557276805e-06, + "loss": 6.9207, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 0.796503484249115, + "learning_rate": 9.457636907188478e-06, + "loss": 7.2753, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 0.9864493012428284, + "learning_rate": 9.45721225710015e-06, + "loss": 7.1621, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 1.0423486232757568, + "learning_rate": 9.456787607011823e-06, + "loss": 6.9656, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 0.7430753707885742, + "learning_rate": 9.456362956923496e-06, + "loss": 7.0029, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 0.9109513759613037, + "learning_rate": 9.455938306835169e-06, + "loss": 6.8634, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 0.8650234341621399, + "learning_rate": 9.455513656746842e-06, + "loss": 6.9364, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 0.7624695301055908, + "learning_rate": 9.455089006658514e-06, + "loss": 6.8988, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 0.9330212473869324, + "learning_rate": 9.454664356570187e-06, + "loss": 6.9252, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 0.9918550252914429, + "learning_rate": 9.45423970648186e-06, + "loss": 7.02, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.8198363780975342, + "learning_rate": 9.453815056393533e-06, + "loss": 7.113, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 1.0742613077163696, + "learning_rate": 9.453390406305206e-06, + "loss": 7.0097, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 0.8951501846313477, + "learning_rate": 9.452965756216878e-06, + "loss": 6.9517, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 0.9625474810600281, + "learning_rate": 9.452541106128551e-06, + "loss": 6.8211, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 1.0708805322647095, + "learning_rate": 9.452116456040224e-06, + "loss": 6.9853, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 1.1871598958969116, + "learning_rate": 9.451691805951897e-06, + "loss": 7.1983, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 1.0670018196105957, + "learning_rate": 9.45126715586357e-06, + "loss": 6.9534, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 1.0406653881072998, + "learning_rate": 9.450842505775242e-06, + "loss": 6.9595, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 1.0186182260513306, + "learning_rate": 9.450417855686915e-06, + "loss": 7.1505, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.941541314125061, + "learning_rate": 9.449993205598588e-06, + "loss": 7.0828, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 0.9553037285804749, + "learning_rate": 9.44956855551026e-06, + "loss": 6.8788, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 1.079506754875183, + "learning_rate": 9.449143905421934e-06, + "loss": 7.0546, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 0.85225510597229, + "learning_rate": 9.448719255333606e-06, + "loss": 6.7684, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 1.1207984685897827, + "learning_rate": 9.44829460524528e-06, + "loss": 7.1043, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 0.8311938643455505, + "learning_rate": 9.447869955156952e-06, + "loss": 6.7751, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 0.9116659164428711, + "learning_rate": 9.447445305068625e-06, + "loss": 6.9963, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 1.1168979406356812, + "learning_rate": 9.447020654980298e-06, + "loss": 6.8258, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 0.8977029919624329, + "learning_rate": 9.44659600489197e-06, + "loss": 6.8197, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 1.2517333030700684, + "learning_rate": 9.446171354803643e-06, + "loss": 7.0233, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 0.825666606426239, + "learning_rate": 9.445746704715314e-06, + "loss": 6.7847, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 0.8056749105453491, + "learning_rate": 9.445322054626989e-06, + "loss": 6.8552, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 1.0186346769332886, + "learning_rate": 9.444897404538662e-06, + "loss": 6.8705, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 1.23142409324646, + "learning_rate": 9.444472754450333e-06, + "loss": 6.847, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 1.040743350982666, + "learning_rate": 9.444048104362007e-06, + "loss": 6.9793, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 0.8387247323989868, + "learning_rate": 9.44362345427368e-06, + "loss": 7.0479, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 1.1513484716415405, + "learning_rate": 9.443198804185351e-06, + "loss": 7.216, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 0.8814455270767212, + "learning_rate": 9.442774154097026e-06, + "loss": 6.7845, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 0.8264756798744202, + "learning_rate": 9.442349504008698e-06, + "loss": 6.891, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 1.0676146745681763, + "learning_rate": 9.44192485392037e-06, + "loss": 7.0682, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 0.842787504196167, + "learning_rate": 9.441500203832044e-06, + "loss": 6.8412, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 1.5337035655975342, + "learning_rate": 9.441075553743717e-06, + "loss": 7.0197, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 0.9397187829017639, + "learning_rate": 9.440650903655388e-06, + "loss": 7.0713, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 0.8309446573257446, + "learning_rate": 9.440226253567062e-06, + "loss": 6.8726, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.9798251986503601, + "learning_rate": 9.439801603478734e-06, + "loss": 7.0173, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 0.8350381851196289, + "learning_rate": 9.439376953390406e-06, + "loss": 6.6493, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 0.7107457518577576, + "learning_rate": 9.43895230330208e-06, + "loss": 6.73, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 0.7430808544158936, + "learning_rate": 9.438527653213752e-06, + "loss": 6.9567, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 0.7617524862289429, + "learning_rate": 9.438103003125425e-06, + "loss": 6.8281, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 0.9770264625549316, + "learning_rate": 9.4376783530371e-06, + "loss": 6.9653, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 1.3557504415512085, + "learning_rate": 9.43725370294877e-06, + "loss": 7.1616, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 1.0879008769989014, + "learning_rate": 9.436829052860443e-06, + "loss": 7.0628, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.9681192636489868, + "learning_rate": 9.436404402772118e-06, + "loss": 7.2294, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 1.0363694429397583, + "learning_rate": 9.435979752683789e-06, + "loss": 6.9512, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 1.5091588497161865, + "learning_rate": 9.435555102595462e-06, + "loss": 7.0188, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 0.8171900510787964, + "learning_rate": 9.435130452507136e-06, + "loss": 6.9948, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 1.503381371498108, + "learning_rate": 9.434705802418807e-06, + "loss": 6.6024, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 0.930162787437439, + "learning_rate": 9.43428115233048e-06, + "loss": 6.8657, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 0.9079920649528503, + "learning_rate": 9.433856502242153e-06, + "loss": 7.0096, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 0.8968695402145386, + "learning_rate": 9.433431852153826e-06, + "loss": 7.0619, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 1.1464383602142334, + "learning_rate": 9.433007202065498e-06, + "loss": 7.0686, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 1.3535710573196411, + "learning_rate": 9.432582551977171e-06, + "loss": 6.6683, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.8522243499755859, + "learning_rate": 9.432157901888844e-06, + "loss": 6.6654, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 1.0825642347335815, + "learning_rate": 9.431733251800517e-06, + "loss": 6.855, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 1.1377815008163452, + "learning_rate": 9.43130860171219e-06, + "loss": 6.7237, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 1.0637131929397583, + "learning_rate": 9.430883951623862e-06, + "loss": 6.9144, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 1.1076207160949707, + "learning_rate": 9.430459301535535e-06, + "loss": 7.1887, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 0.6923676133155823, + "learning_rate": 9.430034651447208e-06, + "loss": 6.908, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 1.203664779663086, + "learning_rate": 9.42961000135888e-06, + "loss": 6.7278, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 0.8809427618980408, + "learning_rate": 9.429185351270554e-06, + "loss": 6.9216, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 1.0786895751953125, + "learning_rate": 9.428760701182226e-06, + "loss": 6.7735, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 0.9941666722297668, + "learning_rate": 9.428336051093899e-06, + "loss": 6.9578, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 0.7463558912277222, + "learning_rate": 9.427911401005572e-06, + "loss": 6.8928, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 0.8819549679756165, + "learning_rate": 9.427486750917245e-06, + "loss": 6.9338, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 0.952846348285675, + "learning_rate": 9.427062100828918e-06, + "loss": 7.1481, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 0.9597423076629639, + "learning_rate": 9.42663745074059e-06, + "loss": 6.8751, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 0.8891428709030151, + "learning_rate": 9.426212800652263e-06, + "loss": 6.6798, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 1.0078986883163452, + "learning_rate": 9.425788150563936e-06, + "loss": 6.9614, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 0.9036884903907776, + "learning_rate": 9.425363500475609e-06, + "loss": 6.9678, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 1.0082577466964722, + "learning_rate": 9.424938850387282e-06, + "loss": 6.9821, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 0.718622624874115, + "learning_rate": 9.424514200298954e-06, + "loss": 7.074, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 0.7855224609375, + "learning_rate": 9.424089550210627e-06, + "loss": 6.8927, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 0.815398097038269, + "learning_rate": 9.4236649001223e-06, + "loss": 6.8328, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 1.2609484195709229, + "learning_rate": 9.423240250033973e-06, + "loss": 6.8777, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.9868876338005066, + "learning_rate": 9.422815599945646e-06, + "loss": 6.9799, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 0.8121034502983093, + "learning_rate": 9.422390949857318e-06, + "loss": 6.8893, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 0.9479442834854126, + "learning_rate": 9.421966299768991e-06, + "loss": 6.9332, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 0.8871968984603882, + "learning_rate": 9.421541649680664e-06, + "loss": 6.768, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 1.076857328414917, + "learning_rate": 9.421116999592337e-06, + "loss": 7.312, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 0.9043833017349243, + "learning_rate": 9.42069234950401e-06, + "loss": 6.951, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 1.0734810829162598, + "learning_rate": 9.420267699415682e-06, + "loss": 6.7803, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 0.7865541577339172, + "learning_rate": 9.419843049327355e-06, + "loss": 6.7287, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 0.7863699197769165, + "learning_rate": 9.419418399239028e-06, + "loss": 6.9765, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 0.6915971040725708, + "learning_rate": 9.4189937491507e-06, + "loss": 6.751, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 0.9015297293663025, + "learning_rate": 9.418569099062374e-06, + "loss": 6.8156, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 0.7886335253715515, + "learning_rate": 9.418144448974046e-06, + "loss": 6.6394, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 0.910670816898346, + "learning_rate": 9.417719798885719e-06, + "loss": 6.9042, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 1.0451463460922241, + "learning_rate": 9.417295148797392e-06, + "loss": 6.8337, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 0.8270052671432495, + "learning_rate": 9.416870498709065e-06, + "loss": 6.8706, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 1.3445216417312622, + "learning_rate": 9.416445848620738e-06, + "loss": 6.8457, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.74224853515625, + "learning_rate": 9.41602119853241e-06, + "loss": 6.8595, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 0.9257288575172424, + "learning_rate": 9.415596548444083e-06, + "loss": 6.7256, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 0.7825515270233154, + "learning_rate": 9.415171898355756e-06, + "loss": 6.9623, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 0.9336167573928833, + "learning_rate": 9.414747248267429e-06, + "loss": 6.9991, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 0.7275258898735046, + "learning_rate": 9.414322598179102e-06, + "loss": 7.1386, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 1.3609256744384766, + "learning_rate": 9.413897948090774e-06, + "loss": 6.946, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 1.0374950170516968, + "learning_rate": 9.413473298002447e-06, + "loss": 6.9218, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 0.9430462718009949, + "learning_rate": 9.41304864791412e-06, + "loss": 6.9338, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 1.0528303384780884, + "learning_rate": 9.412623997825793e-06, + "loss": 6.7473, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 1.0124766826629639, + "learning_rate": 9.412199347737466e-06, + "loss": 6.951, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 0.9779643416404724, + "learning_rate": 9.411774697649138e-06, + "loss": 7.0442, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 0.843136727809906, + "learning_rate": 9.411350047560811e-06, + "loss": 6.9393, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 0.9563644528388977, + "learning_rate": 9.410925397472484e-06, + "loss": 6.9167, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 1.1861497163772583, + "learning_rate": 9.410500747384155e-06, + "loss": 7.0731, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 1.0253018140792847, + "learning_rate": 9.41007609729583e-06, + "loss": 6.9051, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 0.916854739189148, + "learning_rate": 9.409651447207502e-06, + "loss": 6.9389, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 0.9326732754707336, + "learning_rate": 9.409226797119173e-06, + "loss": 6.8415, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 1.3391211032867432, + "learning_rate": 9.408802147030848e-06, + "loss": 6.9603, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 0.972969651222229, + "learning_rate": 9.40837749694252e-06, + "loss": 6.5895, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 0.6874942779541016, + "learning_rate": 9.407952846854192e-06, + "loss": 6.7545, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 0.8462442755699158, + "learning_rate": 9.407528196765866e-06, + "loss": 6.9816, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 0.7268022298812866, + "learning_rate": 9.40710354667754e-06, + "loss": 7.01, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 0.6837387084960938, + "learning_rate": 9.40667889658921e-06, + "loss": 6.9906, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 1.2806452512741089, + "learning_rate": 9.406254246500885e-06, + "loss": 6.583, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 1.744321584701538, + "learning_rate": 9.405829596412558e-06, + "loss": 6.5743, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 1.0051915645599365, + "learning_rate": 9.405404946324229e-06, + "loss": 7.1006, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 1.0023012161254883, + "learning_rate": 9.404980296235903e-06, + "loss": 6.8242, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 1.0751943588256836, + "learning_rate": 9.404555646147574e-06, + "loss": 6.8018, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 1.1198912858963013, + "learning_rate": 9.404130996059247e-06, + "loss": 6.4802, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 1.2644476890563965, + "learning_rate": 9.403706345970922e-06, + "loss": 7.0271, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 0.8951007723808289, + "learning_rate": 9.403281695882593e-06, + "loss": 6.5875, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 0.886833131313324, + "learning_rate": 9.402857045794265e-06, + "loss": 7.0457, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.8498584628105164, + "learning_rate": 9.40243239570594e-06, + "loss": 6.8877, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 0.9566428661346436, + "learning_rate": 9.402007745617611e-06, + "loss": 7.0016, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 0.774046778678894, + "learning_rate": 9.401583095529284e-06, + "loss": 6.8718, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 0.9663683772087097, + "learning_rate": 9.401158445440958e-06, + "loss": 6.7026, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 0.8908493518829346, + "learning_rate": 9.40073379535263e-06, + "loss": 6.8193, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 0.8302739262580872, + "learning_rate": 9.400309145264302e-06, + "loss": 6.6358, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 1.1321642398834229, + "learning_rate": 9.399884495175977e-06, + "loss": 6.8433, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 0.9136603474617004, + "learning_rate": 9.399459845087648e-06, + "loss": 6.8955, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 0.8867666721343994, + "learning_rate": 9.39903519499932e-06, + "loss": 6.7667, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 0.9154548048973083, + "learning_rate": 9.398610544910995e-06, + "loss": 6.6096, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 0.9172536134719849, + "learning_rate": 9.398185894822666e-06, + "loss": 6.7807, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.8270658850669861, + "learning_rate": 9.397761244734339e-06, + "loss": 6.9419, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 0.8099174499511719, + "learning_rate": 9.397336594646012e-06, + "loss": 6.6207, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 1.0215178728103638, + "learning_rate": 9.396911944557685e-06, + "loss": 6.8072, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 0.8136088848114014, + "learning_rate": 9.396487294469357e-06, + "loss": 6.9356, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 1.2214863300323486, + "learning_rate": 9.39606264438103e-06, + "loss": 6.9137, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.7846865057945251, + "learning_rate": 9.395637994292703e-06, + "loss": 6.7195, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 0.877834677696228, + "learning_rate": 9.395213344204378e-06, + "loss": 7.0301, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 0.9723012447357178, + "learning_rate": 9.394788694116049e-06, + "loss": 6.8832, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 0.6475871801376343, + "learning_rate": 9.394364044027721e-06, + "loss": 6.7021, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 1.0516350269317627, + "learning_rate": 9.393939393939396e-06, + "loss": 6.8834, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 1.015688419342041, + "learning_rate": 9.393514743851067e-06, + "loss": 6.955, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 0.8305737376213074, + "learning_rate": 9.39309009376274e-06, + "loss": 6.7688, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 0.6688657999038696, + "learning_rate": 9.392665443674414e-06, + "loss": 6.9296, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 0.9116338491439819, + "learning_rate": 9.392240793586086e-06, + "loss": 6.9873, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 1.3022520542144775, + "learning_rate": 9.391816143497758e-06, + "loss": 6.9463, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 0.781035304069519, + "learning_rate": 9.391391493409431e-06, + "loss": 6.7388, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 0.7641096711158752, + "learning_rate": 9.390966843321104e-06, + "loss": 6.7637, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 0.842128336429596, + "learning_rate": 9.390542193232777e-06, + "loss": 6.8941, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 0.8434991240501404, + "learning_rate": 9.39011754314445e-06, + "loss": 6.4745, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 0.8535445928573608, + "learning_rate": 9.389692893056122e-06, + "loss": 6.8195, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 0.9039226174354553, + "learning_rate": 9.389268242967795e-06, + "loss": 6.9274, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.8680581450462341, + "learning_rate": 9.388843592879468e-06, + "loss": 6.8664, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 0.9782411456108093, + "learning_rate": 9.38841894279114e-06, + "loss": 6.807, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 0.9042291641235352, + "learning_rate": 9.387994292702814e-06, + "loss": 7.0013, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 0.9558718204498291, + "learning_rate": 9.387569642614486e-06, + "loss": 6.8023, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 0.7176926136016846, + "learning_rate": 9.387144992526159e-06, + "loss": 6.9471, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 0.8245338201522827, + "learning_rate": 9.386720342437832e-06, + "loss": 6.807, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 0.9849390983581543, + "learning_rate": 9.386295692349505e-06, + "loss": 6.6734, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 0.6917431354522705, + "learning_rate": 9.385871042261178e-06, + "loss": 6.8584, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 0.8589382767677307, + "learning_rate": 9.38544639217285e-06, + "loss": 6.847, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 0.7180635333061218, + "learning_rate": 9.385021742084523e-06, + "loss": 6.6895, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 0.8543854355812073, + "learning_rate": 9.384597091996196e-06, + "loss": 6.8849, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 0.8247594237327576, + "learning_rate": 9.384172441907869e-06, + "loss": 6.5902, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.871656596660614, + "learning_rate": 9.383747791819542e-06, + "loss": 6.9284, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 0.8345972299575806, + "learning_rate": 9.383323141731214e-06, + "loss": 6.8069, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 0.8284621238708496, + "learning_rate": 9.382898491642887e-06, + "loss": 6.9111, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 0.8150320053100586, + "learning_rate": 9.38247384155456e-06, + "loss": 6.6145, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.9100799560546875, + "learning_rate": 9.382049191466233e-06, + "loss": 6.5831, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 0.9185923933982849, + "learning_rate": 9.381624541377906e-06, + "loss": 7.0425, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 0.7632870078086853, + "learning_rate": 9.381199891289578e-06, + "loss": 6.4838, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 0.8461910486221313, + "learning_rate": 9.380775241201251e-06, + "loss": 6.8559, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 0.8245363831520081, + "learning_rate": 9.380350591112924e-06, + "loss": 6.7988, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 0.9423805475234985, + "learning_rate": 9.379925941024597e-06, + "loss": 6.8098, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 0.8868122696876526, + "learning_rate": 9.37950129093627e-06, + "loss": 6.7524, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 0.7413000464439392, + "learning_rate": 9.379076640847942e-06, + "loss": 6.8359, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 0.7795374393463135, + "learning_rate": 9.378651990759615e-06, + "loss": 6.8443, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 1.039803147315979, + "learning_rate": 9.378227340671288e-06, + "loss": 6.7329, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 0.904178261756897, + "learning_rate": 9.37780269058296e-06, + "loss": 6.8616, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 0.8657399415969849, + "learning_rate": 9.377378040494634e-06, + "loss": 6.9122, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 0.9828017950057983, + "learning_rate": 9.376953390406306e-06, + "loss": 6.8706, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 0.8157175183296204, + "learning_rate": 9.376528740317979e-06, + "loss": 6.7545, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 0.755646824836731, + "learning_rate": 9.376104090229652e-06, + "loss": 6.5869, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 0.9050033688545227, + "learning_rate": 9.375679440141325e-06, + "loss": 6.7917, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.7949090003967285, + "learning_rate": 9.375254790052996e-06, + "loss": 6.7394, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 0.8849690556526184, + "learning_rate": 9.37483013996467e-06, + "loss": 6.954, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 0.8551892638206482, + "learning_rate": 9.374405489876343e-06, + "loss": 6.968, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 0.9085467457771301, + "learning_rate": 9.373980839788014e-06, + "loss": 6.6842, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 0.8068432211875916, + "learning_rate": 9.373556189699689e-06, + "loss": 6.8715, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 0.6600916981697083, + "learning_rate": 9.373131539611362e-06, + "loss": 6.8275, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 0.8323143124580383, + "learning_rate": 9.372706889523033e-06, + "loss": 6.5033, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 1.0018125772476196, + "learning_rate": 9.372282239434707e-06, + "loss": 6.8256, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 0.8109241127967834, + "learning_rate": 9.37185758934638e-06, + "loss": 6.757, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 0.9458985924720764, + "learning_rate": 9.371432939258051e-06, + "loss": 6.674, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 0.6943719387054443, + "learning_rate": 9.371008289169726e-06, + "loss": 6.5685, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 1.130963683128357, + "learning_rate": 9.370583639081398e-06, + "loss": 6.6857, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 1.2186988592147827, + "learning_rate": 9.37015898899307e-06, + "loss": 6.7435, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.9146189093589783, + "learning_rate": 9.369734338904744e-06, + "loss": 6.8177, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 0.9108222723007202, + "learning_rate": 9.369309688816415e-06, + "loss": 6.7436, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 0.7408660650253296, + "learning_rate": 9.368885038728088e-06, + "loss": 6.7105, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.7732295393943787, + "learning_rate": 9.368460388639762e-06, + "loss": 6.6838, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 0.7094430327415466, + "learning_rate": 9.368035738551433e-06, + "loss": 6.6703, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 0.7481225728988647, + "learning_rate": 9.367611088463106e-06, + "loss": 6.5721, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 0.8894798755645752, + "learning_rate": 9.36718643837478e-06, + "loss": 6.7059, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 0.6893903017044067, + "learning_rate": 9.366761788286452e-06, + "loss": 6.6198, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 0.8489027619361877, + "learning_rate": 9.366337138198125e-06, + "loss": 6.846, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 0.9765400290489197, + "learning_rate": 9.365912488109799e-06, + "loss": 6.5228, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 0.5596321821212769, + "learning_rate": 9.36548783802147e-06, + "loss": 6.7912, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 0.9029366374015808, + "learning_rate": 9.365063187933145e-06, + "loss": 6.8128, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 0.7897516489028931, + "learning_rate": 9.364638537844818e-06, + "loss": 7.0108, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 0.8748897910118103, + "learning_rate": 9.364213887756489e-06, + "loss": 6.8066, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 0.7224349975585938, + "learning_rate": 9.363789237668163e-06, + "loss": 6.7637, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.7380138635635376, + "learning_rate": 9.363364587579836e-06, + "loss": 6.7272, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 1.0328501462936401, + "learning_rate": 9.362939937491507e-06, + "loss": 6.6898, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 0.8380527496337891, + "learning_rate": 9.362515287403182e-06, + "loss": 6.716, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 0.9495871067047119, + "learning_rate": 9.362090637314853e-06, + "loss": 6.7882, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.7937866449356079, + "learning_rate": 9.361665987226525e-06, + "loss": 6.9149, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 0.8012692332267761, + "learning_rate": 9.3612413371382e-06, + "loss": 6.7738, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 1.0279793739318848, + "learning_rate": 9.360816687049871e-06, + "loss": 6.9483, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 1.1140809059143066, + "learning_rate": 9.360392036961544e-06, + "loss": 6.7586, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 0.747970461845398, + "learning_rate": 9.359967386873218e-06, + "loss": 6.7373, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 0.7524696588516235, + "learning_rate": 9.35954273678489e-06, + "loss": 6.4057, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 0.9555236101150513, + "learning_rate": 9.359118086696562e-06, + "loss": 6.7602, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 1.0072259902954102, + "learning_rate": 9.358693436608237e-06, + "loss": 6.685, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 0.8759356141090393, + "learning_rate": 9.358268786519908e-06, + "loss": 6.7967, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 0.7562576532363892, + "learning_rate": 9.35784413643158e-06, + "loss": 6.8643, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 0.8411049246788025, + "learning_rate": 9.357419486343255e-06, + "loss": 6.7831, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 0.8356865048408508, + "learning_rate": 9.356994836254926e-06, + "loss": 6.8607, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 0.8415096402168274, + "learning_rate": 9.356570186166599e-06, + "loss": 6.909, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 0.8361212611198425, + "learning_rate": 9.356145536078272e-06, + "loss": 6.8117, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 1.1105051040649414, + "learning_rate": 9.355720885989945e-06, + "loss": 6.9576, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 0.8192757964134216, + "learning_rate": 9.355296235901617e-06, + "loss": 6.813, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 0.7561287879943848, + "learning_rate": 9.35487158581329e-06, + "loss": 6.6865, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 0.700171172618866, + "learning_rate": 9.354446935724963e-06, + "loss": 6.5854, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 0.9322381019592285, + "learning_rate": 9.354022285636636e-06, + "loss": 6.6474, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 0.9758726358413696, + "learning_rate": 9.353597635548309e-06, + "loss": 6.6642, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 0.7774538993835449, + "learning_rate": 9.353172985459981e-06, + "loss": 6.7867, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 0.9328933358192444, + "learning_rate": 9.352748335371654e-06, + "loss": 7.0528, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 0.757388174533844, + "learning_rate": 9.352323685283327e-06, + "loss": 6.5334, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 0.8998204469680786, + "learning_rate": 9.351899035195e-06, + "loss": 6.7773, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 0.9459860324859619, + "learning_rate": 9.351474385106673e-06, + "loss": 6.6455, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 0.789713442325592, + "learning_rate": 9.351049735018345e-06, + "loss": 6.8172, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 0.8400719165802002, + "learning_rate": 9.350625084930018e-06, + "loss": 6.7107, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 0.9902055263519287, + "learning_rate": 9.350200434841691e-06, + "loss": 6.831, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 0.6937239170074463, + "learning_rate": 9.349775784753364e-06, + "loss": 6.6745, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 0.6814174652099609, + "learning_rate": 9.349351134665037e-06, + "loss": 6.6488, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 0.7690621018409729, + "learning_rate": 9.34892648457671e-06, + "loss": 6.6519, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 0.8035858273506165, + "learning_rate": 9.348501834488382e-06, + "loss": 6.8382, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.9311736822128296, + "learning_rate": 9.348077184400055e-06, + "loss": 6.7017, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 1.0976568460464478, + "learning_rate": 9.347652534311728e-06, + "loss": 6.8024, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 1.0256719589233398, + "learning_rate": 9.3472278842234e-06, + "loss": 6.9195, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 0.86808842420578, + "learning_rate": 9.346803234135073e-06, + "loss": 6.5854, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 0.7043029069900513, + "learning_rate": 9.346378584046746e-06, + "loss": 6.7928, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 0.8211622834205627, + "learning_rate": 9.345953933958419e-06, + "loss": 6.9006, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 0.885947048664093, + "learning_rate": 9.345529283870092e-06, + "loss": 6.6042, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 0.8073636889457703, + "learning_rate": 9.345104633781765e-06, + "loss": 6.7974, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 0.800381064414978, + "learning_rate": 9.344679983693437e-06, + "loss": 6.7936, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 1.1157984733581543, + "learning_rate": 9.34425533360511e-06, + "loss": 6.8989, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 0.777052104473114, + "learning_rate": 9.343830683516783e-06, + "loss": 6.8148, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 2.1842968463897705, + "learning_rate": 9.343406033428456e-06, + "loss": 6.5419, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 0.8441357612609863, + "learning_rate": 9.342981383340129e-06, + "loss": 6.7172, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 0.6989182233810425, + "learning_rate": 9.342556733251801e-06, + "loss": 6.5897, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 0.7307862043380737, + "learning_rate": 9.342132083163474e-06, + "loss": 6.8815, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.8035904169082642, + "learning_rate": 9.341707433075147e-06, + "loss": 6.7373, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 0.8951308727264404, + "learning_rate": 9.34128278298682e-06, + "loss": 6.3133, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 0.7997956871986389, + "learning_rate": 9.340858132898493e-06, + "loss": 6.8439, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 1.1819437742233276, + "learning_rate": 9.340433482810166e-06, + "loss": 6.7733, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 0.8578700423240662, + "learning_rate": 9.340008832721837e-06, + "loss": 6.7092, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 1.0323578119277954, + "learning_rate": 9.339584182633511e-06, + "loss": 6.6587, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.339159532545184e-06, + "loss": 6.8149, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 0.8146135807037354, + "learning_rate": 9.338734882456855e-06, + "loss": 6.8604, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 0.9225687980651855, + "learning_rate": 9.33831023236853e-06, + "loss": 6.7139, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 0.9604765772819519, + "learning_rate": 9.337885582280202e-06, + "loss": 6.8019, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.877306342124939, + "learning_rate": 9.337460932191873e-06, + "loss": 6.6018, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 1.4056878089904785, + "learning_rate": 9.337036282103548e-06, + "loss": 6.571, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 0.8973736763000488, + "learning_rate": 9.33661163201522e-06, + "loss": 6.607, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 0.8538034558296204, + "learning_rate": 9.336186981926894e-06, + "loss": 6.7733, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 0.6233241558074951, + "learning_rate": 9.335762331838566e-06, + "loss": 6.5728, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 0.7097254991531372, + "learning_rate": 9.335337681750239e-06, + "loss": 6.7236, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 0.8475677371025085, + "learning_rate": 9.334913031661912e-06, + "loss": 6.8206, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 0.6882188320159912, + "learning_rate": 9.334488381573585e-06, + "loss": 6.8282, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 1.0270963907241821, + "learning_rate": 9.334063731485258e-06, + "loss": 6.7588, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 0.7753660082817078, + "learning_rate": 9.33363908139693e-06, + "loss": 6.7048, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 0.620800256729126, + "learning_rate": 9.333214431308603e-06, + "loss": 6.7856, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 0.8721664547920227, + "learning_rate": 9.332789781220274e-06, + "loss": 6.6671, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 0.7708675861358643, + "learning_rate": 9.332365131131949e-06, + "loss": 6.5537, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 0.6123479008674622, + "learning_rate": 9.331940481043622e-06, + "loss": 6.8584, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 0.7674098610877991, + "learning_rate": 9.331515830955293e-06, + "loss": 6.7583, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 1.1448827981948853, + "learning_rate": 9.331091180866967e-06, + "loss": 6.8414, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 0.7812538146972656, + "learning_rate": 9.33066653077864e-06, + "loss": 6.561, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 0.7949689626693726, + "learning_rate": 9.330241880690311e-06, + "loss": 6.6707, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 0.6983610391616821, + "learning_rate": 9.329817230601986e-06, + "loss": 6.4738, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 0.7443625330924988, + "learning_rate": 9.329392580513658e-06, + "loss": 6.7027, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 0.8233029246330261, + "learning_rate": 9.32896793042533e-06, + "loss": 6.9266, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 0.8143211007118225, + "learning_rate": 9.328543280337004e-06, + "loss": 6.7439, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 0.6522383093833923, + "learning_rate": 9.328118630248677e-06, + "loss": 6.6366, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.7335131764411926, + "learning_rate": 9.327693980160348e-06, + "loss": 6.5607, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 1.124111294746399, + "learning_rate": 9.327269330072022e-06, + "loss": 6.5227, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 0.7792592644691467, + "learning_rate": 9.326844679983693e-06, + "loss": 6.8489, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 0.6851006150245667, + "learning_rate": 9.326420029895366e-06, + "loss": 6.6393, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 0.8376139402389526, + "learning_rate": 9.32599537980704e-06, + "loss": 6.7784, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 0.7781663537025452, + "learning_rate": 9.325570729718712e-06, + "loss": 6.7414, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 0.8254972696304321, + "learning_rate": 9.325146079630385e-06, + "loss": 6.6755, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 0.7544137239456177, + "learning_rate": 9.324721429542059e-06, + "loss": 6.448, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 0.8963250517845154, + "learning_rate": 9.32429677945373e-06, + "loss": 6.7271, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 0.7679321765899658, + "learning_rate": 9.323872129365403e-06, + "loss": 6.5522, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 1.0811657905578613, + "learning_rate": 9.323447479277078e-06, + "loss": 6.5479, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 0.911293089389801, + "learning_rate": 9.323022829188749e-06, + "loss": 6.7877, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 0.8353615403175354, + "learning_rate": 9.322598179100421e-06, + "loss": 6.7963, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 0.6910008788108826, + "learning_rate": 9.322173529012096e-06, + "loss": 6.6543, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 0.6317711472511292, + "learning_rate": 9.321748878923767e-06, + "loss": 6.7535, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 0.7702703475952148, + "learning_rate": 9.32132422883544e-06, + "loss": 6.6983, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.6755439043045044, + "learning_rate": 9.320899578747113e-06, + "loss": 6.7336, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 0.8753911256790161, + "learning_rate": 9.320474928658785e-06, + "loss": 6.5655, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 0.7859448790550232, + "learning_rate": 9.320050278570458e-06, + "loss": 6.543, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 0.7153217196464539, + "learning_rate": 9.319625628482131e-06, + "loss": 6.5978, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 0.9683584570884705, + "learning_rate": 9.319200978393804e-06, + "loss": 6.8772, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 0.9836431741714478, + "learning_rate": 9.318776328305477e-06, + "loss": 6.883, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 0.9295907020568848, + "learning_rate": 9.31835167821715e-06, + "loss": 6.7793, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 0.6646861433982849, + "learning_rate": 9.317927028128822e-06, + "loss": 6.5728, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 0.6967893838882446, + "learning_rate": 9.317502378040495e-06, + "loss": 6.5335, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 0.7900519371032715, + "learning_rate": 9.317077727952168e-06, + "loss": 6.5057, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 0.6967466473579407, + "learning_rate": 9.31665307786384e-06, + "loss": 6.6202, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 0.7448360919952393, + "learning_rate": 9.316228427775513e-06, + "loss": 6.6054, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 0.7098957300186157, + "learning_rate": 9.315803777687186e-06, + "loss": 6.5404, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 0.750868558883667, + "learning_rate": 9.315379127598859e-06, + "loss": 6.65, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 0.8391157388687134, + "learning_rate": 9.314954477510532e-06, + "loss": 6.8554, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 0.7765434384346008, + "learning_rate": 9.314529827422205e-06, + "loss": 6.5444, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.7651300430297852, + "learning_rate": 9.314105177333877e-06, + "loss": 6.659, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.8561496138572693, + "learning_rate": 9.31368052724555e-06, + "loss": 6.8051, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 0.8215615153312683, + "learning_rate": 9.313255877157223e-06, + "loss": 6.7256, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 0.8574585914611816, + "learning_rate": 9.312831227068896e-06, + "loss": 6.7048, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 0.7290385365486145, + "learning_rate": 9.312406576980569e-06, + "loss": 6.6195, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": 0.8218104243278503, + "learning_rate": 9.311981926892241e-06, + "loss": 6.6113, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 0.8941261768341064, + "learning_rate": 9.311557276803914e-06, + "loss": 6.674, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 0.8281568884849548, + "learning_rate": 9.311132626715587e-06, + "loss": 6.6228, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 0.6900314092636108, + "learning_rate": 9.31070797662726e-06, + "loss": 6.4537, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 0.7071418762207031, + "learning_rate": 9.310283326538933e-06, + "loss": 6.534, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 0.7229854464530945, + "learning_rate": 9.309858676450605e-06, + "loss": 6.6618, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 0.6819376945495605, + "learning_rate": 9.309434026362278e-06, + "loss": 6.4157, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 0.754050612449646, + "learning_rate": 9.309009376273951e-06, + "loss": 6.5716, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 0.9548196792602539, + "learning_rate": 9.308584726185624e-06, + "loss": 6.6806, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 0.8929256200790405, + "learning_rate": 9.308160076097297e-06, + "loss": 6.5753, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 1.0439097881317139, + "learning_rate": 9.30773542600897e-06, + "loss": 6.5061, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.8385816216468811, + "learning_rate": 9.307310775920642e-06, + "loss": 6.5717, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 0.8122405409812927, + "learning_rate": 9.306886125832315e-06, + "loss": 6.8675, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 1.1205629110336304, + "learning_rate": 9.306461475743988e-06, + "loss": 6.8755, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 0.690618097782135, + "learning_rate": 9.30603682565566e-06, + "loss": 6.6771, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 0.894432008266449, + "learning_rate": 9.305612175567333e-06, + "loss": 6.5914, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 0.9849356412887573, + "learning_rate": 9.305187525479006e-06, + "loss": 6.5769, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 1.0562227964401245, + "learning_rate": 9.304762875390679e-06, + "loss": 6.7033, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 0.7616122961044312, + "learning_rate": 9.304338225302352e-06, + "loss": 6.7203, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 0.5788894295692444, + "learning_rate": 9.303913575214025e-06, + "loss": 6.3182, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 0.847647488117218, + "learning_rate": 9.303488925125697e-06, + "loss": 6.4941, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 1.597216010093689, + "learning_rate": 9.30306427503737e-06, + "loss": 6.671, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 0.718958854675293, + "learning_rate": 9.302639624949043e-06, + "loss": 6.6865, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 0.8348727226257324, + "learning_rate": 9.302214974860716e-06, + "loss": 6.6536, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 0.7317186594009399, + "learning_rate": 9.301790324772389e-06, + "loss": 6.3706, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 0.5876362919807434, + "learning_rate": 9.301365674684061e-06, + "loss": 6.6441, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 0.8632931113243103, + "learning_rate": 9.300941024595734e-06, + "loss": 6.6493, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.8125372529029846, + "learning_rate": 9.300516374507407e-06, + "loss": 6.6939, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 0.6943705677986145, + "learning_rate": 9.30009172441908e-06, + "loss": 6.4974, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.8909331560134888, + "learning_rate": 9.299667074330753e-06, + "loss": 6.7826, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 0.780013382434845, + "learning_rate": 9.299242424242425e-06, + "loss": 6.6537, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 0.7969016432762146, + "learning_rate": 9.298817774154098e-06, + "loss": 6.6412, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 0.6534283757209778, + "learning_rate": 9.298393124065771e-06, + "loss": 6.6566, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 0.6588190793991089, + "learning_rate": 9.297968473977444e-06, + "loss": 6.5685, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 0.8077401518821716, + "learning_rate": 9.297543823889115e-06, + "loss": 6.4499, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 0.7709189057350159, + "learning_rate": 9.29711917380079e-06, + "loss": 6.6714, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 1.2067773342132568, + "learning_rate": 9.296694523712462e-06, + "loss": 6.6169, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 0.7492165565490723, + "learning_rate": 9.296269873624133e-06, + "loss": 6.4964, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 1.1534981727600098, + "learning_rate": 9.295845223535808e-06, + "loss": 6.6235, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 0.7270280718803406, + "learning_rate": 9.29542057344748e-06, + "loss": 6.6135, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 0.6144871711730957, + "learning_rate": 9.294995923359152e-06, + "loss": 6.671, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 0.737166702747345, + "learning_rate": 9.294571273270826e-06, + "loss": 6.5959, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 0.9313573837280273, + "learning_rate": 9.294146623182499e-06, + "loss": 6.3803, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.8746311068534851, + "learning_rate": 9.29372197309417e-06, + "loss": 6.3616, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 0.838326096534729, + "learning_rate": 9.293297323005845e-06, + "loss": 6.8448, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 0.6852318048477173, + "learning_rate": 9.292872672917517e-06, + "loss": 6.6541, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 1.1294573545455933, + "learning_rate": 9.292448022829189e-06, + "loss": 6.6008, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 0.8486148118972778, + "learning_rate": 9.292023372740863e-06, + "loss": 6.5574, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 0.7166041135787964, + "learning_rate": 9.291598722652534e-06, + "loss": 6.5069, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 0.8937546610832214, + "learning_rate": 9.291174072564207e-06, + "loss": 6.8551, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 0.7645808458328247, + "learning_rate": 9.290749422475881e-06, + "loss": 6.7053, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 0.8209601640701294, + "learning_rate": 9.290324772387553e-06, + "loss": 6.392, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 0.8201396465301514, + "learning_rate": 9.289900122299225e-06, + "loss": 6.6133, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 0.6825620532035828, + "learning_rate": 9.2894754722109e-06, + "loss": 6.5057, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 0.9247139096260071, + "learning_rate": 9.289050822122571e-06, + "loss": 6.6186, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 0.6868758797645569, + "learning_rate": 9.288626172034244e-06, + "loss": 6.6026, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 0.6930785179138184, + "learning_rate": 9.288201521945918e-06, + "loss": 6.5528, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 0.70697021484375, + "learning_rate": 9.28777687185759e-06, + "loss": 6.5654, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 0.698073148727417, + "learning_rate": 9.287352221769262e-06, + "loss": 6.8042, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.9896298050880432, + "learning_rate": 9.286927571680937e-06, + "loss": 6.3888, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 0.7518388032913208, + "learning_rate": 9.286502921592608e-06, + "loss": 6.587, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 0.7909395694732666, + "learning_rate": 9.28607827150428e-06, + "loss": 6.659, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.7908509969711304, + "learning_rate": 9.285653621415955e-06, + "loss": 6.5109, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 0.7797964215278625, + "learning_rate": 9.285228971327626e-06, + "loss": 6.6798, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 0.7347980737686157, + "learning_rate": 9.284804321239299e-06, + "loss": 6.6403, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 0.7683941721916199, + "learning_rate": 9.284379671150972e-06, + "loss": 6.5199, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 0.8057160973548889, + "learning_rate": 9.283955021062645e-06, + "loss": 6.353, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 0.6899069547653198, + "learning_rate": 9.283530370974317e-06, + "loss": 6.4445, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 0.8066838979721069, + "learning_rate": 9.28310572088599e-06, + "loss": 6.4173, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 0.8197283148765564, + "learning_rate": 9.282681070797663e-06, + "loss": 6.5567, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 0.6761922240257263, + "learning_rate": 9.282256420709336e-06, + "loss": 6.3725, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 0.6397257447242737, + "learning_rate": 9.281831770621009e-06, + "loss": 6.6236, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 0.7320919632911682, + "learning_rate": 9.281407120532681e-06, + "loss": 6.6328, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 0.7651802897453308, + "learning_rate": 9.280982470444354e-06, + "loss": 6.5005, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 0.7108744382858276, + "learning_rate": 9.280557820356027e-06, + "loss": 6.3887, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.678952157497406, + "learning_rate": 9.2801331702677e-06, + "loss": 6.1546, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 0.8136634826660156, + "learning_rate": 9.279708520179373e-06, + "loss": 6.7596, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 0.7717469930648804, + "learning_rate": 9.279283870091045e-06, + "loss": 6.4063, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 0.8410373330116272, + "learning_rate": 9.278859220002718e-06, + "loss": 6.7551, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 0.7402711510658264, + "learning_rate": 9.278434569914391e-06, + "loss": 6.5383, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 0.7750253677368164, + "learning_rate": 9.278009919826064e-06, + "loss": 6.349, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 0.6997986435890198, + "learning_rate": 9.277585269737737e-06, + "loss": 6.4701, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 0.7187374234199524, + "learning_rate": 9.27716061964941e-06, + "loss": 6.4555, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 0.6788787245750427, + "learning_rate": 9.276735969561082e-06, + "loss": 6.4524, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 0.7104960083961487, + "learning_rate": 9.276311319472755e-06, + "loss": 6.3926, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 0.6821863651275635, + "learning_rate": 9.275886669384428e-06, + "loss": 6.5334, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 0.7344508171081543, + "learning_rate": 9.2754620192961e-06, + "loss": 6.757, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 0.9812008142471313, + "learning_rate": 9.275037369207773e-06, + "loss": 6.6334, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 0.6224141716957092, + "learning_rate": 9.274612719119446e-06, + "loss": 6.6463, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 0.653782308101654, + "learning_rate": 9.274188069031119e-06, + "loss": 6.4401, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 0.7509824633598328, + "learning_rate": 9.273763418942792e-06, + "loss": 6.5079, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 1.1555777788162231, + "learning_rate": 9.273338768854465e-06, + "loss": 6.6764, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 0.6799468398094177, + "learning_rate": 9.272914118766137e-06, + "loss": 6.3581, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 0.9060140252113342, + "learning_rate": 9.27248946867781e-06, + "loss": 6.6672, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 0.6833000779151917, + "learning_rate": 9.272064818589483e-06, + "loss": 6.3612, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 1.3293389081954956, + "learning_rate": 9.271640168501156e-06, + "loss": 6.4149, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 1.0880935192108154, + "learning_rate": 9.271215518412829e-06, + "loss": 6.6469, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 0.7810761332511902, + "learning_rate": 9.270790868324501e-06, + "loss": 6.601, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 1.0172480344772339, + "learning_rate": 9.270366218236174e-06, + "loss": 6.8761, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 0.8078831434249878, + "learning_rate": 9.269941568147847e-06, + "loss": 6.7093, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 0.6386788487434387, + "learning_rate": 9.26951691805952e-06, + "loss": 6.5661, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 0.7225192189216614, + "learning_rate": 9.269092267971193e-06, + "loss": 6.4379, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 0.6530749797821045, + "learning_rate": 9.268667617882865e-06, + "loss": 6.6059, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 0.8841409683227539, + "learning_rate": 9.268242967794538e-06, + "loss": 6.6212, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 0.892754852771759, + "learning_rate": 9.267818317706211e-06, + "loss": 6.4627, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 0.6756227016448975, + "learning_rate": 9.267393667617884e-06, + "loss": 6.5443, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 0.686662495136261, + "learning_rate": 9.266969017529557e-06, + "loss": 6.4659, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.951156735420227, + "learning_rate": 9.26654436744123e-06, + "loss": 6.3442, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 0.6152322292327881, + "learning_rate": 9.266119717352902e-06, + "loss": 6.7152, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 0.7493312954902649, + "learning_rate": 9.265695067264575e-06, + "loss": 6.6932, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 0.8165887594223022, + "learning_rate": 9.265270417176248e-06, + "loss": 6.3621, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 0.7460973262786865, + "learning_rate": 9.26484576708792e-06, + "loss": 6.3481, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 0.674244225025177, + "learning_rate": 9.264421116999593e-06, + "loss": 6.5854, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 0.5280740261077881, + "learning_rate": 9.263996466911266e-06, + "loss": 6.5506, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 0.6519100666046143, + "learning_rate": 9.263571816822939e-06, + "loss": 6.4693, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 0.681494414806366, + "learning_rate": 9.263147166734612e-06, + "loss": 6.536, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 0.7753263711929321, + "learning_rate": 9.262722516646285e-06, + "loss": 6.3434, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 0.6553172469139099, + "learning_rate": 9.262297866557956e-06, + "loss": 6.4297, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 0.8711057901382446, + "learning_rate": 9.26187321646963e-06, + "loss": 6.5686, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 0.7641153931617737, + "learning_rate": 9.261448566381303e-06, + "loss": 6.6286, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 0.8650850057601929, + "learning_rate": 9.261023916292974e-06, + "loss": 6.4967, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 0.8001235127449036, + "learning_rate": 9.260599266204649e-06, + "loss": 6.4815, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 0.7393561005592346, + "learning_rate": 9.260174616116321e-06, + "loss": 6.3963, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.9117370843887329, + "learning_rate": 9.259749966027993e-06, + "loss": 6.5146, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 0.6751478314399719, + "learning_rate": 9.259325315939667e-06, + "loss": 6.5468, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 0.753832221031189, + "learning_rate": 9.25890066585134e-06, + "loss": 6.4742, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 1.068841576576233, + "learning_rate": 9.258476015763011e-06, + "loss": 6.5448, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 0.7415449023246765, + "learning_rate": 9.258051365674685e-06, + "loss": 6.5942, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.8536980152130127, + "learning_rate": 9.257626715586358e-06, + "loss": 6.4709, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 0.6774705052375793, + "learning_rate": 9.25720206549803e-06, + "loss": 6.4204, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 0.7498024702072144, + "learning_rate": 9.256777415409704e-06, + "loss": 6.4337, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 0.8637076020240784, + "learning_rate": 9.256352765321377e-06, + "loss": 6.7512, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 0.8009146451950073, + "learning_rate": 9.255928115233048e-06, + "loss": 6.1802, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 0.7095454335212708, + "learning_rate": 9.255503465144722e-06, + "loss": 6.6402, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 0.8460397720336914, + "learning_rate": 9.255078815056393e-06, + "loss": 6.5712, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 0.7000206112861633, + "learning_rate": 9.254654164968066e-06, + "loss": 6.4798, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 0.9944565296173096, + "learning_rate": 9.25422951487974e-06, + "loss": 6.6454, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 0.6444679498672485, + "learning_rate": 9.253804864791412e-06, + "loss": 6.4854, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 0.638753354549408, + "learning_rate": 9.253380214703085e-06, + "loss": 6.4944, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.6244166493415833, + "learning_rate": 9.252955564614759e-06, + "loss": 6.6064, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 0.7655489444732666, + "learning_rate": 9.25253091452643e-06, + "loss": 6.7089, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 0.8627684116363525, + "learning_rate": 9.252106264438103e-06, + "loss": 6.7218, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 0.7675917148590088, + "learning_rate": 9.251681614349777e-06, + "loss": 6.5126, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 0.85997474193573, + "learning_rate": 9.251256964261449e-06, + "loss": 6.4054, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 0.8291817903518677, + "learning_rate": 9.250832314173121e-06, + "loss": 6.4119, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 0.6759452819824219, + "learning_rate": 9.250407664084796e-06, + "loss": 6.5215, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 0.7374590039253235, + "learning_rate": 9.249983013996467e-06, + "loss": 6.4045, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 0.7410818338394165, + "learning_rate": 9.249558363908141e-06, + "loss": 6.3093, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 0.7854533791542053, + "learning_rate": 9.249133713819813e-06, + "loss": 6.5142, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 0.6694662570953369, + "learning_rate": 9.248709063731485e-06, + "loss": 6.5386, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 0.5643718838691711, + "learning_rate": 9.24828441364316e-06, + "loss": 6.5438, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 0.8682068586349487, + "learning_rate": 9.247859763554831e-06, + "loss": 6.491, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 0.6820575594902039, + "learning_rate": 9.247435113466504e-06, + "loss": 6.5988, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 0.7561167478561401, + "learning_rate": 9.247010463378178e-06, + "loss": 6.4314, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 0.8363939523696899, + "learning_rate": 9.24658581328985e-06, + "loss": 6.3269, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.7338782548904419, + "learning_rate": 9.246161163201522e-06, + "loss": 6.337, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 0.7108462452888489, + "learning_rate": 9.245736513113197e-06, + "loss": 6.4406, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 0.5754204988479614, + "learning_rate": 9.245311863024868e-06, + "loss": 6.3494, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 0.7149022817611694, + "learning_rate": 9.24488721293654e-06, + "loss": 6.4037, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 0.772790789604187, + "learning_rate": 9.244462562848215e-06, + "loss": 6.4637, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 0.6773320436477661, + "learning_rate": 9.244037912759886e-06, + "loss": 6.3804, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.6212844848632812, + "learning_rate": 9.243613262671559e-06, + "loss": 6.419, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 0.7716318368911743, + "learning_rate": 9.243188612583232e-06, + "loss": 6.699, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.7483435869216919, + "learning_rate": 9.242763962494905e-06, + "loss": 6.4887, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 1.0547586679458618, + "learning_rate": 9.242339312406577e-06, + "loss": 6.6711, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 0.8728380799293518, + "learning_rate": 9.24191466231825e-06, + "loss": 6.5919, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 0.8630229830741882, + "learning_rate": 9.241490012229923e-06, + "loss": 6.7621, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 0.7087079882621765, + "learning_rate": 9.241065362141596e-06, + "loss": 6.594, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 0.9722884297370911, + "learning_rate": 9.240640712053269e-06, + "loss": 6.7984, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 1.081441044807434, + "learning_rate": 9.240216061964941e-06, + "loss": 6.6581, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 0.7065415978431702, + "learning_rate": 9.239791411876614e-06, + "loss": 6.4761, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.6732839345932007, + "learning_rate": 9.239366761788287e-06, + "loss": 6.4525, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 0.7818865776062012, + "learning_rate": 9.23894211169996e-06, + "loss": 6.1796, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 0.7087841629981995, + "learning_rate": 9.238517461611633e-06, + "loss": 6.4408, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 0.8134922385215759, + "learning_rate": 9.238092811523305e-06, + "loss": 6.4793, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 0.7085851430892944, + "learning_rate": 9.237668161434978e-06, + "loss": 6.5167, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 0.7074335813522339, + "learning_rate": 9.237243511346651e-06, + "loss": 6.2214, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 0.6706517934799194, + "learning_rate": 9.236818861258324e-06, + "loss": 6.3624, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 0.6566395163536072, + "learning_rate": 9.236394211169997e-06, + "loss": 6.6491, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.7390492558479309, + "learning_rate": 9.23596956108167e-06, + "loss": 6.3979, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 0.9428990483283997, + "learning_rate": 9.235544910993342e-06, + "loss": 6.5616, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 0.686882495880127, + "learning_rate": 9.235120260905015e-06, + "loss": 6.3787, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 0.6078020334243774, + "learning_rate": 9.234695610816688e-06, + "loss": 6.1986, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 0.7055761218070984, + "learning_rate": 9.23427096072836e-06, + "loss": 6.4748, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 0.6609373092651367, + "learning_rate": 9.233846310640033e-06, + "loss": 6.3077, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 0.7220235466957092, + "learning_rate": 9.233421660551706e-06, + "loss": 6.5114, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 0.7358887195587158, + "learning_rate": 9.232997010463379e-06, + "loss": 6.207, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.7431795597076416, + "learning_rate": 9.232572360375052e-06, + "loss": 6.4373, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 0.7869439721107483, + "learning_rate": 9.232147710286725e-06, + "loss": 6.5881, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 0.6314511299133301, + "learning_rate": 9.231723060198397e-06, + "loss": 6.3618, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 0.821575403213501, + "learning_rate": 9.23129841011007e-06, + "loss": 6.4353, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 0.6898730397224426, + "learning_rate": 9.230873760021743e-06, + "loss": 6.5245, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 0.6678595542907715, + "learning_rate": 9.230449109933416e-06, + "loss": 6.527, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 0.8897793292999268, + "learning_rate": 9.230024459845089e-06, + "loss": 6.4202, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.7132363319396973, + "learning_rate": 9.229599809756761e-06, + "loss": 6.6897, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 0.7110785841941833, + "learning_rate": 9.229175159668434e-06, + "loss": 6.5434, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 0.7758983373641968, + "learning_rate": 9.228750509580107e-06, + "loss": 6.4119, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 0.8512669801712036, + "learning_rate": 9.22832585949178e-06, + "loss": 6.5502, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 0.7217029929161072, + "learning_rate": 9.227901209403453e-06, + "loss": 6.3513, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 1.1794688701629639, + "learning_rate": 9.227476559315125e-06, + "loss": 6.6038, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 0.6530920267105103, + "learning_rate": 9.227051909226798e-06, + "loss": 6.5711, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 0.7430432438850403, + "learning_rate": 9.226627259138471e-06, + "loss": 6.2254, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 0.7315286993980408, + "learning_rate": 9.226202609050144e-06, + "loss": 6.6765, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.7858328819274902, + "learning_rate": 9.225777958961815e-06, + "loss": 6.4784, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 0.6739543676376343, + "learning_rate": 9.22535330887349e-06, + "loss": 6.5238, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 0.6158331036567688, + "learning_rate": 9.224928658785162e-06, + "loss": 6.6312, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 0.8052266836166382, + "learning_rate": 9.224504008696833e-06, + "loss": 6.4996, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 0.7186631560325623, + "learning_rate": 9.224079358608508e-06, + "loss": 6.5251, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 0.7885273694992065, + "learning_rate": 9.22365470852018e-06, + "loss": 6.6527, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 0.7229416370391846, + "learning_rate": 9.223230058431852e-06, + "loss": 6.1741, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 0.6691935062408447, + "learning_rate": 9.222805408343526e-06, + "loss": 6.2793, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 0.6716103553771973, + "learning_rate": 9.222380758255199e-06, + "loss": 6.4494, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 0.6721286773681641, + "learning_rate": 9.22195610816687e-06, + "loss": 6.4322, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 0.887121856212616, + "learning_rate": 9.221531458078545e-06, + "loss": 6.4441, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 0.8433877825737, + "learning_rate": 9.221106807990217e-06, + "loss": 6.3044, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 0.8579421639442444, + "learning_rate": 9.22068215790189e-06, + "loss": 6.3364, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 0.7512094974517822, + "learning_rate": 9.220257507813563e-06, + "loss": 6.6763, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 0.9043259620666504, + "learning_rate": 9.219832857725234e-06, + "loss": 6.3205, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 0.6315712928771973, + "learning_rate": 9.219408207636909e-06, + "loss": 6.5801, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.9833595156669617, + "learning_rate": 9.218983557548581e-06, + "loss": 6.4571, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 0.6419947743415833, + "learning_rate": 9.218558907460253e-06, + "loss": 6.3118, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 0.8229233622550964, + "learning_rate": 9.218134257371927e-06, + "loss": 6.7854, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 1.0853732824325562, + "learning_rate": 9.2177096072836e-06, + "loss": 6.6697, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 0.6123767495155334, + "learning_rate": 9.217284957195271e-06, + "loss": 6.3478, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 0.6985893845558167, + "learning_rate": 9.216860307106945e-06, + "loss": 6.8374, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 0.7874070405960083, + "learning_rate": 9.216435657018618e-06, + "loss": 6.3829, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 0.8025583624839783, + "learning_rate": 9.21601100693029e-06, + "loss": 6.3521, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.6567882299423218, + "learning_rate": 9.215586356841964e-06, + "loss": 6.4658, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 0.8511118292808533, + "learning_rate": 9.215161706753637e-06, + "loss": 6.3056, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 0.852371871471405, + "learning_rate": 9.214737056665308e-06, + "loss": 6.2123, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 0.8588346242904663, + "learning_rate": 9.214312406576982e-06, + "loss": 6.4863, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 0.6779218316078186, + "learning_rate": 9.213887756488653e-06, + "loss": 6.1491, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 0.5496200323104858, + "learning_rate": 9.213463106400326e-06, + "loss": 6.2753, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 0.8846322894096375, + "learning_rate": 9.213038456312e-06, + "loss": 6.5126, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 0.7760903835296631, + "learning_rate": 9.212613806223672e-06, + "loss": 6.3571, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.5459357500076294, + "learning_rate": 9.212189156135345e-06, + "loss": 6.453, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 0.9391347169876099, + "learning_rate": 9.211764506047019e-06, + "loss": 6.4992, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 0.7088111639022827, + "learning_rate": 9.21133985595869e-06, + "loss": 6.2169, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 0.9384233951568604, + "learning_rate": 9.210915205870363e-06, + "loss": 6.1718, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 0.934938371181488, + "learning_rate": 9.210490555782037e-06, + "loss": 6.4379, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 0.6194551587104797, + "learning_rate": 9.210065905693709e-06, + "loss": 6.4371, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 0.6546567678451538, + "learning_rate": 9.209641255605381e-06, + "loss": 6.1466, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 1.0848010778427124, + "learning_rate": 9.209216605517056e-06, + "loss": 6.183, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 0.8379188776016235, + "learning_rate": 9.208791955428727e-06, + "loss": 6.294, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 0.7061664462089539, + "learning_rate": 9.2083673053404e-06, + "loss": 6.6962, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 0.6474084854125977, + "learning_rate": 9.207942655252074e-06, + "loss": 6.5295, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 0.7843411564826965, + "learning_rate": 9.207518005163745e-06, + "loss": 6.555, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 0.7835501432418823, + "learning_rate": 9.207093355075418e-06, + "loss": 6.534, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 0.7084590196609497, + "learning_rate": 9.206668704987091e-06, + "loss": 6.4128, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 0.6890985369682312, + "learning_rate": 9.206244054898764e-06, + "loss": 6.5151, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 0.6049187779426575, + "learning_rate": 9.205819404810437e-06, + "loss": 6.3942, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.6340472102165222, + "learning_rate": 9.20539475472211e-06, + "loss": 6.3372, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 0.6754929423332214, + "learning_rate": 9.204970104633782e-06, + "loss": 6.48, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 0.7508379817008972, + "learning_rate": 9.204545454545455e-06, + "loss": 6.6212, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 0.7008665204048157, + "learning_rate": 9.204120804457128e-06, + "loss": 6.5671, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 0.6775002479553223, + "learning_rate": 9.2036961543688e-06, + "loss": 6.1761, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 0.7965831756591797, + "learning_rate": 9.203271504280473e-06, + "loss": 6.2711, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 0.67812180519104, + "learning_rate": 9.202846854192146e-06, + "loss": 6.2748, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 0.7567317485809326, + "learning_rate": 9.202422204103819e-06, + "loss": 6.5994, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 0.6561221480369568, + "learning_rate": 9.201997554015492e-06, + "loss": 6.2369, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 0.721612274646759, + "learning_rate": 9.201572903927165e-06, + "loss": 6.2006, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 0.8582138419151306, + "learning_rate": 9.201148253838837e-06, + "loss": 6.3086, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 0.6316065788269043, + "learning_rate": 9.20072360375051e-06, + "loss": 6.3437, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 0.7860479354858398, + "learning_rate": 9.200298953662183e-06, + "loss": 6.3927, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 0.702231228351593, + "learning_rate": 9.199874303573856e-06, + "loss": 6.5195, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 0.6660271286964417, + "learning_rate": 9.199449653485529e-06, + "loss": 6.6085, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 0.705818235874176, + "learning_rate": 9.199025003397201e-06, + "loss": 6.2628, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.7041183114051819, + "learning_rate": 9.198600353308874e-06, + "loss": 6.4863, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 0.6696336269378662, + "learning_rate": 9.198175703220547e-06, + "loss": 6.4701, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 0.803166389465332, + "learning_rate": 9.19775105313222e-06, + "loss": 6.3831, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 0.7642784118652344, + "learning_rate": 9.197326403043893e-06, + "loss": 6.2922, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 0.6845123171806335, + "learning_rate": 9.196901752955565e-06, + "loss": 6.3948, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 0.6065470576286316, + "learning_rate": 9.196477102867238e-06, + "loss": 6.2112, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 0.7849832773208618, + "learning_rate": 9.196052452778911e-06, + "loss": 6.2962, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 0.7358639240264893, + "learning_rate": 9.195627802690584e-06, + "loss": 6.5816, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 0.6847927570343018, + "learning_rate": 9.195203152602257e-06, + "loss": 6.5103, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 0.7305313944816589, + "learning_rate": 9.19477850251393e-06, + "loss": 6.5572, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 0.6516531705856323, + "learning_rate": 9.194353852425602e-06, + "loss": 6.4048, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 0.6872650980949402, + "learning_rate": 9.193929202337275e-06, + "loss": 6.5124, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 0.6582599878311157, + "learning_rate": 9.193504552248948e-06, + "loss": 6.1815, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 0.7024451494216919, + "learning_rate": 9.19307990216062e-06, + "loss": 6.3238, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 0.7070611119270325, + "learning_rate": 9.192655252072293e-06, + "loss": 6.3875, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 0.6455351710319519, + "learning_rate": 9.192230601983966e-06, + "loss": 6.1866, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.7399108409881592, + "learning_rate": 9.191805951895639e-06, + "loss": 6.5183, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 0.6895038485527039, + "learning_rate": 9.191381301807312e-06, + "loss": 6.4226, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 0.5351959466934204, + "learning_rate": 9.190956651718985e-06, + "loss": 6.447, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 0.6617714762687683, + "learning_rate": 9.190532001630657e-06, + "loss": 6.3967, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 0.7203499674797058, + "learning_rate": 9.19010735154233e-06, + "loss": 6.6206, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 0.7479397654533386, + "learning_rate": 9.189682701454003e-06, + "loss": 6.4961, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 0.7004820704460144, + "learning_rate": 9.189258051365676e-06, + "loss": 6.4031, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 0.5364325642585754, + "learning_rate": 9.188833401277349e-06, + "loss": 6.3254, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 0.8434842228889465, + "learning_rate": 9.188408751189021e-06, + "loss": 6.3158, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 0.583679735660553, + "learning_rate": 9.187984101100694e-06, + "loss": 6.3223, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.8354448080062866, + "learning_rate": 9.187559451012367e-06, + "loss": 6.0271, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 0.9048535823822021, + "learning_rate": 9.18713480092404e-06, + "loss": 6.4293, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 0.6438108682632446, + "learning_rate": 9.186710150835713e-06, + "loss": 6.5053, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 0.8894186615943909, + "learning_rate": 9.186285500747385e-06, + "loss": 6.3987, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 0.5737390518188477, + "learning_rate": 9.185860850659058e-06, + "loss": 6.1063, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 0.6547545194625854, + "learning_rate": 9.185436200570731e-06, + "loss": 6.2063, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.6937142610549927, + "learning_rate": 9.185011550482404e-06, + "loss": 6.1824, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 0.6390159726142883, + "learning_rate": 9.184586900394075e-06, + "loss": 6.3639, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 0.8000003099441528, + "learning_rate": 9.18416225030575e-06, + "loss": 6.5289, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 0.7709170579910278, + "learning_rate": 9.183737600217422e-06, + "loss": 6.7424, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 0.8069483041763306, + "learning_rate": 9.183312950129093e-06, + "loss": 6.2516, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 0.7051231861114502, + "learning_rate": 9.182888300040768e-06, + "loss": 6.2041, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 0.7638939619064331, + "learning_rate": 9.18246364995244e-06, + "loss": 6.264, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 0.6280597448348999, + "learning_rate": 9.182038999864112e-06, + "loss": 6.3095, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 0.6691423058509827, + "learning_rate": 9.181614349775786e-06, + "loss": 6.5376, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 0.6948831677436829, + "learning_rate": 9.181189699687459e-06, + "loss": 6.091, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 0.7839148044586182, + "learning_rate": 9.18076504959913e-06, + "loss": 5.7947, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 0.7258013486862183, + "learning_rate": 9.180340399510805e-06, + "loss": 6.2196, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 0.8199764490127563, + "learning_rate": 9.179915749422477e-06, + "loss": 6.5006, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 0.6570567488670349, + "learning_rate": 9.179491099334149e-06, + "loss": 6.2229, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 0.824135422706604, + "learning_rate": 9.179066449245823e-06, + "loss": 5.7055, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 0.850911557674408, + "learning_rate": 9.178641799157496e-06, + "loss": 6.2586, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 0.6358680725097656, + "learning_rate": 9.178217149069167e-06, + "loss": 6.3617, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 0.7808555364608765, + "learning_rate": 9.177792498980841e-06, + "loss": 6.3879, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 0.7472771406173706, + "learning_rate": 9.177367848892513e-06, + "loss": 6.1436, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 0.6717007160186768, + "learning_rate": 9.176943198804185e-06, + "loss": 6.3491, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 0.6365639567375183, + "learning_rate": 9.17651854871586e-06, + "loss": 6.1375, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 0.8654581904411316, + "learning_rate": 9.176093898627531e-06, + "loss": 6.4987, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 0.6320400834083557, + "learning_rate": 9.175669248539204e-06, + "loss": 6.2519, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 0.678617537021637, + "learning_rate": 9.175244598450878e-06, + "loss": 6.286, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 0.6697331666946411, + "learning_rate": 9.17481994836255e-06, + "loss": 6.7809, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 0.8575046062469482, + "learning_rate": 9.174395298274222e-06, + "loss": 6.2132, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 0.8286623954772949, + "learning_rate": 9.173970648185897e-06, + "loss": 6.0534, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.7623854875564575, + "learning_rate": 9.173545998097568e-06, + "loss": 6.1306, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 0.7228173613548279, + "learning_rate": 9.17312134800924e-06, + "loss": 6.1075, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 0.5480359196662903, + "learning_rate": 9.172696697920915e-06, + "loss": 6.2314, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 0.7385909557342529, + "learning_rate": 9.172272047832586e-06, + "loss": 6.297, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 0.9001138210296631, + "learning_rate": 9.171847397744259e-06, + "loss": 6.3285, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 0.8034594058990479, + "learning_rate": 9.171422747655932e-06, + "loss": 6.3531, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 0.664456844329834, + "learning_rate": 9.170998097567605e-06, + "loss": 6.0142, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 0.6336710453033447, + "learning_rate": 9.170573447479277e-06, + "loss": 6.3426, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 0.7892618775367737, + "learning_rate": 9.17014879739095e-06, + "loss": 6.2028, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 0.6140861511230469, + "learning_rate": 9.169724147302623e-06, + "loss": 6.3767, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 0.8637545108795166, + "learning_rate": 9.169299497214296e-06, + "loss": 6.4397, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 0.7345805764198303, + "learning_rate": 9.168874847125969e-06, + "loss": 6.1726, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 0.7940070033073425, + "learning_rate": 9.168450197037641e-06, + "loss": 6.1116, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 0.8355920910835266, + "learning_rate": 9.168025546949314e-06, + "loss": 6.3121, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 0.8159860968589783, + "learning_rate": 9.167600896860987e-06, + "loss": 6.2857, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 0.5755505561828613, + "learning_rate": 9.16717624677266e-06, + "loss": 6.2557, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 0.5895490646362305, + "learning_rate": 9.166751596684333e-06, + "loss": 6.3097, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 0.8742266297340393, + "learning_rate": 9.166326946596005e-06, + "loss": 6.5037, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 0.6573750972747803, + "learning_rate": 9.165902296507678e-06, + "loss": 6.5423, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 0.6709744334220886, + "learning_rate": 9.165477646419351e-06, + "loss": 6.1515, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 0.7991673946380615, + "learning_rate": 9.165052996331024e-06, + "loss": 6.5981, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 0.6705992221832275, + "learning_rate": 9.164628346242697e-06, + "loss": 6.1605, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 0.6917455792427063, + "learning_rate": 9.16420369615437e-06, + "loss": 6.3615, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 0.6309854984283447, + "learning_rate": 9.163779046066042e-06, + "loss": 6.2909, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 0.7422961592674255, + "learning_rate": 9.163354395977715e-06, + "loss": 6.3586, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 0.6413054466247559, + "learning_rate": 9.162929745889388e-06, + "loss": 6.2332, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 1.2663066387176514, + "learning_rate": 9.16250509580106e-06, + "loss": 6.3024, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 0.7502869367599487, + "learning_rate": 9.162080445712733e-06, + "loss": 6.3487, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 0.6787996292114258, + "learning_rate": 9.161655795624406e-06, + "loss": 6.3782, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 0.5651604533195496, + "learning_rate": 9.161231145536079e-06, + "loss": 6.4259, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 0.857718288898468, + "learning_rate": 9.160806495447752e-06, + "loss": 6.1969, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 0.7047984600067139, + "learning_rate": 9.160381845359425e-06, + "loss": 5.9988, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 0.6644574999809265, + "learning_rate": 9.159957195271097e-06, + "loss": 6.1873, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.7631362080574036, + "learning_rate": 9.15953254518277e-06, + "loss": 6.4927, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 0.6537579894065857, + "learning_rate": 9.159107895094443e-06, + "loss": 5.9671, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 0.6834453344345093, + "learning_rate": 9.158683245006116e-06, + "loss": 6.3851, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 0.661952793598175, + "learning_rate": 9.158258594917789e-06, + "loss": 6.3544, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 0.6329561471939087, + "learning_rate": 9.157833944829461e-06, + "loss": 6.2632, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 0.6302782297134399, + "learning_rate": 9.157409294741134e-06, + "loss": 6.0921, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 0.7470691204071045, + "learning_rate": 9.156984644652807e-06, + "loss": 6.1539, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 0.6707788109779358, + "learning_rate": 9.15655999456448e-06, + "loss": 5.9992, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 0.7276530265808105, + "learning_rate": 9.156135344476153e-06, + "loss": 6.5568, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 0.5319271683692932, + "learning_rate": 9.155710694387825e-06, + "loss": 6.2624, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 0.7461548447608948, + "learning_rate": 9.155286044299498e-06, + "loss": 6.1652, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 0.6228979825973511, + "learning_rate": 9.154861394211171e-06, + "loss": 6.2533, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 0.7325747013092041, + "learning_rate": 9.154436744122844e-06, + "loss": 6.3522, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 0.6145057082176208, + "learning_rate": 9.154012094034517e-06, + "loss": 6.3568, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 0.6935858726501465, + "learning_rate": 9.15358744394619e-06, + "loss": 6.3257, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 0.7435451149940491, + "learning_rate": 9.153162793857862e-06, + "loss": 6.2281, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 0.6240575909614563, + "learning_rate": 9.152738143769535e-06, + "loss": 6.3432, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 0.7022627592086792, + "learning_rate": 9.152313493681208e-06, + "loss": 6.3093, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 0.6329903602600098, + "learning_rate": 9.15188884359288e-06, + "loss": 6.1706, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 0.6364669799804688, + "learning_rate": 9.151464193504553e-06, + "loss": 6.2766, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 0.6661564707756042, + "learning_rate": 9.151039543416226e-06, + "loss": 6.3284, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 0.7486583590507507, + "learning_rate": 9.150614893327899e-06, + "loss": 6.3018, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 0.646386981010437, + "learning_rate": 9.150190243239572e-06, + "loss": 6.3256, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 0.7611472010612488, + "learning_rate": 9.149765593151245e-06, + "loss": 6.0835, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 0.6947270631790161, + "learning_rate": 9.149340943062917e-06, + "loss": 6.2721, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 0.6878142356872559, + "learning_rate": 9.14891629297459e-06, + "loss": 6.3597, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 0.6414291262626648, + "learning_rate": 9.148491642886263e-06, + "loss": 6.1672, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.7262043356895447, + "learning_rate": 9.148066992797934e-06, + "loss": 6.3783, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 0.6811686158180237, + "learning_rate": 9.147642342709609e-06, + "loss": 6.2445, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 0.5911498069763184, + "learning_rate": 9.147217692621281e-06, + "loss": 6.2972, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 0.6832822561264038, + "learning_rate": 9.146793042532952e-06, + "loss": 6.1133, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 0.6675670146942139, + "learning_rate": 9.146368392444627e-06, + "loss": 6.3109, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 0.613399088382721, + "learning_rate": 9.1459437423563e-06, + "loss": 6.4646, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.8031336069107056, + "learning_rate": 9.145519092267971e-06, + "loss": 6.2949, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 0.7029309272766113, + "learning_rate": 9.145094442179645e-06, + "loss": 6.3596, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 0.6840498447418213, + "learning_rate": 9.144669792091318e-06, + "loss": 6.067, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 0.9287526607513428, + "learning_rate": 9.14424514200299e-06, + "loss": 6.1842, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 0.7433454394340515, + "learning_rate": 9.143820491914664e-06, + "loss": 6.0383, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 0.7090742588043213, + "learning_rate": 9.143395841826337e-06, + "loss": 6.2692, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 0.7800020575523376, + "learning_rate": 9.142971191738008e-06, + "loss": 6.3318, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 0.7503989338874817, + "learning_rate": 9.142546541649682e-06, + "loss": 6.5477, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 0.8266313076019287, + "learning_rate": 9.142121891561353e-06, + "loss": 6.2559, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 0.5687940716743469, + "learning_rate": 9.141697241473026e-06, + "loss": 6.1596, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 0.6482721567153931, + "learning_rate": 9.1412725913847e-06, + "loss": 6.2206, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 0.7440736293792725, + "learning_rate": 9.140847941296372e-06, + "loss": 6.2953, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 0.7609699368476868, + "learning_rate": 9.140423291208044e-06, + "loss": 6.3312, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 0.5636163353919983, + "learning_rate": 9.139998641119719e-06, + "loss": 5.973, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 0.6521832942962646, + "learning_rate": 9.13957399103139e-06, + "loss": 6.1841, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 0.9209757447242737, + "learning_rate": 9.139149340943063e-06, + "loss": 6.0507, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 0.6453912258148193, + "learning_rate": 9.138724690854737e-06, + "loss": 6.2545, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 0.78888338804245, + "learning_rate": 9.138300040766409e-06, + "loss": 6.1357, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 0.7031204700469971, + "learning_rate": 9.137875390678081e-06, + "loss": 6.4607, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 0.6247392892837524, + "learning_rate": 9.137450740589756e-06, + "loss": 6.1794, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 0.734210729598999, + "learning_rate": 9.137026090501427e-06, + "loss": 6.14, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 0.7468337416648865, + "learning_rate": 9.1366014404131e-06, + "loss": 6.1896, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 0.6938478350639343, + "learning_rate": 9.136176790324773e-06, + "loss": 6.3799, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 0.7582805156707764, + "learning_rate": 9.135752140236445e-06, + "loss": 6.3542, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 0.939236581325531, + "learning_rate": 9.135327490148118e-06, + "loss": 6.0007, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 0.6389191150665283, + "learning_rate": 9.134902840059791e-06, + "loss": 6.1894, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 0.5593389868736267, + "learning_rate": 9.134478189971464e-06, + "loss": 6.2345, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 0.7461782693862915, + "learning_rate": 9.134053539883138e-06, + "loss": 6.1338, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 0.7460827827453613, + "learning_rate": 9.13362888979481e-06, + "loss": 5.9878, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 0.9394932985305786, + "learning_rate": 9.133204239706482e-06, + "loss": 6.255, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 0.7191615104675293, + "learning_rate": 9.132779589618157e-06, + "loss": 6.1559, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 0.7977159023284912, + "learning_rate": 9.132354939529828e-06, + "loss": 6.0214, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 0.7992469668388367, + "learning_rate": 9.1319302894415e-06, + "loss": 6.1762, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.6600250601768494, + "learning_rate": 9.131505639353175e-06, + "loss": 6.2371, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 0.6978618502616882, + "learning_rate": 9.131080989264846e-06, + "loss": 6.2411, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 0.7488465905189514, + "learning_rate": 9.130656339176519e-06, + "loss": 6.1946, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 0.6144048571586609, + "learning_rate": 9.130231689088193e-06, + "loss": 6.239, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 0.5966808795928955, + "learning_rate": 9.129807038999865e-06, + "loss": 6.165, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 0.7170645594596863, + "learning_rate": 9.129382388911537e-06, + "loss": 6.2076, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 0.7845368385314941, + "learning_rate": 9.12895773882321e-06, + "loss": 6.1571, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 0.7013678550720215, + "learning_rate": 9.128533088734883e-06, + "loss": 6.2284, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 0.6811504364013672, + "learning_rate": 9.128108438646556e-06, + "loss": 6.2058, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 0.7817476987838745, + "learning_rate": 9.127683788558229e-06, + "loss": 6.2336, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 0.6504366397857666, + "learning_rate": 9.127259138469901e-06, + "loss": 6.2692, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 0.5906623005867004, + "learning_rate": 9.126834488381574e-06, + "loss": 6.1499, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 0.6771206259727478, + "learning_rate": 9.126409838293247e-06, + "loss": 5.9039, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 0.6593291759490967, + "learning_rate": 9.126070118222586e-06, + "loss": 6.1387, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 0.7172386050224304, + "learning_rate": 9.125645468134257e-06, + "loss": 6.2229, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 1.064487338066101, + "learning_rate": 9.125220818045931e-06, + "loss": 6.2792, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 0.7107935547828674, + "learning_rate": 9.124796167957604e-06, + "loss": 6.0864, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 0.6468725800514221, + "learning_rate": 9.124371517869275e-06, + "loss": 6.2504, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 0.8270561695098877, + "learning_rate": 9.12394686778095e-06, + "loss": 6.233, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 0.6312653422355652, + "learning_rate": 9.123522217692623e-06, + "loss": 6.098, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 0.690470278263092, + "learning_rate": 9.123097567604294e-06, + "loss": 6.3806, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 0.6093543171882629, + "learning_rate": 9.122672917515968e-06, + "loss": 6.1621, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 0.7734710574150085, + "learning_rate": 9.122248267427641e-06, + "loss": 6.1355, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 0.7822386622428894, + "learning_rate": 9.121823617339312e-06, + "loss": 6.2492, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 0.6131181716918945, + "learning_rate": 9.121398967250987e-06, + "loss": 6.2147, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 0.7684249877929688, + "learning_rate": 9.120974317162658e-06, + "loss": 6.32, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 0.6337093114852905, + "learning_rate": 9.12054966707433e-06, + "loss": 6.3214, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 0.8002831935882568, + "learning_rate": 9.120125016986005e-06, + "loss": 5.8931, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 0.7831613421440125, + "learning_rate": 9.119700366897676e-06, + "loss": 6.3292, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 0.5820332765579224, + "learning_rate": 9.119275716809349e-06, + "loss": 6.2184, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 0.6730987429618835, + "learning_rate": 9.118851066721023e-06, + "loss": 6.214, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 1.7740737199783325, + "learning_rate": 9.118426416632695e-06, + "loss": 6.2581, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 0.6709515452384949, + "learning_rate": 9.118001766544367e-06, + "loss": 6.1936, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 0.6563542485237122, + "learning_rate": 9.117577116456042e-06, + "loss": 6.2173, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 0.6079421639442444, + "learning_rate": 9.117152466367713e-06, + "loss": 6.3391, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 0.6594847440719604, + "learning_rate": 9.116727816279387e-06, + "loss": 6.0225, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 0.6193268299102783, + "learning_rate": 9.11630316619106e-06, + "loss": 6.1661, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 0.7004114389419556, + "learning_rate": 9.115878516102731e-06, + "loss": 6.2313, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 0.773926317691803, + "learning_rate": 9.115453866014406e-06, + "loss": 6.134, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 0.5979351997375488, + "learning_rate": 9.115029215926077e-06, + "loss": 6.2012, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 0.7781161665916443, + "learning_rate": 9.11460456583775e-06, + "loss": 6.3311, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 0.6650593876838684, + "learning_rate": 9.114179915749424e-06, + "loss": 6.1569, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 0.5518545508384705, + "learning_rate": 9.113755265661095e-06, + "loss": 6.1732, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 0.6374890208244324, + "learning_rate": 9.113330615572768e-06, + "loss": 6.1933, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 0.7412489056587219, + "learning_rate": 9.112905965484443e-06, + "loss": 6.2908, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 0.6096909046173096, + "learning_rate": 9.112481315396114e-06, + "loss": 6.1725, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 0.8166878819465637, + "learning_rate": 9.112056665307787e-06, + "loss": 6.0339, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 0.7108005881309509, + "learning_rate": 9.111632015219461e-06, + "loss": 5.9138, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 0.7629491090774536, + "learning_rate": 9.111207365131132e-06, + "loss": 6.1114, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 0.6546962261199951, + "learning_rate": 9.110782715042805e-06, + "loss": 6.2017, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 0.6703891158103943, + "learning_rate": 9.11035806495448e-06, + "loss": 6.0989, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 0.757207453250885, + "learning_rate": 9.10993341486615e-06, + "loss": 6.0475, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 0.626958429813385, + "learning_rate": 9.109508764777823e-06, + "loss": 6.1667, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 0.7275068163871765, + "learning_rate": 9.109084114689498e-06, + "loss": 6.1059, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.7578950524330139, + "learning_rate": 9.108659464601169e-06, + "loss": 6.3051, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 0.6924611330032349, + "learning_rate": 9.108234814512842e-06, + "loss": 6.3348, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 0.5590173602104187, + "learning_rate": 9.107810164424515e-06, + "loss": 6.3094, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 0.6681362390518188, + "learning_rate": 9.107385514336187e-06, + "loss": 5.9478, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 0.6942286491394043, + "learning_rate": 9.10696086424786e-06, + "loss": 6.1535, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 0.646040141582489, + "learning_rate": 9.106536214159533e-06, + "loss": 6.3067, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 0.7510294914245605, + "learning_rate": 9.106111564071206e-06, + "loss": 5.8945, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 0.6834248304367065, + "learning_rate": 9.105686913982879e-06, + "loss": 6.202, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 0.6368762850761414, + "learning_rate": 9.105262263894551e-06, + "loss": 6.205, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 0.8698294162750244, + "learning_rate": 9.104837613806224e-06, + "loss": 6.0727, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 0.5948230028152466, + "learning_rate": 9.104412963717897e-06, + "loss": 6.2348, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 0.6860139966011047, + "learning_rate": 9.10398831362957e-06, + "loss": 6.1388, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.7169312238693237, + "learning_rate": 9.103563663541243e-06, + "loss": 6.1207, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 0.6250284314155579, + "learning_rate": 9.103139013452915e-06, + "loss": 6.1629, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 0.6194764971733093, + "learning_rate": 9.102714363364588e-06, + "loss": 6.2628, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 0.7537720203399658, + "learning_rate": 9.102289713276261e-06, + "loss": 6.0506, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 0.6780703663825989, + "learning_rate": 9.101865063187934e-06, + "loss": 6.3109, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 0.8812618255615234, + "learning_rate": 9.101440413099607e-06, + "loss": 5.9699, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 0.7103826999664307, + "learning_rate": 9.10101576301128e-06, + "loss": 6.1972, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 0.7400150299072266, + "learning_rate": 9.100591112922952e-06, + "loss": 5.9534, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 0.6607983708381653, + "learning_rate": 9.100166462834625e-06, + "loss": 5.999, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 0.5289641618728638, + "learning_rate": 9.099741812746298e-06, + "loss": 6.3194, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 0.6488122344017029, + "learning_rate": 9.09931716265797e-06, + "loss": 6.2099, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 0.6237058043479919, + "learning_rate": 9.098892512569643e-06, + "loss": 6.056, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 0.6431372761726379, + "learning_rate": 9.098467862481316e-06, + "loss": 6.1154, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 0.6451079845428467, + "learning_rate": 9.098043212392989e-06, + "loss": 6.1158, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 0.6032236218452454, + "learning_rate": 9.097618562304662e-06, + "loss": 6.0755, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 0.6178813576698303, + "learning_rate": 9.097193912216335e-06, + "loss": 6.1873, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 0.7638076543807983, + "learning_rate": 9.096769262128007e-06, + "loss": 6.2236, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 0.7236708402633667, + "learning_rate": 9.09634461203968e-06, + "loss": 6.2591, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 0.6746871471405029, + "learning_rate": 9.095919961951353e-06, + "loss": 6.3129, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 0.7376072406768799, + "learning_rate": 9.095495311863026e-06, + "loss": 6.1568, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 0.668287992477417, + "learning_rate": 9.095070661774699e-06, + "loss": 5.7904, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 0.5937743782997131, + "learning_rate": 9.094646011686371e-06, + "loss": 6.1256, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 0.7390088438987732, + "learning_rate": 9.094221361598044e-06, + "loss": 6.1134, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 0.5513741970062256, + "learning_rate": 9.093796711509717e-06, + "loss": 6.3287, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 0.6940723657608032, + "learning_rate": 9.09337206142139e-06, + "loss": 6.4382, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 0.6995964050292969, + "learning_rate": 9.092947411333063e-06, + "loss": 6.3763, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 0.8319272398948669, + "learning_rate": 9.092522761244735e-06, + "loss": 6.2168, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 0.643247127532959, + "learning_rate": 9.092098111156408e-06, + "loss": 6.0571, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 0.5708909630775452, + "learning_rate": 9.09167346106808e-06, + "loss": 6.2874, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 0.5578559637069702, + "learning_rate": 9.091248810979754e-06, + "loss": 6.1467, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 0.775023341178894, + "learning_rate": 9.090824160891427e-06, + "loss": 6.3142, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 0.6777134537696838, + "learning_rate": 9.090399510803098e-06, + "loss": 6.2705, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 0.9485875964164734, + "learning_rate": 9.089974860714772e-06, + "loss": 5.9464, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.6906781792640686, + "learning_rate": 9.089550210626445e-06, + "loss": 5.9833, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 0.5829170346260071, + "learning_rate": 9.089125560538116e-06, + "loss": 5.7792, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 0.5855238437652588, + "learning_rate": 9.08870091044979e-06, + "loss": 6.3952, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 0.662796676158905, + "learning_rate": 9.088276260361463e-06, + "loss": 6.087, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 0.7753608226776123, + "learning_rate": 9.087851610273136e-06, + "loss": 6.1441, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 0.783457338809967, + "learning_rate": 9.087426960184809e-06, + "loss": 6.1976, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 0.7659125924110413, + "learning_rate": 9.087002310096482e-06, + "loss": 6.2401, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 0.6225931644439697, + "learning_rate": 9.086577660008155e-06, + "loss": 6.1617, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 0.6361936330795288, + "learning_rate": 9.086153009919827e-06, + "loss": 6.1076, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 0.612817108631134, + "learning_rate": 9.085728359831499e-06, + "loss": 5.9692, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 0.6322273015975952, + "learning_rate": 9.085303709743173e-06, + "loss": 6.3099, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 0.7087699770927429, + "learning_rate": 9.084879059654846e-06, + "loss": 6.0212, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 0.8059830069541931, + "learning_rate": 9.084454409566517e-06, + "loss": 6.0903, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 0.5716100931167603, + "learning_rate": 9.084029759478191e-06, + "loss": 6.0219, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 0.6627748608589172, + "learning_rate": 9.083605109389864e-06, + "loss": 6.2446, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 0.7269123792648315, + "learning_rate": 9.083180459301535e-06, + "loss": 6.2951, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 0.6630348563194275, + "learning_rate": 9.08275580921321e-06, + "loss": 6.0714, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 0.6432666182518005, + "learning_rate": 9.082331159124883e-06, + "loss": 6.2528, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 0.5279867053031921, + "learning_rate": 9.081906509036554e-06, + "loss": 6.1146, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 0.6725083589553833, + "learning_rate": 9.081481858948228e-06, + "loss": 6.397, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 0.6194068789482117, + "learning_rate": 9.081057208859901e-06, + "loss": 5.9958, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 0.6518157124519348, + "learning_rate": 9.080632558771572e-06, + "loss": 6.0165, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 1.2530076503753662, + "learning_rate": 9.080207908683247e-06, + "loss": 6.1074, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 0.6549580693244934, + "learning_rate": 9.07978325859492e-06, + "loss": 6.0949, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 0.7937160134315491, + "learning_rate": 9.07935860850659e-06, + "loss": 5.9873, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 0.7268819808959961, + "learning_rate": 9.078933958418265e-06, + "loss": 6.0886, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 0.7376886606216431, + "learning_rate": 9.078509308329936e-06, + "loss": 6.0674, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 0.6283778548240662, + "learning_rate": 9.078084658241609e-06, + "loss": 6.2819, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 0.5701752305030823, + "learning_rate": 9.077660008153283e-06, + "loss": 6.3178, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 0.6629175543785095, + "learning_rate": 9.077235358064955e-06, + "loss": 5.9507, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 0.6508638262748718, + "learning_rate": 9.076810707976627e-06, + "loss": 6.099, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 0.8040112853050232, + "learning_rate": 9.076386057888302e-06, + "loss": 6.1781, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 0.6098882555961609, + "learning_rate": 9.075961407799973e-06, + "loss": 5.9082, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.9232227802276611, + "learning_rate": 9.075536757711646e-06, + "loss": 6.1414, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 0.6679373979568481, + "learning_rate": 9.07511210762332e-06, + "loss": 6.218, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 0.585797131061554, + "learning_rate": 9.074687457534991e-06, + "loss": 5.9395, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 0.942024827003479, + "learning_rate": 9.074262807446664e-06, + "loss": 6.2671, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 0.6072708368301392, + "learning_rate": 9.073838157358339e-06, + "loss": 6.1789, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 0.5485156774520874, + "learning_rate": 9.07341350727001e-06, + "loss": 6.2082, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 0.6513394713401794, + "learning_rate": 9.072988857181683e-06, + "loss": 6.1656, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 0.6223764419555664, + "learning_rate": 9.072564207093355e-06, + "loss": 6.1033, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 0.6358680129051208, + "learning_rate": 9.072139557005028e-06, + "loss": 6.3618, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 0.802317202091217, + "learning_rate": 9.071714906916701e-06, + "loss": 6.0804, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 0.7477809190750122, + "learning_rate": 9.071290256828374e-06, + "loss": 6.1951, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 0.733869731426239, + "learning_rate": 9.070865606740047e-06, + "loss": 5.9502, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 0.677051842212677, + "learning_rate": 9.07044095665172e-06, + "loss": 6.4832, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 0.7385284304618835, + "learning_rate": 9.070016306563392e-06, + "loss": 5.9532, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 0.5065022706985474, + "learning_rate": 9.069591656475065e-06, + "loss": 5.8763, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 0.6322180032730103, + "learning_rate": 9.069167006386738e-06, + "loss": 6.164, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 0.7827985286712646, + "learning_rate": 9.06874235629841e-06, + "loss": 6.1885, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 0.7087762951850891, + "learning_rate": 9.068317706210083e-06, + "loss": 6.1328, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 0.7452583909034729, + "learning_rate": 9.067893056121756e-06, + "loss": 5.9899, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 0.6387804746627808, + "learning_rate": 9.067468406033429e-06, + "loss": 6.0268, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 0.5995728969573975, + "learning_rate": 9.067043755945102e-06, + "loss": 5.9194, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 0.6961165070533752, + "learning_rate": 9.066619105856775e-06, + "loss": 5.9798, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 0.5884104371070862, + "learning_rate": 9.066194455768447e-06, + "loss": 5.9127, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 0.664591908454895, + "learning_rate": 9.06576980568012e-06, + "loss": 5.9935, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 0.6460817456245422, + "learning_rate": 9.065345155591793e-06, + "loss": 5.9245, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 0.7064407467842102, + "learning_rate": 9.064920505503466e-06, + "loss": 6.0538, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 0.6427567005157471, + "learning_rate": 9.064495855415139e-06, + "loss": 6.161, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 0.7184216380119324, + "learning_rate": 9.064071205326811e-06, + "loss": 5.9556, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 0.7246560454368591, + "learning_rate": 9.063646555238484e-06, + "loss": 6.1563, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 0.6639940738677979, + "learning_rate": 9.063221905150157e-06, + "loss": 6.1357, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 0.7058921456336975, + "learning_rate": 9.06279725506183e-06, + "loss": 6.1973, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 0.6831948161125183, + "learning_rate": 9.062372604973503e-06, + "loss": 6.1326, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 0.6566158533096313, + "learning_rate": 9.061947954885175e-06, + "loss": 5.9694, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.6205055117607117, + "learning_rate": 9.061523304796848e-06, + "loss": 5.9146, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 0.5869320631027222, + "learning_rate": 9.061098654708521e-06, + "loss": 6.071, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 0.5915622711181641, + "learning_rate": 9.060674004620194e-06, + "loss": 6.0969, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 0.6456178426742554, + "learning_rate": 9.060249354531867e-06, + "loss": 6.0704, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 0.7233245372772217, + "learning_rate": 9.05982470444354e-06, + "loss": 6.1665, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 0.6032096147537231, + "learning_rate": 9.059400054355212e-06, + "loss": 5.7929, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 0.7029232382774353, + "learning_rate": 9.058975404266885e-06, + "loss": 5.8573, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 0.63865727186203, + "learning_rate": 9.058550754178558e-06, + "loss": 6.0658, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 0.7018494606018066, + "learning_rate": 9.05812610409023e-06, + "loss": 6.1845, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 0.6945291757583618, + "learning_rate": 9.057701454001903e-06, + "loss": 5.9001, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 0.6334263682365417, + "learning_rate": 9.057276803913576e-06, + "loss": 5.9543, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 0.5758827924728394, + "learning_rate": 9.056852153825249e-06, + "loss": 6.0049, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 0.8086891770362854, + "learning_rate": 9.056427503736922e-06, + "loss": 6.1115, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 0.676560640335083, + "learning_rate": 9.056002853648595e-06, + "loss": 6.1224, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 0.6264210939407349, + "learning_rate": 9.055578203560267e-06, + "loss": 5.972, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 0.6654858589172363, + "learning_rate": 9.05515355347194e-06, + "loss": 6.0477, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 0.6521281003952026, + "learning_rate": 9.054728903383613e-06, + "loss": 6.0921, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 0.5822196006774902, + "learning_rate": 9.054304253295286e-06, + "loss": 5.9334, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 0.6636847853660583, + "learning_rate": 9.053879603206959e-06, + "loss": 6.0879, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.6635540723800659, + "learning_rate": 9.053454953118631e-06, + "loss": 6.2739, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 0.6033146977424622, + "learning_rate": 9.053030303030304e-06, + "loss": 6.3404, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 0.6161674857139587, + "learning_rate": 9.052605652941977e-06, + "loss": 5.8685, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 0.7000011205673218, + "learning_rate": 9.05218100285365e-06, + "loss": 5.8485, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 0.6552472114562988, + "learning_rate": 9.051756352765323e-06, + "loss": 5.9885, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 0.6570580005645752, + "learning_rate": 9.051331702676995e-06, + "loss": 5.898, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 0.6096943020820618, + "learning_rate": 9.050907052588668e-06, + "loss": 6.0241, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 0.7022497653961182, + "learning_rate": 9.050482402500341e-06, + "loss": 6.0373, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 0.7695353031158447, + "learning_rate": 9.050057752412014e-06, + "loss": 6.1597, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 0.5041824579238892, + "learning_rate": 9.049633102323687e-06, + "loss": 5.9489, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 0.752454936504364, + "learning_rate": 9.049208452235358e-06, + "loss": 6.107, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 0.6383696794509888, + "learning_rate": 9.048783802147032e-06, + "loss": 5.9362, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 0.6259339451789856, + "learning_rate": 9.048359152058705e-06, + "loss": 6.1133, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 0.7811927795410156, + "learning_rate": 9.047934501970376e-06, + "loss": 6.1146, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.594751238822937, + "learning_rate": 9.04750985188205e-06, + "loss": 6.0763, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 0.6017780900001526, + "learning_rate": 9.047085201793723e-06, + "loss": 5.8288, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 1.1503748893737793, + "learning_rate": 9.046660551705395e-06, + "loss": 6.0217, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 0.6445888876914978, + "learning_rate": 9.046235901617069e-06, + "loss": 6.0922, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 0.6693727374076843, + "learning_rate": 9.045811251528742e-06, + "loss": 5.9838, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 0.7035183906555176, + "learning_rate": 9.045386601440413e-06, + "loss": 5.9889, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 0.5808925628662109, + "learning_rate": 9.044961951352087e-06, + "loss": 5.8393, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 0.605078399181366, + "learning_rate": 9.04453730126376e-06, + "loss": 6.1749, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 0.7221083045005798, + "learning_rate": 9.044112651175431e-06, + "loss": 5.9909, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 0.7374167442321777, + "learning_rate": 9.043688001087106e-06, + "loss": 5.9605, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 0.6433135867118835, + "learning_rate": 9.043263350998777e-06, + "loss": 6.1193, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 0.6645304560661316, + "learning_rate": 9.04283870091045e-06, + "loss": 6.1719, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 0.6718870401382446, + "learning_rate": 9.042414050822124e-06, + "loss": 6.051, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 0.7085744142532349, + "learning_rate": 9.041989400733795e-06, + "loss": 6.1292, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 0.6619330048561096, + "learning_rate": 9.041564750645468e-06, + "loss": 5.9459, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 0.5893954038619995, + "learning_rate": 9.041140100557143e-06, + "loss": 6.0212, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 0.5837656855583191, + "learning_rate": 9.040715450468814e-06, + "loss": 5.6699, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 0.7071300745010376, + "learning_rate": 9.040290800380487e-06, + "loss": 6.0188, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 0.7255082726478577, + "learning_rate": 9.039866150292161e-06, + "loss": 6.2226, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 0.9456356763839722, + "learning_rate": 9.039441500203832e-06, + "loss": 6.0934, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 1.1977088451385498, + "learning_rate": 9.039016850115505e-06, + "loss": 6.2775, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 0.7551604509353638, + "learning_rate": 9.03859220002718e-06, + "loss": 6.1268, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 0.7121284604072571, + "learning_rate": 9.03816754993885e-06, + "loss": 6.1216, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 0.6765163540840149, + "learning_rate": 9.037742899850523e-06, + "loss": 6.1317, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 0.7159232497215271, + "learning_rate": 9.037318249762196e-06, + "loss": 5.6379, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 0.6723832488059998, + "learning_rate": 9.036893599673869e-06, + "loss": 5.9964, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 0.5771089196205139, + "learning_rate": 9.036468949585542e-06, + "loss": 5.9649, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 0.578117311000824, + "learning_rate": 9.036044299497215e-06, + "loss": 5.6839, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 0.6280841827392578, + "learning_rate": 9.035619649408887e-06, + "loss": 6.0204, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 0.644568681716919, + "learning_rate": 9.03519499932056e-06, + "loss": 5.8674, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 0.6532883644104004, + "learning_rate": 9.034770349232233e-06, + "loss": 6.1417, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 0.6610100269317627, + "learning_rate": 9.034345699143906e-06, + "loss": 6.0375, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 0.6813567280769348, + "learning_rate": 9.033921049055579e-06, + "loss": 5.8466, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 0.6154352426528931, + "learning_rate": 9.033496398967251e-06, + "loss": 6.0058, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 0.6625020503997803, + "learning_rate": 9.033071748878924e-06, + "loss": 6.4177, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 0.7035338878631592, + "learning_rate": 9.032647098790597e-06, + "loss": 6.0943, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 0.70542973279953, + "learning_rate": 9.03222244870227e-06, + "loss": 5.9026, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 0.615914523601532, + "learning_rate": 9.031797798613943e-06, + "loss": 6.1018, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 0.6197558641433716, + "learning_rate": 9.031373148525615e-06, + "loss": 6.0034, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 0.638068437576294, + "learning_rate": 9.030948498437288e-06, + "loss": 6.2769, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 0.7557953596115112, + "learning_rate": 9.030523848348961e-06, + "loss": 6.2688, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 0.7933095097541809, + "learning_rate": 9.030099198260634e-06, + "loss": 5.981, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 0.66328364610672, + "learning_rate": 9.029674548172307e-06, + "loss": 5.8771, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 0.664966881275177, + "learning_rate": 9.02924989808398e-06, + "loss": 6.0434, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 0.7218535542488098, + "learning_rate": 9.028825247995652e-06, + "loss": 5.9188, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 0.6583040356636047, + "learning_rate": 9.028400597907325e-06, + "loss": 6.0191, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 0.6645073890686035, + "learning_rate": 9.027975947818998e-06, + "loss": 6.0841, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 0.5755152702331543, + "learning_rate": 9.02755129773067e-06, + "loss": 6.0238, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 0.9154600501060486, + "learning_rate": 9.027126647642343e-06, + "loss": 5.827, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 0.781932532787323, + "learning_rate": 9.026701997554016e-06, + "loss": 6.2147, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 0.6583735942840576, + "learning_rate": 9.026277347465689e-06, + "loss": 6.0494, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 0.6197389364242554, + "learning_rate": 9.025852697377362e-06, + "loss": 6.1442, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 0.657954216003418, + "learning_rate": 9.025428047289035e-06, + "loss": 5.978, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 0.4873189628124237, + "learning_rate": 9.025003397200707e-06, + "loss": 5.7571, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 0.6236392259597778, + "learning_rate": 9.02457874711238e-06, + "loss": 5.9986, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 0.600966215133667, + "learning_rate": 9.024154097024053e-06, + "loss": 5.9058, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 0.6833361387252808, + "learning_rate": 9.023729446935726e-06, + "loss": 5.6263, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 0.6164446473121643, + "learning_rate": 9.023304796847399e-06, + "loss": 5.9386, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 0.6206728219985962, + "learning_rate": 9.022880146759071e-06, + "loss": 6.0532, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 0.7041054964065552, + "learning_rate": 9.022455496670744e-06, + "loss": 5.9004, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 0.7492522597312927, + "learning_rate": 9.022030846582417e-06, + "loss": 5.9184, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 0.7124365568161011, + "learning_rate": 9.02160619649409e-06, + "loss": 6.1449, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 0.9094390273094177, + "learning_rate": 9.021181546405763e-06, + "loss": 6.0121, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 0.44615790247917175, + "learning_rate": 9.020756896317435e-06, + "loss": 5.8895, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 0.5515995621681213, + "learning_rate": 9.020332246229108e-06, + "loss": 5.9473, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 0.4909602999687195, + "learning_rate": 9.019907596140781e-06, + "loss": 5.8462, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.5231213569641113, + "learning_rate": 9.019482946052454e-06, + "loss": 5.8493, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 0.5994216203689575, + "learning_rate": 9.019058295964127e-06, + "loss": 5.9235, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 0.576900839805603, + "learning_rate": 9.0186336458758e-06, + "loss": 6.0866, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 0.6182736158370972, + "learning_rate": 9.018208995787472e-06, + "loss": 5.7611, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 0.8119008541107178, + "learning_rate": 9.017784345699145e-06, + "loss": 5.9431, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 0.6421230435371399, + "learning_rate": 9.017359695610818e-06, + "loss": 6.0748, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 0.5864526629447937, + "learning_rate": 9.01693504552249e-06, + "loss": 6.3358, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 0.6992130279541016, + "learning_rate": 9.016510395434163e-06, + "loss": 5.9006, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 0.5581008791923523, + "learning_rate": 9.016085745345836e-06, + "loss": 5.8025, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 0.6965614557266235, + "learning_rate": 9.015661095257509e-06, + "loss": 5.8796, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 0.7464913129806519, + "learning_rate": 9.015236445169182e-06, + "loss": 6.0767, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 0.6937700510025024, + "learning_rate": 9.014811795080855e-06, + "loss": 6.0666, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 0.6416366696357727, + "learning_rate": 9.014387144992527e-06, + "loss": 6.0398, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 0.7063788175582886, + "learning_rate": 9.013962494904198e-06, + "loss": 5.8882, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 0.682248055934906, + "learning_rate": 9.013537844815873e-06, + "loss": 6.1264, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 0.7479922771453857, + "learning_rate": 9.013113194727546e-06, + "loss": 5.9189, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 0.5801248550415039, + "learning_rate": 9.012688544639217e-06, + "loss": 5.6889, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 0.6028261184692383, + "learning_rate": 9.012263894550891e-06, + "loss": 5.9995, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 0.7124404311180115, + "learning_rate": 9.011839244462564e-06, + "loss": 5.9571, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 0.6245173215866089, + "learning_rate": 9.011414594374235e-06, + "loss": 5.748, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 0.8648340702056885, + "learning_rate": 9.01098994428591e-06, + "loss": 6.1069, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 0.8680819272994995, + "learning_rate": 9.010565294197583e-06, + "loss": 6.0142, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 0.6807880997657776, + "learning_rate": 9.010140644109254e-06, + "loss": 5.8795, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 0.6954074501991272, + "learning_rate": 9.009715994020928e-06, + "loss": 5.9361, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 0.6043300628662109, + "learning_rate": 9.009291343932601e-06, + "loss": 5.9164, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 0.6911776065826416, + "learning_rate": 9.008866693844272e-06, + "loss": 5.9344, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 0.5456531047821045, + "learning_rate": 9.008442043755947e-06, + "loss": 6.0667, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 0.6261152625083923, + "learning_rate": 9.008017393667618e-06, + "loss": 6.0113, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 0.7566394805908203, + "learning_rate": 9.00759274357929e-06, + "loss": 5.7687, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 0.6525707840919495, + "learning_rate": 9.007168093490965e-06, + "loss": 6.1991, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 0.636425256729126, + "learning_rate": 9.006743443402636e-06, + "loss": 6.0493, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 0.8666204810142517, + "learning_rate": 9.006318793314309e-06, + "loss": 5.5709, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 0.6207607388496399, + "learning_rate": 9.005894143225983e-06, + "loss": 5.7305, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.7428746819496155, + "learning_rate": 9.005469493137654e-06, + "loss": 5.9026, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 0.7107107043266296, + "learning_rate": 9.005044843049327e-06, + "loss": 6.2082, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 0.5129759311676025, + "learning_rate": 9.004620192961002e-06, + "loss": 5.7463, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 0.668508768081665, + "learning_rate": 9.004195542872673e-06, + "loss": 5.8851, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 0.7006961107254028, + "learning_rate": 9.003770892784346e-06, + "loss": 5.9445, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 0.65779709815979, + "learning_rate": 9.00334624269602e-06, + "loss": 6.2682, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 0.8017051815986633, + "learning_rate": 9.002921592607691e-06, + "loss": 6.3766, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 0.6291772723197937, + "learning_rate": 9.002496942519364e-06, + "loss": 6.1376, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 0.5962680578231812, + "learning_rate": 9.002072292431039e-06, + "loss": 5.7274, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 0.793258547782898, + "learning_rate": 9.00164764234271e-06, + "loss": 6.2432, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 0.5895866751670837, + "learning_rate": 9.001222992254384e-06, + "loss": 5.884, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 0.6157402396202087, + "learning_rate": 9.000798342166055e-06, + "loss": 5.7193, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 0.592755138874054, + "learning_rate": 9.000373692077728e-06, + "loss": 5.8427, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 0.5231430530548096, + "learning_rate": 8.999949041989403e-06, + "loss": 5.9123, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 0.6405135989189148, + "learning_rate": 8.999524391901074e-06, + "loss": 6.0143, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 0.5814012289047241, + "learning_rate": 8.999099741812746e-06, + "loss": 5.6615, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 0.5580039024353027, + "learning_rate": 8.998675091724421e-06, + "loss": 5.793, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 0.7179898023605347, + "learning_rate": 8.998250441636092e-06, + "loss": 5.8797, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 0.6777712106704712, + "learning_rate": 8.997825791547765e-06, + "loss": 6.0677, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 0.880836009979248, + "learning_rate": 8.99740114145944e-06, + "loss": 6.4442, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 0.7919129133224487, + "learning_rate": 8.99697649137111e-06, + "loss": 5.9064, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 0.6180311441421509, + "learning_rate": 8.996551841282783e-06, + "loss": 5.9794, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 0.6063016057014465, + "learning_rate": 8.996127191194458e-06, + "loss": 5.8094, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 0.6887465119361877, + "learning_rate": 8.995702541106129e-06, + "loss": 6.0459, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 0.6753359436988831, + "learning_rate": 8.995277891017802e-06, + "loss": 5.8201, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 0.7286096215248108, + "learning_rate": 8.994853240929475e-06, + "loss": 6.1298, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 0.5766781568527222, + "learning_rate": 8.994428590841147e-06, + "loss": 5.8973, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 1.0095829963684082, + "learning_rate": 8.99400394075282e-06, + "loss": 5.972, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 0.8437008261680603, + "learning_rate": 8.993579290664493e-06, + "loss": 5.8127, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 0.6617098450660706, + "learning_rate": 8.993154640576166e-06, + "loss": 5.4978, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 0.7565464377403259, + "learning_rate": 8.992729990487839e-06, + "loss": 5.8288, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 0.6554325819015503, + "learning_rate": 8.992305340399511e-06, + "loss": 6.0212, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 0.542535662651062, + "learning_rate": 8.991880690311184e-06, + "loss": 5.9566, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 0.7035189867019653, + "learning_rate": 8.991456040222857e-06, + "loss": 5.6408, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 0.6220024228096008, + "learning_rate": 8.99103139013453e-06, + "loss": 5.956, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 0.6199259757995605, + "learning_rate": 8.990606740046203e-06, + "loss": 5.7074, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 0.8030275702476501, + "learning_rate": 8.990182089957875e-06, + "loss": 5.8054, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 0.6240834593772888, + "learning_rate": 8.989757439869548e-06, + "loss": 5.8733, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 0.8396137952804565, + "learning_rate": 8.989332789781221e-06, + "loss": 5.6086, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 0.586702287197113, + "learning_rate": 8.988908139692894e-06, + "loss": 5.7522, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 0.6903568506240845, + "learning_rate": 8.988483489604567e-06, + "loss": 5.9044, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 0.7131275534629822, + "learning_rate": 8.98805883951624e-06, + "loss": 5.718, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 0.6790189743041992, + "learning_rate": 8.987634189427912e-06, + "loss": 5.8878, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 0.5432945489883423, + "learning_rate": 8.987209539339585e-06, + "loss": 5.9341, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 0.627251386642456, + "learning_rate": 8.986784889251258e-06, + "loss": 5.9602, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 0.5612712502479553, + "learning_rate": 8.98636023916293e-06, + "loss": 5.5523, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 0.6724875569343567, + "learning_rate": 8.985935589074603e-06, + "loss": 5.7451, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 0.75004643201828, + "learning_rate": 8.985510938986276e-06, + "loss": 5.7668, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 0.9189428091049194, + "learning_rate": 8.985086288897949e-06, + "loss": 6.0962, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 0.6527339220046997, + "learning_rate": 8.984661638809622e-06, + "loss": 5.8756, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 0.5951376557350159, + "learning_rate": 8.984236988721295e-06, + "loss": 5.882, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 0.727786123752594, + "learning_rate": 8.983812338632967e-06, + "loss": 5.813, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 0.7243455052375793, + "learning_rate": 8.98338768854464e-06, + "loss": 5.9117, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 0.7621417045593262, + "learning_rate": 8.982963038456313e-06, + "loss": 5.9629, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 0.5634975433349609, + "learning_rate": 8.982538388367986e-06, + "loss": 5.8617, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 0.6375267505645752, + "learning_rate": 8.982113738279659e-06, + "loss": 5.8753, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 0.8709176778793335, + "learning_rate": 8.981689088191331e-06, + "loss": 5.9659, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.6439688205718994, + "learning_rate": 8.981264438103004e-06, + "loss": 5.7384, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 0.7012848258018494, + "learning_rate": 8.980839788014677e-06, + "loss": 5.8644, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 0.7754683494567871, + "learning_rate": 8.98041513792635e-06, + "loss": 5.9533, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 0.5970046520233154, + "learning_rate": 8.979990487838023e-06, + "loss": 6.0786, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 0.8493043184280396, + "learning_rate": 8.979565837749695e-06, + "loss": 5.9522, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 0.6497097015380859, + "learning_rate": 8.979141187661368e-06, + "loss": 5.5882, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 0.6048570275306702, + "learning_rate": 8.97871653757304e-06, + "loss": 5.8497, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 0.9065004587173462, + "learning_rate": 8.978291887484714e-06, + "loss": 5.886, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 0.6169681549072266, + "learning_rate": 8.977867237396387e-06, + "loss": 5.5867, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.641316831111908, + "learning_rate": 8.977442587308058e-06, + "loss": 5.768, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 0.526333749294281, + "learning_rate": 8.977017937219732e-06, + "loss": 5.8784, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 0.7242563366889954, + "learning_rate": 8.976593287131405e-06, + "loss": 5.8955, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 0.634129524230957, + "learning_rate": 8.976168637043076e-06, + "loss": 5.7436, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 0.6670370697975159, + "learning_rate": 8.97574398695475e-06, + "loss": 5.8677, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 0.6639590263366699, + "learning_rate": 8.975319336866423e-06, + "loss": 5.8297, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 0.5384988188743591, + "learning_rate": 8.974894686778094e-06, + "loss": 5.8177, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 0.6719576120376587, + "learning_rate": 8.974470036689769e-06, + "loss": 5.7012, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 0.5597572922706604, + "learning_rate": 8.974045386601442e-06, + "loss": 5.8565, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 0.5664688944816589, + "learning_rate": 8.973620736513113e-06, + "loss": 5.8727, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 0.7253386974334717, + "learning_rate": 8.973196086424787e-06, + "loss": 6.0706, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 0.7107961177825928, + "learning_rate": 8.97277143633646e-06, + "loss": 5.6896, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 0.6434924006462097, + "learning_rate": 8.972346786248133e-06, + "loss": 5.7896, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 0.6252070069313049, + "learning_rate": 8.971922136159806e-06, + "loss": 5.9997, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 0.6753676533699036, + "learning_rate": 8.971497486071477e-06, + "loss": 5.8599, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 0.6124852895736694, + "learning_rate": 8.971072835983151e-06, + "loss": 6.0815, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 0.8544199466705322, + "learning_rate": 8.970648185894824e-06, + "loss": 6.0077, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 0.5394419431686401, + "learning_rate": 8.970223535806495e-06, + "loss": 5.9571, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 0.6576294898986816, + "learning_rate": 8.96979888571817e-06, + "loss": 5.7524, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 0.5933584570884705, + "learning_rate": 8.969374235629843e-06, + "loss": 5.7275, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 0.5719273686408997, + "learning_rate": 8.968949585541514e-06, + "loss": 6.0725, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 0.6193808317184448, + "learning_rate": 8.968524935453188e-06, + "loss": 5.8637, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 1.1133207082748413, + "learning_rate": 8.968100285364861e-06, + "loss": 5.9804, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 0.6304081082344055, + "learning_rate": 8.967675635276532e-06, + "loss": 5.8682, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 0.6416819095611572, + "learning_rate": 8.967250985188207e-06, + "loss": 5.7639, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 0.6830800771713257, + "learning_rate": 8.96682633509988e-06, + "loss": 5.9937, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 0.6215963959693909, + "learning_rate": 8.96640168501155e-06, + "loss": 5.7049, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 0.6714081168174744, + "learning_rate": 8.965977034923225e-06, + "loss": 5.8064, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 0.6237316131591797, + "learning_rate": 8.965552384834896e-06, + "loss": 5.9086, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 0.8572800755500793, + "learning_rate": 8.965127734746569e-06, + "loss": 5.9436, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 0.5438535213470459, + "learning_rate": 8.964703084658243e-06, + "loss": 6.0503, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 0.6359210014343262, + "learning_rate": 8.964278434569914e-06, + "loss": 5.9055, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 0.6876789927482605, + "learning_rate": 8.963853784481587e-06, + "loss": 5.9372, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.6024190187454224, + "learning_rate": 8.963429134393262e-06, + "loss": 5.7302, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 0.6234225630760193, + "learning_rate": 8.963004484304933e-06, + "loss": 5.91, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 0.7591701149940491, + "learning_rate": 8.962579834216606e-06, + "loss": 5.8223, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 0.6704758405685425, + "learning_rate": 8.96215518412828e-06, + "loss": 5.9241, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 0.7390029430389404, + "learning_rate": 8.961730534039951e-06, + "loss": 5.9193, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 0.641518235206604, + "learning_rate": 8.961305883951624e-06, + "loss": 5.5353, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 0.6208619475364685, + "learning_rate": 8.960881233863299e-06, + "loss": 5.897, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 1.0340980291366577, + "learning_rate": 8.96045658377497e-06, + "loss": 5.4958, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 0.6235579252243042, + "learning_rate": 8.960031933686642e-06, + "loss": 5.6743, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 0.8380682468414307, + "learning_rate": 8.959607283598315e-06, + "loss": 5.6835, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 0.6581063866615295, + "learning_rate": 8.959182633509988e-06, + "loss": 5.7539, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6982380747795105, + "learning_rate": 8.958757983421661e-06, + "loss": 6.0884, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 0.8236098289489746, + "learning_rate": 8.958333333333334e-06, + "loss": 5.6477, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 0.643986165523529, + "learning_rate": 8.957908683245006e-06, + "loss": 5.8241, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 0.6490594148635864, + "learning_rate": 8.95748403315668e-06, + "loss": 5.8227, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 0.7351505756378174, + "learning_rate": 8.957059383068352e-06, + "loss": 5.8662, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 0.7533822655677795, + "learning_rate": 8.956634732980025e-06, + "loss": 6.0163, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 0.6936841607093811, + "learning_rate": 8.956210082891698e-06, + "loss": 5.6242, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 0.7579020857810974, + "learning_rate": 8.95578543280337e-06, + "loss": 5.9295, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 0.8059443235397339, + "learning_rate": 8.955360782715043e-06, + "loss": 5.6576, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 0.670202374458313, + "learning_rate": 8.954936132626716e-06, + "loss": 5.992, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 0.823380172252655, + "learning_rate": 8.954511482538389e-06, + "loss": 5.917, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 0.5635899901390076, + "learning_rate": 8.954086832450062e-06, + "loss": 6.0318, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 0.848290741443634, + "learning_rate": 8.953662182361734e-06, + "loss": 5.5964, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 0.674221396446228, + "learning_rate": 8.953237532273407e-06, + "loss": 5.6314, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 0.7218658924102783, + "learning_rate": 8.95281288218508e-06, + "loss": 5.9607, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 0.752845823764801, + "learning_rate": 8.952388232096753e-06, + "loss": 5.7285, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 0.7059134244918823, + "learning_rate": 8.951963582008426e-06, + "loss": 5.8386, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 0.7488641142845154, + "learning_rate": 8.951538931920098e-06, + "loss": 5.8013, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 0.6293264627456665, + "learning_rate": 8.951114281831771e-06, + "loss": 5.8997, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 0.7127301692962646, + "learning_rate": 8.950689631743444e-06, + "loss": 5.7038, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 0.6736544966697693, + "learning_rate": 8.950264981655117e-06, + "loss": 5.746, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 0.6045896410942078, + "learning_rate": 8.94984033156679e-06, + "loss": 5.9514, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.6773633360862732, + "learning_rate": 8.949415681478462e-06, + "loss": 5.7521, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 0.592797577381134, + "learning_rate": 8.948991031390135e-06, + "loss": 5.9412, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 0.6056862473487854, + "learning_rate": 8.948566381301808e-06, + "loss": 5.9967, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 0.5865188241004944, + "learning_rate": 8.948141731213481e-06, + "loss": 5.8281, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 0.691146731376648, + "learning_rate": 8.947717081125154e-06, + "loss": 5.8176, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 0.5605165362358093, + "learning_rate": 8.947292431036826e-06, + "loss": 5.8407, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 0.5446791648864746, + "learning_rate": 8.9468677809485e-06, + "loss": 5.7125, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 0.6143643260002136, + "learning_rate": 8.946443130860172e-06, + "loss": 5.8619, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 0.7464221119880676, + "learning_rate": 8.946018480771845e-06, + "loss": 5.835, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 0.5195165872573853, + "learning_rate": 8.945593830683518e-06, + "loss": 5.967, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 0.6581317186355591, + "learning_rate": 8.94516918059519e-06, + "loss": 5.9668, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 0.6643162369728088, + "learning_rate": 8.944744530506863e-06, + "loss": 5.9452, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 0.7494752407073975, + "learning_rate": 8.944319880418536e-06, + "loss": 5.6681, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 0.7613145709037781, + "learning_rate": 8.943895230330209e-06, + "loss": 5.742, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 0.6092007160186768, + "learning_rate": 8.943470580241882e-06, + "loss": 5.701, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 0.5509055256843567, + "learning_rate": 8.943045930153555e-06, + "loss": 5.659, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 0.6697782874107361, + "learning_rate": 8.942621280065227e-06, + "loss": 6.1401, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 0.6531774997711182, + "learning_rate": 8.9421966299769e-06, + "loss": 5.8043, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 0.5440756678581238, + "learning_rate": 8.941771979888573e-06, + "loss": 5.8258, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 0.5890042185783386, + "learning_rate": 8.941347329800246e-06, + "loss": 5.8303, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 0.59710693359375, + "learning_rate": 8.940922679711919e-06, + "loss": 5.6201, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 0.8709649443626404, + "learning_rate": 8.940498029623591e-06, + "loss": 5.7744, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 0.5549447536468506, + "learning_rate": 8.940073379535264e-06, + "loss": 5.7012, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 0.5352239012718201, + "learning_rate": 8.939648729446937e-06, + "loss": 5.6362, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 0.795897901058197, + "learning_rate": 8.93922407935861e-06, + "loss": 5.8215, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 0.663456380367279, + "learning_rate": 8.938799429270283e-06, + "loss": 5.7084, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 0.7632206678390503, + "learning_rate": 8.938374779181955e-06, + "loss": 5.8875, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 0.6352850198745728, + "learning_rate": 8.937950129093628e-06, + "loss": 5.8201, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 0.5371953248977661, + "learning_rate": 8.937525479005301e-06, + "loss": 5.7165, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 0.6408119201660156, + "learning_rate": 8.937100828916974e-06, + "loss": 5.7999, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 0.6297529339790344, + "learning_rate": 8.936676178828647e-06, + "loss": 5.5837, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 0.6220170855522156, + "learning_rate": 8.936251528740318e-06, + "loss": 6.0536, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 0.6675834655761719, + "learning_rate": 8.935826878651992e-06, + "loss": 5.8203, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 0.6738823652267456, + "learning_rate": 8.935402228563665e-06, + "loss": 5.8115, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 0.6538853645324707, + "learning_rate": 8.934977578475336e-06, + "loss": 6.0004, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 0.8825609683990479, + "learning_rate": 8.93455292838701e-06, + "loss": 5.7845, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 0.7448657751083374, + "learning_rate": 8.934128278298683e-06, + "loss": 5.9796, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 0.6446197628974915, + "learning_rate": 8.933703628210354e-06, + "loss": 5.6776, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 0.6078523397445679, + "learning_rate": 8.933278978122029e-06, + "loss": 5.7943, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 0.7468020915985107, + "learning_rate": 8.932854328033702e-06, + "loss": 5.7785, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 0.6008862853050232, + "learning_rate": 8.932429677945373e-06, + "loss": 5.9407, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 0.6933932304382324, + "learning_rate": 8.932005027857047e-06, + "loss": 5.7706, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 0.6427901983261108, + "learning_rate": 8.93158037776872e-06, + "loss": 5.5599, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 0.5342215299606323, + "learning_rate": 8.931155727680391e-06, + "loss": 5.7602, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 0.5733574628829956, + "learning_rate": 8.930731077592066e-06, + "loss": 5.7826, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 0.6290380954742432, + "learning_rate": 8.930306427503737e-06, + "loss": 5.8932, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 0.7041369676589966, + "learning_rate": 8.92988177741541e-06, + "loss": 5.7207, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 0.5377005338668823, + "learning_rate": 8.929457127327084e-06, + "loss": 5.6167, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 0.5742687582969666, + "learning_rate": 8.929032477238755e-06, + "loss": 5.8103, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 0.6619075536727905, + "learning_rate": 8.928607827150428e-06, + "loss": 6.0095, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 0.6098832488059998, + "learning_rate": 8.928183177062103e-06, + "loss": 6.1286, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 0.6601646542549133, + "learning_rate": 8.927758526973774e-06, + "loss": 5.7558, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 0.7281644940376282, + "learning_rate": 8.927333876885446e-06, + "loss": 5.5034, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 0.6181915998458862, + "learning_rate": 8.926909226797121e-06, + "loss": 5.8359, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 0.544409990310669, + "learning_rate": 8.926484576708792e-06, + "loss": 5.9745, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 0.6987597942352295, + "learning_rate": 8.926059926620465e-06, + "loss": 5.6683, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 0.6578654050827026, + "learning_rate": 8.92563527653214e-06, + "loss": 5.7577, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 0.6254228949546814, + "learning_rate": 8.92521062644381e-06, + "loss": 5.5654, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 0.5487316846847534, + "learning_rate": 8.924785976355483e-06, + "loss": 5.8621, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 0.7041690349578857, + "learning_rate": 8.924361326267158e-06, + "loss": 5.5579, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 0.5912628769874573, + "learning_rate": 8.923936676178829e-06, + "loss": 5.5621, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 0.6123119592666626, + "learning_rate": 8.923512026090502e-06, + "loss": 5.7638, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 0.6875603199005127, + "learning_rate": 8.923087376002174e-06, + "loss": 5.7905, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 0.7013183832168579, + "learning_rate": 8.922662725913847e-06, + "loss": 5.568, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 0.5694549083709717, + "learning_rate": 8.92223807582552e-06, + "loss": 6.0245, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 0.6802852749824524, + "learning_rate": 8.921813425737193e-06, + "loss": 5.7443, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.5958121418952942, + "learning_rate": 8.921388775648866e-06, + "loss": 6.048, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 0.6553846001625061, + "learning_rate": 8.920964125560538e-06, + "loss": 5.4984, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 0.602883517742157, + "learning_rate": 8.920539475472211e-06, + "loss": 5.929, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 0.6267375349998474, + "learning_rate": 8.920114825383884e-06, + "loss": 5.9949, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 0.5783858895301819, + "learning_rate": 8.919690175295557e-06, + "loss": 5.8465, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 0.5931342840194702, + "learning_rate": 8.91926552520723e-06, + "loss": 5.7209, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 0.7451056241989136, + "learning_rate": 8.918840875118902e-06, + "loss": 5.8596, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 0.7350187301635742, + "learning_rate": 8.918416225030575e-06, + "loss": 5.9719, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 0.5860458016395569, + "learning_rate": 8.917991574942248e-06, + "loss": 5.8099, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 0.579587459564209, + "learning_rate": 8.917566924853921e-06, + "loss": 5.8719, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 0.6262181401252747, + "learning_rate": 8.917142274765594e-06, + "loss": 5.7252, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 0.5586969256401062, + "learning_rate": 8.916717624677266e-06, + "loss": 5.7844, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 0.6137380599975586, + "learning_rate": 8.91629297458894e-06, + "loss": 5.5165, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 0.6778090000152588, + "learning_rate": 8.915868324500612e-06, + "loss": 5.9063, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 0.5135176777839661, + "learning_rate": 8.915443674412285e-06, + "loss": 6.0446, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 0.5467513799667358, + "learning_rate": 8.915019024323958e-06, + "loss": 5.7618, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 0.5362313389778137, + "learning_rate": 8.91459437423563e-06, + "loss": 5.8083, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 0.6835183501243591, + "learning_rate": 8.914169724147303e-06, + "loss": 5.7777, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 0.6062974333763123, + "learning_rate": 8.913745074058976e-06, + "loss": 5.518, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 0.5831831097602844, + "learning_rate": 8.913320423970649e-06, + "loss": 5.8247, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 0.47526606917381287, + "learning_rate": 8.912895773882322e-06, + "loss": 5.9593, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 0.6104617118835449, + "learning_rate": 8.912471123793994e-06, + "loss": 5.839, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 0.7097402811050415, + "learning_rate": 8.912046473705667e-06, + "loss": 5.5388, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 0.7356839179992676, + "learning_rate": 8.91162182361734e-06, + "loss": 5.6636, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 0.7501333355903625, + "learning_rate": 8.911197173529013e-06, + "loss": 5.9977, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 0.6655356884002686, + "learning_rate": 8.910772523440686e-06, + "loss": 5.7668, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 0.5394406914710999, + "learning_rate": 8.910347873352358e-06, + "loss": 5.4924, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 0.6703310608863831, + "learning_rate": 8.909923223264031e-06, + "loss": 5.5711, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 0.6051893830299377, + "learning_rate": 8.909498573175704e-06, + "loss": 5.925, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 0.5634622573852539, + "learning_rate": 8.909073923087377e-06, + "loss": 5.7174, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 0.5881903767585754, + "learning_rate": 8.90864927299905e-06, + "loss": 5.7221, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 0.6805367469787598, + "learning_rate": 8.908224622910722e-06, + "loss": 5.7256, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 0.5399341583251953, + "learning_rate": 8.907799972822395e-06, + "loss": 5.8738, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 0.9071983695030212, + "learning_rate": 8.907375322734068e-06, + "loss": 5.5717, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 0.6323657035827637, + "learning_rate": 8.906950672645741e-06, + "loss": 5.6554, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 0.5378315448760986, + "learning_rate": 8.906526022557414e-06, + "loss": 5.8667, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 0.6768762469291687, + "learning_rate": 8.906101372469086e-06, + "loss": 5.702, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 0.6849345564842224, + "learning_rate": 8.90567672238076e-06, + "loss": 5.554, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 0.6182795166969299, + "learning_rate": 8.905252072292432e-06, + "loss": 5.8279, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 0.5469078421592712, + "learning_rate": 8.904827422204105e-06, + "loss": 5.7831, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 0.5721598863601685, + "learning_rate": 8.904402772115778e-06, + "loss": 5.5836, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 0.8676870465278625, + "learning_rate": 8.90397812202745e-06, + "loss": 5.8398, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 0.6195288300514221, + "learning_rate": 8.903553471939123e-06, + "loss": 5.4784, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 0.7082779407501221, + "learning_rate": 8.903128821850796e-06, + "loss": 5.6325, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 0.744795560836792, + "learning_rate": 8.902704171762469e-06, + "loss": 5.6162, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 0.7552177309989929, + "learning_rate": 8.902279521674142e-06, + "loss": 5.7394, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 0.8127071857452393, + "learning_rate": 8.901854871585814e-06, + "loss": 5.7867, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 0.6278104186058044, + "learning_rate": 8.901430221497487e-06, + "loss": 6.1263, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 0.6033484935760498, + "learning_rate": 8.901005571409158e-06, + "loss": 5.4896, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 0.5658538937568665, + "learning_rate": 8.900580921320833e-06, + "loss": 5.492, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 0.6170830726623535, + "learning_rate": 8.900156271232506e-06, + "loss": 5.8899, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 0.6028494834899902, + "learning_rate": 8.899731621144177e-06, + "loss": 5.6637, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 0.6115996241569519, + "learning_rate": 8.899306971055851e-06, + "loss": 5.646, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 0.688993513584137, + "learning_rate": 8.898882320967524e-06, + "loss": 5.7502, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 0.593826949596405, + "learning_rate": 8.898457670879195e-06, + "loss": 5.5696, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 0.5789108276367188, + "learning_rate": 8.89803302079087e-06, + "loss": 5.8168, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 0.6421683430671692, + "learning_rate": 8.897608370702542e-06, + "loss": 5.7939, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 0.9810996055603027, + "learning_rate": 8.897183720614214e-06, + "loss": 5.702, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 0.6604261994361877, + "learning_rate": 8.896759070525888e-06, + "loss": 5.8244, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 0.8080747723579407, + "learning_rate": 8.896334420437561e-06, + "loss": 5.7216, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 0.5905890464782715, + "learning_rate": 8.895909770349232e-06, + "loss": 5.6421, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 0.5172755122184753, + "learning_rate": 8.895485120260906e-06, + "loss": 5.6528, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 0.5067132115364075, + "learning_rate": 8.89506047017258e-06, + "loss": 5.5897, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 0.5963840484619141, + "learning_rate": 8.89463582008425e-06, + "loss": 5.6343, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 0.6578786969184875, + "learning_rate": 8.894211169995925e-06, + "loss": 5.8029, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 0.7161023616790771, + "learning_rate": 8.893786519907596e-06, + "loss": 5.7995, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 0.7388761639595032, + "learning_rate": 8.893361869819269e-06, + "loss": 5.9425, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 0.5714343786239624, + "learning_rate": 8.892937219730943e-06, + "loss": 5.3535, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 0.6306921243667603, + "learning_rate": 8.892512569642614e-06, + "loss": 5.6179, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 0.7199556231498718, + "learning_rate": 8.892087919554287e-06, + "loss": 5.7421, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 0.649043619632721, + "learning_rate": 8.891663269465962e-06, + "loss": 5.6677, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 0.8185535073280334, + "learning_rate": 8.891238619377633e-06, + "loss": 5.8711, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 0.5483009219169617, + "learning_rate": 8.890813969289306e-06, + "loss": 5.662, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 0.6243484020233154, + "learning_rate": 8.89038931920098e-06, + "loss": 5.7099, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 0.6036044359207153, + "learning_rate": 8.889964669112651e-06, + "loss": 5.8734, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 0.7920618653297424, + "learning_rate": 8.889540019024324e-06, + "loss": 5.5796, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 0.6948553323745728, + "learning_rate": 8.889115368935999e-06, + "loss": 5.7799, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 0.6291694045066833, + "learning_rate": 8.88869071884767e-06, + "loss": 5.7633, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 0.7240245342254639, + "learning_rate": 8.888266068759342e-06, + "loss": 5.5684, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 0.6280462741851807, + "learning_rate": 8.887841418671015e-06, + "loss": 5.6786, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 0.5998950004577637, + "learning_rate": 8.887416768582688e-06, + "loss": 5.6958, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 0.7153200507164001, + "learning_rate": 8.88699211849436e-06, + "loss": 5.914, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 0.6025747656822205, + "learning_rate": 8.886567468406034e-06, + "loss": 5.6726, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 0.6425291895866394, + "learning_rate": 8.886142818317706e-06, + "loss": 5.8592, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 0.5895359516143799, + "learning_rate": 8.885718168229381e-06, + "loss": 5.5864, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 0.7164894938468933, + "learning_rate": 8.885293518141052e-06, + "loss": 5.742, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 0.6395880579948425, + "learning_rate": 8.884868868052725e-06, + "loss": 5.7125, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 0.5517097115516663, + "learning_rate": 8.8844442179644e-06, + "loss": 5.7462, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 0.6469339728355408, + "learning_rate": 8.88401956787607e-06, + "loss": 5.7541, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 0.7165513634681702, + "learning_rate": 8.883594917787743e-06, + "loss": 5.4559, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 0.5127335786819458, + "learning_rate": 8.883170267699418e-06, + "loss": 6.1606, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 0.7536998391151428, + "learning_rate": 8.882745617611089e-06, + "loss": 5.7202, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 0.5773508548736572, + "learning_rate": 8.882320967522762e-06, + "loss": 5.4439, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 0.6164292097091675, + "learning_rate": 8.881896317434434e-06, + "loss": 5.3839, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 0.6766324043273926, + "learning_rate": 8.881556597363773e-06, + "loss": 5.7938, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 0.6856294274330139, + "learning_rate": 8.881131947275446e-06, + "loss": 5.7004, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 0.7194755673408508, + "learning_rate": 8.880707297187119e-06, + "loss": 5.8866, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 0.6141719222068787, + "learning_rate": 8.880282647098792e-06, + "loss": 5.4147, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 0.624872088432312, + "learning_rate": 8.879857997010463e-06, + "loss": 5.9477, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.5555028319358826, + "learning_rate": 8.879433346922137e-06, + "loss": 5.895, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 0.5417369604110718, + "learning_rate": 8.87900869683381e-06, + "loss": 5.6748, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 0.5448525547981262, + "learning_rate": 8.878584046745481e-06, + "loss": 5.8691, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 0.6937718987464905, + "learning_rate": 8.878159396657156e-06, + "loss": 5.6233, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 0.6090360283851624, + "learning_rate": 8.877734746568829e-06, + "loss": 5.6536, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 0.6543543338775635, + "learning_rate": 8.8773100964805e-06, + "loss": 5.9587, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 0.6502128839492798, + "learning_rate": 8.876885446392174e-06, + "loss": 5.8233, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 0.6688170433044434, + "learning_rate": 8.876460796303847e-06, + "loss": 5.5686, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 0.780290961265564, + "learning_rate": 8.876036146215518e-06, + "loss": 5.7866, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 0.6230904459953308, + "learning_rate": 8.875611496127193e-06, + "loss": 5.4634, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 0.5802576541900635, + "learning_rate": 8.875186846038865e-06, + "loss": 5.5408, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 0.6071666479110718, + "learning_rate": 8.874762195950536e-06, + "loss": 5.6339, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 0.6407798528671265, + "learning_rate": 8.874337545862211e-06, + "loss": 5.6056, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 0.6509234309196472, + "learning_rate": 8.873912895773884e-06, + "loss": 5.9042, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 0.6748442053794861, + "learning_rate": 8.873488245685555e-06, + "loss": 5.8004, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 0.7255833745002747, + "learning_rate": 8.87306359559723e-06, + "loss": 5.6561, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 0.6929676532745361, + "learning_rate": 8.8726389455089e-06, + "loss": 5.9132, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 0.5328826308250427, + "learning_rate": 8.872214295420573e-06, + "loss": 5.6279, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 0.6504362225532532, + "learning_rate": 8.871789645332248e-06, + "loss": 5.6006, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 0.5942969918251038, + "learning_rate": 8.871364995243919e-06, + "loss": 5.811, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 0.7943723201751709, + "learning_rate": 8.870940345155592e-06, + "loss": 5.7223, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 0.6282886266708374, + "learning_rate": 8.870515695067266e-06, + "loss": 5.6613, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 0.6144717335700989, + "learning_rate": 8.870091044978937e-06, + "loss": 5.8707, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 0.5696581602096558, + "learning_rate": 8.86966639489061e-06, + "loss": 5.7162, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 0.5210216045379639, + "learning_rate": 8.869241744802285e-06, + "loss": 5.5225, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 0.686158299446106, + "learning_rate": 8.868817094713956e-06, + "loss": 5.6077, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 0.6612853407859802, + "learning_rate": 8.86839244462563e-06, + "loss": 5.6049, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 0.6463434100151062, + "learning_rate": 8.867967794537303e-06, + "loss": 5.7996, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 0.5724498629570007, + "learning_rate": 8.867543144448974e-06, + "loss": 5.8982, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 0.4697348475456238, + "learning_rate": 8.867118494360649e-06, + "loss": 5.517, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 0.5711299180984497, + "learning_rate": 8.86669384427232e-06, + "loss": 5.8021, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 0.8129385709762573, + "learning_rate": 8.866269194183992e-06, + "loss": 5.6045, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 0.570567786693573, + "learning_rate": 8.865844544095667e-06, + "loss": 5.533, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 0.8076724410057068, + "learning_rate": 8.865419894007338e-06, + "loss": 5.5961, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 0.6159887909889221, + "learning_rate": 8.864995243919011e-06, + "loss": 5.7808, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 0.555792510509491, + "learning_rate": 8.864570593830685e-06, + "loss": 5.4553, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.4699522852897644, + "learning_rate": 8.864145943742356e-06, + "loss": 5.5777, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 0.5475332736968994, + "learning_rate": 8.86372129365403e-06, + "loss": 5.635, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 0.6554783582687378, + "learning_rate": 8.863296643565704e-06, + "loss": 5.8874, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 0.6018770933151245, + "learning_rate": 8.862871993477375e-06, + "loss": 5.5595, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 0.638333797454834, + "learning_rate": 8.862447343389048e-06, + "loss": 5.4179, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 0.6013758778572083, + "learning_rate": 8.862022693300722e-06, + "loss": 5.8635, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 0.5036157965660095, + "learning_rate": 8.861598043212393e-06, + "loss": 5.5002, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 0.5549906492233276, + "learning_rate": 8.861173393124066e-06, + "loss": 5.6053, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 0.6020652651786804, + "learning_rate": 8.860748743035739e-06, + "loss": 5.8828, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 0.7156270146369934, + "learning_rate": 8.860324092947412e-06, + "loss": 5.7546, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 0.694347620010376, + "learning_rate": 8.859899442859084e-06, + "loss": 5.8308, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 0.692825973033905, + "learning_rate": 8.859474792770757e-06, + "loss": 5.5953, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 0.6445220708847046, + "learning_rate": 8.85905014268243e-06, + "loss": 5.5477, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 0.6497980356216431, + "learning_rate": 8.858625492594103e-06, + "loss": 5.7108, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 0.5861636996269226, + "learning_rate": 8.858200842505776e-06, + "loss": 5.462, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 0.5492913722991943, + "learning_rate": 8.857776192417448e-06, + "loss": 5.5275, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 0.7675657272338867, + "learning_rate": 8.857351542329121e-06, + "loss": 5.6777, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 0.5590558648109436, + "learning_rate": 8.856926892240794e-06, + "loss": 5.9111, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 0.5641390085220337, + "learning_rate": 8.856502242152467e-06, + "loss": 5.6879, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 0.5698555707931519, + "learning_rate": 8.85607759206414e-06, + "loss": 5.6423, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 0.6575483679771423, + "learning_rate": 8.855652941975813e-06, + "loss": 5.4481, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 0.6156527996063232, + "learning_rate": 8.855228291887485e-06, + "loss": 5.5928, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 0.513673722743988, + "learning_rate": 8.854803641799158e-06, + "loss": 5.6594, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 0.652374267578125, + "learning_rate": 8.854378991710831e-06, + "loss": 5.8074, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.6255156397819519, + "learning_rate": 8.853954341622504e-06, + "loss": 5.6083, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 0.5112987756729126, + "learning_rate": 8.853529691534177e-06, + "loss": 5.9366, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 0.6585701107978821, + "learning_rate": 8.85310504144585e-06, + "loss": 5.6718, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 0.6006505489349365, + "learning_rate": 8.852680391357522e-06, + "loss": 5.19, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 0.842532753944397, + "learning_rate": 8.852255741269195e-06, + "loss": 5.8041, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 0.6174010634422302, + "learning_rate": 8.851831091180868e-06, + "loss": 5.489, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 0.7118439674377441, + "learning_rate": 8.85140644109254e-06, + "loss": 5.1943, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 0.6511855721473694, + "learning_rate": 8.850981791004213e-06, + "loss": 5.6193, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 0.5601558089256287, + "learning_rate": 8.850557140915886e-06, + "loss": 5.6885, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 0.8434257507324219, + "learning_rate": 8.850132490827559e-06, + "loss": 5.7257, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 0.6028275489807129, + "learning_rate": 8.849707840739232e-06, + "loss": 5.6743, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 0.5971229076385498, + "learning_rate": 8.849283190650905e-06, + "loss": 5.6635, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 0.7451720833778381, + "learning_rate": 8.848858540562577e-06, + "loss": 5.6573, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 0.7671144604682922, + "learning_rate": 8.84843389047425e-06, + "loss": 5.7069, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 0.5735886693000793, + "learning_rate": 8.848009240385923e-06, + "loss": 5.7508, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 0.5721925497055054, + "learning_rate": 8.847584590297596e-06, + "loss": 5.5654, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 0.5542433261871338, + "learning_rate": 8.847159940209269e-06, + "loss": 5.5316, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 0.7092092037200928, + "learning_rate": 8.846735290120941e-06, + "loss": 5.539, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 0.6068519949913025, + "learning_rate": 8.846310640032614e-06, + "loss": 5.72, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 0.4933003783226013, + "learning_rate": 8.845885989944287e-06, + "loss": 5.6971, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 0.5917801260948181, + "learning_rate": 8.84546133985596e-06, + "loss": 5.4088, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 0.5524001717567444, + "learning_rate": 8.845036689767633e-06, + "loss": 5.5771, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 0.5699480772018433, + "learning_rate": 8.844612039679305e-06, + "loss": 5.7877, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 0.5046672224998474, + "learning_rate": 8.844187389590978e-06, + "loss": 5.702, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 0.707632839679718, + "learning_rate": 8.843762739502651e-06, + "loss": 5.7458, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 0.5839803814888, + "learning_rate": 8.843338089414322e-06, + "loss": 5.777, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 0.47856202721595764, + "learning_rate": 8.842913439325997e-06, + "loss": 5.4857, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 0.6171690821647644, + "learning_rate": 8.84248878923767e-06, + "loss": 5.6529, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 0.7039323449134827, + "learning_rate": 8.84206413914934e-06, + "loss": 5.5423, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 0.6625202894210815, + "learning_rate": 8.841639489061015e-06, + "loss": 5.7182, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 0.5865031480789185, + "learning_rate": 8.841214838972688e-06, + "loss": 5.7641, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 0.5817753076553345, + "learning_rate": 8.840790188884359e-06, + "loss": 5.562, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 0.49862170219421387, + "learning_rate": 8.840365538796033e-06, + "loss": 5.6857, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 0.5560623407363892, + "learning_rate": 8.839940888707706e-06, + "loss": 5.3682, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 0.6941109299659729, + "learning_rate": 8.839516238619379e-06, + "loss": 5.5711, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 0.5571900010108948, + "learning_rate": 8.839091588531052e-06, + "loss": 5.8012, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 0.6909985542297363, + "learning_rate": 8.838666938442725e-06, + "loss": 5.4677, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 0.6298333406448364, + "learning_rate": 8.838242288354397e-06, + "loss": 5.3727, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 0.5916708111763, + "learning_rate": 8.83781763826607e-06, + "loss": 5.4369, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 0.6865996718406677, + "learning_rate": 8.837392988177741e-06, + "loss": 5.605, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 0.5989307761192322, + "learning_rate": 8.836968338089416e-06, + "loss": 5.6822, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 0.6759678721427917, + "learning_rate": 8.836543688001089e-06, + "loss": 5.7033, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 0.49612605571746826, + "learning_rate": 8.83611903791276e-06, + "loss": 5.4375, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 0.497571736574173, + "learning_rate": 8.835694387824434e-06, + "loss": 5.4178, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 0.6228163242340088, + "learning_rate": 8.835269737736107e-06, + "loss": 5.3847, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 0.9073172211647034, + "learning_rate": 8.834845087647778e-06, + "loss": 5.4775, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 0.5845047831535339, + "learning_rate": 8.834420437559453e-06, + "loss": 5.5077, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 0.6198378205299377, + "learning_rate": 8.833995787471125e-06, + "loss": 5.7491, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 0.5339580774307251, + "learning_rate": 8.833571137382796e-06, + "loss": 5.4098, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 0.5764421820640564, + "learning_rate": 8.833146487294471e-06, + "loss": 5.7401, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 0.6808501482009888, + "learning_rate": 8.832721837206144e-06, + "loss": 5.861, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 0.5076799988746643, + "learning_rate": 8.832297187117815e-06, + "loss": 5.5801, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 0.7049991488456726, + "learning_rate": 8.83187253702949e-06, + "loss": 5.7414, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 0.5608775615692139, + "learning_rate": 8.83144788694116e-06, + "loss": 5.4723, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 0.5530602931976318, + "learning_rate": 8.831023236852833e-06, + "loss": 5.5175, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 0.7097817659378052, + "learning_rate": 8.830598586764508e-06, + "loss": 5.6293, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 0.7309393882751465, + "learning_rate": 8.830173936676179e-06, + "loss": 5.7093, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 0.7498119473457336, + "learning_rate": 8.829749286587852e-06, + "loss": 5.5434, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 0.4344838261604309, + "learning_rate": 8.829324636499526e-06, + "loss": 5.4504, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 0.6622368097305298, + "learning_rate": 8.828899986411197e-06, + "loss": 5.647, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 0.6214677691459656, + "learning_rate": 8.82847533632287e-06, + "loss": 5.4469, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 0.6154429316520691, + "learning_rate": 8.828050686234545e-06, + "loss": 5.5978, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 0.5437399744987488, + "learning_rate": 8.827626036146216e-06, + "loss": 5.559, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 0.6523188948631287, + "learning_rate": 8.827201386057888e-06, + "loss": 5.6157, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 0.6177864074707031, + "learning_rate": 8.826776735969563e-06, + "loss": 5.658, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 0.7815133929252625, + "learning_rate": 8.826352085881234e-06, + "loss": 5.4003, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 0.6894827485084534, + "learning_rate": 8.825927435792907e-06, + "loss": 5.8776, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 0.6089897155761719, + "learning_rate": 8.825502785704581e-06, + "loss": 5.2992, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 0.5665061473846436, + "learning_rate": 8.825078135616252e-06, + "loss": 5.4353, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 0.5308881998062134, + "learning_rate": 8.824653485527925e-06, + "loss": 5.5248, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 0.5614994168281555, + "learning_rate": 8.824228835439598e-06, + "loss": 5.544, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 0.5497748255729675, + "learning_rate": 8.823804185351271e-06, + "loss": 5.6635, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.5838170647621155, + "learning_rate": 8.823379535262944e-06, + "loss": 5.3245, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 0.877261221408844, + "learning_rate": 8.822954885174616e-06, + "loss": 5.6626, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 0.6991741061210632, + "learning_rate": 8.82253023508629e-06, + "loss": 5.6186, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 0.6869246363639832, + "learning_rate": 8.822105584997962e-06, + "loss": 5.6665, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 0.6262366771697998, + "learning_rate": 8.821680934909635e-06, + "loss": 5.4986, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 0.6502242684364319, + "learning_rate": 8.821256284821308e-06, + "loss": 5.5306, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 0.5807626247406006, + "learning_rate": 8.82083163473298e-06, + "loss": 5.525, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 0.5858907699584961, + "learning_rate": 8.820406984644653e-06, + "loss": 5.6573, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 0.6805034279823303, + "learning_rate": 8.819982334556326e-06, + "loss": 5.5569, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 0.5117987394332886, + "learning_rate": 8.819557684467999e-06, + "loss": 5.5483, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 0.6540390253067017, + "learning_rate": 8.819133034379672e-06, + "loss": 5.6692, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 0.672282338142395, + "learning_rate": 8.818708384291344e-06, + "loss": 5.6332, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 0.6815195679664612, + "learning_rate": 8.818283734203017e-06, + "loss": 5.7386, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 0.4960305690765381, + "learning_rate": 8.81785908411469e-06, + "loss": 5.5698, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 0.6239379644393921, + "learning_rate": 8.817434434026363e-06, + "loss": 5.7101, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 0.6866806745529175, + "learning_rate": 8.817009783938036e-06, + "loss": 5.5456, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 0.6890297532081604, + "learning_rate": 8.816585133849708e-06, + "loss": 5.9329, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 0.5269648432731628, + "learning_rate": 8.816160483761381e-06, + "loss": 5.3841, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 0.6642200946807861, + "learning_rate": 8.815735833673054e-06, + "loss": 5.3883, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 0.5801568031311035, + "learning_rate": 8.815311183584727e-06, + "loss": 5.5854, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 0.5775591135025024, + "learning_rate": 8.8148865334964e-06, + "loss": 5.3588, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 0.5386878848075867, + "learning_rate": 8.814461883408072e-06, + "loss": 5.2583, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 0.6425462365150452, + "learning_rate": 8.814037233319745e-06, + "loss": 5.8158, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 0.7788015007972717, + "learning_rate": 8.813612583231418e-06, + "loss": 5.7202, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 0.64101642370224, + "learning_rate": 8.813187933143091e-06, + "loss": 5.6039, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 0.6940484046936035, + "learning_rate": 8.812763283054764e-06, + "loss": 5.5543, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 0.6625894904136658, + "learning_rate": 8.812338632966436e-06, + "loss": 5.4656, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 0.7406980395317078, + "learning_rate": 8.81191398287811e-06, + "loss": 5.6932, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 0.5795179009437561, + "learning_rate": 8.811489332789782e-06, + "loss": 5.766, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 0.5599305629730225, + "learning_rate": 8.811064682701455e-06, + "loss": 5.1524, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 0.5861132144927979, + "learning_rate": 8.810640032613128e-06, + "loss": 5.2826, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 0.6500221490859985, + "learning_rate": 8.8102153825248e-06, + "loss": 5.5161, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 0.4924067258834839, + "learning_rate": 8.809790732436473e-06, + "loss": 5.3576, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.59602952003479, + "learning_rate": 8.809366082348146e-06, + "loss": 5.5874, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 0.5933939218521118, + "learning_rate": 8.808941432259819e-06, + "loss": 5.4847, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 0.8004318475723267, + "learning_rate": 8.808516782171492e-06, + "loss": 5.6468, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 0.5800765156745911, + "learning_rate": 8.808092132083164e-06, + "loss": 5.6417, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 0.7241641879081726, + "learning_rate": 8.807667481994837e-06, + "loss": 5.5753, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 0.5193501114845276, + "learning_rate": 8.80724283190651e-06, + "loss": 5.7376, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 0.604332447052002, + "learning_rate": 8.806818181818183e-06, + "loss": 5.6853, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 0.645780622959137, + "learning_rate": 8.806393531729856e-06, + "loss": 5.305, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 0.7534486055374146, + "learning_rate": 8.805968881641528e-06, + "loss": 5.6446, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 0.5357062816619873, + "learning_rate": 8.805544231553201e-06, + "loss": 5.6398, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 0.5978732109069824, + "learning_rate": 8.805119581464874e-06, + "loss": 5.3664, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 0.8021615147590637, + "learning_rate": 8.804694931376547e-06, + "loss": 5.6012, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 0.5958462357521057, + "learning_rate": 8.80427028128822e-06, + "loss": 5.3778, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 0.6244257688522339, + "learning_rate": 8.803845631199893e-06, + "loss": 5.6111, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 0.5509819388389587, + "learning_rate": 8.803420981111565e-06, + "loss": 5.6477, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 0.6252068281173706, + "learning_rate": 8.802996331023238e-06, + "loss": 5.2502, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 0.7196452617645264, + "learning_rate": 8.802571680934911e-06, + "loss": 5.2957, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 0.6822369694709778, + "learning_rate": 8.802147030846582e-06, + "loss": 5.6094, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 0.5380411744117737, + "learning_rate": 8.801722380758257e-06, + "loss": 5.5891, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 0.5657706260681152, + "learning_rate": 8.80129773066993e-06, + "loss": 5.8178, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 0.706994354724884, + "learning_rate": 8.8008730805816e-06, + "loss": 5.6242, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 0.6714161038398743, + "learning_rate": 8.800448430493275e-06, + "loss": 5.4438, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 0.5400075316429138, + "learning_rate": 8.800023780404948e-06, + "loss": 5.338, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 0.6471081972122192, + "learning_rate": 8.799599130316619e-06, + "loss": 5.6755, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 0.5219268202781677, + "learning_rate": 8.799174480228293e-06, + "loss": 5.3037, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 0.5125110149383545, + "learning_rate": 8.798749830139966e-06, + "loss": 5.4233, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 0.5307828187942505, + "learning_rate": 8.798325180051637e-06, + "loss": 5.3529, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 0.5776136517524719, + "learning_rate": 8.797900529963312e-06, + "loss": 5.6007, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 0.5518404245376587, + "learning_rate": 8.797475879874985e-06, + "loss": 5.4493, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 0.7888116240501404, + "learning_rate": 8.797051229786656e-06, + "loss": 5.6559, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 0.6102878451347351, + "learning_rate": 8.79662657969833e-06, + "loss": 5.4008, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 0.8542807102203369, + "learning_rate": 8.796201929610003e-06, + "loss": 5.4951, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 0.7857166528701782, + "learning_rate": 8.795777279521674e-06, + "loss": 5.7337, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.6325708627700806, + "learning_rate": 8.795352629433349e-06, + "loss": 5.3225, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 0.6460001468658447, + "learning_rate": 8.79492797934502e-06, + "loss": 5.5286, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 0.5316886901855469, + "learning_rate": 8.794503329256692e-06, + "loss": 5.3264, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 0.8407277464866638, + "learning_rate": 8.794078679168367e-06, + "loss": 5.562, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 0.5872427821159363, + "learning_rate": 8.793654029080038e-06, + "loss": 5.4779, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 0.6602181196212769, + "learning_rate": 8.79322937899171e-06, + "loss": 5.7673, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 0.5841967463493347, + "learning_rate": 8.792804728903385e-06, + "loss": 5.5598, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 0.5401461124420166, + "learning_rate": 8.792380078815056e-06, + "loss": 5.3802, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 0.5452398657798767, + "learning_rate": 8.79195542872673e-06, + "loss": 5.4131, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 0.6160120368003845, + "learning_rate": 8.791530778638404e-06, + "loss": 5.549, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 0.570662796497345, + "learning_rate": 8.791106128550075e-06, + "loss": 5.3962, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 0.5466603636741638, + "learning_rate": 8.790681478461748e-06, + "loss": 5.3603, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 0.4867437183856964, + "learning_rate": 8.790256828373422e-06, + "loss": 5.6038, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 0.6906166672706604, + "learning_rate": 8.789832178285093e-06, + "loss": 5.5275, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 0.644178032875061, + "learning_rate": 8.789407528196766e-06, + "loss": 5.3909, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 0.6617496609687805, + "learning_rate": 8.788982878108439e-06, + "loss": 5.3166, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 0.8017048239707947, + "learning_rate": 8.788558228020112e-06, + "loss": 5.5512, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 0.6186959147453308, + "learning_rate": 8.788133577931784e-06, + "loss": 5.6469, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 0.7635421752929688, + "learning_rate": 8.787708927843457e-06, + "loss": 5.4434, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 0.5885491967201233, + "learning_rate": 8.78728427775513e-06, + "loss": 5.4136, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 0.5551665425300598, + "learning_rate": 8.786859627666803e-06, + "loss": 5.5151, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 0.5199483036994934, + "learning_rate": 8.786434977578476e-06, + "loss": 5.3642, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 0.5395585894584656, + "learning_rate": 8.786010327490148e-06, + "loss": 5.6105, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 0.5540007948875427, + "learning_rate": 8.785585677401821e-06, + "loss": 5.604, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 0.5137684941291809, + "learning_rate": 8.785161027313494e-06, + "loss": 5.6123, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 0.585589587688446, + "learning_rate": 8.784736377225167e-06, + "loss": 5.4445, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 0.5438791513442993, + "learning_rate": 8.78431172713684e-06, + "loss": 5.3677, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 0.6590330004692078, + "learning_rate": 8.783887077048512e-06, + "loss": 5.2185, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 0.5211653709411621, + "learning_rate": 8.783462426960185e-06, + "loss": 5.3053, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 0.5654427409172058, + "learning_rate": 8.783037776871858e-06, + "loss": 5.4565, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 0.5675336718559265, + "learning_rate": 8.78261312678353e-06, + "loss": 5.5695, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 0.5684579014778137, + "learning_rate": 8.782188476695204e-06, + "loss": 5.6936, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 0.6812212467193604, + "learning_rate": 8.781763826606876e-06, + "loss": 5.4112, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.49184927344322205, + "learning_rate": 8.78133917651855e-06, + "loss": 5.1395, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 0.5991939902305603, + "learning_rate": 8.780914526430222e-06, + "loss": 5.5052, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 0.5908947587013245, + "learning_rate": 8.780489876341895e-06, + "loss": 5.634, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 0.7087668776512146, + "learning_rate": 8.780065226253568e-06, + "loss": 5.645, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 0.6123431921005249, + "learning_rate": 8.77964057616524e-06, + "loss": 5.3219, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 0.5870406031608582, + "learning_rate": 8.779215926076913e-06, + "loss": 5.6726, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 0.7340666651725769, + "learning_rate": 8.778791275988586e-06, + "loss": 5.2649, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 0.5839921832084656, + "learning_rate": 8.778366625900259e-06, + "loss": 5.494, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 0.603860080242157, + "learning_rate": 8.777941975811932e-06, + "loss": 5.5632, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 0.5747798085212708, + "learning_rate": 8.777517325723604e-06, + "loss": 5.4697, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 0.663516104221344, + "learning_rate": 8.777092675635277e-06, + "loss": 5.2098, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 0.5690048933029175, + "learning_rate": 8.77666802554695e-06, + "loss": 5.3996, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 0.610634446144104, + "learning_rate": 8.776243375458623e-06, + "loss": 5.609, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 0.610836386680603, + "learning_rate": 8.775818725370296e-06, + "loss": 5.4412, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 0.46789655089378357, + "learning_rate": 8.775394075281968e-06, + "loss": 5.0408, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 0.623818576335907, + "learning_rate": 8.774969425193641e-06, + "loss": 5.4629, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 0.7160234451293945, + "learning_rate": 8.774544775105314e-06, + "loss": 5.3114, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 0.6514989733695984, + "learning_rate": 8.774120125016987e-06, + "loss": 5.4087, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 0.613371729850769, + "learning_rate": 8.77369547492866e-06, + "loss": 5.4698, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 0.6608002781867981, + "learning_rate": 8.773270824840332e-06, + "loss": 5.3971, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 0.5546053051948547, + "learning_rate": 8.772846174752005e-06, + "loss": 5.5742, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 0.522408664226532, + "learning_rate": 8.772421524663678e-06, + "loss": 5.3536, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 0.6810125112533569, + "learning_rate": 8.771996874575351e-06, + "loss": 5.5567, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 0.648466169834137, + "learning_rate": 8.771572224487024e-06, + "loss": 5.6172, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 0.6718018651008606, + "learning_rate": 8.771147574398696e-06, + "loss": 5.3764, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 0.7017665505409241, + "learning_rate": 8.77072292431037e-06, + "loss": 5.2876, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 0.5425823926925659, + "learning_rate": 8.770298274222042e-06, + "loss": 5.4671, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 0.606766939163208, + "learning_rate": 8.769873624133715e-06, + "loss": 5.5068, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.7422763109207153, + "learning_rate": 8.769448974045388e-06, + "loss": 5.6461, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 0.491193026304245, + "learning_rate": 8.76902432395706e-06, + "loss": 5.58, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 0.5728340744972229, + "learning_rate": 8.768599673868733e-06, + "loss": 5.5713, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 0.7902930974960327, + "learning_rate": 8.768175023780406e-06, + "loss": 5.462, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 0.5819207429885864, + "learning_rate": 8.767750373692079e-06, + "loss": 5.5407, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.6584261655807495, + "learning_rate": 8.767325723603752e-06, + "loss": 5.3298, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 0.559777021408081, + "learning_rate": 8.766901073515424e-06, + "loss": 5.1048, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 0.6624118089675903, + "learning_rate": 8.766476423427097e-06, + "loss": 5.447, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 0.617814838886261, + "learning_rate": 8.76605177333877e-06, + "loss": 5.5877, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 0.7491462826728821, + "learning_rate": 8.765627123250441e-06, + "loss": 5.4799, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 0.49851521849632263, + "learning_rate": 8.765202473162116e-06, + "loss": 5.3376, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 0.6259635090827942, + "learning_rate": 8.764777823073788e-06, + "loss": 5.5526, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 1.0425825119018555, + "learning_rate": 8.76435317298546e-06, + "loss": 5.4561, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 0.49268314242362976, + "learning_rate": 8.763928522897134e-06, + "loss": 5.3728, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 0.5743000507354736, + "learning_rate": 8.763503872808807e-06, + "loss": 5.5568, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 0.5083051323890686, + "learning_rate": 8.763079222720478e-06, + "loss": 5.2958, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 0.6363857984542847, + "learning_rate": 8.762654572632152e-06, + "loss": 5.5815, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 0.5894413590431213, + "learning_rate": 8.762229922543825e-06, + "loss": 5.2993, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 0.6253748536109924, + "learning_rate": 8.761805272455496e-06, + "loss": 5.2061, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 0.4992522895336151, + "learning_rate": 8.761380622367171e-06, + "loss": 5.4637, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 0.6137253642082214, + "learning_rate": 8.760955972278844e-06, + "loss": 5.4168, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 0.49412792921066284, + "learning_rate": 8.760531322190515e-06, + "loss": 5.6527, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 0.5863257050514221, + "learning_rate": 8.76010667210219e-06, + "loss": 5.5515, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 0.5231751203536987, + "learning_rate": 8.75968202201386e-06, + "loss": 5.2814, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 0.7274008989334106, + "learning_rate": 8.759257371925533e-06, + "loss": 5.5147, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 0.599607527256012, + "learning_rate": 8.758832721837208e-06, + "loss": 5.4709, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 0.5375558733940125, + "learning_rate": 8.758408071748879e-06, + "loss": 5.4653, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 0.5619959235191345, + "learning_rate": 8.757983421660552e-06, + "loss": 5.6986, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 0.8035149574279785, + "learning_rate": 8.757558771572226e-06, + "loss": 5.3377, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 0.5306188464164734, + "learning_rate": 8.757134121483897e-06, + "loss": 5.6283, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 0.5911756157875061, + "learning_rate": 8.75670947139557e-06, + "loss": 5.3209, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 0.6238330602645874, + "learning_rate": 8.756284821307244e-06, + "loss": 5.273, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 0.7072955965995789, + "learning_rate": 8.755860171218916e-06, + "loss": 5.4085, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 0.6637516617774963, + "learning_rate": 8.755435521130588e-06, + "loss": 5.192, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 0.7933462858200073, + "learning_rate": 8.755010871042263e-06, + "loss": 5.4462, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 0.720691978931427, + "learning_rate": 8.754586220953934e-06, + "loss": 5.542, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 0.48920145630836487, + "learning_rate": 8.754161570865607e-06, + "loss": 5.4393, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 0.6747395992279053, + "learning_rate": 8.75373692077728e-06, + "loss": 5.6749, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.5961885452270508, + "learning_rate": 8.753312270688952e-06, + "loss": 5.4067, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 0.646711528301239, + "learning_rate": 8.752887620600627e-06, + "loss": 5.36, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 0.6322925090789795, + "learning_rate": 8.752462970512298e-06, + "loss": 5.7081, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 0.5425825715065002, + "learning_rate": 8.75203832042397e-06, + "loss": 5.4588, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 0.7060564756393433, + "learning_rate": 8.751613670335645e-06, + "loss": 5.1567, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 0.6413284540176392, + "learning_rate": 8.751189020247316e-06, + "loss": 5.4427, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 0.6597184538841248, + "learning_rate": 8.75076437015899e-06, + "loss": 5.6843, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_bertscore": { + "f1": 0.7636442824798172, + "precision": 0.7266563582309447, + "recall": 0.8062262918404085 + }, + "eval_bleu_4": 0.001605574187713095, + "eval_exact_match": 0.0, + "eval_loss": 4.88494873046875, + "eval_meteor": 0.01347206986878543, + "eval_rouge": { + "rouge1": 0.024737662830408187, + "rouge2": 0.001105434030744822, + "rougeL": 0.02325929595065118, + "rougeLsum": 0.023292484398575196 + }, + "eval_runtime": 279.1346, + "eval_samples_per_second": 36.968, + "eval_steps_per_second": 4.621, + "step": 14718 + }, + { + "epoch": 1.0001358880282647, + "grad_norm": 0.568225622177124, + "learning_rate": 8.750339720070664e-06, + "loss": 5.4027, + "step": 14720 + }, + { + "epoch": 1.0004756080989265, + "grad_norm": 0.569591760635376, + "learning_rate": 8.749915069982335e-06, + "loss": 5.2827, + "step": 14725 + }, + { + "epoch": 1.0008153281695882, + "grad_norm": 0.733282208442688, + "learning_rate": 8.749490419894008e-06, + "loss": 5.3594, + "step": 14730 + }, + { + "epoch": 1.00115504824025, + "grad_norm": 0.5885306000709534, + "learning_rate": 8.749065769805682e-06, + "loss": 5.3761, + "step": 14735 + }, + { + "epoch": 1.0014947683109119, + "grad_norm": 0.5379327535629272, + "learning_rate": 8.748641119717353e-06, + "loss": 5.5357, + "step": 14740 + }, + { + "epoch": 1.0018344883815735, + "grad_norm": 0.6471124291419983, + "learning_rate": 8.748216469629026e-06, + "loss": 5.5083, + "step": 14745 + }, + { + "epoch": 1.0021742084522354, + "grad_norm": 0.5885576605796814, + "learning_rate": 8.7477918195407e-06, + "loss": 5.3024, + "step": 14750 + }, + { + "epoch": 1.0025139285228972, + "grad_norm": 0.6446468830108643, + "learning_rate": 8.747367169452372e-06, + "loss": 5.0726, + "step": 14755 + }, + { + "epoch": 1.0028536485935589, + "grad_norm": 0.7407423853874207, + "learning_rate": 8.746942519364044e-06, + "loss": 5.5516, + "step": 14760 + }, + { + "epoch": 1.0031933686642207, + "grad_norm": 0.5297294855117798, + "learning_rate": 8.746517869275717e-06, + "loss": 5.4748, + "step": 14765 + }, + { + "epoch": 1.0035330887348826, + "grad_norm": 0.7044637799263, + "learning_rate": 8.74609321918739e-06, + "loss": 5.4588, + "step": 14770 + }, + { + "epoch": 1.0038728088055442, + "grad_norm": 0.5562900304794312, + "learning_rate": 8.745668569099063e-06, + "loss": 5.3589, + "step": 14775 + }, + { + "epoch": 1.004212528876206, + "grad_norm": 0.5983619689941406, + "learning_rate": 8.745243919010736e-06, + "loss": 5.4652, + "step": 14780 + }, + { + "epoch": 1.0045522489468677, + "grad_norm": 0.6071609258651733, + "learning_rate": 8.744819268922408e-06, + "loss": 5.36, + "step": 14785 + }, + { + "epoch": 1.0048919690175295, + "grad_norm": 0.601712167263031, + "learning_rate": 8.744394618834081e-06, + "loss": 5.4509, + "step": 14790 + }, + { + "epoch": 1.0052316890881914, + "grad_norm": 0.6922879219055176, + "learning_rate": 8.743969968745754e-06, + "loss": 5.472, + "step": 14795 + }, + { + "epoch": 1.005571409158853, + "grad_norm": 0.5834575295448303, + "learning_rate": 8.743545318657427e-06, + "loss": 5.5832, + "step": 14800 + }, + { + "epoch": 1.0059111292295149, + "grad_norm": 0.6081736087799072, + "learning_rate": 8.7431206685691e-06, + "loss": 5.5043, + "step": 14805 + }, + { + "epoch": 1.0062508493001767, + "grad_norm": 0.5582435131072998, + "learning_rate": 8.742696018480772e-06, + "loss": 5.3393, + "step": 14810 + }, + { + "epoch": 1.0065905693708384, + "grad_norm": 0.5167001485824585, + "learning_rate": 8.742271368392445e-06, + "loss": 5.2173, + "step": 14815 + }, + { + "epoch": 1.0069302894415002, + "grad_norm": 0.53880375623703, + "learning_rate": 8.741846718304118e-06, + "loss": 5.6581, + "step": 14820 + }, + { + "epoch": 1.007270009512162, + "grad_norm": 0.7524119019508362, + "learning_rate": 8.74142206821579e-06, + "loss": 5.5346, + "step": 14825 + }, + { + "epoch": 1.0076097295828237, + "grad_norm": 0.7120556831359863, + "learning_rate": 8.740997418127464e-06, + "loss": 5.5403, + "step": 14830 + }, + { + "epoch": 1.0079494496534855, + "grad_norm": 0.5663344264030457, + "learning_rate": 8.740572768039136e-06, + "loss": 5.4954, + "step": 14835 + }, + { + "epoch": 1.0082891697241474, + "grad_norm": 0.6621957421302795, + "learning_rate": 8.74014811795081e-06, + "loss": 5.2573, + "step": 14840 + }, + { + "epoch": 1.008628889794809, + "grad_norm": 0.5214236974716187, + "learning_rate": 8.739723467862482e-06, + "loss": 5.6933, + "step": 14845 + }, + { + "epoch": 1.0089686098654709, + "grad_norm": 0.7567070126533508, + "learning_rate": 8.739298817774155e-06, + "loss": 5.5479, + "step": 14850 + }, + { + "epoch": 1.0093083299361327, + "grad_norm": 0.5273669362068176, + "learning_rate": 8.738874167685828e-06, + "loss": 5.7097, + "step": 14855 + }, + { + "epoch": 1.0096480500067944, + "grad_norm": 0.633855402469635, + "learning_rate": 8.7384495175975e-06, + "loss": 5.3142, + "step": 14860 + }, + { + "epoch": 1.0099877700774562, + "grad_norm": 0.6607436537742615, + "learning_rate": 8.738024867509173e-06, + "loss": 5.7372, + "step": 14865 + }, + { + "epoch": 1.0103274901481178, + "grad_norm": 0.5795432329177856, + "learning_rate": 8.737600217420846e-06, + "loss": 5.4387, + "step": 14870 + }, + { + "epoch": 1.0106672102187797, + "grad_norm": 0.7305572628974915, + "learning_rate": 8.737175567332519e-06, + "loss": 5.5425, + "step": 14875 + }, + { + "epoch": 1.0110069302894416, + "grad_norm": 0.6178098917007446, + "learning_rate": 8.736750917244192e-06, + "loss": 5.4547, + "step": 14880 + }, + { + "epoch": 1.0113466503601032, + "grad_norm": 0.6757492423057556, + "learning_rate": 8.736326267155864e-06, + "loss": 5.532, + "step": 14885 + }, + { + "epoch": 1.011686370430765, + "grad_norm": 0.6233121156692505, + "learning_rate": 8.735901617067537e-06, + "loss": 5.6295, + "step": 14890 + }, + { + "epoch": 1.012026090501427, + "grad_norm": 0.4884956181049347, + "learning_rate": 8.73547696697921e-06, + "loss": 5.5875, + "step": 14895 + }, + { + "epoch": 1.0123658105720885, + "grad_norm": 0.5493921637535095, + "learning_rate": 8.735052316890883e-06, + "loss": 5.4257, + "step": 14900 + }, + { + "epoch": 1.0127055306427504, + "grad_norm": 0.47812461853027344, + "learning_rate": 8.734627666802556e-06, + "loss": 5.5813, + "step": 14905 + }, + { + "epoch": 1.0130452507134122, + "grad_norm": 0.4252917170524597, + "learning_rate": 8.734203016714228e-06, + "loss": 5.3394, + "step": 14910 + }, + { + "epoch": 1.0133849707840739, + "grad_norm": 0.729265034198761, + "learning_rate": 8.733778366625901e-06, + "loss": 5.4828, + "step": 14915 + }, + { + "epoch": 1.0137246908547357, + "grad_norm": 0.675237238407135, + "learning_rate": 8.733353716537574e-06, + "loss": 5.13, + "step": 14920 + }, + { + "epoch": 1.0140644109253976, + "grad_norm": 0.4846169650554657, + "learning_rate": 8.732929066449247e-06, + "loss": 5.4003, + "step": 14925 + }, + { + "epoch": 1.0144041309960592, + "grad_norm": 0.747042715549469, + "learning_rate": 8.73250441636092e-06, + "loss": 5.5224, + "step": 14930 + }, + { + "epoch": 1.014743851066721, + "grad_norm": 0.6434257626533508, + "learning_rate": 8.732079766272592e-06, + "loss": 5.3851, + "step": 14935 + }, + { + "epoch": 1.015083571137383, + "grad_norm": 0.6329726576805115, + "learning_rate": 8.731655116184265e-06, + "loss": 5.3864, + "step": 14940 + }, + { + "epoch": 1.0154232912080445, + "grad_norm": 0.5686591267585754, + "learning_rate": 8.731230466095938e-06, + "loss": 5.5212, + "step": 14945 + }, + { + "epoch": 1.0157630112787064, + "grad_norm": 0.6051334738731384, + "learning_rate": 8.73080581600761e-06, + "loss": 5.4586, + "step": 14950 + }, + { + "epoch": 1.016102731349368, + "grad_norm": 0.45974743366241455, + "learning_rate": 8.730381165919282e-06, + "loss": 5.4255, + "step": 14955 + }, + { + "epoch": 1.0164424514200299, + "grad_norm": 0.5251497626304626, + "learning_rate": 8.729956515830956e-06, + "loss": 5.2102, + "step": 14960 + }, + { + "epoch": 1.0167821714906917, + "grad_norm": 0.7465431690216064, + "learning_rate": 8.72953186574263e-06, + "loss": 5.5217, + "step": 14965 + }, + { + "epoch": 1.0171218915613534, + "grad_norm": 0.6738865971565247, + "learning_rate": 8.7291072156543e-06, + "loss": 5.1519, + "step": 14970 + }, + { + "epoch": 1.0174616116320152, + "grad_norm": 0.5763686895370483, + "learning_rate": 8.728682565565975e-06, + "loss": 5.4289, + "step": 14975 + }, + { + "epoch": 1.017801331702677, + "grad_norm": 0.5361834764480591, + "learning_rate": 8.728257915477648e-06, + "loss": 5.3471, + "step": 14980 + }, + { + "epoch": 1.0181410517733387, + "grad_norm": 0.6843949556350708, + "learning_rate": 8.727833265389319e-06, + "loss": 5.5378, + "step": 14985 + }, + { + "epoch": 1.0184807718440005, + "grad_norm": 0.7022501230239868, + "learning_rate": 8.727408615300993e-06, + "loss": 5.431, + "step": 14990 + }, + { + "epoch": 1.0188204919146624, + "grad_norm": 0.7229858040809631, + "learning_rate": 8.726983965212666e-06, + "loss": 5.6392, + "step": 14995 + }, + { + "epoch": 1.019160211985324, + "grad_norm": 0.4692619740962982, + "learning_rate": 8.726559315124337e-06, + "loss": 5.3683, + "step": 15000 + }, + { + "epoch": 1.0194999320559859, + "grad_norm": 0.560610830783844, + "learning_rate": 8.726134665036012e-06, + "loss": 5.3166, + "step": 15005 + }, + { + "epoch": 1.0198396521266477, + "grad_norm": 0.5027889609336853, + "learning_rate": 8.725710014947684e-06, + "loss": 5.2824, + "step": 15010 + }, + { + "epoch": 1.0201793721973094, + "grad_norm": 0.7023258209228516, + "learning_rate": 8.725285364859356e-06, + "loss": 5.2412, + "step": 15015 + }, + { + "epoch": 1.0205190922679712, + "grad_norm": 0.5505580306053162, + "learning_rate": 8.72486071477103e-06, + "loss": 5.3135, + "step": 15020 + }, + { + "epoch": 1.020858812338633, + "grad_norm": 0.5684467554092407, + "learning_rate": 8.724436064682701e-06, + "loss": 5.5234, + "step": 15025 + }, + { + "epoch": 1.0211985324092947, + "grad_norm": 0.6107626557350159, + "learning_rate": 8.724011414594374e-06, + "loss": 5.0251, + "step": 15030 + }, + { + "epoch": 1.0215382524799566, + "grad_norm": 0.6145769357681274, + "learning_rate": 8.723586764506048e-06, + "loss": 5.5656, + "step": 15035 + }, + { + "epoch": 1.0218779725506182, + "grad_norm": 0.5910043120384216, + "learning_rate": 8.72316211441772e-06, + "loss": 5.3671, + "step": 15040 + }, + { + "epoch": 1.02221769262128, + "grad_norm": 0.4583132565021515, + "learning_rate": 8.722737464329394e-06, + "loss": 5.2602, + "step": 15045 + }, + { + "epoch": 1.022557412691942, + "grad_norm": 0.6311168074607849, + "learning_rate": 8.722312814241067e-06, + "loss": 5.4071, + "step": 15050 + }, + { + "epoch": 1.0228971327626035, + "grad_norm": 0.5645589232444763, + "learning_rate": 8.721888164152738e-06, + "loss": 5.5836, + "step": 15055 + }, + { + "epoch": 1.0232368528332654, + "grad_norm": 0.6264359951019287, + "learning_rate": 8.721463514064412e-06, + "loss": 5.3553, + "step": 15060 + }, + { + "epoch": 1.0235765729039272, + "grad_norm": 0.678053617477417, + "learning_rate": 8.721038863976085e-06, + "loss": 5.3731, + "step": 15065 + }, + { + "epoch": 1.0239162929745889, + "grad_norm": 0.6648561358451843, + "learning_rate": 8.720614213887756e-06, + "loss": 5.4282, + "step": 15070 + }, + { + "epoch": 1.0242560130452507, + "grad_norm": 0.6121761202812195, + "learning_rate": 8.720189563799431e-06, + "loss": 5.3436, + "step": 15075 + }, + { + "epoch": 1.0245957331159126, + "grad_norm": 0.5619031190872192, + "learning_rate": 8.719764913711104e-06, + "loss": 5.4241, + "step": 15080 + }, + { + "epoch": 1.0249354531865742, + "grad_norm": 0.5855938196182251, + "learning_rate": 8.719340263622775e-06, + "loss": 5.2056, + "step": 15085 + }, + { + "epoch": 1.025275173257236, + "grad_norm": 0.5368460416793823, + "learning_rate": 8.71891561353445e-06, + "loss": 5.432, + "step": 15090 + }, + { + "epoch": 1.025614893327898, + "grad_norm": 0.6556609869003296, + "learning_rate": 8.718490963446122e-06, + "loss": 5.3316, + "step": 15095 + }, + { + "epoch": 1.0259546133985595, + "grad_norm": 0.602440357208252, + "learning_rate": 8.718066313357793e-06, + "loss": 5.4134, + "step": 15100 + }, + { + "epoch": 1.0262943334692214, + "grad_norm": 0.7163495421409607, + "learning_rate": 8.717641663269468e-06, + "loss": 5.2091, + "step": 15105 + }, + { + "epoch": 1.0266340535398832, + "grad_norm": 0.5937293171882629, + "learning_rate": 8.717217013181139e-06, + "loss": 5.4157, + "step": 15110 + }, + { + "epoch": 1.0269737736105449, + "grad_norm": 0.7494350671768188, + "learning_rate": 8.716792363092812e-06, + "loss": 5.5758, + "step": 15115 + }, + { + "epoch": 1.0273134936812067, + "grad_norm": 0.49457675218582153, + "learning_rate": 8.716367713004486e-06, + "loss": 5.4354, + "step": 15120 + }, + { + "epoch": 1.0276532137518684, + "grad_norm": 0.6049057841300964, + "learning_rate": 8.715943062916157e-06, + "loss": 5.3034, + "step": 15125 + }, + { + "epoch": 1.0279929338225302, + "grad_norm": 0.5466347932815552, + "learning_rate": 8.71551841282783e-06, + "loss": 5.2797, + "step": 15130 + }, + { + "epoch": 1.028332653893192, + "grad_norm": 0.6473033428192139, + "learning_rate": 8.715093762739504e-06, + "loss": 5.5345, + "step": 15135 + }, + { + "epoch": 1.0286723739638537, + "grad_norm": 0.5384989976882935, + "learning_rate": 8.714669112651176e-06, + "loss": 5.3933, + "step": 15140 + }, + { + "epoch": 1.0290120940345155, + "grad_norm": 0.5032271146774292, + "learning_rate": 8.714244462562848e-06, + "loss": 5.2838, + "step": 15145 + }, + { + "epoch": 1.0293518141051774, + "grad_norm": 0.5548399090766907, + "learning_rate": 8.713819812474523e-06, + "loss": 5.3986, + "step": 15150 + }, + { + "epoch": 1.029691534175839, + "grad_norm": 0.5500745177268982, + "learning_rate": 8.713395162386194e-06, + "loss": 5.4449, + "step": 15155 + }, + { + "epoch": 1.0300312542465009, + "grad_norm": 0.5155702829360962, + "learning_rate": 8.712970512297867e-06, + "loss": 5.4012, + "step": 15160 + }, + { + "epoch": 1.0303709743171627, + "grad_norm": 0.5418525338172913, + "learning_rate": 8.712545862209541e-06, + "loss": 5.4587, + "step": 15165 + }, + { + "epoch": 1.0307106943878244, + "grad_norm": 0.48739486932754517, + "learning_rate": 8.712121212121212e-06, + "loss": 5.5798, + "step": 15170 + }, + { + "epoch": 1.0310504144584862, + "grad_norm": 0.8109285235404968, + "learning_rate": 8.711696562032885e-06, + "loss": 5.2317, + "step": 15175 + }, + { + "epoch": 1.031390134529148, + "grad_norm": 0.8828322887420654, + "learning_rate": 8.711271911944558e-06, + "loss": 5.6055, + "step": 15180 + }, + { + "epoch": 1.0317298545998097, + "grad_norm": 0.5035231709480286, + "learning_rate": 8.71084726185623e-06, + "loss": 5.4114, + "step": 15185 + }, + { + "epoch": 1.0320695746704716, + "grad_norm": 0.5640774369239807, + "learning_rate": 8.710422611767904e-06, + "loss": 5.4572, + "step": 15190 + }, + { + "epoch": 1.0324092947411334, + "grad_norm": 0.5073200464248657, + "learning_rate": 8.709997961679576e-06, + "loss": 5.2636, + "step": 15195 + }, + { + "epoch": 1.032749014811795, + "grad_norm": 0.5126234292984009, + "learning_rate": 8.70957331159125e-06, + "loss": 5.4708, + "step": 15200 + }, + { + "epoch": 1.033088734882457, + "grad_norm": 0.6889611482620239, + "learning_rate": 8.709148661502922e-06, + "loss": 5.4739, + "step": 15205 + }, + { + "epoch": 1.0334284549531185, + "grad_norm": 0.5004898905754089, + "learning_rate": 8.708724011414595e-06, + "loss": 5.4936, + "step": 15210 + }, + { + "epoch": 1.0337681750237804, + "grad_norm": 0.4961981475353241, + "learning_rate": 8.708299361326268e-06, + "loss": 5.5511, + "step": 15215 + }, + { + "epoch": 1.0341078950944422, + "grad_norm": 0.4666106700897217, + "learning_rate": 8.70787471123794e-06, + "loss": 5.313, + "step": 15220 + }, + { + "epoch": 1.0344476151651039, + "grad_norm": 0.6992846131324768, + "learning_rate": 8.707450061149613e-06, + "loss": 5.364, + "step": 15225 + }, + { + "epoch": 1.0347873352357657, + "grad_norm": 0.6811745762825012, + "learning_rate": 8.707025411061286e-06, + "loss": 5.4121, + "step": 15230 + }, + { + "epoch": 1.0351270553064276, + "grad_norm": 0.47127509117126465, + "learning_rate": 8.706600760972959e-06, + "loss": 5.5324, + "step": 15235 + }, + { + "epoch": 1.0354667753770892, + "grad_norm": 0.5003260374069214, + "learning_rate": 8.706176110884632e-06, + "loss": 5.1111, + "step": 15240 + }, + { + "epoch": 1.035806495447751, + "grad_norm": 0.5767427682876587, + "learning_rate": 8.705751460796304e-06, + "loss": 5.542, + "step": 15245 + }, + { + "epoch": 1.036146215518413, + "grad_norm": 0.5771222710609436, + "learning_rate": 8.705326810707977e-06, + "loss": 5.3412, + "step": 15250 + }, + { + "epoch": 1.0364859355890745, + "grad_norm": 0.5853641033172607, + "learning_rate": 8.70490216061965e-06, + "loss": 5.2454, + "step": 15255 + }, + { + "epoch": 1.0368256556597364, + "grad_norm": 0.6496791243553162, + "learning_rate": 8.704477510531323e-06, + "loss": 5.4723, + "step": 15260 + }, + { + "epoch": 1.0371653757303982, + "grad_norm": 0.5084103941917419, + "learning_rate": 8.704052860442996e-06, + "loss": 5.0971, + "step": 15265 + }, + { + "epoch": 1.0375050958010599, + "grad_norm": 0.4789447784423828, + "learning_rate": 8.703628210354668e-06, + "loss": 5.5564, + "step": 15270 + }, + { + "epoch": 1.0378448158717217, + "grad_norm": 0.7454558610916138, + "learning_rate": 8.703203560266341e-06, + "loss": 5.2193, + "step": 15275 + }, + { + "epoch": 1.0381845359423836, + "grad_norm": 0.6099151372909546, + "learning_rate": 8.702778910178014e-06, + "loss": 5.3045, + "step": 15280 + }, + { + "epoch": 1.0385242560130452, + "grad_norm": 0.48990631103515625, + "learning_rate": 8.702354260089687e-06, + "loss": 5.7809, + "step": 15285 + }, + { + "epoch": 1.038863976083707, + "grad_norm": 0.5541290640830994, + "learning_rate": 8.70192961000136e-06, + "loss": 5.533, + "step": 15290 + }, + { + "epoch": 1.0392036961543687, + "grad_norm": 0.5202525854110718, + "learning_rate": 8.701504959913032e-06, + "loss": 5.2431, + "step": 15295 + }, + { + "epoch": 1.0395434162250305, + "grad_norm": 0.6096553206443787, + "learning_rate": 8.701080309824705e-06, + "loss": 5.2807, + "step": 15300 + }, + { + "epoch": 1.0398831362956924, + "grad_norm": 0.612290620803833, + "learning_rate": 8.700655659736378e-06, + "loss": 5.4742, + "step": 15305 + }, + { + "epoch": 1.040222856366354, + "grad_norm": 0.541355311870575, + "learning_rate": 8.70023100964805e-06, + "loss": 5.2126, + "step": 15310 + }, + { + "epoch": 1.0405625764370159, + "grad_norm": 0.617739737033844, + "learning_rate": 8.699806359559724e-06, + "loss": 5.7475, + "step": 15315 + }, + { + "epoch": 1.0409022965076777, + "grad_norm": 0.593471348285675, + "learning_rate": 8.699381709471396e-06, + "loss": 5.2809, + "step": 15320 + }, + { + "epoch": 1.0412420165783394, + "grad_norm": 0.5496196150779724, + "learning_rate": 8.69895705938307e-06, + "loss": 5.1996, + "step": 15325 + }, + { + "epoch": 1.0415817366490012, + "grad_norm": 0.561741292476654, + "learning_rate": 8.698532409294742e-06, + "loss": 5.4523, + "step": 15330 + }, + { + "epoch": 1.041921456719663, + "grad_norm": 0.5593153238296509, + "learning_rate": 8.698107759206415e-06, + "loss": 5.4666, + "step": 15335 + }, + { + "epoch": 1.0422611767903247, + "grad_norm": 0.5389003157615662, + "learning_rate": 8.697683109118088e-06, + "loss": 5.537, + "step": 15340 + }, + { + "epoch": 1.0426008968609866, + "grad_norm": 0.4988531172275543, + "learning_rate": 8.697343389047427e-06, + "loss": 5.4138, + "step": 15345 + }, + { + "epoch": 1.0429406169316484, + "grad_norm": 0.6591020226478577, + "learning_rate": 8.696918738959098e-06, + "loss": 4.967, + "step": 15350 + }, + { + "epoch": 1.04328033700231, + "grad_norm": 0.5418461561203003, + "learning_rate": 8.696494088870772e-06, + "loss": 5.2844, + "step": 15355 + }, + { + "epoch": 1.043620057072972, + "grad_norm": 0.5308448076248169, + "learning_rate": 8.696069438782443e-06, + "loss": 5.3587, + "step": 15360 + }, + { + "epoch": 1.0439597771436337, + "grad_norm": 0.5190008282661438, + "learning_rate": 8.695644788694116e-06, + "loss": 4.9993, + "step": 15365 + }, + { + "epoch": 1.0442994972142954, + "grad_norm": 0.6032753586769104, + "learning_rate": 8.69522013860579e-06, + "loss": 5.3043, + "step": 15370 + }, + { + "epoch": 1.0446392172849572, + "grad_norm": 0.6306255459785461, + "learning_rate": 8.694795488517462e-06, + "loss": 5.3714, + "step": 15375 + }, + { + "epoch": 1.0449789373556189, + "grad_norm": 0.5000019073486328, + "learning_rate": 8.694370838429134e-06, + "loss": 5.1488, + "step": 15380 + }, + { + "epoch": 1.0453186574262807, + "grad_norm": 0.6647116541862488, + "learning_rate": 8.693946188340809e-06, + "loss": 5.3552, + "step": 15385 + }, + { + "epoch": 1.0456583774969426, + "grad_norm": 0.6245008707046509, + "learning_rate": 8.69352153825248e-06, + "loss": 5.3534, + "step": 15390 + }, + { + "epoch": 1.0459980975676042, + "grad_norm": 0.5949138402938843, + "learning_rate": 8.693096888164153e-06, + "loss": 5.0694, + "step": 15395 + }, + { + "epoch": 1.046337817638266, + "grad_norm": 0.6309390068054199, + "learning_rate": 8.692672238075827e-06, + "loss": 5.3436, + "step": 15400 + }, + { + "epoch": 1.046677537708928, + "grad_norm": 0.727883517742157, + "learning_rate": 8.692247587987498e-06, + "loss": 5.4653, + "step": 15405 + }, + { + "epoch": 1.0470172577795895, + "grad_norm": 0.8729715943336487, + "learning_rate": 8.691822937899171e-06, + "loss": 5.5222, + "step": 15410 + }, + { + "epoch": 1.0473569778502514, + "grad_norm": 0.4596870541572571, + "learning_rate": 8.691398287810846e-06, + "loss": 5.3622, + "step": 15415 + }, + { + "epoch": 1.0476966979209132, + "grad_norm": 0.7468774318695068, + "learning_rate": 8.690973637722517e-06, + "loss": 5.3033, + "step": 15420 + }, + { + "epoch": 1.0480364179915749, + "grad_norm": 0.4852408170700073, + "learning_rate": 8.69054898763419e-06, + "loss": 5.1446, + "step": 15425 + }, + { + "epoch": 1.0483761380622367, + "grad_norm": 0.585116982460022, + "learning_rate": 8.690124337545862e-06, + "loss": 5.4786, + "step": 15430 + }, + { + "epoch": 1.0487158581328986, + "grad_norm": 0.7001885771751404, + "learning_rate": 8.689699687457535e-06, + "loss": 5.6129, + "step": 15435 + }, + { + "epoch": 1.0490555782035602, + "grad_norm": 0.4579656422138214, + "learning_rate": 8.689275037369208e-06, + "loss": 5.32, + "step": 15440 + }, + { + "epoch": 1.049395298274222, + "grad_norm": 0.5971712470054626, + "learning_rate": 8.688850387280881e-06, + "loss": 5.1304, + "step": 15445 + }, + { + "epoch": 1.049735018344884, + "grad_norm": 0.5501860976219177, + "learning_rate": 8.688425737192554e-06, + "loss": 5.4107, + "step": 15450 + }, + { + "epoch": 1.0500747384155455, + "grad_norm": 0.6321010589599609, + "learning_rate": 8.688001087104226e-06, + "loss": 5.2898, + "step": 15455 + }, + { + "epoch": 1.0504144584862074, + "grad_norm": 0.8382881283760071, + "learning_rate": 8.6875764370159e-06, + "loss": 5.4516, + "step": 15460 + }, + { + "epoch": 1.050754178556869, + "grad_norm": 0.5485391020774841, + "learning_rate": 8.687151786927572e-06, + "loss": 5.487, + "step": 15465 + }, + { + "epoch": 1.0510938986275309, + "grad_norm": 0.6533855199813843, + "learning_rate": 8.686727136839245e-06, + "loss": 5.5249, + "step": 15470 + }, + { + "epoch": 1.0514336186981927, + "grad_norm": 0.5608590245246887, + "learning_rate": 8.686302486750918e-06, + "loss": 5.4135, + "step": 15475 + }, + { + "epoch": 1.0517733387688544, + "grad_norm": 0.5332436561584473, + "learning_rate": 8.68587783666259e-06, + "loss": 5.3603, + "step": 15480 + }, + { + "epoch": 1.0521130588395162, + "grad_norm": 0.6148736476898193, + "learning_rate": 8.685453186574263e-06, + "loss": 5.4727, + "step": 15485 + }, + { + "epoch": 1.052452778910178, + "grad_norm": 0.46995192766189575, + "learning_rate": 8.685028536485936e-06, + "loss": 5.2951, + "step": 15490 + }, + { + "epoch": 1.0527924989808397, + "grad_norm": 0.4745619297027588, + "learning_rate": 8.684603886397609e-06, + "loss": 5.6042, + "step": 15495 + }, + { + "epoch": 1.0531322190515016, + "grad_norm": 0.5229361653327942, + "learning_rate": 8.684179236309282e-06, + "loss": 5.3553, + "step": 15500 + }, + { + "epoch": 1.0534719391221634, + "grad_norm": 0.5673896670341492, + "learning_rate": 8.683754586220954e-06, + "loss": 5.3454, + "step": 15505 + }, + { + "epoch": 1.053811659192825, + "grad_norm": 0.4752430021762848, + "learning_rate": 8.683329936132627e-06, + "loss": 5.4953, + "step": 15510 + }, + { + "epoch": 1.054151379263487, + "grad_norm": 0.6628978252410889, + "learning_rate": 8.6829052860443e-06, + "loss": 5.2857, + "step": 15515 + }, + { + "epoch": 1.0544910993341488, + "grad_norm": 0.6379475593566895, + "learning_rate": 8.682480635955973e-06, + "loss": 5.4529, + "step": 15520 + }, + { + "epoch": 1.0548308194048104, + "grad_norm": 0.5841405391693115, + "learning_rate": 8.682055985867646e-06, + "loss": 5.2157, + "step": 15525 + }, + { + "epoch": 1.0551705394754722, + "grad_norm": 0.5119732022285461, + "learning_rate": 8.681631335779318e-06, + "loss": 4.9719, + "step": 15530 + }, + { + "epoch": 1.055510259546134, + "grad_norm": 0.6466744542121887, + "learning_rate": 8.681206685690991e-06, + "loss": 5.5943, + "step": 15535 + }, + { + "epoch": 1.0558499796167957, + "grad_norm": 0.6435689330101013, + "learning_rate": 8.680782035602664e-06, + "loss": 5.5323, + "step": 15540 + }, + { + "epoch": 1.0561896996874576, + "grad_norm": 0.4494755268096924, + "learning_rate": 8.680357385514337e-06, + "loss": 5.2963, + "step": 15545 + }, + { + "epoch": 1.0565294197581192, + "grad_norm": 0.5109529495239258, + "learning_rate": 8.67993273542601e-06, + "loss": 5.2401, + "step": 15550 + }, + { + "epoch": 1.056869139828781, + "grad_norm": 0.5084643363952637, + "learning_rate": 8.679508085337682e-06, + "loss": 5.2843, + "step": 15555 + }, + { + "epoch": 1.057208859899443, + "grad_norm": 0.6088725924491882, + "learning_rate": 8.679083435249355e-06, + "loss": 5.2192, + "step": 15560 + }, + { + "epoch": 1.0575485799701045, + "grad_norm": 0.579944372177124, + "learning_rate": 8.678658785161028e-06, + "loss": 5.4043, + "step": 15565 + }, + { + "epoch": 1.0578883000407664, + "grad_norm": 0.612557590007782, + "learning_rate": 8.678234135072701e-06, + "loss": 5.5677, + "step": 15570 + }, + { + "epoch": 1.0582280201114282, + "grad_norm": 0.6275022625923157, + "learning_rate": 8.677809484984374e-06, + "loss": 5.5344, + "step": 15575 + }, + { + "epoch": 1.0585677401820899, + "grad_norm": 0.5410671830177307, + "learning_rate": 8.677384834896046e-06, + "loss": 5.3099, + "step": 15580 + }, + { + "epoch": 1.0589074602527517, + "grad_norm": 0.41955578327178955, + "learning_rate": 8.67696018480772e-06, + "loss": 5.0292, + "step": 15585 + }, + { + "epoch": 1.0592471803234136, + "grad_norm": 0.5144418478012085, + "learning_rate": 8.676535534719392e-06, + "loss": 5.3839, + "step": 15590 + }, + { + "epoch": 1.0595869003940752, + "grad_norm": 0.518883466720581, + "learning_rate": 8.676110884631065e-06, + "loss": 5.402, + "step": 15595 + }, + { + "epoch": 1.059926620464737, + "grad_norm": 0.5030735731124878, + "learning_rate": 8.675686234542738e-06, + "loss": 5.5954, + "step": 15600 + }, + { + "epoch": 1.060266340535399, + "grad_norm": 0.8355218172073364, + "learning_rate": 8.67526158445441e-06, + "loss": 5.3373, + "step": 15605 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.6160816550254822, + "learning_rate": 8.674836934366083e-06, + "loss": 5.4217, + "step": 15610 + }, + { + "epoch": 1.0609457806767224, + "grad_norm": 0.5270900130271912, + "learning_rate": 8.674412284277756e-06, + "loss": 5.3187, + "step": 15615 + }, + { + "epoch": 1.0612855007473843, + "grad_norm": 0.44884923100471497, + "learning_rate": 8.673987634189429e-06, + "loss": 5.5539, + "step": 15620 + }, + { + "epoch": 1.061625220818046, + "grad_norm": 0.544512152671814, + "learning_rate": 8.673562984101102e-06, + "loss": 5.3489, + "step": 15625 + }, + { + "epoch": 1.0619649408887077, + "grad_norm": 0.47720351815223694, + "learning_rate": 8.673138334012774e-06, + "loss": 5.0816, + "step": 15630 + }, + { + "epoch": 1.0623046609593694, + "grad_norm": 0.6044997572898865, + "learning_rate": 8.672713683924447e-06, + "loss": 5.4297, + "step": 15635 + }, + { + "epoch": 1.0626443810300312, + "grad_norm": 0.7146291136741638, + "learning_rate": 8.67228903383612e-06, + "loss": 5.5125, + "step": 15640 + }, + { + "epoch": 1.062984101100693, + "grad_norm": 0.5354127287864685, + "learning_rate": 8.671864383747793e-06, + "loss": 5.4546, + "step": 15645 + }, + { + "epoch": 1.0633238211713547, + "grad_norm": 0.6197735667228699, + "learning_rate": 8.671439733659466e-06, + "loss": 5.3326, + "step": 15650 + }, + { + "epoch": 1.0636635412420166, + "grad_norm": 0.6182374954223633, + "learning_rate": 8.671015083571138e-06, + "loss": 5.3889, + "step": 15655 + }, + { + "epoch": 1.0640032613126784, + "grad_norm": 0.7185716032981873, + "learning_rate": 8.670590433482811e-06, + "loss": 5.576, + "step": 15660 + }, + { + "epoch": 1.06434298138334, + "grad_norm": 0.47080764174461365, + "learning_rate": 8.670165783394484e-06, + "loss": 4.9871, + "step": 15665 + }, + { + "epoch": 1.064682701454002, + "grad_norm": 0.4675714373588562, + "learning_rate": 8.669741133306157e-06, + "loss": 5.1053, + "step": 15670 + }, + { + "epoch": 1.0650224215246638, + "grad_norm": 0.6912409067153931, + "learning_rate": 8.66931648321783e-06, + "loss": 5.3371, + "step": 15675 + }, + { + "epoch": 1.0653621415953254, + "grad_norm": 0.5797902941703796, + "learning_rate": 8.668891833129502e-06, + "loss": 5.4158, + "step": 15680 + }, + { + "epoch": 1.0657018616659872, + "grad_norm": 0.7582675814628601, + "learning_rate": 8.668467183041175e-06, + "loss": 5.4482, + "step": 15685 + }, + { + "epoch": 1.066041581736649, + "grad_norm": 0.5373093485832214, + "learning_rate": 8.668042532952848e-06, + "loss": 5.3118, + "step": 15690 + }, + { + "epoch": 1.0663813018073107, + "grad_norm": 0.587485671043396, + "learning_rate": 8.667617882864521e-06, + "loss": 5.4339, + "step": 15695 + }, + { + "epoch": 1.0667210218779726, + "grad_norm": 0.7660253643989563, + "learning_rate": 8.667193232776194e-06, + "loss": 5.4188, + "step": 15700 + }, + { + "epoch": 1.0670607419486344, + "grad_norm": 0.5186721086502075, + "learning_rate": 8.666768582687865e-06, + "loss": 5.3658, + "step": 15705 + }, + { + "epoch": 1.067400462019296, + "grad_norm": 0.5328636169433594, + "learning_rate": 8.66634393259954e-06, + "loss": 5.2721, + "step": 15710 + }, + { + "epoch": 1.067740182089958, + "grad_norm": 0.6870694160461426, + "learning_rate": 8.665919282511212e-06, + "loss": 5.3398, + "step": 15715 + }, + { + "epoch": 1.0680799021606195, + "grad_norm": 0.7394046783447266, + "learning_rate": 8.665494632422883e-06, + "loss": 5.1906, + "step": 15720 + }, + { + "epoch": 1.0684196222312814, + "grad_norm": 0.48738497495651245, + "learning_rate": 8.665069982334558e-06, + "loss": 5.2863, + "step": 15725 + }, + { + "epoch": 1.0687593423019432, + "grad_norm": 0.679041862487793, + "learning_rate": 8.66464533224623e-06, + "loss": 5.3344, + "step": 15730 + }, + { + "epoch": 1.0690990623726049, + "grad_norm": 0.5510556697845459, + "learning_rate": 8.664220682157902e-06, + "loss": 5.3696, + "step": 15735 + }, + { + "epoch": 1.0694387824432667, + "grad_norm": 0.5044730305671692, + "learning_rate": 8.663796032069576e-06, + "loss": 5.3183, + "step": 15740 + }, + { + "epoch": 1.0697785025139286, + "grad_norm": 0.47285211086273193, + "learning_rate": 8.663371381981249e-06, + "loss": 5.1565, + "step": 15745 + }, + { + "epoch": 1.0701182225845902, + "grad_norm": 0.525723397731781, + "learning_rate": 8.66294673189292e-06, + "loss": 5.0507, + "step": 15750 + }, + { + "epoch": 1.070457942655252, + "grad_norm": 0.5479961633682251, + "learning_rate": 8.662522081804594e-06, + "loss": 5.5657, + "step": 15755 + }, + { + "epoch": 1.070797662725914, + "grad_norm": 0.5671096444129944, + "learning_rate": 8.662097431716267e-06, + "loss": 5.2781, + "step": 15760 + }, + { + "epoch": 1.0711373827965756, + "grad_norm": 0.49393483996391296, + "learning_rate": 8.661672781627938e-06, + "loss": 5.28, + "step": 15765 + }, + { + "epoch": 1.0714771028672374, + "grad_norm": 0.616401195526123, + "learning_rate": 8.661248131539613e-06, + "loss": 5.0884, + "step": 15770 + }, + { + "epoch": 1.0718168229378993, + "grad_norm": 0.6368908286094666, + "learning_rate": 8.660823481451284e-06, + "loss": 5.2763, + "step": 15775 + }, + { + "epoch": 1.072156543008561, + "grad_norm": 0.6410385370254517, + "learning_rate": 8.660398831362957e-06, + "loss": 5.4016, + "step": 15780 + }, + { + "epoch": 1.0724962630792227, + "grad_norm": 0.5119884014129639, + "learning_rate": 8.659974181274631e-06, + "loss": 5.1247, + "step": 15785 + }, + { + "epoch": 1.0728359831498846, + "grad_norm": 0.544234037399292, + "learning_rate": 8.659549531186302e-06, + "loss": 5.1795, + "step": 15790 + }, + { + "epoch": 1.0731757032205462, + "grad_norm": 0.489409476518631, + "learning_rate": 8.659124881097975e-06, + "loss": 5.418, + "step": 15795 + }, + { + "epoch": 1.073515423291208, + "grad_norm": 0.6625202894210815, + "learning_rate": 8.65870023100965e-06, + "loss": 5.3826, + "step": 15800 + }, + { + "epoch": 1.07385514336187, + "grad_norm": 0.5369448065757751, + "learning_rate": 8.65827558092132e-06, + "loss": 5.188, + "step": 15805 + }, + { + "epoch": 1.0741948634325316, + "grad_norm": 0.4981456995010376, + "learning_rate": 8.657850930832994e-06, + "loss": 5.323, + "step": 15810 + }, + { + "epoch": 1.0745345835031934, + "grad_norm": 0.6298792362213135, + "learning_rate": 8.657426280744668e-06, + "loss": 5.4245, + "step": 15815 + }, + { + "epoch": 1.074874303573855, + "grad_norm": 0.5230774283409119, + "learning_rate": 8.65700163065634e-06, + "loss": 5.1996, + "step": 15820 + }, + { + "epoch": 1.075214023644517, + "grad_norm": 0.5606353878974915, + "learning_rate": 8.656576980568012e-06, + "loss": 5.3415, + "step": 15825 + }, + { + "epoch": 1.0755537437151788, + "grad_norm": 0.6054101586341858, + "learning_rate": 8.656152330479687e-06, + "loss": 5.4344, + "step": 15830 + }, + { + "epoch": 1.0758934637858404, + "grad_norm": 0.5456124544143677, + "learning_rate": 8.655727680391358e-06, + "loss": 5.2467, + "step": 15835 + }, + { + "epoch": 1.0762331838565022, + "grad_norm": 0.6264598965644836, + "learning_rate": 8.65530303030303e-06, + "loss": 5.2701, + "step": 15840 + }, + { + "epoch": 1.076572903927164, + "grad_norm": 0.5432330965995789, + "learning_rate": 8.654878380214703e-06, + "loss": 5.0378, + "step": 15845 + }, + { + "epoch": 1.0769126239978257, + "grad_norm": 0.5085010528564453, + "learning_rate": 8.654453730126376e-06, + "loss": 4.95, + "step": 15850 + }, + { + "epoch": 1.0772523440684876, + "grad_norm": 0.46282657980918884, + "learning_rate": 8.654029080038049e-06, + "loss": 5.1778, + "step": 15855 + }, + { + "epoch": 1.0775920641391494, + "grad_norm": 0.624658465385437, + "learning_rate": 8.653604429949722e-06, + "loss": 5.5168, + "step": 15860 + }, + { + "epoch": 1.077931784209811, + "grad_norm": 0.5416743159294128, + "learning_rate": 8.653179779861394e-06, + "loss": 5.0985, + "step": 15865 + }, + { + "epoch": 1.078271504280473, + "grad_norm": 0.4943048655986786, + "learning_rate": 8.652755129773067e-06, + "loss": 5.4045, + "step": 15870 + }, + { + "epoch": 1.0786112243511348, + "grad_norm": 0.5492099523544312, + "learning_rate": 8.65233047968474e-06, + "loss": 5.3078, + "step": 15875 + }, + { + "epoch": 1.0789509444217964, + "grad_norm": 0.6031784415245056, + "learning_rate": 8.651905829596413e-06, + "loss": 5.5006, + "step": 15880 + }, + { + "epoch": 1.0792906644924583, + "grad_norm": 0.642697274684906, + "learning_rate": 8.651481179508086e-06, + "loss": 5.3495, + "step": 15885 + }, + { + "epoch": 1.0796303845631199, + "grad_norm": 0.4653856158256531, + "learning_rate": 8.651056529419758e-06, + "loss": 5.2215, + "step": 15890 + }, + { + "epoch": 1.0799701046337817, + "grad_norm": 0.7325856685638428, + "learning_rate": 8.650631879331431e-06, + "loss": 5.1778, + "step": 15895 + }, + { + "epoch": 1.0803098247044436, + "grad_norm": 0.6267562508583069, + "learning_rate": 8.650207229243104e-06, + "loss": 5.1863, + "step": 15900 + }, + { + "epoch": 1.0806495447751052, + "grad_norm": 0.5574700236320496, + "learning_rate": 8.649782579154777e-06, + "loss": 5.47, + "step": 15905 + }, + { + "epoch": 1.080989264845767, + "grad_norm": 0.5538105964660645, + "learning_rate": 8.64935792906645e-06, + "loss": 5.3403, + "step": 15910 + }, + { + "epoch": 1.081328984916429, + "grad_norm": 0.5297386050224304, + "learning_rate": 8.648933278978122e-06, + "loss": 5.496, + "step": 15915 + }, + { + "epoch": 1.0816687049870906, + "grad_norm": 0.6353252530097961, + "learning_rate": 8.648508628889795e-06, + "loss": 5.4099, + "step": 15920 + }, + { + "epoch": 1.0820084250577524, + "grad_norm": 0.5544126033782959, + "learning_rate": 8.648083978801468e-06, + "loss": 5.2184, + "step": 15925 + }, + { + "epoch": 1.0823481451284143, + "grad_norm": 0.43073755502700806, + "learning_rate": 8.64765932871314e-06, + "loss": 5.188, + "step": 15930 + }, + { + "epoch": 1.082687865199076, + "grad_norm": 0.6624279618263245, + "learning_rate": 8.647234678624814e-06, + "loss": 5.2465, + "step": 15935 + }, + { + "epoch": 1.0830275852697377, + "grad_norm": 0.7457336187362671, + "learning_rate": 8.646810028536486e-06, + "loss": 5.3973, + "step": 15940 + }, + { + "epoch": 1.0833673053403996, + "grad_norm": 0.47533535957336426, + "learning_rate": 8.64638537844816e-06, + "loss": 5.3753, + "step": 15945 + }, + { + "epoch": 1.0837070254110612, + "grad_norm": 0.5420754551887512, + "learning_rate": 8.645960728359832e-06, + "loss": 5.1636, + "step": 15950 + }, + { + "epoch": 1.084046745481723, + "grad_norm": 0.5458803772926331, + "learning_rate": 8.645536078271505e-06, + "loss": 5.6042, + "step": 15955 + }, + { + "epoch": 1.084386465552385, + "grad_norm": 0.5518082976341248, + "learning_rate": 8.645111428183178e-06, + "loss": 5.2987, + "step": 15960 + }, + { + "epoch": 1.0847261856230466, + "grad_norm": 0.5450738668441772, + "learning_rate": 8.64468677809485e-06, + "loss": 5.2549, + "step": 15965 + }, + { + "epoch": 1.0850659056937084, + "grad_norm": 0.5431269407272339, + "learning_rate": 8.644262128006523e-06, + "loss": 5.1968, + "step": 15970 + }, + { + "epoch": 1.0854056257643703, + "grad_norm": 0.5097525715827942, + "learning_rate": 8.643837477918196e-06, + "loss": 5.3318, + "step": 15975 + }, + { + "epoch": 1.085745345835032, + "grad_norm": 0.46673697233200073, + "learning_rate": 8.643412827829869e-06, + "loss": 5.2674, + "step": 15980 + }, + { + "epoch": 1.0860850659056938, + "grad_norm": 0.5331701636314392, + "learning_rate": 8.642988177741542e-06, + "loss": 5.1617, + "step": 15985 + }, + { + "epoch": 1.0864247859763554, + "grad_norm": 0.633173942565918, + "learning_rate": 8.642563527653214e-06, + "loss": 5.1896, + "step": 15990 + }, + { + "epoch": 1.0867645060470172, + "grad_norm": 0.5205373764038086, + "learning_rate": 8.642138877564887e-06, + "loss": 5.2952, + "step": 15995 + }, + { + "epoch": 1.087104226117679, + "grad_norm": 0.6175049543380737, + "learning_rate": 8.64171422747656e-06, + "loss": 5.4963, + "step": 16000 + }, + { + "epoch": 1.0874439461883407, + "grad_norm": 0.6160527467727661, + "learning_rate": 8.641289577388233e-06, + "loss": 5.4872, + "step": 16005 + }, + { + "epoch": 1.0877836662590026, + "grad_norm": 0.5587930083274841, + "learning_rate": 8.640864927299906e-06, + "loss": 5.5421, + "step": 16010 + }, + { + "epoch": 1.0881233863296644, + "grad_norm": 0.45083993673324585, + "learning_rate": 8.640440277211578e-06, + "loss": 5.2182, + "step": 16015 + }, + { + "epoch": 1.088463106400326, + "grad_norm": 0.4835324287414551, + "learning_rate": 8.640015627123251e-06, + "loss": 5.3073, + "step": 16020 + }, + { + "epoch": 1.088802826470988, + "grad_norm": 0.46229463815689087, + "learning_rate": 8.639590977034924e-06, + "loss": 5.1996, + "step": 16025 + }, + { + "epoch": 1.0891425465416498, + "grad_norm": 0.5252505540847778, + "learning_rate": 8.639166326946597e-06, + "loss": 5.2131, + "step": 16030 + }, + { + "epoch": 1.0894822666123114, + "grad_norm": 0.6272785067558289, + "learning_rate": 8.63874167685827e-06, + "loss": 5.1442, + "step": 16035 + }, + { + "epoch": 1.0898219866829733, + "grad_norm": 0.6075104475021362, + "learning_rate": 8.638317026769942e-06, + "loss": 5.0271, + "step": 16040 + }, + { + "epoch": 1.090161706753635, + "grad_norm": 0.7495909929275513, + "learning_rate": 8.637892376681615e-06, + "loss": 5.1111, + "step": 16045 + }, + { + "epoch": 1.0905014268242967, + "grad_norm": 0.5147143602371216, + "learning_rate": 8.637467726593288e-06, + "loss": 5.3964, + "step": 16050 + }, + { + "epoch": 1.0908411468949586, + "grad_norm": 0.44410330057144165, + "learning_rate": 8.637043076504961e-06, + "loss": 5.6843, + "step": 16055 + }, + { + "epoch": 1.0911808669656202, + "grad_norm": 0.5713176727294922, + "learning_rate": 8.636618426416634e-06, + "loss": 5.015, + "step": 16060 + }, + { + "epoch": 1.091520587036282, + "grad_norm": 0.5789168477058411, + "learning_rate": 8.636193776328306e-06, + "loss": 5.4512, + "step": 16065 + }, + { + "epoch": 1.091860307106944, + "grad_norm": 0.7382201552391052, + "learning_rate": 8.63576912623998e-06, + "loss": 5.6047, + "step": 16070 + }, + { + "epoch": 1.0922000271776056, + "grad_norm": 0.4404030740261078, + "learning_rate": 8.635344476151652e-06, + "loss": 5.25, + "step": 16075 + }, + { + "epoch": 1.0925397472482674, + "grad_norm": 0.5210697054862976, + "learning_rate": 8.634919826063325e-06, + "loss": 5.4176, + "step": 16080 + }, + { + "epoch": 1.0928794673189293, + "grad_norm": 0.5926135182380676, + "learning_rate": 8.634495175974998e-06, + "loss": 5.3404, + "step": 16085 + }, + { + "epoch": 1.093219187389591, + "grad_norm": 0.5286602973937988, + "learning_rate": 8.63407052588667e-06, + "loss": 5.1681, + "step": 16090 + }, + { + "epoch": 1.0935589074602527, + "grad_norm": 0.6313502192497253, + "learning_rate": 8.633645875798343e-06, + "loss": 5.4454, + "step": 16095 + }, + { + "epoch": 1.0938986275309146, + "grad_norm": 0.6354628205299377, + "learning_rate": 8.633221225710016e-06, + "loss": 5.2885, + "step": 16100 + }, + { + "epoch": 1.0942383476015762, + "grad_norm": 0.6087275147438049, + "learning_rate": 8.632796575621689e-06, + "loss": 5.3369, + "step": 16105 + }, + { + "epoch": 1.094578067672238, + "grad_norm": 0.6069268584251404, + "learning_rate": 8.632371925533362e-06, + "loss": 5.2422, + "step": 16110 + }, + { + "epoch": 1.0949177877429, + "grad_norm": 0.611708402633667, + "learning_rate": 8.631947275445034e-06, + "loss": 5.2203, + "step": 16115 + }, + { + "epoch": 1.0952575078135616, + "grad_norm": 0.48974958062171936, + "learning_rate": 8.631522625356706e-06, + "loss": 5.3875, + "step": 16120 + }, + { + "epoch": 1.0955972278842234, + "grad_norm": 0.6593641042709351, + "learning_rate": 8.63109797526838e-06, + "loss": 5.2592, + "step": 16125 + }, + { + "epoch": 1.0959369479548853, + "grad_norm": 0.5045084357261658, + "learning_rate": 8.630673325180053e-06, + "loss": 5.4303, + "step": 16130 + }, + { + "epoch": 1.096276668025547, + "grad_norm": 0.6064139604568481, + "learning_rate": 8.630248675091724e-06, + "loss": 5.1147, + "step": 16135 + }, + { + "epoch": 1.0966163880962088, + "grad_norm": 0.4162774980068207, + "learning_rate": 8.629824025003398e-06, + "loss": 5.1398, + "step": 16140 + }, + { + "epoch": 1.0969561081668706, + "grad_norm": 0.5080612897872925, + "learning_rate": 8.629399374915071e-06, + "loss": 5.0638, + "step": 16145 + }, + { + "epoch": 1.0972958282375322, + "grad_norm": 0.4673619568347931, + "learning_rate": 8.628974724826742e-06, + "loss": 5.3726, + "step": 16150 + }, + { + "epoch": 1.097635548308194, + "grad_norm": 0.579666018486023, + "learning_rate": 8.628550074738417e-06, + "loss": 5.2898, + "step": 16155 + }, + { + "epoch": 1.0979752683788557, + "grad_norm": 0.46131691336631775, + "learning_rate": 8.62812542465009e-06, + "loss": 5.3345, + "step": 16160 + }, + { + "epoch": 1.0983149884495176, + "grad_norm": 0.6065790057182312, + "learning_rate": 8.62770077456176e-06, + "loss": 5.2134, + "step": 16165 + }, + { + "epoch": 1.0986547085201794, + "grad_norm": 0.606961727142334, + "learning_rate": 8.627276124473435e-06, + "loss": 5.2122, + "step": 16170 + }, + { + "epoch": 1.098994428590841, + "grad_norm": 0.6245341897010803, + "learning_rate": 8.626851474385108e-06, + "loss": 5.3456, + "step": 16175 + }, + { + "epoch": 1.099334148661503, + "grad_norm": 0.5404495000839233, + "learning_rate": 8.62642682429678e-06, + "loss": 5.362, + "step": 16180 + }, + { + "epoch": 1.0996738687321648, + "grad_norm": 0.5192434191703796, + "learning_rate": 8.626002174208454e-06, + "loss": 5.2699, + "step": 16185 + }, + { + "epoch": 1.1000135888028264, + "grad_norm": 0.514495313167572, + "learning_rate": 8.625577524120125e-06, + "loss": 4.8992, + "step": 16190 + }, + { + "epoch": 1.1003533088734883, + "grad_norm": 0.6098704934120178, + "learning_rate": 8.625152874031798e-06, + "loss": 5.3221, + "step": 16195 + }, + { + "epoch": 1.10069302894415, + "grad_norm": 0.4758104681968689, + "learning_rate": 8.624728223943472e-06, + "loss": 5.0632, + "step": 16200 + }, + { + "epoch": 1.1010327490148117, + "grad_norm": 0.4719245433807373, + "learning_rate": 8.624303573855143e-06, + "loss": 5.2027, + "step": 16205 + }, + { + "epoch": 1.1013724690854736, + "grad_norm": 0.6775283217430115, + "learning_rate": 8.623878923766816e-06, + "loss": 5.2228, + "step": 16210 + }, + { + "epoch": 1.1017121891561354, + "grad_norm": 0.5199261903762817, + "learning_rate": 8.62345427367849e-06, + "loss": 5.2744, + "step": 16215 + }, + { + "epoch": 1.102051909226797, + "grad_norm": 0.5249999761581421, + "learning_rate": 8.623029623590162e-06, + "loss": 5.2022, + "step": 16220 + }, + { + "epoch": 1.102391629297459, + "grad_norm": 0.4987199008464813, + "learning_rate": 8.622604973501834e-06, + "loss": 5.3704, + "step": 16225 + }, + { + "epoch": 1.1027313493681206, + "grad_norm": 0.532279372215271, + "learning_rate": 8.622180323413509e-06, + "loss": 5.2004, + "step": 16230 + }, + { + "epoch": 1.1030710694387824, + "grad_norm": 0.5420828461647034, + "learning_rate": 8.62175567332518e-06, + "loss": 5.0807, + "step": 16235 + }, + { + "epoch": 1.1034107895094443, + "grad_norm": 0.5788748860359192, + "learning_rate": 8.621331023236853e-06, + "loss": 5.393, + "step": 16240 + }, + { + "epoch": 1.103750509580106, + "grad_norm": 0.5056318044662476, + "learning_rate": 8.620906373148527e-06, + "loss": 5.1824, + "step": 16245 + }, + { + "epoch": 1.1040902296507678, + "grad_norm": 0.6308184266090393, + "learning_rate": 8.620481723060198e-06, + "loss": 5.3019, + "step": 16250 + }, + { + "epoch": 1.1044299497214296, + "grad_norm": 0.9473641514778137, + "learning_rate": 8.620057072971871e-06, + "loss": 5.2316, + "step": 16255 + }, + { + "epoch": 1.1047696697920912, + "grad_norm": 0.40664514899253845, + "learning_rate": 8.619632422883546e-06, + "loss": 5.2259, + "step": 16260 + }, + { + "epoch": 1.105109389862753, + "grad_norm": 0.5849182605743408, + "learning_rate": 8.619207772795217e-06, + "loss": 5.0162, + "step": 16265 + }, + { + "epoch": 1.105449109933415, + "grad_norm": 0.6605808734893799, + "learning_rate": 8.618783122706891e-06, + "loss": 5.1219, + "step": 16270 + }, + { + "epoch": 1.1057888300040766, + "grad_norm": 0.5031015276908875, + "learning_rate": 8.618358472618562e-06, + "loss": 5.3614, + "step": 16275 + }, + { + "epoch": 1.1061285500747384, + "grad_norm": 0.532670259475708, + "learning_rate": 8.617933822530235e-06, + "loss": 5.1245, + "step": 16280 + }, + { + "epoch": 1.1064682701454003, + "grad_norm": 0.45560193061828613, + "learning_rate": 8.61750917244191e-06, + "loss": 5.2042, + "step": 16285 + }, + { + "epoch": 1.106807990216062, + "grad_norm": 0.517130434513092, + "learning_rate": 8.61708452235358e-06, + "loss": 5.3241, + "step": 16290 + }, + { + "epoch": 1.1071477102867238, + "grad_norm": 0.6240386962890625, + "learning_rate": 8.616659872265254e-06, + "loss": 5.0768, + "step": 16295 + }, + { + "epoch": 1.1074874303573856, + "grad_norm": 0.7072476148605347, + "learning_rate": 8.616235222176928e-06, + "loss": 4.9393, + "step": 16300 + }, + { + "epoch": 1.1078271504280472, + "grad_norm": 0.5179939866065979, + "learning_rate": 8.6158105720886e-06, + "loss": 5.2111, + "step": 16305 + }, + { + "epoch": 1.108166870498709, + "grad_norm": 0.5619356632232666, + "learning_rate": 8.615385922000272e-06, + "loss": 5.3883, + "step": 16310 + }, + { + "epoch": 1.108506590569371, + "grad_norm": 0.5377128720283508, + "learning_rate": 8.614961271911946e-06, + "loss": 5.1209, + "step": 16315 + }, + { + "epoch": 1.1088463106400326, + "grad_norm": 0.6181756258010864, + "learning_rate": 8.614536621823618e-06, + "loss": 5.4314, + "step": 16320 + }, + { + "epoch": 1.1091860307106944, + "grad_norm": 0.4789082109928131, + "learning_rate": 8.61411197173529e-06, + "loss": 5.2746, + "step": 16325 + }, + { + "epoch": 1.109525750781356, + "grad_norm": 0.6403868198394775, + "learning_rate": 8.613687321646965e-06, + "loss": 5.11, + "step": 16330 + }, + { + "epoch": 1.109865470852018, + "grad_norm": 0.35472506284713745, + "learning_rate": 8.613262671558636e-06, + "loss": 5.274, + "step": 16335 + }, + { + "epoch": 1.1102051909226798, + "grad_norm": 0.47365352511405945, + "learning_rate": 8.612838021470309e-06, + "loss": 5.1789, + "step": 16340 + }, + { + "epoch": 1.1105449109933414, + "grad_norm": 0.6650395393371582, + "learning_rate": 8.612413371381982e-06, + "loss": 5.379, + "step": 16345 + }, + { + "epoch": 1.1108846310640033, + "grad_norm": 0.5482186675071716, + "learning_rate": 8.611988721293654e-06, + "loss": 5.1615, + "step": 16350 + }, + { + "epoch": 1.1112243511346651, + "grad_norm": 0.5895190834999084, + "learning_rate": 8.611564071205327e-06, + "loss": 5.2436, + "step": 16355 + }, + { + "epoch": 1.1115640712053267, + "grad_norm": 0.4637793302536011, + "learning_rate": 8.611139421117e-06, + "loss": 4.8762, + "step": 16360 + }, + { + "epoch": 1.1119037912759886, + "grad_norm": 0.4470992684364319, + "learning_rate": 8.610714771028673e-06, + "loss": 5.1417, + "step": 16365 + }, + { + "epoch": 1.1122435113466504, + "grad_norm": 0.5398293137550354, + "learning_rate": 8.610290120940346e-06, + "loss": 5.1626, + "step": 16370 + }, + { + "epoch": 1.112583231417312, + "grad_norm": 0.5054556131362915, + "learning_rate": 8.609865470852018e-06, + "loss": 4.8258, + "step": 16375 + }, + { + "epoch": 1.112922951487974, + "grad_norm": 0.583594799041748, + "learning_rate": 8.609440820763691e-06, + "loss": 5.4867, + "step": 16380 + }, + { + "epoch": 1.1132626715586358, + "grad_norm": 0.4763481318950653, + "learning_rate": 8.609016170675364e-06, + "loss": 5.2803, + "step": 16385 + }, + { + "epoch": 1.1136023916292974, + "grad_norm": 0.500040590763092, + "learning_rate": 8.608591520587037e-06, + "loss": 5.249, + "step": 16390 + }, + { + "epoch": 1.1139421116999593, + "grad_norm": 0.5093843340873718, + "learning_rate": 8.60816687049871e-06, + "loss": 5.4462, + "step": 16395 + }, + { + "epoch": 1.114281831770621, + "grad_norm": 0.5956771373748779, + "learning_rate": 8.607742220410382e-06, + "loss": 5.1898, + "step": 16400 + }, + { + "epoch": 1.1146215518412828, + "grad_norm": 0.5329673886299133, + "learning_rate": 8.607317570322055e-06, + "loss": 5.005, + "step": 16405 + }, + { + "epoch": 1.1149612719119446, + "grad_norm": 0.5567227005958557, + "learning_rate": 8.606892920233728e-06, + "loss": 5.2833, + "step": 16410 + }, + { + "epoch": 1.1153009919826062, + "grad_norm": 0.625278651714325, + "learning_rate": 8.6064682701454e-06, + "loss": 5.4044, + "step": 16415 + }, + { + "epoch": 1.115640712053268, + "grad_norm": 0.47629833221435547, + "learning_rate": 8.606043620057074e-06, + "loss": 5.1288, + "step": 16420 + }, + { + "epoch": 1.11598043212393, + "grad_norm": 0.5073433518409729, + "learning_rate": 8.605618969968746e-06, + "loss": 5.194, + "step": 16425 + }, + { + "epoch": 1.1163201521945916, + "grad_norm": 0.4886217713356018, + "learning_rate": 8.60519431988042e-06, + "loss": 5.3066, + "step": 16430 + }, + { + "epoch": 1.1166598722652534, + "grad_norm": 0.8586699366569519, + "learning_rate": 8.604769669792092e-06, + "loss": 5.2303, + "step": 16435 + }, + { + "epoch": 1.1169995923359153, + "grad_norm": 0.6723941564559937, + "learning_rate": 8.604345019703765e-06, + "loss": 5.077, + "step": 16440 + }, + { + "epoch": 1.117339312406577, + "grad_norm": 0.5164468288421631, + "learning_rate": 8.603920369615438e-06, + "loss": 5.2094, + "step": 16445 + }, + { + "epoch": 1.1176790324772388, + "grad_norm": 0.5484564304351807, + "learning_rate": 8.60349571952711e-06, + "loss": 5.3552, + "step": 16450 + }, + { + "epoch": 1.1180187525479006, + "grad_norm": 0.509772002696991, + "learning_rate": 8.603071069438783e-06, + "loss": 5.3077, + "step": 16455 + }, + { + "epoch": 1.1183584726185622, + "grad_norm": 0.7113659381866455, + "learning_rate": 8.602646419350456e-06, + "loss": 5.4085, + "step": 16460 + }, + { + "epoch": 1.118698192689224, + "grad_norm": 0.6503916382789612, + "learning_rate": 8.602221769262129e-06, + "loss": 5.4259, + "step": 16465 + }, + { + "epoch": 1.119037912759886, + "grad_norm": 0.6220220327377319, + "learning_rate": 8.601797119173802e-06, + "loss": 5.0617, + "step": 16470 + }, + { + "epoch": 1.1193776328305476, + "grad_norm": 0.584502637386322, + "learning_rate": 8.601372469085474e-06, + "loss": 5.1311, + "step": 16475 + }, + { + "epoch": 1.1197173529012094, + "grad_norm": 0.5167200565338135, + "learning_rate": 8.600947818997147e-06, + "loss": 5.2573, + "step": 16480 + }, + { + "epoch": 1.1200570729718713, + "grad_norm": 0.5305880308151245, + "learning_rate": 8.60052316890882e-06, + "loss": 5.2686, + "step": 16485 + }, + { + "epoch": 1.120396793042533, + "grad_norm": 0.5841049551963806, + "learning_rate": 8.600098518820493e-06, + "loss": 5.2084, + "step": 16490 + }, + { + "epoch": 1.1207365131131948, + "grad_norm": 0.48064476251602173, + "learning_rate": 8.599673868732166e-06, + "loss": 5.1268, + "step": 16495 + }, + { + "epoch": 1.1210762331838564, + "grad_norm": 0.5798138976097107, + "learning_rate": 8.599249218643838e-06, + "loss": 5.1914, + "step": 16500 + }, + { + "epoch": 1.1214159532545183, + "grad_norm": 0.5845276713371277, + "learning_rate": 8.598824568555511e-06, + "loss": 5.051, + "step": 16505 + }, + { + "epoch": 1.1217556733251801, + "grad_norm": 0.5549997091293335, + "learning_rate": 8.598399918467184e-06, + "loss": 5.0725, + "step": 16510 + }, + { + "epoch": 1.1220953933958417, + "grad_norm": 0.7934907078742981, + "learning_rate": 8.597975268378857e-06, + "loss": 5.3557, + "step": 16515 + }, + { + "epoch": 1.1224351134665036, + "grad_norm": 0.5949925780296326, + "learning_rate": 8.59755061829053e-06, + "loss": 5.2439, + "step": 16520 + }, + { + "epoch": 1.1227748335371655, + "grad_norm": 0.5112131834030151, + "learning_rate": 8.597125968202202e-06, + "loss": 4.9411, + "step": 16525 + }, + { + "epoch": 1.123114553607827, + "grad_norm": 0.49308502674102783, + "learning_rate": 8.596701318113875e-06, + "loss": 5.2084, + "step": 16530 + }, + { + "epoch": 1.123454273678489, + "grad_norm": 0.5765339136123657, + "learning_rate": 8.596276668025546e-06, + "loss": 5.262, + "step": 16535 + }, + { + "epoch": 1.1237939937491508, + "grad_norm": 0.5146493911743164, + "learning_rate": 8.59585201793722e-06, + "loss": 5.3611, + "step": 16540 + }, + { + "epoch": 1.1241337138198124, + "grad_norm": 0.6649471521377563, + "learning_rate": 8.595427367848894e-06, + "loss": 5.2116, + "step": 16545 + }, + { + "epoch": 1.1244734338904743, + "grad_norm": 0.560444712638855, + "learning_rate": 8.595002717760565e-06, + "loss": 5.2543, + "step": 16550 + }, + { + "epoch": 1.1248131539611361, + "grad_norm": 0.5106379985809326, + "learning_rate": 8.59457806767224e-06, + "loss": 5.1472, + "step": 16555 + }, + { + "epoch": 1.1251528740317978, + "grad_norm": 0.5739969611167908, + "learning_rate": 8.594153417583912e-06, + "loss": 5.0663, + "step": 16560 + }, + { + "epoch": 1.1254925941024596, + "grad_norm": 0.5931854248046875, + "learning_rate": 8.593728767495583e-06, + "loss": 5.3799, + "step": 16565 + }, + { + "epoch": 1.1258323141731212, + "grad_norm": 0.49541914463043213, + "learning_rate": 8.593304117407258e-06, + "loss": 5.1447, + "step": 16570 + }, + { + "epoch": 1.126172034243783, + "grad_norm": 0.6566627025604248, + "learning_rate": 8.59287946731893e-06, + "loss": 5.2654, + "step": 16575 + }, + { + "epoch": 1.126511754314445, + "grad_norm": 0.6389328241348267, + "learning_rate": 8.592454817230602e-06, + "loss": 5.3128, + "step": 16580 + }, + { + "epoch": 1.1268514743851066, + "grad_norm": 0.6165663599967957, + "learning_rate": 8.592030167142276e-06, + "loss": 5.0553, + "step": 16585 + }, + { + "epoch": 1.1271911944557684, + "grad_norm": 0.5415424108505249, + "learning_rate": 8.591605517053949e-06, + "loss": 5.2501, + "step": 16590 + }, + { + "epoch": 1.1275309145264303, + "grad_norm": 0.6345233917236328, + "learning_rate": 8.59118086696562e-06, + "loss": 5.4091, + "step": 16595 + }, + { + "epoch": 1.127870634597092, + "grad_norm": 0.6888380646705627, + "learning_rate": 8.590756216877294e-06, + "loss": 5.1024, + "step": 16600 + }, + { + "epoch": 1.1282103546677538, + "grad_norm": 0.5963483452796936, + "learning_rate": 8.590331566788967e-06, + "loss": 5.4978, + "step": 16605 + }, + { + "epoch": 1.1285500747384156, + "grad_norm": 0.5325688719749451, + "learning_rate": 8.58990691670064e-06, + "loss": 5.4371, + "step": 16610 + }, + { + "epoch": 1.1288897948090773, + "grad_norm": 0.5616849064826965, + "learning_rate": 8.589482266612313e-06, + "loss": 5.1736, + "step": 16615 + }, + { + "epoch": 1.129229514879739, + "grad_norm": 0.5684040188789368, + "learning_rate": 8.589057616523984e-06, + "loss": 4.9637, + "step": 16620 + }, + { + "epoch": 1.129569234950401, + "grad_norm": 0.6194115877151489, + "learning_rate": 8.588632966435658e-06, + "loss": 5.227, + "step": 16625 + }, + { + "epoch": 1.1299089550210626, + "grad_norm": 0.6687846183776855, + "learning_rate": 8.588208316347331e-06, + "loss": 5.0747, + "step": 16630 + }, + { + "epoch": 1.1302486750917244, + "grad_norm": 0.7020384073257446, + "learning_rate": 8.587783666259002e-06, + "loss": 5.2795, + "step": 16635 + }, + { + "epoch": 1.1305883951623863, + "grad_norm": 0.5238447189331055, + "learning_rate": 8.587359016170677e-06, + "loss": 5.2449, + "step": 16640 + }, + { + "epoch": 1.130928115233048, + "grad_norm": 0.5643996000289917, + "learning_rate": 8.58693436608235e-06, + "loss": 5.1256, + "step": 16645 + }, + { + "epoch": 1.1312678353037098, + "grad_norm": 0.6701121926307678, + "learning_rate": 8.58650971599402e-06, + "loss": 5.5467, + "step": 16650 + }, + { + "epoch": 1.1316075553743716, + "grad_norm": 0.5078961849212646, + "learning_rate": 8.586085065905695e-06, + "loss": 4.8158, + "step": 16655 + }, + { + "epoch": 1.1319472754450333, + "grad_norm": 0.5455989837646484, + "learning_rate": 8.585660415817368e-06, + "loss": 5.31, + "step": 16660 + }, + { + "epoch": 1.1322869955156951, + "grad_norm": 1.0586973428726196, + "learning_rate": 8.58523576572904e-06, + "loss": 5.0334, + "step": 16665 + }, + { + "epoch": 1.1326267155863567, + "grad_norm": 0.6094449162483215, + "learning_rate": 8.584811115640714e-06, + "loss": 5.3833, + "step": 16670 + }, + { + "epoch": 1.1329664356570186, + "grad_norm": 0.5789914727210999, + "learning_rate": 8.584386465552386e-06, + "loss": 5.2404, + "step": 16675 + }, + { + "epoch": 1.1333061557276805, + "grad_norm": 0.598869264125824, + "learning_rate": 8.583961815464058e-06, + "loss": 5.1044, + "step": 16680 + }, + { + "epoch": 1.133645875798342, + "grad_norm": 0.46530309319496155, + "learning_rate": 8.583537165375732e-06, + "loss": 5.2748, + "step": 16685 + }, + { + "epoch": 1.133985595869004, + "grad_norm": 0.6233730316162109, + "learning_rate": 8.583112515287403e-06, + "loss": 5.1941, + "step": 16690 + }, + { + "epoch": 1.1343253159396658, + "grad_norm": 0.5970980525016785, + "learning_rate": 8.582687865199076e-06, + "loss": 5.2668, + "step": 16695 + }, + { + "epoch": 1.1346650360103274, + "grad_norm": 0.5466794967651367, + "learning_rate": 8.58226321511075e-06, + "loss": 4.9599, + "step": 16700 + }, + { + "epoch": 1.1350047560809893, + "grad_norm": 0.48978108167648315, + "learning_rate": 8.581838565022422e-06, + "loss": 5.2268, + "step": 16705 + }, + { + "epoch": 1.1353444761516511, + "grad_norm": 0.5777915716171265, + "learning_rate": 8.581413914934094e-06, + "loss": 5.4275, + "step": 16710 + }, + { + "epoch": 1.1356841962223128, + "grad_norm": 0.6192224025726318, + "learning_rate": 8.580989264845769e-06, + "loss": 5.0417, + "step": 16715 + }, + { + "epoch": 1.1360239162929746, + "grad_norm": 0.5082762837409973, + "learning_rate": 8.58056461475744e-06, + "loss": 5.2387, + "step": 16720 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.5812807679176331, + "learning_rate": 8.580139964669113e-06, + "loss": 5.2256, + "step": 16725 + }, + { + "epoch": 1.136703356434298, + "grad_norm": 0.514434278011322, + "learning_rate": 8.579715314580787e-06, + "loss": 5.3229, + "step": 16730 + }, + { + "epoch": 1.13704307650496, + "grad_norm": 0.5479309558868408, + "learning_rate": 8.579290664492458e-06, + "loss": 5.4797, + "step": 16735 + }, + { + "epoch": 1.1373827965756216, + "grad_norm": 0.5350407361984253, + "learning_rate": 8.578866014404131e-06, + "loss": 5.1179, + "step": 16740 + }, + { + "epoch": 1.1377225166462834, + "grad_norm": 0.4868687093257904, + "learning_rate": 8.578441364315806e-06, + "loss": 5.1395, + "step": 16745 + }, + { + "epoch": 1.1380622367169453, + "grad_norm": 0.596741259098053, + "learning_rate": 8.578016714227477e-06, + "loss": 5.326, + "step": 16750 + }, + { + "epoch": 1.138401956787607, + "grad_norm": 0.7051769495010376, + "learning_rate": 8.57759206413915e-06, + "loss": 5.604, + "step": 16755 + }, + { + "epoch": 1.1387416768582688, + "grad_norm": 0.692630410194397, + "learning_rate": 8.577167414050822e-06, + "loss": 5.2485, + "step": 16760 + }, + { + "epoch": 1.1390813969289306, + "grad_norm": 0.6530965566635132, + "learning_rate": 8.576742763962495e-06, + "loss": 5.367, + "step": 16765 + }, + { + "epoch": 1.1394211169995923, + "grad_norm": 0.5360511541366577, + "learning_rate": 8.576318113874168e-06, + "loss": 5.1491, + "step": 16770 + }, + { + "epoch": 1.139760837070254, + "grad_norm": 0.4775511622428894, + "learning_rate": 8.57589346378584e-06, + "loss": 5.2325, + "step": 16775 + }, + { + "epoch": 1.140100557140916, + "grad_norm": 0.5088670253753662, + "learning_rate": 8.575468813697514e-06, + "loss": 5.2576, + "step": 16780 + }, + { + "epoch": 1.1404402772115776, + "grad_norm": 0.5500739216804504, + "learning_rate": 8.575044163609186e-06, + "loss": 5.1824, + "step": 16785 + }, + { + "epoch": 1.1407799972822394, + "grad_norm": 0.5480011105537415, + "learning_rate": 8.57461951352086e-06, + "loss": 5.3051, + "step": 16790 + }, + { + "epoch": 1.1411197173529013, + "grad_norm": 0.6412390470504761, + "learning_rate": 8.574194863432532e-06, + "loss": 5.1858, + "step": 16795 + }, + { + "epoch": 1.141459437423563, + "grad_norm": 0.5795320868492126, + "learning_rate": 8.573770213344205e-06, + "loss": 5.0165, + "step": 16800 + }, + { + "epoch": 1.1417991574942248, + "grad_norm": 0.5684378147125244, + "learning_rate": 8.573345563255878e-06, + "loss": 5.1522, + "step": 16805 + }, + { + "epoch": 1.1421388775648866, + "grad_norm": 0.5639727115631104, + "learning_rate": 8.57292091316755e-06, + "loss": 5.1152, + "step": 16810 + }, + { + "epoch": 1.1424785976355483, + "grad_norm": 0.7242740988731384, + "learning_rate": 8.572496263079223e-06, + "loss": 5.178, + "step": 16815 + }, + { + "epoch": 1.1428183177062101, + "grad_norm": 0.47908100485801697, + "learning_rate": 8.572071612990896e-06, + "loss": 5.1682, + "step": 16820 + }, + { + "epoch": 1.143158037776872, + "grad_norm": 0.6055124998092651, + "learning_rate": 8.571646962902569e-06, + "loss": 5.1877, + "step": 16825 + }, + { + "epoch": 1.1434977578475336, + "grad_norm": 0.5923203229904175, + "learning_rate": 8.571222312814242e-06, + "loss": 5.1417, + "step": 16830 + }, + { + "epoch": 1.1438374779181955, + "grad_norm": 0.5136466026306152, + "learning_rate": 8.570797662725914e-06, + "loss": 5.1635, + "step": 16835 + }, + { + "epoch": 1.144177197988857, + "grad_norm": 0.46609851717948914, + "learning_rate": 8.570373012637587e-06, + "loss": 5.255, + "step": 16840 + }, + { + "epoch": 1.144516918059519, + "grad_norm": 0.516668975353241, + "learning_rate": 8.56994836254926e-06, + "loss": 5.0776, + "step": 16845 + }, + { + "epoch": 1.1448566381301808, + "grad_norm": 0.6591651439666748, + "learning_rate": 8.569523712460933e-06, + "loss": 5.2733, + "step": 16850 + }, + { + "epoch": 1.1451963582008424, + "grad_norm": 0.5889518857002258, + "learning_rate": 8.569099062372606e-06, + "loss": 5.1906, + "step": 16855 + }, + { + "epoch": 1.1455360782715043, + "grad_norm": 0.5536397099494934, + "learning_rate": 8.568674412284278e-06, + "loss": 5.268, + "step": 16860 + }, + { + "epoch": 1.1458757983421661, + "grad_norm": 0.5108366012573242, + "learning_rate": 8.568249762195951e-06, + "loss": 4.9861, + "step": 16865 + }, + { + "epoch": 1.1462155184128278, + "grad_norm": 0.529624879360199, + "learning_rate": 8.567825112107624e-06, + "loss": 5.2278, + "step": 16870 + }, + { + "epoch": 1.1465552384834896, + "grad_norm": 0.5870801210403442, + "learning_rate": 8.567400462019297e-06, + "loss": 5.3084, + "step": 16875 + }, + { + "epoch": 1.1468949585541515, + "grad_norm": 0.5665431022644043, + "learning_rate": 8.56697581193097e-06, + "loss": 5.0392, + "step": 16880 + }, + { + "epoch": 1.147234678624813, + "grad_norm": 0.5984089970588684, + "learning_rate": 8.566551161842642e-06, + "loss": 5.3708, + "step": 16885 + }, + { + "epoch": 1.147574398695475, + "grad_norm": 0.6758708357810974, + "learning_rate": 8.566126511754315e-06, + "loss": 5.189, + "step": 16890 + }, + { + "epoch": 1.1479141187661366, + "grad_norm": 0.6593291759490967, + "learning_rate": 8.565701861665988e-06, + "loss": 5.3276, + "step": 16895 + }, + { + "epoch": 1.1482538388367984, + "grad_norm": 0.4883452355861664, + "learning_rate": 8.56527721157766e-06, + "loss": 5.0911, + "step": 16900 + }, + { + "epoch": 1.1485935589074603, + "grad_norm": 0.4610900282859802, + "learning_rate": 8.564852561489334e-06, + "loss": 5.1775, + "step": 16905 + }, + { + "epoch": 1.148933278978122, + "grad_norm": 0.5305609107017517, + "learning_rate": 8.564427911401006e-06, + "loss": 5.0777, + "step": 16910 + }, + { + "epoch": 1.1492729990487838, + "grad_norm": 0.5752338171005249, + "learning_rate": 8.56400326131268e-06, + "loss": 5.0135, + "step": 16915 + }, + { + "epoch": 1.1496127191194456, + "grad_norm": 0.5255418419837952, + "learning_rate": 8.563578611224352e-06, + "loss": 5.0426, + "step": 16920 + }, + { + "epoch": 1.1499524391901073, + "grad_norm": 0.6444478631019592, + "learning_rate": 8.563153961136025e-06, + "loss": 5.1311, + "step": 16925 + }, + { + "epoch": 1.150292159260769, + "grad_norm": 0.589715301990509, + "learning_rate": 8.562729311047698e-06, + "loss": 5.1065, + "step": 16930 + }, + { + "epoch": 1.150631879331431, + "grad_norm": 0.559420108795166, + "learning_rate": 8.56230466095937e-06, + "loss": 5.0985, + "step": 16935 + }, + { + "epoch": 1.1509715994020926, + "grad_norm": 0.42910927534103394, + "learning_rate": 8.561880010871043e-06, + "loss": 5.0216, + "step": 16940 + }, + { + "epoch": 1.1513113194727544, + "grad_norm": 0.43940526247024536, + "learning_rate": 8.561455360782716e-06, + "loss": 5.1715, + "step": 16945 + }, + { + "epoch": 1.1516510395434163, + "grad_norm": 0.4716680347919464, + "learning_rate": 8.561030710694389e-06, + "loss": 4.932, + "step": 16950 + }, + { + "epoch": 1.151990759614078, + "grad_norm": 0.5005847215652466, + "learning_rate": 8.560606060606062e-06, + "loss": 5.1375, + "step": 16955 + }, + { + "epoch": 1.1523304796847398, + "grad_norm": 0.5411086678504944, + "learning_rate": 8.560181410517734e-06, + "loss": 5.0852, + "step": 16960 + }, + { + "epoch": 1.1526701997554016, + "grad_norm": 0.4903919994831085, + "learning_rate": 8.559756760429407e-06, + "loss": 5.1381, + "step": 16965 + }, + { + "epoch": 1.1530099198260633, + "grad_norm": 0.7254168391227722, + "learning_rate": 8.55933211034108e-06, + "loss": 5.0067, + "step": 16970 + }, + { + "epoch": 1.1533496398967251, + "grad_norm": 0.5476894378662109, + "learning_rate": 8.558907460252753e-06, + "loss": 5.0238, + "step": 16975 + }, + { + "epoch": 1.153689359967387, + "grad_norm": 0.5777945518493652, + "learning_rate": 8.558482810164426e-06, + "loss": 5.1925, + "step": 16980 + }, + { + "epoch": 1.1540290800380486, + "grad_norm": 0.5142073631286621, + "learning_rate": 8.558058160076098e-06, + "loss": 5.2484, + "step": 16985 + }, + { + "epoch": 1.1543688001087105, + "grad_norm": 0.5628915429115295, + "learning_rate": 8.557633509987771e-06, + "loss": 5.0489, + "step": 16990 + }, + { + "epoch": 1.1547085201793723, + "grad_norm": 0.42835769057273865, + "learning_rate": 8.557208859899444e-06, + "loss": 5.0956, + "step": 16995 + }, + { + "epoch": 1.155048240250034, + "grad_norm": 0.6267479658126831, + "learning_rate": 8.556784209811117e-06, + "loss": 5.272, + "step": 17000 + }, + { + "epoch": 1.1553879603206958, + "grad_norm": 0.5270193815231323, + "learning_rate": 8.55635955972279e-06, + "loss": 5.3009, + "step": 17005 + }, + { + "epoch": 1.1557276803913574, + "grad_norm": 0.5888186097145081, + "learning_rate": 8.555934909634462e-06, + "loss": 5.4014, + "step": 17010 + }, + { + "epoch": 1.1560674004620193, + "grad_norm": 0.5000048279762268, + "learning_rate": 8.555510259546135e-06, + "loss": 5.2404, + "step": 17015 + }, + { + "epoch": 1.1564071205326811, + "grad_norm": 0.5102828741073608, + "learning_rate": 8.555085609457808e-06, + "loss": 5.1431, + "step": 17020 + }, + { + "epoch": 1.1567468406033428, + "grad_norm": 0.6053318977355957, + "learning_rate": 8.55466095936948e-06, + "loss": 5.3522, + "step": 17025 + }, + { + "epoch": 1.1570865606740046, + "grad_norm": 0.5917315483093262, + "learning_rate": 8.554236309281154e-06, + "loss": 5.3476, + "step": 17030 + }, + { + "epoch": 1.1574262807446665, + "grad_norm": 0.6452617049217224, + "learning_rate": 8.553811659192825e-06, + "loss": 5.154, + "step": 17035 + }, + { + "epoch": 1.157766000815328, + "grad_norm": 0.6618406772613525, + "learning_rate": 8.5533870091045e-06, + "loss": 5.0881, + "step": 17040 + }, + { + "epoch": 1.15810572088599, + "grad_norm": 0.7940272092819214, + "learning_rate": 8.552962359016172e-06, + "loss": 5.4699, + "step": 17045 + }, + { + "epoch": 1.1584454409566518, + "grad_norm": 0.5813175439834595, + "learning_rate": 8.552537708927843e-06, + "loss": 5.0685, + "step": 17050 + }, + { + "epoch": 1.1587851610273134, + "grad_norm": 0.49547865986824036, + "learning_rate": 8.552113058839518e-06, + "loss": 5.2553, + "step": 17055 + }, + { + "epoch": 1.1591248810979753, + "grad_norm": 0.5636591911315918, + "learning_rate": 8.55168840875119e-06, + "loss": 5.0934, + "step": 17060 + }, + { + "epoch": 1.159464601168637, + "grad_norm": 0.5287961363792419, + "learning_rate": 8.551263758662862e-06, + "loss": 5.1496, + "step": 17065 + }, + { + "epoch": 1.1598043212392988, + "grad_norm": 0.432921439409256, + "learning_rate": 8.550839108574536e-06, + "loss": 5.186, + "step": 17070 + }, + { + "epoch": 1.1601440413099606, + "grad_norm": 0.5274278521537781, + "learning_rate": 8.550414458486209e-06, + "loss": 5.122, + "step": 17075 + }, + { + "epoch": 1.1604837613806223, + "grad_norm": 0.7345967888832092, + "learning_rate": 8.54998980839788e-06, + "loss": 5.2438, + "step": 17080 + }, + { + "epoch": 1.1608234814512841, + "grad_norm": 0.5785577893257141, + "learning_rate": 8.549565158309554e-06, + "loss": 5.0196, + "step": 17085 + }, + { + "epoch": 1.161163201521946, + "grad_norm": 0.49798426032066345, + "learning_rate": 8.549140508221227e-06, + "loss": 4.9669, + "step": 17090 + }, + { + "epoch": 1.1615029215926076, + "grad_norm": 0.45752614736557007, + "learning_rate": 8.548715858132898e-06, + "loss": 5.0326, + "step": 17095 + }, + { + "epoch": 1.1618426416632694, + "grad_norm": 0.5068433284759521, + "learning_rate": 8.548291208044573e-06, + "loss": 5.0972, + "step": 17100 + }, + { + "epoch": 1.1621823617339313, + "grad_norm": 0.6017234325408936, + "learning_rate": 8.547866557956244e-06, + "loss": 5.1374, + "step": 17105 + }, + { + "epoch": 1.162522081804593, + "grad_norm": 0.6215944290161133, + "learning_rate": 8.547441907867917e-06, + "loss": 4.7716, + "step": 17110 + }, + { + "epoch": 1.1628618018752548, + "grad_norm": 0.49311500787734985, + "learning_rate": 8.547017257779591e-06, + "loss": 5.3508, + "step": 17115 + }, + { + "epoch": 1.1632015219459166, + "grad_norm": 0.62859046459198, + "learning_rate": 8.546592607691262e-06, + "loss": 5.0772, + "step": 17120 + }, + { + "epoch": 1.1635412420165783, + "grad_norm": 0.5305058360099792, + "learning_rate": 8.546167957602935e-06, + "loss": 5.2722, + "step": 17125 + }, + { + "epoch": 1.1638809620872401, + "grad_norm": 0.40277099609375, + "learning_rate": 8.54574330751461e-06, + "loss": 5.3328, + "step": 17130 + }, + { + "epoch": 1.164220682157902, + "grad_norm": 0.6238910555839539, + "learning_rate": 8.54531865742628e-06, + "loss": 5.0665, + "step": 17135 + }, + { + "epoch": 1.1645604022285636, + "grad_norm": 0.5802484154701233, + "learning_rate": 8.544894007337954e-06, + "loss": 5.5037, + "step": 17140 + }, + { + "epoch": 1.1649001222992255, + "grad_norm": 0.4784940481185913, + "learning_rate": 8.544469357249628e-06, + "loss": 5.1499, + "step": 17145 + }, + { + "epoch": 1.1652398423698873, + "grad_norm": 0.7848225831985474, + "learning_rate": 8.544044707161299e-06, + "loss": 5.1338, + "step": 17150 + }, + { + "epoch": 1.165579562440549, + "grad_norm": 0.4700789153575897, + "learning_rate": 8.543620057072972e-06, + "loss": 5.0112, + "step": 17155 + }, + { + "epoch": 1.1659192825112108, + "grad_norm": 0.4641840159893036, + "learning_rate": 8.543195406984646e-06, + "loss": 5.0932, + "step": 17160 + }, + { + "epoch": 1.1662590025818727, + "grad_norm": 0.5868046283721924, + "learning_rate": 8.542770756896318e-06, + "loss": 5.1017, + "step": 17165 + }, + { + "epoch": 1.1665987226525343, + "grad_norm": 0.42335519194602966, + "learning_rate": 8.54234610680799e-06, + "loss": 4.9702, + "step": 17170 + }, + { + "epoch": 1.1669384427231961, + "grad_norm": 0.7008082270622253, + "learning_rate": 8.541921456719665e-06, + "loss": 5.3932, + "step": 17175 + }, + { + "epoch": 1.1672781627938578, + "grad_norm": 0.662705659866333, + "learning_rate": 8.541496806631336e-06, + "loss": 5.1711, + "step": 17180 + }, + { + "epoch": 1.1676178828645196, + "grad_norm": 0.4120030701160431, + "learning_rate": 8.541072156543009e-06, + "loss": 5.3802, + "step": 17185 + }, + { + "epoch": 1.1679576029351815, + "grad_norm": 0.5472463965415955, + "learning_rate": 8.540647506454682e-06, + "loss": 5.2825, + "step": 17190 + }, + { + "epoch": 1.168297323005843, + "grad_norm": 0.6967541575431824, + "learning_rate": 8.540222856366354e-06, + "loss": 5.2315, + "step": 17195 + }, + { + "epoch": 1.168637043076505, + "grad_norm": 0.5733007788658142, + "learning_rate": 8.539798206278027e-06, + "loss": 5.1455, + "step": 17200 + }, + { + "epoch": 1.1689767631471668, + "grad_norm": 0.3892599046230316, + "learning_rate": 8.5393735561897e-06, + "loss": 5.1267, + "step": 17205 + }, + { + "epoch": 1.1693164832178284, + "grad_norm": 0.6303438544273376, + "learning_rate": 8.538948906101373e-06, + "loss": 5.1382, + "step": 17210 + }, + { + "epoch": 1.1696562032884903, + "grad_norm": 0.5509541630744934, + "learning_rate": 8.538524256013046e-06, + "loss": 5.2872, + "step": 17215 + }, + { + "epoch": 1.1699959233591521, + "grad_norm": 0.4024409055709839, + "learning_rate": 8.538099605924718e-06, + "loss": 5.1063, + "step": 17220 + }, + { + "epoch": 1.1703356434298138, + "grad_norm": 0.4824397563934326, + "learning_rate": 8.537674955836391e-06, + "loss": 5.034, + "step": 17225 + }, + { + "epoch": 1.1706753635004756, + "grad_norm": 0.5697712898254395, + "learning_rate": 8.537250305748064e-06, + "loss": 5.2855, + "step": 17230 + }, + { + "epoch": 1.1710150835711373, + "grad_norm": 0.5855008959770203, + "learning_rate": 8.536825655659737e-06, + "loss": 5.1468, + "step": 17235 + }, + { + "epoch": 1.1713548036417991, + "grad_norm": 0.8414115905761719, + "learning_rate": 8.53640100557141e-06, + "loss": 5.0922, + "step": 17240 + }, + { + "epoch": 1.171694523712461, + "grad_norm": 0.6027839779853821, + "learning_rate": 8.535976355483082e-06, + "loss": 5.1691, + "step": 17245 + }, + { + "epoch": 1.1720342437831226, + "grad_norm": 0.6209452152252197, + "learning_rate": 8.535551705394755e-06, + "loss": 5.2392, + "step": 17250 + }, + { + "epoch": 1.1723739638537845, + "grad_norm": 0.5836121439933777, + "learning_rate": 8.535127055306428e-06, + "loss": 5.1382, + "step": 17255 + }, + { + "epoch": 1.1727136839244463, + "grad_norm": 0.5114483833312988, + "learning_rate": 8.5347024052181e-06, + "loss": 5.1333, + "step": 17260 + }, + { + "epoch": 1.173053403995108, + "grad_norm": 0.44832122325897217, + "learning_rate": 8.534277755129774e-06, + "loss": 5.1532, + "step": 17265 + }, + { + "epoch": 1.1733931240657698, + "grad_norm": 0.588176965713501, + "learning_rate": 8.533853105041446e-06, + "loss": 4.9825, + "step": 17270 + }, + { + "epoch": 1.1737328441364316, + "grad_norm": 0.6506429314613342, + "learning_rate": 8.53342845495312e-06, + "loss": 4.6443, + "step": 17275 + }, + { + "epoch": 1.1740725642070933, + "grad_norm": 0.5602866411209106, + "learning_rate": 8.533003804864792e-06, + "loss": 5.3376, + "step": 17280 + }, + { + "epoch": 1.1744122842777551, + "grad_norm": 0.5424957871437073, + "learning_rate": 8.532579154776465e-06, + "loss": 5.0909, + "step": 17285 + }, + { + "epoch": 1.174752004348417, + "grad_norm": 0.5213866233825684, + "learning_rate": 8.532154504688138e-06, + "loss": 5.2563, + "step": 17290 + }, + { + "epoch": 1.1750917244190786, + "grad_norm": 0.5133030414581299, + "learning_rate": 8.53172985459981e-06, + "loss": 5.1729, + "step": 17295 + }, + { + "epoch": 1.1754314444897405, + "grad_norm": 0.5681331157684326, + "learning_rate": 8.531305204511483e-06, + "loss": 5.4487, + "step": 17300 + }, + { + "epoch": 1.1757711645604023, + "grad_norm": 0.5030602216720581, + "learning_rate": 8.530880554423156e-06, + "loss": 5.0957, + "step": 17305 + }, + { + "epoch": 1.176110884631064, + "grad_norm": 0.666480541229248, + "learning_rate": 8.530455904334829e-06, + "loss": 5.3438, + "step": 17310 + }, + { + "epoch": 1.1764506047017258, + "grad_norm": 0.5442012548446655, + "learning_rate": 8.530031254246502e-06, + "loss": 5.2367, + "step": 17315 + }, + { + "epoch": 1.1767903247723877, + "grad_norm": 0.492850661277771, + "learning_rate": 8.529606604158174e-06, + "loss": 5.4353, + "step": 17320 + }, + { + "epoch": 1.1771300448430493, + "grad_norm": 0.4843572974205017, + "learning_rate": 8.529181954069847e-06, + "loss": 4.9256, + "step": 17325 + }, + { + "epoch": 1.1774697649137111, + "grad_norm": 0.44916319847106934, + "learning_rate": 8.52875730398152e-06, + "loss": 5.2549, + "step": 17330 + }, + { + "epoch": 1.177809484984373, + "grad_norm": 0.4873046576976776, + "learning_rate": 8.528332653893193e-06, + "loss": 5.3087, + "step": 17335 + }, + { + "epoch": 1.1781492050550346, + "grad_norm": 0.5474375486373901, + "learning_rate": 8.527908003804866e-06, + "loss": 5.12, + "step": 17340 + }, + { + "epoch": 1.1784889251256965, + "grad_norm": 0.5064392685890198, + "learning_rate": 8.527483353716538e-06, + "loss": 5.1465, + "step": 17345 + }, + { + "epoch": 1.178828645196358, + "grad_norm": 0.6391626596450806, + "learning_rate": 8.527058703628211e-06, + "loss": 5.0294, + "step": 17350 + }, + { + "epoch": 1.17916836526702, + "grad_norm": 0.5381090044975281, + "learning_rate": 8.526634053539884e-06, + "loss": 5.1638, + "step": 17355 + }, + { + "epoch": 1.1795080853376818, + "grad_norm": 0.5717123746871948, + "learning_rate": 8.526209403451557e-06, + "loss": 5.23, + "step": 17360 + }, + { + "epoch": 1.1798478054083434, + "grad_norm": 0.7199088335037231, + "learning_rate": 8.52578475336323e-06, + "loss": 4.7417, + "step": 17365 + }, + { + "epoch": 1.1801875254790053, + "grad_norm": 0.45192453265190125, + "learning_rate": 8.525360103274902e-06, + "loss": 4.9597, + "step": 17370 + }, + { + "epoch": 1.1805272455496671, + "grad_norm": 0.45699793100357056, + "learning_rate": 8.524935453186575e-06, + "loss": 5.0109, + "step": 17375 + }, + { + "epoch": 1.1808669656203288, + "grad_norm": 0.45968759059906006, + "learning_rate": 8.524510803098248e-06, + "loss": 5.0267, + "step": 17380 + }, + { + "epoch": 1.1812066856909906, + "grad_norm": 0.5879107117652893, + "learning_rate": 8.52408615300992e-06, + "loss": 4.9345, + "step": 17385 + }, + { + "epoch": 1.1815464057616525, + "grad_norm": 0.636303722858429, + "learning_rate": 8.523661502921594e-06, + "loss": 5.2137, + "step": 17390 + }, + { + "epoch": 1.1818861258323141, + "grad_norm": 0.6913169026374817, + "learning_rate": 8.523236852833266e-06, + "loss": 5.1064, + "step": 17395 + }, + { + "epoch": 1.182225845902976, + "grad_norm": 0.5034259557723999, + "learning_rate": 8.52281220274494e-06, + "loss": 4.8861, + "step": 17400 + }, + { + "epoch": 1.1825655659736376, + "grad_norm": 0.5783173441886902, + "learning_rate": 8.522387552656612e-06, + "loss": 4.9628, + "step": 17405 + }, + { + "epoch": 1.1829052860442995, + "grad_norm": 0.49654698371887207, + "learning_rate": 8.521962902568285e-06, + "loss": 5.2226, + "step": 17410 + }, + { + "epoch": 1.1832450061149613, + "grad_norm": 0.5273087024688721, + "learning_rate": 8.521538252479958e-06, + "loss": 5.0892, + "step": 17415 + }, + { + "epoch": 1.183584726185623, + "grad_norm": 0.6313379406929016, + "learning_rate": 8.52111360239163e-06, + "loss": 5.0235, + "step": 17420 + }, + { + "epoch": 1.1839244462562848, + "grad_norm": 0.44520577788352966, + "learning_rate": 8.520688952303303e-06, + "loss": 4.9958, + "step": 17425 + }, + { + "epoch": 1.1842641663269466, + "grad_norm": 0.5765898823738098, + "learning_rate": 8.520264302214976e-06, + "loss": 5.314, + "step": 17430 + }, + { + "epoch": 1.1846038863976083, + "grad_norm": 0.4890117645263672, + "learning_rate": 8.519839652126649e-06, + "loss": 5.162, + "step": 17435 + }, + { + "epoch": 1.1849436064682701, + "grad_norm": 0.6251245141029358, + "learning_rate": 8.519415002038322e-06, + "loss": 5.118, + "step": 17440 + }, + { + "epoch": 1.185283326538932, + "grad_norm": 0.5946666598320007, + "learning_rate": 8.518990351949994e-06, + "loss": 5.1764, + "step": 17445 + }, + { + "epoch": 1.1856230466095936, + "grad_norm": 0.5433018803596497, + "learning_rate": 8.518565701861665e-06, + "loss": 5.0008, + "step": 17450 + }, + { + "epoch": 1.1859627666802555, + "grad_norm": 0.5489576458930969, + "learning_rate": 8.51814105177334e-06, + "loss": 5.1537, + "step": 17455 + }, + { + "epoch": 1.1863024867509173, + "grad_norm": 0.44503721594810486, + "learning_rate": 8.517716401685013e-06, + "loss": 4.9018, + "step": 17460 + }, + { + "epoch": 1.186642206821579, + "grad_norm": 0.5388177037239075, + "learning_rate": 8.517291751596684e-06, + "loss": 5.272, + "step": 17465 + }, + { + "epoch": 1.1869819268922408, + "grad_norm": 0.45363545417785645, + "learning_rate": 8.516867101508358e-06, + "loss": 5.3133, + "step": 17470 + }, + { + "epoch": 1.1873216469629027, + "grad_norm": 0.5588156580924988, + "learning_rate": 8.516442451420031e-06, + "loss": 4.9429, + "step": 17475 + }, + { + "epoch": 1.1876613670335643, + "grad_norm": 0.5019079446792603, + "learning_rate": 8.516017801331702e-06, + "loss": 5.154, + "step": 17480 + }, + { + "epoch": 1.1880010871042261, + "grad_norm": 0.5874198079109192, + "learning_rate": 8.515593151243377e-06, + "loss": 5.2548, + "step": 17485 + }, + { + "epoch": 1.188340807174888, + "grad_norm": 0.5801103711128235, + "learning_rate": 8.51516850115505e-06, + "loss": 4.8545, + "step": 17490 + }, + { + "epoch": 1.1886805272455496, + "grad_norm": 0.582897424697876, + "learning_rate": 8.51474385106672e-06, + "loss": 4.8909, + "step": 17495 + }, + { + "epoch": 1.1890202473162115, + "grad_norm": 0.6797772645950317, + "learning_rate": 8.514319200978395e-06, + "loss": 4.9481, + "step": 17500 + }, + { + "epoch": 1.1893599673868733, + "grad_norm": 0.44199246168136597, + "learning_rate": 8.513894550890068e-06, + "loss": 5.012, + "step": 17505 + }, + { + "epoch": 1.189699687457535, + "grad_norm": 0.448493629693985, + "learning_rate": 8.513469900801739e-06, + "loss": 4.9325, + "step": 17510 + }, + { + "epoch": 1.1900394075281968, + "grad_norm": 0.42922407388687134, + "learning_rate": 8.513045250713414e-06, + "loss": 5.2952, + "step": 17515 + }, + { + "epoch": 1.1903791275988584, + "grad_norm": 0.5276966094970703, + "learning_rate": 8.512620600625086e-06, + "loss": 4.9743, + "step": 17520 + }, + { + "epoch": 1.1907188476695203, + "grad_norm": 0.6991740465164185, + "learning_rate": 8.512195950536758e-06, + "loss": 5.0532, + "step": 17525 + }, + { + "epoch": 1.1910585677401822, + "grad_norm": 0.6155135631561279, + "learning_rate": 8.511771300448432e-06, + "loss": 5.2959, + "step": 17530 + }, + { + "epoch": 1.1913982878108438, + "grad_norm": 0.5116905570030212, + "learning_rate": 8.511346650360103e-06, + "loss": 5.2711, + "step": 17535 + }, + { + "epoch": 1.1917380078815056, + "grad_norm": 0.5786412954330444, + "learning_rate": 8.510922000271776e-06, + "loss": 5.3178, + "step": 17540 + }, + { + "epoch": 1.1920777279521675, + "grad_norm": 0.5412810444831848, + "learning_rate": 8.51049735018345e-06, + "loss": 5.1504, + "step": 17545 + }, + { + "epoch": 1.1924174480228291, + "grad_norm": 0.7021306753158569, + "learning_rate": 8.510072700095122e-06, + "loss": 5.4028, + "step": 17550 + }, + { + "epoch": 1.192757168093491, + "grad_norm": 0.6019464135169983, + "learning_rate": 8.509648050006794e-06, + "loss": 5.2671, + "step": 17555 + }, + { + "epoch": 1.1930968881641528, + "grad_norm": 0.5132253766059875, + "learning_rate": 8.509223399918469e-06, + "loss": 5.1168, + "step": 17560 + }, + { + "epoch": 1.1934366082348145, + "grad_norm": 0.5490529537200928, + "learning_rate": 8.50879874983014e-06, + "loss": 4.8699, + "step": 17565 + }, + { + "epoch": 1.1937763283054763, + "grad_norm": 0.4524442255496979, + "learning_rate": 8.508374099741813e-06, + "loss": 5.038, + "step": 17570 + }, + { + "epoch": 1.194116048376138, + "grad_norm": 0.7515945434570312, + "learning_rate": 8.507949449653487e-06, + "loss": 4.8845, + "step": 17575 + }, + { + "epoch": 1.1944557684467998, + "grad_norm": 0.5269299745559692, + "learning_rate": 8.507524799565158e-06, + "loss": 5.0796, + "step": 17580 + }, + { + "epoch": 1.1947954885174616, + "grad_norm": 0.524671733379364, + "learning_rate": 8.507100149476831e-06, + "loss": 5.2097, + "step": 17585 + }, + { + "epoch": 1.1951352085881233, + "grad_norm": 0.594027578830719, + "learning_rate": 8.506675499388506e-06, + "loss": 4.954, + "step": 17590 + }, + { + "epoch": 1.1954749286587851, + "grad_norm": 0.4342600405216217, + "learning_rate": 8.506250849300177e-06, + "loss": 5.1346, + "step": 17595 + }, + { + "epoch": 1.195814648729447, + "grad_norm": 0.4705079197883606, + "learning_rate": 8.50582619921185e-06, + "loss": 5.0318, + "step": 17600 + }, + { + "epoch": 1.1961543688001086, + "grad_norm": 0.6318638324737549, + "learning_rate": 8.505401549123522e-06, + "loss": 4.9794, + "step": 17605 + }, + { + "epoch": 1.1964940888707705, + "grad_norm": 0.584078311920166, + "learning_rate": 8.504976899035195e-06, + "loss": 4.8004, + "step": 17610 + }, + { + "epoch": 1.1968338089414323, + "grad_norm": 0.6103923320770264, + "learning_rate": 8.504552248946868e-06, + "loss": 5.3796, + "step": 17615 + }, + { + "epoch": 1.197173529012094, + "grad_norm": 0.574604868888855, + "learning_rate": 8.50412759885854e-06, + "loss": 5.0645, + "step": 17620 + }, + { + "epoch": 1.1975132490827558, + "grad_norm": 0.5595641136169434, + "learning_rate": 8.503702948770214e-06, + "loss": 4.8173, + "step": 17625 + }, + { + "epoch": 1.1978529691534177, + "grad_norm": 0.4503292441368103, + "learning_rate": 8.503278298681888e-06, + "loss": 5.0806, + "step": 17630 + }, + { + "epoch": 1.1981926892240793, + "grad_norm": 0.4475191831588745, + "learning_rate": 8.502853648593559e-06, + "loss": 4.7766, + "step": 17635 + }, + { + "epoch": 1.1985324092947411, + "grad_norm": 0.5157948732376099, + "learning_rate": 8.502428998505232e-06, + "loss": 5.2799, + "step": 17640 + }, + { + "epoch": 1.198872129365403, + "grad_norm": 0.47796306014060974, + "learning_rate": 8.502004348416906e-06, + "loss": 4.9803, + "step": 17645 + }, + { + "epoch": 1.1992118494360646, + "grad_norm": 0.5060136318206787, + "learning_rate": 8.501579698328578e-06, + "loss": 5.1434, + "step": 17650 + }, + { + "epoch": 1.1995515695067265, + "grad_norm": 0.5308713316917419, + "learning_rate": 8.50115504824025e-06, + "loss": 5.1375, + "step": 17655 + }, + { + "epoch": 1.1998912895773883, + "grad_norm": 0.49966302514076233, + "learning_rate": 8.500730398151925e-06, + "loss": 5.0833, + "step": 17660 + }, + { + "epoch": 1.20023100964805, + "grad_norm": 0.5251104235649109, + "learning_rate": 8.500305748063596e-06, + "loss": 5.1473, + "step": 17665 + }, + { + "epoch": 1.2005707297187118, + "grad_norm": 0.4810459613800049, + "learning_rate": 8.499881097975269e-06, + "loss": 4.8882, + "step": 17670 + }, + { + "epoch": 1.2009104497893737, + "grad_norm": 0.5159268975257874, + "learning_rate": 8.499541377904608e-06, + "loss": 4.9702, + "step": 17675 + }, + { + "epoch": 1.2012501698600353, + "grad_norm": 0.6098172664642334, + "learning_rate": 8.49911672781628e-06, + "loss": 4.8542, + "step": 17680 + }, + { + "epoch": 1.2015898899306972, + "grad_norm": 0.6538969278335571, + "learning_rate": 8.498692077727953e-06, + "loss": 5.1565, + "step": 17685 + }, + { + "epoch": 1.2019296100013588, + "grad_norm": 0.5220412611961365, + "learning_rate": 8.498267427639626e-06, + "loss": 5.1823, + "step": 17690 + }, + { + "epoch": 1.2022693300720206, + "grad_norm": 0.4935719072818756, + "learning_rate": 8.497842777551299e-06, + "loss": 4.8007, + "step": 17695 + }, + { + "epoch": 1.2026090501426825, + "grad_norm": 0.4920753836631775, + "learning_rate": 8.49741812746297e-06, + "loss": 5.273, + "step": 17700 + }, + { + "epoch": 1.2029487702133441, + "grad_norm": 0.6763972640037537, + "learning_rate": 8.496993477374644e-06, + "loss": 4.9626, + "step": 17705 + }, + { + "epoch": 1.203288490284006, + "grad_norm": 0.49894973635673523, + "learning_rate": 8.496568827286317e-06, + "loss": 5.2482, + "step": 17710 + }, + { + "epoch": 1.2036282103546678, + "grad_norm": 0.5134072303771973, + "learning_rate": 8.496144177197988e-06, + "loss": 4.9185, + "step": 17715 + }, + { + "epoch": 1.2039679304253295, + "grad_norm": 0.44669657945632935, + "learning_rate": 8.495719527109663e-06, + "loss": 5.086, + "step": 17720 + }, + { + "epoch": 1.2043076504959913, + "grad_norm": 0.4487249255180359, + "learning_rate": 8.495294877021336e-06, + "loss": 5.1848, + "step": 17725 + }, + { + "epoch": 1.2046473705666532, + "grad_norm": 0.5922607779502869, + "learning_rate": 8.494870226933007e-06, + "loss": 5.0254, + "step": 17730 + }, + { + "epoch": 1.2049870906373148, + "grad_norm": 0.7203934788703918, + "learning_rate": 8.494445576844681e-06, + "loss": 5.1655, + "step": 17735 + }, + { + "epoch": 1.2053268107079766, + "grad_norm": 0.455587238073349, + "learning_rate": 8.494020926756354e-06, + "loss": 4.9387, + "step": 17740 + }, + { + "epoch": 1.2056665307786383, + "grad_norm": 0.42877423763275146, + "learning_rate": 8.493596276668025e-06, + "loss": 4.8986, + "step": 17745 + }, + { + "epoch": 1.2060062508493001, + "grad_norm": 0.6368880867958069, + "learning_rate": 8.4931716265797e-06, + "loss": 5.1653, + "step": 17750 + }, + { + "epoch": 1.206345970919962, + "grad_norm": 0.6418430209159851, + "learning_rate": 8.492746976491372e-06, + "loss": 4.9882, + "step": 17755 + }, + { + "epoch": 1.2066856909906236, + "grad_norm": 0.5744255185127258, + "learning_rate": 8.492322326403044e-06, + "loss": 4.9672, + "step": 17760 + }, + { + "epoch": 1.2070254110612855, + "grad_norm": 0.5215529799461365, + "learning_rate": 8.491897676314718e-06, + "loss": 5.2823, + "step": 17765 + }, + { + "epoch": 1.2073651311319473, + "grad_norm": 0.5877645611763, + "learning_rate": 8.491473026226391e-06, + "loss": 4.7879, + "step": 17770 + }, + { + "epoch": 1.207704851202609, + "grad_norm": 0.5827008485794067, + "learning_rate": 8.491048376138062e-06, + "loss": 5.1687, + "step": 17775 + }, + { + "epoch": 1.2080445712732708, + "grad_norm": 0.557538628578186, + "learning_rate": 8.490623726049736e-06, + "loss": 5.3756, + "step": 17780 + }, + { + "epoch": 1.2083842913439327, + "grad_norm": 0.4542071521282196, + "learning_rate": 8.490199075961408e-06, + "loss": 5.0722, + "step": 17785 + }, + { + "epoch": 1.2087240114145943, + "grad_norm": 0.4996698498725891, + "learning_rate": 8.48977442587308e-06, + "loss": 5.3618, + "step": 17790 + }, + { + "epoch": 1.2090637314852561, + "grad_norm": 0.46954718232154846, + "learning_rate": 8.489349775784755e-06, + "loss": 5.0788, + "step": 17795 + }, + { + "epoch": 1.209403451555918, + "grad_norm": 0.4806744456291199, + "learning_rate": 8.488925125696426e-06, + "loss": 5.1486, + "step": 17800 + }, + { + "epoch": 1.2097431716265796, + "grad_norm": 0.6481810808181763, + "learning_rate": 8.488500475608099e-06, + "loss": 5.0588, + "step": 17805 + }, + { + "epoch": 1.2100828916972415, + "grad_norm": 0.6107816100120544, + "learning_rate": 8.488075825519773e-06, + "loss": 5.2153, + "step": 17810 + }, + { + "epoch": 1.2104226117679033, + "grad_norm": 0.635086715221405, + "learning_rate": 8.487651175431444e-06, + "loss": 5.1747, + "step": 17815 + }, + { + "epoch": 1.210762331838565, + "grad_norm": 0.4281291365623474, + "learning_rate": 8.487226525343117e-06, + "loss": 4.9604, + "step": 17820 + }, + { + "epoch": 1.2111020519092268, + "grad_norm": 0.8182510733604431, + "learning_rate": 8.486801875254792e-06, + "loss": 5.3324, + "step": 17825 + }, + { + "epoch": 1.2114417719798887, + "grad_norm": 0.5850277543067932, + "learning_rate": 8.486377225166463e-06, + "loss": 5.2306, + "step": 17830 + }, + { + "epoch": 1.2117814920505503, + "grad_norm": 0.5161892771720886, + "learning_rate": 8.485952575078137e-06, + "loss": 5.1338, + "step": 17835 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.5699204206466675, + "learning_rate": 8.48552792498981e-06, + "loss": 5.3336, + "step": 17840 + }, + { + "epoch": 1.212460932191874, + "grad_norm": 0.4911794662475586, + "learning_rate": 8.485103274901481e-06, + "loss": 5.2234, + "step": 17845 + }, + { + "epoch": 1.2128006522625356, + "grad_norm": 0.5413517355918884, + "learning_rate": 8.484678624813156e-06, + "loss": 5.0075, + "step": 17850 + }, + { + "epoch": 1.2131403723331975, + "grad_norm": 0.5483856797218323, + "learning_rate": 8.484253974724827e-06, + "loss": 5.3006, + "step": 17855 + }, + { + "epoch": 1.2134800924038591, + "grad_norm": 0.4160054624080658, + "learning_rate": 8.4838293246365e-06, + "loss": 4.8195, + "step": 17860 + }, + { + "epoch": 1.213819812474521, + "grad_norm": 0.5283341407775879, + "learning_rate": 8.483404674548174e-06, + "loss": 5.1886, + "step": 17865 + }, + { + "epoch": 1.2141595325451828, + "grad_norm": 0.4701025187969208, + "learning_rate": 8.482980024459845e-06, + "loss": 4.8542, + "step": 17870 + }, + { + "epoch": 1.2144992526158445, + "grad_norm": 0.47529688477516174, + "learning_rate": 8.482555374371518e-06, + "loss": 5.1528, + "step": 17875 + }, + { + "epoch": 1.2148389726865063, + "grad_norm": 0.44658610224723816, + "learning_rate": 8.482130724283192e-06, + "loss": 4.9877, + "step": 17880 + }, + { + "epoch": 1.2151786927571682, + "grad_norm": 0.5382369160652161, + "learning_rate": 8.481706074194864e-06, + "loss": 5.2762, + "step": 17885 + }, + { + "epoch": 1.2155184128278298, + "grad_norm": 0.46304208040237427, + "learning_rate": 8.481281424106536e-06, + "loss": 5.1412, + "step": 17890 + }, + { + "epoch": 1.2158581328984917, + "grad_norm": 0.45578184723854065, + "learning_rate": 8.480856774018211e-06, + "loss": 5.0683, + "step": 17895 + }, + { + "epoch": 1.2161978529691535, + "grad_norm": 0.5186563730239868, + "learning_rate": 8.480432123929882e-06, + "loss": 5.271, + "step": 17900 + }, + { + "epoch": 1.2165375730398151, + "grad_norm": 0.5530164837837219, + "learning_rate": 8.480007473841555e-06, + "loss": 4.9778, + "step": 17905 + }, + { + "epoch": 1.216877293110477, + "grad_norm": 0.5134716033935547, + "learning_rate": 8.47958282375323e-06, + "loss": 5.0343, + "step": 17910 + }, + { + "epoch": 1.2172170131811386, + "grad_norm": 0.7116380333900452, + "learning_rate": 8.4791581736649e-06, + "loss": 5.0916, + "step": 17915 + }, + { + "epoch": 1.2175567332518005, + "grad_norm": 0.5594192743301392, + "learning_rate": 8.478733523576573e-06, + "loss": 5.0913, + "step": 17920 + }, + { + "epoch": 1.2178964533224623, + "grad_norm": 0.4650353193283081, + "learning_rate": 8.478308873488246e-06, + "loss": 5.1274, + "step": 17925 + }, + { + "epoch": 1.218236173393124, + "grad_norm": 0.5823315382003784, + "learning_rate": 8.477884223399919e-06, + "loss": 4.9703, + "step": 17930 + }, + { + "epoch": 1.2185758934637858, + "grad_norm": 0.4894219934940338, + "learning_rate": 8.477459573311592e-06, + "loss": 5.2225, + "step": 17935 + }, + { + "epoch": 1.2189156135344477, + "grad_norm": 0.5347648859024048, + "learning_rate": 8.477034923223264e-06, + "loss": 5.2102, + "step": 17940 + }, + { + "epoch": 1.2192553336051093, + "grad_norm": 0.44939783215522766, + "learning_rate": 8.476610273134937e-06, + "loss": 5.3293, + "step": 17945 + }, + { + "epoch": 1.2195950536757711, + "grad_norm": 0.4995860159397125, + "learning_rate": 8.47618562304661e-06, + "loss": 5.0217, + "step": 17950 + }, + { + "epoch": 1.219934773746433, + "grad_norm": 0.7084755897521973, + "learning_rate": 8.475760972958283e-06, + "loss": 5.1167, + "step": 17955 + }, + { + "epoch": 1.2202744938170946, + "grad_norm": 0.5415509343147278, + "learning_rate": 8.475336322869956e-06, + "loss": 5.1065, + "step": 17960 + }, + { + "epoch": 1.2206142138877565, + "grad_norm": 0.48044636845588684, + "learning_rate": 8.474911672781628e-06, + "loss": 4.7677, + "step": 17965 + }, + { + "epoch": 1.2209539339584183, + "grad_norm": 0.4264732599258423, + "learning_rate": 8.474487022693301e-06, + "loss": 4.9731, + "step": 17970 + }, + { + "epoch": 1.22129365402908, + "grad_norm": 0.6690512895584106, + "learning_rate": 8.474062372604974e-06, + "loss": 5.118, + "step": 17975 + }, + { + "epoch": 1.2216333740997418, + "grad_norm": 0.5394269824028015, + "learning_rate": 8.473637722516647e-06, + "loss": 5.0525, + "step": 17980 + }, + { + "epoch": 1.2219730941704037, + "grad_norm": 0.5489320158958435, + "learning_rate": 8.47321307242832e-06, + "loss": 4.8315, + "step": 17985 + }, + { + "epoch": 1.2223128142410653, + "grad_norm": 0.5461826324462891, + "learning_rate": 8.472788422339992e-06, + "loss": 5.2853, + "step": 17990 + }, + { + "epoch": 1.2226525343117272, + "grad_norm": 0.48679089546203613, + "learning_rate": 8.472363772251665e-06, + "loss": 5.0765, + "step": 17995 + }, + { + "epoch": 1.222992254382389, + "grad_norm": 0.8060910701751709, + "learning_rate": 8.471939122163338e-06, + "loss": 5.3172, + "step": 18000 + }, + { + "epoch": 1.2233319744530506, + "grad_norm": 0.5483223795890808, + "learning_rate": 8.47151447207501e-06, + "loss": 5.0307, + "step": 18005 + }, + { + "epoch": 1.2236716945237125, + "grad_norm": 0.5815694332122803, + "learning_rate": 8.471089821986684e-06, + "loss": 5.0583, + "step": 18010 + }, + { + "epoch": 1.2240114145943743, + "grad_norm": 0.4717200696468353, + "learning_rate": 8.470665171898356e-06, + "loss": 5.2191, + "step": 18015 + }, + { + "epoch": 1.224351134665036, + "grad_norm": 0.45134711265563965, + "learning_rate": 8.47024052181003e-06, + "loss": 5.2249, + "step": 18020 + }, + { + "epoch": 1.2246908547356978, + "grad_norm": 0.5618113279342651, + "learning_rate": 8.469815871721702e-06, + "loss": 5.0737, + "step": 18025 + }, + { + "epoch": 1.2250305748063597, + "grad_norm": 0.6287793517112732, + "learning_rate": 8.469391221633375e-06, + "loss": 5.0235, + "step": 18030 + }, + { + "epoch": 1.2253702948770213, + "grad_norm": 0.47332385182380676, + "learning_rate": 8.468966571545048e-06, + "loss": 5.2496, + "step": 18035 + }, + { + "epoch": 1.2257100149476832, + "grad_norm": 0.590364933013916, + "learning_rate": 8.46854192145672e-06, + "loss": 5.1941, + "step": 18040 + }, + { + "epoch": 1.2260497350183448, + "grad_norm": 0.6986119747161865, + "learning_rate": 8.468117271368393e-06, + "loss": 5.0276, + "step": 18045 + }, + { + "epoch": 1.2263894550890067, + "grad_norm": 0.518956184387207, + "learning_rate": 8.467692621280066e-06, + "loss": 5.0316, + "step": 18050 + }, + { + "epoch": 1.2267291751596685, + "grad_norm": 0.6444506645202637, + "learning_rate": 8.467267971191739e-06, + "loss": 5.0441, + "step": 18055 + }, + { + "epoch": 1.2270688952303301, + "grad_norm": 0.7313891649246216, + "learning_rate": 8.466843321103412e-06, + "loss": 5.1028, + "step": 18060 + }, + { + "epoch": 1.227408615300992, + "grad_norm": 0.45634204149246216, + "learning_rate": 8.466418671015084e-06, + "loss": 5.1272, + "step": 18065 + }, + { + "epoch": 1.2277483353716538, + "grad_norm": 0.591414749622345, + "learning_rate": 8.465994020926757e-06, + "loss": 5.1229, + "step": 18070 + }, + { + "epoch": 1.2280880554423155, + "grad_norm": 0.4718937575817108, + "learning_rate": 8.46556937083843e-06, + "loss": 5.2367, + "step": 18075 + }, + { + "epoch": 1.2284277755129773, + "grad_norm": 0.6031748652458191, + "learning_rate": 8.465144720750103e-06, + "loss": 5.1073, + "step": 18080 + }, + { + "epoch": 1.228767495583639, + "grad_norm": 0.6214359998703003, + "learning_rate": 8.464720070661776e-06, + "loss": 5.0383, + "step": 18085 + }, + { + "epoch": 1.2291072156543008, + "grad_norm": 0.5480508804321289, + "learning_rate": 8.464295420573448e-06, + "loss": 5.0659, + "step": 18090 + }, + { + "epoch": 1.2294469357249627, + "grad_norm": 0.6222751140594482, + "learning_rate": 8.463870770485121e-06, + "loss": 5.0021, + "step": 18095 + }, + { + "epoch": 1.2297866557956243, + "grad_norm": 0.45119529962539673, + "learning_rate": 8.463446120396794e-06, + "loss": 5.329, + "step": 18100 + }, + { + "epoch": 1.2301263758662861, + "grad_norm": 0.5139843821525574, + "learning_rate": 8.463021470308467e-06, + "loss": 4.9993, + "step": 18105 + }, + { + "epoch": 1.230466095936948, + "grad_norm": 0.48846492171287537, + "learning_rate": 8.46259682022014e-06, + "loss": 5.0739, + "step": 18110 + }, + { + "epoch": 1.2308058160076096, + "grad_norm": 0.5248334407806396, + "learning_rate": 8.462172170131812e-06, + "loss": 5.1895, + "step": 18115 + }, + { + "epoch": 1.2311455360782715, + "grad_norm": 0.4982921779155731, + "learning_rate": 8.461747520043485e-06, + "loss": 5.0496, + "step": 18120 + }, + { + "epoch": 1.2314852561489333, + "grad_norm": 0.4668184220790863, + "learning_rate": 8.461322869955158e-06, + "loss": 4.7742, + "step": 18125 + }, + { + "epoch": 1.231824976219595, + "grad_norm": 0.5218693017959595, + "learning_rate": 8.460898219866829e-06, + "loss": 5.0871, + "step": 18130 + }, + { + "epoch": 1.2321646962902568, + "grad_norm": 0.4936673045158386, + "learning_rate": 8.460473569778504e-06, + "loss": 5.0979, + "step": 18135 + }, + { + "epoch": 1.2325044163609187, + "grad_norm": 0.47885528206825256, + "learning_rate": 8.460048919690176e-06, + "loss": 5.2046, + "step": 18140 + }, + { + "epoch": 1.2328441364315803, + "grad_norm": 0.6647428274154663, + "learning_rate": 8.459624269601848e-06, + "loss": 5.1171, + "step": 18145 + }, + { + "epoch": 1.2331838565022422, + "grad_norm": 0.519589900970459, + "learning_rate": 8.459199619513522e-06, + "loss": 4.8734, + "step": 18150 + }, + { + "epoch": 1.233523576572904, + "grad_norm": 0.4722979962825775, + "learning_rate": 8.458774969425195e-06, + "loss": 4.9256, + "step": 18155 + }, + { + "epoch": 1.2338632966435656, + "grad_norm": 0.5551944375038147, + "learning_rate": 8.458350319336866e-06, + "loss": 5.0762, + "step": 18160 + }, + { + "epoch": 1.2342030167142275, + "grad_norm": 0.7264661192893982, + "learning_rate": 8.45792566924854e-06, + "loss": 5.0589, + "step": 18165 + }, + { + "epoch": 1.2345427367848893, + "grad_norm": 0.5895290374755859, + "learning_rate": 8.457501019160213e-06, + "loss": 5.0498, + "step": 18170 + }, + { + "epoch": 1.234882456855551, + "grad_norm": 0.5472476482391357, + "learning_rate": 8.457076369071886e-06, + "loss": 5.0374, + "step": 18175 + }, + { + "epoch": 1.2352221769262128, + "grad_norm": 0.5245071649551392, + "learning_rate": 8.456651718983559e-06, + "loss": 4.9679, + "step": 18180 + }, + { + "epoch": 1.2355618969968747, + "grad_norm": 0.5411975979804993, + "learning_rate": 8.456227068895232e-06, + "loss": 5.1187, + "step": 18185 + }, + { + "epoch": 1.2359016170675363, + "grad_norm": 0.5390090942382812, + "learning_rate": 8.455802418806904e-06, + "loss": 5.0181, + "step": 18190 + }, + { + "epoch": 1.2362413371381982, + "grad_norm": 0.46749719977378845, + "learning_rate": 8.455377768718577e-06, + "loss": 5.0208, + "step": 18195 + }, + { + "epoch": 1.23658105720886, + "grad_norm": 0.4856654107570648, + "learning_rate": 8.454953118630248e-06, + "loss": 5.0967, + "step": 18200 + }, + { + "epoch": 1.2369207772795217, + "grad_norm": 0.5260303616523743, + "learning_rate": 8.454528468541923e-06, + "loss": 4.9784, + "step": 18205 + }, + { + "epoch": 1.2372604973501835, + "grad_norm": 0.576423168182373, + "learning_rate": 8.454103818453596e-06, + "loss": 4.9198, + "step": 18210 + }, + { + "epoch": 1.2376002174208451, + "grad_norm": 0.5354751944541931, + "learning_rate": 8.453679168365267e-06, + "loss": 5.1806, + "step": 18215 + }, + { + "epoch": 1.237939937491507, + "grad_norm": 0.618549108505249, + "learning_rate": 8.453254518276941e-06, + "loss": 4.9405, + "step": 18220 + }, + { + "epoch": 1.2382796575621688, + "grad_norm": 0.482555627822876, + "learning_rate": 8.452829868188614e-06, + "loss": 5.2378, + "step": 18225 + }, + { + "epoch": 1.2386193776328305, + "grad_norm": 0.5864008069038391, + "learning_rate": 8.452405218100285e-06, + "loss": 5.1308, + "step": 18230 + }, + { + "epoch": 1.2389590977034923, + "grad_norm": 0.4534913897514343, + "learning_rate": 8.45198056801196e-06, + "loss": 4.9842, + "step": 18235 + }, + { + "epoch": 1.2392988177741542, + "grad_norm": 0.42258796095848083, + "learning_rate": 8.451555917923632e-06, + "loss": 4.8747, + "step": 18240 + }, + { + "epoch": 1.2396385378448158, + "grad_norm": 0.5883085131645203, + "learning_rate": 8.451131267835304e-06, + "loss": 5.1845, + "step": 18245 + }, + { + "epoch": 1.2399782579154777, + "grad_norm": 0.4216248393058777, + "learning_rate": 8.450706617746978e-06, + "loss": 4.9618, + "step": 18250 + }, + { + "epoch": 1.2403179779861393, + "grad_norm": 0.4601632356643677, + "learning_rate": 8.45028196765865e-06, + "loss": 4.9722, + "step": 18255 + }, + { + "epoch": 1.2406576980568012, + "grad_norm": 0.5462074279785156, + "learning_rate": 8.449857317570322e-06, + "loss": 5.0553, + "step": 18260 + }, + { + "epoch": 1.240997418127463, + "grad_norm": 0.5333732962608337, + "learning_rate": 8.449432667481996e-06, + "loss": 5.1484, + "step": 18265 + }, + { + "epoch": 1.2413371381981246, + "grad_norm": 0.6458606123924255, + "learning_rate": 8.449008017393668e-06, + "loss": 4.9159, + "step": 18270 + }, + { + "epoch": 1.2416768582687865, + "grad_norm": 0.5641586780548096, + "learning_rate": 8.44858336730534e-06, + "loss": 4.8478, + "step": 18275 + }, + { + "epoch": 1.2420165783394483, + "grad_norm": 0.49163708090782166, + "learning_rate": 8.448158717217015e-06, + "loss": 5.0857, + "step": 18280 + }, + { + "epoch": 1.24235629841011, + "grad_norm": 0.5430234670639038, + "learning_rate": 8.447734067128686e-06, + "loss": 5.2405, + "step": 18285 + }, + { + "epoch": 1.2426960184807718, + "grad_norm": 0.5302841067314148, + "learning_rate": 8.447309417040359e-06, + "loss": 4.8681, + "step": 18290 + }, + { + "epoch": 1.2430357385514337, + "grad_norm": 0.46207672357559204, + "learning_rate": 8.446884766952033e-06, + "loss": 5.0268, + "step": 18295 + }, + { + "epoch": 1.2433754586220953, + "grad_norm": 0.6851187348365784, + "learning_rate": 8.446460116863704e-06, + "loss": 5.0582, + "step": 18300 + }, + { + "epoch": 1.2437151786927572, + "grad_norm": 0.45764070749282837, + "learning_rate": 8.446035466775377e-06, + "loss": 5.0654, + "step": 18305 + }, + { + "epoch": 1.244054898763419, + "grad_norm": 0.50992351770401, + "learning_rate": 8.445610816687052e-06, + "loss": 5.3187, + "step": 18310 + }, + { + "epoch": 1.2443946188340806, + "grad_norm": 0.4920012652873993, + "learning_rate": 8.445186166598723e-06, + "loss": 4.9379, + "step": 18315 + }, + { + "epoch": 1.2447343389047425, + "grad_norm": 0.560835599899292, + "learning_rate": 8.444761516510396e-06, + "loss": 5.1919, + "step": 18320 + }, + { + "epoch": 1.2450740589754044, + "grad_norm": 0.4867154359817505, + "learning_rate": 8.44433686642207e-06, + "loss": 4.9709, + "step": 18325 + }, + { + "epoch": 1.245413779046066, + "grad_norm": 0.5259329676628113, + "learning_rate": 8.443912216333741e-06, + "loss": 4.8412, + "step": 18330 + }, + { + "epoch": 1.2457534991167278, + "grad_norm": 0.6392688155174255, + "learning_rate": 8.443487566245414e-06, + "loss": 5.2032, + "step": 18335 + }, + { + "epoch": 1.2460932191873897, + "grad_norm": 0.46285155415534973, + "learning_rate": 8.443062916157088e-06, + "loss": 5.032, + "step": 18340 + }, + { + "epoch": 1.2464329392580513, + "grad_norm": 0.4752609133720398, + "learning_rate": 8.44263826606876e-06, + "loss": 4.944, + "step": 18345 + }, + { + "epoch": 1.2467726593287132, + "grad_norm": 0.4495883285999298, + "learning_rate": 8.442213615980432e-06, + "loss": 5.0169, + "step": 18350 + }, + { + "epoch": 1.247112379399375, + "grad_norm": 0.43115541338920593, + "learning_rate": 8.441788965892105e-06, + "loss": 5.2422, + "step": 18355 + }, + { + "epoch": 1.2474520994700367, + "grad_norm": 0.47078049182891846, + "learning_rate": 8.441364315803778e-06, + "loss": 5.0481, + "step": 18360 + }, + { + "epoch": 1.2477918195406985, + "grad_norm": 0.3373432755470276, + "learning_rate": 8.44093966571545e-06, + "loss": 5.0052, + "step": 18365 + }, + { + "epoch": 1.2481315396113604, + "grad_norm": 0.5268682241439819, + "learning_rate": 8.440515015627124e-06, + "loss": 4.9774, + "step": 18370 + }, + { + "epoch": 1.248471259682022, + "grad_norm": 0.4129626154899597, + "learning_rate": 8.440090365538796e-06, + "loss": 5.223, + "step": 18375 + }, + { + "epoch": 1.2488109797526838, + "grad_norm": 0.48684564232826233, + "learning_rate": 8.43966571545047e-06, + "loss": 4.7853, + "step": 18380 + }, + { + "epoch": 1.2491506998233455, + "grad_norm": 0.5593785643577576, + "learning_rate": 8.439241065362142e-06, + "loss": 5.3688, + "step": 18385 + }, + { + "epoch": 1.2494904198940073, + "grad_norm": 0.4916324317455292, + "learning_rate": 8.438816415273815e-06, + "loss": 4.8964, + "step": 18390 + }, + { + "epoch": 1.2498301399646692, + "grad_norm": 0.4972854554653168, + "learning_rate": 8.438391765185488e-06, + "loss": 4.7298, + "step": 18395 + }, + { + "epoch": 1.2501698600353308, + "grad_norm": 0.43039900064468384, + "learning_rate": 8.43796711509716e-06, + "loss": 5.1394, + "step": 18400 + }, + { + "epoch": 1.2505095801059927, + "grad_norm": 0.47865742444992065, + "learning_rate": 8.437542465008833e-06, + "loss": 5.1024, + "step": 18405 + }, + { + "epoch": 1.2508493001766543, + "grad_norm": 0.45308759808540344, + "learning_rate": 8.437117814920506e-06, + "loss": 5.0825, + "step": 18410 + }, + { + "epoch": 1.2511890202473162, + "grad_norm": 0.4890417754650116, + "learning_rate": 8.436693164832179e-06, + "loss": 4.7081, + "step": 18415 + }, + { + "epoch": 1.251528740317978, + "grad_norm": 0.5339360237121582, + "learning_rate": 8.436268514743852e-06, + "loss": 5.1328, + "step": 18420 + }, + { + "epoch": 1.2518684603886396, + "grad_norm": 0.6388021111488342, + "learning_rate": 8.435843864655524e-06, + "loss": 5.2334, + "step": 18425 + }, + { + "epoch": 1.2522081804593015, + "grad_norm": 0.5120349526405334, + "learning_rate": 8.435419214567197e-06, + "loss": 5.1294, + "step": 18430 + }, + { + "epoch": 1.2525479005299633, + "grad_norm": 0.5196710228919983, + "learning_rate": 8.43499456447887e-06, + "loss": 4.9846, + "step": 18435 + }, + { + "epoch": 1.252887620600625, + "grad_norm": 0.47590968012809753, + "learning_rate": 8.434569914390543e-06, + "loss": 5.2423, + "step": 18440 + }, + { + "epoch": 1.2532273406712868, + "grad_norm": 0.5277749300003052, + "learning_rate": 8.434145264302216e-06, + "loss": 5.1552, + "step": 18445 + }, + { + "epoch": 1.2535670607419487, + "grad_norm": 0.47048500180244446, + "learning_rate": 8.433720614213888e-06, + "loss": 4.9706, + "step": 18450 + }, + { + "epoch": 1.2539067808126103, + "grad_norm": 0.4690874516963959, + "learning_rate": 8.433295964125561e-06, + "loss": 5.0598, + "step": 18455 + }, + { + "epoch": 1.2542465008832722, + "grad_norm": 0.4915511906147003, + "learning_rate": 8.432871314037234e-06, + "loss": 5.0815, + "step": 18460 + }, + { + "epoch": 1.254586220953934, + "grad_norm": 0.5588038563728333, + "learning_rate": 8.432446663948907e-06, + "loss": 4.766, + "step": 18465 + }, + { + "epoch": 1.2549259410245956, + "grad_norm": 0.5935094356536865, + "learning_rate": 8.43202201386058e-06, + "loss": 4.7738, + "step": 18470 + }, + { + "epoch": 1.2552656610952575, + "grad_norm": 0.6051486730575562, + "learning_rate": 8.431597363772252e-06, + "loss": 4.9491, + "step": 18475 + }, + { + "epoch": 1.2556053811659194, + "grad_norm": 0.5222749710083008, + "learning_rate": 8.431172713683925e-06, + "loss": 4.8853, + "step": 18480 + }, + { + "epoch": 1.255945101236581, + "grad_norm": 0.4954664409160614, + "learning_rate": 8.430748063595598e-06, + "loss": 4.9163, + "step": 18485 + }, + { + "epoch": 1.2562848213072428, + "grad_norm": 0.5171204209327698, + "learning_rate": 8.43032341350727e-06, + "loss": 5.0601, + "step": 18490 + }, + { + "epoch": 1.2566245413779047, + "grad_norm": 0.4613696336746216, + "learning_rate": 8.429898763418944e-06, + "loss": 4.9664, + "step": 18495 + }, + { + "epoch": 1.2569642614485663, + "grad_norm": 0.5877586603164673, + "learning_rate": 8.429474113330616e-06, + "loss": 5.0772, + "step": 18500 + }, + { + "epoch": 1.2573039815192282, + "grad_norm": 0.6230931282043457, + "learning_rate": 8.42904946324229e-06, + "loss": 5.3616, + "step": 18505 + }, + { + "epoch": 1.25764370158989, + "grad_norm": 0.5127224922180176, + "learning_rate": 8.428624813153962e-06, + "loss": 5.1541, + "step": 18510 + }, + { + "epoch": 1.2579834216605517, + "grad_norm": 0.48090341687202454, + "learning_rate": 8.428200163065635e-06, + "loss": 5.1567, + "step": 18515 + }, + { + "epoch": 1.2583231417312135, + "grad_norm": 0.6177384257316589, + "learning_rate": 8.427775512977308e-06, + "loss": 5.1769, + "step": 18520 + }, + { + "epoch": 1.2586628618018754, + "grad_norm": 0.59925377368927, + "learning_rate": 8.42735086288898e-06, + "loss": 5.3453, + "step": 18525 + }, + { + "epoch": 1.259002581872537, + "grad_norm": 0.4504432678222656, + "learning_rate": 8.426926212800653e-06, + "loss": 5.1706, + "step": 18530 + }, + { + "epoch": 1.2593423019431988, + "grad_norm": 0.4805571436882019, + "learning_rate": 8.426501562712326e-06, + "loss": 5.0625, + "step": 18535 + }, + { + "epoch": 1.2596820220138607, + "grad_norm": 0.4823440611362457, + "learning_rate": 8.426076912623999e-06, + "loss": 4.769, + "step": 18540 + }, + { + "epoch": 1.2600217420845223, + "grad_norm": 0.5900486707687378, + "learning_rate": 8.425652262535672e-06, + "loss": 5.3525, + "step": 18545 + }, + { + "epoch": 1.2603614621551842, + "grad_norm": 0.47625967860221863, + "learning_rate": 8.425227612447344e-06, + "loss": 5.4063, + "step": 18550 + }, + { + "epoch": 1.260701182225846, + "grad_norm": 0.6220616698265076, + "learning_rate": 8.424802962359017e-06, + "loss": 5.051, + "step": 18555 + }, + { + "epoch": 1.2610409022965077, + "grad_norm": 0.46590927243232727, + "learning_rate": 8.42437831227069e-06, + "loss": 5.1818, + "step": 18560 + }, + { + "epoch": 1.2613806223671695, + "grad_norm": 0.5331813097000122, + "learning_rate": 8.423953662182363e-06, + "loss": 5.0658, + "step": 18565 + }, + { + "epoch": 1.2617203424378312, + "grad_norm": 0.5343018770217896, + "learning_rate": 8.423529012094036e-06, + "loss": 5.2005, + "step": 18570 + }, + { + "epoch": 1.262060062508493, + "grad_norm": 0.498983770608902, + "learning_rate": 8.423104362005708e-06, + "loss": 4.949, + "step": 18575 + }, + { + "epoch": 1.2623997825791546, + "grad_norm": 0.5224140882492065, + "learning_rate": 8.422679711917381e-06, + "loss": 4.9368, + "step": 18580 + }, + { + "epoch": 1.2627395026498165, + "grad_norm": 0.6526961326599121, + "learning_rate": 8.422255061829054e-06, + "loss": 4.896, + "step": 18585 + }, + { + "epoch": 1.2630792227204783, + "grad_norm": 0.47392964363098145, + "learning_rate": 8.421830411740727e-06, + "loss": 5.1874, + "step": 18590 + }, + { + "epoch": 1.26341894279114, + "grad_norm": 0.52006596326828, + "learning_rate": 8.4214057616524e-06, + "loss": 4.9613, + "step": 18595 + }, + { + "epoch": 1.2637586628618018, + "grad_norm": 0.5167462825775146, + "learning_rate": 8.420981111564072e-06, + "loss": 5.1195, + "step": 18600 + }, + { + "epoch": 1.2640983829324637, + "grad_norm": 0.4817925691604614, + "learning_rate": 8.420556461475745e-06, + "loss": 5.2158, + "step": 18605 + }, + { + "epoch": 1.2644381030031253, + "grad_norm": 0.44854408502578735, + "learning_rate": 8.420131811387418e-06, + "loss": 4.709, + "step": 18610 + }, + { + "epoch": 1.2647778230737872, + "grad_norm": 0.39555060863494873, + "learning_rate": 8.419707161299089e-06, + "loss": 5.1177, + "step": 18615 + }, + { + "epoch": 1.265117543144449, + "grad_norm": 0.47922423481941223, + "learning_rate": 8.419282511210764e-06, + "loss": 5.4188, + "step": 18620 + }, + { + "epoch": 1.2654572632151107, + "grad_norm": 0.4994868040084839, + "learning_rate": 8.418857861122436e-06, + "loss": 4.9794, + "step": 18625 + }, + { + "epoch": 1.2657969832857725, + "grad_norm": 0.49992647767066956, + "learning_rate": 8.418433211034108e-06, + "loss": 4.9635, + "step": 18630 + }, + { + "epoch": 1.2661367033564344, + "grad_norm": 0.44357073307037354, + "learning_rate": 8.418008560945782e-06, + "loss": 5.2483, + "step": 18635 + }, + { + "epoch": 1.266476423427096, + "grad_norm": 0.5342909693717957, + "learning_rate": 8.417583910857455e-06, + "loss": 4.9566, + "step": 18640 + }, + { + "epoch": 1.2668161434977578, + "grad_norm": 0.4987943768501282, + "learning_rate": 8.417159260769126e-06, + "loss": 4.8205, + "step": 18645 + }, + { + "epoch": 1.2671558635684197, + "grad_norm": 0.5964763760566711, + "learning_rate": 8.4167346106808e-06, + "loss": 4.9809, + "step": 18650 + }, + { + "epoch": 1.2674955836390813, + "grad_norm": 0.4263489544391632, + "learning_rate": 8.416309960592473e-06, + "loss": 5.0245, + "step": 18655 + }, + { + "epoch": 1.2678353037097432, + "grad_norm": 0.5335677862167358, + "learning_rate": 8.415885310504144e-06, + "loss": 5.3764, + "step": 18660 + }, + { + "epoch": 1.268175023780405, + "grad_norm": 0.5421167016029358, + "learning_rate": 8.415460660415819e-06, + "loss": 5.1387, + "step": 18665 + }, + { + "epoch": 1.2685147438510667, + "grad_norm": 0.6402767896652222, + "learning_rate": 8.415036010327492e-06, + "loss": 5.0367, + "step": 18670 + }, + { + "epoch": 1.2688544639217285, + "grad_norm": 0.5957735180854797, + "learning_rate": 8.414611360239163e-06, + "loss": 5.0183, + "step": 18675 + }, + { + "epoch": 1.2691941839923904, + "grad_norm": 0.5015127658843994, + "learning_rate": 8.414186710150837e-06, + "loss": 4.8848, + "step": 18680 + }, + { + "epoch": 1.269533904063052, + "grad_norm": 0.4848366975784302, + "learning_rate": 8.41376206006251e-06, + "loss": 5.1997, + "step": 18685 + }, + { + "epoch": 1.2698736241337139, + "grad_norm": 0.47941768169403076, + "learning_rate": 8.413337409974181e-06, + "loss": 5.1613, + "step": 18690 + }, + { + "epoch": 1.2702133442043757, + "grad_norm": 0.7085881233215332, + "learning_rate": 8.412912759885856e-06, + "loss": 5.281, + "step": 18695 + }, + { + "epoch": 1.2705530642750373, + "grad_norm": 0.5275726318359375, + "learning_rate": 8.412488109797527e-06, + "loss": 5.1711, + "step": 18700 + }, + { + "epoch": 1.2708927843456992, + "grad_norm": 0.4694676101207733, + "learning_rate": 8.4120634597092e-06, + "loss": 4.9086, + "step": 18705 + }, + { + "epoch": 1.271232504416361, + "grad_norm": 0.4674123227596283, + "learning_rate": 8.411638809620874e-06, + "loss": 5.1092, + "step": 18710 + }, + { + "epoch": 1.2715722244870227, + "grad_norm": 0.6145933270454407, + "learning_rate": 8.411214159532545e-06, + "loss": 4.8838, + "step": 18715 + }, + { + "epoch": 1.2719119445576845, + "grad_norm": 0.5079854726791382, + "learning_rate": 8.410789509444218e-06, + "loss": 4.9341, + "step": 18720 + }, + { + "epoch": 1.2722516646283464, + "grad_norm": 0.451139897108078, + "learning_rate": 8.410364859355892e-06, + "loss": 5.1616, + "step": 18725 + }, + { + "epoch": 1.272591384699008, + "grad_norm": 0.5025660991668701, + "learning_rate": 8.409940209267564e-06, + "loss": 4.8478, + "step": 18730 + }, + { + "epoch": 1.2729311047696699, + "grad_norm": 0.38279250264167786, + "learning_rate": 8.409515559179236e-06, + "loss": 4.9486, + "step": 18735 + }, + { + "epoch": 1.2732708248403315, + "grad_norm": 0.45434755086898804, + "learning_rate": 8.40909090909091e-06, + "loss": 4.8417, + "step": 18740 + }, + { + "epoch": 1.2736105449109933, + "grad_norm": 0.49401405453681946, + "learning_rate": 8.408666259002582e-06, + "loss": 5.0071, + "step": 18745 + }, + { + "epoch": 1.273950264981655, + "grad_norm": 0.5394780039787292, + "learning_rate": 8.408241608914255e-06, + "loss": 5.0794, + "step": 18750 + }, + { + "epoch": 1.2742899850523168, + "grad_norm": 0.6512455940246582, + "learning_rate": 8.40781695882593e-06, + "loss": 5.3076, + "step": 18755 + }, + { + "epoch": 1.2746297051229787, + "grad_norm": 0.4740900695323944, + "learning_rate": 8.4073923087376e-06, + "loss": 5.0092, + "step": 18760 + }, + { + "epoch": 1.2749694251936403, + "grad_norm": 0.4854411780834198, + "learning_rate": 8.406967658649273e-06, + "loss": 4.9627, + "step": 18765 + }, + { + "epoch": 1.2753091452643022, + "grad_norm": 0.3955738842487335, + "learning_rate": 8.406543008560946e-06, + "loss": 5.0856, + "step": 18770 + }, + { + "epoch": 1.275648865334964, + "grad_norm": 0.5668619871139526, + "learning_rate": 8.406118358472619e-06, + "loss": 4.9986, + "step": 18775 + }, + { + "epoch": 1.2759885854056257, + "grad_norm": 0.4159271717071533, + "learning_rate": 8.405693708384292e-06, + "loss": 5.1278, + "step": 18780 + }, + { + "epoch": 1.2763283054762875, + "grad_norm": 0.6335753798484802, + "learning_rate": 8.405269058295964e-06, + "loss": 5.0604, + "step": 18785 + }, + { + "epoch": 1.2766680255469494, + "grad_norm": 0.555728554725647, + "learning_rate": 8.404844408207637e-06, + "loss": 5.2291, + "step": 18790 + }, + { + "epoch": 1.277007745617611, + "grad_norm": 0.46713128685951233, + "learning_rate": 8.40441975811931e-06, + "loss": 5.031, + "step": 18795 + }, + { + "epoch": 1.2773474656882728, + "grad_norm": 0.5682691335678101, + "learning_rate": 8.403995108030983e-06, + "loss": 5.0368, + "step": 18800 + }, + { + "epoch": 1.2776871857589347, + "grad_norm": 0.42160892486572266, + "learning_rate": 8.403570457942656e-06, + "loss": 5.1099, + "step": 18805 + }, + { + "epoch": 1.2780269058295963, + "grad_norm": 0.4778864085674286, + "learning_rate": 8.403145807854328e-06, + "loss": 5.0856, + "step": 18810 + }, + { + "epoch": 1.2783666259002582, + "grad_norm": 0.5695703625679016, + "learning_rate": 8.402721157766001e-06, + "loss": 5.0295, + "step": 18815 + }, + { + "epoch": 1.27870634597092, + "grad_norm": 0.5775588750839233, + "learning_rate": 8.402296507677674e-06, + "loss": 5.1294, + "step": 18820 + }, + { + "epoch": 1.2790460660415817, + "grad_norm": 0.39709392189979553, + "learning_rate": 8.401871857589347e-06, + "loss": 4.9575, + "step": 18825 + }, + { + "epoch": 1.2793857861122435, + "grad_norm": 0.44119584560394287, + "learning_rate": 8.40144720750102e-06, + "loss": 4.9395, + "step": 18830 + }, + { + "epoch": 1.2797255061829054, + "grad_norm": 0.564111590385437, + "learning_rate": 8.401022557412692e-06, + "loss": 5.2062, + "step": 18835 + }, + { + "epoch": 1.280065226253567, + "grad_norm": 0.49311283230781555, + "learning_rate": 8.400597907324365e-06, + "loss": 4.8853, + "step": 18840 + }, + { + "epoch": 1.2804049463242289, + "grad_norm": 0.4477153718471527, + "learning_rate": 8.400173257236038e-06, + "loss": 5.0028, + "step": 18845 + }, + { + "epoch": 1.2807446663948907, + "grad_norm": 0.5436672568321228, + "learning_rate": 8.39974860714771e-06, + "loss": 4.8939, + "step": 18850 + }, + { + "epoch": 1.2810843864655523, + "grad_norm": 0.5215460062026978, + "learning_rate": 8.399323957059384e-06, + "loss": 4.7607, + "step": 18855 + }, + { + "epoch": 1.2814241065362142, + "grad_norm": 0.4188626706600189, + "learning_rate": 8.398899306971056e-06, + "loss": 5.1455, + "step": 18860 + }, + { + "epoch": 1.281763826606876, + "grad_norm": 0.4844125509262085, + "learning_rate": 8.398474656882729e-06, + "loss": 5.1427, + "step": 18865 + }, + { + "epoch": 1.2821035466775377, + "grad_norm": 0.46652889251708984, + "learning_rate": 8.398050006794402e-06, + "loss": 5.1733, + "step": 18870 + }, + { + "epoch": 1.2824432667481995, + "grad_norm": 0.4565170705318451, + "learning_rate": 8.397625356706075e-06, + "loss": 4.8634, + "step": 18875 + }, + { + "epoch": 1.2827829868188614, + "grad_norm": 0.5034279227256775, + "learning_rate": 8.397200706617748e-06, + "loss": 4.8257, + "step": 18880 + }, + { + "epoch": 1.283122706889523, + "grad_norm": 0.47737932205200195, + "learning_rate": 8.39677605652942e-06, + "loss": 5.0453, + "step": 18885 + }, + { + "epoch": 1.2834624269601849, + "grad_norm": 0.5220845341682434, + "learning_rate": 8.396351406441093e-06, + "loss": 4.795, + "step": 18890 + }, + { + "epoch": 1.2838021470308467, + "grad_norm": 0.4469449818134308, + "learning_rate": 8.395926756352766e-06, + "loss": 4.7827, + "step": 18895 + }, + { + "epoch": 1.2841418671015083, + "grad_norm": 0.3882735073566437, + "learning_rate": 8.395502106264439e-06, + "loss": 5.0992, + "step": 18900 + }, + { + "epoch": 1.2844815871721702, + "grad_norm": 0.5734479427337646, + "learning_rate": 8.395077456176112e-06, + "loss": 4.9716, + "step": 18905 + }, + { + "epoch": 1.2848213072428318, + "grad_norm": 0.4459156394004822, + "learning_rate": 8.394652806087784e-06, + "loss": 4.986, + "step": 18910 + }, + { + "epoch": 1.2851610273134937, + "grad_norm": 0.658108651638031, + "learning_rate": 8.394228155999457e-06, + "loss": 4.849, + "step": 18915 + }, + { + "epoch": 1.2855007473841553, + "grad_norm": 0.4222866892814636, + "learning_rate": 8.39380350591113e-06, + "loss": 4.9558, + "step": 18920 + }, + { + "epoch": 1.2858404674548172, + "grad_norm": 0.45727795362472534, + "learning_rate": 8.393378855822803e-06, + "loss": 4.7975, + "step": 18925 + }, + { + "epoch": 1.286180187525479, + "grad_norm": 0.4340682923793793, + "learning_rate": 8.392954205734476e-06, + "loss": 4.9573, + "step": 18930 + }, + { + "epoch": 1.2865199075961407, + "grad_norm": 0.5915932059288025, + "learning_rate": 8.392529555646148e-06, + "loss": 5.0307, + "step": 18935 + }, + { + "epoch": 1.2868596276668025, + "grad_norm": 0.5833204388618469, + "learning_rate": 8.392104905557821e-06, + "loss": 5.0117, + "step": 18940 + }, + { + "epoch": 1.2871993477374644, + "grad_norm": 0.5559777021408081, + "learning_rate": 8.391680255469494e-06, + "loss": 5.0357, + "step": 18945 + }, + { + "epoch": 1.287539067808126, + "grad_norm": 0.43162640929222107, + "learning_rate": 8.391255605381167e-06, + "loss": 5.2463, + "step": 18950 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.43274128437042236, + "learning_rate": 8.39083095529284e-06, + "loss": 4.9282, + "step": 18955 + }, + { + "epoch": 1.2882185079494497, + "grad_norm": 0.5890533924102783, + "learning_rate": 8.390406305204512e-06, + "loss": 4.8662, + "step": 18960 + }, + { + "epoch": 1.2885582280201113, + "grad_norm": 0.5715261101722717, + "learning_rate": 8.389981655116185e-06, + "loss": 4.9572, + "step": 18965 + }, + { + "epoch": 1.2888979480907732, + "grad_norm": 0.567186176776886, + "learning_rate": 8.389557005027858e-06, + "loss": 4.9831, + "step": 18970 + }, + { + "epoch": 1.289237668161435, + "grad_norm": 0.5304973721504211, + "learning_rate": 8.38913235493953e-06, + "loss": 5.0811, + "step": 18975 + }, + { + "epoch": 1.2895773882320967, + "grad_norm": 0.5242959260940552, + "learning_rate": 8.388707704851204e-06, + "loss": 4.7001, + "step": 18980 + }, + { + "epoch": 1.2899171083027585, + "grad_norm": 0.43484756350517273, + "learning_rate": 8.388283054762876e-06, + "loss": 5.2583, + "step": 18985 + }, + { + "epoch": 1.2902568283734204, + "grad_norm": 0.5858462452888489, + "learning_rate": 8.38785840467455e-06, + "loss": 4.7969, + "step": 18990 + }, + { + "epoch": 1.290596548444082, + "grad_norm": 0.5588483810424805, + "learning_rate": 8.387433754586222e-06, + "loss": 5.2971, + "step": 18995 + }, + { + "epoch": 1.2909362685147439, + "grad_norm": 0.488151878118515, + "learning_rate": 8.387009104497895e-06, + "loss": 4.9086, + "step": 19000 + }, + { + "epoch": 1.2912759885854057, + "grad_norm": 0.5393924117088318, + "learning_rate": 8.386584454409568e-06, + "loss": 5.0038, + "step": 19005 + }, + { + "epoch": 1.2916157086560673, + "grad_norm": 0.4269374907016754, + "learning_rate": 8.38615980432124e-06, + "loss": 4.9809, + "step": 19010 + }, + { + "epoch": 1.2919554287267292, + "grad_norm": 0.5878955125808716, + "learning_rate": 8.385735154232913e-06, + "loss": 5.0207, + "step": 19015 + }, + { + "epoch": 1.292295148797391, + "grad_norm": 0.5544813275337219, + "learning_rate": 8.385310504144586e-06, + "loss": 5.0335, + "step": 19020 + }, + { + "epoch": 1.2926348688680527, + "grad_norm": 0.45408621430397034, + "learning_rate": 8.384885854056259e-06, + "loss": 5.0986, + "step": 19025 + }, + { + "epoch": 1.2929745889387145, + "grad_norm": 0.5011275410652161, + "learning_rate": 8.384461203967932e-06, + "loss": 5.063, + "step": 19030 + }, + { + "epoch": 1.2933143090093764, + "grad_norm": 0.5632134675979614, + "learning_rate": 8.384036553879604e-06, + "loss": 4.9859, + "step": 19035 + }, + { + "epoch": 1.293654029080038, + "grad_norm": 0.42047062516212463, + "learning_rate": 8.383611903791277e-06, + "loss": 4.9989, + "step": 19040 + }, + { + "epoch": 1.2939937491506999, + "grad_norm": 0.5312302708625793, + "learning_rate": 8.383187253702948e-06, + "loss": 5.1635, + "step": 19045 + }, + { + "epoch": 1.2943334692213617, + "grad_norm": 0.4682811498641968, + "learning_rate": 8.382762603614623e-06, + "loss": 4.8183, + "step": 19050 + }, + { + "epoch": 1.2946731892920234, + "grad_norm": 0.39546138048171997, + "learning_rate": 8.382337953526296e-06, + "loss": 5.0139, + "step": 19055 + }, + { + "epoch": 1.2950129093626852, + "grad_norm": 0.5691708326339722, + "learning_rate": 8.381913303437967e-06, + "loss": 5.0422, + "step": 19060 + }, + { + "epoch": 1.295352629433347, + "grad_norm": 0.49399468302726746, + "learning_rate": 8.381488653349641e-06, + "loss": 4.9767, + "step": 19065 + }, + { + "epoch": 1.2956923495040087, + "grad_norm": 0.5618157386779785, + "learning_rate": 8.381064003261314e-06, + "loss": 4.8792, + "step": 19070 + }, + { + "epoch": 1.2960320695746705, + "grad_norm": 0.689786970615387, + "learning_rate": 8.380639353172985e-06, + "loss": 5.0072, + "step": 19075 + }, + { + "epoch": 1.2963717896453322, + "grad_norm": 0.4532225728034973, + "learning_rate": 8.38021470308466e-06, + "loss": 5.0969, + "step": 19080 + }, + { + "epoch": 1.296711509715994, + "grad_norm": 0.4720099866390228, + "learning_rate": 8.379790052996332e-06, + "loss": 5.088, + "step": 19085 + }, + { + "epoch": 1.2970512297866557, + "grad_norm": 0.3762291669845581, + "learning_rate": 8.379365402908003e-06, + "loss": 4.8998, + "step": 19090 + }, + { + "epoch": 1.2973909498573175, + "grad_norm": 0.5154570937156677, + "learning_rate": 8.378940752819678e-06, + "loss": 4.9272, + "step": 19095 + }, + { + "epoch": 1.2977306699279794, + "grad_norm": 0.6425920128822327, + "learning_rate": 8.37851610273135e-06, + "loss": 5.0053, + "step": 19100 + }, + { + "epoch": 1.298070389998641, + "grad_norm": 0.5820443630218506, + "learning_rate": 8.378091452643022e-06, + "loss": 4.8928, + "step": 19105 + }, + { + "epoch": 1.2984101100693028, + "grad_norm": 0.6386792063713074, + "learning_rate": 8.377666802554696e-06, + "loss": 5.1723, + "step": 19110 + }, + { + "epoch": 1.2987498301399647, + "grad_norm": 0.583911120891571, + "learning_rate": 8.377242152466367e-06, + "loss": 5.1354, + "step": 19115 + }, + { + "epoch": 1.2990895502106263, + "grad_norm": 0.489857941865921, + "learning_rate": 8.37681750237804e-06, + "loss": 5.0774, + "step": 19120 + }, + { + "epoch": 1.2994292702812882, + "grad_norm": 0.5093833208084106, + "learning_rate": 8.376392852289715e-06, + "loss": 5.0868, + "step": 19125 + }, + { + "epoch": 1.29976899035195, + "grad_norm": 0.5258435606956482, + "learning_rate": 8.375968202201386e-06, + "loss": 4.9685, + "step": 19130 + }, + { + "epoch": 1.3001087104226117, + "grad_norm": 0.5160557627677917, + "learning_rate": 8.375543552113059e-06, + "loss": 5.0044, + "step": 19135 + }, + { + "epoch": 1.3004484304932735, + "grad_norm": 0.48454365134239197, + "learning_rate": 8.375118902024733e-06, + "loss": 4.8804, + "step": 19140 + }, + { + "epoch": 1.3007881505639354, + "grad_norm": 0.46770796179771423, + "learning_rate": 8.374694251936404e-06, + "loss": 4.8034, + "step": 19145 + }, + { + "epoch": 1.301127870634597, + "grad_norm": 0.48655417561531067, + "learning_rate": 8.374269601848077e-06, + "loss": 5.016, + "step": 19150 + }, + { + "epoch": 1.3014675907052589, + "grad_norm": 0.44074133038520813, + "learning_rate": 8.373844951759752e-06, + "loss": 4.777, + "step": 19155 + }, + { + "epoch": 1.3018073107759207, + "grad_norm": 0.5764362812042236, + "learning_rate": 8.373420301671423e-06, + "loss": 5.1315, + "step": 19160 + }, + { + "epoch": 1.3021470308465823, + "grad_norm": 0.5419714450836182, + "learning_rate": 8.372995651583095e-06, + "loss": 4.7648, + "step": 19165 + }, + { + "epoch": 1.3024867509172442, + "grad_norm": 0.4891796410083771, + "learning_rate": 8.37257100149477e-06, + "loss": 5.0551, + "step": 19170 + }, + { + "epoch": 1.302826470987906, + "grad_norm": 0.48729127645492554, + "learning_rate": 8.372146351406441e-06, + "loss": 4.742, + "step": 19175 + }, + { + "epoch": 1.3031661910585677, + "grad_norm": 0.5510108470916748, + "learning_rate": 8.371721701318114e-06, + "loss": 4.9929, + "step": 19180 + }, + { + "epoch": 1.3035059111292295, + "grad_norm": 0.48630186915397644, + "learning_rate": 8.371297051229787e-06, + "loss": 4.947, + "step": 19185 + }, + { + "epoch": 1.3038456311998914, + "grad_norm": 0.37310296297073364, + "learning_rate": 8.37087240114146e-06, + "loss": 5.0254, + "step": 19190 + }, + { + "epoch": 1.304185351270553, + "grad_norm": 0.46573933959007263, + "learning_rate": 8.370447751053134e-06, + "loss": 4.8038, + "step": 19195 + }, + { + "epoch": 1.3045250713412149, + "grad_norm": 0.5717459917068481, + "learning_rate": 8.370023100964805e-06, + "loss": 5.1381, + "step": 19200 + }, + { + "epoch": 1.3048647914118767, + "grad_norm": 0.6323161721229553, + "learning_rate": 8.369598450876478e-06, + "loss": 5.1749, + "step": 19205 + }, + { + "epoch": 1.3052045114825384, + "grad_norm": 0.46296998858451843, + "learning_rate": 8.369173800788152e-06, + "loss": 4.8443, + "step": 19210 + }, + { + "epoch": 1.3055442315532002, + "grad_norm": 0.432028204202652, + "learning_rate": 8.368749150699824e-06, + "loss": 5.0213, + "step": 19215 + }, + { + "epoch": 1.305883951623862, + "grad_norm": 0.4981033205986023, + "learning_rate": 8.368324500611496e-06, + "loss": 4.9922, + "step": 19220 + }, + { + "epoch": 1.3062236716945237, + "grad_norm": 0.4651148021221161, + "learning_rate": 8.36789985052317e-06, + "loss": 5.2486, + "step": 19225 + }, + { + "epoch": 1.3065633917651855, + "grad_norm": 0.4546286463737488, + "learning_rate": 8.367475200434842e-06, + "loss": 5.119, + "step": 19230 + }, + { + "epoch": 1.3069031118358474, + "grad_norm": 0.5409123301506042, + "learning_rate": 8.367050550346515e-06, + "loss": 4.7178, + "step": 19235 + }, + { + "epoch": 1.307242831906509, + "grad_norm": 0.42765435576438904, + "learning_rate": 8.36662590025819e-06, + "loss": 4.9353, + "step": 19240 + }, + { + "epoch": 1.3075825519771709, + "grad_norm": 0.4274669289588928, + "learning_rate": 8.36620125016986e-06, + "loss": 5.2154, + "step": 19245 + }, + { + "epoch": 1.3079222720478325, + "grad_norm": 0.4731990694999695, + "learning_rate": 8.365776600081533e-06, + "loss": 5.0188, + "step": 19250 + }, + { + "epoch": 1.3082619921184944, + "grad_norm": 0.4644988179206848, + "learning_rate": 8.365351949993208e-06, + "loss": 5.0663, + "step": 19255 + }, + { + "epoch": 1.308601712189156, + "grad_norm": 0.5805973410606384, + "learning_rate": 8.364927299904879e-06, + "loss": 5.0376, + "step": 19260 + }, + { + "epoch": 1.3089414322598178, + "grad_norm": 0.491750031709671, + "learning_rate": 8.364502649816552e-06, + "loss": 4.9209, + "step": 19265 + }, + { + "epoch": 1.3092811523304797, + "grad_norm": 0.679786205291748, + "learning_rate": 8.364077999728224e-06, + "loss": 5.2184, + "step": 19270 + }, + { + "epoch": 1.3096208724011413, + "grad_norm": 0.42651039361953735, + "learning_rate": 8.363653349639897e-06, + "loss": 4.9781, + "step": 19275 + }, + { + "epoch": 1.3099605924718032, + "grad_norm": 0.5118865370750427, + "learning_rate": 8.36322869955157e-06, + "loss": 4.8221, + "step": 19280 + }, + { + "epoch": 1.310300312542465, + "grad_norm": 0.4709728956222534, + "learning_rate": 8.362804049463243e-06, + "loss": 4.8018, + "step": 19285 + }, + { + "epoch": 1.3106400326131267, + "grad_norm": 0.4293476641178131, + "learning_rate": 8.362379399374916e-06, + "loss": 4.7277, + "step": 19290 + }, + { + "epoch": 1.3109797526837885, + "grad_norm": 0.4097418189048767, + "learning_rate": 8.361954749286588e-06, + "loss": 5.0817, + "step": 19295 + }, + { + "epoch": 1.3113194727544504, + "grad_norm": 0.5120959281921387, + "learning_rate": 8.361530099198261e-06, + "loss": 5.0308, + "step": 19300 + }, + { + "epoch": 1.311659192825112, + "grad_norm": 0.5065838694572449, + "learning_rate": 8.361105449109934e-06, + "loss": 5.0637, + "step": 19305 + }, + { + "epoch": 1.3119989128957739, + "grad_norm": 0.4622547924518585, + "learning_rate": 8.360680799021607e-06, + "loss": 4.8053, + "step": 19310 + }, + { + "epoch": 1.3123386329664357, + "grad_norm": 0.45042943954467773, + "learning_rate": 8.36025614893328e-06, + "loss": 4.855, + "step": 19315 + }, + { + "epoch": 1.3126783530370973, + "grad_norm": 0.3687123954296112, + "learning_rate": 8.359831498844952e-06, + "loss": 4.8497, + "step": 19320 + }, + { + "epoch": 1.3130180731077592, + "grad_norm": 0.5314401388168335, + "learning_rate": 8.359406848756625e-06, + "loss": 5.1265, + "step": 19325 + }, + { + "epoch": 1.313357793178421, + "grad_norm": 0.6242157816886902, + "learning_rate": 8.358982198668298e-06, + "loss": 5.1355, + "step": 19330 + }, + { + "epoch": 1.3136975132490827, + "grad_norm": 0.6123301982879639, + "learning_rate": 8.35855754857997e-06, + "loss": 4.933, + "step": 19335 + }, + { + "epoch": 1.3140372333197445, + "grad_norm": 0.45048752427101135, + "learning_rate": 8.358132898491644e-06, + "loss": 4.8653, + "step": 19340 + }, + { + "epoch": 1.3143769533904064, + "grad_norm": 0.4748658835887909, + "learning_rate": 8.357708248403316e-06, + "loss": 5.1547, + "step": 19345 + }, + { + "epoch": 1.314716673461068, + "grad_norm": 0.5229340195655823, + "learning_rate": 8.357283598314989e-06, + "loss": 5.3195, + "step": 19350 + }, + { + "epoch": 1.3150563935317299, + "grad_norm": 0.9053743481636047, + "learning_rate": 8.356858948226662e-06, + "loss": 4.7409, + "step": 19355 + }, + { + "epoch": 1.3153961136023917, + "grad_norm": 0.590802013874054, + "learning_rate": 8.356434298138335e-06, + "loss": 4.9867, + "step": 19360 + }, + { + "epoch": 1.3157358336730534, + "grad_norm": 0.4468270242214203, + "learning_rate": 8.356009648050008e-06, + "loss": 5.2835, + "step": 19365 + }, + { + "epoch": 1.3160755537437152, + "grad_norm": 0.43326622247695923, + "learning_rate": 8.35558499796168e-06, + "loss": 4.9004, + "step": 19370 + }, + { + "epoch": 1.316415273814377, + "grad_norm": 0.42298924922943115, + "learning_rate": 8.355160347873353e-06, + "loss": 5.0431, + "step": 19375 + }, + { + "epoch": 1.3167549938850387, + "grad_norm": 0.506742000579834, + "learning_rate": 8.354735697785026e-06, + "loss": 5.0089, + "step": 19380 + }, + { + "epoch": 1.3170947139557005, + "grad_norm": 0.44916701316833496, + "learning_rate": 8.354311047696699e-06, + "loss": 5.1294, + "step": 19385 + }, + { + "epoch": 1.3174344340263624, + "grad_norm": 0.6373274326324463, + "learning_rate": 8.353886397608372e-06, + "loss": 5.2051, + "step": 19390 + }, + { + "epoch": 1.317774154097024, + "grad_norm": 0.6529784202575684, + "learning_rate": 8.353461747520044e-06, + "loss": 5.0762, + "step": 19395 + }, + { + "epoch": 1.3181138741676859, + "grad_norm": 0.43924954533576965, + "learning_rate": 8.353037097431717e-06, + "loss": 4.8802, + "step": 19400 + }, + { + "epoch": 1.3184535942383477, + "grad_norm": 0.46705514192581177, + "learning_rate": 8.35261244734339e-06, + "loss": 4.9013, + "step": 19405 + }, + { + "epoch": 1.3187933143090094, + "grad_norm": 0.49319225549697876, + "learning_rate": 8.352187797255063e-06, + "loss": 5.0305, + "step": 19410 + }, + { + "epoch": 1.3191330343796712, + "grad_norm": 0.48539498448371887, + "learning_rate": 8.351763147166736e-06, + "loss": 4.8499, + "step": 19415 + }, + { + "epoch": 1.3194727544503329, + "grad_norm": 0.47164401412010193, + "learning_rate": 8.351338497078408e-06, + "loss": 5.1426, + "step": 19420 + }, + { + "epoch": 1.3198124745209947, + "grad_norm": 0.5059059858322144, + "learning_rate": 8.350913846990081e-06, + "loss": 4.8285, + "step": 19425 + }, + { + "epoch": 1.3201521945916566, + "grad_norm": 0.47930270433425903, + "learning_rate": 8.350489196901754e-06, + "loss": 4.8801, + "step": 19430 + }, + { + "epoch": 1.3204919146623182, + "grad_norm": 0.3964381515979767, + "learning_rate": 8.350064546813427e-06, + "loss": 4.9252, + "step": 19435 + }, + { + "epoch": 1.32083163473298, + "grad_norm": 0.5191445350646973, + "learning_rate": 8.3496398967251e-06, + "loss": 4.8401, + "step": 19440 + }, + { + "epoch": 1.3211713548036417, + "grad_norm": 0.5524365901947021, + "learning_rate": 8.349215246636772e-06, + "loss": 4.994, + "step": 19445 + }, + { + "epoch": 1.3215110748743035, + "grad_norm": 0.5686788558959961, + "learning_rate": 8.348790596548445e-06, + "loss": 4.8906, + "step": 19450 + }, + { + "epoch": 1.3218507949449654, + "grad_norm": 0.4943873882293701, + "learning_rate": 8.348365946460118e-06, + "loss": 5.0004, + "step": 19455 + }, + { + "epoch": 1.322190515015627, + "grad_norm": 0.5111988186836243, + "learning_rate": 8.347941296371789e-06, + "loss": 5.1897, + "step": 19460 + }, + { + "epoch": 1.3225302350862889, + "grad_norm": 0.416199266910553, + "learning_rate": 8.347516646283464e-06, + "loss": 4.7929, + "step": 19465 + }, + { + "epoch": 1.3228699551569507, + "grad_norm": 0.5701894760131836, + "learning_rate": 8.347091996195136e-06, + "loss": 4.9342, + "step": 19470 + }, + { + "epoch": 1.3232096752276123, + "grad_norm": 0.5698426365852356, + "learning_rate": 8.346667346106807e-06, + "loss": 4.9644, + "step": 19475 + }, + { + "epoch": 1.3235493952982742, + "grad_norm": 0.525658905506134, + "learning_rate": 8.346242696018482e-06, + "loss": 4.9502, + "step": 19480 + }, + { + "epoch": 1.323889115368936, + "grad_norm": 0.4563967287540436, + "learning_rate": 8.345818045930155e-06, + "loss": 5.0871, + "step": 19485 + }, + { + "epoch": 1.3242288354395977, + "grad_norm": 0.4426518976688385, + "learning_rate": 8.345393395841826e-06, + "loss": 4.6903, + "step": 19490 + }, + { + "epoch": 1.3245685555102595, + "grad_norm": 0.6469057202339172, + "learning_rate": 8.3449687457535e-06, + "loss": 4.9945, + "step": 19495 + }, + { + "epoch": 1.3249082755809214, + "grad_norm": 0.397634357213974, + "learning_rate": 8.344544095665173e-06, + "loss": 4.9545, + "step": 19500 + }, + { + "epoch": 1.325247995651583, + "grad_norm": 0.656531810760498, + "learning_rate": 8.344119445576844e-06, + "loss": 5.0, + "step": 19505 + }, + { + "epoch": 1.3255877157222449, + "grad_norm": 0.7402647137641907, + "learning_rate": 8.343694795488519e-06, + "loss": 5.2285, + "step": 19510 + }, + { + "epoch": 1.3259274357929067, + "grad_norm": 0.4506836533546448, + "learning_rate": 8.343270145400192e-06, + "loss": 5.1219, + "step": 19515 + }, + { + "epoch": 1.3262671558635684, + "grad_norm": 0.5392155647277832, + "learning_rate": 8.342845495311863e-06, + "loss": 4.7921, + "step": 19520 + }, + { + "epoch": 1.3266068759342302, + "grad_norm": 0.45754528045654297, + "learning_rate": 8.342420845223537e-06, + "loss": 5.0939, + "step": 19525 + }, + { + "epoch": 1.326946596004892, + "grad_norm": 0.5549595952033997, + "learning_rate": 8.341996195135208e-06, + "loss": 4.8156, + "step": 19530 + }, + { + "epoch": 1.3272863160755537, + "grad_norm": 0.4842107594013214, + "learning_rate": 8.341571545046883e-06, + "loss": 5.0863, + "step": 19535 + }, + { + "epoch": 1.3276260361462155, + "grad_norm": 0.5989494919776917, + "learning_rate": 8.341146894958556e-06, + "loss": 4.9399, + "step": 19540 + }, + { + "epoch": 1.3279657562168774, + "grad_norm": 0.43126603960990906, + "learning_rate": 8.340722244870227e-06, + "loss": 4.7855, + "step": 19545 + }, + { + "epoch": 1.328305476287539, + "grad_norm": 0.4598788022994995, + "learning_rate": 8.340297594781901e-06, + "loss": 4.9944, + "step": 19550 + }, + { + "epoch": 1.3286451963582009, + "grad_norm": 0.6808159947395325, + "learning_rate": 8.339872944693574e-06, + "loss": 5.054, + "step": 19555 + }, + { + "epoch": 1.3289849164288627, + "grad_norm": 0.4627915322780609, + "learning_rate": 8.339448294605245e-06, + "loss": 4.944, + "step": 19560 + }, + { + "epoch": 1.3293246364995244, + "grad_norm": 0.5145645141601562, + "learning_rate": 8.33902364451692e-06, + "loss": 4.9483, + "step": 19565 + }, + { + "epoch": 1.3296643565701862, + "grad_norm": 0.5253284573554993, + "learning_rate": 8.338598994428592e-06, + "loss": 5.0388, + "step": 19570 + }, + { + "epoch": 1.330004076640848, + "grad_norm": 0.6288500428199768, + "learning_rate": 8.338174344340263e-06, + "loss": 4.9155, + "step": 19575 + }, + { + "epoch": 1.3303437967115097, + "grad_norm": 0.4043360650539398, + "learning_rate": 8.337749694251938e-06, + "loss": 5.0641, + "step": 19580 + }, + { + "epoch": 1.3306835167821716, + "grad_norm": 0.5088081955909729, + "learning_rate": 8.33732504416361e-06, + "loss": 5.0955, + "step": 19585 + }, + { + "epoch": 1.3310232368528332, + "grad_norm": 0.4504567086696625, + "learning_rate": 8.336900394075282e-06, + "loss": 4.8045, + "step": 19590 + }, + { + "epoch": 1.331362956923495, + "grad_norm": 0.4719714820384979, + "learning_rate": 8.336475743986956e-06, + "loss": 5.1822, + "step": 19595 + }, + { + "epoch": 1.331702676994157, + "grad_norm": 0.583567202091217, + "learning_rate": 8.33605109389863e-06, + "loss": 5.1444, + "step": 19600 + }, + { + "epoch": 1.3320423970648185, + "grad_norm": 0.5244925022125244, + "learning_rate": 8.3356264438103e-06, + "loss": 5.0697, + "step": 19605 + }, + { + "epoch": 1.3323821171354804, + "grad_norm": 0.5979457497596741, + "learning_rate": 8.335201793721975e-06, + "loss": 4.6862, + "step": 19610 + }, + { + "epoch": 1.332721837206142, + "grad_norm": 0.6037037372589111, + "learning_rate": 8.334777143633646e-06, + "loss": 4.9977, + "step": 19615 + }, + { + "epoch": 1.3330615572768039, + "grad_norm": 0.5485708117485046, + "learning_rate": 8.334352493545319e-06, + "loss": 5.1501, + "step": 19620 + }, + { + "epoch": 1.3334012773474657, + "grad_norm": 0.4829530119895935, + "learning_rate": 8.333927843456993e-06, + "loss": 4.9008, + "step": 19625 + }, + { + "epoch": 1.3337409974181273, + "grad_norm": 0.49090996384620667, + "learning_rate": 8.333503193368664e-06, + "loss": 5.2352, + "step": 19630 + }, + { + "epoch": 1.3340807174887892, + "grad_norm": 0.46510374546051025, + "learning_rate": 8.333078543280337e-06, + "loss": 4.9664, + "step": 19635 + }, + { + "epoch": 1.334420437559451, + "grad_norm": 0.5287171006202698, + "learning_rate": 8.332653893192012e-06, + "loss": 4.8551, + "step": 19640 + }, + { + "epoch": 1.3347601576301127, + "grad_norm": 0.49780890345573425, + "learning_rate": 8.332229243103683e-06, + "loss": 5.045, + "step": 19645 + }, + { + "epoch": 1.3350998777007745, + "grad_norm": 0.567924976348877, + "learning_rate": 8.331804593015355e-06, + "loss": 4.5409, + "step": 19650 + }, + { + "epoch": 1.3354395977714364, + "grad_norm": 0.39018383622169495, + "learning_rate": 8.33137994292703e-06, + "loss": 4.7098, + "step": 19655 + }, + { + "epoch": 1.335779317842098, + "grad_norm": 0.4471184015274048, + "learning_rate": 8.330955292838701e-06, + "loss": 4.9668, + "step": 19660 + }, + { + "epoch": 1.3361190379127599, + "grad_norm": 0.5062467455863953, + "learning_rate": 8.330530642750374e-06, + "loss": 5.0622, + "step": 19665 + }, + { + "epoch": 1.3364587579834217, + "grad_norm": 0.41651421785354614, + "learning_rate": 8.330105992662048e-06, + "loss": 5.0219, + "step": 19670 + }, + { + "epoch": 1.3367984780540834, + "grad_norm": 0.4765626788139343, + "learning_rate": 8.32968134257372e-06, + "loss": 5.3033, + "step": 19675 + }, + { + "epoch": 1.3371381981247452, + "grad_norm": 0.5989362001419067, + "learning_rate": 8.329256692485392e-06, + "loss": 5.1415, + "step": 19680 + }, + { + "epoch": 1.337477918195407, + "grad_norm": 0.5437412261962891, + "learning_rate": 8.328832042397065e-06, + "loss": 5.021, + "step": 19685 + }, + { + "epoch": 1.3378176382660687, + "grad_norm": 0.5203415155410767, + "learning_rate": 8.328407392308738e-06, + "loss": 4.8761, + "step": 19690 + }, + { + "epoch": 1.3381573583367306, + "grad_norm": 0.4467916488647461, + "learning_rate": 8.32798274222041e-06, + "loss": 4.9483, + "step": 19695 + }, + { + "epoch": 1.3384970784073924, + "grad_norm": 0.5136055946350098, + "learning_rate": 8.327558092132083e-06, + "loss": 4.8236, + "step": 19700 + }, + { + "epoch": 1.338836798478054, + "grad_norm": 0.42920389771461487, + "learning_rate": 8.327133442043756e-06, + "loss": 4.947, + "step": 19705 + }, + { + "epoch": 1.3391765185487159, + "grad_norm": 0.5447189211845398, + "learning_rate": 8.326708791955429e-06, + "loss": 4.9315, + "step": 19710 + }, + { + "epoch": 1.3395162386193777, + "grad_norm": 0.4137323200702667, + "learning_rate": 8.326284141867102e-06, + "loss": 4.7212, + "step": 19715 + }, + { + "epoch": 1.3398559586900394, + "grad_norm": 0.5748496651649475, + "learning_rate": 8.325859491778775e-06, + "loss": 5.0985, + "step": 19720 + }, + { + "epoch": 1.3401956787607012, + "grad_norm": 0.46367672085762024, + "learning_rate": 8.325434841690447e-06, + "loss": 4.8411, + "step": 19725 + }, + { + "epoch": 1.340535398831363, + "grad_norm": 0.42698854207992554, + "learning_rate": 8.32501019160212e-06, + "loss": 5.0405, + "step": 19730 + }, + { + "epoch": 1.3408751189020247, + "grad_norm": 0.5243262052536011, + "learning_rate": 8.324585541513793e-06, + "loss": 4.9011, + "step": 19735 + }, + { + "epoch": 1.3412148389726866, + "grad_norm": 0.46912696957588196, + "learning_rate": 8.324160891425466e-06, + "loss": 5.0171, + "step": 19740 + }, + { + "epoch": 1.3415545590433484, + "grad_norm": 0.4915119707584381, + "learning_rate": 8.323736241337139e-06, + "loss": 4.8506, + "step": 19745 + }, + { + "epoch": 1.34189427911401, + "grad_norm": 0.5434019565582275, + "learning_rate": 8.323311591248811e-06, + "loss": 4.8285, + "step": 19750 + }, + { + "epoch": 1.342233999184672, + "grad_norm": 0.4056124687194824, + "learning_rate": 8.322886941160484e-06, + "loss": 5.0062, + "step": 19755 + }, + { + "epoch": 1.3425737192553335, + "grad_norm": 0.5709623694419861, + "learning_rate": 8.322462291072157e-06, + "loss": 4.9178, + "step": 19760 + }, + { + "epoch": 1.3429134393259954, + "grad_norm": 0.49156707525253296, + "learning_rate": 8.32203764098383e-06, + "loss": 5.0073, + "step": 19765 + }, + { + "epoch": 1.3432531593966572, + "grad_norm": 0.6773743629455566, + "learning_rate": 8.321612990895503e-06, + "loss": 4.9113, + "step": 19770 + }, + { + "epoch": 1.3435928794673189, + "grad_norm": 0.7104859352111816, + "learning_rate": 8.321188340807175e-06, + "loss": 4.7904, + "step": 19775 + }, + { + "epoch": 1.3439325995379807, + "grad_norm": 0.4358169734477997, + "learning_rate": 8.320763690718848e-06, + "loss": 5.003, + "step": 19780 + }, + { + "epoch": 1.3442723196086424, + "grad_norm": 0.5795621275901794, + "learning_rate": 8.320339040630521e-06, + "loss": 4.8528, + "step": 19785 + }, + { + "epoch": 1.3446120396793042, + "grad_norm": 0.5227441787719727, + "learning_rate": 8.319914390542194e-06, + "loss": 4.8628, + "step": 19790 + }, + { + "epoch": 1.344951759749966, + "grad_norm": 0.6457685232162476, + "learning_rate": 8.319489740453867e-06, + "loss": 5.2171, + "step": 19795 + }, + { + "epoch": 1.3452914798206277, + "grad_norm": 0.6092387437820435, + "learning_rate": 8.31906509036554e-06, + "loss": 5.0901, + "step": 19800 + }, + { + "epoch": 1.3456311998912895, + "grad_norm": 0.4664703905582428, + "learning_rate": 8.318640440277212e-06, + "loss": 4.8277, + "step": 19805 + }, + { + "epoch": 1.3459709199619514, + "grad_norm": 0.4049430787563324, + "learning_rate": 8.318215790188885e-06, + "loss": 4.6366, + "step": 19810 + }, + { + "epoch": 1.346310640032613, + "grad_norm": 0.5194348096847534, + "learning_rate": 8.317791140100558e-06, + "loss": 5.044, + "step": 19815 + }, + { + "epoch": 1.3466503601032749, + "grad_norm": 0.443501740694046, + "learning_rate": 8.31736649001223e-06, + "loss": 4.9649, + "step": 19820 + }, + { + "epoch": 1.3469900801739367, + "grad_norm": 0.45190635323524475, + "learning_rate": 8.316941839923904e-06, + "loss": 4.8764, + "step": 19825 + }, + { + "epoch": 1.3473298002445984, + "grad_norm": 0.3774018883705139, + "learning_rate": 8.316517189835576e-06, + "loss": 4.8843, + "step": 19830 + }, + { + "epoch": 1.3476695203152602, + "grad_norm": 0.45184847712516785, + "learning_rate": 8.316092539747249e-06, + "loss": 4.9745, + "step": 19835 + }, + { + "epoch": 1.348009240385922, + "grad_norm": 0.5042328238487244, + "learning_rate": 8.315667889658922e-06, + "loss": 5.0875, + "step": 19840 + }, + { + "epoch": 1.3483489604565837, + "grad_norm": 0.46115103363990784, + "learning_rate": 8.315243239570595e-06, + "loss": 4.6577, + "step": 19845 + }, + { + "epoch": 1.3486886805272456, + "grad_norm": 0.6202085614204407, + "learning_rate": 8.314818589482268e-06, + "loss": 5.1816, + "step": 19850 + }, + { + "epoch": 1.3490284005979074, + "grad_norm": 0.4347580373287201, + "learning_rate": 8.31439393939394e-06, + "loss": 4.9326, + "step": 19855 + }, + { + "epoch": 1.349368120668569, + "grad_norm": 0.41656967997550964, + "learning_rate": 8.313969289305613e-06, + "loss": 4.8808, + "step": 19860 + }, + { + "epoch": 1.349707840739231, + "grad_norm": 0.6461437344551086, + "learning_rate": 8.313544639217286e-06, + "loss": 5.2154, + "step": 19865 + }, + { + "epoch": 1.3500475608098927, + "grad_norm": 0.4531140625476837, + "learning_rate": 8.313119989128959e-06, + "loss": 5.0501, + "step": 19870 + }, + { + "epoch": 1.3503872808805544, + "grad_norm": 0.6087677478790283, + "learning_rate": 8.312695339040632e-06, + "loss": 5.2904, + "step": 19875 + }, + { + "epoch": 1.3507270009512162, + "grad_norm": 0.47238579392433167, + "learning_rate": 8.312270688952304e-06, + "loss": 4.9651, + "step": 19880 + }, + { + "epoch": 1.351066721021878, + "grad_norm": 0.4521082639694214, + "learning_rate": 8.311846038863977e-06, + "loss": 5.1023, + "step": 19885 + }, + { + "epoch": 1.3514064410925397, + "grad_norm": 0.5287049412727356, + "learning_rate": 8.31142138877565e-06, + "loss": 4.9969, + "step": 19890 + }, + { + "epoch": 1.3517461611632016, + "grad_norm": 0.46470627188682556, + "learning_rate": 8.310996738687323e-06, + "loss": 4.9121, + "step": 19895 + }, + { + "epoch": 1.3520858812338634, + "grad_norm": 0.5043793320655823, + "learning_rate": 8.310572088598996e-06, + "loss": 4.7275, + "step": 19900 + }, + { + "epoch": 1.352425601304525, + "grad_norm": 0.4963722229003906, + "learning_rate": 8.310147438510668e-06, + "loss": 4.7084, + "step": 19905 + }, + { + "epoch": 1.352765321375187, + "grad_norm": 0.5152466297149658, + "learning_rate": 8.309722788422341e-06, + "loss": 4.932, + "step": 19910 + }, + { + "epoch": 1.3531050414458488, + "grad_norm": 0.39120346307754517, + "learning_rate": 8.309298138334014e-06, + "loss": 4.7774, + "step": 19915 + }, + { + "epoch": 1.3534447615165104, + "grad_norm": 0.4685819149017334, + "learning_rate": 8.308873488245687e-06, + "loss": 4.8056, + "step": 19920 + }, + { + "epoch": 1.3537844815871722, + "grad_norm": 0.4306707978248596, + "learning_rate": 8.30844883815736e-06, + "loss": 4.6208, + "step": 19925 + }, + { + "epoch": 1.3541242016578339, + "grad_norm": 0.4847170114517212, + "learning_rate": 8.308024188069032e-06, + "loss": 4.9441, + "step": 19930 + }, + { + "epoch": 1.3544639217284957, + "grad_norm": 0.47284042835235596, + "learning_rate": 8.307599537980705e-06, + "loss": 4.8885, + "step": 19935 + }, + { + "epoch": 1.3548036417991576, + "grad_norm": 0.5225037336349487, + "learning_rate": 8.307174887892378e-06, + "loss": 5.0444, + "step": 19940 + }, + { + "epoch": 1.3551433618698192, + "grad_norm": 0.4258582293987274, + "learning_rate": 8.30675023780405e-06, + "loss": 4.8641, + "step": 19945 + }, + { + "epoch": 1.355483081940481, + "grad_norm": 0.37125349044799805, + "learning_rate": 8.306325587715724e-06, + "loss": 5.1088, + "step": 19950 + }, + { + "epoch": 1.3558228020111427, + "grad_norm": 0.46825337409973145, + "learning_rate": 8.305900937627396e-06, + "loss": 5.016, + "step": 19955 + }, + { + "epoch": 1.3561625220818045, + "grad_norm": 0.4484724998474121, + "learning_rate": 8.305476287539067e-06, + "loss": 5.0242, + "step": 19960 + }, + { + "epoch": 1.3565022421524664, + "grad_norm": 0.536715030670166, + "learning_rate": 8.305051637450742e-06, + "loss": 5.1035, + "step": 19965 + }, + { + "epoch": 1.356841962223128, + "grad_norm": 0.47332948446273804, + "learning_rate": 8.304626987362415e-06, + "loss": 4.9693, + "step": 19970 + }, + { + "epoch": 1.3571816822937899, + "grad_norm": 0.4487746059894562, + "learning_rate": 8.304202337274086e-06, + "loss": 4.8198, + "step": 19975 + }, + { + "epoch": 1.3575214023644517, + "grad_norm": 0.6447779536247253, + "learning_rate": 8.30377768718576e-06, + "loss": 4.8159, + "step": 19980 + }, + { + "epoch": 1.3578611224351134, + "grad_norm": 0.4600960314273834, + "learning_rate": 8.303353037097433e-06, + "loss": 4.8747, + "step": 19985 + }, + { + "epoch": 1.3582008425057752, + "grad_norm": 0.4144807755947113, + "learning_rate": 8.302928387009104e-06, + "loss": 4.7594, + "step": 19990 + }, + { + "epoch": 1.358540562576437, + "grad_norm": 0.5162563323974609, + "learning_rate": 8.302503736920779e-06, + "loss": 4.9158, + "step": 19995 + }, + { + "epoch": 1.3588802826470987, + "grad_norm": 0.4399194121360779, + "learning_rate": 8.302079086832452e-06, + "loss": 5.0367, + "step": 20000 + }, + { + "epoch": 1.3592200027177606, + "grad_norm": 0.4757802486419678, + "learning_rate": 8.301654436744123e-06, + "loss": 5.1352, + "step": 20005 + }, + { + "epoch": 1.3595597227884224, + "grad_norm": 0.47348105907440186, + "learning_rate": 8.301229786655797e-06, + "loss": 5.196, + "step": 20010 + }, + { + "epoch": 1.359899442859084, + "grad_norm": 0.4191504418849945, + "learning_rate": 8.30080513656747e-06, + "loss": 4.9677, + "step": 20015 + }, + { + "epoch": 1.360239162929746, + "grad_norm": 0.42379915714263916, + "learning_rate": 8.300380486479141e-06, + "loss": 4.8961, + "step": 20020 + }, + { + "epoch": 1.3605788830004077, + "grad_norm": 0.5573774576187134, + "learning_rate": 8.299955836390816e-06, + "loss": 4.9114, + "step": 20025 + }, + { + "epoch": 1.3609186030710694, + "grad_norm": 0.43667665123939514, + "learning_rate": 8.299531186302487e-06, + "loss": 4.9888, + "step": 20030 + }, + { + "epoch": 1.3612583231417312, + "grad_norm": 0.41860225796699524, + "learning_rate": 8.29910653621416e-06, + "loss": 5.0947, + "step": 20035 + }, + { + "epoch": 1.361598043212393, + "grad_norm": 0.5250735282897949, + "learning_rate": 8.298681886125834e-06, + "loss": 5.0403, + "step": 20040 + }, + { + "epoch": 1.3619377632830547, + "grad_norm": 0.4846242070198059, + "learning_rate": 8.298257236037505e-06, + "loss": 4.8644, + "step": 20045 + }, + { + "epoch": 1.3622774833537166, + "grad_norm": 0.4870905876159668, + "learning_rate": 8.297832585949178e-06, + "loss": 5.143, + "step": 20050 + }, + { + "epoch": 1.3626172034243784, + "grad_norm": 0.5532228946685791, + "learning_rate": 8.297407935860852e-06, + "loss": 5.0596, + "step": 20055 + }, + { + "epoch": 1.36295692349504, + "grad_norm": 0.4726620018482208, + "learning_rate": 8.296983285772523e-06, + "loss": 5.0457, + "step": 20060 + }, + { + "epoch": 1.363296643565702, + "grad_norm": 0.4379010796546936, + "learning_rate": 8.296558635684196e-06, + "loss": 4.7228, + "step": 20065 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.4592396318912506, + "learning_rate": 8.29613398559587e-06, + "loss": 4.9764, + "step": 20070 + }, + { + "epoch": 1.3639760837070254, + "grad_norm": 0.5473035573959351, + "learning_rate": 8.295709335507542e-06, + "loss": 4.897, + "step": 20075 + }, + { + "epoch": 1.3643158037776872, + "grad_norm": 0.48659634590148926, + "learning_rate": 8.295284685419215e-06, + "loss": 4.8406, + "step": 20080 + }, + { + "epoch": 1.364655523848349, + "grad_norm": 0.48590293526649475, + "learning_rate": 8.294860035330889e-06, + "loss": 4.9283, + "step": 20085 + }, + { + "epoch": 1.3649952439190107, + "grad_norm": 0.5067980289459229, + "learning_rate": 8.29443538524256e-06, + "loss": 5.0815, + "step": 20090 + }, + { + "epoch": 1.3653349639896726, + "grad_norm": 0.45446571707725525, + "learning_rate": 8.294010735154233e-06, + "loss": 4.8168, + "step": 20095 + }, + { + "epoch": 1.3656746840603342, + "grad_norm": 0.48877865076065063, + "learning_rate": 8.293586085065906e-06, + "loss": 4.8456, + "step": 20100 + }, + { + "epoch": 1.366014404130996, + "grad_norm": 0.4040021598339081, + "learning_rate": 8.293161434977579e-06, + "loss": 4.9672, + "step": 20105 + }, + { + "epoch": 1.366354124201658, + "grad_norm": 0.515235424041748, + "learning_rate": 8.292736784889251e-06, + "loss": 4.8627, + "step": 20110 + }, + { + "epoch": 1.3666938442723195, + "grad_norm": 0.610463559627533, + "learning_rate": 8.292312134800924e-06, + "loss": 5.0512, + "step": 20115 + }, + { + "epoch": 1.3670335643429814, + "grad_norm": 0.520429253578186, + "learning_rate": 8.291887484712597e-06, + "loss": 4.4422, + "step": 20120 + }, + { + "epoch": 1.367373284413643, + "grad_norm": 0.48954641819000244, + "learning_rate": 8.29146283462427e-06, + "loss": 4.8347, + "step": 20125 + }, + { + "epoch": 1.3677130044843049, + "grad_norm": 0.5605508685112, + "learning_rate": 8.291038184535943e-06, + "loss": 4.8482, + "step": 20130 + }, + { + "epoch": 1.3680527245549667, + "grad_norm": 0.5035854578018188, + "learning_rate": 8.290613534447615e-06, + "loss": 5.0316, + "step": 20135 + }, + { + "epoch": 1.3683924446256284, + "grad_norm": 0.4764573872089386, + "learning_rate": 8.290188884359288e-06, + "loss": 4.541, + "step": 20140 + }, + { + "epoch": 1.3687321646962902, + "grad_norm": 0.5296124219894409, + "learning_rate": 8.289764234270961e-06, + "loss": 4.967, + "step": 20145 + }, + { + "epoch": 1.369071884766952, + "grad_norm": 0.5771543383598328, + "learning_rate": 8.289339584182634e-06, + "loss": 5.0126, + "step": 20150 + }, + { + "epoch": 1.3694116048376137, + "grad_norm": 0.5314679741859436, + "learning_rate": 8.288914934094307e-06, + "loss": 5.0261, + "step": 20155 + }, + { + "epoch": 1.3697513249082756, + "grad_norm": 0.5047651529312134, + "learning_rate": 8.28849028400598e-06, + "loss": 4.9298, + "step": 20160 + }, + { + "epoch": 1.3700910449789374, + "grad_norm": 0.408841609954834, + "learning_rate": 8.288065633917652e-06, + "loss": 4.5933, + "step": 20165 + }, + { + "epoch": 1.370430765049599, + "grad_norm": 0.5046980381011963, + "learning_rate": 8.287640983829325e-06, + "loss": 4.8235, + "step": 20170 + }, + { + "epoch": 1.370770485120261, + "grad_norm": 0.4598609209060669, + "learning_rate": 8.287216333740998e-06, + "loss": 4.814, + "step": 20175 + }, + { + "epoch": 1.3711102051909227, + "grad_norm": 0.44150403141975403, + "learning_rate": 8.28679168365267e-06, + "loss": 4.9127, + "step": 20180 + }, + { + "epoch": 1.3714499252615844, + "grad_norm": 0.42828628420829773, + "learning_rate": 8.286367033564343e-06, + "loss": 4.7229, + "step": 20185 + }, + { + "epoch": 1.3717896453322462, + "grad_norm": 0.5110692381858826, + "learning_rate": 8.285942383476016e-06, + "loss": 4.9661, + "step": 20190 + }, + { + "epoch": 1.372129365402908, + "grad_norm": 0.5758607983589172, + "learning_rate": 8.285517733387689e-06, + "loss": 4.8108, + "step": 20195 + }, + { + "epoch": 1.3724690854735697, + "grad_norm": 0.565619170665741, + "learning_rate": 8.285093083299362e-06, + "loss": 4.786, + "step": 20200 + }, + { + "epoch": 1.3728088055442316, + "grad_norm": 0.3898303806781769, + "learning_rate": 8.284668433211035e-06, + "loss": 4.6024, + "step": 20205 + }, + { + "epoch": 1.3731485256148934, + "grad_norm": 0.5015624165534973, + "learning_rate": 8.284243783122707e-06, + "loss": 4.8145, + "step": 20210 + }, + { + "epoch": 1.373488245685555, + "grad_norm": 0.45781198143959045, + "learning_rate": 8.28381913303438e-06, + "loss": 4.6859, + "step": 20215 + }, + { + "epoch": 1.373827965756217, + "grad_norm": 0.5421363711357117, + "learning_rate": 8.283394482946053e-06, + "loss": 4.612, + "step": 20220 + }, + { + "epoch": 1.3741676858268788, + "grad_norm": 0.4595852494239807, + "learning_rate": 8.282969832857726e-06, + "loss": 4.9128, + "step": 20225 + }, + { + "epoch": 1.3745074058975404, + "grad_norm": 0.36586642265319824, + "learning_rate": 8.282545182769399e-06, + "loss": 4.4689, + "step": 20230 + }, + { + "epoch": 1.3748471259682022, + "grad_norm": 0.5402349829673767, + "learning_rate": 8.282120532681071e-06, + "loss": 4.937, + "step": 20235 + }, + { + "epoch": 1.375186846038864, + "grad_norm": 0.3985472619533539, + "learning_rate": 8.281695882592744e-06, + "loss": 4.7717, + "step": 20240 + }, + { + "epoch": 1.3755265661095257, + "grad_norm": 0.5140781998634338, + "learning_rate": 8.281271232504417e-06, + "loss": 4.7039, + "step": 20245 + }, + { + "epoch": 1.3758662861801876, + "grad_norm": 0.6551767587661743, + "learning_rate": 8.28084658241609e-06, + "loss": 4.7905, + "step": 20250 + }, + { + "epoch": 1.3762060062508494, + "grad_norm": 0.4985913038253784, + "learning_rate": 8.280421932327763e-06, + "loss": 4.9138, + "step": 20255 + }, + { + "epoch": 1.376545726321511, + "grad_norm": 0.512902557849884, + "learning_rate": 8.279997282239435e-06, + "loss": 4.8288, + "step": 20260 + }, + { + "epoch": 1.376885446392173, + "grad_norm": 0.7253643870353699, + "learning_rate": 8.279572632151108e-06, + "loss": 4.874, + "step": 20265 + }, + { + "epoch": 1.3772251664628345, + "grad_norm": 0.4982483983039856, + "learning_rate": 8.279147982062781e-06, + "loss": 5.1588, + "step": 20270 + }, + { + "epoch": 1.3775648865334964, + "grad_norm": 0.6416604518890381, + "learning_rate": 8.278723331974454e-06, + "loss": 4.9274, + "step": 20275 + }, + { + "epoch": 1.3779046066041583, + "grad_norm": 0.48157259821891785, + "learning_rate": 8.278298681886127e-06, + "loss": 4.9668, + "step": 20280 + }, + { + "epoch": 1.3782443266748199, + "grad_norm": 0.5599878430366516, + "learning_rate": 8.2778740317978e-06, + "loss": 4.8109, + "step": 20285 + }, + { + "epoch": 1.3785840467454817, + "grad_norm": 0.41240260004997253, + "learning_rate": 8.277449381709472e-06, + "loss": 4.9456, + "step": 20290 + }, + { + "epoch": 1.3789237668161434, + "grad_norm": 0.42778971791267395, + "learning_rate": 8.277024731621145e-06, + "loss": 4.944, + "step": 20295 + }, + { + "epoch": 1.3792634868868052, + "grad_norm": 0.40302470326423645, + "learning_rate": 8.276600081532818e-06, + "loss": 4.8994, + "step": 20300 + }, + { + "epoch": 1.379603206957467, + "grad_norm": 0.6072494387626648, + "learning_rate": 8.27617543144449e-06, + "loss": 4.6254, + "step": 20305 + }, + { + "epoch": 1.3799429270281287, + "grad_norm": 0.5106683373451233, + "learning_rate": 8.275750781356163e-06, + "loss": 4.821, + "step": 20310 + }, + { + "epoch": 1.3802826470987906, + "grad_norm": 0.42904967069625854, + "learning_rate": 8.275326131267836e-06, + "loss": 4.9554, + "step": 20315 + }, + { + "epoch": 1.3806223671694524, + "grad_norm": 0.6152747869491577, + "learning_rate": 8.274901481179509e-06, + "loss": 5.0318, + "step": 20320 + }, + { + "epoch": 1.380962087240114, + "grad_norm": 0.6476578712463379, + "learning_rate": 8.274476831091182e-06, + "loss": 4.8774, + "step": 20325 + }, + { + "epoch": 1.381301807310776, + "grad_norm": 0.40265899896621704, + "learning_rate": 8.274052181002855e-06, + "loss": 4.9751, + "step": 20330 + }, + { + "epoch": 1.3816415273814378, + "grad_norm": 0.45681771636009216, + "learning_rate": 8.273627530914527e-06, + "loss": 4.8118, + "step": 20335 + }, + { + "epoch": 1.3819812474520994, + "grad_norm": 0.48918575048446655, + "learning_rate": 8.2732028808262e-06, + "loss": 4.8153, + "step": 20340 + }, + { + "epoch": 1.3823209675227612, + "grad_norm": 0.43809714913368225, + "learning_rate": 8.272778230737873e-06, + "loss": 4.7331, + "step": 20345 + }, + { + "epoch": 1.382660687593423, + "grad_norm": 0.41430166363716125, + "learning_rate": 8.272353580649546e-06, + "loss": 4.9128, + "step": 20350 + }, + { + "epoch": 1.3830004076640847, + "grad_norm": 0.43277424573898315, + "learning_rate": 8.271928930561219e-06, + "loss": 5.1352, + "step": 20355 + }, + { + "epoch": 1.3833401277347466, + "grad_norm": 0.5082614421844482, + "learning_rate": 8.271504280472891e-06, + "loss": 4.5748, + "step": 20360 + }, + { + "epoch": 1.3836798478054084, + "grad_norm": 0.4269615709781647, + "learning_rate": 8.271079630384564e-06, + "loss": 4.9275, + "step": 20365 + }, + { + "epoch": 1.38401956787607, + "grad_norm": 0.5168194770812988, + "learning_rate": 8.270654980296237e-06, + "loss": 4.8753, + "step": 20370 + }, + { + "epoch": 1.384359287946732, + "grad_norm": 0.45852094888687134, + "learning_rate": 8.270230330207908e-06, + "loss": 4.9027, + "step": 20375 + }, + { + "epoch": 1.3846990080173938, + "grad_norm": 0.4258691072463989, + "learning_rate": 8.269805680119583e-06, + "loss": 4.9884, + "step": 20380 + }, + { + "epoch": 1.3850387280880554, + "grad_norm": 0.4949409067630768, + "learning_rate": 8.269381030031255e-06, + "loss": 5.0281, + "step": 20385 + }, + { + "epoch": 1.3853784481587172, + "grad_norm": 0.4441811442375183, + "learning_rate": 8.268956379942927e-06, + "loss": 4.7972, + "step": 20390 + }, + { + "epoch": 1.385718168229379, + "grad_norm": 0.5086604356765747, + "learning_rate": 8.268531729854601e-06, + "loss": 5.0689, + "step": 20395 + }, + { + "epoch": 1.3860578883000407, + "grad_norm": 0.45611611008644104, + "learning_rate": 8.268107079766274e-06, + "loss": 4.9071, + "step": 20400 + }, + { + "epoch": 1.3863976083707026, + "grad_norm": 0.618400514125824, + "learning_rate": 8.267682429677945e-06, + "loss": 4.9422, + "step": 20405 + }, + { + "epoch": 1.3867373284413644, + "grad_norm": 0.41105180978775024, + "learning_rate": 8.26725777958962e-06, + "loss": 4.8101, + "step": 20410 + }, + { + "epoch": 1.387077048512026, + "grad_norm": 0.5682533979415894, + "learning_rate": 8.266833129501292e-06, + "loss": 4.8252, + "step": 20415 + }, + { + "epoch": 1.387416768582688, + "grad_norm": 0.36270672082901, + "learning_rate": 8.266408479412963e-06, + "loss": 5.1787, + "step": 20420 + }, + { + "epoch": 1.3877564886533498, + "grad_norm": 0.42694830894470215, + "learning_rate": 8.265983829324638e-06, + "loss": 4.8422, + "step": 20425 + }, + { + "epoch": 1.3880962087240114, + "grad_norm": 0.36671343445777893, + "learning_rate": 8.26555917923631e-06, + "loss": 4.832, + "step": 20430 + }, + { + "epoch": 1.3884359287946733, + "grad_norm": 0.4527735114097595, + "learning_rate": 8.265134529147982e-06, + "loss": 5.0462, + "step": 20435 + }, + { + "epoch": 1.3887756488653349, + "grad_norm": 0.5035310983657837, + "learning_rate": 8.264709879059656e-06, + "loss": 4.887, + "step": 20440 + }, + { + "epoch": 1.3891153689359967, + "grad_norm": 0.4404597282409668, + "learning_rate": 8.264285228971327e-06, + "loss": 5.0731, + "step": 20445 + }, + { + "epoch": 1.3894550890066586, + "grad_norm": 0.623361349105835, + "learning_rate": 8.263860578883e-06, + "loss": 5.0426, + "step": 20450 + }, + { + "epoch": 1.3897948090773202, + "grad_norm": 0.5033490657806396, + "learning_rate": 8.263520858812339e-06, + "loss": 5.1392, + "step": 20455 + }, + { + "epoch": 1.390134529147982, + "grad_norm": 0.42238831520080566, + "learning_rate": 8.263096208724012e-06, + "loss": 4.9762, + "step": 20460 + }, + { + "epoch": 1.3904742492186437, + "grad_norm": 0.40900057554244995, + "learning_rate": 8.262671558635685e-06, + "loss": 4.9236, + "step": 20465 + }, + { + "epoch": 1.3908139692893056, + "grad_norm": 0.4978570342063904, + "learning_rate": 8.262246908547358e-06, + "loss": 4.8997, + "step": 20470 + }, + { + "epoch": 1.3911536893599674, + "grad_norm": 0.45639899373054504, + "learning_rate": 8.26182225845903e-06, + "loss": 4.907, + "step": 20475 + }, + { + "epoch": 1.391493409430629, + "grad_norm": 0.5582418441772461, + "learning_rate": 8.261397608370703e-06, + "loss": 4.8242, + "step": 20480 + }, + { + "epoch": 1.391833129501291, + "grad_norm": 0.5724921822547913, + "learning_rate": 8.260972958282376e-06, + "loss": 4.7624, + "step": 20485 + }, + { + "epoch": 1.3921728495719528, + "grad_norm": 0.5465311408042908, + "learning_rate": 8.260548308194049e-06, + "loss": 4.9407, + "step": 20490 + }, + { + "epoch": 1.3925125696426144, + "grad_norm": 0.6072901487350464, + "learning_rate": 8.260123658105722e-06, + "loss": 4.7515, + "step": 20495 + }, + { + "epoch": 1.3928522897132762, + "grad_norm": 0.8230658173561096, + "learning_rate": 8.259699008017394e-06, + "loss": 4.9352, + "step": 20500 + }, + { + "epoch": 1.393192009783938, + "grad_norm": 0.53493732213974, + "learning_rate": 8.259274357929067e-06, + "loss": 5.0191, + "step": 20505 + }, + { + "epoch": 1.3935317298545997, + "grad_norm": 0.5815847516059875, + "learning_rate": 8.25884970784074e-06, + "loss": 4.7588, + "step": 20510 + }, + { + "epoch": 1.3938714499252616, + "grad_norm": 0.566575288772583, + "learning_rate": 8.258425057752413e-06, + "loss": 4.8225, + "step": 20515 + }, + { + "epoch": 1.3942111699959234, + "grad_norm": 0.5016651749610901, + "learning_rate": 8.258000407664086e-06, + "loss": 4.9362, + "step": 20520 + }, + { + "epoch": 1.394550890066585, + "grad_norm": 0.41862645745277405, + "learning_rate": 8.257575757575758e-06, + "loss": 4.964, + "step": 20525 + }, + { + "epoch": 1.394890610137247, + "grad_norm": 0.6391115188598633, + "learning_rate": 8.257151107487431e-06, + "loss": 4.7531, + "step": 20530 + }, + { + "epoch": 1.3952303302079088, + "grad_norm": 0.44676443934440613, + "learning_rate": 8.256726457399104e-06, + "loss": 4.8634, + "step": 20535 + }, + { + "epoch": 1.3955700502785704, + "grad_norm": 0.41804489493370056, + "learning_rate": 8.256301807310777e-06, + "loss": 4.9226, + "step": 20540 + }, + { + "epoch": 1.3959097703492322, + "grad_norm": 0.4117504060268402, + "learning_rate": 8.25587715722245e-06, + "loss": 4.8198, + "step": 20545 + }, + { + "epoch": 1.396249490419894, + "grad_norm": 0.6179744005203247, + "learning_rate": 8.255452507134122e-06, + "loss": 4.7674, + "step": 20550 + }, + { + "epoch": 1.3965892104905557, + "grad_norm": 0.3534131348133087, + "learning_rate": 8.255027857045795e-06, + "loss": 4.7546, + "step": 20555 + }, + { + "epoch": 1.3969289305612176, + "grad_norm": 0.45446521043777466, + "learning_rate": 8.254603206957468e-06, + "loss": 4.7415, + "step": 20560 + }, + { + "epoch": 1.3972686506318794, + "grad_norm": 0.42283064126968384, + "learning_rate": 8.25417855686914e-06, + "loss": 4.8828, + "step": 20565 + }, + { + "epoch": 1.397608370702541, + "grad_norm": 0.4339250922203064, + "learning_rate": 8.253753906780814e-06, + "loss": 4.9432, + "step": 20570 + }, + { + "epoch": 1.397948090773203, + "grad_norm": 0.4506535530090332, + "learning_rate": 8.253329256692486e-06, + "loss": 4.6457, + "step": 20575 + }, + { + "epoch": 1.3982878108438648, + "grad_norm": 0.5443385243415833, + "learning_rate": 8.252904606604159e-06, + "loss": 5.0001, + "step": 20580 + }, + { + "epoch": 1.3986275309145264, + "grad_norm": 0.5762808322906494, + "learning_rate": 8.252479956515832e-06, + "loss": 4.8015, + "step": 20585 + }, + { + "epoch": 1.3989672509851883, + "grad_norm": 0.45582953095436096, + "learning_rate": 8.252055306427505e-06, + "loss": 4.9155, + "step": 20590 + }, + { + "epoch": 1.3993069710558501, + "grad_norm": 0.5352105498313904, + "learning_rate": 8.251630656339178e-06, + "loss": 4.8809, + "step": 20595 + }, + { + "epoch": 1.3996466911265117, + "grad_norm": 0.5879018306732178, + "learning_rate": 8.25120600625085e-06, + "loss": 4.9355, + "step": 20600 + }, + { + "epoch": 1.3999864111971736, + "grad_norm": 0.4404139518737793, + "learning_rate": 8.250781356162523e-06, + "loss": 4.8701, + "step": 20605 + }, + { + "epoch": 1.4003261312678352, + "grad_norm": 0.5061983466148376, + "learning_rate": 8.250356706074196e-06, + "loss": 4.8659, + "step": 20610 + }, + { + "epoch": 1.400665851338497, + "grad_norm": 0.38031864166259766, + "learning_rate": 8.249932055985869e-06, + "loss": 4.9262, + "step": 20615 + }, + { + "epoch": 1.401005571409159, + "grad_norm": 0.4132619798183441, + "learning_rate": 8.249507405897542e-06, + "loss": 4.8458, + "step": 20620 + }, + { + "epoch": 1.4013452914798206, + "grad_norm": 0.47960948944091797, + "learning_rate": 8.249082755809213e-06, + "loss": 4.7823, + "step": 20625 + }, + { + "epoch": 1.4016850115504824, + "grad_norm": 0.3853764832019806, + "learning_rate": 8.248658105720887e-06, + "loss": 4.9287, + "step": 20630 + }, + { + "epoch": 1.402024731621144, + "grad_norm": 0.4769870638847351, + "learning_rate": 8.24823345563256e-06, + "loss": 4.7322, + "step": 20635 + }, + { + "epoch": 1.402364451691806, + "grad_norm": 0.41845861077308655, + "learning_rate": 8.247808805544231e-06, + "loss": 5.0381, + "step": 20640 + }, + { + "epoch": 1.4027041717624678, + "grad_norm": 0.46277594566345215, + "learning_rate": 8.247384155455906e-06, + "loss": 5.0701, + "step": 20645 + }, + { + "epoch": 1.4030438918331294, + "grad_norm": 0.5954791903495789, + "learning_rate": 8.246959505367578e-06, + "loss": 4.995, + "step": 20650 + }, + { + "epoch": 1.4033836119037912, + "grad_norm": 0.43248963356018066, + "learning_rate": 8.24653485527925e-06, + "loss": 4.8241, + "step": 20655 + }, + { + "epoch": 1.403723331974453, + "grad_norm": 0.5001408457756042, + "learning_rate": 8.246110205190924e-06, + "loss": 4.8246, + "step": 20660 + }, + { + "epoch": 1.4040630520451147, + "grad_norm": 0.4301888048648834, + "learning_rate": 8.245685555102597e-06, + "loss": 4.7983, + "step": 20665 + }, + { + "epoch": 1.4044027721157766, + "grad_norm": 0.5156233310699463, + "learning_rate": 8.245260905014268e-06, + "loss": 4.9338, + "step": 20670 + }, + { + "epoch": 1.4047424921864384, + "grad_norm": 0.4354807436466217, + "learning_rate": 8.244836254925942e-06, + "loss": 4.8913, + "step": 20675 + }, + { + "epoch": 1.4050822122571, + "grad_norm": 0.4091547131538391, + "learning_rate": 8.244411604837615e-06, + "loss": 4.8967, + "step": 20680 + }, + { + "epoch": 1.405421932327762, + "grad_norm": 0.34487056732177734, + "learning_rate": 8.243986954749286e-06, + "loss": 4.8075, + "step": 20685 + }, + { + "epoch": 1.4057616523984238, + "grad_norm": 0.42491793632507324, + "learning_rate": 8.24356230466096e-06, + "loss": 4.7854, + "step": 20690 + }, + { + "epoch": 1.4061013724690854, + "grad_norm": 0.4879341721534729, + "learning_rate": 8.243137654572632e-06, + "loss": 4.8564, + "step": 20695 + }, + { + "epoch": 1.4064410925397473, + "grad_norm": 0.5138850212097168, + "learning_rate": 8.242713004484305e-06, + "loss": 4.9454, + "step": 20700 + }, + { + "epoch": 1.406780812610409, + "grad_norm": 0.4931190013885498, + "learning_rate": 8.24228835439598e-06, + "loss": 4.9908, + "step": 20705 + }, + { + "epoch": 1.4071205326810707, + "grad_norm": 0.4840802252292633, + "learning_rate": 8.24186370430765e-06, + "loss": 5.3443, + "step": 20710 + }, + { + "epoch": 1.4074602527517326, + "grad_norm": 0.471152126789093, + "learning_rate": 8.241439054219323e-06, + "loss": 4.809, + "step": 20715 + }, + { + "epoch": 1.4077999728223944, + "grad_norm": 0.4305839538574219, + "learning_rate": 8.241014404130998e-06, + "loss": 4.9009, + "step": 20720 + }, + { + "epoch": 1.408139692893056, + "grad_norm": 0.44592738151550293, + "learning_rate": 8.240589754042669e-06, + "loss": 4.7573, + "step": 20725 + }, + { + "epoch": 1.408479412963718, + "grad_norm": 0.5336706042289734, + "learning_rate": 8.240165103954341e-06, + "loss": 4.5709, + "step": 20730 + }, + { + "epoch": 1.4088191330343798, + "grad_norm": 0.4454817473888397, + "learning_rate": 8.239740453866016e-06, + "loss": 5.0642, + "step": 20735 + }, + { + "epoch": 1.4091588531050414, + "grad_norm": 0.7442151308059692, + "learning_rate": 8.239315803777687e-06, + "loss": 4.8183, + "step": 20740 + }, + { + "epoch": 1.4094985731757033, + "grad_norm": 0.4472779929637909, + "learning_rate": 8.23889115368936e-06, + "loss": 4.6957, + "step": 20745 + }, + { + "epoch": 1.4098382932463651, + "grad_norm": 0.6176978349685669, + "learning_rate": 8.238466503601034e-06, + "loss": 5.1737, + "step": 20750 + }, + { + "epoch": 1.4101780133170267, + "grad_norm": 0.4622267782688141, + "learning_rate": 8.238041853512705e-06, + "loss": 4.8183, + "step": 20755 + }, + { + "epoch": 1.4105177333876886, + "grad_norm": 0.4526957869529724, + "learning_rate": 8.23761720342438e-06, + "loss": 4.6964, + "step": 20760 + }, + { + "epoch": 1.4108574534583505, + "grad_norm": 0.4303617775440216, + "learning_rate": 8.237192553336053e-06, + "loss": 4.7808, + "step": 20765 + }, + { + "epoch": 1.411197173529012, + "grad_norm": 0.6120204925537109, + "learning_rate": 8.236767903247724e-06, + "loss": 4.9863, + "step": 20770 + }, + { + "epoch": 1.411536893599674, + "grad_norm": 0.5054359436035156, + "learning_rate": 8.236343253159398e-06, + "loss": 4.5488, + "step": 20775 + }, + { + "epoch": 1.4118766136703356, + "grad_norm": 0.4588595926761627, + "learning_rate": 8.23591860307107e-06, + "loss": 4.9841, + "step": 20780 + }, + { + "epoch": 1.4122163337409974, + "grad_norm": 0.5002236366271973, + "learning_rate": 8.235493952982742e-06, + "loss": 5.0174, + "step": 20785 + }, + { + "epoch": 1.4125560538116593, + "grad_norm": 0.538806676864624, + "learning_rate": 8.235069302894417e-06, + "loss": 4.8727, + "step": 20790 + }, + { + "epoch": 1.412895773882321, + "grad_norm": 0.4392905831336975, + "learning_rate": 8.234644652806088e-06, + "loss": 4.776, + "step": 20795 + }, + { + "epoch": 1.4132354939529828, + "grad_norm": 0.5742761492729187, + "learning_rate": 8.23422000271776e-06, + "loss": 5.0621, + "step": 20800 + }, + { + "epoch": 1.4135752140236444, + "grad_norm": 0.5043716430664062, + "learning_rate": 8.233795352629435e-06, + "loss": 4.8869, + "step": 20805 + }, + { + "epoch": 1.4139149340943062, + "grad_norm": 0.43432897329330444, + "learning_rate": 8.233370702541106e-06, + "loss": 4.657, + "step": 20810 + }, + { + "epoch": 1.414254654164968, + "grad_norm": 0.4663873016834259, + "learning_rate": 8.232946052452779e-06, + "loss": 4.8984, + "step": 20815 + }, + { + "epoch": 1.4145943742356297, + "grad_norm": 0.5583134293556213, + "learning_rate": 8.232521402364454e-06, + "loss": 4.992, + "step": 20820 + }, + { + "epoch": 1.4149340943062916, + "grad_norm": 0.44352370500564575, + "learning_rate": 8.232096752276125e-06, + "loss": 4.8927, + "step": 20825 + }, + { + "epoch": 1.4152738143769534, + "grad_norm": 0.48476049304008484, + "learning_rate": 8.231672102187797e-06, + "loss": 4.8884, + "step": 20830 + }, + { + "epoch": 1.415613534447615, + "grad_norm": 0.4912559390068054, + "learning_rate": 8.231247452099472e-06, + "loss": 4.722, + "step": 20835 + }, + { + "epoch": 1.415953254518277, + "grad_norm": 0.3808330297470093, + "learning_rate": 8.230822802011143e-06, + "loss": 4.7463, + "step": 20840 + }, + { + "epoch": 1.4162929745889388, + "grad_norm": 0.5694152116775513, + "learning_rate": 8.230398151922816e-06, + "loss": 4.9614, + "step": 20845 + }, + { + "epoch": 1.4166326946596004, + "grad_norm": 0.42848992347717285, + "learning_rate": 8.229973501834489e-06, + "loss": 4.8093, + "step": 20850 + }, + { + "epoch": 1.4169724147302623, + "grad_norm": 0.3917299509048462, + "learning_rate": 8.229548851746161e-06, + "loss": 4.6147, + "step": 20855 + }, + { + "epoch": 1.417312134800924, + "grad_norm": 0.4126831591129303, + "learning_rate": 8.229124201657834e-06, + "loss": 4.8396, + "step": 20860 + }, + { + "epoch": 1.4176518548715857, + "grad_norm": 0.49711090326309204, + "learning_rate": 8.228699551569507e-06, + "loss": 4.8367, + "step": 20865 + }, + { + "epoch": 1.4179915749422476, + "grad_norm": 0.4101358652114868, + "learning_rate": 8.22827490148118e-06, + "loss": 4.9789, + "step": 20870 + }, + { + "epoch": 1.4183312950129094, + "grad_norm": 0.41255268454551697, + "learning_rate": 8.227850251392853e-06, + "loss": 4.7755, + "step": 20875 + }, + { + "epoch": 1.418671015083571, + "grad_norm": 0.45253947377204895, + "learning_rate": 8.227425601304526e-06, + "loss": 5.1124, + "step": 20880 + }, + { + "epoch": 1.419010735154233, + "grad_norm": 0.43227508664131165, + "learning_rate": 8.227000951216198e-06, + "loss": 4.8748, + "step": 20885 + }, + { + "epoch": 1.4193504552248948, + "grad_norm": 0.3735332489013672, + "learning_rate": 8.226576301127871e-06, + "loss": 4.8679, + "step": 20890 + }, + { + "epoch": 1.4196901752955564, + "grad_norm": 0.41216468811035156, + "learning_rate": 8.226151651039544e-06, + "loss": 5.0271, + "step": 20895 + }, + { + "epoch": 1.4200298953662183, + "grad_norm": 0.5186771750450134, + "learning_rate": 8.225727000951217e-06, + "loss": 4.8582, + "step": 20900 + }, + { + "epoch": 1.4203696154368801, + "grad_norm": 0.5614728331565857, + "learning_rate": 8.22530235086289e-06, + "loss": 4.6449, + "step": 20905 + }, + { + "epoch": 1.4207093355075417, + "grad_norm": 0.42678022384643555, + "learning_rate": 8.224877700774562e-06, + "loss": 5.1832, + "step": 20910 + }, + { + "epoch": 1.4210490555782036, + "grad_norm": 0.48621878027915955, + "learning_rate": 8.224453050686235e-06, + "loss": 4.92, + "step": 20915 + }, + { + "epoch": 1.4213887756488655, + "grad_norm": 0.4780360758304596, + "learning_rate": 8.224028400597908e-06, + "loss": 5.0975, + "step": 20920 + }, + { + "epoch": 1.421728495719527, + "grad_norm": 0.4674043357372284, + "learning_rate": 8.22360375050958e-06, + "loss": 4.8444, + "step": 20925 + }, + { + "epoch": 1.422068215790189, + "grad_norm": 0.3998740315437317, + "learning_rate": 8.223179100421254e-06, + "loss": 4.8285, + "step": 20930 + }, + { + "epoch": 1.4224079358608508, + "grad_norm": 0.510077714920044, + "learning_rate": 8.222754450332926e-06, + "loss": 4.8133, + "step": 20935 + }, + { + "epoch": 1.4227476559315124, + "grad_norm": 0.42369261384010315, + "learning_rate": 8.222329800244599e-06, + "loss": 4.7754, + "step": 20940 + }, + { + "epoch": 1.4230873760021743, + "grad_norm": 0.44802701473236084, + "learning_rate": 8.221905150156272e-06, + "loss": 4.8795, + "step": 20945 + }, + { + "epoch": 1.423427096072836, + "grad_norm": 0.5890004634857178, + "learning_rate": 8.221480500067945e-06, + "loss": 5.1135, + "step": 20950 + }, + { + "epoch": 1.4237668161434978, + "grad_norm": 0.4003121554851532, + "learning_rate": 8.221055849979618e-06, + "loss": 4.7776, + "step": 20955 + }, + { + "epoch": 1.4241065362141596, + "grad_norm": 0.6013637781143188, + "learning_rate": 8.22063119989129e-06, + "loss": 5.0503, + "step": 20960 + }, + { + "epoch": 1.4244462562848212, + "grad_norm": 0.41247108578681946, + "learning_rate": 8.220206549802963e-06, + "loss": 4.6043, + "step": 20965 + }, + { + "epoch": 1.424785976355483, + "grad_norm": 0.5228216052055359, + "learning_rate": 8.219781899714636e-06, + "loss": 4.9321, + "step": 20970 + }, + { + "epoch": 1.4251256964261447, + "grad_norm": 0.5067607760429382, + "learning_rate": 8.219357249626309e-06, + "loss": 4.8899, + "step": 20975 + }, + { + "epoch": 1.4254654164968066, + "grad_norm": 0.4673682451248169, + "learning_rate": 8.218932599537982e-06, + "loss": 5.0279, + "step": 20980 + }, + { + "epoch": 1.4258051365674684, + "grad_norm": 0.5193368792533875, + "learning_rate": 8.218507949449654e-06, + "loss": 4.8045, + "step": 20985 + }, + { + "epoch": 1.42614485663813, + "grad_norm": 0.6721665859222412, + "learning_rate": 8.218083299361327e-06, + "loss": 4.8361, + "step": 20990 + }, + { + "epoch": 1.426484576708792, + "grad_norm": 0.3860321342945099, + "learning_rate": 8.217658649273e-06, + "loss": 4.6341, + "step": 20995 + }, + { + "epoch": 1.4268242967794538, + "grad_norm": 0.40075692534446716, + "learning_rate": 8.217233999184673e-06, + "loss": 4.6137, + "step": 21000 + }, + { + "epoch": 1.4271640168501154, + "grad_norm": 0.5171543955802917, + "learning_rate": 8.216809349096346e-06, + "loss": 5.0058, + "step": 21005 + }, + { + "epoch": 1.4275037369207773, + "grad_norm": 0.4211844205856323, + "learning_rate": 8.216384699008018e-06, + "loss": 5.1459, + "step": 21010 + }, + { + "epoch": 1.427843456991439, + "grad_norm": 0.46353480219841003, + "learning_rate": 8.215960048919691e-06, + "loss": 5.2156, + "step": 21015 + }, + { + "epoch": 1.4281831770621007, + "grad_norm": 0.48519909381866455, + "learning_rate": 8.215535398831364e-06, + "loss": 4.8201, + "step": 21020 + }, + { + "epoch": 1.4285228971327626, + "grad_norm": 0.40208253264427185, + "learning_rate": 8.215110748743037e-06, + "loss": 4.8051, + "step": 21025 + }, + { + "epoch": 1.4288626172034244, + "grad_norm": 0.6270159482955933, + "learning_rate": 8.21468609865471e-06, + "loss": 4.9049, + "step": 21030 + }, + { + "epoch": 1.429202337274086, + "grad_norm": 0.4509592056274414, + "learning_rate": 8.214261448566382e-06, + "loss": 4.9868, + "step": 21035 + }, + { + "epoch": 1.429542057344748, + "grad_norm": 0.3694640100002289, + "learning_rate": 8.213836798478053e-06, + "loss": 4.8519, + "step": 21040 + }, + { + "epoch": 1.4298817774154098, + "grad_norm": 0.32406705617904663, + "learning_rate": 8.213412148389728e-06, + "loss": 4.7198, + "step": 21045 + }, + { + "epoch": 1.4302214974860714, + "grad_norm": 0.4458869397640228, + "learning_rate": 8.2129874983014e-06, + "loss": 4.7966, + "step": 21050 + }, + { + "epoch": 1.4305612175567333, + "grad_norm": 0.5107039213180542, + "learning_rate": 8.212562848213072e-06, + "loss": 4.9269, + "step": 21055 + }, + { + "epoch": 1.4309009376273951, + "grad_norm": 0.4360866844654083, + "learning_rate": 8.212138198124746e-06, + "loss": 4.8646, + "step": 21060 + }, + { + "epoch": 1.4312406576980568, + "grad_norm": 0.3662164807319641, + "learning_rate": 8.211713548036419e-06, + "loss": 4.8122, + "step": 21065 + }, + { + "epoch": 1.4315803777687186, + "grad_norm": 0.4472116231918335, + "learning_rate": 8.21128889794809e-06, + "loss": 4.8716, + "step": 21070 + }, + { + "epoch": 1.4319200978393805, + "grad_norm": 0.38645973801612854, + "learning_rate": 8.210864247859765e-06, + "loss": 4.6958, + "step": 21075 + }, + { + "epoch": 1.432259817910042, + "grad_norm": 0.46917521953582764, + "learning_rate": 8.210439597771438e-06, + "loss": 4.7226, + "step": 21080 + }, + { + "epoch": 1.432599537980704, + "grad_norm": 0.4379695951938629, + "learning_rate": 8.210014947683109e-06, + "loss": 4.9022, + "step": 21085 + }, + { + "epoch": 1.4329392580513658, + "grad_norm": 0.4726272225379944, + "learning_rate": 8.209590297594783e-06, + "loss": 4.9295, + "step": 21090 + }, + { + "epoch": 1.4332789781220274, + "grad_norm": 0.45003077387809753, + "learning_rate": 8.209165647506456e-06, + "loss": 4.8843, + "step": 21095 + }, + { + "epoch": 1.4336186981926893, + "grad_norm": 0.4905664324760437, + "learning_rate": 8.208740997418129e-06, + "loss": 4.5396, + "step": 21100 + }, + { + "epoch": 1.4339584182633511, + "grad_norm": 0.4895677864551544, + "learning_rate": 8.208316347329802e-06, + "loss": 4.6443, + "step": 21105 + }, + { + "epoch": 1.4342981383340128, + "grad_norm": 0.36284878849983215, + "learning_rate": 8.207891697241474e-06, + "loss": 4.7168, + "step": 21110 + }, + { + "epoch": 1.4346378584046746, + "grad_norm": 0.38617444038391113, + "learning_rate": 8.207467047153147e-06, + "loss": 4.6917, + "step": 21115 + }, + { + "epoch": 1.4349775784753362, + "grad_norm": 0.5079323053359985, + "learning_rate": 8.20704239706482e-06, + "loss": 4.8553, + "step": 21120 + }, + { + "epoch": 1.435317298545998, + "grad_norm": 0.4178099036216736, + "learning_rate": 8.206617746976491e-06, + "loss": 4.7584, + "step": 21125 + }, + { + "epoch": 1.43565701861666, + "grad_norm": 0.5023597478866577, + "learning_rate": 8.206193096888166e-06, + "loss": 5.0659, + "step": 21130 + }, + { + "epoch": 1.4359967386873216, + "grad_norm": 0.37395304441452026, + "learning_rate": 8.205768446799838e-06, + "loss": 4.7777, + "step": 21135 + }, + { + "epoch": 1.4363364587579834, + "grad_norm": 0.4182054400444031, + "learning_rate": 8.20534379671151e-06, + "loss": 4.7602, + "step": 21140 + }, + { + "epoch": 1.436676178828645, + "grad_norm": 0.44835278391838074, + "learning_rate": 8.204919146623184e-06, + "loss": 4.8556, + "step": 21145 + }, + { + "epoch": 1.437015898899307, + "grad_norm": 0.5824118852615356, + "learning_rate": 8.204494496534857e-06, + "loss": 4.988, + "step": 21150 + }, + { + "epoch": 1.4373556189699688, + "grad_norm": 0.4352263808250427, + "learning_rate": 8.204069846446528e-06, + "loss": 4.6888, + "step": 21155 + }, + { + "epoch": 1.4376953390406304, + "grad_norm": 0.671047031879425, + "learning_rate": 8.203645196358202e-06, + "loss": 4.5808, + "step": 21160 + }, + { + "epoch": 1.4380350591112923, + "grad_norm": 0.6360852122306824, + "learning_rate": 8.203220546269875e-06, + "loss": 5.0646, + "step": 21165 + }, + { + "epoch": 1.438374779181954, + "grad_norm": 0.4346616268157959, + "learning_rate": 8.202795896181546e-06, + "loss": 5.0718, + "step": 21170 + }, + { + "epoch": 1.4387144992526157, + "grad_norm": 0.5196588039398193, + "learning_rate": 8.20237124609322e-06, + "loss": 4.9112, + "step": 21175 + }, + { + "epoch": 1.4390542193232776, + "grad_norm": 0.39375099539756775, + "learning_rate": 8.201946596004894e-06, + "loss": 4.5744, + "step": 21180 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.4429626166820526, + "learning_rate": 8.201521945916565e-06, + "loss": 5.0134, + "step": 21185 + }, + { + "epoch": 1.439733659464601, + "grad_norm": 0.46169352531433105, + "learning_rate": 8.201097295828239e-06, + "loss": 4.9257, + "step": 21190 + }, + { + "epoch": 1.440073379535263, + "grad_norm": 0.5095987319946289, + "learning_rate": 8.20067264573991e-06, + "loss": 4.7747, + "step": 21195 + }, + { + "epoch": 1.4404130996059248, + "grad_norm": 0.47671306133270264, + "learning_rate": 8.200247995651583e-06, + "loss": 4.6936, + "step": 21200 + }, + { + "epoch": 1.4407528196765864, + "grad_norm": 0.5162780284881592, + "learning_rate": 8.199823345563258e-06, + "loss": 5.1565, + "step": 21205 + }, + { + "epoch": 1.4410925397472483, + "grad_norm": 0.4368775188922882, + "learning_rate": 8.199398695474929e-06, + "loss": 4.7686, + "step": 21210 + }, + { + "epoch": 1.4414322598179101, + "grad_norm": 0.47638803720474243, + "learning_rate": 8.198974045386601e-06, + "loss": 4.8669, + "step": 21215 + }, + { + "epoch": 1.4417719798885718, + "grad_norm": 0.5999294519424438, + "learning_rate": 8.198549395298276e-06, + "loss": 4.7609, + "step": 21220 + }, + { + "epoch": 1.4421116999592336, + "grad_norm": 0.5196375846862793, + "learning_rate": 8.198124745209947e-06, + "loss": 5.2605, + "step": 21225 + }, + { + "epoch": 1.4424514200298955, + "grad_norm": 0.5295382142066956, + "learning_rate": 8.19770009512162e-06, + "loss": 4.7785, + "step": 21230 + }, + { + "epoch": 1.442791140100557, + "grad_norm": 0.5704073309898376, + "learning_rate": 8.197275445033294e-06, + "loss": 4.8124, + "step": 21235 + }, + { + "epoch": 1.443130860171219, + "grad_norm": 0.4193112254142761, + "learning_rate": 8.196850794944965e-06, + "loss": 4.827, + "step": 21240 + }, + { + "epoch": 1.4434705802418808, + "grad_norm": 0.6042715311050415, + "learning_rate": 8.196426144856638e-06, + "loss": 5.1139, + "step": 21245 + }, + { + "epoch": 1.4438103003125424, + "grad_norm": 0.5078057646751404, + "learning_rate": 8.196001494768313e-06, + "loss": 4.8271, + "step": 21250 + }, + { + "epoch": 1.4441500203832043, + "grad_norm": 0.4754032790660858, + "learning_rate": 8.195576844679984e-06, + "loss": 4.9277, + "step": 21255 + }, + { + "epoch": 1.4444897404538661, + "grad_norm": 0.4747053384780884, + "learning_rate": 8.195152194591657e-06, + "loss": 4.7843, + "step": 21260 + }, + { + "epoch": 1.4448294605245278, + "grad_norm": 0.44950586557388306, + "learning_rate": 8.19472754450333e-06, + "loss": 4.4748, + "step": 21265 + }, + { + "epoch": 1.4451691805951896, + "grad_norm": 0.3925251066684723, + "learning_rate": 8.194302894415002e-06, + "loss": 4.8416, + "step": 21270 + }, + { + "epoch": 1.4455089006658515, + "grad_norm": 0.575383186340332, + "learning_rate": 8.193878244326675e-06, + "loss": 4.8312, + "step": 21275 + }, + { + "epoch": 1.445848620736513, + "grad_norm": 0.4034932255744934, + "learning_rate": 8.193453594238348e-06, + "loss": 4.9123, + "step": 21280 + }, + { + "epoch": 1.446188340807175, + "grad_norm": 0.43354332447052, + "learning_rate": 8.19302894415002e-06, + "loss": 4.8586, + "step": 21285 + }, + { + "epoch": 1.4465280608778366, + "grad_norm": 0.3805350959300995, + "learning_rate": 8.192604294061693e-06, + "loss": 4.8869, + "step": 21290 + }, + { + "epoch": 1.4468677809484984, + "grad_norm": 0.5906457901000977, + "learning_rate": 8.192179643973366e-06, + "loss": 4.9726, + "step": 21295 + }, + { + "epoch": 1.4472075010191603, + "grad_norm": 0.43134182691574097, + "learning_rate": 8.191754993885039e-06, + "loss": 4.785, + "step": 21300 + }, + { + "epoch": 1.447547221089822, + "grad_norm": 0.5300790071487427, + "learning_rate": 8.191330343796712e-06, + "loss": 4.788, + "step": 21305 + }, + { + "epoch": 1.4478869411604838, + "grad_norm": 0.4741014242172241, + "learning_rate": 8.190905693708385e-06, + "loss": 4.9, + "step": 21310 + }, + { + "epoch": 1.4482266612311454, + "grad_norm": 0.6269193887710571, + "learning_rate": 8.190481043620057e-06, + "loss": 4.6033, + "step": 21315 + }, + { + "epoch": 1.4485663813018073, + "grad_norm": 0.5470094084739685, + "learning_rate": 8.19005639353173e-06, + "loss": 4.7586, + "step": 21320 + }, + { + "epoch": 1.4489061013724691, + "grad_norm": 0.48873618245124817, + "learning_rate": 8.189631743443403e-06, + "loss": 4.8601, + "step": 21325 + }, + { + "epoch": 1.4492458214431307, + "grad_norm": 0.5410587787628174, + "learning_rate": 8.189207093355076e-06, + "loss": 4.726, + "step": 21330 + }, + { + "epoch": 1.4495855415137926, + "grad_norm": 0.484819233417511, + "learning_rate": 8.188782443266749e-06, + "loss": 5.0038, + "step": 21335 + }, + { + "epoch": 1.4499252615844545, + "grad_norm": 0.3826753795146942, + "learning_rate": 8.188357793178421e-06, + "loss": 4.7427, + "step": 21340 + }, + { + "epoch": 1.450264981655116, + "grad_norm": 0.3787674009799957, + "learning_rate": 8.187933143090094e-06, + "loss": 4.8207, + "step": 21345 + }, + { + "epoch": 1.450604701725778, + "grad_norm": 0.655407726764679, + "learning_rate": 8.187508493001767e-06, + "loss": 4.8157, + "step": 21350 + }, + { + "epoch": 1.4509444217964398, + "grad_norm": 0.5097930431365967, + "learning_rate": 8.18708384291344e-06, + "loss": 4.9401, + "step": 21355 + }, + { + "epoch": 1.4512841418671014, + "grad_norm": 0.4854896664619446, + "learning_rate": 8.186659192825113e-06, + "loss": 4.7713, + "step": 21360 + }, + { + "epoch": 1.4516238619377633, + "grad_norm": 0.49029698967933655, + "learning_rate": 8.186234542736785e-06, + "loss": 4.5172, + "step": 21365 + }, + { + "epoch": 1.4519635820084251, + "grad_norm": 0.4369466006755829, + "learning_rate": 8.185809892648458e-06, + "loss": 4.9204, + "step": 21370 + }, + { + "epoch": 1.4523033020790868, + "grad_norm": 0.4087437689304352, + "learning_rate": 8.185385242560131e-06, + "loss": 5.0522, + "step": 21375 + }, + { + "epoch": 1.4526430221497486, + "grad_norm": 0.5471604466438293, + "learning_rate": 8.184960592471804e-06, + "loss": 4.6322, + "step": 21380 + }, + { + "epoch": 1.4529827422204105, + "grad_norm": 0.5121372938156128, + "learning_rate": 8.184535942383477e-06, + "loss": 5.0027, + "step": 21385 + }, + { + "epoch": 1.453322462291072, + "grad_norm": 0.4560524821281433, + "learning_rate": 8.18411129229515e-06, + "loss": 5.0162, + "step": 21390 + }, + { + "epoch": 1.453662182361734, + "grad_norm": 0.562006413936615, + "learning_rate": 8.183686642206822e-06, + "loss": 4.6665, + "step": 21395 + }, + { + "epoch": 1.4540019024323958, + "grad_norm": 0.4092576205730438, + "learning_rate": 8.183261992118495e-06, + "loss": 4.5755, + "step": 21400 + }, + { + "epoch": 1.4543416225030574, + "grad_norm": 0.4284387528896332, + "learning_rate": 8.182837342030168e-06, + "loss": 4.6889, + "step": 21405 + }, + { + "epoch": 1.4546813425737193, + "grad_norm": 0.5238933563232422, + "learning_rate": 8.18241269194184e-06, + "loss": 4.891, + "step": 21410 + }, + { + "epoch": 1.4550210626443811, + "grad_norm": 0.5079160332679749, + "learning_rate": 8.181988041853513e-06, + "loss": 4.7822, + "step": 21415 + }, + { + "epoch": 1.4553607827150428, + "grad_norm": 0.7905279397964478, + "learning_rate": 8.181563391765186e-06, + "loss": 4.742, + "step": 21420 + }, + { + "epoch": 1.4557005027857046, + "grad_norm": 0.4523125886917114, + "learning_rate": 8.181138741676859e-06, + "loss": 4.7533, + "step": 21425 + }, + { + "epoch": 1.4560402228563665, + "grad_norm": 0.5539008975028992, + "learning_rate": 8.180714091588532e-06, + "loss": 4.7875, + "step": 21430 + }, + { + "epoch": 1.456379942927028, + "grad_norm": 0.4081052541732788, + "learning_rate": 8.180289441500205e-06, + "loss": 5.0951, + "step": 21435 + }, + { + "epoch": 1.45671966299769, + "grad_norm": 0.4788355827331543, + "learning_rate": 8.179864791411877e-06, + "loss": 4.7874, + "step": 21440 + }, + { + "epoch": 1.4570593830683518, + "grad_norm": 0.5243462920188904, + "learning_rate": 8.17944014132355e-06, + "loss": 5.0947, + "step": 21445 + }, + { + "epoch": 1.4573991031390134, + "grad_norm": 0.4428265392780304, + "learning_rate": 8.179015491235223e-06, + "loss": 4.8756, + "step": 21450 + }, + { + "epoch": 1.4577388232096753, + "grad_norm": 0.4503236413002014, + "learning_rate": 8.178590841146896e-06, + "loss": 4.938, + "step": 21455 + }, + { + "epoch": 1.458078543280337, + "grad_norm": 0.6445163488388062, + "learning_rate": 8.178166191058569e-06, + "loss": 4.7403, + "step": 21460 + }, + { + "epoch": 1.4584182633509988, + "grad_norm": 0.5049019455909729, + "learning_rate": 8.177741540970241e-06, + "loss": 4.9261, + "step": 21465 + }, + { + "epoch": 1.4587579834216606, + "grad_norm": 0.3909730017185211, + "learning_rate": 8.177316890881914e-06, + "loss": 4.4493, + "step": 21470 + }, + { + "epoch": 1.4590977034923223, + "grad_norm": 0.5006139278411865, + "learning_rate": 8.176892240793587e-06, + "loss": 4.8337, + "step": 21475 + }, + { + "epoch": 1.4594374235629841, + "grad_norm": 0.42171671986579895, + "learning_rate": 8.17646759070526e-06, + "loss": 5.0298, + "step": 21480 + }, + { + "epoch": 1.4597771436336457, + "grad_norm": 0.6019606590270996, + "learning_rate": 8.176042940616933e-06, + "loss": 4.9761, + "step": 21485 + }, + { + "epoch": 1.4601168637043076, + "grad_norm": 0.5918442606925964, + "learning_rate": 8.175618290528606e-06, + "loss": 5.1211, + "step": 21490 + }, + { + "epoch": 1.4604565837749695, + "grad_norm": 0.4721817970275879, + "learning_rate": 8.175193640440278e-06, + "loss": 4.8517, + "step": 21495 + }, + { + "epoch": 1.460796303845631, + "grad_norm": 0.45358356833457947, + "learning_rate": 8.174768990351951e-06, + "loss": 4.7937, + "step": 21500 + }, + { + "epoch": 1.461136023916293, + "grad_norm": 0.4323587417602539, + "learning_rate": 8.174344340263624e-06, + "loss": 4.8551, + "step": 21505 + }, + { + "epoch": 1.4614757439869548, + "grad_norm": 0.3953838348388672, + "learning_rate": 8.173919690175297e-06, + "loss": 4.9647, + "step": 21510 + }, + { + "epoch": 1.4618154640576164, + "grad_norm": 0.5551892518997192, + "learning_rate": 8.17349504008697e-06, + "loss": 5.0535, + "step": 21515 + }, + { + "epoch": 1.4621551841282783, + "grad_norm": 0.406004935503006, + "learning_rate": 8.173070389998642e-06, + "loss": 4.3641, + "step": 21520 + }, + { + "epoch": 1.4624949041989401, + "grad_norm": 0.4120928645133972, + "learning_rate": 8.172645739910315e-06, + "loss": 4.8881, + "step": 21525 + }, + { + "epoch": 1.4628346242696018, + "grad_norm": 0.4894176423549652, + "learning_rate": 8.172221089821988e-06, + "loss": 4.8225, + "step": 21530 + }, + { + "epoch": 1.4631743443402636, + "grad_norm": 0.555510938167572, + "learning_rate": 8.17179643973366e-06, + "loss": 5.0565, + "step": 21535 + }, + { + "epoch": 1.4635140644109255, + "grad_norm": 0.9230882525444031, + "learning_rate": 8.171371789645332e-06, + "loss": 4.9361, + "step": 21540 + }, + { + "epoch": 1.463853784481587, + "grad_norm": 0.47723621129989624, + "learning_rate": 8.170947139557006e-06, + "loss": 4.8567, + "step": 21545 + }, + { + "epoch": 1.464193504552249, + "grad_norm": 0.44052988290786743, + "learning_rate": 8.170522489468679e-06, + "loss": 4.9507, + "step": 21550 + }, + { + "epoch": 1.4645332246229108, + "grad_norm": 0.6862558126449585, + "learning_rate": 8.17009783938035e-06, + "loss": 5.0206, + "step": 21555 + }, + { + "epoch": 1.4648729446935724, + "grad_norm": 0.41005298495292664, + "learning_rate": 8.169673189292025e-06, + "loss": 4.6792, + "step": 21560 + }, + { + "epoch": 1.4652126647642343, + "grad_norm": 0.3268030285835266, + "learning_rate": 8.169248539203698e-06, + "loss": 4.9928, + "step": 21565 + }, + { + "epoch": 1.4655523848348961, + "grad_norm": 0.53488689661026, + "learning_rate": 8.168823889115369e-06, + "loss": 5.1447, + "step": 21570 + }, + { + "epoch": 1.4658921049055578, + "grad_norm": 0.417776495218277, + "learning_rate": 8.168399239027043e-06, + "loss": 4.8265, + "step": 21575 + }, + { + "epoch": 1.4662318249762196, + "grad_norm": 0.49488958716392517, + "learning_rate": 8.167974588938716e-06, + "loss": 5.0045, + "step": 21580 + }, + { + "epoch": 1.4665715450468815, + "grad_norm": 0.47409588098526, + "learning_rate": 8.167549938850387e-06, + "loss": 4.8796, + "step": 21585 + }, + { + "epoch": 1.466911265117543, + "grad_norm": 0.37268805503845215, + "learning_rate": 8.167125288762062e-06, + "loss": 4.5043, + "step": 21590 + }, + { + "epoch": 1.467250985188205, + "grad_norm": 0.502851128578186, + "learning_rate": 8.166700638673734e-06, + "loss": 4.9161, + "step": 21595 + }, + { + "epoch": 1.4675907052588668, + "grad_norm": 0.5054653882980347, + "learning_rate": 8.166275988585405e-06, + "loss": 5.0524, + "step": 21600 + }, + { + "epoch": 1.4679304253295284, + "grad_norm": 0.4353584945201874, + "learning_rate": 8.16585133849708e-06, + "loss": 4.8528, + "step": 21605 + }, + { + "epoch": 1.4682701454001903, + "grad_norm": 0.4804656207561493, + "learning_rate": 8.165426688408751e-06, + "loss": 4.8091, + "step": 21610 + }, + { + "epoch": 1.4686098654708521, + "grad_norm": 0.3851783871650696, + "learning_rate": 8.165002038320424e-06, + "loss": 4.4994, + "step": 21615 + }, + { + "epoch": 1.4689495855415138, + "grad_norm": 0.45113974809646606, + "learning_rate": 8.164577388232098e-06, + "loss": 4.7095, + "step": 21620 + }, + { + "epoch": 1.4692893056121756, + "grad_norm": 0.5587056875228882, + "learning_rate": 8.16415273814377e-06, + "loss": 4.8531, + "step": 21625 + }, + { + "epoch": 1.4696290256828373, + "grad_norm": 0.4593208432197571, + "learning_rate": 8.163728088055442e-06, + "loss": 4.5945, + "step": 21630 + }, + { + "epoch": 1.4699687457534991, + "grad_norm": 0.5182281732559204, + "learning_rate": 8.163303437967117e-06, + "loss": 4.9316, + "step": 21635 + }, + { + "epoch": 1.470308465824161, + "grad_norm": 0.5810526609420776, + "learning_rate": 8.162878787878788e-06, + "loss": 4.757, + "step": 21640 + }, + { + "epoch": 1.4706481858948226, + "grad_norm": 0.4360375702381134, + "learning_rate": 8.16245413779046e-06, + "loss": 4.6579, + "step": 21645 + }, + { + "epoch": 1.4709879059654845, + "grad_norm": 0.4405723512172699, + "learning_rate": 8.162029487702135e-06, + "loss": 4.9752, + "step": 21650 + }, + { + "epoch": 1.471327626036146, + "grad_norm": 0.6366196870803833, + "learning_rate": 8.161604837613806e-06, + "loss": 4.6613, + "step": 21655 + }, + { + "epoch": 1.471667346106808, + "grad_norm": 0.4354563355445862, + "learning_rate": 8.161180187525479e-06, + "loss": 4.836, + "step": 21660 + }, + { + "epoch": 1.4720070661774698, + "grad_norm": 0.5692440867424011, + "learning_rate": 8.160755537437154e-06, + "loss": 4.975, + "step": 21665 + }, + { + "epoch": 1.4723467862481314, + "grad_norm": 0.4539027214050293, + "learning_rate": 8.160330887348825e-06, + "loss": 5.0457, + "step": 21670 + }, + { + "epoch": 1.4726865063187933, + "grad_norm": 0.4370480477809906, + "learning_rate": 8.159906237260497e-06, + "loss": 4.9233, + "step": 21675 + }, + { + "epoch": 1.4730262263894551, + "grad_norm": 0.37144920229911804, + "learning_rate": 8.159481587172172e-06, + "loss": 4.8351, + "step": 21680 + }, + { + "epoch": 1.4733659464601168, + "grad_norm": 0.47682178020477295, + "learning_rate": 8.159056937083843e-06, + "loss": 4.7659, + "step": 21685 + }, + { + "epoch": 1.4737056665307786, + "grad_norm": 0.41532883048057556, + "learning_rate": 8.158632286995516e-06, + "loss": 5.0056, + "step": 21690 + }, + { + "epoch": 1.4740453866014405, + "grad_norm": 0.45928066968917847, + "learning_rate": 8.158207636907189e-06, + "loss": 5.0155, + "step": 21695 + }, + { + "epoch": 1.474385106672102, + "grad_norm": 0.4183253347873688, + "learning_rate": 8.157782986818861e-06, + "loss": 4.5792, + "step": 21700 + }, + { + "epoch": 1.474724826742764, + "grad_norm": 0.42961159348487854, + "learning_rate": 8.157358336730534e-06, + "loss": 4.8873, + "step": 21705 + }, + { + "epoch": 1.4750645468134258, + "grad_norm": 0.3727744519710541, + "learning_rate": 8.156933686642207e-06, + "loss": 4.6744, + "step": 21710 + }, + { + "epoch": 1.4754042668840874, + "grad_norm": 0.36460769176483154, + "learning_rate": 8.15650903655388e-06, + "loss": 4.8854, + "step": 21715 + }, + { + "epoch": 1.4757439869547493, + "grad_norm": 0.48191699385643005, + "learning_rate": 8.156084386465553e-06, + "loss": 4.7775, + "step": 21720 + }, + { + "epoch": 1.4760837070254111, + "grad_norm": 0.5201433300971985, + "learning_rate": 8.155659736377225e-06, + "loss": 4.9619, + "step": 21725 + }, + { + "epoch": 1.4764234270960728, + "grad_norm": 0.46930599212646484, + "learning_rate": 8.155235086288898e-06, + "loss": 4.504, + "step": 21730 + }, + { + "epoch": 1.4767631471667346, + "grad_norm": 0.47125986218452454, + "learning_rate": 8.154810436200571e-06, + "loss": 4.9178, + "step": 21735 + }, + { + "epoch": 1.4771028672373965, + "grad_norm": 0.4578631818294525, + "learning_rate": 8.154385786112244e-06, + "loss": 4.7404, + "step": 21740 + }, + { + "epoch": 1.477442587308058, + "grad_norm": 0.5273672342300415, + "learning_rate": 8.153961136023917e-06, + "loss": 4.5292, + "step": 21745 + }, + { + "epoch": 1.47778230737872, + "grad_norm": 0.4174686074256897, + "learning_rate": 8.15353648593559e-06, + "loss": 4.5682, + "step": 21750 + }, + { + "epoch": 1.4781220274493818, + "grad_norm": 0.37711167335510254, + "learning_rate": 8.153111835847262e-06, + "loss": 4.7488, + "step": 21755 + }, + { + "epoch": 1.4784617475200434, + "grad_norm": 0.38611605763435364, + "learning_rate": 8.152687185758935e-06, + "loss": 4.8563, + "step": 21760 + }, + { + "epoch": 1.4788014675907053, + "grad_norm": 0.5123317837715149, + "learning_rate": 8.152262535670608e-06, + "loss": 4.9244, + "step": 21765 + }, + { + "epoch": 1.4791411876613672, + "grad_norm": 0.4926532506942749, + "learning_rate": 8.15183788558228e-06, + "loss": 4.8443, + "step": 21770 + }, + { + "epoch": 1.4794809077320288, + "grad_norm": 0.5066883563995361, + "learning_rate": 8.151413235493953e-06, + "loss": 4.8082, + "step": 21775 + }, + { + "epoch": 1.4798206278026906, + "grad_norm": 0.4219716489315033, + "learning_rate": 8.150988585405626e-06, + "loss": 4.6005, + "step": 21780 + }, + { + "epoch": 1.4801603478733525, + "grad_norm": 0.37337934970855713, + "learning_rate": 8.150563935317299e-06, + "loss": 5.0865, + "step": 21785 + }, + { + "epoch": 1.4805000679440141, + "grad_norm": 0.6414660215377808, + "learning_rate": 8.150139285228972e-06, + "loss": 4.9076, + "step": 21790 + }, + { + "epoch": 1.480839788014676, + "grad_norm": 0.41409677267074585, + "learning_rate": 8.149714635140645e-06, + "loss": 4.8175, + "step": 21795 + }, + { + "epoch": 1.4811795080853376, + "grad_norm": 0.6038466095924377, + "learning_rate": 8.149289985052317e-06, + "loss": 4.9649, + "step": 21800 + }, + { + "epoch": 1.4815192281559995, + "grad_norm": 0.3465467095375061, + "learning_rate": 8.14886533496399e-06, + "loss": 4.9295, + "step": 21805 + }, + { + "epoch": 1.4818589482266613, + "grad_norm": 0.38087478280067444, + "learning_rate": 8.148440684875663e-06, + "loss": 4.7529, + "step": 21810 + }, + { + "epoch": 1.482198668297323, + "grad_norm": 0.534708559513092, + "learning_rate": 8.148016034787336e-06, + "loss": 4.8796, + "step": 21815 + }, + { + "epoch": 1.4825383883679848, + "grad_norm": 0.38533756136894226, + "learning_rate": 8.147591384699009e-06, + "loss": 4.66, + "step": 21820 + }, + { + "epoch": 1.4828781084386464, + "grad_norm": 0.41961488127708435, + "learning_rate": 8.147166734610681e-06, + "loss": 5.2806, + "step": 21825 + }, + { + "epoch": 1.4832178285093083, + "grad_norm": 0.4260583817958832, + "learning_rate": 8.146742084522354e-06, + "loss": 4.7352, + "step": 21830 + }, + { + "epoch": 1.4835575485799701, + "grad_norm": 0.5423315167427063, + "learning_rate": 8.146317434434027e-06, + "loss": 4.9267, + "step": 21835 + }, + { + "epoch": 1.4838972686506318, + "grad_norm": 0.41430774331092834, + "learning_rate": 8.1458927843457e-06, + "loss": 4.8323, + "step": 21840 + }, + { + "epoch": 1.4842369887212936, + "grad_norm": 0.39771750569343567, + "learning_rate": 8.145468134257373e-06, + "loss": 4.8527, + "step": 21845 + }, + { + "epoch": 1.4845767087919555, + "grad_norm": 0.5684492588043213, + "learning_rate": 8.145043484169045e-06, + "loss": 5.0699, + "step": 21850 + }, + { + "epoch": 1.484916428862617, + "grad_norm": 0.6332288980484009, + "learning_rate": 8.144618834080718e-06, + "loss": 5.0289, + "step": 21855 + }, + { + "epoch": 1.485256148933279, + "grad_norm": 0.5387458801269531, + "learning_rate": 8.144194183992391e-06, + "loss": 4.8846, + "step": 21860 + }, + { + "epoch": 1.4855958690039408, + "grad_norm": 0.3605036735534668, + "learning_rate": 8.143769533904064e-06, + "loss": 4.853, + "step": 21865 + }, + { + "epoch": 1.4859355890746024, + "grad_norm": 0.5090234875679016, + "learning_rate": 8.143344883815737e-06, + "loss": 4.6622, + "step": 21870 + }, + { + "epoch": 1.4862753091452643, + "grad_norm": 0.40762263536453247, + "learning_rate": 8.14292023372741e-06, + "loss": 4.7985, + "step": 21875 + }, + { + "epoch": 1.4866150292159261, + "grad_norm": 0.5529220104217529, + "learning_rate": 8.142495583639082e-06, + "loss": 4.8327, + "step": 21880 + }, + { + "epoch": 1.4869547492865878, + "grad_norm": 0.467644065618515, + "learning_rate": 8.142070933550755e-06, + "loss": 4.4639, + "step": 21885 + }, + { + "epoch": 1.4872944693572496, + "grad_norm": 0.5029266476631165, + "learning_rate": 8.141646283462428e-06, + "loss": 4.7662, + "step": 21890 + }, + { + "epoch": 1.4876341894279115, + "grad_norm": 0.49704304337501526, + "learning_rate": 8.1412216333741e-06, + "loss": 4.824, + "step": 21895 + }, + { + "epoch": 1.487973909498573, + "grad_norm": 0.48360589146614075, + "learning_rate": 8.140796983285773e-06, + "loss": 4.7685, + "step": 21900 + }, + { + "epoch": 1.488313629569235, + "grad_norm": 0.570248007774353, + "learning_rate": 8.140372333197446e-06, + "loss": 4.8547, + "step": 21905 + }, + { + "epoch": 1.4886533496398968, + "grad_norm": 0.43371495604515076, + "learning_rate": 8.139947683109119e-06, + "loss": 4.4298, + "step": 21910 + }, + { + "epoch": 1.4889930697105584, + "grad_norm": 0.5165069103240967, + "learning_rate": 8.139523033020792e-06, + "loss": 4.7131, + "step": 21915 + }, + { + "epoch": 1.4893327897812203, + "grad_norm": 0.5287138223648071, + "learning_rate": 8.139098382932465e-06, + "loss": 4.776, + "step": 21920 + }, + { + "epoch": 1.4896725098518822, + "grad_norm": 0.5360073447227478, + "learning_rate": 8.138673732844137e-06, + "loss": 4.7305, + "step": 21925 + }, + { + "epoch": 1.4900122299225438, + "grad_norm": 0.3874132037162781, + "learning_rate": 8.13824908275581e-06, + "loss": 4.9469, + "step": 21930 + }, + { + "epoch": 1.4903519499932056, + "grad_norm": 0.3305451273918152, + "learning_rate": 8.137824432667483e-06, + "loss": 4.6753, + "step": 21935 + }, + { + "epoch": 1.4906916700638675, + "grad_norm": 0.5658344030380249, + "learning_rate": 8.137399782579156e-06, + "loss": 5.0346, + "step": 21940 + }, + { + "epoch": 1.4910313901345291, + "grad_norm": 0.46120312809944153, + "learning_rate": 8.136975132490829e-06, + "loss": 5.0207, + "step": 21945 + }, + { + "epoch": 1.491371110205191, + "grad_norm": 0.5367444157600403, + "learning_rate": 8.136550482402501e-06, + "loss": 4.9829, + "step": 21950 + }, + { + "epoch": 1.4917108302758528, + "grad_norm": 0.38880231976509094, + "learning_rate": 8.136125832314173e-06, + "loss": 4.781, + "step": 21955 + }, + { + "epoch": 1.4920505503465145, + "grad_norm": 0.4172866940498352, + "learning_rate": 8.135701182225847e-06, + "loss": 4.7913, + "step": 21960 + }, + { + "epoch": 1.4923902704171763, + "grad_norm": 0.46848565340042114, + "learning_rate": 8.13527653213752e-06, + "loss": 4.676, + "step": 21965 + }, + { + "epoch": 1.492729990487838, + "grad_norm": 0.48508700728416443, + "learning_rate": 8.134851882049191e-06, + "loss": 4.5992, + "step": 21970 + }, + { + "epoch": 1.4930697105584998, + "grad_norm": 0.46892085671424866, + "learning_rate": 8.134427231960865e-06, + "loss": 4.8927, + "step": 21975 + }, + { + "epoch": 1.4934094306291616, + "grad_norm": 0.40538859367370605, + "learning_rate": 8.134002581872538e-06, + "loss": 5.0039, + "step": 21980 + }, + { + "epoch": 1.4937491506998233, + "grad_norm": 0.5471259355545044, + "learning_rate": 8.13357793178421e-06, + "loss": 4.7137, + "step": 21985 + }, + { + "epoch": 1.4940888707704851, + "grad_norm": 0.35296469926834106, + "learning_rate": 8.133153281695884e-06, + "loss": 4.6521, + "step": 21990 + }, + { + "epoch": 1.4944285908411468, + "grad_norm": 0.46806684136390686, + "learning_rate": 8.132728631607557e-06, + "loss": 4.8795, + "step": 21995 + }, + { + "epoch": 1.4947683109118086, + "grad_norm": 0.39197206497192383, + "learning_rate": 8.132303981519228e-06, + "loss": 4.9386, + "step": 22000 + }, + { + "epoch": 1.4951080309824705, + "grad_norm": 0.4819774329662323, + "learning_rate": 8.131879331430902e-06, + "loss": 4.9319, + "step": 22005 + }, + { + "epoch": 1.495447751053132, + "grad_norm": 0.4597817361354828, + "learning_rate": 8.131454681342575e-06, + "loss": 4.8416, + "step": 22010 + }, + { + "epoch": 1.495787471123794, + "grad_norm": 0.5055804252624512, + "learning_rate": 8.131030031254246e-06, + "loss": 4.9764, + "step": 22015 + }, + { + "epoch": 1.4961271911944558, + "grad_norm": 0.4042135179042816, + "learning_rate": 8.13060538116592e-06, + "loss": 4.7509, + "step": 22020 + }, + { + "epoch": 1.4964669112651174, + "grad_norm": 0.46461570262908936, + "learning_rate": 8.130180731077593e-06, + "loss": 4.7162, + "step": 22025 + }, + { + "epoch": 1.4968066313357793, + "grad_norm": 0.42105457186698914, + "learning_rate": 8.129756080989265e-06, + "loss": 4.9232, + "step": 22030 + }, + { + "epoch": 1.4971463514064411, + "grad_norm": 0.47409549355506897, + "learning_rate": 8.129331430900939e-06, + "loss": 4.8361, + "step": 22035 + }, + { + "epoch": 1.4974860714771028, + "grad_norm": 0.4138262867927551, + "learning_rate": 8.12890678081261e-06, + "loss": 4.8411, + "step": 22040 + }, + { + "epoch": 1.4978257915477646, + "grad_norm": 0.4185262620449066, + "learning_rate": 8.128482130724283e-06, + "loss": 4.5323, + "step": 22045 + }, + { + "epoch": 1.4981655116184265, + "grad_norm": 0.5419734120368958, + "learning_rate": 8.128057480635957e-06, + "loss": 4.8785, + "step": 22050 + }, + { + "epoch": 1.4985052316890881, + "grad_norm": 0.4630097448825836, + "learning_rate": 8.127632830547629e-06, + "loss": 4.6354, + "step": 22055 + }, + { + "epoch": 1.49884495175975, + "grad_norm": 0.4400067627429962, + "learning_rate": 8.127208180459301e-06, + "loss": 4.7719, + "step": 22060 + }, + { + "epoch": 1.4991846718304118, + "grad_norm": 0.4132564663887024, + "learning_rate": 8.126783530370976e-06, + "loss": 4.938, + "step": 22065 + }, + { + "epoch": 1.4995243919010735, + "grad_norm": 0.49349409341812134, + "learning_rate": 8.126358880282647e-06, + "loss": 4.6128, + "step": 22070 + }, + { + "epoch": 1.4998641119717353, + "grad_norm": 0.4557749629020691, + "learning_rate": 8.12593423019432e-06, + "loss": 4.8962, + "step": 22075 + }, + { + "epoch": 1.5002038320423972, + "grad_norm": 0.6452621817588806, + "learning_rate": 8.125509580105994e-06, + "loss": 4.831, + "step": 22080 + }, + { + "epoch": 1.5005435521130588, + "grad_norm": 0.37887027859687805, + "learning_rate": 8.125084930017665e-06, + "loss": 4.6486, + "step": 22085 + }, + { + "epoch": 1.5008832721837206, + "grad_norm": 0.5738966464996338, + "learning_rate": 8.124660279929338e-06, + "loss": 4.5826, + "step": 22090 + }, + { + "epoch": 1.5012229922543825, + "grad_norm": 0.5395826697349548, + "learning_rate": 8.124235629841013e-06, + "loss": 4.7616, + "step": 22095 + }, + { + "epoch": 1.5015627123250441, + "grad_norm": 0.44238990545272827, + "learning_rate": 8.123810979752684e-06, + "loss": 4.6995, + "step": 22100 + }, + { + "epoch": 1.501902432395706, + "grad_norm": 0.5688230395317078, + "learning_rate": 8.123386329664357e-06, + "loss": 4.8856, + "step": 22105 + }, + { + "epoch": 1.5022421524663678, + "grad_norm": 0.4032575488090515, + "learning_rate": 8.12296167957603e-06, + "loss": 4.7873, + "step": 22110 + }, + { + "epoch": 1.5025818725370295, + "grad_norm": 0.48039406538009644, + "learning_rate": 8.122537029487702e-06, + "loss": 4.9741, + "step": 22115 + }, + { + "epoch": 1.5029215926076913, + "grad_norm": 0.5019194483757019, + "learning_rate": 8.122112379399375e-06, + "loss": 4.6616, + "step": 22120 + }, + { + "epoch": 1.5032613126783532, + "grad_norm": 0.3938286304473877, + "learning_rate": 8.121687729311048e-06, + "loss": 4.6393, + "step": 22125 + }, + { + "epoch": 1.5036010327490148, + "grad_norm": 0.4559648633003235, + "learning_rate": 8.12126307922272e-06, + "loss": 4.8626, + "step": 22130 + }, + { + "epoch": 1.5039407528196764, + "grad_norm": 0.4659184515476227, + "learning_rate": 8.120838429134395e-06, + "loss": 4.7962, + "step": 22135 + }, + { + "epoch": 1.5042804728903385, + "grad_norm": 0.6193770170211792, + "learning_rate": 8.120413779046066e-06, + "loss": 4.8084, + "step": 22140 + }, + { + "epoch": 1.5046201929610001, + "grad_norm": 0.47316503524780273, + "learning_rate": 8.119989128957739e-06, + "loss": 4.9467, + "step": 22145 + }, + { + "epoch": 1.5049599130316618, + "grad_norm": 0.5967952013015747, + "learning_rate": 8.119564478869414e-06, + "loss": 4.8587, + "step": 22150 + }, + { + "epoch": 1.5052996331023238, + "grad_norm": 0.5076853632926941, + "learning_rate": 8.119139828781085e-06, + "loss": 4.8084, + "step": 22155 + }, + { + "epoch": 1.5056393531729855, + "grad_norm": 0.5532291531562805, + "learning_rate": 8.118715178692757e-06, + "loss": 4.9378, + "step": 22160 + }, + { + "epoch": 1.505979073243647, + "grad_norm": 0.48410868644714355, + "learning_rate": 8.118290528604432e-06, + "loss": 4.7991, + "step": 22165 + }, + { + "epoch": 1.506318793314309, + "grad_norm": 0.4870273470878601, + "learning_rate": 8.117865878516103e-06, + "loss": 4.6652, + "step": 22170 + }, + { + "epoch": 1.5066585133849708, + "grad_norm": 0.49703648686408997, + "learning_rate": 8.117441228427776e-06, + "loss": 4.7538, + "step": 22175 + }, + { + "epoch": 1.5069982334556324, + "grad_norm": 0.43527185916900635, + "learning_rate": 8.117016578339449e-06, + "loss": 4.8573, + "step": 22180 + }, + { + "epoch": 1.5073379535262943, + "grad_norm": 0.48510709404945374, + "learning_rate": 8.116591928251121e-06, + "loss": 5.1253, + "step": 22185 + }, + { + "epoch": 1.5076776735969561, + "grad_norm": 0.5316776633262634, + "learning_rate": 8.116167278162794e-06, + "loss": 4.5383, + "step": 22190 + }, + { + "epoch": 1.5080173936676178, + "grad_norm": 0.3864249289035797, + "learning_rate": 8.115742628074467e-06, + "loss": 4.6817, + "step": 22195 + }, + { + "epoch": 1.5083571137382796, + "grad_norm": 0.41060304641723633, + "learning_rate": 8.11531797798614e-06, + "loss": 4.8717, + "step": 22200 + }, + { + "epoch": 1.5086968338089415, + "grad_norm": 0.5716449022293091, + "learning_rate": 8.114893327897813e-06, + "loss": 5.0434, + "step": 22205 + }, + { + "epoch": 1.5090365538796031, + "grad_norm": 0.53287672996521, + "learning_rate": 8.114468677809485e-06, + "loss": 4.6913, + "step": 22210 + }, + { + "epoch": 1.509376273950265, + "grad_norm": 0.37652134895324707, + "learning_rate": 8.114044027721158e-06, + "loss": 4.8752, + "step": 22215 + }, + { + "epoch": 1.5097159940209268, + "grad_norm": 0.4300931692123413, + "learning_rate": 8.113619377632831e-06, + "loss": 4.6796, + "step": 22220 + }, + { + "epoch": 1.5100557140915885, + "grad_norm": 0.40822792053222656, + "learning_rate": 8.113194727544504e-06, + "loss": 4.6557, + "step": 22225 + }, + { + "epoch": 1.5103954341622503, + "grad_norm": 0.555594801902771, + "learning_rate": 8.112770077456177e-06, + "loss": 4.7206, + "step": 22230 + }, + { + "epoch": 1.5107351542329122, + "grad_norm": 0.4241052269935608, + "learning_rate": 8.11234542736785e-06, + "loss": 4.7204, + "step": 22235 + }, + { + "epoch": 1.5110748743035738, + "grad_norm": 0.5288113951683044, + "learning_rate": 8.111920777279522e-06, + "loss": 4.7428, + "step": 22240 + }, + { + "epoch": 1.5114145943742356, + "grad_norm": 0.3947908282279968, + "learning_rate": 8.111496127191195e-06, + "loss": 4.6041, + "step": 22245 + }, + { + "epoch": 1.5117543144448975, + "grad_norm": 0.3872932493686676, + "learning_rate": 8.111071477102868e-06, + "loss": 4.7215, + "step": 22250 + }, + { + "epoch": 1.5120940345155591, + "grad_norm": 0.4985330402851105, + "learning_rate": 8.11064682701454e-06, + "loss": 4.7461, + "step": 22255 + }, + { + "epoch": 1.512433754586221, + "grad_norm": 0.4413547217845917, + "learning_rate": 8.110222176926213e-06, + "loss": 4.5498, + "step": 22260 + }, + { + "epoch": 1.5127734746568828, + "grad_norm": 0.3742732107639313, + "learning_rate": 8.109797526837886e-06, + "loss": 4.7597, + "step": 22265 + }, + { + "epoch": 1.5131131947275445, + "grad_norm": 0.41613319516181946, + "learning_rate": 8.109372876749559e-06, + "loss": 4.8066, + "step": 22270 + }, + { + "epoch": 1.5134529147982063, + "grad_norm": 0.4300568699836731, + "learning_rate": 8.108948226661232e-06, + "loss": 4.6437, + "step": 22275 + }, + { + "epoch": 1.5137926348688682, + "grad_norm": 0.6900607943534851, + "learning_rate": 8.108523576572905e-06, + "loss": 4.87, + "step": 22280 + }, + { + "epoch": 1.5141323549395298, + "grad_norm": 0.4049525558948517, + "learning_rate": 8.108098926484577e-06, + "loss": 4.7566, + "step": 22285 + }, + { + "epoch": 1.5144720750101917, + "grad_norm": 0.420379638671875, + "learning_rate": 8.10767427639625e-06, + "loss": 4.8983, + "step": 22290 + }, + { + "epoch": 1.5148117950808535, + "grad_norm": 0.5765732526779175, + "learning_rate": 8.107249626307923e-06, + "loss": 4.9207, + "step": 22295 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.3997924029827118, + "learning_rate": 8.106824976219596e-06, + "loss": 4.8456, + "step": 22300 + }, + { + "epoch": 1.5154912352221768, + "grad_norm": 0.5158077478408813, + "learning_rate": 8.106400326131269e-06, + "loss": 4.799, + "step": 22305 + }, + { + "epoch": 1.5158309552928388, + "grad_norm": 0.4161911606788635, + "learning_rate": 8.105975676042941e-06, + "loss": 4.6089, + "step": 22310 + }, + { + "epoch": 1.5161706753635005, + "grad_norm": 0.4515521824359894, + "learning_rate": 8.105551025954614e-06, + "loss": 4.7343, + "step": 22315 + }, + { + "epoch": 1.516510395434162, + "grad_norm": 0.448284775018692, + "learning_rate": 8.105126375866287e-06, + "loss": 4.804, + "step": 22320 + }, + { + "epoch": 1.5168501155048242, + "grad_norm": 0.5468096137046814, + "learning_rate": 8.10470172577796e-06, + "loss": 4.7663, + "step": 22325 + }, + { + "epoch": 1.5171898355754858, + "grad_norm": 0.537230372428894, + "learning_rate": 8.104277075689633e-06, + "loss": 4.849, + "step": 22330 + }, + { + "epoch": 1.5175295556461474, + "grad_norm": 0.32716745138168335, + "learning_rate": 8.103852425601305e-06, + "loss": 4.6938, + "step": 22335 + }, + { + "epoch": 1.5178692757168093, + "grad_norm": 0.47437286376953125, + "learning_rate": 8.103427775512978e-06, + "loss": 4.6112, + "step": 22340 + }, + { + "epoch": 1.5182089957874711, + "grad_norm": 0.3858714699745178, + "learning_rate": 8.103003125424651e-06, + "loss": 4.5487, + "step": 22345 + }, + { + "epoch": 1.5185487158581328, + "grad_norm": 0.35317209362983704, + "learning_rate": 8.102578475336324e-06, + "loss": 4.6675, + "step": 22350 + }, + { + "epoch": 1.5188884359287946, + "grad_norm": 0.49008703231811523, + "learning_rate": 8.102153825247997e-06, + "loss": 4.528, + "step": 22355 + }, + { + "epoch": 1.5192281559994565, + "grad_norm": 0.3545466661453247, + "learning_rate": 8.10172917515967e-06, + "loss": 4.8112, + "step": 22360 + }, + { + "epoch": 1.5195678760701181, + "grad_norm": 0.5384146571159363, + "learning_rate": 8.101304525071342e-06, + "loss": 4.6902, + "step": 22365 + }, + { + "epoch": 1.51990759614078, + "grad_norm": 0.43355128169059753, + "learning_rate": 8.100879874983013e-06, + "loss": 4.7532, + "step": 22370 + }, + { + "epoch": 1.5202473162114418, + "grad_norm": 0.3330450654029846, + "learning_rate": 8.100455224894688e-06, + "loss": 4.7962, + "step": 22375 + }, + { + "epoch": 1.5205870362821035, + "grad_norm": 0.38749629259109497, + "learning_rate": 8.10003057480636e-06, + "loss": 4.9456, + "step": 22380 + }, + { + "epoch": 1.5209267563527653, + "grad_norm": 0.4364577829837799, + "learning_rate": 8.099605924718032e-06, + "loss": 4.8758, + "step": 22385 + }, + { + "epoch": 1.5212664764234272, + "grad_norm": 0.505120038986206, + "learning_rate": 8.099181274629706e-06, + "loss": 4.9309, + "step": 22390 + }, + { + "epoch": 1.5216061964940888, + "grad_norm": 0.467910498380661, + "learning_rate": 8.098756624541379e-06, + "loss": 4.8919, + "step": 22395 + }, + { + "epoch": 1.5219459165647506, + "grad_norm": 0.449407696723938, + "learning_rate": 8.09833197445305e-06, + "loss": 4.781, + "step": 22400 + }, + { + "epoch": 1.5222856366354125, + "grad_norm": 0.5802556276321411, + "learning_rate": 8.097907324364725e-06, + "loss": 4.9493, + "step": 22405 + }, + { + "epoch": 1.5226253567060741, + "grad_norm": 0.3605288863182068, + "learning_rate": 8.097482674276397e-06, + "loss": 5.0827, + "step": 22410 + }, + { + "epoch": 1.522965076776736, + "grad_norm": 0.4063089191913605, + "learning_rate": 8.097058024188069e-06, + "loss": 4.9031, + "step": 22415 + }, + { + "epoch": 1.5233047968473978, + "grad_norm": 0.5157103538513184, + "learning_rate": 8.096633374099743e-06, + "loss": 4.8123, + "step": 22420 + }, + { + "epoch": 1.5236445169180595, + "grad_norm": 0.3788120150566101, + "learning_rate": 8.096208724011416e-06, + "loss": 4.8287, + "step": 22425 + }, + { + "epoch": 1.5239842369887213, + "grad_norm": 0.4689258635044098, + "learning_rate": 8.095784073923087e-06, + "loss": 4.7691, + "step": 22430 + }, + { + "epoch": 1.5243239570593832, + "grad_norm": 0.5315669178962708, + "learning_rate": 8.095359423834761e-06, + "loss": 4.737, + "step": 22435 + }, + { + "epoch": 1.5246636771300448, + "grad_norm": 0.4455491900444031, + "learning_rate": 8.094934773746434e-06, + "loss": 5.0012, + "step": 22440 + }, + { + "epoch": 1.5250033972007067, + "grad_norm": 0.4747687876224518, + "learning_rate": 8.094510123658105e-06, + "loss": 4.989, + "step": 22445 + }, + { + "epoch": 1.5253431172713685, + "grad_norm": 0.7094155550003052, + "learning_rate": 8.09408547356978e-06, + "loss": 4.728, + "step": 22450 + }, + { + "epoch": 1.5256828373420301, + "grad_norm": 0.5222007632255554, + "learning_rate": 8.093660823481451e-06, + "loss": 4.5938, + "step": 22455 + }, + { + "epoch": 1.526022557412692, + "grad_norm": 0.6843433976173401, + "learning_rate": 8.093236173393124e-06, + "loss": 4.8384, + "step": 22460 + }, + { + "epoch": 1.5263622774833538, + "grad_norm": 0.3932431936264038, + "learning_rate": 8.092811523304798e-06, + "loss": 4.754, + "step": 22465 + }, + { + "epoch": 1.5267019975540155, + "grad_norm": 0.3719608783721924, + "learning_rate": 8.09238687321647e-06, + "loss": 4.7909, + "step": 22470 + }, + { + "epoch": 1.527041717624677, + "grad_norm": 0.468204140663147, + "learning_rate": 8.091962223128144e-06, + "loss": 4.971, + "step": 22475 + }, + { + "epoch": 1.5273814376953392, + "grad_norm": 0.5787028670310974, + "learning_rate": 8.091537573039817e-06, + "loss": 4.7447, + "step": 22480 + }, + { + "epoch": 1.5277211577660008, + "grad_norm": 0.3915419578552246, + "learning_rate": 8.091112922951488e-06, + "loss": 4.8306, + "step": 22485 + }, + { + "epoch": 1.5280608778366624, + "grad_norm": 0.4089128077030182, + "learning_rate": 8.090688272863162e-06, + "loss": 4.7515, + "step": 22490 + }, + { + "epoch": 1.5284005979073245, + "grad_norm": 0.5562814474105835, + "learning_rate": 8.090263622774835e-06, + "loss": 4.8697, + "step": 22495 + }, + { + "epoch": 1.5287403179779862, + "grad_norm": 0.5353394150733948, + "learning_rate": 8.089838972686506e-06, + "loss": 4.6391, + "step": 22500 + }, + { + "epoch": 1.5290800380486478, + "grad_norm": 0.5242924094200134, + "learning_rate": 8.08941432259818e-06, + "loss": 4.6685, + "step": 22505 + }, + { + "epoch": 1.5294197581193096, + "grad_norm": 0.4754197597503662, + "learning_rate": 8.088989672509853e-06, + "loss": 4.8199, + "step": 22510 + }, + { + "epoch": 1.5297594781899715, + "grad_norm": 0.5232486128807068, + "learning_rate": 8.088565022421525e-06, + "loss": 4.6507, + "step": 22515 + }, + { + "epoch": 1.5300991982606331, + "grad_norm": 0.6458593606948853, + "learning_rate": 8.088140372333199e-06, + "loss": 4.512, + "step": 22520 + }, + { + "epoch": 1.530438918331295, + "grad_norm": 0.43357810378074646, + "learning_rate": 8.08771572224487e-06, + "loss": 4.8293, + "step": 22525 + }, + { + "epoch": 1.5307786384019568, + "grad_norm": 0.4033759534358978, + "learning_rate": 8.087291072156543e-06, + "loss": 4.5878, + "step": 22530 + }, + { + "epoch": 1.5311183584726185, + "grad_norm": 0.39923757314682007, + "learning_rate": 8.086866422068217e-06, + "loss": 4.9508, + "step": 22535 + }, + { + "epoch": 1.5314580785432803, + "grad_norm": 0.6054941415786743, + "learning_rate": 8.086441771979889e-06, + "loss": 4.8952, + "step": 22540 + }, + { + "epoch": 1.5317977986139422, + "grad_norm": 0.3795141577720642, + "learning_rate": 8.086017121891561e-06, + "loss": 4.8294, + "step": 22545 + }, + { + "epoch": 1.5321375186846038, + "grad_norm": 0.4406128227710724, + "learning_rate": 8.085592471803236e-06, + "loss": 4.7927, + "step": 22550 + }, + { + "epoch": 1.5324772387552656, + "grad_norm": 0.5024167895317078, + "learning_rate": 8.085167821714907e-06, + "loss": 4.6503, + "step": 22555 + }, + { + "epoch": 1.5328169588259275, + "grad_norm": 0.38925445079803467, + "learning_rate": 8.08474317162658e-06, + "loss": 4.7528, + "step": 22560 + }, + { + "epoch": 1.5331566788965891, + "grad_norm": 0.5490087270736694, + "learning_rate": 8.084318521538254e-06, + "loss": 4.7559, + "step": 22565 + }, + { + "epoch": 1.533496398967251, + "grad_norm": 0.5613436698913574, + "learning_rate": 8.083893871449925e-06, + "loss": 4.7524, + "step": 22570 + }, + { + "epoch": 1.5338361190379128, + "grad_norm": 0.46507328748703003, + "learning_rate": 8.083469221361598e-06, + "loss": 4.7437, + "step": 22575 + }, + { + "epoch": 1.5341758391085745, + "grad_norm": 0.5941947102546692, + "learning_rate": 8.083044571273273e-06, + "loss": 5.0099, + "step": 22580 + }, + { + "epoch": 1.5345155591792363, + "grad_norm": 0.5611535906791687, + "learning_rate": 8.082619921184944e-06, + "loss": 4.9427, + "step": 22585 + }, + { + "epoch": 1.5348552792498982, + "grad_norm": 0.4050995707511902, + "learning_rate": 8.082195271096617e-06, + "loss": 5.0185, + "step": 22590 + }, + { + "epoch": 1.5351949993205598, + "grad_norm": 0.420233815908432, + "learning_rate": 8.081770621008291e-06, + "loss": 4.9934, + "step": 22595 + }, + { + "epoch": 1.5355347193912217, + "grad_norm": 0.39888057112693787, + "learning_rate": 8.081345970919962e-06, + "loss": 4.6899, + "step": 22600 + }, + { + "epoch": 1.5358744394618835, + "grad_norm": 0.47107160091400146, + "learning_rate": 8.080921320831635e-06, + "loss": 4.77, + "step": 22605 + }, + { + "epoch": 1.5362141595325451, + "grad_norm": 0.5156400203704834, + "learning_rate": 8.080496670743308e-06, + "loss": 4.715, + "step": 22610 + }, + { + "epoch": 1.536553879603207, + "grad_norm": 0.3361877501010895, + "learning_rate": 8.08007202065498e-06, + "loss": 4.9486, + "step": 22615 + }, + { + "epoch": 1.5368935996738688, + "grad_norm": 0.41605472564697266, + "learning_rate": 8.079647370566653e-06, + "loss": 4.6365, + "step": 22620 + }, + { + "epoch": 1.5372333197445305, + "grad_norm": 0.4190599024295807, + "learning_rate": 8.079222720478326e-06, + "loss": 4.8073, + "step": 22625 + }, + { + "epoch": 1.5375730398151923, + "grad_norm": 0.44330811500549316, + "learning_rate": 8.078798070389999e-06, + "loss": 4.7948, + "step": 22630 + }, + { + "epoch": 1.5379127598858542, + "grad_norm": 0.437061607837677, + "learning_rate": 8.078373420301672e-06, + "loss": 4.8489, + "step": 22635 + }, + { + "epoch": 1.5382524799565158, + "grad_norm": 0.39421015977859497, + "learning_rate": 8.077948770213345e-06, + "loss": 4.6496, + "step": 22640 + }, + { + "epoch": 1.5385922000271774, + "grad_norm": 0.3768816888332367, + "learning_rate": 8.077524120125017e-06, + "loss": 4.8176, + "step": 22645 + }, + { + "epoch": 1.5389319200978395, + "grad_norm": 0.39260393381118774, + "learning_rate": 8.07709947003669e-06, + "loss": 4.707, + "step": 22650 + }, + { + "epoch": 1.5392716401685012, + "grad_norm": 0.4360595643520355, + "learning_rate": 8.076674819948363e-06, + "loss": 4.7472, + "step": 22655 + }, + { + "epoch": 1.5396113602391628, + "grad_norm": 0.4166605770587921, + "learning_rate": 8.076250169860036e-06, + "loss": 4.71, + "step": 22660 + }, + { + "epoch": 1.5399510803098249, + "grad_norm": 0.4777477979660034, + "learning_rate": 8.075825519771709e-06, + "loss": 5.1622, + "step": 22665 + }, + { + "epoch": 1.5402908003804865, + "grad_norm": 0.4110572934150696, + "learning_rate": 8.075400869683381e-06, + "loss": 4.7087, + "step": 22670 + }, + { + "epoch": 1.5406305204511481, + "grad_norm": 0.39917421340942383, + "learning_rate": 8.074976219595054e-06, + "loss": 4.8393, + "step": 22675 + }, + { + "epoch": 1.54097024052181, + "grad_norm": 0.5086607933044434, + "learning_rate": 8.074551569506727e-06, + "loss": 4.9436, + "step": 22680 + }, + { + "epoch": 1.5413099605924718, + "grad_norm": 0.4231780767440796, + "learning_rate": 8.0741269194184e-06, + "loss": 4.7764, + "step": 22685 + }, + { + "epoch": 1.5416496806631335, + "grad_norm": 0.41867852210998535, + "learning_rate": 8.073702269330073e-06, + "loss": 4.7116, + "step": 22690 + }, + { + "epoch": 1.5419894007337953, + "grad_norm": 0.44734448194503784, + "learning_rate": 8.073277619241745e-06, + "loss": 4.7333, + "step": 22695 + }, + { + "epoch": 1.5423291208044572, + "grad_norm": 0.48929959535598755, + "learning_rate": 8.072852969153418e-06, + "loss": 4.7062, + "step": 22700 + }, + { + "epoch": 1.5426688408751188, + "grad_norm": 0.3042430579662323, + "learning_rate": 8.072428319065091e-06, + "loss": 4.6952, + "step": 22705 + }, + { + "epoch": 1.5430085609457806, + "grad_norm": 0.6341671347618103, + "learning_rate": 8.072003668976764e-06, + "loss": 4.783, + "step": 22710 + }, + { + "epoch": 1.5433482810164425, + "grad_norm": 0.4722593426704407, + "learning_rate": 8.071579018888437e-06, + "loss": 4.6185, + "step": 22715 + }, + { + "epoch": 1.5436880010871041, + "grad_norm": 0.4976082742214203, + "learning_rate": 8.07115436880011e-06, + "loss": 4.9691, + "step": 22720 + }, + { + "epoch": 1.544027721157766, + "grad_norm": 0.4550344944000244, + "learning_rate": 8.070729718711782e-06, + "loss": 4.7594, + "step": 22725 + }, + { + "epoch": 1.5443674412284278, + "grad_norm": 0.5217934846878052, + "learning_rate": 8.070305068623455e-06, + "loss": 4.8522, + "step": 22730 + }, + { + "epoch": 1.5447071612990895, + "grad_norm": 0.4103492200374603, + "learning_rate": 8.069880418535128e-06, + "loss": 4.4897, + "step": 22735 + }, + { + "epoch": 1.5450468813697513, + "grad_norm": 0.36507952213287354, + "learning_rate": 8.0694557684468e-06, + "loss": 4.886, + "step": 22740 + }, + { + "epoch": 1.5453866014404132, + "grad_norm": 0.3820270001888275, + "learning_rate": 8.069031118358473e-06, + "loss": 4.8403, + "step": 22745 + }, + { + "epoch": 1.5457263215110748, + "grad_norm": 0.46322545409202576, + "learning_rate": 8.068606468270146e-06, + "loss": 5.0212, + "step": 22750 + }, + { + "epoch": 1.5460660415817367, + "grad_norm": 0.42791980504989624, + "learning_rate": 8.068181818181819e-06, + "loss": 4.6648, + "step": 22755 + }, + { + "epoch": 1.5464057616523985, + "grad_norm": 0.4538920819759369, + "learning_rate": 8.067757168093492e-06, + "loss": 4.5095, + "step": 22760 + }, + { + "epoch": 1.5467454817230601, + "grad_norm": 0.44482842087745667, + "learning_rate": 8.067332518005165e-06, + "loss": 4.72, + "step": 22765 + }, + { + "epoch": 1.547085201793722, + "grad_norm": 0.43630656599998474, + "learning_rate": 8.066907867916837e-06, + "loss": 4.6878, + "step": 22770 + }, + { + "epoch": 1.5474249218643839, + "grad_norm": 0.450265109539032, + "learning_rate": 8.06648321782851e-06, + "loss": 4.9696, + "step": 22775 + }, + { + "epoch": 1.5477646419350455, + "grad_norm": 0.6364747285842896, + "learning_rate": 8.066058567740183e-06, + "loss": 4.7275, + "step": 22780 + }, + { + "epoch": 1.5481043620057073, + "grad_norm": 0.48084065318107605, + "learning_rate": 8.065633917651856e-06, + "loss": 4.8857, + "step": 22785 + }, + { + "epoch": 1.5484440820763692, + "grad_norm": 0.5078391432762146, + "learning_rate": 8.065209267563529e-06, + "loss": 4.9045, + "step": 22790 + }, + { + "epoch": 1.5487838021470308, + "grad_norm": 0.4393092393875122, + "learning_rate": 8.064784617475201e-06, + "loss": 4.951, + "step": 22795 + }, + { + "epoch": 1.5491235222176927, + "grad_norm": 0.38985714316368103, + "learning_rate": 8.064359967386873e-06, + "loss": 4.6529, + "step": 22800 + }, + { + "epoch": 1.5494632422883545, + "grad_norm": 0.45083701610565186, + "learning_rate": 8.063935317298547e-06, + "loss": 4.4543, + "step": 22805 + }, + { + "epoch": 1.5498029623590162, + "grad_norm": 0.4786733090877533, + "learning_rate": 8.06351066721022e-06, + "loss": 4.5743, + "step": 22810 + }, + { + "epoch": 1.5501426824296778, + "grad_norm": 0.3865468204021454, + "learning_rate": 8.063086017121893e-06, + "loss": 5.044, + "step": 22815 + }, + { + "epoch": 1.5504824025003399, + "grad_norm": 0.3756445348262787, + "learning_rate": 8.062661367033565e-06, + "loss": 4.5253, + "step": 22820 + }, + { + "epoch": 1.5508221225710015, + "grad_norm": 0.4960957467556, + "learning_rate": 8.062236716945238e-06, + "loss": 4.9045, + "step": 22825 + }, + { + "epoch": 1.5511618426416631, + "grad_norm": 0.4476701021194458, + "learning_rate": 8.061812066856911e-06, + "loss": 4.8972, + "step": 22830 + }, + { + "epoch": 1.5515015627123252, + "grad_norm": 0.44057440757751465, + "learning_rate": 8.061387416768584e-06, + "loss": 4.8809, + "step": 22835 + }, + { + "epoch": 1.5518412827829868, + "grad_norm": 0.49857839941978455, + "learning_rate": 8.060962766680257e-06, + "loss": 4.8018, + "step": 22840 + }, + { + "epoch": 1.5521810028536485, + "grad_norm": 0.380214124917984, + "learning_rate": 8.06053811659193e-06, + "loss": 4.8308, + "step": 22845 + }, + { + "epoch": 1.5525207229243103, + "grad_norm": 0.3966732919216156, + "learning_rate": 8.060113466503602e-06, + "loss": 4.7119, + "step": 22850 + }, + { + "epoch": 1.5528604429949722, + "grad_norm": 0.5714302062988281, + "learning_rate": 8.059688816415275e-06, + "loss": 4.7449, + "step": 22855 + }, + { + "epoch": 1.5532001630656338, + "grad_norm": 0.5053598284721375, + "learning_rate": 8.059264166326948e-06, + "loss": 4.6877, + "step": 22860 + }, + { + "epoch": 1.5535398831362957, + "grad_norm": 0.4063282608985901, + "learning_rate": 8.05883951623862e-06, + "loss": 4.87, + "step": 22865 + }, + { + "epoch": 1.5538796032069575, + "grad_norm": 0.6001986861228943, + "learning_rate": 8.058414866150292e-06, + "loss": 4.8729, + "step": 22870 + }, + { + "epoch": 1.5542193232776191, + "grad_norm": 0.47255921363830566, + "learning_rate": 8.057990216061966e-06, + "loss": 4.5675, + "step": 22875 + }, + { + "epoch": 1.554559043348281, + "grad_norm": 0.3869221806526184, + "learning_rate": 8.057565565973639e-06, + "loss": 4.8112, + "step": 22880 + }, + { + "epoch": 1.5548987634189428, + "grad_norm": 0.4128265082836151, + "learning_rate": 8.057225845902976e-06, + "loss": 4.9384, + "step": 22885 + }, + { + "epoch": 1.5552384834896045, + "grad_norm": 0.6193118095397949, + "learning_rate": 8.056801195814649e-06, + "loss": 5.0056, + "step": 22890 + }, + { + "epoch": 1.5555782035602663, + "grad_norm": 0.4948267340660095, + "learning_rate": 8.056376545726322e-06, + "loss": 4.99, + "step": 22895 + }, + { + "epoch": 1.5559179236309282, + "grad_norm": 0.4180375635623932, + "learning_rate": 8.055951895637995e-06, + "loss": 4.7604, + "step": 22900 + }, + { + "epoch": 1.5562576437015898, + "grad_norm": 0.39513692259788513, + "learning_rate": 8.055527245549667e-06, + "loss": 4.6279, + "step": 22905 + }, + { + "epoch": 1.5565973637722517, + "grad_norm": 0.4640156924724579, + "learning_rate": 8.05510259546134e-06, + "loss": 4.7924, + "step": 22910 + }, + { + "epoch": 1.5569370838429135, + "grad_norm": 0.45335423946380615, + "learning_rate": 8.054677945373013e-06, + "loss": 4.8823, + "step": 22915 + }, + { + "epoch": 1.5572768039135751, + "grad_norm": 0.5222555994987488, + "learning_rate": 8.054253295284686e-06, + "loss": 4.5165, + "step": 22920 + }, + { + "epoch": 1.557616523984237, + "grad_norm": 0.37914812564849854, + "learning_rate": 8.053828645196359e-06, + "loss": 4.9352, + "step": 22925 + }, + { + "epoch": 1.5579562440548989, + "grad_norm": 0.4222639501094818, + "learning_rate": 8.053403995108031e-06, + "loss": 4.7358, + "step": 22930 + }, + { + "epoch": 1.5582959641255605, + "grad_norm": 0.4647897779941559, + "learning_rate": 8.052979345019704e-06, + "loss": 4.6152, + "step": 22935 + }, + { + "epoch": 1.5586356841962223, + "grad_norm": 0.35410580039024353, + "learning_rate": 8.052554694931377e-06, + "loss": 4.9161, + "step": 22940 + }, + { + "epoch": 1.5589754042668842, + "grad_norm": 0.465859055519104, + "learning_rate": 8.05213004484305e-06, + "loss": 4.5732, + "step": 22945 + }, + { + "epoch": 1.5593151243375458, + "grad_norm": 0.38483235239982605, + "learning_rate": 8.051705394754723e-06, + "loss": 4.7912, + "step": 22950 + }, + { + "epoch": 1.5596548444082077, + "grad_norm": 0.5037460923194885, + "learning_rate": 8.051280744666395e-06, + "loss": 4.8736, + "step": 22955 + }, + { + "epoch": 1.5599945644788695, + "grad_norm": 0.41498029232025146, + "learning_rate": 8.050856094578068e-06, + "loss": 4.8057, + "step": 22960 + }, + { + "epoch": 1.5603342845495312, + "grad_norm": 0.5086881518363953, + "learning_rate": 8.050431444489741e-06, + "loss": 4.7605, + "step": 22965 + }, + { + "epoch": 1.560674004620193, + "grad_norm": 0.48956406116485596, + "learning_rate": 8.050006794401414e-06, + "loss": 4.9907, + "step": 22970 + }, + { + "epoch": 1.5610137246908549, + "grad_norm": 0.3378429710865021, + "learning_rate": 8.049582144313087e-06, + "loss": 4.4549, + "step": 22975 + }, + { + "epoch": 1.5613534447615165, + "grad_norm": 0.5633531808853149, + "learning_rate": 8.04915749422476e-06, + "loss": 4.6882, + "step": 22980 + }, + { + "epoch": 1.5616931648321781, + "grad_norm": 0.5664250254631042, + "learning_rate": 8.048732844136432e-06, + "loss": 4.852, + "step": 22985 + }, + { + "epoch": 1.5620328849028402, + "grad_norm": 0.447426974773407, + "learning_rate": 8.048308194048105e-06, + "loss": 5.1215, + "step": 22990 + }, + { + "epoch": 1.5623726049735018, + "grad_norm": 0.34887877106666565, + "learning_rate": 8.047883543959778e-06, + "loss": 5.0086, + "step": 22995 + }, + { + "epoch": 1.5627123250441635, + "grad_norm": 0.4445120394229889, + "learning_rate": 8.04745889387145e-06, + "loss": 4.6476, + "step": 23000 + }, + { + "epoch": 1.5630520451148255, + "grad_norm": 0.4806201457977295, + "learning_rate": 8.047034243783123e-06, + "loss": 4.7478, + "step": 23005 + }, + { + "epoch": 1.5633917651854872, + "grad_norm": 0.41096505522727966, + "learning_rate": 8.046609593694796e-06, + "loss": 4.6433, + "step": 23010 + }, + { + "epoch": 1.5637314852561488, + "grad_norm": 0.44209030270576477, + "learning_rate": 8.046184943606469e-06, + "loss": 5.0169, + "step": 23015 + }, + { + "epoch": 1.5640712053268107, + "grad_norm": 0.6003164052963257, + "learning_rate": 8.045760293518142e-06, + "loss": 4.6824, + "step": 23020 + }, + { + "epoch": 1.5644109253974725, + "grad_norm": 0.47912803292274475, + "learning_rate": 8.045335643429815e-06, + "loss": 4.5942, + "step": 23025 + }, + { + "epoch": 1.5647506454681341, + "grad_norm": 0.6574408411979675, + "learning_rate": 8.044910993341487e-06, + "loss": 4.6498, + "step": 23030 + }, + { + "epoch": 1.565090365538796, + "grad_norm": 0.43133577704429626, + "learning_rate": 8.04448634325316e-06, + "loss": 5.1133, + "step": 23035 + }, + { + "epoch": 1.5654300856094578, + "grad_norm": 0.4761475622653961, + "learning_rate": 8.044061693164833e-06, + "loss": 4.6572, + "step": 23040 + }, + { + "epoch": 1.5657698056801195, + "grad_norm": 0.48242828249931335, + "learning_rate": 8.043637043076506e-06, + "loss": 5.0001, + "step": 23045 + }, + { + "epoch": 1.5661095257507813, + "grad_norm": 0.40040096640586853, + "learning_rate": 8.043212392988179e-06, + "loss": 4.7119, + "step": 23050 + }, + { + "epoch": 1.5664492458214432, + "grad_norm": 0.5699989199638367, + "learning_rate": 8.042787742899851e-06, + "loss": 4.83, + "step": 23055 + }, + { + "epoch": 1.5667889658921048, + "grad_norm": 0.4004640579223633, + "learning_rate": 8.042363092811524e-06, + "loss": 4.9242, + "step": 23060 + }, + { + "epoch": 1.5671286859627667, + "grad_norm": 0.44074463844299316, + "learning_rate": 8.041938442723197e-06, + "loss": 4.948, + "step": 23065 + }, + { + "epoch": 1.5674684060334285, + "grad_norm": 0.38956013321876526, + "learning_rate": 8.04151379263487e-06, + "loss": 4.7565, + "step": 23070 + }, + { + "epoch": 1.5678081261040901, + "grad_norm": 0.47251754999160767, + "learning_rate": 8.041089142546543e-06, + "loss": 4.8662, + "step": 23075 + }, + { + "epoch": 1.568147846174752, + "grad_norm": 0.47871702909469604, + "learning_rate": 8.040664492458215e-06, + "loss": 4.8391, + "step": 23080 + }, + { + "epoch": 1.5684875662454139, + "grad_norm": 0.43384504318237305, + "learning_rate": 8.040239842369888e-06, + "loss": 4.7832, + "step": 23085 + }, + { + "epoch": 1.5688272863160755, + "grad_norm": 0.44529810547828674, + "learning_rate": 8.039815192281561e-06, + "loss": 4.6808, + "step": 23090 + }, + { + "epoch": 1.5691670063867373, + "grad_norm": 0.519644021987915, + "learning_rate": 8.039390542193234e-06, + "loss": 4.9186, + "step": 23095 + }, + { + "epoch": 1.5695067264573992, + "grad_norm": 0.4325195252895355, + "learning_rate": 8.038965892104907e-06, + "loss": 4.7437, + "step": 23100 + }, + { + "epoch": 1.5698464465280608, + "grad_norm": 0.45678889751434326, + "learning_rate": 8.03854124201658e-06, + "loss": 4.8718, + "step": 23105 + }, + { + "epoch": 1.5701861665987227, + "grad_norm": 0.35139200091362, + "learning_rate": 8.038116591928252e-06, + "loss": 4.5854, + "step": 23110 + }, + { + "epoch": 1.5705258866693845, + "grad_norm": 0.5576383471488953, + "learning_rate": 8.037691941839925e-06, + "loss": 4.984, + "step": 23115 + }, + { + "epoch": 1.5708656067400462, + "grad_norm": 0.4917481541633606, + "learning_rate": 8.037267291751596e-06, + "loss": 4.6014, + "step": 23120 + }, + { + "epoch": 1.571205326810708, + "grad_norm": 0.41665297746658325, + "learning_rate": 8.03684264166327e-06, + "loss": 4.6815, + "step": 23125 + }, + { + "epoch": 1.5715450468813699, + "grad_norm": 0.4034712016582489, + "learning_rate": 8.036417991574943e-06, + "loss": 4.7194, + "step": 23130 + }, + { + "epoch": 1.5718847669520315, + "grad_norm": 0.41746804118156433, + "learning_rate": 8.035993341486615e-06, + "loss": 4.8331, + "step": 23135 + }, + { + "epoch": 1.5722244870226934, + "grad_norm": 0.44515812397003174, + "learning_rate": 8.035568691398289e-06, + "loss": 4.5961, + "step": 23140 + }, + { + "epoch": 1.5725642070933552, + "grad_norm": 0.3838973641395569, + "learning_rate": 8.035144041309962e-06, + "loss": 4.8861, + "step": 23145 + }, + { + "epoch": 1.5729039271640168, + "grad_norm": 0.38316479325294495, + "learning_rate": 8.034719391221633e-06, + "loss": 4.7546, + "step": 23150 + }, + { + "epoch": 1.5732436472346785, + "grad_norm": 0.49523770809173584, + "learning_rate": 8.034294741133308e-06, + "loss": 4.9719, + "step": 23155 + }, + { + "epoch": 1.5735833673053405, + "grad_norm": 0.44069862365722656, + "learning_rate": 8.03387009104498e-06, + "loss": 4.7272, + "step": 23160 + }, + { + "epoch": 1.5739230873760022, + "grad_norm": 0.3413865268230438, + "learning_rate": 8.033445440956651e-06, + "loss": 4.7637, + "step": 23165 + }, + { + "epoch": 1.5742628074466638, + "grad_norm": 0.666420042514801, + "learning_rate": 8.033020790868326e-06, + "loss": 4.8167, + "step": 23170 + }, + { + "epoch": 1.5746025275173259, + "grad_norm": 0.5157902240753174, + "learning_rate": 8.032596140779999e-06, + "loss": 4.8902, + "step": 23175 + }, + { + "epoch": 1.5749422475879875, + "grad_norm": 0.4448080360889435, + "learning_rate": 8.03217149069167e-06, + "loss": 4.8831, + "step": 23180 + }, + { + "epoch": 1.5752819676586491, + "grad_norm": 0.5288764238357544, + "learning_rate": 8.031746840603344e-06, + "loss": 4.7292, + "step": 23185 + }, + { + "epoch": 1.575621687729311, + "grad_norm": 0.47644487023353577, + "learning_rate": 8.031322190515017e-06, + "loss": 4.9404, + "step": 23190 + }, + { + "epoch": 1.5759614077999728, + "grad_norm": 0.44572848081588745, + "learning_rate": 8.030897540426688e-06, + "loss": 4.8833, + "step": 23195 + }, + { + "epoch": 1.5763011278706345, + "grad_norm": 0.4582953453063965, + "learning_rate": 8.030472890338363e-06, + "loss": 4.4114, + "step": 23200 + }, + { + "epoch": 1.5766408479412963, + "grad_norm": 0.5468581914901733, + "learning_rate": 8.030048240250034e-06, + "loss": 4.7306, + "step": 23205 + }, + { + "epoch": 1.5769805680119582, + "grad_norm": 0.6249271035194397, + "learning_rate": 8.029623590161707e-06, + "loss": 4.8736, + "step": 23210 + }, + { + "epoch": 1.5773202880826198, + "grad_norm": 0.40169456601142883, + "learning_rate": 8.029198940073381e-06, + "loss": 4.7424, + "step": 23215 + }, + { + "epoch": 1.5776600081532817, + "grad_norm": 0.43739935755729675, + "learning_rate": 8.028774289985052e-06, + "loss": 4.8077, + "step": 23220 + }, + { + "epoch": 1.5779997282239435, + "grad_norm": 0.3806428909301758, + "learning_rate": 8.028349639896725e-06, + "loss": 4.8302, + "step": 23225 + }, + { + "epoch": 1.5783394482946052, + "grad_norm": 0.4374402165412903, + "learning_rate": 8.0279249898084e-06, + "loss": 4.5659, + "step": 23230 + }, + { + "epoch": 1.578679168365267, + "grad_norm": 0.5850626230239868, + "learning_rate": 8.02750033972007e-06, + "loss": 4.8604, + "step": 23235 + }, + { + "epoch": 1.5790188884359289, + "grad_norm": 0.4553356468677521, + "learning_rate": 8.027075689631743e-06, + "loss": 4.6577, + "step": 23240 + }, + { + "epoch": 1.5793586085065905, + "grad_norm": 0.47287076711654663, + "learning_rate": 8.026651039543418e-06, + "loss": 4.9057, + "step": 23245 + }, + { + "epoch": 1.5796983285772523, + "grad_norm": 0.4114150106906891, + "learning_rate": 8.026226389455089e-06, + "loss": 4.6644, + "step": 23250 + }, + { + "epoch": 1.5800380486479142, + "grad_norm": 0.4477390646934509, + "learning_rate": 8.025801739366762e-06, + "loss": 4.8089, + "step": 23255 + }, + { + "epoch": 1.5803777687185758, + "grad_norm": 0.513881266117096, + "learning_rate": 8.025377089278436e-06, + "loss": 4.7796, + "step": 23260 + }, + { + "epoch": 1.5807174887892377, + "grad_norm": 0.4566428065299988, + "learning_rate": 8.024952439190107e-06, + "loss": 4.7163, + "step": 23265 + }, + { + "epoch": 1.5810572088598995, + "grad_norm": 0.38639605045318604, + "learning_rate": 8.02452778910178e-06, + "loss": 4.6665, + "step": 23270 + }, + { + "epoch": 1.5813969289305612, + "grad_norm": 0.38349810242652893, + "learning_rate": 8.024103139013453e-06, + "loss": 4.9315, + "step": 23275 + }, + { + "epoch": 1.581736649001223, + "grad_norm": 0.45973625779151917, + "learning_rate": 8.023678488925126e-06, + "loss": 4.9897, + "step": 23280 + }, + { + "epoch": 1.5820763690718849, + "grad_norm": 0.5384525060653687, + "learning_rate": 8.023253838836799e-06, + "loss": 4.6814, + "step": 23285 + }, + { + "epoch": 1.5824160891425465, + "grad_norm": 0.6288036704063416, + "learning_rate": 8.022829188748471e-06, + "loss": 4.5187, + "step": 23290 + }, + { + "epoch": 1.5827558092132084, + "grad_norm": 0.536689043045044, + "learning_rate": 8.022404538660144e-06, + "loss": 4.7022, + "step": 23295 + }, + { + "epoch": 1.5830955292838702, + "grad_norm": 0.43303000926971436, + "learning_rate": 8.021979888571817e-06, + "loss": 4.8743, + "step": 23300 + }, + { + "epoch": 1.5834352493545318, + "grad_norm": 0.4626923203468323, + "learning_rate": 8.02155523848349e-06, + "loss": 4.6527, + "step": 23305 + }, + { + "epoch": 1.5837749694251937, + "grad_norm": 0.5261439085006714, + "learning_rate": 8.021130588395163e-06, + "loss": 4.5418, + "step": 23310 + }, + { + "epoch": 1.5841146894958555, + "grad_norm": 0.4321671426296234, + "learning_rate": 8.020705938306835e-06, + "loss": 4.558, + "step": 23315 + }, + { + "epoch": 1.5844544095665172, + "grad_norm": 0.47081053256988525, + "learning_rate": 8.020281288218508e-06, + "loss": 4.6836, + "step": 23320 + }, + { + "epoch": 1.5847941296371788, + "grad_norm": 0.4288552403450012, + "learning_rate": 8.019856638130181e-06, + "loss": 4.7338, + "step": 23325 + }, + { + "epoch": 1.5851338497078409, + "grad_norm": 0.3292202949523926, + "learning_rate": 8.019431988041854e-06, + "loss": 4.5837, + "step": 23330 + }, + { + "epoch": 1.5854735697785025, + "grad_norm": 0.38914811611175537, + "learning_rate": 8.019007337953527e-06, + "loss": 4.7291, + "step": 23335 + }, + { + "epoch": 1.5858132898491641, + "grad_norm": 0.4183700978755951, + "learning_rate": 8.0185826878652e-06, + "loss": 4.6363, + "step": 23340 + }, + { + "epoch": 1.5861530099198262, + "grad_norm": 0.5042991638183594, + "learning_rate": 8.018158037776872e-06, + "loss": 4.8984, + "step": 23345 + }, + { + "epoch": 1.5864927299904878, + "grad_norm": 0.42914190888404846, + "learning_rate": 8.017733387688545e-06, + "loss": 4.923, + "step": 23350 + }, + { + "epoch": 1.5868324500611495, + "grad_norm": 0.46615585684776306, + "learning_rate": 8.017308737600218e-06, + "loss": 4.7362, + "step": 23355 + }, + { + "epoch": 1.5871721701318113, + "grad_norm": 0.6062822937965393, + "learning_rate": 8.01688408751189e-06, + "loss": 4.6956, + "step": 23360 + }, + { + "epoch": 1.5875118902024732, + "grad_norm": 0.37794333696365356, + "learning_rate": 8.016459437423563e-06, + "loss": 4.8167, + "step": 23365 + }, + { + "epoch": 1.5878516102731348, + "grad_norm": 0.4160827696323395, + "learning_rate": 8.016034787335236e-06, + "loss": 4.829, + "step": 23370 + }, + { + "epoch": 1.5881913303437967, + "grad_norm": 0.32366472482681274, + "learning_rate": 8.015610137246909e-06, + "loss": 4.7467, + "step": 23375 + }, + { + "epoch": 1.5885310504144585, + "grad_norm": 0.38381460309028625, + "learning_rate": 8.015185487158582e-06, + "loss": 4.6219, + "step": 23380 + }, + { + "epoch": 1.5888707704851202, + "grad_norm": 0.5453047156333923, + "learning_rate": 8.014760837070255e-06, + "loss": 4.882, + "step": 23385 + }, + { + "epoch": 1.589210490555782, + "grad_norm": 0.4969326853752136, + "learning_rate": 8.014336186981927e-06, + "loss": 4.6615, + "step": 23390 + }, + { + "epoch": 1.5895502106264439, + "grad_norm": 0.4277118742465973, + "learning_rate": 8.0139115368936e-06, + "loss": 4.7581, + "step": 23395 + }, + { + "epoch": 1.5898899306971055, + "grad_norm": 0.559553325176239, + "learning_rate": 8.013486886805273e-06, + "loss": 4.8432, + "step": 23400 + }, + { + "epoch": 1.5902296507677673, + "grad_norm": 0.40549781918525696, + "learning_rate": 8.013062236716946e-06, + "loss": 4.7791, + "step": 23405 + }, + { + "epoch": 1.5905693708384292, + "grad_norm": 0.4955947697162628, + "learning_rate": 8.012637586628619e-06, + "loss": 4.78, + "step": 23410 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.4214164614677429, + "learning_rate": 8.012212936540291e-06, + "loss": 4.711, + "step": 23415 + }, + { + "epoch": 1.5912488109797527, + "grad_norm": 0.4918133616447449, + "learning_rate": 8.011788286451964e-06, + "loss": 4.6763, + "step": 23420 + }, + { + "epoch": 1.5915885310504145, + "grad_norm": 0.558413028717041, + "learning_rate": 8.011363636363637e-06, + "loss": 4.7775, + "step": 23425 + }, + { + "epoch": 1.5919282511210762, + "grad_norm": 0.4205312728881836, + "learning_rate": 8.01093898627531e-06, + "loss": 4.5028, + "step": 23430 + }, + { + "epoch": 1.592267971191738, + "grad_norm": 0.5166853070259094, + "learning_rate": 8.010514336186983e-06, + "loss": 4.6208, + "step": 23435 + }, + { + "epoch": 1.5926076912623999, + "grad_norm": 0.5177450180053711, + "learning_rate": 8.010089686098655e-06, + "loss": 4.4549, + "step": 23440 + }, + { + "epoch": 1.5929474113330615, + "grad_norm": 0.42961668968200684, + "learning_rate": 8.009665036010328e-06, + "loss": 4.622, + "step": 23445 + }, + { + "epoch": 1.5932871314037234, + "grad_norm": 0.42124906182289124, + "learning_rate": 8.009240385922001e-06, + "loss": 5.1095, + "step": 23450 + }, + { + "epoch": 1.5936268514743852, + "grad_norm": 0.49265196919441223, + "learning_rate": 8.008815735833674e-06, + "loss": 4.7745, + "step": 23455 + }, + { + "epoch": 1.5939665715450468, + "grad_norm": 0.44149836897850037, + "learning_rate": 8.008391085745347e-06, + "loss": 4.7473, + "step": 23460 + }, + { + "epoch": 1.5943062916157087, + "grad_norm": 0.35925811529159546, + "learning_rate": 8.00796643565702e-06, + "loss": 4.7133, + "step": 23465 + }, + { + "epoch": 1.5946460116863705, + "grad_norm": 0.5192461013793945, + "learning_rate": 8.007541785568692e-06, + "loss": 5.1386, + "step": 23470 + }, + { + "epoch": 1.5949857317570322, + "grad_norm": 0.5143252611160278, + "learning_rate": 8.007117135480365e-06, + "loss": 4.2996, + "step": 23475 + }, + { + "epoch": 1.595325451827694, + "grad_norm": 0.40716472268104553, + "learning_rate": 8.006692485392038e-06, + "loss": 4.6138, + "step": 23480 + }, + { + "epoch": 1.5956651718983559, + "grad_norm": 0.47666290402412415, + "learning_rate": 8.00626783530371e-06, + "loss": 4.7624, + "step": 23485 + }, + { + "epoch": 1.5960048919690175, + "grad_norm": 0.44302603602409363, + "learning_rate": 8.005843185215383e-06, + "loss": 4.5123, + "step": 23490 + }, + { + "epoch": 1.5963446120396791, + "grad_norm": 0.4487234354019165, + "learning_rate": 8.005418535127056e-06, + "loss": 4.6783, + "step": 23495 + }, + { + "epoch": 1.5966843321103412, + "grad_norm": 0.524582028388977, + "learning_rate": 8.004993885038729e-06, + "loss": 4.8143, + "step": 23500 + }, + { + "epoch": 1.5970240521810029, + "grad_norm": 0.48778364062309265, + "learning_rate": 8.004569234950402e-06, + "loss": 4.6066, + "step": 23505 + }, + { + "epoch": 1.5973637722516645, + "grad_norm": 0.48154330253601074, + "learning_rate": 8.004144584862075e-06, + "loss": 4.7942, + "step": 23510 + }, + { + "epoch": 1.5977034923223266, + "grad_norm": 0.42869237065315247, + "learning_rate": 8.003719934773747e-06, + "loss": 4.9468, + "step": 23515 + }, + { + "epoch": 1.5980432123929882, + "grad_norm": 0.45369967818260193, + "learning_rate": 8.00329528468542e-06, + "loss": 4.6606, + "step": 23520 + }, + { + "epoch": 1.5983829324636498, + "grad_norm": 0.42714598774909973, + "learning_rate": 8.002870634597093e-06, + "loss": 4.9375, + "step": 23525 + }, + { + "epoch": 1.5987226525343117, + "grad_norm": 0.38718870282173157, + "learning_rate": 8.002445984508766e-06, + "loss": 4.5324, + "step": 23530 + }, + { + "epoch": 1.5990623726049735, + "grad_norm": 0.4264513850212097, + "learning_rate": 8.002021334420439e-06, + "loss": 4.8402, + "step": 23535 + }, + { + "epoch": 1.5994020926756352, + "grad_norm": 0.5066158175468445, + "learning_rate": 8.001596684332111e-06, + "loss": 5.0013, + "step": 23540 + }, + { + "epoch": 1.599741812746297, + "grad_norm": 0.5162839293479919, + "learning_rate": 8.001172034243784e-06, + "loss": 4.561, + "step": 23545 + }, + { + "epoch": 1.6000815328169589, + "grad_norm": 0.41720426082611084, + "learning_rate": 8.000747384155455e-06, + "loss": 4.8183, + "step": 23550 + }, + { + "epoch": 1.6004212528876205, + "grad_norm": 0.3399890661239624, + "learning_rate": 8.00032273406713e-06, + "loss": 4.8517, + "step": 23555 + }, + { + "epoch": 1.6007609729582823, + "grad_norm": 0.41694891452789307, + "learning_rate": 7.999898083978803e-06, + "loss": 4.9419, + "step": 23560 + }, + { + "epoch": 1.6011006930289442, + "grad_norm": 0.456879198551178, + "learning_rate": 7.999473433890474e-06, + "loss": 4.5545, + "step": 23565 + }, + { + "epoch": 1.6014404130996058, + "grad_norm": 0.4347374439239502, + "learning_rate": 7.999048783802148e-06, + "loss": 4.7017, + "step": 23570 + }, + { + "epoch": 1.6017801331702677, + "grad_norm": 0.3964327573776245, + "learning_rate": 7.998624133713821e-06, + "loss": 4.5823, + "step": 23575 + }, + { + "epoch": 1.6021198532409295, + "grad_norm": 0.6593539714813232, + "learning_rate": 7.998199483625492e-06, + "loss": 4.6217, + "step": 23580 + }, + { + "epoch": 1.6024595733115912, + "grad_norm": 0.46741190552711487, + "learning_rate": 7.997774833537167e-06, + "loss": 4.954, + "step": 23585 + }, + { + "epoch": 1.602799293382253, + "grad_norm": 0.4038826823234558, + "learning_rate": 7.99735018344884e-06, + "loss": 4.7269, + "step": 23590 + }, + { + "epoch": 1.6031390134529149, + "grad_norm": 0.42185133695602417, + "learning_rate": 7.99692553336051e-06, + "loss": 4.6392, + "step": 23595 + }, + { + "epoch": 1.6034787335235765, + "grad_norm": 0.4680512845516205, + "learning_rate": 7.996500883272185e-06, + "loss": 4.8119, + "step": 23600 + }, + { + "epoch": 1.6038184535942384, + "grad_norm": 0.41714704036712646, + "learning_rate": 7.996076233183858e-06, + "loss": 4.7112, + "step": 23605 + }, + { + "epoch": 1.6041581736649002, + "grad_norm": 0.4341706931591034, + "learning_rate": 7.995651583095529e-06, + "loss": 4.6338, + "step": 23610 + }, + { + "epoch": 1.6044978937355618, + "grad_norm": 0.5913296341896057, + "learning_rate": 7.995226933007203e-06, + "loss": 4.6131, + "step": 23615 + }, + { + "epoch": 1.6048376138062237, + "grad_norm": 0.4646327793598175, + "learning_rate": 7.994802282918875e-06, + "loss": 4.8916, + "step": 23620 + }, + { + "epoch": 1.6051773338768855, + "grad_norm": 0.36893144249916077, + "learning_rate": 7.994377632830547e-06, + "loss": 4.7086, + "step": 23625 + }, + { + "epoch": 1.6055170539475472, + "grad_norm": 0.32517150044441223, + "learning_rate": 7.993952982742222e-06, + "loss": 4.7514, + "step": 23630 + }, + { + "epoch": 1.605856774018209, + "grad_norm": 0.4840342104434967, + "learning_rate": 7.993528332653893e-06, + "loss": 4.8409, + "step": 23635 + }, + { + "epoch": 1.6061964940888709, + "grad_norm": 0.3872290253639221, + "learning_rate": 7.993103682565566e-06, + "loss": 4.7975, + "step": 23640 + }, + { + "epoch": 1.6065362141595325, + "grad_norm": 0.32229557633399963, + "learning_rate": 7.99267903247724e-06, + "loss": 4.6676, + "step": 23645 + }, + { + "epoch": 1.6068759342301944, + "grad_norm": 0.40870413184165955, + "learning_rate": 7.992254382388911e-06, + "loss": 4.7184, + "step": 23650 + }, + { + "epoch": 1.6072156543008562, + "grad_norm": 0.5535767674446106, + "learning_rate": 7.991829732300584e-06, + "loss": 4.9234, + "step": 23655 + }, + { + "epoch": 1.6075553743715179, + "grad_norm": 0.39446306228637695, + "learning_rate": 7.991405082212259e-06, + "loss": 4.6654, + "step": 23660 + }, + { + "epoch": 1.6078950944421795, + "grad_norm": 0.4601157307624817, + "learning_rate": 7.99098043212393e-06, + "loss": 4.3943, + "step": 23665 + }, + { + "epoch": 1.6082348145128416, + "grad_norm": 0.5816711187362671, + "learning_rate": 7.990555782035603e-06, + "loss": 4.4037, + "step": 23670 + }, + { + "epoch": 1.6085745345835032, + "grad_norm": 0.4775501489639282, + "learning_rate": 7.990131131947277e-06, + "loss": 4.9877, + "step": 23675 + }, + { + "epoch": 1.6089142546541648, + "grad_norm": 0.5188002586364746, + "learning_rate": 7.989706481858948e-06, + "loss": 4.7324, + "step": 23680 + }, + { + "epoch": 1.609253974724827, + "grad_norm": 0.45850515365600586, + "learning_rate": 7.989281831770621e-06, + "loss": 4.7574, + "step": 23685 + }, + { + "epoch": 1.6095936947954885, + "grad_norm": 0.4303278625011444, + "learning_rate": 7.988857181682294e-06, + "loss": 4.7762, + "step": 23690 + }, + { + "epoch": 1.6099334148661502, + "grad_norm": 0.4849305748939514, + "learning_rate": 7.988432531593967e-06, + "loss": 4.4922, + "step": 23695 + }, + { + "epoch": 1.610273134936812, + "grad_norm": 0.394534170627594, + "learning_rate": 7.988007881505641e-06, + "loss": 4.7548, + "step": 23700 + }, + { + "epoch": 1.6106128550074739, + "grad_norm": 0.3257955014705658, + "learning_rate": 7.987583231417312e-06, + "loss": 4.7749, + "step": 23705 + }, + { + "epoch": 1.6109525750781355, + "grad_norm": 0.4524613916873932, + "learning_rate": 7.987158581328985e-06, + "loss": 5.096, + "step": 23710 + }, + { + "epoch": 1.6112922951487973, + "grad_norm": 0.4670681357383728, + "learning_rate": 7.98673393124066e-06, + "loss": 4.77, + "step": 23715 + }, + { + "epoch": 1.6116320152194592, + "grad_norm": 0.4381813704967499, + "learning_rate": 7.98630928115233e-06, + "loss": 4.8959, + "step": 23720 + }, + { + "epoch": 1.6119717352901208, + "grad_norm": 0.3609737753868103, + "learning_rate": 7.985884631064003e-06, + "loss": 4.6516, + "step": 23725 + }, + { + "epoch": 1.6123114553607827, + "grad_norm": 0.5879809260368347, + "learning_rate": 7.985459980975678e-06, + "loss": 5.002, + "step": 23730 + }, + { + "epoch": 1.6126511754314445, + "grad_norm": 0.516655445098877, + "learning_rate": 7.985035330887349e-06, + "loss": 4.8757, + "step": 23735 + }, + { + "epoch": 1.6129908955021062, + "grad_norm": 0.41792938113212585, + "learning_rate": 7.984610680799022e-06, + "loss": 4.6135, + "step": 23740 + }, + { + "epoch": 1.613330615572768, + "grad_norm": 0.5329369902610779, + "learning_rate": 7.984186030710696e-06, + "loss": 4.7859, + "step": 23745 + }, + { + "epoch": 1.6136703356434299, + "grad_norm": 0.4810100197792053, + "learning_rate": 7.983761380622367e-06, + "loss": 4.694, + "step": 23750 + }, + { + "epoch": 1.6140100557140915, + "grad_norm": 0.48786041140556335, + "learning_rate": 7.98333673053404e-06, + "loss": 4.8029, + "step": 23755 + }, + { + "epoch": 1.6143497757847534, + "grad_norm": 0.4162970185279846, + "learning_rate": 7.982912080445715e-06, + "loss": 4.868, + "step": 23760 + }, + { + "epoch": 1.6146894958554152, + "grad_norm": 0.4496709108352661, + "learning_rate": 7.982487430357386e-06, + "loss": 4.6978, + "step": 23765 + }, + { + "epoch": 1.6150292159260768, + "grad_norm": 0.4208287000656128, + "learning_rate": 7.982062780269059e-06, + "loss": 4.5623, + "step": 23770 + }, + { + "epoch": 1.6153689359967387, + "grad_norm": 0.3953610360622406, + "learning_rate": 7.981638130180731e-06, + "loss": 4.7269, + "step": 23775 + }, + { + "epoch": 1.6157086560674006, + "grad_norm": 0.44011569023132324, + "learning_rate": 7.981213480092404e-06, + "loss": 4.8307, + "step": 23780 + }, + { + "epoch": 1.6160483761380622, + "grad_norm": 0.5385610461235046, + "learning_rate": 7.980788830004077e-06, + "loss": 4.8169, + "step": 23785 + }, + { + "epoch": 1.616388096208724, + "grad_norm": 0.4387204349040985, + "learning_rate": 7.98036417991575e-06, + "loss": 4.5954, + "step": 23790 + }, + { + "epoch": 1.6167278162793859, + "grad_norm": 0.5672205090522766, + "learning_rate": 7.979939529827423e-06, + "loss": 4.7132, + "step": 23795 + }, + { + "epoch": 1.6170675363500475, + "grad_norm": 0.33562618494033813, + "learning_rate": 7.979514879739095e-06, + "loss": 4.5748, + "step": 23800 + }, + { + "epoch": 1.6174072564207094, + "grad_norm": 0.36884838342666626, + "learning_rate": 7.979090229650768e-06, + "loss": 4.5472, + "step": 23805 + }, + { + "epoch": 1.6177469764913712, + "grad_norm": 0.5449354648590088, + "learning_rate": 7.978665579562441e-06, + "loss": 4.8339, + "step": 23810 + }, + { + "epoch": 1.6180866965620329, + "grad_norm": 0.40072864294052124, + "learning_rate": 7.978240929474114e-06, + "loss": 4.8775, + "step": 23815 + }, + { + "epoch": 1.6184264166326947, + "grad_norm": 0.351818710565567, + "learning_rate": 7.977816279385787e-06, + "loss": 4.6555, + "step": 23820 + }, + { + "epoch": 1.6187661367033566, + "grad_norm": 0.5608483552932739, + "learning_rate": 7.97739162929746e-06, + "loss": 4.6306, + "step": 23825 + }, + { + "epoch": 1.6191058567740182, + "grad_norm": 0.4084293842315674, + "learning_rate": 7.976966979209132e-06, + "loss": 4.708, + "step": 23830 + }, + { + "epoch": 1.6194455768446798, + "grad_norm": 0.5169339179992676, + "learning_rate": 7.976542329120805e-06, + "loss": 4.5803, + "step": 23835 + }, + { + "epoch": 1.619785296915342, + "grad_norm": 0.3776840567588806, + "learning_rate": 7.976117679032478e-06, + "loss": 4.484, + "step": 23840 + }, + { + "epoch": 1.6201250169860035, + "grad_norm": 0.4301459789276123, + "learning_rate": 7.97569302894415e-06, + "loss": 4.3538, + "step": 23845 + }, + { + "epoch": 1.6204647370566652, + "grad_norm": 0.4298301935195923, + "learning_rate": 7.975268378855823e-06, + "loss": 4.7793, + "step": 23850 + }, + { + "epoch": 1.6208044571273272, + "grad_norm": 0.49232879281044006, + "learning_rate": 7.974843728767496e-06, + "loss": 4.6672, + "step": 23855 + }, + { + "epoch": 1.6211441771979889, + "grad_norm": 0.4581429958343506, + "learning_rate": 7.974419078679169e-06, + "loss": 4.6572, + "step": 23860 + }, + { + "epoch": 1.6214838972686505, + "grad_norm": 0.3586576282978058, + "learning_rate": 7.973994428590842e-06, + "loss": 4.6287, + "step": 23865 + }, + { + "epoch": 1.6218236173393124, + "grad_norm": 0.3935891091823578, + "learning_rate": 7.973569778502515e-06, + "loss": 4.6253, + "step": 23870 + }, + { + "epoch": 1.6221633374099742, + "grad_norm": 0.4523427188396454, + "learning_rate": 7.973145128414187e-06, + "loss": 4.6532, + "step": 23875 + }, + { + "epoch": 1.6225030574806358, + "grad_norm": 0.3705112338066101, + "learning_rate": 7.97272047832586e-06, + "loss": 4.6382, + "step": 23880 + }, + { + "epoch": 1.6228427775512977, + "grad_norm": 0.4061708152294159, + "learning_rate": 7.972295828237533e-06, + "loss": 4.8308, + "step": 23885 + }, + { + "epoch": 1.6231824976219595, + "grad_norm": 0.4228380024433136, + "learning_rate": 7.971871178149206e-06, + "loss": 4.8784, + "step": 23890 + }, + { + "epoch": 1.6235222176926212, + "grad_norm": 0.4817356467247009, + "learning_rate": 7.971446528060879e-06, + "loss": 4.5504, + "step": 23895 + }, + { + "epoch": 1.623861937763283, + "grad_norm": 0.3930428624153137, + "learning_rate": 7.971021877972551e-06, + "loss": 4.7474, + "step": 23900 + }, + { + "epoch": 1.6242016578339449, + "grad_norm": 0.39014825224876404, + "learning_rate": 7.970597227884224e-06, + "loss": 4.7639, + "step": 23905 + }, + { + "epoch": 1.6245413779046065, + "grad_norm": 0.4613354802131653, + "learning_rate": 7.970172577795897e-06, + "loss": 4.83, + "step": 23910 + }, + { + "epoch": 1.6248810979752684, + "grad_norm": 0.5070062875747681, + "learning_rate": 7.96974792770757e-06, + "loss": 4.956, + "step": 23915 + }, + { + "epoch": 1.6252208180459302, + "grad_norm": 0.36605679988861084, + "learning_rate": 7.969323277619243e-06, + "loss": 4.6869, + "step": 23920 + }, + { + "epoch": 1.6255605381165918, + "grad_norm": 0.37907108664512634, + "learning_rate": 7.968898627530915e-06, + "loss": 4.6846, + "step": 23925 + }, + { + "epoch": 1.6259002581872537, + "grad_norm": 0.33440282940864563, + "learning_rate": 7.968473977442588e-06, + "loss": 4.7386, + "step": 23930 + }, + { + "epoch": 1.6262399782579156, + "grad_norm": 0.48522257804870605, + "learning_rate": 7.968049327354261e-06, + "loss": 4.559, + "step": 23935 + }, + { + "epoch": 1.6265796983285772, + "grad_norm": 0.40168753266334534, + "learning_rate": 7.967624677265934e-06, + "loss": 4.6552, + "step": 23940 + }, + { + "epoch": 1.626919418399239, + "grad_norm": 0.4830774664878845, + "learning_rate": 7.967200027177607e-06, + "loss": 4.5061, + "step": 23945 + }, + { + "epoch": 1.627259138469901, + "grad_norm": 0.42536285519599915, + "learning_rate": 7.96677537708928e-06, + "loss": 4.39, + "step": 23950 + }, + { + "epoch": 1.6275988585405625, + "grad_norm": 0.4896010458469391, + "learning_rate": 7.966350727000952e-06, + "loss": 4.6952, + "step": 23955 + }, + { + "epoch": 1.6279385786112244, + "grad_norm": 0.3609086573123932, + "learning_rate": 7.965926076912625e-06, + "loss": 4.7869, + "step": 23960 + }, + { + "epoch": 1.6282782986818862, + "grad_norm": 0.47401782870292664, + "learning_rate": 7.965501426824296e-06, + "loss": 4.6705, + "step": 23965 + }, + { + "epoch": 1.6286180187525479, + "grad_norm": 0.42925864458084106, + "learning_rate": 7.96507677673597e-06, + "loss": 4.749, + "step": 23970 + }, + { + "epoch": 1.6289577388232097, + "grad_norm": 0.4352329671382904, + "learning_rate": 7.964652126647643e-06, + "loss": 4.7094, + "step": 23975 + }, + { + "epoch": 1.6292974588938716, + "grad_norm": 0.5143061280250549, + "learning_rate": 7.964227476559315e-06, + "loss": 4.7312, + "step": 23980 + }, + { + "epoch": 1.6296371789645332, + "grad_norm": 0.5008975267410278, + "learning_rate": 7.963802826470989e-06, + "loss": 4.8104, + "step": 23985 + }, + { + "epoch": 1.629976899035195, + "grad_norm": 0.49029237031936646, + "learning_rate": 7.963378176382662e-06, + "loss": 4.809, + "step": 23990 + }, + { + "epoch": 1.630316619105857, + "grad_norm": 0.384713739156723, + "learning_rate": 7.962953526294333e-06, + "loss": 4.5934, + "step": 23995 + }, + { + "epoch": 1.6306563391765185, + "grad_norm": 0.32494181394577026, + "learning_rate": 7.962528876206007e-06, + "loss": 4.5843, + "step": 24000 + }, + { + "epoch": 1.6309960592471802, + "grad_norm": 0.49495676159858704, + "learning_rate": 7.96210422611768e-06, + "loss": 4.8322, + "step": 24005 + }, + { + "epoch": 1.6313357793178422, + "grad_norm": 0.4436922073364258, + "learning_rate": 7.961679576029351e-06, + "loss": 4.4655, + "step": 24010 + }, + { + "epoch": 1.6316754993885039, + "grad_norm": 0.3754251003265381, + "learning_rate": 7.961254925941026e-06, + "loss": 4.5984, + "step": 24015 + }, + { + "epoch": 1.6320152194591655, + "grad_norm": 0.405000239610672, + "learning_rate": 7.960830275852699e-06, + "loss": 4.8915, + "step": 24020 + }, + { + "epoch": 1.6323549395298276, + "grad_norm": 0.40814587473869324, + "learning_rate": 7.96040562576437e-06, + "loss": 4.8682, + "step": 24025 + }, + { + "epoch": 1.6326946596004892, + "grad_norm": 0.4002135992050171, + "learning_rate": 7.959980975676044e-06, + "loss": 4.7574, + "step": 24030 + }, + { + "epoch": 1.6330343796711508, + "grad_norm": 0.39047712087631226, + "learning_rate": 7.959556325587715e-06, + "loss": 4.9076, + "step": 24035 + }, + { + "epoch": 1.633374099741813, + "grad_norm": 0.47025537490844727, + "learning_rate": 7.95913167549939e-06, + "loss": 4.9652, + "step": 24040 + }, + { + "epoch": 1.6337138198124745, + "grad_norm": 0.3601923882961273, + "learning_rate": 7.958707025411063e-06, + "loss": 4.5625, + "step": 24045 + }, + { + "epoch": 1.6340535398831362, + "grad_norm": 0.5419212579727173, + "learning_rate": 7.958282375322734e-06, + "loss": 4.6769, + "step": 24050 + }, + { + "epoch": 1.634393259953798, + "grad_norm": 0.5023762583732605, + "learning_rate": 7.957857725234408e-06, + "loss": 4.5301, + "step": 24055 + }, + { + "epoch": 1.6347329800244599, + "grad_norm": 0.34634634852409363, + "learning_rate": 7.957433075146081e-06, + "loss": 4.5826, + "step": 24060 + }, + { + "epoch": 1.6350727000951215, + "grad_norm": 0.3967016637325287, + "learning_rate": 7.957008425057752e-06, + "loss": 4.6849, + "step": 24065 + }, + { + "epoch": 1.6354124201657834, + "grad_norm": 0.4568611681461334, + "learning_rate": 7.956583774969427e-06, + "loss": 4.865, + "step": 24070 + }, + { + "epoch": 1.6357521402364452, + "grad_norm": 0.3416267931461334, + "learning_rate": 7.9561591248811e-06, + "loss": 4.6249, + "step": 24075 + }, + { + "epoch": 1.6360918603071068, + "grad_norm": 0.4307798743247986, + "learning_rate": 7.95573447479277e-06, + "loss": 5.036, + "step": 24080 + }, + { + "epoch": 1.6364315803777687, + "grad_norm": 0.4711313545703888, + "learning_rate": 7.955309824704445e-06, + "loss": 5.021, + "step": 24085 + }, + { + "epoch": 1.6367713004484306, + "grad_norm": 0.5274609923362732, + "learning_rate": 7.954885174616118e-06, + "loss": 4.6728, + "step": 24090 + }, + { + "epoch": 1.6371110205190922, + "grad_norm": 0.49416401982307434, + "learning_rate": 7.954460524527789e-06, + "loss": 4.5493, + "step": 24095 + }, + { + "epoch": 1.637450740589754, + "grad_norm": 0.4351581037044525, + "learning_rate": 7.954035874439463e-06, + "loss": 4.6279, + "step": 24100 + }, + { + "epoch": 1.637790460660416, + "grad_norm": 0.48800936341285706, + "learning_rate": 7.953611224351136e-06, + "loss": 4.625, + "step": 24105 + }, + { + "epoch": 1.6381301807310775, + "grad_norm": 0.5051233172416687, + "learning_rate": 7.953186574262807e-06, + "loss": 4.444, + "step": 24110 + }, + { + "epoch": 1.6384699008017394, + "grad_norm": 0.44485828280448914, + "learning_rate": 7.952761924174482e-06, + "loss": 4.627, + "step": 24115 + }, + { + "epoch": 1.6388096208724012, + "grad_norm": 0.4260469973087311, + "learning_rate": 7.952337274086153e-06, + "loss": 4.4789, + "step": 24120 + }, + { + "epoch": 1.6391493409430629, + "grad_norm": 0.41709432005882263, + "learning_rate": 7.951912623997826e-06, + "loss": 4.5806, + "step": 24125 + }, + { + "epoch": 1.6394890610137247, + "grad_norm": 0.42428523302078247, + "learning_rate": 7.9514879739095e-06, + "loss": 4.5858, + "step": 24130 + }, + { + "epoch": 1.6398287810843866, + "grad_norm": 0.3601992726325989, + "learning_rate": 7.951063323821171e-06, + "loss": 4.6209, + "step": 24135 + }, + { + "epoch": 1.6401685011550482, + "grad_norm": 0.4538712203502655, + "learning_rate": 7.950638673732844e-06, + "loss": 4.7854, + "step": 24140 + }, + { + "epoch": 1.64050822122571, + "grad_norm": 0.47775694727897644, + "learning_rate": 7.950214023644519e-06, + "loss": 4.5175, + "step": 24145 + }, + { + "epoch": 1.640847941296372, + "grad_norm": 0.4567701816558838, + "learning_rate": 7.94978937355619e-06, + "loss": 4.9655, + "step": 24150 + }, + { + "epoch": 1.6411876613670335, + "grad_norm": 0.3974217176437378, + "learning_rate": 7.949364723467863e-06, + "loss": 4.5963, + "step": 24155 + }, + { + "epoch": 1.6415273814376954, + "grad_norm": 0.42230910062789917, + "learning_rate": 7.948940073379537e-06, + "loss": 4.6582, + "step": 24160 + }, + { + "epoch": 1.6418671015083572, + "grad_norm": 0.37427350878715515, + "learning_rate": 7.948515423291208e-06, + "loss": 4.9583, + "step": 24165 + }, + { + "epoch": 1.6422068215790189, + "grad_norm": 0.5346946120262146, + "learning_rate": 7.948090773202881e-06, + "loss": 4.6332, + "step": 24170 + }, + { + "epoch": 1.6425465416496805, + "grad_norm": 0.3865606188774109, + "learning_rate": 7.947666123114555e-06, + "loss": 4.7043, + "step": 24175 + }, + { + "epoch": 1.6428862617203426, + "grad_norm": 0.4461875557899475, + "learning_rate": 7.947241473026227e-06, + "loss": 4.696, + "step": 24180 + }, + { + "epoch": 1.6432259817910042, + "grad_norm": 0.5258568525314331, + "learning_rate": 7.9468168229379e-06, + "loss": 4.6403, + "step": 24185 + }, + { + "epoch": 1.6435657018616658, + "grad_norm": 0.36330944299697876, + "learning_rate": 7.946392172849572e-06, + "loss": 4.6286, + "step": 24190 + }, + { + "epoch": 1.643905421932328, + "grad_norm": 0.4507869482040405, + "learning_rate": 7.945967522761245e-06, + "loss": 4.5202, + "step": 24195 + }, + { + "epoch": 1.6442451420029895, + "grad_norm": 0.4731317162513733, + "learning_rate": 7.945542872672918e-06, + "loss": 4.9891, + "step": 24200 + }, + { + "epoch": 1.6445848620736512, + "grad_norm": 0.5075430870056152, + "learning_rate": 7.94511822258459e-06, + "loss": 4.679, + "step": 24205 + }, + { + "epoch": 1.6449245821443133, + "grad_norm": 0.390370637178421, + "learning_rate": 7.944693572496263e-06, + "loss": 4.5635, + "step": 24210 + }, + { + "epoch": 1.6452643022149749, + "grad_norm": 0.47494828701019287, + "learning_rate": 7.944268922407936e-06, + "loss": 4.8415, + "step": 24215 + }, + { + "epoch": 1.6456040222856365, + "grad_norm": 0.38732293248176575, + "learning_rate": 7.943844272319609e-06, + "loss": 4.5817, + "step": 24220 + }, + { + "epoch": 1.6459437423562984, + "grad_norm": 0.3411112427711487, + "learning_rate": 7.943419622231282e-06, + "loss": 4.7395, + "step": 24225 + }, + { + "epoch": 1.6462834624269602, + "grad_norm": 0.5209558010101318, + "learning_rate": 7.942994972142955e-06, + "loss": 4.5821, + "step": 24230 + }, + { + "epoch": 1.6466231824976219, + "grad_norm": 0.4184539318084717, + "learning_rate": 7.942570322054627e-06, + "loss": 4.6058, + "step": 24235 + }, + { + "epoch": 1.6469629025682837, + "grad_norm": 0.3960507810115814, + "learning_rate": 7.9421456719663e-06, + "loss": 4.8227, + "step": 24240 + }, + { + "epoch": 1.6473026226389456, + "grad_norm": 0.36986902356147766, + "learning_rate": 7.941721021877973e-06, + "loss": 4.5313, + "step": 24245 + }, + { + "epoch": 1.6476423427096072, + "grad_norm": 0.7614092230796814, + "learning_rate": 7.941296371789646e-06, + "loss": 4.8239, + "step": 24250 + }, + { + "epoch": 1.647982062780269, + "grad_norm": 0.44751477241516113, + "learning_rate": 7.940871721701319e-06, + "loss": 4.6089, + "step": 24255 + }, + { + "epoch": 1.648321782850931, + "grad_norm": 0.48971477150917053, + "learning_rate": 7.940447071612991e-06, + "loss": 4.7174, + "step": 24260 + }, + { + "epoch": 1.6486615029215925, + "grad_norm": 0.3927144706249237, + "learning_rate": 7.940022421524664e-06, + "loss": 4.7142, + "step": 24265 + }, + { + "epoch": 1.6490012229922544, + "grad_norm": 0.5572820901870728, + "learning_rate": 7.939597771436337e-06, + "loss": 4.6241, + "step": 24270 + }, + { + "epoch": 1.6493409430629162, + "grad_norm": 0.379780113697052, + "learning_rate": 7.93917312134801e-06, + "loss": 4.5359, + "step": 24275 + }, + { + "epoch": 1.6496806631335779, + "grad_norm": 0.5032530426979065, + "learning_rate": 7.938748471259683e-06, + "loss": 4.8292, + "step": 24280 + }, + { + "epoch": 1.6500203832042397, + "grad_norm": 0.4448731541633606, + "learning_rate": 7.938323821171355e-06, + "loss": 4.6957, + "step": 24285 + }, + { + "epoch": 1.6503601032749016, + "grad_norm": 0.6634674072265625, + "learning_rate": 7.937899171083028e-06, + "loss": 4.7608, + "step": 24290 + }, + { + "epoch": 1.6506998233455632, + "grad_norm": 0.45518267154693604, + "learning_rate": 7.937474520994701e-06, + "loss": 4.7242, + "step": 24295 + }, + { + "epoch": 1.651039543416225, + "grad_norm": 0.5081242918968201, + "learning_rate": 7.937049870906374e-06, + "loss": 4.5845, + "step": 24300 + }, + { + "epoch": 1.651379263486887, + "grad_norm": 0.39341095089912415, + "learning_rate": 7.936625220818047e-06, + "loss": 4.6224, + "step": 24305 + }, + { + "epoch": 1.6517189835575485, + "grad_norm": 0.47306832671165466, + "learning_rate": 7.93620057072972e-06, + "loss": 4.9508, + "step": 24310 + }, + { + "epoch": 1.6520587036282104, + "grad_norm": 0.38390153646469116, + "learning_rate": 7.935775920641392e-06, + "loss": 4.6714, + "step": 24315 + }, + { + "epoch": 1.6523984236988722, + "grad_norm": 0.5430972576141357, + "learning_rate": 7.935351270553065e-06, + "loss": 4.641, + "step": 24320 + }, + { + "epoch": 1.6527381437695339, + "grad_norm": 0.43436816334724426, + "learning_rate": 7.934926620464738e-06, + "loss": 4.641, + "step": 24325 + }, + { + "epoch": 1.6530778638401957, + "grad_norm": 0.5212448835372925, + "learning_rate": 7.93450197037641e-06, + "loss": 4.4848, + "step": 24330 + }, + { + "epoch": 1.6534175839108576, + "grad_norm": 0.44938167929649353, + "learning_rate": 7.934077320288083e-06, + "loss": 4.6343, + "step": 24335 + }, + { + "epoch": 1.6537573039815192, + "grad_norm": 0.41012412309646606, + "learning_rate": 7.933652670199756e-06, + "loss": 4.4466, + "step": 24340 + }, + { + "epoch": 1.6540970240521808, + "grad_norm": 0.3962321877479553, + "learning_rate": 7.933228020111429e-06, + "loss": 4.6324, + "step": 24345 + }, + { + "epoch": 1.654436744122843, + "grad_norm": 0.4446921944618225, + "learning_rate": 7.932803370023102e-06, + "loss": 4.7174, + "step": 24350 + }, + { + "epoch": 1.6547764641935045, + "grad_norm": 0.5335832238197327, + "learning_rate": 7.932378719934775e-06, + "loss": 4.8318, + "step": 24355 + }, + { + "epoch": 1.6551161842641662, + "grad_norm": 0.48261502385139465, + "learning_rate": 7.931954069846447e-06, + "loss": 4.7114, + "step": 24360 + }, + { + "epoch": 1.6554559043348283, + "grad_norm": 0.40521618723869324, + "learning_rate": 7.93152941975812e-06, + "loss": 4.672, + "step": 24365 + }, + { + "epoch": 1.6557956244054899, + "grad_norm": 0.46203774213790894, + "learning_rate": 7.931104769669793e-06, + "loss": 4.7189, + "step": 24370 + }, + { + "epoch": 1.6561353444761515, + "grad_norm": 0.35758253931999207, + "learning_rate": 7.930680119581466e-06, + "loss": 4.5884, + "step": 24375 + }, + { + "epoch": 1.6564750645468136, + "grad_norm": 0.38788384199142456, + "learning_rate": 7.930255469493139e-06, + "loss": 4.7598, + "step": 24380 + }, + { + "epoch": 1.6568147846174752, + "grad_norm": 0.42928242683410645, + "learning_rate": 7.929830819404811e-06, + "loss": 4.8801, + "step": 24385 + }, + { + "epoch": 1.6571545046881369, + "grad_norm": 0.4766900837421417, + "learning_rate": 7.929406169316484e-06, + "loss": 4.3984, + "step": 24390 + }, + { + "epoch": 1.6574942247587987, + "grad_norm": 0.5246879458427429, + "learning_rate": 7.928981519228157e-06, + "loss": 4.8006, + "step": 24395 + }, + { + "epoch": 1.6578339448294606, + "grad_norm": 0.4069972038269043, + "learning_rate": 7.92855686913983e-06, + "loss": 4.7424, + "step": 24400 + }, + { + "epoch": 1.6581736649001222, + "grad_norm": 0.4820341169834137, + "learning_rate": 7.928132219051503e-06, + "loss": 4.5464, + "step": 24405 + }, + { + "epoch": 1.658513384970784, + "grad_norm": 0.4946250319480896, + "learning_rate": 7.927707568963175e-06, + "loss": 4.5986, + "step": 24410 + }, + { + "epoch": 1.658853105041446, + "grad_norm": 0.4300293028354645, + "learning_rate": 7.927282918874848e-06, + "loss": 4.8321, + "step": 24415 + }, + { + "epoch": 1.6591928251121075, + "grad_norm": 0.45755016803741455, + "learning_rate": 7.926858268786521e-06, + "loss": 4.7731, + "step": 24420 + }, + { + "epoch": 1.6595325451827694, + "grad_norm": 0.4602547883987427, + "learning_rate": 7.926433618698194e-06, + "loss": 4.5335, + "step": 24425 + }, + { + "epoch": 1.6598722652534312, + "grad_norm": 0.4566879868507385, + "learning_rate": 7.926008968609867e-06, + "loss": 4.9128, + "step": 24430 + }, + { + "epoch": 1.6602119853240929, + "grad_norm": 0.48623794317245483, + "learning_rate": 7.92558431852154e-06, + "loss": 4.572, + "step": 24435 + }, + { + "epoch": 1.6605517053947547, + "grad_norm": 0.6072973012924194, + "learning_rate": 7.925159668433212e-06, + "loss": 4.9277, + "step": 24440 + }, + { + "epoch": 1.6608914254654166, + "grad_norm": 0.4165029525756836, + "learning_rate": 7.924735018344885e-06, + "loss": 4.5818, + "step": 24445 + }, + { + "epoch": 1.6612311455360782, + "grad_norm": 0.37150561809539795, + "learning_rate": 7.924310368256558e-06, + "loss": 4.8553, + "step": 24450 + }, + { + "epoch": 1.66157086560674, + "grad_norm": 0.4553486704826355, + "learning_rate": 7.92388571816823e-06, + "loss": 4.4391, + "step": 24455 + }, + { + "epoch": 1.661910585677402, + "grad_norm": 0.4150717556476593, + "learning_rate": 7.923461068079903e-06, + "loss": 4.5727, + "step": 24460 + }, + { + "epoch": 1.6622503057480635, + "grad_norm": 0.6360970735549927, + "learning_rate": 7.923036417991575e-06, + "loss": 4.8649, + "step": 24465 + }, + { + "epoch": 1.6625900258187254, + "grad_norm": 0.46311333775520325, + "learning_rate": 7.922611767903249e-06, + "loss": 4.9871, + "step": 24470 + }, + { + "epoch": 1.6629297458893872, + "grad_norm": 0.4390932321548462, + "learning_rate": 7.922187117814922e-06, + "loss": 5.0793, + "step": 24475 + }, + { + "epoch": 1.6632694659600489, + "grad_norm": 0.40690967440605164, + "learning_rate": 7.921762467726593e-06, + "loss": 4.826, + "step": 24480 + }, + { + "epoch": 1.6636091860307107, + "grad_norm": 0.48985177278518677, + "learning_rate": 7.921337817638267e-06, + "loss": 4.7607, + "step": 24485 + }, + { + "epoch": 1.6639489061013726, + "grad_norm": 0.3865562677383423, + "learning_rate": 7.92091316754994e-06, + "loss": 4.6682, + "step": 24490 + }, + { + "epoch": 1.6642886261720342, + "grad_norm": 0.4958510398864746, + "learning_rate": 7.920488517461611e-06, + "loss": 4.9706, + "step": 24495 + }, + { + "epoch": 1.664628346242696, + "grad_norm": 0.4351615905761719, + "learning_rate": 7.920063867373286e-06, + "loss": 4.8313, + "step": 24500 + }, + { + "epoch": 1.664968066313358, + "grad_norm": 0.5198603272438049, + "learning_rate": 7.919639217284959e-06, + "loss": 4.5139, + "step": 24505 + }, + { + "epoch": 1.6653077863840196, + "grad_norm": 0.4284389317035675, + "learning_rate": 7.91921456719663e-06, + "loss": 4.6913, + "step": 24510 + }, + { + "epoch": 1.6656475064546812, + "grad_norm": 0.4433286488056183, + "learning_rate": 7.918789917108304e-06, + "loss": 4.4019, + "step": 24515 + }, + { + "epoch": 1.6659872265253433, + "grad_norm": 0.40757665038108826, + "learning_rate": 7.918365267019977e-06, + "loss": 4.5126, + "step": 24520 + }, + { + "epoch": 1.6663269465960049, + "grad_norm": 0.40604737401008606, + "learning_rate": 7.917940616931648e-06, + "loss": 4.6478, + "step": 24525 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.4097709357738495, + "learning_rate": 7.917515966843323e-06, + "loss": 4.8561, + "step": 24530 + }, + { + "epoch": 1.6670063867373286, + "grad_norm": 0.45521190762519836, + "learning_rate": 7.917091316754994e-06, + "loss": 4.6791, + "step": 24535 + }, + { + "epoch": 1.6673461068079902, + "grad_norm": 0.4331361949443817, + "learning_rate": 7.916666666666667e-06, + "loss": 4.6915, + "step": 24540 + }, + { + "epoch": 1.6676858268786519, + "grad_norm": 0.5263180136680603, + "learning_rate": 7.916242016578341e-06, + "loss": 4.7652, + "step": 24545 + }, + { + "epoch": 1.668025546949314, + "grad_norm": 0.5411679744720459, + "learning_rate": 7.915817366490012e-06, + "loss": 4.7109, + "step": 24550 + }, + { + "epoch": 1.6683652670199756, + "grad_norm": 0.4367801249027252, + "learning_rate": 7.915392716401685e-06, + "loss": 4.9816, + "step": 24555 + }, + { + "epoch": 1.6687049870906372, + "grad_norm": 0.4377882480621338, + "learning_rate": 7.91496806631336e-06, + "loss": 4.6337, + "step": 24560 + }, + { + "epoch": 1.669044707161299, + "grad_norm": 0.37350448966026306, + "learning_rate": 7.91454341622503e-06, + "loss": 4.7305, + "step": 24565 + }, + { + "epoch": 1.669384427231961, + "grad_norm": 0.3183067739009857, + "learning_rate": 7.914118766136703e-06, + "loss": 4.8747, + "step": 24570 + }, + { + "epoch": 1.6697241473026225, + "grad_norm": 0.35744813084602356, + "learning_rate": 7.913694116048378e-06, + "loss": 4.6456, + "step": 24575 + }, + { + "epoch": 1.6700638673732844, + "grad_norm": 0.49554863572120667, + "learning_rate": 7.913269465960049e-06, + "loss": 4.5686, + "step": 24580 + }, + { + "epoch": 1.6704035874439462, + "grad_norm": 0.49545058608055115, + "learning_rate": 7.912844815871722e-06, + "loss": 4.5862, + "step": 24585 + }, + { + "epoch": 1.6707433075146079, + "grad_norm": 0.4368768334388733, + "learning_rate": 7.912420165783396e-06, + "loss": 4.3648, + "step": 24590 + }, + { + "epoch": 1.6710830275852697, + "grad_norm": 0.39531201124191284, + "learning_rate": 7.911995515695067e-06, + "loss": 4.636, + "step": 24595 + }, + { + "epoch": 1.6714227476559316, + "grad_norm": 0.4308014214038849, + "learning_rate": 7.91157086560674e-06, + "loss": 4.5136, + "step": 24600 + }, + { + "epoch": 1.6717624677265932, + "grad_norm": 0.4460943043231964, + "learning_rate": 7.911146215518413e-06, + "loss": 4.878, + "step": 24605 + }, + { + "epoch": 1.672102187797255, + "grad_norm": 0.5252681970596313, + "learning_rate": 7.910721565430086e-06, + "loss": 4.7152, + "step": 24610 + }, + { + "epoch": 1.672441907867917, + "grad_norm": 0.5225245356559753, + "learning_rate": 7.910296915341759e-06, + "loss": 4.9262, + "step": 24615 + }, + { + "epoch": 1.6727816279385785, + "grad_norm": 0.46151208877563477, + "learning_rate": 7.909872265253431e-06, + "loss": 4.6753, + "step": 24620 + }, + { + "epoch": 1.6731213480092404, + "grad_norm": 0.48203879594802856, + "learning_rate": 7.909447615165104e-06, + "loss": 4.634, + "step": 24625 + }, + { + "epoch": 1.6734610680799022, + "grad_norm": 0.3245142996311188, + "learning_rate": 7.909022965076777e-06, + "loss": 4.7622, + "step": 24630 + }, + { + "epoch": 1.6738007881505639, + "grad_norm": 0.35675475001335144, + "learning_rate": 7.90859831498845e-06, + "loss": 4.5207, + "step": 24635 + }, + { + "epoch": 1.6741405082212257, + "grad_norm": 0.3819650113582611, + "learning_rate": 7.908173664900123e-06, + "loss": 4.7048, + "step": 24640 + }, + { + "epoch": 1.6744802282918876, + "grad_norm": 0.3596920669078827, + "learning_rate": 7.907749014811795e-06, + "loss": 4.7626, + "step": 24645 + }, + { + "epoch": 1.6748199483625492, + "grad_norm": 0.4099414348602295, + "learning_rate": 7.907324364723468e-06, + "loss": 4.7003, + "step": 24650 + }, + { + "epoch": 1.675159668433211, + "grad_norm": 0.5440972447395325, + "learning_rate": 7.906899714635141e-06, + "loss": 4.7894, + "step": 24655 + }, + { + "epoch": 1.675499388503873, + "grad_norm": 0.3819756507873535, + "learning_rate": 7.906475064546814e-06, + "loss": 4.5751, + "step": 24660 + }, + { + "epoch": 1.6758391085745346, + "grad_norm": 0.49045976996421814, + "learning_rate": 7.906050414458487e-06, + "loss": 4.8587, + "step": 24665 + }, + { + "epoch": 1.6761788286451964, + "grad_norm": 0.6015753746032715, + "learning_rate": 7.90562576437016e-06, + "loss": 4.7746, + "step": 24670 + }, + { + "epoch": 1.6765185487158583, + "grad_norm": 0.3695853650569916, + "learning_rate": 7.905201114281832e-06, + "loss": 4.8216, + "step": 24675 + }, + { + "epoch": 1.67685826878652, + "grad_norm": 0.44260719418525696, + "learning_rate": 7.904776464193505e-06, + "loss": 4.5702, + "step": 24680 + }, + { + "epoch": 1.6771979888571815, + "grad_norm": 0.4718773663043976, + "learning_rate": 7.904351814105178e-06, + "loss": 4.9117, + "step": 24685 + }, + { + "epoch": 1.6775377089278436, + "grad_norm": 0.4188182055950165, + "learning_rate": 7.90392716401685e-06, + "loss": 4.6784, + "step": 24690 + }, + { + "epoch": 1.6778774289985052, + "grad_norm": 0.5544421076774597, + "learning_rate": 7.903502513928523e-06, + "loss": 4.7483, + "step": 24695 + }, + { + "epoch": 1.6782171490691669, + "grad_norm": 0.42980486154556274, + "learning_rate": 7.903077863840196e-06, + "loss": 4.6967, + "step": 24700 + }, + { + "epoch": 1.678556869139829, + "grad_norm": 0.44648391008377075, + "learning_rate": 7.902653213751869e-06, + "loss": 4.6367, + "step": 24705 + }, + { + "epoch": 1.6788965892104906, + "grad_norm": 0.3308204710483551, + "learning_rate": 7.902228563663542e-06, + "loss": 4.774, + "step": 24710 + }, + { + "epoch": 1.6792363092811522, + "grad_norm": 0.3997538685798645, + "learning_rate": 7.901803913575215e-06, + "loss": 4.8493, + "step": 24715 + }, + { + "epoch": 1.6795760293518143, + "grad_norm": 0.4122156798839569, + "learning_rate": 7.901379263486887e-06, + "loss": 4.6203, + "step": 24720 + }, + { + "epoch": 1.679915749422476, + "grad_norm": 0.3647080063819885, + "learning_rate": 7.90095461339856e-06, + "loss": 4.5592, + "step": 24725 + }, + { + "epoch": 1.6802554694931375, + "grad_norm": 0.5947235822677612, + "learning_rate": 7.900529963310233e-06, + "loss": 4.5832, + "step": 24730 + }, + { + "epoch": 1.6805951895637994, + "grad_norm": 0.6257196068763733, + "learning_rate": 7.900105313221906e-06, + "loss": 4.6351, + "step": 24735 + }, + { + "epoch": 1.6809349096344612, + "grad_norm": 0.3458637595176697, + "learning_rate": 7.899680663133579e-06, + "loss": 4.4813, + "step": 24740 + }, + { + "epoch": 1.6812746297051229, + "grad_norm": 0.44495338201522827, + "learning_rate": 7.899256013045251e-06, + "loss": 4.9336, + "step": 24745 + }, + { + "epoch": 1.6816143497757847, + "grad_norm": 0.4177995026111603, + "learning_rate": 7.898831362956924e-06, + "loss": 4.8294, + "step": 24750 + }, + { + "epoch": 1.6819540698464466, + "grad_norm": 0.45578113198280334, + "learning_rate": 7.898406712868597e-06, + "loss": 4.854, + "step": 24755 + }, + { + "epoch": 1.6822937899171082, + "grad_norm": 0.46056923270225525, + "learning_rate": 7.89798206278027e-06, + "loss": 4.6338, + "step": 24760 + }, + { + "epoch": 1.68263350998777, + "grad_norm": 0.3842639625072479, + "learning_rate": 7.897557412691943e-06, + "loss": 4.3249, + "step": 24765 + }, + { + "epoch": 1.682973230058432, + "grad_norm": 0.5082845091819763, + "learning_rate": 7.897132762603615e-06, + "loss": 4.6765, + "step": 24770 + }, + { + "epoch": 1.6833129501290935, + "grad_norm": 0.4215257167816162, + "learning_rate": 7.896708112515288e-06, + "loss": 4.5104, + "step": 24775 + }, + { + "epoch": 1.6836526701997554, + "grad_norm": 0.3707644045352936, + "learning_rate": 7.896283462426961e-06, + "loss": 4.9838, + "step": 24780 + }, + { + "epoch": 1.6839923902704172, + "grad_norm": 0.4007570445537567, + "learning_rate": 7.895858812338634e-06, + "loss": 4.5569, + "step": 24785 + }, + { + "epoch": 1.6843321103410789, + "grad_norm": 0.35617804527282715, + "learning_rate": 7.895434162250307e-06, + "loss": 4.5332, + "step": 24790 + }, + { + "epoch": 1.6846718304117407, + "grad_norm": 0.4533764123916626, + "learning_rate": 7.89500951216198e-06, + "loss": 4.9076, + "step": 24795 + }, + { + "epoch": 1.6850115504824026, + "grad_norm": 0.3711850047111511, + "learning_rate": 7.894584862073652e-06, + "loss": 4.6575, + "step": 24800 + }, + { + "epoch": 1.6853512705530642, + "grad_norm": 0.5715999603271484, + "learning_rate": 7.894160211985325e-06, + "loss": 4.4877, + "step": 24805 + }, + { + "epoch": 1.685690990623726, + "grad_norm": 0.8556183576583862, + "learning_rate": 7.893735561896998e-06, + "loss": 4.7113, + "step": 24810 + }, + { + "epoch": 1.686030710694388, + "grad_norm": 0.36973389983177185, + "learning_rate": 7.89331091180867e-06, + "loss": 4.8822, + "step": 24815 + }, + { + "epoch": 1.6863704307650496, + "grad_norm": 0.3519313633441925, + "learning_rate": 7.892886261720343e-06, + "loss": 4.6006, + "step": 24820 + }, + { + "epoch": 1.6867101508357114, + "grad_norm": 0.34575578570365906, + "learning_rate": 7.892461611632016e-06, + "loss": 4.7614, + "step": 24825 + }, + { + "epoch": 1.6870498709063733, + "grad_norm": 0.42081108689308167, + "learning_rate": 7.892036961543689e-06, + "loss": 4.7595, + "step": 24830 + }, + { + "epoch": 1.687389590977035, + "grad_norm": 0.41868317127227783, + "learning_rate": 7.891612311455362e-06, + "loss": 4.7182, + "step": 24835 + }, + { + "epoch": 1.6877293110476967, + "grad_norm": 0.4015575647354126, + "learning_rate": 7.891187661367035e-06, + "loss": 4.604, + "step": 24840 + }, + { + "epoch": 1.6880690311183586, + "grad_norm": 0.5237539410591125, + "learning_rate": 7.890763011278707e-06, + "loss": 4.7134, + "step": 24845 + }, + { + "epoch": 1.6884087511890202, + "grad_norm": 0.5730625987052917, + "learning_rate": 7.89033836119038e-06, + "loss": 4.6929, + "step": 24850 + }, + { + "epoch": 1.6887484712596819, + "grad_norm": 0.40010881423950195, + "learning_rate": 7.889913711102053e-06, + "loss": 4.5409, + "step": 24855 + }, + { + "epoch": 1.689088191330344, + "grad_norm": 0.46562448143959045, + "learning_rate": 7.889489061013726e-06, + "loss": 4.5299, + "step": 24860 + }, + { + "epoch": 1.6894279114010056, + "grad_norm": 0.37030550837516785, + "learning_rate": 7.889064410925399e-06, + "loss": 4.4757, + "step": 24865 + }, + { + "epoch": 1.6897676314716672, + "grad_norm": 0.4137650430202484, + "learning_rate": 7.888639760837071e-06, + "loss": 4.394, + "step": 24870 + }, + { + "epoch": 1.6901073515423293, + "grad_norm": 0.2782476842403412, + "learning_rate": 7.888215110748744e-06, + "loss": 4.7167, + "step": 24875 + }, + { + "epoch": 1.690447071612991, + "grad_norm": 0.33911505341529846, + "learning_rate": 7.887790460660415e-06, + "loss": 4.5521, + "step": 24880 + }, + { + "epoch": 1.6907867916836525, + "grad_norm": 0.4592677354812622, + "learning_rate": 7.88736581057209e-06, + "loss": 4.3901, + "step": 24885 + }, + { + "epoch": 1.6911265117543146, + "grad_norm": 0.42196449637413025, + "learning_rate": 7.886941160483763e-06, + "loss": 4.5524, + "step": 24890 + }, + { + "epoch": 1.6914662318249762, + "grad_norm": 0.49453046917915344, + "learning_rate": 7.886516510395434e-06, + "loss": 4.6795, + "step": 24895 + }, + { + "epoch": 1.6918059518956379, + "grad_norm": 0.4631955325603485, + "learning_rate": 7.886176790324773e-06, + "loss": 4.4244, + "step": 24900 + }, + { + "epoch": 1.6921456719662997, + "grad_norm": 0.4514710605144501, + "learning_rate": 7.885752140236445e-06, + "loss": 4.4591, + "step": 24905 + }, + { + "epoch": 1.6924853920369616, + "grad_norm": 0.4033190906047821, + "learning_rate": 7.885327490148118e-06, + "loss": 4.5962, + "step": 24910 + }, + { + "epoch": 1.6928251121076232, + "grad_norm": 0.4481847584247589, + "learning_rate": 7.884902840059791e-06, + "loss": 4.7044, + "step": 24915 + }, + { + "epoch": 1.693164832178285, + "grad_norm": 0.3597300350666046, + "learning_rate": 7.884478189971464e-06, + "loss": 4.703, + "step": 24920 + }, + { + "epoch": 1.693504552248947, + "grad_norm": 0.4409237205982208, + "learning_rate": 7.884053539883138e-06, + "loss": 4.8481, + "step": 24925 + }, + { + "epoch": 1.6938442723196085, + "grad_norm": 0.5726591348648071, + "learning_rate": 7.88362888979481e-06, + "loss": 4.4933, + "step": 24930 + }, + { + "epoch": 1.6941839923902704, + "grad_norm": 0.3666136562824249, + "learning_rate": 7.883204239706482e-06, + "loss": 4.7358, + "step": 24935 + }, + { + "epoch": 1.6945237124609323, + "grad_norm": 0.43596044182777405, + "learning_rate": 7.882779589618155e-06, + "loss": 4.6881, + "step": 24940 + }, + { + "epoch": 1.6948634325315939, + "grad_norm": 0.3743937015533447, + "learning_rate": 7.882354939529828e-06, + "loss": 4.4476, + "step": 24945 + }, + { + "epoch": 1.6952031526022557, + "grad_norm": 0.47819361090660095, + "learning_rate": 7.8819302894415e-06, + "loss": 4.7241, + "step": 24950 + }, + { + "epoch": 1.6955428726729176, + "grad_norm": 0.3954106569290161, + "learning_rate": 7.881505639353173e-06, + "loss": 4.6225, + "step": 24955 + }, + { + "epoch": 1.6958825927435792, + "grad_norm": 0.44774913787841797, + "learning_rate": 7.881080989264846e-06, + "loss": 4.6113, + "step": 24960 + }, + { + "epoch": 1.696222312814241, + "grad_norm": 0.35919198393821716, + "learning_rate": 7.880656339176519e-06, + "loss": 4.6568, + "step": 24965 + }, + { + "epoch": 1.696562032884903, + "grad_norm": 0.33912286162376404, + "learning_rate": 7.880231689088192e-06, + "loss": 4.6434, + "step": 24970 + }, + { + "epoch": 1.6969017529555646, + "grad_norm": 0.35249343514442444, + "learning_rate": 7.879807038999865e-06, + "loss": 4.6567, + "step": 24975 + }, + { + "epoch": 1.6972414730262264, + "grad_norm": 0.455807089805603, + "learning_rate": 7.879382388911537e-06, + "loss": 4.5738, + "step": 24980 + }, + { + "epoch": 1.6975811930968883, + "grad_norm": 0.5568078756332397, + "learning_rate": 7.87895773882321e-06, + "loss": 4.6394, + "step": 24985 + }, + { + "epoch": 1.69792091316755, + "grad_norm": 0.43301886320114136, + "learning_rate": 7.878533088734883e-06, + "loss": 4.464, + "step": 24990 + }, + { + "epoch": 1.6982606332382117, + "grad_norm": 0.4022790193557739, + "learning_rate": 7.878108438646556e-06, + "loss": 4.8915, + "step": 24995 + }, + { + "epoch": 1.6986003533088736, + "grad_norm": 0.42233482003211975, + "learning_rate": 7.877683788558229e-06, + "loss": 4.8341, + "step": 25000 + }, + { + "epoch": 1.6989400733795352, + "grad_norm": 0.3735266625881195, + "learning_rate": 7.877259138469901e-06, + "loss": 4.6123, + "step": 25005 + }, + { + "epoch": 1.699279793450197, + "grad_norm": 0.500694215297699, + "learning_rate": 7.876834488381574e-06, + "loss": 4.7947, + "step": 25010 + }, + { + "epoch": 1.699619513520859, + "grad_norm": 0.5608490705490112, + "learning_rate": 7.876409838293247e-06, + "loss": 4.4243, + "step": 25015 + }, + { + "epoch": 1.6999592335915206, + "grad_norm": 0.4320232570171356, + "learning_rate": 7.87598518820492e-06, + "loss": 4.7982, + "step": 25020 + }, + { + "epoch": 1.7002989536621822, + "grad_norm": 0.45891472697257996, + "learning_rate": 7.875560538116593e-06, + "loss": 4.6353, + "step": 25025 + }, + { + "epoch": 1.7006386737328443, + "grad_norm": 0.45325276255607605, + "learning_rate": 7.875135888028265e-06, + "loss": 4.534, + "step": 25030 + }, + { + "epoch": 1.700978393803506, + "grad_norm": 0.4806153476238251, + "learning_rate": 7.874711237939938e-06, + "loss": 4.507, + "step": 25035 + }, + { + "epoch": 1.7013181138741675, + "grad_norm": 0.40890613198280334, + "learning_rate": 7.874286587851611e-06, + "loss": 4.5595, + "step": 25040 + }, + { + "epoch": 1.7016578339448296, + "grad_norm": 0.45894020795822144, + "learning_rate": 7.873861937763284e-06, + "loss": 4.6076, + "step": 25045 + }, + { + "epoch": 1.7019975540154912, + "grad_norm": 0.5420960187911987, + "learning_rate": 7.873437287674957e-06, + "loss": 4.619, + "step": 25050 + }, + { + "epoch": 1.7023372740861529, + "grad_norm": 0.5137192010879517, + "learning_rate": 7.87301263758663e-06, + "loss": 4.7187, + "step": 25055 + }, + { + "epoch": 1.702676994156815, + "grad_norm": 0.40703654289245605, + "learning_rate": 7.872587987498302e-06, + "loss": 4.5193, + "step": 25060 + }, + { + "epoch": 1.7030167142274766, + "grad_norm": 0.4248145818710327, + "learning_rate": 7.872163337409975e-06, + "loss": 4.564, + "step": 25065 + }, + { + "epoch": 1.7033564342981382, + "grad_norm": 0.3541250228881836, + "learning_rate": 7.871738687321648e-06, + "loss": 4.6448, + "step": 25070 + }, + { + "epoch": 1.7036961543688, + "grad_norm": 0.40628740191459656, + "learning_rate": 7.87131403723332e-06, + "loss": 4.4083, + "step": 25075 + }, + { + "epoch": 1.704035874439462, + "grad_norm": 0.40510231256484985, + "learning_rate": 7.870889387144993e-06, + "loss": 4.6476, + "step": 25080 + }, + { + "epoch": 1.7043755945101235, + "grad_norm": 0.41931018233299255, + "learning_rate": 7.870464737056666e-06, + "loss": 4.7923, + "step": 25085 + }, + { + "epoch": 1.7047153145807854, + "grad_norm": 0.33029359579086304, + "learning_rate": 7.870040086968339e-06, + "loss": 4.7099, + "step": 25090 + }, + { + "epoch": 1.7050550346514473, + "grad_norm": 0.4037790596485138, + "learning_rate": 7.869615436880012e-06, + "loss": 4.976, + "step": 25095 + }, + { + "epoch": 1.7053947547221089, + "grad_norm": 0.3355174660682678, + "learning_rate": 7.869190786791685e-06, + "loss": 4.613, + "step": 25100 + }, + { + "epoch": 1.7057344747927707, + "grad_norm": 0.39365464448928833, + "learning_rate": 7.868766136703357e-06, + "loss": 4.7693, + "step": 25105 + }, + { + "epoch": 1.7060741948634326, + "grad_norm": 0.32101038098335266, + "learning_rate": 7.86834148661503e-06, + "loss": 4.4297, + "step": 25110 + }, + { + "epoch": 1.7064139149340942, + "grad_norm": 0.33305689692497253, + "learning_rate": 7.867916836526703e-06, + "loss": 4.5796, + "step": 25115 + }, + { + "epoch": 1.706753635004756, + "grad_norm": 0.43237802386283875, + "learning_rate": 7.867492186438376e-06, + "loss": 4.6645, + "step": 25120 + }, + { + "epoch": 1.707093355075418, + "grad_norm": 0.5885447263717651, + "learning_rate": 7.867067536350049e-06, + "loss": 4.7582, + "step": 25125 + }, + { + "epoch": 1.7074330751460796, + "grad_norm": 0.3540913164615631, + "learning_rate": 7.86664288626172e-06, + "loss": 4.8862, + "step": 25130 + }, + { + "epoch": 1.7077727952167414, + "grad_norm": 0.3379102647304535, + "learning_rate": 7.866218236173394e-06, + "loss": 4.3854, + "step": 25135 + }, + { + "epoch": 1.7081125152874033, + "grad_norm": 0.41611382365226746, + "learning_rate": 7.865793586085067e-06, + "loss": 4.8046, + "step": 25140 + }, + { + "epoch": 1.708452235358065, + "grad_norm": 0.547683596611023, + "learning_rate": 7.865368935996738e-06, + "loss": 4.6069, + "step": 25145 + }, + { + "epoch": 1.7087919554287267, + "grad_norm": 0.3939986526966095, + "learning_rate": 7.864944285908413e-06, + "loss": 4.9195, + "step": 25150 + }, + { + "epoch": 1.7091316754993886, + "grad_norm": 0.4545876681804657, + "learning_rate": 7.864519635820085e-06, + "loss": 4.7623, + "step": 25155 + }, + { + "epoch": 1.7094713955700502, + "grad_norm": 0.5958176255226135, + "learning_rate": 7.864094985731757e-06, + "loss": 4.7904, + "step": 25160 + }, + { + "epoch": 1.709811115640712, + "grad_norm": 0.46268829703330994, + "learning_rate": 7.863670335643431e-06, + "loss": 4.5094, + "step": 25165 + }, + { + "epoch": 1.710150835711374, + "grad_norm": 0.43236371874809265, + "learning_rate": 7.863245685555104e-06, + "loss": 4.743, + "step": 25170 + }, + { + "epoch": 1.7104905557820356, + "grad_norm": 0.4551037549972534, + "learning_rate": 7.862821035466775e-06, + "loss": 4.8373, + "step": 25175 + }, + { + "epoch": 1.7108302758526974, + "grad_norm": 0.41879531741142273, + "learning_rate": 7.86239638537845e-06, + "loss": 4.806, + "step": 25180 + }, + { + "epoch": 1.7111699959233593, + "grad_norm": 0.3814605176448822, + "learning_rate": 7.861971735290122e-06, + "loss": 4.7932, + "step": 25185 + }, + { + "epoch": 1.711509715994021, + "grad_norm": 0.70428067445755, + "learning_rate": 7.861547085201793e-06, + "loss": 5.0181, + "step": 25190 + }, + { + "epoch": 1.7118494360646825, + "grad_norm": 0.5036877989768982, + "learning_rate": 7.861122435113468e-06, + "loss": 4.8476, + "step": 25195 + }, + { + "epoch": 1.7121891561353446, + "grad_norm": 0.4549928605556488, + "learning_rate": 7.860697785025139e-06, + "loss": 4.4756, + "step": 25200 + }, + { + "epoch": 1.7125288762060062, + "grad_norm": 0.41295310854911804, + "learning_rate": 7.860273134936812e-06, + "loss": 4.5319, + "step": 25205 + }, + { + "epoch": 1.7128685962766679, + "grad_norm": 0.42931342124938965, + "learning_rate": 7.859848484848486e-06, + "loss": 4.7042, + "step": 25210 + }, + { + "epoch": 1.71320831634733, + "grad_norm": 0.38658902049064636, + "learning_rate": 7.859423834760157e-06, + "loss": 4.4668, + "step": 25215 + }, + { + "epoch": 1.7135480364179916, + "grad_norm": 0.38623911142349243, + "learning_rate": 7.85899918467183e-06, + "loss": 4.8354, + "step": 25220 + }, + { + "epoch": 1.7138877564886532, + "grad_norm": 0.47556254267692566, + "learning_rate": 7.858574534583505e-06, + "loss": 4.7963, + "step": 25225 + }, + { + "epoch": 1.7142274765593153, + "grad_norm": 0.45663803815841675, + "learning_rate": 7.858149884495176e-06, + "loss": 4.4604, + "step": 25230 + }, + { + "epoch": 1.714567196629977, + "grad_norm": 0.42087483406066895, + "learning_rate": 7.857725234406849e-06, + "loss": 4.6347, + "step": 25235 + }, + { + "epoch": 1.7149069167006386, + "grad_norm": 0.5006410479545593, + "learning_rate": 7.857300584318523e-06, + "loss": 5.0256, + "step": 25240 + }, + { + "epoch": 1.7152466367713004, + "grad_norm": 0.3296578824520111, + "learning_rate": 7.856875934230194e-06, + "loss": 4.6217, + "step": 25245 + }, + { + "epoch": 1.7155863568419623, + "grad_norm": 0.40127283334732056, + "learning_rate": 7.856451284141867e-06, + "loss": 4.6883, + "step": 25250 + }, + { + "epoch": 1.7159260769126239, + "grad_norm": 0.4206618666648865, + "learning_rate": 7.856026634053541e-06, + "loss": 4.6454, + "step": 25255 + }, + { + "epoch": 1.7162657969832857, + "grad_norm": 0.47606417536735535, + "learning_rate": 7.855601983965213e-06, + "loss": 4.6189, + "step": 25260 + }, + { + "epoch": 1.7166055170539476, + "grad_norm": 0.4254990518093109, + "learning_rate": 7.855177333876887e-06, + "loss": 4.8577, + "step": 25265 + }, + { + "epoch": 1.7169452371246092, + "grad_norm": 0.3284289836883545, + "learning_rate": 7.85475268378856e-06, + "loss": 4.692, + "step": 25270 + }, + { + "epoch": 1.717284957195271, + "grad_norm": 0.32110413908958435, + "learning_rate": 7.854328033700231e-06, + "loss": 4.4889, + "step": 25275 + }, + { + "epoch": 1.717624677265933, + "grad_norm": 0.565393328666687, + "learning_rate": 7.853903383611905e-06, + "loss": 4.827, + "step": 25280 + }, + { + "epoch": 1.7179643973365946, + "grad_norm": 0.3805563449859619, + "learning_rate": 7.853478733523577e-06, + "loss": 4.6271, + "step": 25285 + }, + { + "epoch": 1.7183041174072564, + "grad_norm": 0.36509084701538086, + "learning_rate": 7.85305408343525e-06, + "loss": 4.6706, + "step": 25290 + }, + { + "epoch": 1.7186438374779183, + "grad_norm": 0.5262141823768616, + "learning_rate": 7.852629433346924e-06, + "loss": 4.5628, + "step": 25295 + }, + { + "epoch": 1.71898355754858, + "grad_norm": 0.33030250668525696, + "learning_rate": 7.852204783258595e-06, + "loss": 4.5859, + "step": 25300 + }, + { + "epoch": 1.7193232776192418, + "grad_norm": 0.4879510700702667, + "learning_rate": 7.851780133170268e-06, + "loss": 4.64, + "step": 25305 + }, + { + "epoch": 1.7196629976899036, + "grad_norm": 0.4210696518421173, + "learning_rate": 7.851355483081942e-06, + "loss": 4.8771, + "step": 25310 + }, + { + "epoch": 1.7200027177605652, + "grad_norm": 0.5237396955490112, + "learning_rate": 7.850930832993613e-06, + "loss": 4.7413, + "step": 25315 + }, + { + "epoch": 1.720342437831227, + "grad_norm": 0.494505912065506, + "learning_rate": 7.850506182905286e-06, + "loss": 4.7371, + "step": 25320 + }, + { + "epoch": 1.720682157901889, + "grad_norm": 0.40730029344558716, + "learning_rate": 7.85008153281696e-06, + "loss": 4.4874, + "step": 25325 + }, + { + "epoch": 1.7210218779725506, + "grad_norm": 0.41722923517227173, + "learning_rate": 7.849656882728632e-06, + "loss": 4.6851, + "step": 25330 + }, + { + "epoch": 1.7213615980432124, + "grad_norm": 0.4664181172847748, + "learning_rate": 7.849232232640305e-06, + "loss": 4.5539, + "step": 25335 + }, + { + "epoch": 1.7217013181138743, + "grad_norm": 0.3922017216682434, + "learning_rate": 7.848807582551979e-06, + "loss": 4.5761, + "step": 25340 + }, + { + "epoch": 1.722041038184536, + "grad_norm": 0.4053286612033844, + "learning_rate": 7.84838293246365e-06, + "loss": 4.4376, + "step": 25345 + }, + { + "epoch": 1.7223807582551978, + "grad_norm": 0.4572867453098297, + "learning_rate": 7.847958282375323e-06, + "loss": 4.4628, + "step": 25350 + }, + { + "epoch": 1.7227204783258596, + "grad_norm": 0.47185540199279785, + "learning_rate": 7.847533632286996e-06, + "loss": 4.6488, + "step": 25355 + }, + { + "epoch": 1.7230601983965212, + "grad_norm": 0.3550848364830017, + "learning_rate": 7.847108982198669e-06, + "loss": 4.4487, + "step": 25360 + }, + { + "epoch": 1.7233999184671829, + "grad_norm": 0.5119723081588745, + "learning_rate": 7.846684332110341e-06, + "loss": 4.519, + "step": 25365 + }, + { + "epoch": 1.723739638537845, + "grad_norm": 0.3986358642578125, + "learning_rate": 7.846259682022014e-06, + "loss": 4.8769, + "step": 25370 + }, + { + "epoch": 1.7240793586085066, + "grad_norm": 0.48101672530174255, + "learning_rate": 7.845835031933687e-06, + "loss": 4.4434, + "step": 25375 + }, + { + "epoch": 1.7244190786791682, + "grad_norm": 0.4424699544906616, + "learning_rate": 7.84541038184536e-06, + "loss": 4.6591, + "step": 25380 + }, + { + "epoch": 1.7247587987498303, + "grad_norm": 0.5165992975234985, + "learning_rate": 7.844985731757033e-06, + "loss": 4.67, + "step": 25385 + }, + { + "epoch": 1.725098518820492, + "grad_norm": 0.35568612813949585, + "learning_rate": 7.844561081668705e-06, + "loss": 4.7768, + "step": 25390 + }, + { + "epoch": 1.7254382388911536, + "grad_norm": 0.5038064122200012, + "learning_rate": 7.844136431580378e-06, + "loss": 4.6423, + "step": 25395 + }, + { + "epoch": 1.7257779589618156, + "grad_norm": 0.4424954950809479, + "learning_rate": 7.843711781492051e-06, + "loss": 4.6546, + "step": 25400 + }, + { + "epoch": 1.7261176790324773, + "grad_norm": 0.4274149537086487, + "learning_rate": 7.843287131403724e-06, + "loss": 4.6026, + "step": 25405 + }, + { + "epoch": 1.726457399103139, + "grad_norm": 0.460077702999115, + "learning_rate": 7.842862481315397e-06, + "loss": 4.7453, + "step": 25410 + }, + { + "epoch": 1.7267971191738007, + "grad_norm": 0.43425947427749634, + "learning_rate": 7.84243783122707e-06, + "loss": 4.7532, + "step": 25415 + }, + { + "epoch": 1.7271368392444626, + "grad_norm": 0.4728919267654419, + "learning_rate": 7.842013181138742e-06, + "loss": 4.7833, + "step": 25420 + }, + { + "epoch": 1.7274765593151242, + "grad_norm": 0.402034193277359, + "learning_rate": 7.841588531050415e-06, + "loss": 4.6305, + "step": 25425 + }, + { + "epoch": 1.727816279385786, + "grad_norm": 0.545667827129364, + "learning_rate": 7.841163880962088e-06, + "loss": 4.7269, + "step": 25430 + }, + { + "epoch": 1.728155999456448, + "grad_norm": 0.5342793464660645, + "learning_rate": 7.84073923087376e-06, + "loss": 4.9881, + "step": 25435 + }, + { + "epoch": 1.7284957195271096, + "grad_norm": 0.35441529750823975, + "learning_rate": 7.840314580785433e-06, + "loss": 4.6633, + "step": 25440 + }, + { + "epoch": 1.7288354395977714, + "grad_norm": 0.4049098491668701, + "learning_rate": 7.839889930697106e-06, + "loss": 4.5218, + "step": 25445 + }, + { + "epoch": 1.7291751596684333, + "grad_norm": 0.3967810869216919, + "learning_rate": 7.839465280608779e-06, + "loss": 4.6765, + "step": 25450 + }, + { + "epoch": 1.729514879739095, + "grad_norm": 0.3930293619632721, + "learning_rate": 7.839040630520452e-06, + "loss": 4.5381, + "step": 25455 + }, + { + "epoch": 1.7298545998097568, + "grad_norm": 0.36600059270858765, + "learning_rate": 7.838615980432125e-06, + "loss": 4.878, + "step": 25460 + }, + { + "epoch": 1.7301943198804186, + "grad_norm": 0.45845672488212585, + "learning_rate": 7.838191330343797e-06, + "loss": 4.6675, + "step": 25465 + }, + { + "epoch": 1.7305340399510802, + "grad_norm": 0.5841081142425537, + "learning_rate": 7.83776668025547e-06, + "loss": 4.5643, + "step": 25470 + }, + { + "epoch": 1.730873760021742, + "grad_norm": 0.3465852439403534, + "learning_rate": 7.837342030167143e-06, + "loss": 4.779, + "step": 25475 + }, + { + "epoch": 1.731213480092404, + "grad_norm": 0.46258577704429626, + "learning_rate": 7.836917380078816e-06, + "loss": 4.3797, + "step": 25480 + }, + { + "epoch": 1.7315532001630656, + "grad_norm": 0.5583425164222717, + "learning_rate": 7.836492729990489e-06, + "loss": 4.7465, + "step": 25485 + }, + { + "epoch": 1.7318929202337274, + "grad_norm": 0.5875880718231201, + "learning_rate": 7.836068079902161e-06, + "loss": 4.8054, + "step": 25490 + }, + { + "epoch": 1.7322326403043893, + "grad_norm": 0.4316066801548004, + "learning_rate": 7.835643429813834e-06, + "loss": 4.3898, + "step": 25495 + }, + { + "epoch": 1.732572360375051, + "grad_norm": 0.544813871383667, + "learning_rate": 7.835218779725507e-06, + "loss": 4.7198, + "step": 25500 + }, + { + "epoch": 1.7329120804457128, + "grad_norm": 0.5127374529838562, + "learning_rate": 7.83479412963718e-06, + "loss": 4.622, + "step": 25505 + }, + { + "epoch": 1.7332518005163746, + "grad_norm": 0.4149376153945923, + "learning_rate": 7.834369479548853e-06, + "loss": 4.7217, + "step": 25510 + }, + { + "epoch": 1.7335915205870362, + "grad_norm": 0.40121760964393616, + "learning_rate": 7.833944829460525e-06, + "loss": 4.5816, + "step": 25515 + }, + { + "epoch": 1.733931240657698, + "grad_norm": 0.447351336479187, + "learning_rate": 7.833520179372198e-06, + "loss": 4.6211, + "step": 25520 + }, + { + "epoch": 1.73427096072836, + "grad_norm": 0.4510680139064789, + "learning_rate": 7.833095529283871e-06, + "loss": 4.5462, + "step": 25525 + }, + { + "epoch": 1.7346106807990216, + "grad_norm": 0.4663338363170624, + "learning_rate": 7.832670879195544e-06, + "loss": 4.3818, + "step": 25530 + }, + { + "epoch": 1.7349504008696832, + "grad_norm": 0.3539447784423828, + "learning_rate": 7.832246229107217e-06, + "loss": 4.5079, + "step": 25535 + }, + { + "epoch": 1.7352901209403453, + "grad_norm": 0.3859492540359497, + "learning_rate": 7.83182157901889e-06, + "loss": 4.5018, + "step": 25540 + }, + { + "epoch": 1.735629841011007, + "grad_norm": 0.513664722442627, + "learning_rate": 7.83139692893056e-06, + "loss": 4.6669, + "step": 25545 + }, + { + "epoch": 1.7359695610816686, + "grad_norm": 0.49039068818092346, + "learning_rate": 7.830972278842235e-06, + "loss": 4.748, + "step": 25550 + }, + { + "epoch": 1.7363092811523306, + "grad_norm": 0.44590121507644653, + "learning_rate": 7.830547628753908e-06, + "loss": 4.8438, + "step": 25555 + }, + { + "epoch": 1.7366490012229923, + "grad_norm": 0.36376577615737915, + "learning_rate": 7.830122978665579e-06, + "loss": 4.4266, + "step": 25560 + }, + { + "epoch": 1.736988721293654, + "grad_norm": 0.4183202087879181, + "learning_rate": 7.829698328577253e-06, + "loss": 4.5149, + "step": 25565 + }, + { + "epoch": 1.737328441364316, + "grad_norm": 0.6619214415550232, + "learning_rate": 7.829273678488926e-06, + "loss": 4.5021, + "step": 25570 + }, + { + "epoch": 1.7376681614349776, + "grad_norm": 0.4081850051879883, + "learning_rate": 7.828849028400597e-06, + "loss": 4.453, + "step": 25575 + }, + { + "epoch": 1.7380078815056392, + "grad_norm": 0.4749740958213806, + "learning_rate": 7.828424378312272e-06, + "loss": 4.7925, + "step": 25580 + }, + { + "epoch": 1.738347601576301, + "grad_norm": 0.4038395285606384, + "learning_rate": 7.827999728223945e-06, + "loss": 4.5439, + "step": 25585 + }, + { + "epoch": 1.738687321646963, + "grad_norm": 0.3811510503292084, + "learning_rate": 7.827575078135616e-06, + "loss": 4.5638, + "step": 25590 + }, + { + "epoch": 1.7390270417176246, + "grad_norm": 0.42365947365760803, + "learning_rate": 7.82715042804729e-06, + "loss": 4.6319, + "step": 25595 + }, + { + "epoch": 1.7393667617882864, + "grad_norm": 0.45255622267723083, + "learning_rate": 7.826725777958963e-06, + "loss": 4.8395, + "step": 25600 + }, + { + "epoch": 1.7397064818589483, + "grad_norm": 0.5059241056442261, + "learning_rate": 7.826301127870636e-06, + "loss": 4.8075, + "step": 25605 + }, + { + "epoch": 1.74004620192961, + "grad_norm": 0.46475327014923096, + "learning_rate": 7.825876477782309e-06, + "loss": 4.5071, + "step": 25610 + }, + { + "epoch": 1.7403859220002718, + "grad_norm": 0.43080878257751465, + "learning_rate": 7.825451827693981e-06, + "loss": 4.8646, + "step": 25615 + }, + { + "epoch": 1.7407256420709336, + "grad_norm": 0.4844416677951813, + "learning_rate": 7.825027177605654e-06, + "loss": 4.483, + "step": 25620 + }, + { + "epoch": 1.7410653621415952, + "grad_norm": 0.4506946802139282, + "learning_rate": 7.824602527517327e-06, + "loss": 4.6139, + "step": 25625 + }, + { + "epoch": 1.741405082212257, + "grad_norm": 0.3891696333885193, + "learning_rate": 7.824177877428998e-06, + "loss": 4.5048, + "step": 25630 + }, + { + "epoch": 1.741744802282919, + "grad_norm": 0.4200358986854553, + "learning_rate": 7.823753227340673e-06, + "loss": 4.715, + "step": 25635 + }, + { + "epoch": 1.7420845223535806, + "grad_norm": 0.3518584668636322, + "learning_rate": 7.823328577252345e-06, + "loss": 4.502, + "step": 25640 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.4806276857852936, + "learning_rate": 7.822903927164017e-06, + "loss": 4.5983, + "step": 25645 + }, + { + "epoch": 1.7427639624949043, + "grad_norm": 0.46751663088798523, + "learning_rate": 7.822479277075691e-06, + "loss": 4.6186, + "step": 25650 + }, + { + "epoch": 1.743103682565566, + "grad_norm": 0.3752039968967438, + "learning_rate": 7.822054626987364e-06, + "loss": 4.5227, + "step": 25655 + }, + { + "epoch": 1.7434434026362278, + "grad_norm": 0.3795965015888214, + "learning_rate": 7.821629976899035e-06, + "loss": 4.4557, + "step": 25660 + }, + { + "epoch": 1.7437831227068896, + "grad_norm": 0.5159714221954346, + "learning_rate": 7.82120532681071e-06, + "loss": 4.613, + "step": 25665 + }, + { + "epoch": 1.7441228427775513, + "grad_norm": 0.31269776821136475, + "learning_rate": 7.820780676722382e-06, + "loss": 4.7523, + "step": 25670 + }, + { + "epoch": 1.744462562848213, + "grad_norm": 0.38371962308883667, + "learning_rate": 7.820356026634053e-06, + "loss": 4.4457, + "step": 25675 + }, + { + "epoch": 1.744802282918875, + "grad_norm": 0.35330456495285034, + "learning_rate": 7.819931376545728e-06, + "loss": 4.5364, + "step": 25680 + }, + { + "epoch": 1.7451420029895366, + "grad_norm": 0.4572889804840088, + "learning_rate": 7.8195067264574e-06, + "loss": 4.5386, + "step": 25685 + }, + { + "epoch": 1.7454817230601984, + "grad_norm": 0.49569326639175415, + "learning_rate": 7.819082076369072e-06, + "loss": 4.9422, + "step": 25690 + }, + { + "epoch": 1.7458214431308603, + "grad_norm": 0.4394396245479584, + "learning_rate": 7.818657426280746e-06, + "loss": 4.7705, + "step": 25695 + }, + { + "epoch": 1.746161163201522, + "grad_norm": 0.36789175868034363, + "learning_rate": 7.818232776192417e-06, + "loss": 4.4399, + "step": 25700 + }, + { + "epoch": 1.7465008832721836, + "grad_norm": 0.5713211894035339, + "learning_rate": 7.81780812610409e-06, + "loss": 4.802, + "step": 25705 + }, + { + "epoch": 1.7468406033428456, + "grad_norm": 0.5535262227058411, + "learning_rate": 7.817383476015765e-06, + "loss": 4.793, + "step": 25710 + }, + { + "epoch": 1.7471803234135073, + "grad_norm": 0.5908305644989014, + "learning_rate": 7.816958825927436e-06, + "loss": 4.619, + "step": 25715 + }, + { + "epoch": 1.747520043484169, + "grad_norm": 0.4590142071247101, + "learning_rate": 7.816534175839109e-06, + "loss": 4.5665, + "step": 25720 + }, + { + "epoch": 1.747859763554831, + "grad_norm": 0.4024465084075928, + "learning_rate": 7.816109525750783e-06, + "loss": 4.7796, + "step": 25725 + }, + { + "epoch": 1.7481994836254926, + "grad_norm": 0.38263601064682007, + "learning_rate": 7.815684875662454e-06, + "loss": 4.5129, + "step": 25730 + }, + { + "epoch": 1.7485392036961542, + "grad_norm": 0.3499296009540558, + "learning_rate": 7.815260225574127e-06, + "loss": 4.4482, + "step": 25735 + }, + { + "epoch": 1.7488789237668163, + "grad_norm": 0.5551217794418335, + "learning_rate": 7.814835575485801e-06, + "loss": 4.6809, + "step": 25740 + }, + { + "epoch": 1.749218643837478, + "grad_norm": 0.41390371322631836, + "learning_rate": 7.814410925397473e-06, + "loss": 4.3584, + "step": 25745 + }, + { + "epoch": 1.7495583639081396, + "grad_norm": 0.37352654337882996, + "learning_rate": 7.813986275309145e-06, + "loss": 4.7375, + "step": 25750 + }, + { + "epoch": 1.7498980839788014, + "grad_norm": 0.6110312342643738, + "learning_rate": 7.81356162522082e-06, + "loss": 4.6593, + "step": 25755 + }, + { + "epoch": 1.7502378040494633, + "grad_norm": 0.4589904546737671, + "learning_rate": 7.813136975132491e-06, + "loss": 4.703, + "step": 25760 + }, + { + "epoch": 1.750577524120125, + "grad_norm": 0.4372006058692932, + "learning_rate": 7.812712325044164e-06, + "loss": 4.6783, + "step": 25765 + }, + { + "epoch": 1.7509172441907868, + "grad_norm": 0.34005457162857056, + "learning_rate": 7.812287674955837e-06, + "loss": 4.4028, + "step": 25770 + }, + { + "epoch": 1.7512569642614486, + "grad_norm": 0.4195004403591156, + "learning_rate": 7.81186302486751e-06, + "loss": 4.8943, + "step": 25775 + }, + { + "epoch": 1.7515966843321102, + "grad_norm": 0.3865126371383667, + "learning_rate": 7.811438374779182e-06, + "loss": 4.6416, + "step": 25780 + }, + { + "epoch": 1.751936404402772, + "grad_norm": 0.47707808017730713, + "learning_rate": 7.811013724690855e-06, + "loss": 4.3841, + "step": 25785 + }, + { + "epoch": 1.752276124473434, + "grad_norm": 0.4224635362625122, + "learning_rate": 7.810589074602528e-06, + "loss": 4.9836, + "step": 25790 + }, + { + "epoch": 1.7526158445440956, + "grad_norm": 0.38415613770484924, + "learning_rate": 7.8101644245142e-06, + "loss": 4.5707, + "step": 25795 + }, + { + "epoch": 1.7529555646147574, + "grad_norm": 0.41947805881500244, + "learning_rate": 7.809739774425873e-06, + "loss": 4.6403, + "step": 25800 + }, + { + "epoch": 1.7532952846854193, + "grad_norm": 0.550134003162384, + "learning_rate": 7.809315124337546e-06, + "loss": 4.8039, + "step": 25805 + }, + { + "epoch": 1.753635004756081, + "grad_norm": 0.3057790994644165, + "learning_rate": 7.808890474249219e-06, + "loss": 4.7184, + "step": 25810 + }, + { + "epoch": 1.7539747248267428, + "grad_norm": 0.38505685329437256, + "learning_rate": 7.808465824160892e-06, + "loss": 4.7944, + "step": 25815 + }, + { + "epoch": 1.7543144448974046, + "grad_norm": 0.5086534023284912, + "learning_rate": 7.808041174072565e-06, + "loss": 4.9136, + "step": 25820 + }, + { + "epoch": 1.7546541649680663, + "grad_norm": 0.5279123783111572, + "learning_rate": 7.807616523984237e-06, + "loss": 4.5128, + "step": 25825 + }, + { + "epoch": 1.754993885038728, + "grad_norm": 0.6609620451927185, + "learning_rate": 7.80719187389591e-06, + "loss": 4.9094, + "step": 25830 + }, + { + "epoch": 1.75533360510939, + "grad_norm": 0.39490893483161926, + "learning_rate": 7.806767223807583e-06, + "loss": 4.7355, + "step": 25835 + }, + { + "epoch": 1.7556733251800516, + "grad_norm": 0.43036362528800964, + "learning_rate": 7.806342573719256e-06, + "loss": 4.5242, + "step": 25840 + }, + { + "epoch": 1.7560130452507134, + "grad_norm": 0.5266620516777039, + "learning_rate": 7.805917923630929e-06, + "loss": 4.7229, + "step": 25845 + }, + { + "epoch": 1.7563527653213753, + "grad_norm": 0.43798425793647766, + "learning_rate": 7.805493273542601e-06, + "loss": 4.7138, + "step": 25850 + }, + { + "epoch": 1.756692485392037, + "grad_norm": 0.4434475302696228, + "learning_rate": 7.805068623454274e-06, + "loss": 4.5004, + "step": 25855 + }, + { + "epoch": 1.7570322054626988, + "grad_norm": 0.5276234149932861, + "learning_rate": 7.804643973365947e-06, + "loss": 4.8529, + "step": 25860 + }, + { + "epoch": 1.7573719255333606, + "grad_norm": 0.44664397835731506, + "learning_rate": 7.80421932327762e-06, + "loss": 4.6526, + "step": 25865 + }, + { + "epoch": 1.7577116456040223, + "grad_norm": 0.31335538625717163, + "learning_rate": 7.803794673189293e-06, + "loss": 4.5525, + "step": 25870 + }, + { + "epoch": 1.758051365674684, + "grad_norm": 0.46913662552833557, + "learning_rate": 7.803370023100965e-06, + "loss": 4.4018, + "step": 25875 + }, + { + "epoch": 1.758391085745346, + "grad_norm": 0.4370150566101074, + "learning_rate": 7.802945373012638e-06, + "loss": 4.4839, + "step": 25880 + }, + { + "epoch": 1.7587308058160076, + "grad_norm": 0.36346372961997986, + "learning_rate": 7.802520722924311e-06, + "loss": 4.3844, + "step": 25885 + }, + { + "epoch": 1.7590705258866692, + "grad_norm": 0.3993002474308014, + "learning_rate": 7.802096072835984e-06, + "loss": 4.6952, + "step": 25890 + }, + { + "epoch": 1.7594102459573313, + "grad_norm": 0.38251182436943054, + "learning_rate": 7.801671422747657e-06, + "loss": 4.6117, + "step": 25895 + }, + { + "epoch": 1.759749966027993, + "grad_norm": 0.4264671802520752, + "learning_rate": 7.80124677265933e-06, + "loss": 4.2092, + "step": 25900 + }, + { + "epoch": 1.7600896860986546, + "grad_norm": 0.46188539266586304, + "learning_rate": 7.800822122571002e-06, + "loss": 4.9288, + "step": 25905 + }, + { + "epoch": 1.7604294061693166, + "grad_norm": 0.3938162624835968, + "learning_rate": 7.800397472482675e-06, + "loss": 4.6597, + "step": 25910 + }, + { + "epoch": 1.7607691262399783, + "grad_norm": 0.33854588866233826, + "learning_rate": 7.799972822394348e-06, + "loss": 4.5057, + "step": 25915 + }, + { + "epoch": 1.76110884631064, + "grad_norm": 0.3731670081615448, + "learning_rate": 7.79954817230602e-06, + "loss": 4.582, + "step": 25920 + }, + { + "epoch": 1.7614485663813018, + "grad_norm": 0.4096626937389374, + "learning_rate": 7.799123522217693e-06, + "loss": 4.7213, + "step": 25925 + }, + { + "epoch": 1.7617882864519636, + "grad_norm": 0.4420050084590912, + "learning_rate": 7.798698872129366e-06, + "loss": 4.8005, + "step": 25930 + }, + { + "epoch": 1.7621280065226252, + "grad_norm": 0.5917261838912964, + "learning_rate": 7.798274222041039e-06, + "loss": 4.8251, + "step": 25935 + }, + { + "epoch": 1.762467726593287, + "grad_norm": 0.42202243208885193, + "learning_rate": 7.797849571952712e-06, + "loss": 4.5835, + "step": 25940 + }, + { + "epoch": 1.762807446663949, + "grad_norm": 0.33939555287361145, + "learning_rate": 7.797424921864385e-06, + "loss": 4.6991, + "step": 25945 + }, + { + "epoch": 1.7631471667346106, + "grad_norm": 0.45292603969573975, + "learning_rate": 7.797000271776057e-06, + "loss": 4.6161, + "step": 25950 + }, + { + "epoch": 1.7634868868052724, + "grad_norm": 0.4496372938156128, + "learning_rate": 7.79657562168773e-06, + "loss": 4.7457, + "step": 25955 + }, + { + "epoch": 1.7638266068759343, + "grad_norm": 0.39729878306388855, + "learning_rate": 7.796150971599403e-06, + "loss": 4.8042, + "step": 25960 + }, + { + "epoch": 1.764166326946596, + "grad_norm": 0.38009801506996155, + "learning_rate": 7.795726321511076e-06, + "loss": 4.5206, + "step": 25965 + }, + { + "epoch": 1.7645060470172578, + "grad_norm": 0.3971935510635376, + "learning_rate": 7.795301671422749e-06, + "loss": 4.9935, + "step": 25970 + }, + { + "epoch": 1.7648457670879196, + "grad_norm": 0.31924203038215637, + "learning_rate": 7.794877021334421e-06, + "loss": 4.7825, + "step": 25975 + }, + { + "epoch": 1.7651854871585813, + "grad_norm": 0.3356826603412628, + "learning_rate": 7.794452371246094e-06, + "loss": 4.5565, + "step": 25980 + }, + { + "epoch": 1.765525207229243, + "grad_norm": 0.44807177782058716, + "learning_rate": 7.794027721157767e-06, + "loss": 4.6224, + "step": 25985 + }, + { + "epoch": 1.765864927299905, + "grad_norm": 0.4123261570930481, + "learning_rate": 7.79360307106944e-06, + "loss": 4.6004, + "step": 25990 + }, + { + "epoch": 1.7662046473705666, + "grad_norm": 0.48820438981056213, + "learning_rate": 7.793178420981113e-06, + "loss": 4.8964, + "step": 25995 + }, + { + "epoch": 1.7665443674412284, + "grad_norm": 0.34750351309776306, + "learning_rate": 7.792753770892785e-06, + "loss": 4.568, + "step": 26000 + }, + { + "epoch": 1.7668840875118903, + "grad_norm": 0.36671143770217896, + "learning_rate": 7.792329120804458e-06, + "loss": 4.4877, + "step": 26005 + }, + { + "epoch": 1.767223807582552, + "grad_norm": 0.45130884647369385, + "learning_rate": 7.791904470716131e-06, + "loss": 4.8655, + "step": 26010 + }, + { + "epoch": 1.7675635276532138, + "grad_norm": 0.40064048767089844, + "learning_rate": 7.791479820627804e-06, + "loss": 4.6196, + "step": 26015 + }, + { + "epoch": 1.7679032477238756, + "grad_norm": 0.5088414549827576, + "learning_rate": 7.791055170539477e-06, + "loss": 4.5399, + "step": 26020 + }, + { + "epoch": 1.7682429677945373, + "grad_norm": 0.4235353171825409, + "learning_rate": 7.79063052045115e-06, + "loss": 4.5825, + "step": 26025 + }, + { + "epoch": 1.7685826878651991, + "grad_norm": 0.3729190528392792, + "learning_rate": 7.790205870362822e-06, + "loss": 4.6826, + "step": 26030 + }, + { + "epoch": 1.768922407935861, + "grad_norm": 0.39163029193878174, + "learning_rate": 7.789781220274495e-06, + "loss": 4.5412, + "step": 26035 + }, + { + "epoch": 1.7692621280065226, + "grad_norm": 0.41246867179870605, + "learning_rate": 7.789356570186168e-06, + "loss": 4.6933, + "step": 26040 + }, + { + "epoch": 1.7696018480771842, + "grad_norm": 0.4055160582065582, + "learning_rate": 7.788931920097839e-06, + "loss": 4.5769, + "step": 26045 + }, + { + "epoch": 1.7699415681478463, + "grad_norm": 0.44822800159454346, + "learning_rate": 7.788507270009513e-06, + "loss": 4.6129, + "step": 26050 + }, + { + "epoch": 1.770281288218508, + "grad_norm": 0.46218082308769226, + "learning_rate": 7.788082619921186e-06, + "loss": 4.443, + "step": 26055 + }, + { + "epoch": 1.7706210082891696, + "grad_norm": 0.509584367275238, + "learning_rate": 7.787657969832857e-06, + "loss": 4.5904, + "step": 26060 + }, + { + "epoch": 1.7709607283598316, + "grad_norm": 0.4524422585964203, + "learning_rate": 7.787233319744532e-06, + "loss": 4.5581, + "step": 26065 + }, + { + "epoch": 1.7713004484304933, + "grad_norm": 0.5595578551292419, + "learning_rate": 7.786808669656205e-06, + "loss": 4.408, + "step": 26070 + }, + { + "epoch": 1.771640168501155, + "grad_norm": 0.4962616562843323, + "learning_rate": 7.786384019567876e-06, + "loss": 4.4013, + "step": 26075 + }, + { + "epoch": 1.771979888571817, + "grad_norm": 0.43957948684692383, + "learning_rate": 7.78595936947955e-06, + "loss": 4.8115, + "step": 26080 + }, + { + "epoch": 1.7723196086424786, + "grad_norm": 0.3713083267211914, + "learning_rate": 7.785534719391223e-06, + "loss": 4.5467, + "step": 26085 + }, + { + "epoch": 1.7726593287131402, + "grad_norm": 0.3463685214519501, + "learning_rate": 7.785110069302894e-06, + "loss": 4.5272, + "step": 26090 + }, + { + "epoch": 1.772999048783802, + "grad_norm": 0.5645448565483093, + "learning_rate": 7.784685419214569e-06, + "loss": 4.5931, + "step": 26095 + }, + { + "epoch": 1.773338768854464, + "grad_norm": 0.3937724232673645, + "learning_rate": 7.784260769126241e-06, + "loss": 4.5849, + "step": 26100 + }, + { + "epoch": 1.7736784889251256, + "grad_norm": 0.3241925835609436, + "learning_rate": 7.783836119037913e-06, + "loss": 4.4783, + "step": 26105 + }, + { + "epoch": 1.7740182089957874, + "grad_norm": 0.5452935099601746, + "learning_rate": 7.783411468949587e-06, + "loss": 4.4666, + "step": 26110 + }, + { + "epoch": 1.7743579290664493, + "grad_norm": 0.4025939106941223, + "learning_rate": 7.782986818861258e-06, + "loss": 4.6593, + "step": 26115 + }, + { + "epoch": 1.774697649137111, + "grad_norm": 0.48830005526542664, + "learning_rate": 7.782562168772931e-06, + "loss": 4.7047, + "step": 26120 + }, + { + "epoch": 1.7750373692077728, + "grad_norm": 0.4210384488105774, + "learning_rate": 7.782137518684605e-06, + "loss": 4.4742, + "step": 26125 + }, + { + "epoch": 1.7753770892784346, + "grad_norm": 0.34620267152786255, + "learning_rate": 7.781712868596277e-06, + "loss": 4.4783, + "step": 26130 + }, + { + "epoch": 1.7757168093490963, + "grad_norm": 0.46200257539749146, + "learning_rate": 7.78128821850795e-06, + "loss": 4.6352, + "step": 26135 + }, + { + "epoch": 1.7760565294197581, + "grad_norm": 0.35934874415397644, + "learning_rate": 7.780863568419624e-06, + "loss": 4.486, + "step": 26140 + }, + { + "epoch": 1.77639624949042, + "grad_norm": 0.4994049370288849, + "learning_rate": 7.780438918331295e-06, + "loss": 4.5889, + "step": 26145 + }, + { + "epoch": 1.7767359695610816, + "grad_norm": 0.47909778356552124, + "learning_rate": 7.780014268242968e-06, + "loss": 4.5561, + "step": 26150 + }, + { + "epoch": 1.7770756896317434, + "grad_norm": 0.48096585273742676, + "learning_rate": 7.779589618154642e-06, + "loss": 4.3824, + "step": 26155 + }, + { + "epoch": 1.7774154097024053, + "grad_norm": 0.39345604181289673, + "learning_rate": 7.779164968066313e-06, + "loss": 4.7056, + "step": 26160 + }, + { + "epoch": 1.777755129773067, + "grad_norm": 0.47845327854156494, + "learning_rate": 7.778740317977986e-06, + "loss": 4.7611, + "step": 26165 + }, + { + "epoch": 1.7780948498437288, + "grad_norm": 0.5452906489372253, + "learning_rate": 7.77831566788966e-06, + "loss": 4.6225, + "step": 26170 + }, + { + "epoch": 1.7784345699143906, + "grad_norm": 0.46958127617836, + "learning_rate": 7.777891017801332e-06, + "loss": 4.4088, + "step": 26175 + }, + { + "epoch": 1.7787742899850523, + "grad_norm": 0.3603004515171051, + "learning_rate": 7.777466367713005e-06, + "loss": 4.8588, + "step": 26180 + }, + { + "epoch": 1.7791140100557141, + "grad_norm": 0.32158800959587097, + "learning_rate": 7.777041717624679e-06, + "loss": 4.7383, + "step": 26185 + }, + { + "epoch": 1.779453730126376, + "grad_norm": 0.5131334066390991, + "learning_rate": 7.77661706753635e-06, + "loss": 4.4935, + "step": 26190 + }, + { + "epoch": 1.7797934501970376, + "grad_norm": 0.4100748598575592, + "learning_rate": 7.776192417448023e-06, + "loss": 4.3484, + "step": 26195 + }, + { + "epoch": 1.7801331702676995, + "grad_norm": 0.5245316028594971, + "learning_rate": 7.775767767359696e-06, + "loss": 4.564, + "step": 26200 + }, + { + "epoch": 1.7804728903383613, + "grad_norm": 0.46110787987709045, + "learning_rate": 7.775343117271369e-06, + "loss": 4.676, + "step": 26205 + }, + { + "epoch": 1.780812610409023, + "grad_norm": 0.5281656980514526, + "learning_rate": 7.774918467183041e-06, + "loss": 4.5703, + "step": 26210 + }, + { + "epoch": 1.7811523304796846, + "grad_norm": 0.5318871140480042, + "learning_rate": 7.774493817094714e-06, + "loss": 4.8844, + "step": 26215 + }, + { + "epoch": 1.7814920505503467, + "grad_norm": 0.4550105929374695, + "learning_rate": 7.774069167006387e-06, + "loss": 4.6837, + "step": 26220 + }, + { + "epoch": 1.7818317706210083, + "grad_norm": 0.3257743716239929, + "learning_rate": 7.77364451691806e-06, + "loss": 4.8449, + "step": 26225 + }, + { + "epoch": 1.78217149069167, + "grad_norm": 0.40638062357902527, + "learning_rate": 7.773219866829733e-06, + "loss": 4.7524, + "step": 26230 + }, + { + "epoch": 1.782511210762332, + "grad_norm": 0.329719215631485, + "learning_rate": 7.772795216741405e-06, + "loss": 4.5426, + "step": 26235 + }, + { + "epoch": 1.7828509308329936, + "grad_norm": 0.5199711322784424, + "learning_rate": 7.772370566653078e-06, + "loss": 4.9346, + "step": 26240 + }, + { + "epoch": 1.7831906509036552, + "grad_norm": 0.3816494643688202, + "learning_rate": 7.771945916564751e-06, + "loss": 4.5851, + "step": 26245 + }, + { + "epoch": 1.7835303709743173, + "grad_norm": 0.45146334171295166, + "learning_rate": 7.771521266476424e-06, + "loss": 4.703, + "step": 26250 + }, + { + "epoch": 1.783870091044979, + "grad_norm": 0.30695149302482605, + "learning_rate": 7.771096616388097e-06, + "loss": 4.2984, + "step": 26255 + }, + { + "epoch": 1.7842098111156406, + "grad_norm": 0.5132983922958374, + "learning_rate": 7.77067196629977e-06, + "loss": 4.5821, + "step": 26260 + }, + { + "epoch": 1.7845495311863024, + "grad_norm": 0.4297429025173187, + "learning_rate": 7.770247316211442e-06, + "loss": 4.5036, + "step": 26265 + }, + { + "epoch": 1.7848892512569643, + "grad_norm": 0.3405957818031311, + "learning_rate": 7.769822666123115e-06, + "loss": 4.4916, + "step": 26270 + }, + { + "epoch": 1.785228971327626, + "grad_norm": 0.5197687149047852, + "learning_rate": 7.769398016034788e-06, + "loss": 4.6975, + "step": 26275 + }, + { + "epoch": 1.7855686913982878, + "grad_norm": 0.3467085659503937, + "learning_rate": 7.76897336594646e-06, + "loss": 4.6252, + "step": 26280 + }, + { + "epoch": 1.7859084114689496, + "grad_norm": 0.42023590207099915, + "learning_rate": 7.768548715858133e-06, + "loss": 4.5605, + "step": 26285 + }, + { + "epoch": 1.7862481315396113, + "grad_norm": 0.3441694378852844, + "learning_rate": 7.768124065769806e-06, + "loss": 4.6337, + "step": 26290 + }, + { + "epoch": 1.7865878516102731, + "grad_norm": 0.417074590921402, + "learning_rate": 7.767699415681479e-06, + "loss": 4.5649, + "step": 26295 + }, + { + "epoch": 1.786927571680935, + "grad_norm": 0.396127849817276, + "learning_rate": 7.767274765593152e-06, + "loss": 4.5205, + "step": 26300 + }, + { + "epoch": 1.7872672917515966, + "grad_norm": 0.36795660853385925, + "learning_rate": 7.766850115504825e-06, + "loss": 4.6914, + "step": 26305 + }, + { + "epoch": 1.7876070118222585, + "grad_norm": 0.4442894160747528, + "learning_rate": 7.766425465416497e-06, + "loss": 4.4237, + "step": 26310 + }, + { + "epoch": 1.7879467318929203, + "grad_norm": 0.4028794765472412, + "learning_rate": 7.76600081532817e-06, + "loss": 4.7529, + "step": 26315 + }, + { + "epoch": 1.788286451963582, + "grad_norm": 0.3065252900123596, + "learning_rate": 7.765576165239843e-06, + "loss": 4.4949, + "step": 26320 + }, + { + "epoch": 1.7886261720342438, + "grad_norm": 0.3902488052845001, + "learning_rate": 7.765151515151516e-06, + "loss": 4.3699, + "step": 26325 + }, + { + "epoch": 1.7889658921049056, + "grad_norm": 0.4976438879966736, + "learning_rate": 7.764726865063189e-06, + "loss": 4.5399, + "step": 26330 + }, + { + "epoch": 1.7893056121755673, + "grad_norm": 0.4447367489337921, + "learning_rate": 7.764302214974861e-06, + "loss": 4.4847, + "step": 26335 + }, + { + "epoch": 1.7896453322462291, + "grad_norm": 0.4394040107727051, + "learning_rate": 7.763877564886534e-06, + "loss": 4.8335, + "step": 26340 + }, + { + "epoch": 1.789985052316891, + "grad_norm": 0.39894986152648926, + "learning_rate": 7.763452914798207e-06, + "loss": 4.5857, + "step": 26345 + }, + { + "epoch": 1.7903247723875526, + "grad_norm": 0.4152306914329529, + "learning_rate": 7.76302826470988e-06, + "loss": 4.996, + "step": 26350 + }, + { + "epoch": 1.7906644924582145, + "grad_norm": 0.4747753441333771, + "learning_rate": 7.762603614621553e-06, + "loss": 4.5879, + "step": 26355 + }, + { + "epoch": 1.7910042125288763, + "grad_norm": 0.3151250183582306, + "learning_rate": 7.762178964533225e-06, + "loss": 4.507, + "step": 26360 + }, + { + "epoch": 1.791343932599538, + "grad_norm": 0.48364517092704773, + "learning_rate": 7.761754314444898e-06, + "loss": 4.5105, + "step": 26365 + }, + { + "epoch": 1.7916836526701998, + "grad_norm": 0.4052703380584717, + "learning_rate": 7.761329664356571e-06, + "loss": 4.8106, + "step": 26370 + }, + { + "epoch": 1.7920233727408617, + "grad_norm": 0.5016494989395142, + "learning_rate": 7.760905014268244e-06, + "loss": 4.5349, + "step": 26375 + }, + { + "epoch": 1.7923630928115233, + "grad_norm": 0.40038570761680603, + "learning_rate": 7.760480364179917e-06, + "loss": 4.743, + "step": 26380 + }, + { + "epoch": 1.792702812882185, + "grad_norm": 0.3378240764141083, + "learning_rate": 7.76005571409159e-06, + "loss": 4.6849, + "step": 26385 + }, + { + "epoch": 1.793042532952847, + "grad_norm": 0.417552649974823, + "learning_rate": 7.759631064003262e-06, + "loss": 4.7785, + "step": 26390 + }, + { + "epoch": 1.7933822530235086, + "grad_norm": 0.3398965299129486, + "learning_rate": 7.759206413914935e-06, + "loss": 4.6016, + "step": 26395 + }, + { + "epoch": 1.7937219730941703, + "grad_norm": 0.4281592071056366, + "learning_rate": 7.758781763826608e-06, + "loss": 4.5431, + "step": 26400 + }, + { + "epoch": 1.7940616931648323, + "grad_norm": 0.3909308910369873, + "learning_rate": 7.75835711373828e-06, + "loss": 4.6836, + "step": 26405 + }, + { + "epoch": 1.794401413235494, + "grad_norm": 0.40702199935913086, + "learning_rate": 7.757932463649953e-06, + "loss": 4.6057, + "step": 26410 + }, + { + "epoch": 1.7947411333061556, + "grad_norm": 0.30171847343444824, + "learning_rate": 7.757507813561626e-06, + "loss": 4.5013, + "step": 26415 + }, + { + "epoch": 1.7950808533768177, + "grad_norm": 0.4116762578487396, + "learning_rate": 7.757083163473299e-06, + "loss": 4.3655, + "step": 26420 + }, + { + "epoch": 1.7954205734474793, + "grad_norm": 0.42408594489097595, + "learning_rate": 7.756658513384972e-06, + "loss": 4.6583, + "step": 26425 + }, + { + "epoch": 1.795760293518141, + "grad_norm": 0.4232158660888672, + "learning_rate": 7.756233863296645e-06, + "loss": 4.5767, + "step": 26430 + }, + { + "epoch": 1.7961000135888028, + "grad_norm": 0.4551107585430145, + "learning_rate": 7.755809213208317e-06, + "loss": 4.3498, + "step": 26435 + }, + { + "epoch": 1.7964397336594646, + "grad_norm": 0.4703330397605896, + "learning_rate": 7.75538456311999e-06, + "loss": 4.7741, + "step": 26440 + }, + { + "epoch": 1.7967794537301263, + "grad_norm": 0.31068477034568787, + "learning_rate": 7.754959913031663e-06, + "loss": 4.4196, + "step": 26445 + }, + { + "epoch": 1.7971191738007881, + "grad_norm": 0.4244861602783203, + "learning_rate": 7.754535262943336e-06, + "loss": 4.4188, + "step": 26450 + }, + { + "epoch": 1.79745889387145, + "grad_norm": 0.4917745292186737, + "learning_rate": 7.754110612855009e-06, + "loss": 4.5655, + "step": 26455 + }, + { + "epoch": 1.7977986139421116, + "grad_norm": 0.4893403947353363, + "learning_rate": 7.75368596276668e-06, + "loss": 4.62, + "step": 26460 + }, + { + "epoch": 1.7981383340127735, + "grad_norm": 0.3807523846626282, + "learning_rate": 7.753261312678354e-06, + "loss": 4.6187, + "step": 26465 + }, + { + "epoch": 1.7984780540834353, + "grad_norm": 0.4563961625099182, + "learning_rate": 7.752836662590027e-06, + "loss": 4.7332, + "step": 26470 + }, + { + "epoch": 1.798817774154097, + "grad_norm": 0.43891218304634094, + "learning_rate": 7.752412012501698e-06, + "loss": 4.5919, + "step": 26475 + }, + { + "epoch": 1.7991574942247588, + "grad_norm": 0.3835899531841278, + "learning_rate": 7.751987362413373e-06, + "loss": 4.6524, + "step": 26480 + }, + { + "epoch": 1.7994972142954206, + "grad_norm": 0.4715486764907837, + "learning_rate": 7.751562712325045e-06, + "loss": 4.5341, + "step": 26485 + }, + { + "epoch": 1.7998369343660823, + "grad_norm": 0.4683159589767456, + "learning_rate": 7.751138062236716e-06, + "loss": 4.6199, + "step": 26490 + }, + { + "epoch": 1.8001766544367441, + "grad_norm": 0.3405665159225464, + "learning_rate": 7.750713412148391e-06, + "loss": 4.631, + "step": 26495 + }, + { + "epoch": 1.800516374507406, + "grad_norm": 0.3588625192642212, + "learning_rate": 7.750288762060064e-06, + "loss": 4.6108, + "step": 26500 + }, + { + "epoch": 1.8008560945780676, + "grad_norm": 0.573013961315155, + "learning_rate": 7.749864111971735e-06, + "loss": 4.5962, + "step": 26505 + }, + { + "epoch": 1.8011958146487295, + "grad_norm": 0.3455224931240082, + "learning_rate": 7.74943946188341e-06, + "loss": 4.7552, + "step": 26510 + }, + { + "epoch": 1.8015355347193913, + "grad_norm": 0.41996335983276367, + "learning_rate": 7.749014811795082e-06, + "loss": 4.5718, + "step": 26515 + }, + { + "epoch": 1.801875254790053, + "grad_norm": 0.39521217346191406, + "learning_rate": 7.748590161706753e-06, + "loss": 4.4263, + "step": 26520 + }, + { + "epoch": 1.8022149748607148, + "grad_norm": 0.37534865736961365, + "learning_rate": 7.748165511618428e-06, + "loss": 4.796, + "step": 26525 + }, + { + "epoch": 1.8025546949313767, + "grad_norm": 0.4522918462753296, + "learning_rate": 7.7477408615301e-06, + "loss": 4.5626, + "step": 26530 + }, + { + "epoch": 1.8028944150020383, + "grad_norm": 0.4920691251754761, + "learning_rate": 7.747316211441772e-06, + "loss": 4.8588, + "step": 26535 + }, + { + "epoch": 1.8032341350727001, + "grad_norm": 0.4127686023712158, + "learning_rate": 7.746891561353446e-06, + "loss": 4.6062, + "step": 26540 + }, + { + "epoch": 1.803573855143362, + "grad_norm": 0.353045791387558, + "learning_rate": 7.746466911265117e-06, + "loss": 4.5755, + "step": 26545 + }, + { + "epoch": 1.8039135752140236, + "grad_norm": 0.46614065766334534, + "learning_rate": 7.74604226117679e-06, + "loss": 4.6265, + "step": 26550 + }, + { + "epoch": 1.8042532952846853, + "grad_norm": 0.38480833172798157, + "learning_rate": 7.745617611088465e-06, + "loss": 4.8149, + "step": 26555 + }, + { + "epoch": 1.8045930153553473, + "grad_norm": 0.43401920795440674, + "learning_rate": 7.745192961000136e-06, + "loss": 4.8583, + "step": 26560 + }, + { + "epoch": 1.804932735426009, + "grad_norm": 0.359157919883728, + "learning_rate": 7.744768310911808e-06, + "loss": 4.6921, + "step": 26565 + }, + { + "epoch": 1.8052724554966706, + "grad_norm": 0.3805539906024933, + "learning_rate": 7.744343660823483e-06, + "loss": 4.6662, + "step": 26570 + }, + { + "epoch": 1.8056121755673327, + "grad_norm": 0.502591609954834, + "learning_rate": 7.743919010735154e-06, + "loss": 4.4874, + "step": 26575 + }, + { + "epoch": 1.8059518956379943, + "grad_norm": 0.40016454458236694, + "learning_rate": 7.743494360646827e-06, + "loss": 4.5645, + "step": 26580 + }, + { + "epoch": 1.806291615708656, + "grad_norm": 0.3537563681602478, + "learning_rate": 7.743069710558501e-06, + "loss": 4.4178, + "step": 26585 + }, + { + "epoch": 1.806631335779318, + "grad_norm": 0.4771043062210083, + "learning_rate": 7.742645060470173e-06, + "loss": 4.4709, + "step": 26590 + }, + { + "epoch": 1.8069710558499796, + "grad_norm": 0.37508708238601685, + "learning_rate": 7.742220410381845e-06, + "loss": 4.7958, + "step": 26595 + }, + { + "epoch": 1.8073107759206413, + "grad_norm": 0.5083226561546326, + "learning_rate": 7.74179576029352e-06, + "loss": 4.721, + "step": 26600 + }, + { + "epoch": 1.8076504959913031, + "grad_norm": 0.4108913838863373, + "learning_rate": 7.741371110205191e-06, + "loss": 4.5133, + "step": 26605 + }, + { + "epoch": 1.807990216061965, + "grad_norm": 0.3726930022239685, + "learning_rate": 7.740946460116864e-06, + "loss": 4.6465, + "step": 26610 + }, + { + "epoch": 1.8083299361326266, + "grad_norm": 0.3678167164325714, + "learning_rate": 7.740521810028537e-06, + "loss": 4.6499, + "step": 26615 + }, + { + "epoch": 1.8086696562032885, + "grad_norm": 0.49961450695991516, + "learning_rate": 7.74009715994021e-06, + "loss": 4.6013, + "step": 26620 + }, + { + "epoch": 1.8090093762739503, + "grad_norm": 0.3485662341117859, + "learning_rate": 7.739672509851884e-06, + "loss": 4.5004, + "step": 26625 + }, + { + "epoch": 1.809349096344612, + "grad_norm": 0.3370159864425659, + "learning_rate": 7.739247859763555e-06, + "loss": 4.6052, + "step": 26630 + }, + { + "epoch": 1.8096888164152738, + "grad_norm": 0.37544959783554077, + "learning_rate": 7.738823209675228e-06, + "loss": 4.6862, + "step": 26635 + }, + { + "epoch": 1.8100285364859356, + "grad_norm": 0.43412837386131287, + "learning_rate": 7.738398559586902e-06, + "loss": 4.5969, + "step": 26640 + }, + { + "epoch": 1.8103682565565973, + "grad_norm": 0.3320101201534271, + "learning_rate": 7.737973909498573e-06, + "loss": 4.5132, + "step": 26645 + }, + { + "epoch": 1.8107079766272591, + "grad_norm": 0.3431031405925751, + "learning_rate": 7.737549259410246e-06, + "loss": 4.544, + "step": 26650 + }, + { + "epoch": 1.811047696697921, + "grad_norm": 0.39039871096611023, + "learning_rate": 7.73712460932192e-06, + "loss": 4.6189, + "step": 26655 + }, + { + "epoch": 1.8113874167685826, + "grad_norm": 0.3355623185634613, + "learning_rate": 7.736699959233592e-06, + "loss": 4.5065, + "step": 26660 + }, + { + "epoch": 1.8117271368392445, + "grad_norm": 0.3745642900466919, + "learning_rate": 7.736275309145265e-06, + "loss": 4.395, + "step": 26665 + }, + { + "epoch": 1.8120668569099063, + "grad_norm": 0.4175267517566681, + "learning_rate": 7.735850659056939e-06, + "loss": 4.346, + "step": 26670 + }, + { + "epoch": 1.812406576980568, + "grad_norm": 0.4682762622833252, + "learning_rate": 7.73542600896861e-06, + "loss": 4.4442, + "step": 26675 + }, + { + "epoch": 1.8127462970512298, + "grad_norm": 0.5011439919471741, + "learning_rate": 7.735001358880283e-06, + "loss": 4.3929, + "step": 26680 + }, + { + "epoch": 1.8130860171218917, + "grad_norm": 0.42422670125961304, + "learning_rate": 7.734576708791956e-06, + "loss": 4.6042, + "step": 26685 + }, + { + "epoch": 1.8134257371925533, + "grad_norm": 0.474730908870697, + "learning_rate": 7.734152058703629e-06, + "loss": 4.5435, + "step": 26690 + }, + { + "epoch": 1.8137654572632151, + "grad_norm": 0.4368428587913513, + "learning_rate": 7.733727408615301e-06, + "loss": 4.87, + "step": 26695 + }, + { + "epoch": 1.814105177333877, + "grad_norm": 0.3815293610095978, + "learning_rate": 7.733302758526974e-06, + "loss": 4.7124, + "step": 26700 + }, + { + "epoch": 1.8144448974045386, + "grad_norm": 0.35608479380607605, + "learning_rate": 7.732878108438647e-06, + "loss": 4.3783, + "step": 26705 + }, + { + "epoch": 1.8147846174752005, + "grad_norm": 0.3242974877357483, + "learning_rate": 7.73245345835032e-06, + "loss": 4.5529, + "step": 26710 + }, + { + "epoch": 1.8151243375458623, + "grad_norm": 0.4954844117164612, + "learning_rate": 7.732028808261993e-06, + "loss": 4.5753, + "step": 26715 + }, + { + "epoch": 1.815464057616524, + "grad_norm": 0.49143195152282715, + "learning_rate": 7.731604158173665e-06, + "loss": 4.6724, + "step": 26720 + }, + { + "epoch": 1.8158037776871856, + "grad_norm": 0.365060418844223, + "learning_rate": 7.731179508085338e-06, + "loss": 4.5625, + "step": 26725 + }, + { + "epoch": 1.8161434977578477, + "grad_norm": 0.41176068782806396, + "learning_rate": 7.730754857997011e-06, + "loss": 4.8598, + "step": 26730 + }, + { + "epoch": 1.8164832178285093, + "grad_norm": 0.3977012038230896, + "learning_rate": 7.730330207908684e-06, + "loss": 4.5694, + "step": 26735 + }, + { + "epoch": 1.816822937899171, + "grad_norm": 0.4139137864112854, + "learning_rate": 7.729905557820357e-06, + "loss": 4.5924, + "step": 26740 + }, + { + "epoch": 1.817162657969833, + "grad_norm": 0.37340492010116577, + "learning_rate": 7.72948090773203e-06, + "loss": 4.794, + "step": 26745 + }, + { + "epoch": 1.8175023780404946, + "grad_norm": 0.4490406811237335, + "learning_rate": 7.729056257643702e-06, + "loss": 4.7188, + "step": 26750 + }, + { + "epoch": 1.8178420981111563, + "grad_norm": 0.49188828468322754, + "learning_rate": 7.728631607555375e-06, + "loss": 4.5029, + "step": 26755 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.5720055103302002, + "learning_rate": 7.728206957467048e-06, + "loss": 4.5732, + "step": 26760 + }, + { + "epoch": 1.81852153825248, + "grad_norm": 0.38279736042022705, + "learning_rate": 7.72778230737872e-06, + "loss": 4.5476, + "step": 26765 + }, + { + "epoch": 1.8188612583231416, + "grad_norm": 0.40208202600479126, + "learning_rate": 7.727357657290393e-06, + "loss": 4.5124, + "step": 26770 + }, + { + "epoch": 1.8192009783938035, + "grad_norm": 0.4428703784942627, + "learning_rate": 7.726933007202066e-06, + "loss": 4.6776, + "step": 26775 + }, + { + "epoch": 1.8195406984644653, + "grad_norm": 0.5372276306152344, + "learning_rate": 7.726508357113739e-06, + "loss": 4.6279, + "step": 26780 + }, + { + "epoch": 1.819880418535127, + "grad_norm": 0.34158942103385925, + "learning_rate": 7.726083707025412e-06, + "loss": 4.6662, + "step": 26785 + }, + { + "epoch": 1.8202201386057888, + "grad_norm": 0.5261467099189758, + "learning_rate": 7.725659056937085e-06, + "loss": 4.7189, + "step": 26790 + }, + { + "epoch": 1.8205598586764506, + "grad_norm": 0.3968527615070343, + "learning_rate": 7.725234406848757e-06, + "loss": 4.5651, + "step": 26795 + }, + { + "epoch": 1.8208995787471123, + "grad_norm": 0.4925079345703125, + "learning_rate": 7.72480975676043e-06, + "loss": 4.7196, + "step": 26800 + }, + { + "epoch": 1.8212392988177741, + "grad_norm": 0.4550491273403168, + "learning_rate": 7.724385106672103e-06, + "loss": 4.6382, + "step": 26805 + }, + { + "epoch": 1.821579018888436, + "grad_norm": 0.6193518042564392, + "learning_rate": 7.723960456583776e-06, + "loss": 4.7667, + "step": 26810 + }, + { + "epoch": 1.8219187389590976, + "grad_norm": 0.4191024899482727, + "learning_rate": 7.723535806495449e-06, + "loss": 4.4567, + "step": 26815 + }, + { + "epoch": 1.8222584590297595, + "grad_norm": 0.4201189875602722, + "learning_rate": 7.723111156407121e-06, + "loss": 4.4668, + "step": 26820 + }, + { + "epoch": 1.8225981791004213, + "grad_norm": 0.4503380060195923, + "learning_rate": 7.722686506318794e-06, + "loss": 4.5984, + "step": 26825 + }, + { + "epoch": 1.822937899171083, + "grad_norm": 0.3885537087917328, + "learning_rate": 7.722261856230467e-06, + "loss": 4.5739, + "step": 26830 + }, + { + "epoch": 1.8232776192417448, + "grad_norm": 0.489587664604187, + "learning_rate": 7.72183720614214e-06, + "loss": 4.5214, + "step": 26835 + }, + { + "epoch": 1.8236173393124067, + "grad_norm": 0.4080455005168915, + "learning_rate": 7.721412556053813e-06, + "loss": 4.5503, + "step": 26840 + }, + { + "epoch": 1.8239570593830683, + "grad_norm": 0.3778328001499176, + "learning_rate": 7.720987905965485e-06, + "loss": 4.6132, + "step": 26845 + }, + { + "epoch": 1.8242967794537301, + "grad_norm": 0.38161495327949524, + "learning_rate": 7.720563255877158e-06, + "loss": 4.7415, + "step": 26850 + }, + { + "epoch": 1.824636499524392, + "grad_norm": 0.40990254282951355, + "learning_rate": 7.720138605788831e-06, + "loss": 4.5584, + "step": 26855 + }, + { + "epoch": 1.8249762195950536, + "grad_norm": 0.4836742579936981, + "learning_rate": 7.719713955700504e-06, + "loss": 4.4397, + "step": 26860 + }, + { + "epoch": 1.8253159396657155, + "grad_norm": 0.3901146352291107, + "learning_rate": 7.719289305612177e-06, + "loss": 4.3753, + "step": 26865 + }, + { + "epoch": 1.8256556597363773, + "grad_norm": 0.49197766184806824, + "learning_rate": 7.71886465552385e-06, + "loss": 4.7393, + "step": 26870 + }, + { + "epoch": 1.825995379807039, + "grad_norm": 0.3807205259799957, + "learning_rate": 7.718440005435522e-06, + "loss": 4.8688, + "step": 26875 + }, + { + "epoch": 1.8263350998777008, + "grad_norm": 0.3897400200366974, + "learning_rate": 7.718015355347195e-06, + "loss": 4.513, + "step": 26880 + }, + { + "epoch": 1.8266748199483627, + "grad_norm": 0.3974590599536896, + "learning_rate": 7.717590705258868e-06, + "loss": 4.5439, + "step": 26885 + }, + { + "epoch": 1.8270145400190243, + "grad_norm": 0.5784851312637329, + "learning_rate": 7.717166055170539e-06, + "loss": 4.4941, + "step": 26890 + }, + { + "epoch": 1.827354260089686, + "grad_norm": 0.3760233521461487, + "learning_rate": 7.716741405082213e-06, + "loss": 4.4225, + "step": 26895 + }, + { + "epoch": 1.827693980160348, + "grad_norm": 0.47007882595062256, + "learning_rate": 7.716316754993886e-06, + "loss": 4.5291, + "step": 26900 + }, + { + "epoch": 1.8280337002310096, + "grad_norm": 0.7044577598571777, + "learning_rate": 7.715892104905557e-06, + "loss": 4.7347, + "step": 26905 + }, + { + "epoch": 1.8283734203016713, + "grad_norm": 0.330392986536026, + "learning_rate": 7.715467454817232e-06, + "loss": 4.6806, + "step": 26910 + }, + { + "epoch": 1.8287131403723333, + "grad_norm": 0.40689727663993835, + "learning_rate": 7.715042804728905e-06, + "loss": 4.639, + "step": 26915 + }, + { + "epoch": 1.829052860442995, + "grad_norm": 0.5064714550971985, + "learning_rate": 7.714618154640576e-06, + "loss": 4.5488, + "step": 26920 + }, + { + "epoch": 1.8293925805136566, + "grad_norm": 0.6618513464927673, + "learning_rate": 7.71419350455225e-06, + "loss": 4.611, + "step": 26925 + }, + { + "epoch": 1.8297323005843187, + "grad_norm": 0.352780818939209, + "learning_rate": 7.713768854463923e-06, + "loss": 4.688, + "step": 26930 + }, + { + "epoch": 1.8300720206549803, + "grad_norm": 0.32621845602989197, + "learning_rate": 7.713344204375594e-06, + "loss": 4.4813, + "step": 26935 + }, + { + "epoch": 1.830411740725642, + "grad_norm": 0.4221431314945221, + "learning_rate": 7.712919554287269e-06, + "loss": 4.7185, + "step": 26940 + }, + { + "epoch": 1.8307514607963038, + "grad_norm": 0.4299621284008026, + "learning_rate": 7.712494904198941e-06, + "loss": 4.7433, + "step": 26945 + }, + { + "epoch": 1.8310911808669657, + "grad_norm": 0.39301058650016785, + "learning_rate": 7.712070254110612e-06, + "loss": 4.6485, + "step": 26950 + }, + { + "epoch": 1.8314309009376273, + "grad_norm": 0.565019965171814, + "learning_rate": 7.711645604022287e-06, + "loss": 4.7918, + "step": 26955 + }, + { + "epoch": 1.8317706210082891, + "grad_norm": 0.34387677907943726, + "learning_rate": 7.711220953933958e-06, + "loss": 4.4952, + "step": 26960 + }, + { + "epoch": 1.832110341078951, + "grad_norm": 0.37570920586586, + "learning_rate": 7.710796303845633e-06, + "loss": 4.7773, + "step": 26965 + }, + { + "epoch": 1.8324500611496126, + "grad_norm": 0.4504804015159607, + "learning_rate": 7.710371653757305e-06, + "loss": 4.7023, + "step": 26970 + }, + { + "epoch": 1.8327897812202745, + "grad_norm": 0.430777370929718, + "learning_rate": 7.709947003668976e-06, + "loss": 4.7746, + "step": 26975 + }, + { + "epoch": 1.8331295012909363, + "grad_norm": 0.32162973284721375, + "learning_rate": 7.709522353580651e-06, + "loss": 4.6211, + "step": 26980 + }, + { + "epoch": 1.833469221361598, + "grad_norm": 0.3928171694278717, + "learning_rate": 7.709097703492324e-06, + "loss": 4.5983, + "step": 26985 + }, + { + "epoch": 1.8338089414322598, + "grad_norm": 0.46696022152900696, + "learning_rate": 7.708673053403995e-06, + "loss": 4.565, + "step": 26990 + }, + { + "epoch": 1.8341486615029217, + "grad_norm": 0.42673632502555847, + "learning_rate": 7.70824840331567e-06, + "loss": 4.3203, + "step": 26995 + }, + { + "epoch": 1.8344883815735833, + "grad_norm": 0.40173056721687317, + "learning_rate": 7.707823753227342e-06, + "loss": 4.7424, + "step": 27000 + }, + { + "epoch": 1.8348281016442451, + "grad_norm": 0.5123261213302612, + "learning_rate": 7.707399103139013e-06, + "loss": 4.3248, + "step": 27005 + }, + { + "epoch": 1.835167821714907, + "grad_norm": 0.4597005844116211, + "learning_rate": 7.706974453050688e-06, + "loss": 4.5102, + "step": 27010 + }, + { + "epoch": 1.8355075417855686, + "grad_norm": 0.3936178684234619, + "learning_rate": 7.70654980296236e-06, + "loss": 4.7684, + "step": 27015 + }, + { + "epoch": 1.8358472618562305, + "grad_norm": 0.39935967326164246, + "learning_rate": 7.706125152874032e-06, + "loss": 4.736, + "step": 27020 + }, + { + "epoch": 1.8361869819268923, + "grad_norm": 0.4429110586643219, + "learning_rate": 7.705700502785706e-06, + "loss": 4.8574, + "step": 27025 + }, + { + "epoch": 1.836526701997554, + "grad_norm": 0.5329156517982483, + "learning_rate": 7.705275852697377e-06, + "loss": 4.4363, + "step": 27030 + }, + { + "epoch": 1.8368664220682158, + "grad_norm": 0.28491535782814026, + "learning_rate": 7.70485120260905e-06, + "loss": 4.5915, + "step": 27035 + }, + { + "epoch": 1.8372061421388777, + "grad_norm": 0.3848627507686615, + "learning_rate": 7.704426552520725e-06, + "loss": 4.3582, + "step": 27040 + }, + { + "epoch": 1.8375458622095393, + "grad_norm": 0.5380426645278931, + "learning_rate": 7.704001902432396e-06, + "loss": 4.4422, + "step": 27045 + }, + { + "epoch": 1.8378855822802012, + "grad_norm": 0.37453946471214294, + "learning_rate": 7.703577252344068e-06, + "loss": 4.6123, + "step": 27050 + }, + { + "epoch": 1.838225302350863, + "grad_norm": 0.40897271037101746, + "learning_rate": 7.703152602255743e-06, + "loss": 4.6927, + "step": 27055 + }, + { + "epoch": 1.8385650224215246, + "grad_norm": 0.38481810688972473, + "learning_rate": 7.702727952167414e-06, + "loss": 4.7297, + "step": 27060 + }, + { + "epoch": 1.8389047424921863, + "grad_norm": 0.45337605476379395, + "learning_rate": 7.702303302079087e-06, + "loss": 4.6681, + "step": 27065 + }, + { + "epoch": 1.8392444625628483, + "grad_norm": 0.3536357283592224, + "learning_rate": 7.701878651990761e-06, + "loss": 4.516, + "step": 27070 + }, + { + "epoch": 1.83958418263351, + "grad_norm": 0.4087814390659332, + "learning_rate": 7.701454001902432e-06, + "loss": 4.6473, + "step": 27075 + }, + { + "epoch": 1.8399239027041716, + "grad_norm": 0.4269687831401825, + "learning_rate": 7.701029351814105e-06, + "loss": 4.5628, + "step": 27080 + }, + { + "epoch": 1.8402636227748337, + "grad_norm": 0.5669506192207336, + "learning_rate": 7.70060470172578e-06, + "loss": 4.6474, + "step": 27085 + }, + { + "epoch": 1.8406033428454953, + "grad_norm": 0.4226812720298767, + "learning_rate": 7.700180051637451e-06, + "loss": 4.5272, + "step": 27090 + }, + { + "epoch": 1.840943062916157, + "grad_norm": 0.39149340987205505, + "learning_rate": 7.699755401549124e-06, + "loss": 4.3085, + "step": 27095 + }, + { + "epoch": 1.841282782986819, + "grad_norm": 0.5084627866744995, + "learning_rate": 7.699330751460798e-06, + "loss": 4.5573, + "step": 27100 + }, + { + "epoch": 1.8416225030574807, + "grad_norm": 0.3826448321342468, + "learning_rate": 7.69890610137247e-06, + "loss": 4.5777, + "step": 27105 + }, + { + "epoch": 1.8419622231281423, + "grad_norm": 0.31962522864341736, + "learning_rate": 7.698481451284142e-06, + "loss": 4.5185, + "step": 27110 + }, + { + "epoch": 1.8423019431988041, + "grad_norm": 0.4615759551525116, + "learning_rate": 7.698056801195815e-06, + "loss": 4.5125, + "step": 27115 + }, + { + "epoch": 1.842641663269466, + "grad_norm": 0.41246646642684937, + "learning_rate": 7.697632151107488e-06, + "loss": 4.582, + "step": 27120 + }, + { + "epoch": 1.8429813833401276, + "grad_norm": 0.392976313829422, + "learning_rate": 7.69720750101916e-06, + "loss": 4.6912, + "step": 27125 + }, + { + "epoch": 1.8433211034107895, + "grad_norm": 0.40203550457954407, + "learning_rate": 7.696782850930833e-06, + "loss": 4.524, + "step": 27130 + }, + { + "epoch": 1.8436608234814513, + "grad_norm": 0.4453551471233368, + "learning_rate": 7.696358200842506e-06, + "loss": 4.5213, + "step": 27135 + }, + { + "epoch": 1.844000543552113, + "grad_norm": 0.34362995624542236, + "learning_rate": 7.695933550754179e-06, + "loss": 4.6575, + "step": 27140 + }, + { + "epoch": 1.8443402636227748, + "grad_norm": 0.3483355641365051, + "learning_rate": 7.695508900665852e-06, + "loss": 4.3908, + "step": 27145 + }, + { + "epoch": 1.8446799836934367, + "grad_norm": 0.5931193232536316, + "learning_rate": 7.695084250577524e-06, + "loss": 4.5731, + "step": 27150 + }, + { + "epoch": 1.8450197037640983, + "grad_norm": 0.39316144585609436, + "learning_rate": 7.694659600489197e-06, + "loss": 4.5899, + "step": 27155 + }, + { + "epoch": 1.8453594238347601, + "grad_norm": 0.39347201585769653, + "learning_rate": 7.69423495040087e-06, + "loss": 4.5019, + "step": 27160 + }, + { + "epoch": 1.845699143905422, + "grad_norm": 0.5826134085655212, + "learning_rate": 7.693810300312543e-06, + "loss": 4.5657, + "step": 27165 + }, + { + "epoch": 1.8460388639760836, + "grad_norm": 0.44311344623565674, + "learning_rate": 7.693385650224216e-06, + "loss": 4.4159, + "step": 27170 + }, + { + "epoch": 1.8463785840467455, + "grad_norm": 0.40609148144721985, + "learning_rate": 7.692961000135888e-06, + "loss": 4.6344, + "step": 27175 + }, + { + "epoch": 1.8467183041174073, + "grad_norm": 0.5515376329421997, + "learning_rate": 7.692536350047561e-06, + "loss": 4.946, + "step": 27180 + }, + { + "epoch": 1.847058024188069, + "grad_norm": 0.4833298921585083, + "learning_rate": 7.692111699959234e-06, + "loss": 4.6839, + "step": 27185 + }, + { + "epoch": 1.8473977442587308, + "grad_norm": 0.4718901515007019, + "learning_rate": 7.691687049870907e-06, + "loss": 4.6891, + "step": 27190 + }, + { + "epoch": 1.8477374643293927, + "grad_norm": 0.2832084000110626, + "learning_rate": 7.69126239978258e-06, + "loss": 4.7948, + "step": 27195 + }, + { + "epoch": 1.8480771844000543, + "grad_norm": 0.4011099636554718, + "learning_rate": 7.690837749694253e-06, + "loss": 4.6626, + "step": 27200 + }, + { + "epoch": 1.8484169044707162, + "grad_norm": 0.36022356152534485, + "learning_rate": 7.690413099605925e-06, + "loss": 4.7382, + "step": 27205 + }, + { + "epoch": 1.848756624541378, + "grad_norm": 0.4190797805786133, + "learning_rate": 7.689988449517598e-06, + "loss": 4.7047, + "step": 27210 + }, + { + "epoch": 1.8490963446120396, + "grad_norm": 0.3276585340499878, + "learning_rate": 7.689563799429271e-06, + "loss": 4.4634, + "step": 27215 + }, + { + "epoch": 1.8494360646827015, + "grad_norm": 0.3730742335319519, + "learning_rate": 7.689139149340944e-06, + "loss": 4.7769, + "step": 27220 + }, + { + "epoch": 1.8497757847533634, + "grad_norm": 0.4808198809623718, + "learning_rate": 7.688714499252617e-06, + "loss": 4.7558, + "step": 27225 + }, + { + "epoch": 1.850115504824025, + "grad_norm": 0.5039113163948059, + "learning_rate": 7.688374779181955e-06, + "loss": 4.561, + "step": 27230 + }, + { + "epoch": 1.8504552248946866, + "grad_norm": 0.35408541560173035, + "learning_rate": 7.687950129093628e-06, + "loss": 4.7419, + "step": 27235 + }, + { + "epoch": 1.8507949449653487, + "grad_norm": 0.4516547918319702, + "learning_rate": 7.6875254790053e-06, + "loss": 4.7404, + "step": 27240 + }, + { + "epoch": 1.8511346650360103, + "grad_norm": 0.44947579503059387, + "learning_rate": 7.687100828916974e-06, + "loss": 4.7595, + "step": 27245 + }, + { + "epoch": 1.851474385106672, + "grad_norm": 0.5665019154548645, + "learning_rate": 7.686676178828647e-06, + "loss": 4.2901, + "step": 27250 + }, + { + "epoch": 1.851814105177334, + "grad_norm": 0.4202868938446045, + "learning_rate": 7.686251528740318e-06, + "loss": 4.5561, + "step": 27255 + }, + { + "epoch": 1.8521538252479957, + "grad_norm": 0.5314123630523682, + "learning_rate": 7.685826878651992e-06, + "loss": 4.7014, + "step": 27260 + }, + { + "epoch": 1.8524935453186573, + "grad_norm": 0.4724714159965515, + "learning_rate": 7.685402228563665e-06, + "loss": 4.8087, + "step": 27265 + }, + { + "epoch": 1.8528332653893194, + "grad_norm": 0.48066550493240356, + "learning_rate": 7.684977578475336e-06, + "loss": 4.7527, + "step": 27270 + }, + { + "epoch": 1.853172985459981, + "grad_norm": 0.4019571542739868, + "learning_rate": 7.68455292838701e-06, + "loss": 4.9686, + "step": 27275 + }, + { + "epoch": 1.8535127055306426, + "grad_norm": 0.5163405537605286, + "learning_rate": 7.684128278298682e-06, + "loss": 4.7183, + "step": 27280 + }, + { + "epoch": 1.8538524256013045, + "grad_norm": 0.4549175202846527, + "learning_rate": 7.683703628210355e-06, + "loss": 4.4963, + "step": 27285 + }, + { + "epoch": 1.8541921456719663, + "grad_norm": 0.2846766412258148, + "learning_rate": 7.683278978122029e-06, + "loss": 4.5613, + "step": 27290 + }, + { + "epoch": 1.854531865742628, + "grad_norm": 0.45305830240249634, + "learning_rate": 7.6828543280337e-06, + "loss": 4.5817, + "step": 27295 + }, + { + "epoch": 1.8548715858132898, + "grad_norm": 0.46711376309394836, + "learning_rate": 7.682429677945373e-06, + "loss": 4.7028, + "step": 27300 + }, + { + "epoch": 1.8552113058839517, + "grad_norm": 0.3773200213909149, + "learning_rate": 7.682005027857047e-06, + "loss": 4.561, + "step": 27305 + }, + { + "epoch": 1.8555510259546133, + "grad_norm": 0.6908542513847351, + "learning_rate": 7.681580377768719e-06, + "loss": 4.6917, + "step": 27310 + }, + { + "epoch": 1.8558907460252752, + "grad_norm": 0.4640747010707855, + "learning_rate": 7.681155727680391e-06, + "loss": 4.7392, + "step": 27315 + }, + { + "epoch": 1.856230466095937, + "grad_norm": 0.3796621561050415, + "learning_rate": 7.680731077592066e-06, + "loss": 4.4493, + "step": 27320 + }, + { + "epoch": 1.8565701861665986, + "grad_norm": 0.3847677409648895, + "learning_rate": 7.680306427503737e-06, + "loss": 4.5005, + "step": 27325 + }, + { + "epoch": 1.8569099062372605, + "grad_norm": 0.4944515824317932, + "learning_rate": 7.67988177741541e-06, + "loss": 4.5402, + "step": 27330 + }, + { + "epoch": 1.8572496263079223, + "grad_norm": 0.41553470492362976, + "learning_rate": 7.679457127327084e-06, + "loss": 4.3609, + "step": 27335 + }, + { + "epoch": 1.857589346378584, + "grad_norm": 0.37691864371299744, + "learning_rate": 7.679032477238755e-06, + "loss": 4.4352, + "step": 27340 + }, + { + "epoch": 1.8579290664492458, + "grad_norm": 0.42087888717651367, + "learning_rate": 7.678607827150428e-06, + "loss": 4.4675, + "step": 27345 + }, + { + "epoch": 1.8582687865199077, + "grad_norm": 0.40653327107429504, + "learning_rate": 7.678183177062103e-06, + "loss": 4.4683, + "step": 27350 + }, + { + "epoch": 1.8586085065905693, + "grad_norm": 0.40871867537498474, + "learning_rate": 7.677758526973774e-06, + "loss": 4.5451, + "step": 27355 + }, + { + "epoch": 1.8589482266612312, + "grad_norm": 0.29750803112983704, + "learning_rate": 7.677333876885447e-06, + "loss": 4.6416, + "step": 27360 + }, + { + "epoch": 1.859287946731893, + "grad_norm": 0.4528070092201233, + "learning_rate": 7.67690922679712e-06, + "loss": 4.4996, + "step": 27365 + }, + { + "epoch": 1.8596276668025546, + "grad_norm": 0.3410159647464752, + "learning_rate": 7.676484576708792e-06, + "loss": 4.4316, + "step": 27370 + }, + { + "epoch": 1.8599673868732165, + "grad_norm": 0.6040562987327576, + "learning_rate": 7.676059926620465e-06, + "loss": 4.5994, + "step": 27375 + }, + { + "epoch": 1.8603071069438784, + "grad_norm": 0.3959942162036896, + "learning_rate": 7.675635276532138e-06, + "loss": 4.7802, + "step": 27380 + }, + { + "epoch": 1.86064682701454, + "grad_norm": 0.35631734132766724, + "learning_rate": 7.67521062644381e-06, + "loss": 4.4791, + "step": 27385 + }, + { + "epoch": 1.8609865470852018, + "grad_norm": 0.47452282905578613, + "learning_rate": 7.674785976355483e-06, + "loss": 4.3951, + "step": 27390 + }, + { + "epoch": 1.8613262671558637, + "grad_norm": 0.3281933069229126, + "learning_rate": 7.674361326267156e-06, + "loss": 4.716, + "step": 27395 + }, + { + "epoch": 1.8616659872265253, + "grad_norm": 0.5148266553878784, + "learning_rate": 7.673936676178829e-06, + "loss": 4.8671, + "step": 27400 + }, + { + "epoch": 1.862005707297187, + "grad_norm": 0.6066763401031494, + "learning_rate": 7.673512026090502e-06, + "loss": 4.6006, + "step": 27405 + }, + { + "epoch": 1.862345427367849, + "grad_norm": 0.407998651266098, + "learning_rate": 7.673087376002175e-06, + "loss": 4.7818, + "step": 27410 + }, + { + "epoch": 1.8626851474385107, + "grad_norm": 0.466865599155426, + "learning_rate": 7.672662725913847e-06, + "loss": 4.9534, + "step": 27415 + }, + { + "epoch": 1.8630248675091723, + "grad_norm": 0.3713204860687256, + "learning_rate": 7.67223807582552e-06, + "loss": 4.5852, + "step": 27420 + }, + { + "epoch": 1.8633645875798344, + "grad_norm": 0.4286060035228729, + "learning_rate": 7.671813425737193e-06, + "loss": 4.6469, + "step": 27425 + }, + { + "epoch": 1.863704307650496, + "grad_norm": 0.39361587166786194, + "learning_rate": 7.671388775648866e-06, + "loss": 4.6669, + "step": 27430 + }, + { + "epoch": 1.8640440277211576, + "grad_norm": 0.3299100995063782, + "learning_rate": 7.670964125560539e-06, + "loss": 4.7134, + "step": 27435 + }, + { + "epoch": 1.8643837477918197, + "grad_norm": 0.4639350473880768, + "learning_rate": 7.670539475472211e-06, + "loss": 4.837, + "step": 27440 + }, + { + "epoch": 1.8647234678624813, + "grad_norm": 0.42637357115745544, + "learning_rate": 7.670114825383884e-06, + "loss": 4.569, + "step": 27445 + }, + { + "epoch": 1.865063187933143, + "grad_norm": 0.3950236439704895, + "learning_rate": 7.669690175295557e-06, + "loss": 4.4698, + "step": 27450 + }, + { + "epoch": 1.8654029080038048, + "grad_norm": 0.4261275827884674, + "learning_rate": 7.66926552520723e-06, + "loss": 4.481, + "step": 27455 + }, + { + "epoch": 1.8657426280744667, + "grad_norm": 0.3813804090023041, + "learning_rate": 7.668840875118903e-06, + "loss": 4.719, + "step": 27460 + }, + { + "epoch": 1.8660823481451283, + "grad_norm": 0.5265907049179077, + "learning_rate": 7.668416225030575e-06, + "loss": 4.5483, + "step": 27465 + }, + { + "epoch": 1.8664220682157902, + "grad_norm": 0.39182499051094055, + "learning_rate": 7.667991574942248e-06, + "loss": 4.8639, + "step": 27470 + }, + { + "epoch": 1.866761788286452, + "grad_norm": 0.35301387310028076, + "learning_rate": 7.667566924853921e-06, + "loss": 4.6155, + "step": 27475 + }, + { + "epoch": 1.8671015083571136, + "grad_norm": 0.40110522508621216, + "learning_rate": 7.667142274765594e-06, + "loss": 4.7166, + "step": 27480 + }, + { + "epoch": 1.8674412284277755, + "grad_norm": 0.3386758267879486, + "learning_rate": 7.666717624677267e-06, + "loss": 4.4583, + "step": 27485 + }, + { + "epoch": 1.8677809484984373, + "grad_norm": 0.38497141003608704, + "learning_rate": 7.66629297458894e-06, + "loss": 4.3599, + "step": 27490 + }, + { + "epoch": 1.868120668569099, + "grad_norm": 0.5052996873855591, + "learning_rate": 7.665868324500612e-06, + "loss": 4.8317, + "step": 27495 + }, + { + "epoch": 1.8684603886397608, + "grad_norm": 0.4994673430919647, + "learning_rate": 7.665443674412285e-06, + "loss": 4.6245, + "step": 27500 + }, + { + "epoch": 1.8688001087104227, + "grad_norm": 0.6175853610038757, + "learning_rate": 7.665019024323958e-06, + "loss": 4.7436, + "step": 27505 + }, + { + "epoch": 1.8691398287810843, + "grad_norm": 0.459731787443161, + "learning_rate": 7.66459437423563e-06, + "loss": 4.3128, + "step": 27510 + }, + { + "epoch": 1.8694795488517462, + "grad_norm": 0.3705977201461792, + "learning_rate": 7.664169724147303e-06, + "loss": 4.5501, + "step": 27515 + }, + { + "epoch": 1.869819268922408, + "grad_norm": 0.4135197699069977, + "learning_rate": 7.663745074058976e-06, + "loss": 4.5698, + "step": 27520 + }, + { + "epoch": 1.8701589889930696, + "grad_norm": 0.3988901674747467, + "learning_rate": 7.663320423970649e-06, + "loss": 4.3616, + "step": 27525 + }, + { + "epoch": 1.8704987090637315, + "grad_norm": 0.33010080456733704, + "learning_rate": 7.662895773882322e-06, + "loss": 4.6162, + "step": 27530 + }, + { + "epoch": 1.8708384291343934, + "grad_norm": 0.4123457372188568, + "learning_rate": 7.662471123793995e-06, + "loss": 4.6183, + "step": 27535 + }, + { + "epoch": 1.871178149205055, + "grad_norm": 0.39774659276008606, + "learning_rate": 7.662046473705667e-06, + "loss": 4.8802, + "step": 27540 + }, + { + "epoch": 1.8715178692757168, + "grad_norm": 0.3332107663154602, + "learning_rate": 7.66162182361734e-06, + "loss": 4.3844, + "step": 27545 + }, + { + "epoch": 1.8718575893463787, + "grad_norm": 0.3574596643447876, + "learning_rate": 7.661197173529013e-06, + "loss": 4.7754, + "step": 27550 + }, + { + "epoch": 1.8721973094170403, + "grad_norm": 0.43073195219039917, + "learning_rate": 7.660772523440686e-06, + "loss": 4.4436, + "step": 27555 + }, + { + "epoch": 1.8725370294877022, + "grad_norm": 0.5201811194419861, + "learning_rate": 7.660347873352359e-06, + "loss": 4.4397, + "step": 27560 + }, + { + "epoch": 1.872876749558364, + "grad_norm": 0.35687386989593506, + "learning_rate": 7.659923223264031e-06, + "loss": 4.4632, + "step": 27565 + }, + { + "epoch": 1.8732164696290257, + "grad_norm": 0.3422692120075226, + "learning_rate": 7.659498573175704e-06, + "loss": 4.4936, + "step": 27570 + }, + { + "epoch": 1.8735561896996873, + "grad_norm": 0.5075245499610901, + "learning_rate": 7.659073923087377e-06, + "loss": 4.6555, + "step": 27575 + }, + { + "epoch": 1.8738959097703494, + "grad_norm": 0.5689677000045776, + "learning_rate": 7.65864927299905e-06, + "loss": 4.7581, + "step": 27580 + }, + { + "epoch": 1.874235629841011, + "grad_norm": 0.3652532696723938, + "learning_rate": 7.658224622910723e-06, + "loss": 4.5384, + "step": 27585 + }, + { + "epoch": 1.8745753499116726, + "grad_norm": 0.32351458072662354, + "learning_rate": 7.657799972822395e-06, + "loss": 4.5633, + "step": 27590 + }, + { + "epoch": 1.8749150699823347, + "grad_norm": 0.38398414850234985, + "learning_rate": 7.657375322734068e-06, + "loss": 4.5901, + "step": 27595 + }, + { + "epoch": 1.8752547900529963, + "grad_norm": 0.4813700020313263, + "learning_rate": 7.656950672645741e-06, + "loss": 4.5107, + "step": 27600 + }, + { + "epoch": 1.875594510123658, + "grad_norm": 0.4504034221172333, + "learning_rate": 7.656526022557414e-06, + "loss": 4.7126, + "step": 27605 + }, + { + "epoch": 1.87593423019432, + "grad_norm": 0.3433671295642853, + "learning_rate": 7.656101372469087e-06, + "loss": 4.6708, + "step": 27610 + }, + { + "epoch": 1.8762739502649817, + "grad_norm": 0.38186898827552795, + "learning_rate": 7.65567672238076e-06, + "loss": 4.6763, + "step": 27615 + }, + { + "epoch": 1.8766136703356433, + "grad_norm": 0.3643990457057953, + "learning_rate": 7.655252072292432e-06, + "loss": 4.5981, + "step": 27620 + }, + { + "epoch": 1.8769533904063052, + "grad_norm": 0.48844143748283386, + "learning_rate": 7.654827422204103e-06, + "loss": 4.8404, + "step": 27625 + }, + { + "epoch": 1.877293110476967, + "grad_norm": 0.3779102861881256, + "learning_rate": 7.654402772115778e-06, + "loss": 4.3591, + "step": 27630 + }, + { + "epoch": 1.8776328305476286, + "grad_norm": 0.4410356879234314, + "learning_rate": 7.65397812202745e-06, + "loss": 4.3017, + "step": 27635 + }, + { + "epoch": 1.8779725506182905, + "grad_norm": 0.3204877972602844, + "learning_rate": 7.653553471939122e-06, + "loss": 4.6695, + "step": 27640 + }, + { + "epoch": 1.8783122706889523, + "grad_norm": 0.38509881496429443, + "learning_rate": 7.653128821850796e-06, + "loss": 4.5745, + "step": 27645 + }, + { + "epoch": 1.878651990759614, + "grad_norm": 0.5768201351165771, + "learning_rate": 7.652704171762469e-06, + "loss": 4.5112, + "step": 27650 + }, + { + "epoch": 1.8789917108302758, + "grad_norm": 0.3148260712623596, + "learning_rate": 7.65227952167414e-06, + "loss": 4.574, + "step": 27655 + }, + { + "epoch": 1.8793314309009377, + "grad_norm": 0.36598917841911316, + "learning_rate": 7.651854871585815e-06, + "loss": 4.4471, + "step": 27660 + }, + { + "epoch": 1.8796711509715993, + "grad_norm": 0.40334004163742065, + "learning_rate": 7.651430221497487e-06, + "loss": 4.7721, + "step": 27665 + }, + { + "epoch": 1.8800108710422612, + "grad_norm": 0.458253413438797, + "learning_rate": 7.651005571409159e-06, + "loss": 4.5269, + "step": 27670 + }, + { + "epoch": 1.880350591112923, + "grad_norm": 0.3873257040977478, + "learning_rate": 7.650580921320833e-06, + "loss": 4.7435, + "step": 27675 + }, + { + "epoch": 1.8806903111835847, + "grad_norm": 0.40018558502197266, + "learning_rate": 7.650156271232506e-06, + "loss": 4.7458, + "step": 27680 + }, + { + "epoch": 1.8810300312542465, + "grad_norm": 0.4664532542228699, + "learning_rate": 7.649731621144177e-06, + "loss": 4.6531, + "step": 27685 + }, + { + "epoch": 1.8813697513249084, + "grad_norm": 0.35977762937545776, + "learning_rate": 7.649306971055851e-06, + "loss": 4.5619, + "step": 27690 + }, + { + "epoch": 1.88170947139557, + "grad_norm": 0.5062816143035889, + "learning_rate": 7.648882320967524e-06, + "loss": 4.5707, + "step": 27695 + }, + { + "epoch": 1.8820491914662318, + "grad_norm": 0.3905816376209259, + "learning_rate": 7.648457670879195e-06, + "loss": 4.6454, + "step": 27700 + }, + { + "epoch": 1.8823889115368937, + "grad_norm": 0.4353102743625641, + "learning_rate": 7.64803302079087e-06, + "loss": 4.5149, + "step": 27705 + }, + { + "epoch": 1.8827286316075553, + "grad_norm": 0.4786261320114136, + "learning_rate": 7.647608370702541e-06, + "loss": 4.778, + "step": 27710 + }, + { + "epoch": 1.8830683516782172, + "grad_norm": 0.3742923438549042, + "learning_rate": 7.647183720614214e-06, + "loss": 4.6304, + "step": 27715 + }, + { + "epoch": 1.883408071748879, + "grad_norm": 0.383016973733902, + "learning_rate": 7.646759070525888e-06, + "loss": 4.4468, + "step": 27720 + }, + { + "epoch": 1.8837477918195407, + "grad_norm": 0.41954848170280457, + "learning_rate": 7.64633442043756e-06, + "loss": 4.6917, + "step": 27725 + }, + { + "epoch": 1.8840875118902025, + "grad_norm": 0.4741382896900177, + "learning_rate": 7.645909770349232e-06, + "loss": 4.5998, + "step": 27730 + }, + { + "epoch": 1.8844272319608644, + "grad_norm": 0.3931966722011566, + "learning_rate": 7.645485120260907e-06, + "loss": 4.6938, + "step": 27735 + }, + { + "epoch": 1.884766952031526, + "grad_norm": 0.4387543797492981, + "learning_rate": 7.645060470172578e-06, + "loss": 4.6779, + "step": 27740 + }, + { + "epoch": 1.8851066721021879, + "grad_norm": 0.34734228253364563, + "learning_rate": 7.64463582008425e-06, + "loss": 4.3581, + "step": 27745 + }, + { + "epoch": 1.8854463921728497, + "grad_norm": 0.35040217638015747, + "learning_rate": 7.644211169995925e-06, + "loss": 4.4113, + "step": 27750 + }, + { + "epoch": 1.8857861122435113, + "grad_norm": 0.3416902422904968, + "learning_rate": 7.643786519907596e-06, + "loss": 4.2745, + "step": 27755 + }, + { + "epoch": 1.886125832314173, + "grad_norm": 0.4290429651737213, + "learning_rate": 7.643361869819269e-06, + "loss": 4.6379, + "step": 27760 + }, + { + "epoch": 1.886465552384835, + "grad_norm": 0.38537439703941345, + "learning_rate": 7.642937219730943e-06, + "loss": 4.5852, + "step": 27765 + }, + { + "epoch": 1.8868052724554967, + "grad_norm": 0.4091152846813202, + "learning_rate": 7.642512569642615e-06, + "loss": 4.6014, + "step": 27770 + }, + { + "epoch": 1.8871449925261583, + "grad_norm": 0.39994847774505615, + "learning_rate": 7.642087919554287e-06, + "loss": 4.3052, + "step": 27775 + }, + { + "epoch": 1.8874847125968204, + "grad_norm": 0.2883261442184448, + "learning_rate": 7.64166326946596e-06, + "loss": 4.3016, + "step": 27780 + }, + { + "epoch": 1.887824432667482, + "grad_norm": 0.4956487715244293, + "learning_rate": 7.641238619377633e-06, + "loss": 4.441, + "step": 27785 + }, + { + "epoch": 1.8881641527381436, + "grad_norm": 0.35177138447761536, + "learning_rate": 7.640813969289306e-06, + "loss": 4.7895, + "step": 27790 + }, + { + "epoch": 1.8885038728088055, + "grad_norm": 0.3111691176891327, + "learning_rate": 7.640389319200979e-06, + "loss": 4.3512, + "step": 27795 + }, + { + "epoch": 1.8888435928794673, + "grad_norm": 0.38490819931030273, + "learning_rate": 7.639964669112651e-06, + "loss": 4.4841, + "step": 27800 + }, + { + "epoch": 1.889183312950129, + "grad_norm": 0.32912689447402954, + "learning_rate": 7.639540019024324e-06, + "loss": 4.5531, + "step": 27805 + }, + { + "epoch": 1.8895230330207908, + "grad_norm": 0.37406617403030396, + "learning_rate": 7.639115368935997e-06, + "loss": 4.7871, + "step": 27810 + }, + { + "epoch": 1.8898627530914527, + "grad_norm": 0.5683684945106506, + "learning_rate": 7.63869071884767e-06, + "loss": 4.5574, + "step": 27815 + }, + { + "epoch": 1.8902024731621143, + "grad_norm": 0.332856148481369, + "learning_rate": 7.638266068759343e-06, + "loss": 4.5149, + "step": 27820 + }, + { + "epoch": 1.8905421932327762, + "grad_norm": 0.3295251727104187, + "learning_rate": 7.637841418671015e-06, + "loss": 4.3017, + "step": 27825 + }, + { + "epoch": 1.890881913303438, + "grad_norm": 0.5184071063995361, + "learning_rate": 7.637416768582688e-06, + "loss": 4.8458, + "step": 27830 + }, + { + "epoch": 1.8912216333740997, + "grad_norm": 0.40060925483703613, + "learning_rate": 7.636992118494361e-06, + "loss": 4.6817, + "step": 27835 + }, + { + "epoch": 1.8915613534447615, + "grad_norm": 0.3994477391242981, + "learning_rate": 7.636567468406034e-06, + "loss": 4.5513, + "step": 27840 + }, + { + "epoch": 1.8919010735154234, + "grad_norm": 0.429721862077713, + "learning_rate": 7.636142818317707e-06, + "loss": 4.5105, + "step": 27845 + }, + { + "epoch": 1.892240793586085, + "grad_norm": 0.47275376319885254, + "learning_rate": 7.63571816822938e-06, + "loss": 4.6576, + "step": 27850 + }, + { + "epoch": 1.8925805136567468, + "grad_norm": 0.43747997283935547, + "learning_rate": 7.635293518141052e-06, + "loss": 4.8012, + "step": 27855 + }, + { + "epoch": 1.8929202337274087, + "grad_norm": 0.38351017236709595, + "learning_rate": 7.634868868052725e-06, + "loss": 4.8081, + "step": 27860 + }, + { + "epoch": 1.8932599537980703, + "grad_norm": 0.663493812084198, + "learning_rate": 7.634444217964398e-06, + "loss": 4.5683, + "step": 27865 + }, + { + "epoch": 1.8935996738687322, + "grad_norm": 0.4414658844470978, + "learning_rate": 7.63401956787607e-06, + "loss": 4.7497, + "step": 27870 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.42425161600112915, + "learning_rate": 7.633594917787743e-06, + "loss": 4.5629, + "step": 27875 + }, + { + "epoch": 1.8942791140100557, + "grad_norm": 0.4470316469669342, + "learning_rate": 7.633170267699416e-06, + "loss": 4.7408, + "step": 27880 + }, + { + "epoch": 1.8946188340807175, + "grad_norm": 0.4373485743999481, + "learning_rate": 7.632745617611089e-06, + "loss": 4.4958, + "step": 27885 + }, + { + "epoch": 1.8949585541513794, + "grad_norm": 0.3125493824481964, + "learning_rate": 7.632320967522762e-06, + "loss": 4.3313, + "step": 27890 + }, + { + "epoch": 1.895298274222041, + "grad_norm": 0.4364946186542511, + "learning_rate": 7.631896317434435e-06, + "loss": 4.6828, + "step": 27895 + }, + { + "epoch": 1.8956379942927029, + "grad_norm": 0.39508867263793945, + "learning_rate": 7.631471667346107e-06, + "loss": 4.5748, + "step": 27900 + }, + { + "epoch": 1.8959777143633647, + "grad_norm": 0.45771777629852295, + "learning_rate": 7.63104701725778e-06, + "loss": 4.4415, + "step": 27905 + }, + { + "epoch": 1.8963174344340263, + "grad_norm": 0.37288418412208557, + "learning_rate": 7.630622367169453e-06, + "loss": 4.4819, + "step": 27910 + }, + { + "epoch": 1.8966571545046882, + "grad_norm": 0.3681783080101013, + "learning_rate": 7.630197717081126e-06, + "loss": 4.5109, + "step": 27915 + }, + { + "epoch": 1.89699687457535, + "grad_norm": 0.47219252586364746, + "learning_rate": 7.629773066992799e-06, + "loss": 4.6523, + "step": 27920 + }, + { + "epoch": 1.8973365946460117, + "grad_norm": 0.4215749502182007, + "learning_rate": 7.629348416904472e-06, + "loss": 4.6509, + "step": 27925 + }, + { + "epoch": 1.8976763147166733, + "grad_norm": 0.388714998960495, + "learning_rate": 7.628923766816144e-06, + "loss": 4.5263, + "step": 27930 + }, + { + "epoch": 1.8980160347873354, + "grad_norm": 0.4396142065525055, + "learning_rate": 7.628499116727817e-06, + "loss": 4.4961, + "step": 27935 + }, + { + "epoch": 1.898355754857997, + "grad_norm": 0.4735078513622284, + "learning_rate": 7.62807446663949e-06, + "loss": 4.5365, + "step": 27940 + }, + { + "epoch": 1.8986954749286586, + "grad_norm": 0.401222825050354, + "learning_rate": 7.6276498165511625e-06, + "loss": 4.4989, + "step": 27945 + }, + { + "epoch": 1.8990351949993207, + "grad_norm": 0.3564615845680237, + "learning_rate": 7.627225166462835e-06, + "loss": 4.445, + "step": 27950 + }, + { + "epoch": 1.8993749150699824, + "grad_norm": 0.41821739077568054, + "learning_rate": 7.626800516374508e-06, + "loss": 4.4332, + "step": 27955 + }, + { + "epoch": 1.899714635140644, + "grad_norm": 0.33287304639816284, + "learning_rate": 7.626375866286181e-06, + "loss": 4.4981, + "step": 27960 + }, + { + "epoch": 1.9000543552113058, + "grad_norm": 0.4037229120731354, + "learning_rate": 7.625951216197853e-06, + "loss": 4.6521, + "step": 27965 + }, + { + "epoch": 1.9003940752819677, + "grad_norm": 0.322032630443573, + "learning_rate": 7.6255265661095265e-06, + "loss": 4.647, + "step": 27970 + }, + { + "epoch": 1.9007337953526293, + "grad_norm": 0.3777194023132324, + "learning_rate": 7.625101916021199e-06, + "loss": 4.617, + "step": 27975 + }, + { + "epoch": 1.9010735154232912, + "grad_norm": 0.3502219021320343, + "learning_rate": 7.624677265932871e-06, + "loss": 4.5842, + "step": 27980 + }, + { + "epoch": 1.901413235493953, + "grad_norm": 0.36773183941841125, + "learning_rate": 7.624252615844545e-06, + "loss": 4.8035, + "step": 27985 + }, + { + "epoch": 1.9017529555646147, + "grad_norm": 0.4477490782737732, + "learning_rate": 7.623827965756218e-06, + "loss": 4.5736, + "step": 27990 + }, + { + "epoch": 1.9020926756352765, + "grad_norm": 0.49055472016334534, + "learning_rate": 7.62340331566789e-06, + "loss": 4.4424, + "step": 27995 + }, + { + "epoch": 1.9024323957059384, + "grad_norm": 0.3831854462623596, + "learning_rate": 7.622978665579563e-06, + "loss": 4.3302, + "step": 28000 + }, + { + "epoch": 1.9027721157766, + "grad_norm": 0.346426784992218, + "learning_rate": 7.622554015491236e-06, + "loss": 4.5615, + "step": 28005 + }, + { + "epoch": 1.9031118358472618, + "grad_norm": 0.39707106351852417, + "learning_rate": 7.622129365402908e-06, + "loss": 4.723, + "step": 28010 + }, + { + "epoch": 1.9034515559179237, + "grad_norm": 0.43244463205337524, + "learning_rate": 7.621704715314582e-06, + "loss": 4.1973, + "step": 28015 + }, + { + "epoch": 1.9037912759885853, + "grad_norm": 0.37452980875968933, + "learning_rate": 7.6212800652262546e-06, + "loss": 4.5491, + "step": 28020 + }, + { + "epoch": 1.9041309960592472, + "grad_norm": 0.40550151467323303, + "learning_rate": 7.6208554151379265e-06, + "loss": 4.5017, + "step": 28025 + }, + { + "epoch": 1.904470716129909, + "grad_norm": 0.5356399416923523, + "learning_rate": 7.6204307650496e-06, + "loss": 4.5753, + "step": 28030 + }, + { + "epoch": 1.9048104362005707, + "grad_norm": 0.4444677531719208, + "learning_rate": 7.620006114961272e-06, + "loss": 4.6906, + "step": 28035 + }, + { + "epoch": 1.9051501562712325, + "grad_norm": 0.5730629563331604, + "learning_rate": 7.619581464872945e-06, + "loss": 4.7221, + "step": 28040 + }, + { + "epoch": 1.9054898763418944, + "grad_norm": 0.3509010076522827, + "learning_rate": 7.6191568147846186e-06, + "loss": 4.8024, + "step": 28045 + }, + { + "epoch": 1.905829596412556, + "grad_norm": 0.41072776913642883, + "learning_rate": 7.6187321646962905e-06, + "loss": 4.3294, + "step": 28050 + }, + { + "epoch": 1.9061693164832179, + "grad_norm": 0.49228978157043457, + "learning_rate": 7.618307514607963e-06, + "loss": 4.5574, + "step": 28055 + }, + { + "epoch": 1.9065090365538797, + "grad_norm": 0.4026014804840088, + "learning_rate": 7.617882864519637e-06, + "loss": 4.5945, + "step": 28060 + }, + { + "epoch": 1.9068487566245413, + "grad_norm": 0.3380400538444519, + "learning_rate": 7.617458214431309e-06, + "loss": 4.3244, + "step": 28065 + }, + { + "epoch": 1.9071884766952032, + "grad_norm": 0.31337282061576843, + "learning_rate": 7.617033564342982e-06, + "loss": 4.4037, + "step": 28070 + }, + { + "epoch": 1.907528196765865, + "grad_norm": 0.3957672715187073, + "learning_rate": 7.616608914254655e-06, + "loss": 4.7328, + "step": 28075 + }, + { + "epoch": 1.9078679168365267, + "grad_norm": 0.3141512870788574, + "learning_rate": 7.616184264166327e-06, + "loss": 4.6308, + "step": 28080 + }, + { + "epoch": 1.9082076369071885, + "grad_norm": 0.4378131628036499, + "learning_rate": 7.615759614078e-06, + "loss": 4.3445, + "step": 28085 + }, + { + "epoch": 1.9085473569778504, + "grad_norm": 0.3594874143600464, + "learning_rate": 7.615334963989674e-06, + "loss": 4.5698, + "step": 28090 + }, + { + "epoch": 1.908887077048512, + "grad_norm": 0.3965783715248108, + "learning_rate": 7.614910313901346e-06, + "loss": 4.4154, + "step": 28095 + }, + { + "epoch": 1.9092267971191736, + "grad_norm": 0.4417901039123535, + "learning_rate": 7.6144856638130185e-06, + "loss": 4.4791, + "step": 28100 + }, + { + "epoch": 1.9095665171898357, + "grad_norm": 0.3731612265110016, + "learning_rate": 7.614061013724692e-06, + "loss": 4.459, + "step": 28105 + }, + { + "epoch": 1.9099062372604974, + "grad_norm": 0.4512810707092285, + "learning_rate": 7.613636363636364e-06, + "loss": 4.4279, + "step": 28110 + }, + { + "epoch": 1.910245957331159, + "grad_norm": 0.40797674655914307, + "learning_rate": 7.613211713548037e-06, + "loss": 4.4717, + "step": 28115 + }, + { + "epoch": 1.910585677401821, + "grad_norm": 0.38960981369018555, + "learning_rate": 7.61278706345971e-06, + "loss": 4.6663, + "step": 28120 + }, + { + "epoch": 1.9109253974724827, + "grad_norm": 0.37361815571784973, + "learning_rate": 7.6123624133713825e-06, + "loss": 4.7497, + "step": 28125 + }, + { + "epoch": 1.9112651175431443, + "grad_norm": 0.417995423078537, + "learning_rate": 7.6119377632830545e-06, + "loss": 4.6582, + "step": 28130 + }, + { + "epoch": 1.9116048376138062, + "grad_norm": 0.4713684916496277, + "learning_rate": 7.611513113194728e-06, + "loss": 4.5783, + "step": 28135 + }, + { + "epoch": 1.911944557684468, + "grad_norm": 0.3776243329048157, + "learning_rate": 7.611088463106401e-06, + "loss": 4.8707, + "step": 28140 + }, + { + "epoch": 1.9122842777551297, + "grad_norm": 0.313595175743103, + "learning_rate": 7.610663813018073e-06, + "loss": 4.688, + "step": 28145 + }, + { + "epoch": 1.9126239978257915, + "grad_norm": 0.4245684742927551, + "learning_rate": 7.6102391629297465e-06, + "loss": 4.4776, + "step": 28150 + }, + { + "epoch": 1.9129637178964534, + "grad_norm": 0.36567604541778564, + "learning_rate": 7.609814512841419e-06, + "loss": 4.6138, + "step": 28155 + }, + { + "epoch": 1.913303437967115, + "grad_norm": 0.42498674988746643, + "learning_rate": 7.609389862753091e-06, + "loss": 4.7758, + "step": 28160 + }, + { + "epoch": 1.9136431580377768, + "grad_norm": 0.39247697591781616, + "learning_rate": 7.608965212664765e-06, + "loss": 4.6073, + "step": 28165 + }, + { + "epoch": 1.9139828781084387, + "grad_norm": 0.4384235441684723, + "learning_rate": 7.608540562576438e-06, + "loss": 4.4912, + "step": 28170 + }, + { + "epoch": 1.9143225981791003, + "grad_norm": 0.44547516107559204, + "learning_rate": 7.60811591248811e-06, + "loss": 4.4231, + "step": 28175 + }, + { + "epoch": 1.9146623182497622, + "grad_norm": 0.4911632835865021, + "learning_rate": 7.607691262399783e-06, + "loss": 4.567, + "step": 28180 + }, + { + "epoch": 1.915002038320424, + "grad_norm": 0.4453125, + "learning_rate": 7.607266612311456e-06, + "loss": 4.5936, + "step": 28185 + }, + { + "epoch": 1.9153417583910857, + "grad_norm": 0.48313602805137634, + "learning_rate": 7.606841962223129e-06, + "loss": 4.5222, + "step": 28190 + }, + { + "epoch": 1.9156814784617475, + "grad_norm": 0.35515785217285156, + "learning_rate": 7.606417312134802e-06, + "loss": 4.3815, + "step": 28195 + }, + { + "epoch": 1.9160211985324094, + "grad_norm": 0.36267298460006714, + "learning_rate": 7.6059926620464745e-06, + "loss": 4.8367, + "step": 28200 + }, + { + "epoch": 1.916360918603071, + "grad_norm": 0.4785403311252594, + "learning_rate": 7.605568011958147e-06, + "loss": 4.6601, + "step": 28205 + }, + { + "epoch": 1.9167006386737329, + "grad_norm": 0.2918042242527008, + "learning_rate": 7.60514336186982e-06, + "loss": 4.3717, + "step": 28210 + }, + { + "epoch": 1.9170403587443947, + "grad_norm": 0.46290573477745056, + "learning_rate": 7.604718711781492e-06, + "loss": 4.4028, + "step": 28215 + }, + { + "epoch": 1.9173800788150563, + "grad_norm": 0.397903710603714, + "learning_rate": 7.604294061693166e-06, + "loss": 4.5767, + "step": 28220 + }, + { + "epoch": 1.9177197988857182, + "grad_norm": 0.5363309383392334, + "learning_rate": 7.6038694116048385e-06, + "loss": 4.5442, + "step": 28225 + }, + { + "epoch": 1.91805951895638, + "grad_norm": 0.3947249948978424, + "learning_rate": 7.6034447615165105e-06, + "loss": 4.0535, + "step": 28230 + }, + { + "epoch": 1.9183992390270417, + "grad_norm": 0.31306594610214233, + "learning_rate": 7.603020111428184e-06, + "loss": 4.3485, + "step": 28235 + }, + { + "epoch": 1.9187389590977035, + "grad_norm": 0.3273119628429413, + "learning_rate": 7.602595461339857e-06, + "loss": 4.524, + "step": 28240 + }, + { + "epoch": 1.9190786791683654, + "grad_norm": 0.4812859892845154, + "learning_rate": 7.602170811251529e-06, + "loss": 4.5521, + "step": 28245 + }, + { + "epoch": 1.919418399239027, + "grad_norm": 0.4291004240512848, + "learning_rate": 7.6017461611632025e-06, + "loss": 4.3461, + "step": 28250 + }, + { + "epoch": 1.9197581193096889, + "grad_norm": 0.45089489221572876, + "learning_rate": 7.601321511074875e-06, + "loss": 4.7493, + "step": 28255 + }, + { + "epoch": 1.9200978393803507, + "grad_norm": 0.40887829661369324, + "learning_rate": 7.600896860986547e-06, + "loss": 4.7499, + "step": 28260 + }, + { + "epoch": 1.9204375594510124, + "grad_norm": 0.5145461559295654, + "learning_rate": 7.600472210898221e-06, + "loss": 4.7835, + "step": 28265 + }, + { + "epoch": 1.920777279521674, + "grad_norm": 0.45430827140808105, + "learning_rate": 7.600047560809894e-06, + "loss": 4.6181, + "step": 28270 + }, + { + "epoch": 1.921116999592336, + "grad_norm": 0.3680402934551239, + "learning_rate": 7.599622910721566e-06, + "loss": 4.6191, + "step": 28275 + }, + { + "epoch": 1.9214567196629977, + "grad_norm": 0.37025126814842224, + "learning_rate": 7.599198260633239e-06, + "loss": 4.6275, + "step": 28280 + }, + { + "epoch": 1.9217964397336593, + "grad_norm": 0.4081991910934448, + "learning_rate": 7.598773610544911e-06, + "loss": 4.6925, + "step": 28285 + }, + { + "epoch": 1.9221361598043214, + "grad_norm": 0.31962689757347107, + "learning_rate": 7.598348960456584e-06, + "loss": 4.6229, + "step": 28290 + }, + { + "epoch": 1.922475879874983, + "grad_norm": 0.4629650413990021, + "learning_rate": 7.597924310368258e-06, + "loss": 4.6665, + "step": 28295 + }, + { + "epoch": 1.9228155999456447, + "grad_norm": 0.42062634229660034, + "learning_rate": 7.59749966027993e-06, + "loss": 4.6656, + "step": 28300 + }, + { + "epoch": 1.9231553200163065, + "grad_norm": 0.35109743475914, + "learning_rate": 7.5970750101916025e-06, + "loss": 4.5552, + "step": 28305 + }, + { + "epoch": 1.9234950400869684, + "grad_norm": 0.34536150097846985, + "learning_rate": 7.596650360103276e-06, + "loss": 4.5118, + "step": 28310 + }, + { + "epoch": 1.92383476015763, + "grad_norm": 0.3114737868309021, + "learning_rate": 7.596225710014948e-06, + "loss": 4.7304, + "step": 28315 + }, + { + "epoch": 1.9241744802282919, + "grad_norm": 0.7265421748161316, + "learning_rate": 7.595801059926621e-06, + "loss": 4.6115, + "step": 28320 + }, + { + "epoch": 1.9245142002989537, + "grad_norm": 0.5215314626693726, + "learning_rate": 7.5953764098382946e-06, + "loss": 4.7357, + "step": 28325 + }, + { + "epoch": 1.9248539203696153, + "grad_norm": 0.4533412456512451, + "learning_rate": 7.5949517597499665e-06, + "loss": 4.6316, + "step": 28330 + }, + { + "epoch": 1.9251936404402772, + "grad_norm": 0.482986181974411, + "learning_rate": 7.594527109661639e-06, + "loss": 4.7566, + "step": 28335 + }, + { + "epoch": 1.925533360510939, + "grad_norm": 0.38249072432518005, + "learning_rate": 7.594102459573313e-06, + "loss": 4.7872, + "step": 28340 + }, + { + "epoch": 1.9258730805816007, + "grad_norm": 0.7260622382164001, + "learning_rate": 7.593677809484985e-06, + "loss": 4.5043, + "step": 28345 + }, + { + "epoch": 1.9262128006522625, + "grad_norm": 0.4211864173412323, + "learning_rate": 7.593253159396658e-06, + "loss": 4.4772, + "step": 28350 + }, + { + "epoch": 1.9265525207229244, + "grad_norm": 0.405673086643219, + "learning_rate": 7.5928285093083305e-06, + "loss": 4.3445, + "step": 28355 + }, + { + "epoch": 1.926892240793586, + "grad_norm": 0.4522375166416168, + "learning_rate": 7.592403859220003e-06, + "loss": 4.5194, + "step": 28360 + }, + { + "epoch": 1.9272319608642479, + "grad_norm": 0.3646881878376007, + "learning_rate": 7.591979209131676e-06, + "loss": 4.5022, + "step": 28365 + }, + { + "epoch": 1.9275716809349097, + "grad_norm": 0.3786730170249939, + "learning_rate": 7.591554559043349e-06, + "loss": 4.4642, + "step": 28370 + }, + { + "epoch": 1.9279114010055713, + "grad_norm": 0.3361833691596985, + "learning_rate": 7.591129908955022e-06, + "loss": 4.438, + "step": 28375 + }, + { + "epoch": 1.9282511210762332, + "grad_norm": 0.4981807768344879, + "learning_rate": 7.590705258866694e-06, + "loss": 4.5167, + "step": 28380 + }, + { + "epoch": 1.928590841146895, + "grad_norm": 0.2991105318069458, + "learning_rate": 7.590280608778367e-06, + "loss": 4.4347, + "step": 28385 + }, + { + "epoch": 1.9289305612175567, + "grad_norm": 0.5172913670539856, + "learning_rate": 7.58985595869004e-06, + "loss": 4.707, + "step": 28390 + }, + { + "epoch": 1.9292702812882185, + "grad_norm": 0.38931596279144287, + "learning_rate": 7.589431308601712e-06, + "loss": 4.5674, + "step": 28395 + }, + { + "epoch": 1.9296100013588804, + "grad_norm": 0.47036486864089966, + "learning_rate": 7.589006658513386e-06, + "loss": 4.6365, + "step": 28400 + }, + { + "epoch": 1.929949721429542, + "grad_norm": 0.45560920238494873, + "learning_rate": 7.5885820084250585e-06, + "loss": 4.4312, + "step": 28405 + }, + { + "epoch": 1.9302894415002039, + "grad_norm": 0.3283085823059082, + "learning_rate": 7.5881573583367305e-06, + "loss": 4.5526, + "step": 28410 + }, + { + "epoch": 1.9306291615708657, + "grad_norm": 0.48831331729888916, + "learning_rate": 7.587732708248404e-06, + "loss": 4.5069, + "step": 28415 + }, + { + "epoch": 1.9309688816415274, + "grad_norm": 0.3312652111053467, + "learning_rate": 7.587308058160077e-06, + "loss": 4.6905, + "step": 28420 + }, + { + "epoch": 1.9313086017121892, + "grad_norm": 0.38209375739097595, + "learning_rate": 7.586883408071749e-06, + "loss": 4.7512, + "step": 28425 + }, + { + "epoch": 1.931648321782851, + "grad_norm": 0.47176462411880493, + "learning_rate": 7.5864587579834225e-06, + "loss": 4.3902, + "step": 28430 + }, + { + "epoch": 1.9319880418535127, + "grad_norm": 0.4767155051231384, + "learning_rate": 7.586034107895095e-06, + "loss": 4.5331, + "step": 28435 + }, + { + "epoch": 1.9323277619241743, + "grad_norm": 0.48905545473098755, + "learning_rate": 7.585609457806767e-06, + "loss": 4.6757, + "step": 28440 + }, + { + "epoch": 1.9326674819948364, + "grad_norm": 0.3743753433227539, + "learning_rate": 7.585184807718441e-06, + "loss": 4.5363, + "step": 28445 + }, + { + "epoch": 1.933007202065498, + "grad_norm": 0.46111661195755005, + "learning_rate": 7.584760157630114e-06, + "loss": 4.4999, + "step": 28450 + }, + { + "epoch": 1.9333469221361597, + "grad_norm": 0.355182021856308, + "learning_rate": 7.584335507541786e-06, + "loss": 4.5532, + "step": 28455 + }, + { + "epoch": 1.9336866422068217, + "grad_norm": 0.33571553230285645, + "learning_rate": 7.583910857453459e-06, + "loss": 4.6245, + "step": 28460 + }, + { + "epoch": 1.9340263622774834, + "grad_norm": 0.4095207154750824, + "learning_rate": 7.583486207365131e-06, + "loss": 4.6127, + "step": 28465 + }, + { + "epoch": 1.934366082348145, + "grad_norm": 0.32605230808258057, + "learning_rate": 7.583061557276804e-06, + "loss": 4.5365, + "step": 28470 + }, + { + "epoch": 1.9347058024188069, + "grad_norm": 0.48662611842155457, + "learning_rate": 7.582636907188478e-06, + "loss": 4.7767, + "step": 28475 + }, + { + "epoch": 1.9350455224894687, + "grad_norm": 0.2920769155025482, + "learning_rate": 7.58221225710015e-06, + "loss": 4.576, + "step": 28480 + }, + { + "epoch": 1.9353852425601303, + "grad_norm": 0.4062121510505676, + "learning_rate": 7.5817876070118225e-06, + "loss": 4.453, + "step": 28485 + }, + { + "epoch": 1.9357249626307922, + "grad_norm": 0.33832648396492004, + "learning_rate": 7.581362956923496e-06, + "loss": 4.5025, + "step": 28490 + }, + { + "epoch": 1.936064682701454, + "grad_norm": 0.5310803055763245, + "learning_rate": 7.580938306835168e-06, + "loss": 4.7226, + "step": 28495 + }, + { + "epoch": 1.9364044027721157, + "grad_norm": 0.43315064907073975, + "learning_rate": 7.580513656746841e-06, + "loss": 4.5933, + "step": 28500 + }, + { + "epoch": 1.9367441228427775, + "grad_norm": 0.36797699332237244, + "learning_rate": 7.5800890066585145e-06, + "loss": 4.6518, + "step": 28505 + }, + { + "epoch": 1.9370838429134394, + "grad_norm": 0.45705971121788025, + "learning_rate": 7.5796643565701865e-06, + "loss": 4.4202, + "step": 28510 + }, + { + "epoch": 1.937423562984101, + "grad_norm": 0.4165302515029907, + "learning_rate": 7.579239706481859e-06, + "loss": 4.6854, + "step": 28515 + }, + { + "epoch": 1.9377632830547629, + "grad_norm": 0.396110475063324, + "learning_rate": 7.578815056393533e-06, + "loss": 4.5003, + "step": 28520 + }, + { + "epoch": 1.9381030031254247, + "grad_norm": 0.3478889465332031, + "learning_rate": 7.578390406305205e-06, + "loss": 4.6314, + "step": 28525 + }, + { + "epoch": 1.9384427231960863, + "grad_norm": 0.31362172961235046, + "learning_rate": 7.5779657562168785e-06, + "loss": 4.5099, + "step": 28530 + }, + { + "epoch": 1.9387824432667482, + "grad_norm": 0.3555907607078552, + "learning_rate": 7.5775411061285505e-06, + "loss": 4.5233, + "step": 28535 + }, + { + "epoch": 1.93912216333741, + "grad_norm": 0.57781982421875, + "learning_rate": 7.577116456040223e-06, + "loss": 4.7528, + "step": 28540 + }, + { + "epoch": 1.9394618834080717, + "grad_norm": 0.520878791809082, + "learning_rate": 7.576691805951897e-06, + "loss": 4.5345, + "step": 28545 + }, + { + "epoch": 1.9398016034787335, + "grad_norm": 0.37600454688072205, + "learning_rate": 7.576267155863569e-06, + "loss": 4.5337, + "step": 28550 + }, + { + "epoch": 1.9401413235493954, + "grad_norm": 0.3709515333175659, + "learning_rate": 7.575842505775242e-06, + "loss": 4.2927, + "step": 28555 + }, + { + "epoch": 1.940481043620057, + "grad_norm": 0.381867378950119, + "learning_rate": 7.575417855686915e-06, + "loss": 4.6501, + "step": 28560 + }, + { + "epoch": 1.9408207636907189, + "grad_norm": 0.34788453578948975, + "learning_rate": 7.574993205598587e-06, + "loss": 4.4583, + "step": 28565 + }, + { + "epoch": 1.9411604837613807, + "grad_norm": 0.3953450620174408, + "learning_rate": 7.57456855551026e-06, + "loss": 4.5855, + "step": 28570 + }, + { + "epoch": 1.9415002038320424, + "grad_norm": 0.4280330240726471, + "learning_rate": 7.574143905421934e-06, + "loss": 4.404, + "step": 28575 + }, + { + "epoch": 1.9418399239027042, + "grad_norm": 0.44367435574531555, + "learning_rate": 7.573719255333606e-06, + "loss": 4.674, + "step": 28580 + }, + { + "epoch": 1.942179643973366, + "grad_norm": 0.31751933693885803, + "learning_rate": 7.5732946052452785e-06, + "loss": 4.8792, + "step": 28585 + }, + { + "epoch": 1.9425193640440277, + "grad_norm": 0.5128030776977539, + "learning_rate": 7.572869955156952e-06, + "loss": 4.5263, + "step": 28590 + }, + { + "epoch": 1.9428590841146895, + "grad_norm": 0.38558897376060486, + "learning_rate": 7.572445305068624e-06, + "loss": 4.459, + "step": 28595 + }, + { + "epoch": 1.9431988041853514, + "grad_norm": 0.6151425242424011, + "learning_rate": 7.572020654980297e-06, + "loss": 4.3652, + "step": 28600 + }, + { + "epoch": 1.943538524256013, + "grad_norm": 0.4028032124042511, + "learning_rate": 7.57159600489197e-06, + "loss": 4.5582, + "step": 28605 + }, + { + "epoch": 1.9438782443266747, + "grad_norm": 0.38791951537132263, + "learning_rate": 7.5711713548036425e-06, + "loss": 4.7469, + "step": 28610 + }, + { + "epoch": 1.9442179643973367, + "grad_norm": 0.43401673436164856, + "learning_rate": 7.570746704715315e-06, + "loss": 4.5268, + "step": 28615 + }, + { + "epoch": 1.9445576844679984, + "grad_norm": 0.6374907493591309, + "learning_rate": 7.570322054626988e-06, + "loss": 4.1737, + "step": 28620 + }, + { + "epoch": 1.94489740453866, + "grad_norm": 0.4287441372871399, + "learning_rate": 7.569897404538661e-06, + "loss": 4.4767, + "step": 28625 + }, + { + "epoch": 1.945237124609322, + "grad_norm": 0.3428209722042084, + "learning_rate": 7.569472754450333e-06, + "loss": 4.5083, + "step": 28630 + }, + { + "epoch": 1.9455768446799837, + "grad_norm": 0.3912215530872345, + "learning_rate": 7.5690481043620065e-06, + "loss": 4.4507, + "step": 28635 + }, + { + "epoch": 1.9459165647506453, + "grad_norm": 0.41786980628967285, + "learning_rate": 7.568623454273679e-06, + "loss": 4.3154, + "step": 28640 + }, + { + "epoch": 1.9462562848213072, + "grad_norm": 0.42409369349479675, + "learning_rate": 7.568198804185351e-06, + "loss": 4.546, + "step": 28645 + }, + { + "epoch": 1.946596004891969, + "grad_norm": 0.5046223402023315, + "learning_rate": 7.567774154097025e-06, + "loss": 4.2544, + "step": 28650 + }, + { + "epoch": 1.9469357249626307, + "grad_norm": 0.4461921453475952, + "learning_rate": 7.567349504008698e-06, + "loss": 4.6479, + "step": 28655 + }, + { + "epoch": 1.9472754450332925, + "grad_norm": 0.4302857220172882, + "learning_rate": 7.56692485392037e-06, + "loss": 4.8264, + "step": 28660 + }, + { + "epoch": 1.9476151651039544, + "grad_norm": 0.44640621542930603, + "learning_rate": 7.566500203832043e-06, + "loss": 4.4671, + "step": 28665 + }, + { + "epoch": 1.947954885174616, + "grad_norm": 0.4881040155887604, + "learning_rate": 7.566075553743716e-06, + "loss": 4.5957, + "step": 28670 + }, + { + "epoch": 1.9482946052452779, + "grad_norm": 0.49948644638061523, + "learning_rate": 7.565650903655388e-06, + "loss": 4.4429, + "step": 28675 + }, + { + "epoch": 1.9486343253159397, + "grad_norm": 0.37833648920059204, + "learning_rate": 7.565226253567062e-06, + "loss": 4.6647, + "step": 28680 + }, + { + "epoch": 1.9489740453866014, + "grad_norm": 0.4400046169757843, + "learning_rate": 7.5648016034787345e-06, + "loss": 4.5577, + "step": 28685 + }, + { + "epoch": 1.9493137654572632, + "grad_norm": 0.422395795583725, + "learning_rate": 7.5643769533904065e-06, + "loss": 4.4151, + "step": 28690 + }, + { + "epoch": 1.949653485527925, + "grad_norm": 0.34346577525138855, + "learning_rate": 7.56395230330208e-06, + "loss": 4.5388, + "step": 28695 + }, + { + "epoch": 1.9499932055985867, + "grad_norm": 0.45671600103378296, + "learning_rate": 7.563527653213752e-06, + "loss": 4.6518, + "step": 28700 + }, + { + "epoch": 1.9503329256692485, + "grad_norm": 0.34220677614212036, + "learning_rate": 7.563103003125425e-06, + "loss": 4.4643, + "step": 28705 + }, + { + "epoch": 1.9506726457399104, + "grad_norm": 0.4951692521572113, + "learning_rate": 7.5626783530370985e-06, + "loss": 4.7853, + "step": 28710 + }, + { + "epoch": 1.951012365810572, + "grad_norm": 0.41911038756370544, + "learning_rate": 7.5622537029487705e-06, + "loss": 4.5302, + "step": 28715 + }, + { + "epoch": 1.9513520858812339, + "grad_norm": 0.323246031999588, + "learning_rate": 7.561829052860443e-06, + "loss": 4.5787, + "step": 28720 + }, + { + "epoch": 1.9516918059518957, + "grad_norm": 0.3656970262527466, + "learning_rate": 7.561404402772117e-06, + "loss": 4.4722, + "step": 28725 + }, + { + "epoch": 1.9520315260225574, + "grad_norm": 0.48602747917175293, + "learning_rate": 7.560979752683789e-06, + "loss": 4.804, + "step": 28730 + }, + { + "epoch": 1.9523712460932192, + "grad_norm": 0.38935714960098267, + "learning_rate": 7.560555102595462e-06, + "loss": 4.6343, + "step": 28735 + }, + { + "epoch": 1.952710966163881, + "grad_norm": 0.3878921568393707, + "learning_rate": 7.560130452507135e-06, + "loss": 4.5475, + "step": 28740 + }, + { + "epoch": 1.9530506862345427, + "grad_norm": 0.37403711676597595, + "learning_rate": 7.559705802418807e-06, + "loss": 4.7265, + "step": 28745 + }, + { + "epoch": 1.9533904063052046, + "grad_norm": 0.3533264398574829, + "learning_rate": 7.55928115233048e-06, + "loss": 4.2723, + "step": 28750 + }, + { + "epoch": 1.9537301263758664, + "grad_norm": 0.3427486717700958, + "learning_rate": 7.558856502242154e-06, + "loss": 4.7174, + "step": 28755 + }, + { + "epoch": 1.954069846446528, + "grad_norm": 0.38093480467796326, + "learning_rate": 7.558431852153826e-06, + "loss": 4.7461, + "step": 28760 + }, + { + "epoch": 1.95440956651719, + "grad_norm": 0.5190484523773193, + "learning_rate": 7.5580072020654985e-06, + "loss": 4.6289, + "step": 28765 + }, + { + "epoch": 1.9547492865878517, + "grad_norm": 0.4967063069343567, + "learning_rate": 7.557582551977172e-06, + "loss": 4.5841, + "step": 28770 + }, + { + "epoch": 1.9550890066585134, + "grad_norm": 0.4328489601612091, + "learning_rate": 7.557157901888844e-06, + "loss": 4.5968, + "step": 28775 + }, + { + "epoch": 1.955428726729175, + "grad_norm": 0.38537347316741943, + "learning_rate": 7.556733251800517e-06, + "loss": 4.6792, + "step": 28780 + }, + { + "epoch": 1.955768446799837, + "grad_norm": 0.47490394115448, + "learning_rate": 7.55630860171219e-06, + "loss": 4.5546, + "step": 28785 + }, + { + "epoch": 1.9561081668704987, + "grad_norm": 0.36968478560447693, + "learning_rate": 7.5558839516238625e-06, + "loss": 4.5709, + "step": 28790 + }, + { + "epoch": 1.9564478869411603, + "grad_norm": 0.45556706190109253, + "learning_rate": 7.5554593015355344e-06, + "loss": 4.4278, + "step": 28795 + }, + { + "epoch": 1.9567876070118224, + "grad_norm": 0.3205968141555786, + "learning_rate": 7.555034651447208e-06, + "loss": 4.544, + "step": 28800 + }, + { + "epoch": 1.957127327082484, + "grad_norm": 0.37424755096435547, + "learning_rate": 7.554610001358881e-06, + "loss": 4.3365, + "step": 28805 + }, + { + "epoch": 1.9574670471531457, + "grad_norm": 0.49891942739486694, + "learning_rate": 7.554185351270553e-06, + "loss": 4.5215, + "step": 28810 + }, + { + "epoch": 1.9578067672238075, + "grad_norm": 0.4222252666950226, + "learning_rate": 7.5537607011822265e-06, + "loss": 4.4911, + "step": 28815 + }, + { + "epoch": 1.9581464872944694, + "grad_norm": 0.36127957701683044, + "learning_rate": 7.553336051093899e-06, + "loss": 4.5284, + "step": 28820 + }, + { + "epoch": 1.958486207365131, + "grad_norm": 0.3841056525707245, + "learning_rate": 7.552911401005571e-06, + "loss": 4.5667, + "step": 28825 + }, + { + "epoch": 1.9588259274357929, + "grad_norm": 0.46359193325042725, + "learning_rate": 7.552486750917245e-06, + "loss": 4.5789, + "step": 28830 + }, + { + "epoch": 1.9591656475064547, + "grad_norm": 0.4217718541622162, + "learning_rate": 7.552062100828918e-06, + "loss": 4.4249, + "step": 28835 + }, + { + "epoch": 1.9595053675771164, + "grad_norm": 0.3774777948856354, + "learning_rate": 7.55163745074059e-06, + "loss": 4.511, + "step": 28840 + }, + { + "epoch": 1.9598450876477782, + "grad_norm": 0.40469157695770264, + "learning_rate": 7.551212800652263e-06, + "loss": 4.5648, + "step": 28845 + }, + { + "epoch": 1.96018480771844, + "grad_norm": 0.5816948413848877, + "learning_rate": 7.550788150563936e-06, + "loss": 4.5718, + "step": 28850 + }, + { + "epoch": 1.9605245277891017, + "grad_norm": 0.3243058919906616, + "learning_rate": 7.550363500475608e-06, + "loss": 4.6032, + "step": 28855 + }, + { + "epoch": 1.9608642478597635, + "grad_norm": 0.4013819992542267, + "learning_rate": 7.549938850387282e-06, + "loss": 4.6458, + "step": 28860 + }, + { + "epoch": 1.9612039679304254, + "grad_norm": 0.35949063301086426, + "learning_rate": 7.5495142002989545e-06, + "loss": 4.5564, + "step": 28865 + }, + { + "epoch": 1.961543688001087, + "grad_norm": 0.4248729348182678, + "learning_rate": 7.549089550210627e-06, + "loss": 4.6055, + "step": 28870 + }, + { + "epoch": 1.9618834080717489, + "grad_norm": 0.4235672652721405, + "learning_rate": 7.5486649001223e-06, + "loss": 4.7713, + "step": 28875 + }, + { + "epoch": 1.9622231281424107, + "grad_norm": 0.34289419651031494, + "learning_rate": 7.548240250033972e-06, + "loss": 4.4688, + "step": 28880 + }, + { + "epoch": 1.9625628482130724, + "grad_norm": 0.43661701679229736, + "learning_rate": 7.547815599945646e-06, + "loss": 4.4679, + "step": 28885 + }, + { + "epoch": 1.9629025682837342, + "grad_norm": 0.3615751564502716, + "learning_rate": 7.5473909498573185e-06, + "loss": 4.3857, + "step": 28890 + }, + { + "epoch": 1.963242288354396, + "grad_norm": 0.48710137605667114, + "learning_rate": 7.5469662997689905e-06, + "loss": 4.5797, + "step": 28895 + }, + { + "epoch": 1.9635820084250577, + "grad_norm": 0.39801931381225586, + "learning_rate": 7.546541649680664e-06, + "loss": 4.4206, + "step": 28900 + }, + { + "epoch": 1.9639217284957196, + "grad_norm": 0.5170295238494873, + "learning_rate": 7.546116999592337e-06, + "loss": 4.8206, + "step": 28905 + }, + { + "epoch": 1.9642614485663814, + "grad_norm": 0.40816667675971985, + "learning_rate": 7.545692349504009e-06, + "loss": 4.2303, + "step": 28910 + }, + { + "epoch": 1.964601168637043, + "grad_norm": 0.34545227885246277, + "learning_rate": 7.5452676994156825e-06, + "loss": 4.4911, + "step": 28915 + }, + { + "epoch": 1.964940888707705, + "grad_norm": 0.33072978258132935, + "learning_rate": 7.544843049327355e-06, + "loss": 4.7405, + "step": 28920 + }, + { + "epoch": 1.9652806087783667, + "grad_norm": 0.371162474155426, + "learning_rate": 7.544418399239027e-06, + "loss": 4.5733, + "step": 28925 + }, + { + "epoch": 1.9656203288490284, + "grad_norm": 0.41967371106147766, + "learning_rate": 7.543993749150701e-06, + "loss": 4.5502, + "step": 28930 + }, + { + "epoch": 1.9659600489196902, + "grad_norm": 0.4225642681121826, + "learning_rate": 7.543569099062374e-06, + "loss": 4.4581, + "step": 28935 + }, + { + "epoch": 1.966299768990352, + "grad_norm": 0.3096923828125, + "learning_rate": 7.543144448974046e-06, + "loss": 4.5672, + "step": 28940 + }, + { + "epoch": 1.9666394890610137, + "grad_norm": 0.33211421966552734, + "learning_rate": 7.542719798885719e-06, + "loss": 4.6379, + "step": 28945 + }, + { + "epoch": 1.9669792091316753, + "grad_norm": 0.6147076487541199, + "learning_rate": 7.542295148797391e-06, + "loss": 4.629, + "step": 28950 + }, + { + "epoch": 1.9673189292023374, + "grad_norm": 0.49156442284584045, + "learning_rate": 7.541870498709064e-06, + "loss": 4.6716, + "step": 28955 + }, + { + "epoch": 1.967658649272999, + "grad_norm": 0.3834266662597656, + "learning_rate": 7.541445848620738e-06, + "loss": 4.3221, + "step": 28960 + }, + { + "epoch": 1.9679983693436607, + "grad_norm": 0.46159055829048157, + "learning_rate": 7.54102119853241e-06, + "loss": 4.9319, + "step": 28965 + }, + { + "epoch": 1.9683380894143228, + "grad_norm": 0.5006670355796814, + "learning_rate": 7.5405965484440825e-06, + "loss": 4.7092, + "step": 28970 + }, + { + "epoch": 1.9686778094849844, + "grad_norm": 0.4457678198814392, + "learning_rate": 7.540171898355756e-06, + "loss": 4.3382, + "step": 28975 + }, + { + "epoch": 1.969017529555646, + "grad_norm": 0.32047775387763977, + "learning_rate": 7.539747248267428e-06, + "loss": 4.2698, + "step": 28980 + }, + { + "epoch": 1.9693572496263079, + "grad_norm": 0.48839321732521057, + "learning_rate": 7.539322598179101e-06, + "loss": 4.391, + "step": 28985 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.37976327538490295, + "learning_rate": 7.5388979480907745e-06, + "loss": 4.4047, + "step": 28990 + }, + { + "epoch": 1.9700366897676314, + "grad_norm": 0.3448270559310913, + "learning_rate": 7.5384732980024465e-06, + "loss": 4.2633, + "step": 28995 + }, + { + "epoch": 1.9703764098382932, + "grad_norm": 0.39294618368148804, + "learning_rate": 7.538048647914119e-06, + "loss": 4.6631, + "step": 29000 + }, + { + "epoch": 1.970716129908955, + "grad_norm": 0.34695619344711304, + "learning_rate": 7.537623997825793e-06, + "loss": 4.4671, + "step": 29005 + }, + { + "epoch": 1.9710558499796167, + "grad_norm": 0.36692720651626587, + "learning_rate": 7.537199347737465e-06, + "loss": 4.6219, + "step": 29010 + }, + { + "epoch": 1.9713955700502785, + "grad_norm": 0.43201446533203125, + "learning_rate": 7.536774697649138e-06, + "loss": 4.5912, + "step": 29015 + }, + { + "epoch": 1.9717352901209404, + "grad_norm": 0.4028998911380768, + "learning_rate": 7.5363500475608105e-06, + "loss": 4.6452, + "step": 29020 + }, + { + "epoch": 1.972075010191602, + "grad_norm": 0.3763993978500366, + "learning_rate": 7.535925397472483e-06, + "loss": 4.5201, + "step": 29025 + }, + { + "epoch": 1.9724147302622639, + "grad_norm": 0.6647985577583313, + "learning_rate": 7.535500747384156e-06, + "loss": 4.6565, + "step": 29030 + }, + { + "epoch": 1.9727544503329257, + "grad_norm": 0.46458208560943604, + "learning_rate": 7.535076097295829e-06, + "loss": 4.4843, + "step": 29035 + }, + { + "epoch": 1.9730941704035874, + "grad_norm": 0.4542465806007385, + "learning_rate": 7.534651447207502e-06, + "loss": 4.4301, + "step": 29040 + }, + { + "epoch": 1.9734338904742492, + "grad_norm": 0.49822622537612915, + "learning_rate": 7.534226797119174e-06, + "loss": 4.5061, + "step": 29045 + }, + { + "epoch": 1.973773610544911, + "grad_norm": 0.38393568992614746, + "learning_rate": 7.533802147030847e-06, + "loss": 4.5873, + "step": 29050 + }, + { + "epoch": 1.9741133306155727, + "grad_norm": 0.44437167048454285, + "learning_rate": 7.53337749694252e-06, + "loss": 4.7155, + "step": 29055 + }, + { + "epoch": 1.9744530506862346, + "grad_norm": 0.3133980929851532, + "learning_rate": 7.532952846854192e-06, + "loss": 4.54, + "step": 29060 + }, + { + "epoch": 1.9747927707568964, + "grad_norm": 0.42817607522010803, + "learning_rate": 7.532528196765866e-06, + "loss": 4.7121, + "step": 29065 + }, + { + "epoch": 1.975132490827558, + "grad_norm": 0.43325668573379517, + "learning_rate": 7.5321035466775385e-06, + "loss": 4.6057, + "step": 29070 + }, + { + "epoch": 1.97547221089822, + "grad_norm": 0.4215782880783081, + "learning_rate": 7.5316788965892104e-06, + "loss": 4.5427, + "step": 29075 + }, + { + "epoch": 1.9758119309688817, + "grad_norm": 0.3852553069591522, + "learning_rate": 7.531254246500884e-06, + "loss": 4.3371, + "step": 29080 + }, + { + "epoch": 1.9761516510395434, + "grad_norm": 0.4303349554538727, + "learning_rate": 7.530829596412557e-06, + "loss": 4.3948, + "step": 29085 + }, + { + "epoch": 1.9764913711102052, + "grad_norm": 0.4885919690132141, + "learning_rate": 7.530404946324229e-06, + "loss": 4.5973, + "step": 29090 + }, + { + "epoch": 1.976831091180867, + "grad_norm": 0.3710692524909973, + "learning_rate": 7.5299802962359025e-06, + "loss": 4.8631, + "step": 29095 + }, + { + "epoch": 1.9771708112515287, + "grad_norm": 0.33605173230171204, + "learning_rate": 7.529555646147575e-06, + "loss": 4.3029, + "step": 29100 + }, + { + "epoch": 1.9775105313221906, + "grad_norm": 0.3458583652973175, + "learning_rate": 7.529130996059247e-06, + "loss": 4.6397, + "step": 29105 + }, + { + "epoch": 1.9778502513928524, + "grad_norm": 0.4337533414363861, + "learning_rate": 7.528706345970921e-06, + "loss": 4.4295, + "step": 29110 + }, + { + "epoch": 1.978189971463514, + "grad_norm": 0.298714280128479, + "learning_rate": 7.528281695882594e-06, + "loss": 4.547, + "step": 29115 + }, + { + "epoch": 1.9785296915341757, + "grad_norm": 0.4034809172153473, + "learning_rate": 7.527857045794266e-06, + "loss": 4.516, + "step": 29120 + }, + { + "epoch": 1.9788694116048378, + "grad_norm": 0.3325503170490265, + "learning_rate": 7.527432395705939e-06, + "loss": 4.7478, + "step": 29125 + }, + { + "epoch": 1.9792091316754994, + "grad_norm": 0.4345927834510803, + "learning_rate": 7.527007745617611e-06, + "loss": 4.6143, + "step": 29130 + }, + { + "epoch": 1.979548851746161, + "grad_norm": 0.4233904182910919, + "learning_rate": 7.526583095529284e-06, + "loss": 4.6078, + "step": 29135 + }, + { + "epoch": 1.979888571816823, + "grad_norm": 0.5196325182914734, + "learning_rate": 7.526158445440958e-06, + "loss": 4.6113, + "step": 29140 + }, + { + "epoch": 1.9802282918874847, + "grad_norm": 0.3158845901489258, + "learning_rate": 7.52573379535263e-06, + "loss": 4.5626, + "step": 29145 + }, + { + "epoch": 1.9805680119581464, + "grad_norm": 0.35323551297187805, + "learning_rate": 7.5253091452643024e-06, + "loss": 4.4541, + "step": 29150 + }, + { + "epoch": 1.9809077320288082, + "grad_norm": 0.46035122871398926, + "learning_rate": 7.524884495175976e-06, + "loss": 4.349, + "step": 29155 + }, + { + "epoch": 1.98124745209947, + "grad_norm": 0.5592741370201111, + "learning_rate": 7.524459845087648e-06, + "loss": 4.7672, + "step": 29160 + }, + { + "epoch": 1.9815871721701317, + "grad_norm": 0.5948301553726196, + "learning_rate": 7.524035194999321e-06, + "loss": 4.4042, + "step": 29165 + }, + { + "epoch": 1.9819268922407935, + "grad_norm": 0.5960759520530701, + "learning_rate": 7.5236105449109945e-06, + "loss": 4.6679, + "step": 29170 + }, + { + "epoch": 1.9822666123114554, + "grad_norm": 0.302245169878006, + "learning_rate": 7.5231858948226665e-06, + "loss": 4.6953, + "step": 29175 + }, + { + "epoch": 1.982606332382117, + "grad_norm": 0.40396445989608765, + "learning_rate": 7.522761244734339e-06, + "loss": 4.4102, + "step": 29180 + }, + { + "epoch": 1.9829460524527789, + "grad_norm": 0.3579828143119812, + "learning_rate": 7.522336594646013e-06, + "loss": 4.5307, + "step": 29185 + }, + { + "epoch": 1.9832857725234407, + "grad_norm": 0.46441125869750977, + "learning_rate": 7.521911944557685e-06, + "loss": 4.6803, + "step": 29190 + }, + { + "epoch": 1.9836254925941024, + "grad_norm": 0.4146718382835388, + "learning_rate": 7.521487294469358e-06, + "loss": 4.6404, + "step": 29195 + }, + { + "epoch": 1.9839652126647642, + "grad_norm": 0.42987698316574097, + "learning_rate": 7.5210626443810305e-06, + "loss": 4.7075, + "step": 29200 + }, + { + "epoch": 1.984304932735426, + "grad_norm": 0.3788318336009979, + "learning_rate": 7.520637994292703e-06, + "loss": 4.3976, + "step": 29205 + }, + { + "epoch": 1.9846446528060877, + "grad_norm": 0.4073922336101532, + "learning_rate": 7.520213344204377e-06, + "loss": 4.3985, + "step": 29210 + }, + { + "epoch": 1.9849843728767496, + "grad_norm": 0.37523096799850464, + "learning_rate": 7.519788694116049e-06, + "loss": 4.4628, + "step": 29215 + }, + { + "epoch": 1.9853240929474114, + "grad_norm": 0.48765403032302856, + "learning_rate": 7.519364044027722e-06, + "loss": 4.6678, + "step": 29220 + }, + { + "epoch": 1.985663813018073, + "grad_norm": 0.43292951583862305, + "learning_rate": 7.518939393939395e-06, + "loss": 4.665, + "step": 29225 + }, + { + "epoch": 1.986003533088735, + "grad_norm": 0.46795397996902466, + "learning_rate": 7.518514743851067e-06, + "loss": 4.5922, + "step": 29230 + }, + { + "epoch": 1.9863432531593967, + "grad_norm": 0.3691636323928833, + "learning_rate": 7.51809009376274e-06, + "loss": 4.3402, + "step": 29235 + }, + { + "epoch": 1.9866829732300584, + "grad_norm": 0.43560683727264404, + "learning_rate": 7.517665443674414e-06, + "loss": 4.3299, + "step": 29240 + }, + { + "epoch": 1.9870226933007202, + "grad_norm": 0.42926517128944397, + "learning_rate": 7.517240793586086e-06, + "loss": 4.5902, + "step": 29245 + }, + { + "epoch": 1.987362413371382, + "grad_norm": 0.4320036470890045, + "learning_rate": 7.5168161434977585e-06, + "loss": 4.6522, + "step": 29250 + }, + { + "epoch": 1.9877021334420437, + "grad_norm": 0.4513557553291321, + "learning_rate": 7.516391493409432e-06, + "loss": 4.4833, + "step": 29255 + }, + { + "epoch": 1.9880418535127056, + "grad_norm": 0.34953632950782776, + "learning_rate": 7.515966843321104e-06, + "loss": 4.6897, + "step": 29260 + }, + { + "epoch": 1.9883815735833674, + "grad_norm": 0.3757166266441345, + "learning_rate": 7.515542193232777e-06, + "loss": 4.4429, + "step": 29265 + }, + { + "epoch": 1.988721293654029, + "grad_norm": 0.32373154163360596, + "learning_rate": 7.51511754314445e-06, + "loss": 4.5504, + "step": 29270 + }, + { + "epoch": 1.989061013724691, + "grad_norm": 0.4057324528694153, + "learning_rate": 7.5146928930561225e-06, + "loss": 4.3729, + "step": 29275 + }, + { + "epoch": 1.9894007337953528, + "grad_norm": 0.34794262051582336, + "learning_rate": 7.514268242967795e-06, + "loss": 4.2676, + "step": 29280 + }, + { + "epoch": 1.9897404538660144, + "grad_norm": 0.4267287850379944, + "learning_rate": 7.513843592879468e-06, + "loss": 4.5985, + "step": 29285 + }, + { + "epoch": 1.990080173936676, + "grad_norm": 0.32950496673583984, + "learning_rate": 7.513418942791141e-06, + "loss": 4.4017, + "step": 29290 + }, + { + "epoch": 1.990419894007338, + "grad_norm": 0.25690147280693054, + "learning_rate": 7.512994292702813e-06, + "loss": 4.5074, + "step": 29295 + }, + { + "epoch": 1.9907596140779997, + "grad_norm": 0.3645676374435425, + "learning_rate": 7.5125696426144865e-06, + "loss": 4.8302, + "step": 29300 + }, + { + "epoch": 1.9910993341486614, + "grad_norm": 0.36233922839164734, + "learning_rate": 7.512144992526159e-06, + "loss": 4.2738, + "step": 29305 + }, + { + "epoch": 1.9914390542193234, + "grad_norm": 0.2621535360813141, + "learning_rate": 7.511720342437831e-06, + "loss": 4.5711, + "step": 29310 + }, + { + "epoch": 1.991778774289985, + "grad_norm": 0.37553870677948, + "learning_rate": 7.511295692349505e-06, + "loss": 4.3898, + "step": 29315 + }, + { + "epoch": 1.9921184943606467, + "grad_norm": 0.3324916660785675, + "learning_rate": 7.510871042261178e-06, + "loss": 4.5849, + "step": 29320 + }, + { + "epoch": 1.9924582144313085, + "grad_norm": 0.4164249897003174, + "learning_rate": 7.51044639217285e-06, + "loss": 4.5983, + "step": 29325 + }, + { + "epoch": 1.9927979345019704, + "grad_norm": 0.3471715748310089, + "learning_rate": 7.510021742084523e-06, + "loss": 4.2839, + "step": 29330 + }, + { + "epoch": 1.993137654572632, + "grad_norm": 0.3918708860874176, + "learning_rate": 7.509597091996196e-06, + "loss": 4.5093, + "step": 29335 + }, + { + "epoch": 1.9934773746432939, + "grad_norm": 0.3831522762775421, + "learning_rate": 7.509172441907868e-06, + "loss": 4.7858, + "step": 29340 + }, + { + "epoch": 1.9938170947139557, + "grad_norm": 0.54977947473526, + "learning_rate": 7.508747791819542e-06, + "loss": 4.3736, + "step": 29345 + }, + { + "epoch": 1.9941568147846174, + "grad_norm": 0.35302215814590454, + "learning_rate": 7.5083231417312145e-06, + "loss": 4.72, + "step": 29350 + }, + { + "epoch": 1.9944965348552792, + "grad_norm": 0.3621215224266052, + "learning_rate": 7.5078984916428864e-06, + "loss": 4.5983, + "step": 29355 + }, + { + "epoch": 1.994836254925941, + "grad_norm": 0.620525062084198, + "learning_rate": 7.50747384155456e-06, + "loss": 4.6899, + "step": 29360 + }, + { + "epoch": 1.9951759749966027, + "grad_norm": 0.3695339262485504, + "learning_rate": 7.507049191466232e-06, + "loss": 4.4918, + "step": 29365 + }, + { + "epoch": 1.9955156950672646, + "grad_norm": 0.44442278146743774, + "learning_rate": 7.506624541377905e-06, + "loss": 4.7746, + "step": 29370 + }, + { + "epoch": 1.9958554151379264, + "grad_norm": 0.3930512070655823, + "learning_rate": 7.5061998912895785e-06, + "loss": 4.6771, + "step": 29375 + }, + { + "epoch": 1.996195135208588, + "grad_norm": 0.30820104479789734, + "learning_rate": 7.5057752412012504e-06, + "loss": 4.6162, + "step": 29380 + }, + { + "epoch": 1.99653485527925, + "grad_norm": 0.5130590796470642, + "learning_rate": 7.505350591112923e-06, + "loss": 4.7218, + "step": 29385 + }, + { + "epoch": 1.9968745753499118, + "grad_norm": 0.442256897687912, + "learning_rate": 7.504925941024597e-06, + "loss": 4.6441, + "step": 29390 + }, + { + "epoch": 1.9972142954205734, + "grad_norm": 0.3054395616054535, + "learning_rate": 7.504501290936269e-06, + "loss": 4.4335, + "step": 29395 + }, + { + "epoch": 1.9975540154912352, + "grad_norm": 0.41907599568367004, + "learning_rate": 7.504076640847942e-06, + "loss": 4.6311, + "step": 29400 + }, + { + "epoch": 1.997893735561897, + "grad_norm": 0.43304863572120667, + "learning_rate": 7.503651990759615e-06, + "loss": 4.915, + "step": 29405 + }, + { + "epoch": 1.9982334556325587, + "grad_norm": 0.3544602692127228, + "learning_rate": 7.503227340671287e-06, + "loss": 4.4906, + "step": 29410 + }, + { + "epoch": 1.9985731757032206, + "grad_norm": 0.47165292501449585, + "learning_rate": 7.50280269058296e-06, + "loss": 4.6953, + "step": 29415 + }, + { + "epoch": 1.9989128957738824, + "grad_norm": 0.49527689814567566, + "learning_rate": 7.502378040494634e-06, + "loss": 4.6025, + "step": 29420 + }, + { + "epoch": 1.999252615844544, + "grad_norm": 0.3178909420967102, + "learning_rate": 7.501953390406306e-06, + "loss": 4.4, + "step": 29425 + }, + { + "epoch": 1.999592335915206, + "grad_norm": 0.3628191649913788, + "learning_rate": 7.5015287403179784e-06, + "loss": 4.5109, + "step": 29430 + }, + { + "epoch": 1.9999320559858678, + "grad_norm": 0.3404077887535095, + "learning_rate": 7.501104090229652e-06, + "loss": 4.4819, + "step": 29435 + }, + { + "epoch": 2.0, + "eval_bertscore": { + "f1": 0.7927632880760771, + "precision": 0.7737424174001113, + "recall": 0.815319485454807 + }, + "eval_bleu_4": 0.0025555367224237373, + "eval_exact_match": 0.0, + "eval_loss": 4.163323879241943, + "eval_meteor": 0.0361714519344169, + "eval_rouge": { + "rouge1": 0.0579294986772288, + "rouge2": 0.003987150089692776, + "rougeL": 0.05064911377640767, + "rougeLsum": 0.050634473224390714 + }, + "eval_runtime": 285.8385, + "eval_samples_per_second": 36.101, + "eval_steps_per_second": 4.513, + "step": 29436 + }, + { + "epoch": 2.0002717760565294, + "grad_norm": 0.5430731773376465, + "learning_rate": 7.500679440141324e-06, + "loss": 4.7163, + "step": 29440 + }, + { + "epoch": 2.000611496127191, + "grad_norm": 0.311860591173172, + "learning_rate": 7.500254790052997e-06, + "loss": 4.3384, + "step": 29445 + }, + { + "epoch": 2.000951216197853, + "grad_norm": 0.38015344738960266, + "learning_rate": 7.49983013996467e-06, + "loss": 4.4349, + "step": 29450 + }, + { + "epoch": 2.0012909362685147, + "grad_norm": 0.3849876821041107, + "learning_rate": 7.4994054898763424e-06, + "loss": 4.3582, + "step": 29455 + }, + { + "epoch": 2.0016306563391764, + "grad_norm": 0.571831464767456, + "learning_rate": 7.498980839788015e-06, + "loss": 4.6881, + "step": 29460 + }, + { + "epoch": 2.0019703764098384, + "grad_norm": 0.5183097124099731, + "learning_rate": 7.498556189699688e-06, + "loss": 4.3075, + "step": 29465 + }, + { + "epoch": 2.0023100964805, + "grad_norm": 0.39803236722946167, + "learning_rate": 7.498216469629026e-06, + "loss": 4.4696, + "step": 29470 + }, + { + "epoch": 2.0026498165511617, + "grad_norm": 0.4925556480884552, + "learning_rate": 7.4977918195407e-06, + "loss": 4.6251, + "step": 29475 + }, + { + "epoch": 2.0029895366218238, + "grad_norm": 0.43459659814834595, + "learning_rate": 7.497367169452372e-06, + "loss": 4.4432, + "step": 29480 + }, + { + "epoch": 2.0033292566924854, + "grad_norm": 0.40914058685302734, + "learning_rate": 7.4969425193640445e-06, + "loss": 4.6515, + "step": 29485 + }, + { + "epoch": 2.003668976763147, + "grad_norm": 0.3622908592224121, + "learning_rate": 7.496517869275718e-06, + "loss": 4.5377, + "step": 29490 + }, + { + "epoch": 2.004008696833809, + "grad_norm": 0.325761616230011, + "learning_rate": 7.49609321918739e-06, + "loss": 4.5812, + "step": 29495 + }, + { + "epoch": 2.0043484169044707, + "grad_norm": 0.43775567412376404, + "learning_rate": 7.495668569099063e-06, + "loss": 4.3757, + "step": 29500 + }, + { + "epoch": 2.0046881369751324, + "grad_norm": 0.5340738296508789, + "learning_rate": 7.4952439190107366e-06, + "loss": 4.582, + "step": 29505 + }, + { + "epoch": 2.0050278570457944, + "grad_norm": 0.4713257849216461, + "learning_rate": 7.4948192689224085e-06, + "loss": 4.5879, + "step": 29510 + }, + { + "epoch": 2.005367577116456, + "grad_norm": 0.38161736726760864, + "learning_rate": 7.494394618834081e-06, + "loss": 4.5244, + "step": 29515 + }, + { + "epoch": 2.0057072971871177, + "grad_norm": 0.7044808268547058, + "learning_rate": 7.493969968745754e-06, + "loss": 4.6464, + "step": 29520 + }, + { + "epoch": 2.00604701725778, + "grad_norm": 0.47754713892936707, + "learning_rate": 7.493545318657427e-06, + "loss": 4.5519, + "step": 29525 + }, + { + "epoch": 2.0063867373284414, + "grad_norm": 0.35139650106430054, + "learning_rate": 7.4931206685691e-06, + "loss": 4.4606, + "step": 29530 + }, + { + "epoch": 2.006726457399103, + "grad_norm": 0.3800577223300934, + "learning_rate": 7.4926960184807725e-06, + "loss": 4.7172, + "step": 29535 + }, + { + "epoch": 2.007066177469765, + "grad_norm": 0.3783891201019287, + "learning_rate": 7.492271368392445e-06, + "loss": 4.6377, + "step": 29540 + }, + { + "epoch": 2.0074058975404268, + "grad_norm": 0.36392709612846375, + "learning_rate": 7.491846718304117e-06, + "loss": 4.6931, + "step": 29545 + }, + { + "epoch": 2.0077456176110884, + "grad_norm": 0.46847304701805115, + "learning_rate": 7.491422068215791e-06, + "loss": 4.0609, + "step": 29550 + }, + { + "epoch": 2.0080853376817505, + "grad_norm": 0.4683804512023926, + "learning_rate": 7.490997418127464e-06, + "loss": 4.6559, + "step": 29555 + }, + { + "epoch": 2.008425057752412, + "grad_norm": 0.41965118050575256, + "learning_rate": 7.490572768039136e-06, + "loss": 4.2012, + "step": 29560 + }, + { + "epoch": 2.0087647778230737, + "grad_norm": 0.35703879594802856, + "learning_rate": 7.490148117950809e-06, + "loss": 4.5268, + "step": 29565 + }, + { + "epoch": 2.0091044978937354, + "grad_norm": 0.39180901646614075, + "learning_rate": 7.489723467862482e-06, + "loss": 4.777, + "step": 29570 + }, + { + "epoch": 2.0094442179643974, + "grad_norm": 0.355457067489624, + "learning_rate": 7.489298817774154e-06, + "loss": 4.6741, + "step": 29575 + }, + { + "epoch": 2.009783938035059, + "grad_norm": 0.39142146706581116, + "learning_rate": 7.488874167685828e-06, + "loss": 4.6352, + "step": 29580 + }, + { + "epoch": 2.0101236581057207, + "grad_norm": 0.33538758754730225, + "learning_rate": 7.4884495175975005e-06, + "loss": 4.385, + "step": 29585 + }, + { + "epoch": 2.0104633781763828, + "grad_norm": 0.42444565892219543, + "learning_rate": 7.4880248675091725e-06, + "loss": 4.4805, + "step": 29590 + }, + { + "epoch": 2.0108030982470444, + "grad_norm": 0.4611833393573761, + "learning_rate": 7.487600217420846e-06, + "loss": 4.4519, + "step": 29595 + }, + { + "epoch": 2.011142818317706, + "grad_norm": 0.41015926003456116, + "learning_rate": 7.487175567332519e-06, + "loss": 4.2981, + "step": 29600 + }, + { + "epoch": 2.011482538388368, + "grad_norm": 0.4428670108318329, + "learning_rate": 7.486750917244191e-06, + "loss": 4.354, + "step": 29605 + }, + { + "epoch": 2.0118222584590297, + "grad_norm": 0.3876844346523285, + "learning_rate": 7.4863262671558645e-06, + "loss": 4.4593, + "step": 29610 + }, + { + "epoch": 2.0121619785296914, + "grad_norm": 0.5042430758476257, + "learning_rate": 7.485901617067537e-06, + "loss": 4.5414, + "step": 29615 + }, + { + "epoch": 2.0125016986003534, + "grad_norm": 0.4611300528049469, + "learning_rate": 7.485476966979209e-06, + "loss": 4.5335, + "step": 29620 + }, + { + "epoch": 2.012841418671015, + "grad_norm": 0.4140489399433136, + "learning_rate": 7.485052316890883e-06, + "loss": 4.4261, + "step": 29625 + }, + { + "epoch": 2.0131811387416767, + "grad_norm": 0.4903642237186432, + "learning_rate": 7.484627666802555e-06, + "loss": 4.6167, + "step": 29630 + }, + { + "epoch": 2.0135208588123388, + "grad_norm": 0.3179190158843994, + "learning_rate": 7.484203016714228e-06, + "loss": 4.652, + "step": 29635 + }, + { + "epoch": 2.0138605788830004, + "grad_norm": 0.5099160075187683, + "learning_rate": 7.483778366625901e-06, + "loss": 4.3364, + "step": 29640 + }, + { + "epoch": 2.014200298953662, + "grad_norm": 0.35060372948646545, + "learning_rate": 7.483353716537573e-06, + "loss": 4.3597, + "step": 29645 + }, + { + "epoch": 2.014540019024324, + "grad_norm": 0.314727783203125, + "learning_rate": 7.482929066449246e-06, + "loss": 4.4708, + "step": 29650 + }, + { + "epoch": 2.0148797390949857, + "grad_norm": 0.5546934008598328, + "learning_rate": 7.48250441636092e-06, + "loss": 4.846, + "step": 29655 + }, + { + "epoch": 2.0152194591656474, + "grad_norm": 0.3348732888698578, + "learning_rate": 7.482079766272592e-06, + "loss": 4.5933, + "step": 29660 + }, + { + "epoch": 2.0155591792363095, + "grad_norm": 0.32399284839630127, + "learning_rate": 7.4816551161842645e-06, + "loss": 4.4034, + "step": 29665 + }, + { + "epoch": 2.015898899306971, + "grad_norm": 0.39164331555366516, + "learning_rate": 7.481230466095938e-06, + "loss": 4.3776, + "step": 29670 + }, + { + "epoch": 2.0162386193776327, + "grad_norm": 0.5084218978881836, + "learning_rate": 7.48080581600761e-06, + "loss": 4.4801, + "step": 29675 + }, + { + "epoch": 2.016578339448295, + "grad_norm": 0.38186466693878174, + "learning_rate": 7.480381165919283e-06, + "loss": 4.8022, + "step": 29680 + }, + { + "epoch": 2.0169180595189564, + "grad_norm": 0.4068457782268524, + "learning_rate": 7.4799565158309566e-06, + "loss": 4.3526, + "step": 29685 + }, + { + "epoch": 2.017257779589618, + "grad_norm": 0.36152181029319763, + "learning_rate": 7.4795318657426285e-06, + "loss": 4.45, + "step": 29690 + }, + { + "epoch": 2.01759749966028, + "grad_norm": 0.31223368644714355, + "learning_rate": 7.479107215654301e-06, + "loss": 4.5245, + "step": 29695 + }, + { + "epoch": 2.0179372197309418, + "grad_norm": 0.40237170457839966, + "learning_rate": 7.478682565565974e-06, + "loss": 4.505, + "step": 29700 + }, + { + "epoch": 2.0182769398016034, + "grad_norm": 0.3976059854030609, + "learning_rate": 7.478257915477647e-06, + "loss": 4.4713, + "step": 29705 + }, + { + "epoch": 2.0186166598722655, + "grad_norm": 0.482710599899292, + "learning_rate": 7.47783326538932e-06, + "loss": 4.4533, + "step": 29710 + }, + { + "epoch": 2.018956379942927, + "grad_norm": 0.4368327558040619, + "learning_rate": 7.4774086153009925e-06, + "loss": 4.542, + "step": 29715 + }, + { + "epoch": 2.0192961000135887, + "grad_norm": 0.3293151259422302, + "learning_rate": 7.476983965212665e-06, + "loss": 4.4092, + "step": 29720 + }, + { + "epoch": 2.0196358200842504, + "grad_norm": 0.4036010503768921, + "learning_rate": 7.476559315124337e-06, + "loss": 4.4556, + "step": 29725 + }, + { + "epoch": 2.0199755401549124, + "grad_norm": 0.4547683596611023, + "learning_rate": 7.476134665036011e-06, + "loss": 4.3247, + "step": 29730 + }, + { + "epoch": 2.020315260225574, + "grad_norm": 0.43272969126701355, + "learning_rate": 7.475710014947684e-06, + "loss": 4.4913, + "step": 29735 + }, + { + "epoch": 2.0206549802962357, + "grad_norm": 0.3747922480106354, + "learning_rate": 7.475285364859356e-06, + "loss": 4.4611, + "step": 29740 + }, + { + "epoch": 2.0209947003668978, + "grad_norm": 0.7393590807914734, + "learning_rate": 7.474860714771029e-06, + "loss": 4.449, + "step": 29745 + }, + { + "epoch": 2.0213344204375594, + "grad_norm": 0.31821906566619873, + "learning_rate": 7.474436064682702e-06, + "loss": 4.6844, + "step": 29750 + }, + { + "epoch": 2.021674140508221, + "grad_norm": 0.36577108502388, + "learning_rate": 7.474011414594374e-06, + "loss": 4.462, + "step": 29755 + }, + { + "epoch": 2.022013860578883, + "grad_norm": 0.35309526324272156, + "learning_rate": 7.473586764506048e-06, + "loss": 4.6018, + "step": 29760 + }, + { + "epoch": 2.0223535806495447, + "grad_norm": 0.3748389482498169, + "learning_rate": 7.4731621144177205e-06, + "loss": 4.4604, + "step": 29765 + }, + { + "epoch": 2.0226933007202064, + "grad_norm": 0.503986656665802, + "learning_rate": 7.472737464329393e-06, + "loss": 4.3524, + "step": 29770 + }, + { + "epoch": 2.0230330207908684, + "grad_norm": 0.42840254306793213, + "learning_rate": 7.472312814241066e-06, + "loss": 4.5381, + "step": 29775 + }, + { + "epoch": 2.02337274086153, + "grad_norm": 0.4410347044467926, + "learning_rate": 7.471888164152739e-06, + "loss": 4.5556, + "step": 29780 + }, + { + "epoch": 2.0237124609321917, + "grad_norm": 0.40175294876098633, + "learning_rate": 7.471463514064412e-06, + "loss": 4.4541, + "step": 29785 + }, + { + "epoch": 2.024052181002854, + "grad_norm": 0.41289380192756653, + "learning_rate": 7.4710388639760845e-06, + "loss": 4.7332, + "step": 29790 + }, + { + "epoch": 2.0243919010735154, + "grad_norm": 0.5487479567527771, + "learning_rate": 7.4706142138877565e-06, + "loss": 4.3957, + "step": 29795 + }, + { + "epoch": 2.024731621144177, + "grad_norm": 0.35899195075035095, + "learning_rate": 7.47018956379943e-06, + "loss": 4.8849, + "step": 29800 + }, + { + "epoch": 2.025071341214839, + "grad_norm": 0.43632861971855164, + "learning_rate": 7.469764913711103e-06, + "loss": 4.5044, + "step": 29805 + }, + { + "epoch": 2.0254110612855007, + "grad_norm": 0.3711369037628174, + "learning_rate": 7.469340263622775e-06, + "loss": 4.675, + "step": 29810 + }, + { + "epoch": 2.0257507813561624, + "grad_norm": 0.4583915174007416, + "learning_rate": 7.4689156135344485e-06, + "loss": 4.6386, + "step": 29815 + }, + { + "epoch": 2.0260905014268245, + "grad_norm": 0.3233878016471863, + "learning_rate": 7.468490963446121e-06, + "loss": 4.3933, + "step": 29820 + }, + { + "epoch": 2.026430221497486, + "grad_norm": 0.42408522963523865, + "learning_rate": 7.468066313357793e-06, + "loss": 4.6024, + "step": 29825 + }, + { + "epoch": 2.0267699415681477, + "grad_norm": 0.4110647737979889, + "learning_rate": 7.467641663269467e-06, + "loss": 4.2114, + "step": 29830 + }, + { + "epoch": 2.02710966163881, + "grad_norm": 0.3614123463630676, + "learning_rate": 7.46721701318114e-06, + "loss": 4.3877, + "step": 29835 + }, + { + "epoch": 2.0274493817094714, + "grad_norm": 0.4593547284603119, + "learning_rate": 7.466792363092812e-06, + "loss": 4.6278, + "step": 29840 + }, + { + "epoch": 2.027789101780133, + "grad_norm": 0.3764742314815521, + "learning_rate": 7.466367713004485e-06, + "loss": 4.3492, + "step": 29845 + }, + { + "epoch": 2.028128821850795, + "grad_norm": 0.4171487092971802, + "learning_rate": 7.465943062916158e-06, + "loss": 4.334, + "step": 29850 + }, + { + "epoch": 2.0284685419214568, + "grad_norm": 0.3759028911590576, + "learning_rate": 7.46551841282783e-06, + "loss": 4.5042, + "step": 29855 + }, + { + "epoch": 2.0288082619921184, + "grad_norm": 0.3103712201118469, + "learning_rate": 7.465093762739504e-06, + "loss": 4.3511, + "step": 29860 + }, + { + "epoch": 2.0291479820627805, + "grad_norm": 0.5506786704063416, + "learning_rate": 7.464669112651176e-06, + "loss": 4.5553, + "step": 29865 + }, + { + "epoch": 2.029487702133442, + "grad_norm": 0.6370071172714233, + "learning_rate": 7.4642444625628485e-06, + "loss": 4.7035, + "step": 29870 + }, + { + "epoch": 2.0298274222041037, + "grad_norm": 0.3349555730819702, + "learning_rate": 7.463819812474522e-06, + "loss": 4.7817, + "step": 29875 + }, + { + "epoch": 2.030167142274766, + "grad_norm": 0.47764015197753906, + "learning_rate": 7.463395162386194e-06, + "loss": 4.6924, + "step": 29880 + }, + { + "epoch": 2.0305068623454274, + "grad_norm": 0.44843414425849915, + "learning_rate": 7.462970512297867e-06, + "loss": 4.4293, + "step": 29885 + }, + { + "epoch": 2.030846582416089, + "grad_norm": 0.3879605531692505, + "learning_rate": 7.4625458622095405e-06, + "loss": 4.4427, + "step": 29890 + }, + { + "epoch": 2.031186302486751, + "grad_norm": 0.5196486115455627, + "learning_rate": 7.4621212121212125e-06, + "loss": 4.3198, + "step": 29895 + }, + { + "epoch": 2.0315260225574128, + "grad_norm": 0.478885680437088, + "learning_rate": 7.461696562032885e-06, + "loss": 4.3515, + "step": 29900 + }, + { + "epoch": 2.0318657426280744, + "grad_norm": 0.3717541992664337, + "learning_rate": 7.461271911944559e-06, + "loss": 4.6691, + "step": 29905 + }, + { + "epoch": 2.032205462698736, + "grad_norm": 0.3216395974159241, + "learning_rate": 7.460847261856231e-06, + "loss": 4.6002, + "step": 29910 + }, + { + "epoch": 2.032545182769398, + "grad_norm": 0.42029809951782227, + "learning_rate": 7.460422611767904e-06, + "loss": 4.5947, + "step": 29915 + }, + { + "epoch": 2.0328849028400597, + "grad_norm": 0.3331969976425171, + "learning_rate": 7.459997961679577e-06, + "loss": 4.4583, + "step": 29920 + }, + { + "epoch": 2.0332246229107214, + "grad_norm": 0.6087578535079956, + "learning_rate": 7.459573311591249e-06, + "loss": 4.4918, + "step": 29925 + }, + { + "epoch": 2.0335643429813834, + "grad_norm": 0.3739081919193268, + "learning_rate": 7.459148661502922e-06, + "loss": 4.5904, + "step": 29930 + }, + { + "epoch": 2.033904063052045, + "grad_norm": 0.46926429867744446, + "learning_rate": 7.458724011414596e-06, + "loss": 4.5532, + "step": 29935 + }, + { + "epoch": 2.0342437831227067, + "grad_norm": 0.4113500118255615, + "learning_rate": 7.458299361326268e-06, + "loss": 4.5907, + "step": 29940 + }, + { + "epoch": 2.034583503193369, + "grad_norm": 0.43805965781211853, + "learning_rate": 7.4578747112379405e-06, + "loss": 4.425, + "step": 29945 + }, + { + "epoch": 2.0349232232640304, + "grad_norm": 0.29457196593284607, + "learning_rate": 7.457450061149613e-06, + "loss": 4.5456, + "step": 29950 + }, + { + "epoch": 2.035262943334692, + "grad_norm": 0.41630440950393677, + "learning_rate": 7.457025411061286e-06, + "loss": 4.4538, + "step": 29955 + }, + { + "epoch": 2.035602663405354, + "grad_norm": 0.3619729280471802, + "learning_rate": 7.456600760972959e-06, + "loss": 4.4721, + "step": 29960 + }, + { + "epoch": 2.0359423834760157, + "grad_norm": 0.37176159024238586, + "learning_rate": 7.456176110884632e-06, + "loss": 4.4147, + "step": 29965 + }, + { + "epoch": 2.0362821035466774, + "grad_norm": 0.5963063836097717, + "learning_rate": 7.4557514607963045e-06, + "loss": 4.664, + "step": 29970 + }, + { + "epoch": 2.0366218236173395, + "grad_norm": 0.3454267978668213, + "learning_rate": 7.4553268107079765e-06, + "loss": 4.4858, + "step": 29975 + }, + { + "epoch": 2.036961543688001, + "grad_norm": 0.5044786334037781, + "learning_rate": 7.45490216061965e-06, + "loss": 4.5382, + "step": 29980 + }, + { + "epoch": 2.0373012637586627, + "grad_norm": 0.37864187359809875, + "learning_rate": 7.454477510531323e-06, + "loss": 4.381, + "step": 29985 + }, + { + "epoch": 2.037640983829325, + "grad_norm": 0.4080984592437744, + "learning_rate": 7.454052860442995e-06, + "loss": 4.6364, + "step": 29990 + }, + { + "epoch": 2.0379807038999864, + "grad_norm": 0.301602840423584, + "learning_rate": 7.4536282103546685e-06, + "loss": 4.6009, + "step": 29995 + }, + { + "epoch": 2.038320423970648, + "grad_norm": 0.3058913052082062, + "learning_rate": 7.453203560266341e-06, + "loss": 4.522, + "step": 30000 + }, + { + "epoch": 2.03866014404131, + "grad_norm": 0.34521982073783875, + "learning_rate": 7.452778910178013e-06, + "loss": 4.5836, + "step": 30005 + }, + { + "epoch": 2.0389998641119718, + "grad_norm": 0.3926522433757782, + "learning_rate": 7.452354260089687e-06, + "loss": 4.621, + "step": 30010 + }, + { + "epoch": 2.0393395841826334, + "grad_norm": 0.3642564117908478, + "learning_rate": 7.45192961000136e-06, + "loss": 4.6704, + "step": 30015 + }, + { + "epoch": 2.0396793042532955, + "grad_norm": 0.3338351547718048, + "learning_rate": 7.451504959913032e-06, + "loss": 4.4822, + "step": 30020 + }, + { + "epoch": 2.040019024323957, + "grad_norm": 0.40858104825019836, + "learning_rate": 7.451080309824705e-06, + "loss": 4.5975, + "step": 30025 + }, + { + "epoch": 2.0403587443946187, + "grad_norm": 0.3237113654613495, + "learning_rate": 7.450655659736378e-06, + "loss": 4.667, + "step": 30030 + }, + { + "epoch": 2.040698464465281, + "grad_norm": 0.40910038352012634, + "learning_rate": 7.45023100964805e-06, + "loss": 4.5774, + "step": 30035 + }, + { + "epoch": 2.0410381845359424, + "grad_norm": 0.5191749930381775, + "learning_rate": 7.449806359559724e-06, + "loss": 4.4456, + "step": 30040 + }, + { + "epoch": 2.041377904606604, + "grad_norm": 0.4949016571044922, + "learning_rate": 7.449381709471396e-06, + "loss": 4.6108, + "step": 30045 + }, + { + "epoch": 2.041717624677266, + "grad_norm": 0.4093218743801117, + "learning_rate": 7.4489570593830685e-06, + "loss": 4.313, + "step": 30050 + }, + { + "epoch": 2.0420573447479278, + "grad_norm": 0.4082130193710327, + "learning_rate": 7.448532409294742e-06, + "loss": 4.2823, + "step": 30055 + }, + { + "epoch": 2.0423970648185894, + "grad_norm": 0.5053543448448181, + "learning_rate": 7.448107759206414e-06, + "loss": 4.6412, + "step": 30060 + }, + { + "epoch": 2.042736784889251, + "grad_norm": 0.4480048418045044, + "learning_rate": 7.447683109118087e-06, + "loss": 4.5065, + "step": 30065 + }, + { + "epoch": 2.043076504959913, + "grad_norm": 0.3054220974445343, + "learning_rate": 7.4472584590297605e-06, + "loss": 4.7665, + "step": 30070 + }, + { + "epoch": 2.0434162250305747, + "grad_norm": 0.35643264651298523, + "learning_rate": 7.4468338089414325e-06, + "loss": 4.3219, + "step": 30075 + }, + { + "epoch": 2.0437559451012364, + "grad_norm": 0.3559984862804413, + "learning_rate": 7.446409158853105e-06, + "loss": 4.2266, + "step": 30080 + }, + { + "epoch": 2.0440956651718984, + "grad_norm": 0.36456283926963806, + "learning_rate": 7.445984508764779e-06, + "loss": 4.5382, + "step": 30085 + }, + { + "epoch": 2.04443538524256, + "grad_norm": 0.3492412269115448, + "learning_rate": 7.445559858676451e-06, + "loss": 4.5304, + "step": 30090 + }, + { + "epoch": 2.0447751053132217, + "grad_norm": 0.35522952675819397, + "learning_rate": 7.445135208588124e-06, + "loss": 4.4943, + "step": 30095 + }, + { + "epoch": 2.045114825383884, + "grad_norm": 0.4570012092590332, + "learning_rate": 7.444710558499797e-06, + "loss": 4.4716, + "step": 30100 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.3728595972061157, + "learning_rate": 7.444285908411469e-06, + "loss": 4.4987, + "step": 30105 + }, + { + "epoch": 2.045794265525207, + "grad_norm": 0.4160204529762268, + "learning_rate": 7.443861258323143e-06, + "loss": 4.366, + "step": 30110 + }, + { + "epoch": 2.046133985595869, + "grad_norm": 0.34847593307495117, + "learning_rate": 7.443436608234815e-06, + "loss": 4.5486, + "step": 30115 + }, + { + "epoch": 2.0464737056665308, + "grad_norm": 0.33242273330688477, + "learning_rate": 7.443011958146488e-06, + "loss": 4.6208, + "step": 30120 + }, + { + "epoch": 2.0468134257371924, + "grad_norm": 0.3991026282310486, + "learning_rate": 7.442587308058161e-06, + "loss": 4.6773, + "step": 30125 + }, + { + "epoch": 2.0471531458078545, + "grad_norm": 0.3779822885990143, + "learning_rate": 7.442162657969833e-06, + "loss": 4.4198, + "step": 30130 + }, + { + "epoch": 2.047492865878516, + "grad_norm": 0.38208162784576416, + "learning_rate": 7.441738007881506e-06, + "loss": 4.2605, + "step": 30135 + }, + { + "epoch": 2.0478325859491777, + "grad_norm": 0.311443954706192, + "learning_rate": 7.44131335779318e-06, + "loss": 4.5214, + "step": 30140 + }, + { + "epoch": 2.04817230601984, + "grad_norm": 0.4057064950466156, + "learning_rate": 7.440888707704852e-06, + "loss": 4.5529, + "step": 30145 + }, + { + "epoch": 2.0485120260905014, + "grad_norm": 0.40666255354881287, + "learning_rate": 7.4404640576165245e-06, + "loss": 4.4821, + "step": 30150 + }, + { + "epoch": 2.048851746161163, + "grad_norm": 0.3818376958370209, + "learning_rate": 7.440039407528198e-06, + "loss": 4.4326, + "step": 30155 + }, + { + "epoch": 2.049191466231825, + "grad_norm": 0.4011995196342468, + "learning_rate": 7.43961475743987e-06, + "loss": 4.5116, + "step": 30160 + }, + { + "epoch": 2.0495311863024868, + "grad_norm": 0.3197149336338043, + "learning_rate": 7.439190107351543e-06, + "loss": 4.3089, + "step": 30165 + }, + { + "epoch": 2.0498709063731484, + "grad_norm": 0.5682432651519775, + "learning_rate": 7.4387654572632165e-06, + "loss": 4.4938, + "step": 30170 + }, + { + "epoch": 2.0502106264438105, + "grad_norm": 0.42558735609054565, + "learning_rate": 7.4383408071748885e-06, + "loss": 4.364, + "step": 30175 + }, + { + "epoch": 2.050550346514472, + "grad_norm": 0.2780661880970001, + "learning_rate": 7.437916157086561e-06, + "loss": 4.5201, + "step": 30180 + }, + { + "epoch": 2.0508900665851337, + "grad_norm": 0.6277728080749512, + "learning_rate": 7.437491506998235e-06, + "loss": 4.4154, + "step": 30185 + }, + { + "epoch": 2.051229786655796, + "grad_norm": 0.3895467519760132, + "learning_rate": 7.437066856909907e-06, + "loss": 4.468, + "step": 30190 + }, + { + "epoch": 2.0515695067264574, + "grad_norm": 0.4176378548145294, + "learning_rate": 7.43664220682158e-06, + "loss": 4.3861, + "step": 30195 + }, + { + "epoch": 2.051909226797119, + "grad_norm": 0.39421284198760986, + "learning_rate": 7.4362175567332525e-06, + "loss": 4.7589, + "step": 30200 + }, + { + "epoch": 2.052248946867781, + "grad_norm": 0.30184489488601685, + "learning_rate": 7.435792906644925e-06, + "loss": 4.457, + "step": 30205 + }, + { + "epoch": 2.0525886669384428, + "grad_norm": 0.6503439545631409, + "learning_rate": 7.435368256556597e-06, + "loss": 4.5334, + "step": 30210 + }, + { + "epoch": 2.0529283870091044, + "grad_norm": 0.3870721161365509, + "learning_rate": 7.434943606468271e-06, + "loss": 4.5895, + "step": 30215 + }, + { + "epoch": 2.0532681070797665, + "grad_norm": 0.329388290643692, + "learning_rate": 7.434518956379944e-06, + "loss": 4.7165, + "step": 30220 + }, + { + "epoch": 2.053607827150428, + "grad_norm": 0.35365304350852966, + "learning_rate": 7.434094306291616e-06, + "loss": 4.396, + "step": 30225 + }, + { + "epoch": 2.0539475472210897, + "grad_norm": 0.43435925245285034, + "learning_rate": 7.433669656203289e-06, + "loss": 4.2809, + "step": 30230 + }, + { + "epoch": 2.054287267291752, + "grad_norm": 0.3143440783023834, + "learning_rate": 7.433245006114962e-06, + "loss": 4.3743, + "step": 30235 + }, + { + "epoch": 2.0546269873624134, + "grad_norm": 0.3711945712566376, + "learning_rate": 7.432820356026634e-06, + "loss": 4.4531, + "step": 30240 + }, + { + "epoch": 2.054966707433075, + "grad_norm": 0.3749752342700958, + "learning_rate": 7.432395705938308e-06, + "loss": 4.4664, + "step": 30245 + }, + { + "epoch": 2.0553064275037367, + "grad_norm": 0.35867172479629517, + "learning_rate": 7.4319710558499805e-06, + "loss": 4.3523, + "step": 30250 + }, + { + "epoch": 2.055646147574399, + "grad_norm": 0.3285810053348541, + "learning_rate": 7.4315464057616525e-06, + "loss": 4.712, + "step": 30255 + }, + { + "epoch": 2.0559858676450604, + "grad_norm": 0.3143194019794464, + "learning_rate": 7.431121755673326e-06, + "loss": 4.7773, + "step": 30260 + }, + { + "epoch": 2.056325587715722, + "grad_norm": 0.3807256519794464, + "learning_rate": 7.430697105584999e-06, + "loss": 4.5448, + "step": 30265 + }, + { + "epoch": 2.056665307786384, + "grad_norm": 0.3562292456626892, + "learning_rate": 7.430272455496671e-06, + "loss": 4.2426, + "step": 30270 + }, + { + "epoch": 2.0570050278570458, + "grad_norm": 0.44405800104141235, + "learning_rate": 7.4298478054083445e-06, + "loss": 4.5068, + "step": 30275 + }, + { + "epoch": 2.0573447479277074, + "grad_norm": 0.37634748220443726, + "learning_rate": 7.429423155320017e-06, + "loss": 4.3518, + "step": 30280 + }, + { + "epoch": 2.0576844679983695, + "grad_norm": 0.4203161895275116, + "learning_rate": 7.428998505231689e-06, + "loss": 4.8807, + "step": 30285 + }, + { + "epoch": 2.058024188069031, + "grad_norm": 0.36163151264190674, + "learning_rate": 7.428573855143363e-06, + "loss": 4.3632, + "step": 30290 + }, + { + "epoch": 2.0583639081396927, + "grad_norm": 0.8683540225028992, + "learning_rate": 7.428149205055035e-06, + "loss": 4.3789, + "step": 30295 + }, + { + "epoch": 2.058703628210355, + "grad_norm": 0.4987124800682068, + "learning_rate": 7.427724554966708e-06, + "loss": 4.5819, + "step": 30300 + }, + { + "epoch": 2.0590433482810164, + "grad_norm": 0.3389034569263458, + "learning_rate": 7.427299904878381e-06, + "loss": 4.4708, + "step": 30305 + }, + { + "epoch": 2.059383068351678, + "grad_norm": 0.5911346077919006, + "learning_rate": 7.426875254790053e-06, + "loss": 4.6177, + "step": 30310 + }, + { + "epoch": 2.05972278842234, + "grad_norm": 0.4215952455997467, + "learning_rate": 7.426450604701726e-06, + "loss": 4.6475, + "step": 30315 + }, + { + "epoch": 2.0600625084930018, + "grad_norm": 0.3804725706577301, + "learning_rate": 7.4260259546134e-06, + "loss": 4.5108, + "step": 30320 + }, + { + "epoch": 2.0604022285636634, + "grad_norm": 0.4352291226387024, + "learning_rate": 7.425601304525072e-06, + "loss": 4.6805, + "step": 30325 + }, + { + "epoch": 2.0607419486343255, + "grad_norm": 0.4042663276195526, + "learning_rate": 7.4251766544367445e-06, + "loss": 4.2372, + "step": 30330 + }, + { + "epoch": 2.061081668704987, + "grad_norm": 0.40372946858406067, + "learning_rate": 7.424752004348418e-06, + "loss": 4.4061, + "step": 30335 + }, + { + "epoch": 2.0614213887756487, + "grad_norm": 0.33562228083610535, + "learning_rate": 7.42432735426009e-06, + "loss": 4.308, + "step": 30340 + }, + { + "epoch": 2.061761108846311, + "grad_norm": 0.36779704689979553, + "learning_rate": 7.423902704171763e-06, + "loss": 4.3537, + "step": 30345 + }, + { + "epoch": 2.0621008289169724, + "grad_norm": 0.3370119333267212, + "learning_rate": 7.4234780540834365e-06, + "loss": 4.4037, + "step": 30350 + }, + { + "epoch": 2.062440548987634, + "grad_norm": 0.3557696044445038, + "learning_rate": 7.4230534039951085e-06, + "loss": 4.5065, + "step": 30355 + }, + { + "epoch": 2.062780269058296, + "grad_norm": 0.4090683162212372, + "learning_rate": 7.422628753906781e-06, + "loss": 4.508, + "step": 30360 + }, + { + "epoch": 2.0631199891289578, + "grad_norm": 0.3542342782020569, + "learning_rate": 7.422204103818454e-06, + "loss": 4.3255, + "step": 30365 + }, + { + "epoch": 2.0634597091996194, + "grad_norm": 0.38164806365966797, + "learning_rate": 7.421779453730127e-06, + "loss": 4.6237, + "step": 30370 + }, + { + "epoch": 2.0637994292702815, + "grad_norm": 0.4060073494911194, + "learning_rate": 7.4213548036418e-06, + "loss": 4.4603, + "step": 30375 + }, + { + "epoch": 2.064139149340943, + "grad_norm": 0.3382193148136139, + "learning_rate": 7.4209301535534725e-06, + "loss": 4.3414, + "step": 30380 + }, + { + "epoch": 2.0644788694116047, + "grad_norm": 0.40583640336990356, + "learning_rate": 7.420505503465145e-06, + "loss": 4.5918, + "step": 30385 + }, + { + "epoch": 2.064818589482267, + "grad_norm": 0.5231906175613403, + "learning_rate": 7.420080853376817e-06, + "loss": 4.5625, + "step": 30390 + }, + { + "epoch": 2.0651583095529285, + "grad_norm": 0.4105894863605499, + "learning_rate": 7.419656203288491e-06, + "loss": 4.5663, + "step": 30395 + }, + { + "epoch": 2.06549802962359, + "grad_norm": 0.42256835103034973, + "learning_rate": 7.419231553200164e-06, + "loss": 4.1031, + "step": 30400 + }, + { + "epoch": 2.0658377496942517, + "grad_norm": 0.3896794617176056, + "learning_rate": 7.418806903111836e-06, + "loss": 4.4854, + "step": 30405 + }, + { + "epoch": 2.066177469764914, + "grad_norm": 0.3316176235675812, + "learning_rate": 7.418382253023509e-06, + "loss": 4.4884, + "step": 30410 + }, + { + "epoch": 2.0665171898355754, + "grad_norm": 0.28452345728874207, + "learning_rate": 7.417957602935182e-06, + "loss": 4.2823, + "step": 30415 + }, + { + "epoch": 2.066856909906237, + "grad_norm": 0.45572927594184875, + "learning_rate": 7.417532952846854e-06, + "loss": 4.3623, + "step": 30420 + }, + { + "epoch": 2.067196629976899, + "grad_norm": 0.33980467915534973, + "learning_rate": 7.417108302758528e-06, + "loss": 4.3243, + "step": 30425 + }, + { + "epoch": 2.0675363500475608, + "grad_norm": 0.3788129687309265, + "learning_rate": 7.4166836526702005e-06, + "loss": 4.675, + "step": 30430 + }, + { + "epoch": 2.0678760701182224, + "grad_norm": 0.27688702940940857, + "learning_rate": 7.4162590025818724e-06, + "loss": 4.5467, + "step": 30435 + }, + { + "epoch": 2.0682157901888845, + "grad_norm": 0.33335626125335693, + "learning_rate": 7.415834352493546e-06, + "loss": 4.4218, + "step": 30440 + }, + { + "epoch": 2.068555510259546, + "grad_norm": 0.3708083927631378, + "learning_rate": 7.415409702405219e-06, + "loss": 4.5219, + "step": 30445 + }, + { + "epoch": 2.0688952303302077, + "grad_norm": 0.431622177362442, + "learning_rate": 7.414985052316892e-06, + "loss": 4.7628, + "step": 30450 + }, + { + "epoch": 2.06923495040087, + "grad_norm": 0.3627296984195709, + "learning_rate": 7.4145604022285645e-06, + "loss": 4.4075, + "step": 30455 + }, + { + "epoch": 2.0695746704715314, + "grad_norm": 0.4315982460975647, + "learning_rate": 7.4141357521402364e-06, + "loss": 4.5659, + "step": 30460 + }, + { + "epoch": 2.069914390542193, + "grad_norm": 0.4469831585884094, + "learning_rate": 7.41371110205191e-06, + "loss": 4.7643, + "step": 30465 + }, + { + "epoch": 2.070254110612855, + "grad_norm": 0.33172979950904846, + "learning_rate": 7.413286451963583e-06, + "loss": 4.6035, + "step": 30470 + }, + { + "epoch": 2.0705938306835168, + "grad_norm": 0.3497803211212158, + "learning_rate": 7.412861801875255e-06, + "loss": 4.3448, + "step": 30475 + }, + { + "epoch": 2.0709335507541784, + "grad_norm": 0.4459521770477295, + "learning_rate": 7.4124371517869285e-06, + "loss": 4.5779, + "step": 30480 + }, + { + "epoch": 2.0712732708248405, + "grad_norm": 0.4236423969268799, + "learning_rate": 7.412012501698601e-06, + "loss": 4.3622, + "step": 30485 + }, + { + "epoch": 2.071612990895502, + "grad_norm": 0.40181106328964233, + "learning_rate": 7.411587851610273e-06, + "loss": 4.4415, + "step": 30490 + }, + { + "epoch": 2.0719527109661637, + "grad_norm": 0.3313480019569397, + "learning_rate": 7.411163201521947e-06, + "loss": 4.5325, + "step": 30495 + }, + { + "epoch": 2.072292431036826, + "grad_norm": 0.5095503330230713, + "learning_rate": 7.41073855143362e-06, + "loss": 4.31, + "step": 30500 + }, + { + "epoch": 2.0726321511074874, + "grad_norm": 0.3699764907360077, + "learning_rate": 7.410313901345292e-06, + "loss": 4.6738, + "step": 30505 + }, + { + "epoch": 2.072971871178149, + "grad_norm": 0.3601519465446472, + "learning_rate": 7.409889251256965e-06, + "loss": 4.3814, + "step": 30510 + }, + { + "epoch": 2.073311591248811, + "grad_norm": 0.44356557726860046, + "learning_rate": 7.409464601168638e-06, + "loss": 4.4573, + "step": 30515 + }, + { + "epoch": 2.073651311319473, + "grad_norm": 0.4354664385318756, + "learning_rate": 7.40903995108031e-06, + "loss": 4.5076, + "step": 30520 + }, + { + "epoch": 2.0739910313901344, + "grad_norm": 0.3323846757411957, + "learning_rate": 7.408615300991984e-06, + "loss": 4.6501, + "step": 30525 + }, + { + "epoch": 2.0743307514607965, + "grad_norm": 0.33007022738456726, + "learning_rate": 7.4081906509036565e-06, + "loss": 4.6914, + "step": 30530 + }, + { + "epoch": 2.074670471531458, + "grad_norm": 0.38020530343055725, + "learning_rate": 7.4077660008153285e-06, + "loss": 4.2023, + "step": 30535 + }, + { + "epoch": 2.0750101916021197, + "grad_norm": 0.40969425439834595, + "learning_rate": 7.407341350727002e-06, + "loss": 4.249, + "step": 30540 + }, + { + "epoch": 2.075349911672782, + "grad_norm": 0.3075801432132721, + "learning_rate": 7.406916700638674e-06, + "loss": 4.4632, + "step": 30545 + }, + { + "epoch": 2.0756896317434435, + "grad_norm": 0.36197081208229065, + "learning_rate": 7.406492050550347e-06, + "loss": 4.4965, + "step": 30550 + }, + { + "epoch": 2.076029351814105, + "grad_norm": 0.3432229161262512, + "learning_rate": 7.4060674004620205e-06, + "loss": 4.5053, + "step": 30555 + }, + { + "epoch": 2.076369071884767, + "grad_norm": 0.41555747389793396, + "learning_rate": 7.4056427503736925e-06, + "loss": 4.3771, + "step": 30560 + }, + { + "epoch": 2.076708791955429, + "grad_norm": 0.3898455798625946, + "learning_rate": 7.405218100285365e-06, + "loss": 4.3745, + "step": 30565 + }, + { + "epoch": 2.0770485120260904, + "grad_norm": 0.40085935592651367, + "learning_rate": 7.404793450197039e-06, + "loss": 4.4897, + "step": 30570 + }, + { + "epoch": 2.0773882320967525, + "grad_norm": 0.44748079776763916, + "learning_rate": 7.404368800108711e-06, + "loss": 4.4411, + "step": 30575 + }, + { + "epoch": 2.077727952167414, + "grad_norm": 0.3966715335845947, + "learning_rate": 7.403944150020384e-06, + "loss": 4.4723, + "step": 30580 + }, + { + "epoch": 2.0780676722380758, + "grad_norm": 0.3992503881454468, + "learning_rate": 7.403519499932057e-06, + "loss": 4.1831, + "step": 30585 + }, + { + "epoch": 2.0784073923087374, + "grad_norm": 0.367765873670578, + "learning_rate": 7.403094849843729e-06, + "loss": 4.2747, + "step": 30590 + }, + { + "epoch": 2.0787471123793995, + "grad_norm": 0.4250212013721466, + "learning_rate": 7.402670199755402e-06, + "loss": 4.741, + "step": 30595 + }, + { + "epoch": 2.079086832450061, + "grad_norm": 0.3550928831100464, + "learning_rate": 7.402245549667076e-06, + "loss": 4.4955, + "step": 30600 + }, + { + "epoch": 2.0794265525207227, + "grad_norm": 0.4148925840854645, + "learning_rate": 7.401820899578748e-06, + "loss": 4.4143, + "step": 30605 + }, + { + "epoch": 2.079766272591385, + "grad_norm": 0.39254316687583923, + "learning_rate": 7.4013962494904205e-06, + "loss": 4.5405, + "step": 30610 + }, + { + "epoch": 2.0801059926620464, + "grad_norm": 0.3587006628513336, + "learning_rate": 7.400971599402093e-06, + "loss": 4.4609, + "step": 30615 + }, + { + "epoch": 2.080445712732708, + "grad_norm": 0.43870946764945984, + "learning_rate": 7.400546949313766e-06, + "loss": 4.47, + "step": 30620 + }, + { + "epoch": 2.08078543280337, + "grad_norm": 0.31939947605133057, + "learning_rate": 7.400122299225439e-06, + "loss": 4.6129, + "step": 30625 + }, + { + "epoch": 2.0811251528740318, + "grad_norm": 0.3188628852367401, + "learning_rate": 7.399697649137112e-06, + "loss": 4.2324, + "step": 30630 + }, + { + "epoch": 2.0814648729446934, + "grad_norm": 0.43460148572921753, + "learning_rate": 7.3992729990487845e-06, + "loss": 4.4902, + "step": 30635 + }, + { + "epoch": 2.0818045930153555, + "grad_norm": 0.4393424391746521, + "learning_rate": 7.398848348960456e-06, + "loss": 4.5179, + "step": 30640 + }, + { + "epoch": 2.082144313086017, + "grad_norm": 0.5139976739883423, + "learning_rate": 7.39842369887213e-06, + "loss": 4.7918, + "step": 30645 + }, + { + "epoch": 2.0824840331566787, + "grad_norm": 0.40344923734664917, + "learning_rate": 7.397999048783803e-06, + "loss": 4.6162, + "step": 30650 + }, + { + "epoch": 2.082823753227341, + "grad_norm": 0.33693134784698486, + "learning_rate": 7.397574398695475e-06, + "loss": 4.8249, + "step": 30655 + }, + { + "epoch": 2.0831634732980024, + "grad_norm": 0.44470345973968506, + "learning_rate": 7.3971497486071485e-06, + "loss": 4.5291, + "step": 30660 + }, + { + "epoch": 2.083503193368664, + "grad_norm": 0.307306170463562, + "learning_rate": 7.396725098518821e-06, + "loss": 4.2862, + "step": 30665 + }, + { + "epoch": 2.083842913439326, + "grad_norm": 0.5135406255722046, + "learning_rate": 7.396300448430493e-06, + "loss": 4.503, + "step": 30670 + }, + { + "epoch": 2.084182633509988, + "grad_norm": 0.4308706820011139, + "learning_rate": 7.395875798342167e-06, + "loss": 4.4731, + "step": 30675 + }, + { + "epoch": 2.0845223535806494, + "grad_norm": 0.3359813094139099, + "learning_rate": 7.39545114825384e-06, + "loss": 4.4475, + "step": 30680 + }, + { + "epoch": 2.0848620736513115, + "grad_norm": 0.4341946840286255, + "learning_rate": 7.395026498165512e-06, + "loss": 4.5987, + "step": 30685 + }, + { + "epoch": 2.085201793721973, + "grad_norm": 0.3659304082393646, + "learning_rate": 7.394601848077185e-06, + "loss": 4.4979, + "step": 30690 + }, + { + "epoch": 2.0855415137926347, + "grad_norm": 0.36288368701934814, + "learning_rate": 7.394177197988858e-06, + "loss": 4.4053, + "step": 30695 + }, + { + "epoch": 2.085881233863297, + "grad_norm": 0.4952450692653656, + "learning_rate": 7.39375254790053e-06, + "loss": 4.204, + "step": 30700 + }, + { + "epoch": 2.0862209539339585, + "grad_norm": 0.3242911398410797, + "learning_rate": 7.393327897812204e-06, + "loss": 4.5169, + "step": 30705 + }, + { + "epoch": 2.08656067400462, + "grad_norm": 0.417003870010376, + "learning_rate": 7.392903247723876e-06, + "loss": 4.4972, + "step": 30710 + }, + { + "epoch": 2.086900394075282, + "grad_norm": 0.3884710967540741, + "learning_rate": 7.3924785976355484e-06, + "loss": 4.5256, + "step": 30715 + }, + { + "epoch": 2.087240114145944, + "grad_norm": 0.34252500534057617, + "learning_rate": 7.392053947547222e-06, + "loss": 4.5265, + "step": 30720 + }, + { + "epoch": 2.0875798342166054, + "grad_norm": 0.3472640812397003, + "learning_rate": 7.391629297458894e-06, + "loss": 4.945, + "step": 30725 + }, + { + "epoch": 2.0879195542872675, + "grad_norm": 0.34549078345298767, + "learning_rate": 7.391204647370567e-06, + "loss": 4.6203, + "step": 30730 + }, + { + "epoch": 2.088259274357929, + "grad_norm": 0.44265103340148926, + "learning_rate": 7.3907799972822405e-06, + "loss": 4.5653, + "step": 30735 + }, + { + "epoch": 2.0885989944285908, + "grad_norm": 0.2806856036186218, + "learning_rate": 7.3903553471939124e-06, + "loss": 4.3261, + "step": 30740 + }, + { + "epoch": 2.0889387144992524, + "grad_norm": 0.3195551335811615, + "learning_rate": 7.389930697105585e-06, + "loss": 4.6143, + "step": 30745 + }, + { + "epoch": 2.0892784345699145, + "grad_norm": 0.3760533332824707, + "learning_rate": 7.389506047017259e-06, + "loss": 4.4521, + "step": 30750 + }, + { + "epoch": 2.089618154640576, + "grad_norm": 0.35731643438339233, + "learning_rate": 7.389081396928931e-06, + "loss": 4.7464, + "step": 30755 + }, + { + "epoch": 2.0899578747112377, + "grad_norm": 0.38643357157707214, + "learning_rate": 7.388656746840604e-06, + "loss": 4.4862, + "step": 30760 + }, + { + "epoch": 2.0902975947819, + "grad_norm": 0.27833718061447144, + "learning_rate": 7.388232096752277e-06, + "loss": 4.9358, + "step": 30765 + }, + { + "epoch": 2.0906373148525614, + "grad_norm": 0.32238316535949707, + "learning_rate": 7.387807446663949e-06, + "loss": 4.4415, + "step": 30770 + }, + { + "epoch": 2.090977034923223, + "grad_norm": 0.30506429076194763, + "learning_rate": 7.387382796575622e-06, + "loss": 4.3304, + "step": 30775 + }, + { + "epoch": 2.091316754993885, + "grad_norm": 0.42381638288497925, + "learning_rate": 7.386958146487295e-06, + "loss": 4.5553, + "step": 30780 + }, + { + "epoch": 2.0916564750645468, + "grad_norm": 0.35364240407943726, + "learning_rate": 7.386533496398968e-06, + "loss": 4.4911, + "step": 30785 + }, + { + "epoch": 2.0919961951352084, + "grad_norm": 0.3432139456272125, + "learning_rate": 7.386108846310641e-06, + "loss": 4.4384, + "step": 30790 + }, + { + "epoch": 2.0923359152058705, + "grad_norm": 0.5573235750198364, + "learning_rate": 7.385684196222313e-06, + "loss": 4.6624, + "step": 30795 + }, + { + "epoch": 2.092675635276532, + "grad_norm": 0.35048744082450867, + "learning_rate": 7.385259546133986e-06, + "loss": 4.2339, + "step": 30800 + }, + { + "epoch": 2.0930153553471937, + "grad_norm": 0.6720607280731201, + "learning_rate": 7.38483489604566e-06, + "loss": 4.3188, + "step": 30805 + }, + { + "epoch": 2.093355075417856, + "grad_norm": 0.4066426455974579, + "learning_rate": 7.384410245957332e-06, + "loss": 4.4277, + "step": 30810 + }, + { + "epoch": 2.0936947954885174, + "grad_norm": 0.32910391688346863, + "learning_rate": 7.3839855958690044e-06, + "loss": 4.3958, + "step": 30815 + }, + { + "epoch": 2.094034515559179, + "grad_norm": 0.3357582986354828, + "learning_rate": 7.383560945780678e-06, + "loss": 4.3576, + "step": 30820 + }, + { + "epoch": 2.094374235629841, + "grad_norm": 0.6834408044815063, + "learning_rate": 7.38313629569235e-06, + "loss": 4.4265, + "step": 30825 + }, + { + "epoch": 2.094713955700503, + "grad_norm": 0.39478859305381775, + "learning_rate": 7.382711645604023e-06, + "loss": 4.5048, + "step": 30830 + }, + { + "epoch": 2.0950536757711644, + "grad_norm": 0.37314411997795105, + "learning_rate": 7.3822869955156965e-06, + "loss": 4.5066, + "step": 30835 + }, + { + "epoch": 2.0953933958418265, + "grad_norm": 0.3933577239513397, + "learning_rate": 7.3818623454273685e-06, + "loss": 4.4969, + "step": 30840 + }, + { + "epoch": 2.095733115912488, + "grad_norm": 0.3798844516277313, + "learning_rate": 7.381437695339041e-06, + "loss": 4.6449, + "step": 30845 + }, + { + "epoch": 2.0960728359831498, + "grad_norm": 0.3169780969619751, + "learning_rate": 7.381013045250715e-06, + "loss": 4.6164, + "step": 30850 + }, + { + "epoch": 2.096412556053812, + "grad_norm": 0.3838716447353363, + "learning_rate": 7.380588395162387e-06, + "loss": 4.3839, + "step": 30855 + }, + { + "epoch": 2.0967522761244735, + "grad_norm": 0.3689945936203003, + "learning_rate": 7.38016374507406e-06, + "loss": 4.523, + "step": 30860 + }, + { + "epoch": 2.097091996195135, + "grad_norm": 0.49764320254325867, + "learning_rate": 7.3797390949857325e-06, + "loss": 4.2826, + "step": 30865 + }, + { + "epoch": 2.097431716265797, + "grad_norm": 0.3236634135246277, + "learning_rate": 7.379314444897405e-06, + "loss": 4.4398, + "step": 30870 + }, + { + "epoch": 2.097771436336459, + "grad_norm": 0.3013951778411865, + "learning_rate": 7.378889794809077e-06, + "loss": 4.2835, + "step": 30875 + }, + { + "epoch": 2.0981111564071204, + "grad_norm": 0.39656156301498413, + "learning_rate": 7.378465144720751e-06, + "loss": 4.4102, + "step": 30880 + }, + { + "epoch": 2.0984508764777825, + "grad_norm": 0.3571912944316864, + "learning_rate": 7.378040494632424e-06, + "loss": 4.6247, + "step": 30885 + }, + { + "epoch": 2.098790596548444, + "grad_norm": 0.3566024899482727, + "learning_rate": 7.377615844544096e-06, + "loss": 4.7226, + "step": 30890 + }, + { + "epoch": 2.0991303166191058, + "grad_norm": 0.4206675887107849, + "learning_rate": 7.377191194455769e-06, + "loss": 4.4574, + "step": 30895 + }, + { + "epoch": 2.099470036689768, + "grad_norm": 0.3610745072364807, + "learning_rate": 7.376766544367442e-06, + "loss": 4.4815, + "step": 30900 + }, + { + "epoch": 2.0998097567604295, + "grad_norm": 0.3378487527370453, + "learning_rate": 7.376341894279114e-06, + "loss": 4.5425, + "step": 30905 + }, + { + "epoch": 2.100149476831091, + "grad_norm": 0.34606361389160156, + "learning_rate": 7.375917244190788e-06, + "loss": 4.509, + "step": 30910 + }, + { + "epoch": 2.100489196901753, + "grad_norm": 0.356599360704422, + "learning_rate": 7.3754925941024605e-06, + "loss": 4.6278, + "step": 30915 + }, + { + "epoch": 2.100828916972415, + "grad_norm": 0.35006821155548096, + "learning_rate": 7.375067944014132e-06, + "loss": 4.1588, + "step": 30920 + }, + { + "epoch": 2.1011686370430764, + "grad_norm": 0.7228179574012756, + "learning_rate": 7.374643293925806e-06, + "loss": 4.7879, + "step": 30925 + }, + { + "epoch": 2.101508357113738, + "grad_norm": 0.3473651111125946, + "learning_rate": 7.374218643837479e-06, + "loss": 4.2568, + "step": 30930 + }, + { + "epoch": 2.1018480771844, + "grad_norm": 0.482926607131958, + "learning_rate": 7.373793993749151e-06, + "loss": 4.4864, + "step": 30935 + }, + { + "epoch": 2.1021877972550618, + "grad_norm": 0.3623417019844055, + "learning_rate": 7.3733693436608245e-06, + "loss": 4.5558, + "step": 30940 + }, + { + "epoch": 2.1025275173257234, + "grad_norm": 0.4031290113925934, + "learning_rate": 7.372944693572497e-06, + "loss": 4.3228, + "step": 30945 + }, + { + "epoch": 2.1028672373963855, + "grad_norm": 0.4823075234889984, + "learning_rate": 7.372520043484169e-06, + "loss": 4.469, + "step": 30950 + }, + { + "epoch": 2.103206957467047, + "grad_norm": 0.682388424873352, + "learning_rate": 7.372095393395843e-06, + "loss": 4.623, + "step": 30955 + }, + { + "epoch": 2.1035466775377087, + "grad_norm": 0.3405109643936157, + "learning_rate": 7.371670743307515e-06, + "loss": 4.6721, + "step": 30960 + }, + { + "epoch": 2.103886397608371, + "grad_norm": 0.39426273107528687, + "learning_rate": 7.371246093219188e-06, + "loss": 4.4388, + "step": 30965 + }, + { + "epoch": 2.1042261176790324, + "grad_norm": 0.38182249665260315, + "learning_rate": 7.370821443130861e-06, + "loss": 4.3705, + "step": 30970 + }, + { + "epoch": 2.104565837749694, + "grad_norm": 0.4084590673446655, + "learning_rate": 7.370396793042533e-06, + "loss": 4.4125, + "step": 30975 + }, + { + "epoch": 2.104905557820356, + "grad_norm": 0.342626690864563, + "learning_rate": 7.369972142954206e-06, + "loss": 4.3662, + "step": 30980 + }, + { + "epoch": 2.105245277891018, + "grad_norm": 0.34432584047317505, + "learning_rate": 7.36954749286588e-06, + "loss": 4.3005, + "step": 30985 + }, + { + "epoch": 2.1055849979616794, + "grad_norm": 0.42360278964042664, + "learning_rate": 7.369122842777552e-06, + "loss": 4.4986, + "step": 30990 + }, + { + "epoch": 2.1059247180323415, + "grad_norm": 0.39092937111854553, + "learning_rate": 7.3686981926892244e-06, + "loss": 4.5111, + "step": 30995 + }, + { + "epoch": 2.106264438103003, + "grad_norm": 0.3864152133464813, + "learning_rate": 7.368273542600898e-06, + "loss": 4.6596, + "step": 31000 + }, + { + "epoch": 2.1066041581736648, + "grad_norm": 0.32832711935043335, + "learning_rate": 7.36784889251257e-06, + "loss": 4.5055, + "step": 31005 + }, + { + "epoch": 2.106943878244327, + "grad_norm": 0.3083752691745758, + "learning_rate": 7.367424242424243e-06, + "loss": 4.465, + "step": 31010 + }, + { + "epoch": 2.1072835983149885, + "grad_norm": 0.32494956254959106, + "learning_rate": 7.3669995923359165e-06, + "loss": 4.6355, + "step": 31015 + }, + { + "epoch": 2.10762331838565, + "grad_norm": 0.2799331843852997, + "learning_rate": 7.3665749422475884e-06, + "loss": 4.2515, + "step": 31020 + }, + { + "epoch": 2.107963038456312, + "grad_norm": 0.4161858856678009, + "learning_rate": 7.366150292159261e-06, + "loss": 4.3438, + "step": 31025 + }, + { + "epoch": 2.108302758526974, + "grad_norm": 0.32685795426368713, + "learning_rate": 7.365725642070934e-06, + "loss": 4.5164, + "step": 31030 + }, + { + "epoch": 2.1086424785976354, + "grad_norm": 0.5659498572349548, + "learning_rate": 7.365300991982607e-06, + "loss": 4.3383, + "step": 31035 + }, + { + "epoch": 2.1089821986682975, + "grad_norm": 0.3493325710296631, + "learning_rate": 7.36487634189428e-06, + "loss": 4.4903, + "step": 31040 + }, + { + "epoch": 2.109321918738959, + "grad_norm": 0.3478587865829468, + "learning_rate": 7.3644516918059524e-06, + "loss": 4.4454, + "step": 31045 + }, + { + "epoch": 2.1096616388096208, + "grad_norm": 0.6117619276046753, + "learning_rate": 7.364027041717625e-06, + "loss": 4.5547, + "step": 31050 + }, + { + "epoch": 2.110001358880283, + "grad_norm": 0.3987860679626465, + "learning_rate": 7.363602391629297e-06, + "loss": 4.6423, + "step": 31055 + }, + { + "epoch": 2.1103410789509445, + "grad_norm": 0.4396669566631317, + "learning_rate": 7.363177741540971e-06, + "loss": 4.7339, + "step": 31060 + }, + { + "epoch": 2.110680799021606, + "grad_norm": 0.37853845953941345, + "learning_rate": 7.362753091452644e-06, + "loss": 4.4505, + "step": 31065 + }, + { + "epoch": 2.111020519092268, + "grad_norm": 0.45746564865112305, + "learning_rate": 7.362328441364316e-06, + "loss": 4.6983, + "step": 31070 + }, + { + "epoch": 2.11136023916293, + "grad_norm": 0.37306392192840576, + "learning_rate": 7.361903791275989e-06, + "loss": 4.6114, + "step": 31075 + }, + { + "epoch": 2.1116999592335914, + "grad_norm": 0.5011457204818726, + "learning_rate": 7.361479141187662e-06, + "loss": 4.437, + "step": 31080 + }, + { + "epoch": 2.112039679304253, + "grad_norm": 0.46067366003990173, + "learning_rate": 7.361054491099334e-06, + "loss": 4.4752, + "step": 31085 + }, + { + "epoch": 2.112379399374915, + "grad_norm": 0.4164734482765198, + "learning_rate": 7.360629841011008e-06, + "loss": 4.6578, + "step": 31090 + }, + { + "epoch": 2.1127191194455768, + "grad_norm": 0.30018576979637146, + "learning_rate": 7.3602051909226804e-06, + "loss": 4.3638, + "step": 31095 + }, + { + "epoch": 2.1130588395162384, + "grad_norm": 0.32933852076530457, + "learning_rate": 7.359780540834352e-06, + "loss": 4.307, + "step": 31100 + }, + { + "epoch": 2.1133985595869005, + "grad_norm": 0.4296659529209137, + "learning_rate": 7.359355890746026e-06, + "loss": 4.5786, + "step": 31105 + }, + { + "epoch": 2.113738279657562, + "grad_norm": 0.3974895477294922, + "learning_rate": 7.358931240657699e-06, + "loss": 4.6212, + "step": 31110 + }, + { + "epoch": 2.1140779997282237, + "grad_norm": 0.3427017331123352, + "learning_rate": 7.358506590569371e-06, + "loss": 4.2631, + "step": 31115 + }, + { + "epoch": 2.114417719798886, + "grad_norm": 0.3423398435115814, + "learning_rate": 7.3580819404810444e-06, + "loss": 4.6048, + "step": 31120 + }, + { + "epoch": 2.1147574398695475, + "grad_norm": 0.3597431778907776, + "learning_rate": 7.357657290392716e-06, + "loss": 4.5468, + "step": 31125 + }, + { + "epoch": 2.115097159940209, + "grad_norm": 0.4631529152393341, + "learning_rate": 7.35723264030439e-06, + "loss": 4.3761, + "step": 31130 + }, + { + "epoch": 2.115436880010871, + "grad_norm": 0.47357526421546936, + "learning_rate": 7.356807990216063e-06, + "loss": 4.1473, + "step": 31135 + }, + { + "epoch": 2.115776600081533, + "grad_norm": 0.40023425221443176, + "learning_rate": 7.356383340127735e-06, + "loss": 4.4866, + "step": 31140 + }, + { + "epoch": 2.1161163201521944, + "grad_norm": 0.37423843145370483, + "learning_rate": 7.3559586900394085e-06, + "loss": 4.7321, + "step": 31145 + }, + { + "epoch": 2.1164560402228565, + "grad_norm": 0.5709297060966492, + "learning_rate": 7.355534039951081e-06, + "loss": 4.7079, + "step": 31150 + }, + { + "epoch": 2.116795760293518, + "grad_norm": 0.41821983456611633, + "learning_rate": 7.355109389862753e-06, + "loss": 4.3552, + "step": 31155 + }, + { + "epoch": 2.1171354803641798, + "grad_norm": 0.45297467708587646, + "learning_rate": 7.354684739774427e-06, + "loss": 4.5885, + "step": 31160 + }, + { + "epoch": 2.117475200434842, + "grad_norm": 0.30134010314941406, + "learning_rate": 7.3542600896861e-06, + "loss": 4.7033, + "step": 31165 + }, + { + "epoch": 2.1178149205055035, + "grad_norm": 0.41812142729759216, + "learning_rate": 7.353835439597772e-06, + "loss": 4.2552, + "step": 31170 + }, + { + "epoch": 2.118154640576165, + "grad_norm": 0.45992311835289, + "learning_rate": 7.353410789509445e-06, + "loss": 4.5638, + "step": 31175 + }, + { + "epoch": 2.118494360646827, + "grad_norm": 0.42144137620925903, + "learning_rate": 7.352986139421118e-06, + "loss": 4.4567, + "step": 31180 + }, + { + "epoch": 2.118834080717489, + "grad_norm": 0.37157565355300903, + "learning_rate": 7.35256148933279e-06, + "loss": 4.4609, + "step": 31185 + }, + { + "epoch": 2.1191738007881504, + "grad_norm": 0.3712669610977173, + "learning_rate": 7.352136839244464e-06, + "loss": 4.1795, + "step": 31190 + }, + { + "epoch": 2.1195135208588125, + "grad_norm": 0.43325284123420715, + "learning_rate": 7.3517121891561365e-06, + "loss": 4.6899, + "step": 31195 + }, + { + "epoch": 2.119853240929474, + "grad_norm": 0.39909130334854126, + "learning_rate": 7.351287539067808e-06, + "loss": 4.4645, + "step": 31200 + }, + { + "epoch": 2.1201929610001358, + "grad_norm": 0.37440136075019836, + "learning_rate": 7.350862888979482e-06, + "loss": 4.642, + "step": 31205 + }, + { + "epoch": 2.120532681070798, + "grad_norm": 0.5223204493522644, + "learning_rate": 7.350438238891154e-06, + "loss": 4.3359, + "step": 31210 + }, + { + "epoch": 2.1208724011414595, + "grad_norm": 0.49641454219818115, + "learning_rate": 7.350013588802827e-06, + "loss": 4.5479, + "step": 31215 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.440872460603714, + "learning_rate": 7.3495889387145005e-06, + "loss": 4.5574, + "step": 31220 + }, + { + "epoch": 2.121551841282783, + "grad_norm": 0.42667558789253235, + "learning_rate": 7.349164288626172e-06, + "loss": 4.4227, + "step": 31225 + }, + { + "epoch": 2.121891561353445, + "grad_norm": 0.3145882785320282, + "learning_rate": 7.348739638537845e-06, + "loss": 4.4075, + "step": 31230 + }, + { + "epoch": 2.1222312814241064, + "grad_norm": 0.5226957201957703, + "learning_rate": 7.348314988449519e-06, + "loss": 4.6336, + "step": 31235 + }, + { + "epoch": 2.1225710014947685, + "grad_norm": 0.32841843366622925, + "learning_rate": 7.347890338361191e-06, + "loss": 4.5535, + "step": 31240 + }, + { + "epoch": 2.12291072156543, + "grad_norm": 0.35920435190200806, + "learning_rate": 7.347465688272864e-06, + "loss": 4.6495, + "step": 31245 + }, + { + "epoch": 2.123250441636092, + "grad_norm": 0.36361101269721985, + "learning_rate": 7.347041038184537e-06, + "loss": 4.6014, + "step": 31250 + }, + { + "epoch": 2.123590161706754, + "grad_norm": 0.48959508538246155, + "learning_rate": 7.346616388096209e-06, + "loss": 4.5867, + "step": 31255 + }, + { + "epoch": 2.1239298817774155, + "grad_norm": 0.3431367576122284, + "learning_rate": 7.346191738007882e-06, + "loss": 4.3459, + "step": 31260 + }, + { + "epoch": 2.124269601848077, + "grad_norm": 0.5552244782447815, + "learning_rate": 7.345767087919556e-06, + "loss": 4.3884, + "step": 31265 + }, + { + "epoch": 2.1246093219187387, + "grad_norm": 0.4661150872707367, + "learning_rate": 7.345342437831228e-06, + "loss": 4.3583, + "step": 31270 + }, + { + "epoch": 2.124949041989401, + "grad_norm": 0.35648152232170105, + "learning_rate": 7.3449177877429004e-06, + "loss": 4.3989, + "step": 31275 + }, + { + "epoch": 2.1252887620600625, + "grad_norm": 0.34624364972114563, + "learning_rate": 7.344493137654573e-06, + "loss": 4.4704, + "step": 31280 + }, + { + "epoch": 2.125628482130724, + "grad_norm": 0.3031330108642578, + "learning_rate": 7.344068487566246e-06, + "loss": 4.6565, + "step": 31285 + }, + { + "epoch": 2.125968202201386, + "grad_norm": 0.4283573031425476, + "learning_rate": 7.343643837477919e-06, + "loss": 4.6263, + "step": 31290 + }, + { + "epoch": 2.126307922272048, + "grad_norm": 0.31427648663520813, + "learning_rate": 7.343219187389592e-06, + "loss": 4.3072, + "step": 31295 + }, + { + "epoch": 2.1266476423427094, + "grad_norm": 0.5724141597747803, + "learning_rate": 7.3427945373012644e-06, + "loss": 4.3221, + "step": 31300 + }, + { + "epoch": 2.1269873624133715, + "grad_norm": 0.3241426646709442, + "learning_rate": 7.342369887212936e-06, + "loss": 4.2512, + "step": 31305 + }, + { + "epoch": 2.127327082484033, + "grad_norm": 0.36064282059669495, + "learning_rate": 7.34194523712461e-06, + "loss": 4.4263, + "step": 31310 + }, + { + "epoch": 2.1276668025546948, + "grad_norm": 0.41086477041244507, + "learning_rate": 7.341520587036283e-06, + "loss": 4.4372, + "step": 31315 + }, + { + "epoch": 2.128006522625357, + "grad_norm": 0.38939112424850464, + "learning_rate": 7.341095936947955e-06, + "loss": 4.4111, + "step": 31320 + }, + { + "epoch": 2.1283462426960185, + "grad_norm": 0.36664584279060364, + "learning_rate": 7.3406712868596284e-06, + "loss": 4.6014, + "step": 31325 + }, + { + "epoch": 2.12868596276668, + "grad_norm": 0.49767011404037476, + "learning_rate": 7.340246636771301e-06, + "loss": 4.447, + "step": 31330 + }, + { + "epoch": 2.129025682837342, + "grad_norm": 0.36645135283470154, + "learning_rate": 7.339821986682973e-06, + "loss": 4.5845, + "step": 31335 + }, + { + "epoch": 2.129365402908004, + "grad_norm": 0.398077130317688, + "learning_rate": 7.339397336594647e-06, + "loss": 4.5236, + "step": 31340 + }, + { + "epoch": 2.1297051229786654, + "grad_norm": 0.4322221279144287, + "learning_rate": 7.33897268650632e-06, + "loss": 4.371, + "step": 31345 + }, + { + "epoch": 2.1300448430493275, + "grad_norm": 0.4020387530326843, + "learning_rate": 7.338548036417992e-06, + "loss": 4.4307, + "step": 31350 + }, + { + "epoch": 2.130384563119989, + "grad_norm": 0.40499117970466614, + "learning_rate": 7.338123386329665e-06, + "loss": 4.4527, + "step": 31355 + }, + { + "epoch": 2.1307242831906508, + "grad_norm": 0.34566617012023926, + "learning_rate": 7.337698736241338e-06, + "loss": 4.129, + "step": 31360 + }, + { + "epoch": 2.131064003261313, + "grad_norm": 0.35027942061424255, + "learning_rate": 7.33727408615301e-06, + "loss": 4.2642, + "step": 31365 + }, + { + "epoch": 2.1314037233319745, + "grad_norm": 0.32662954926490784, + "learning_rate": 7.336849436064684e-06, + "loss": 4.4274, + "step": 31370 + }, + { + "epoch": 2.131743443402636, + "grad_norm": 0.45722144842147827, + "learning_rate": 7.336424785976356e-06, + "loss": 4.7135, + "step": 31375 + }, + { + "epoch": 2.132083163473298, + "grad_norm": 0.4824817478656769, + "learning_rate": 7.336000135888028e-06, + "loss": 4.6062, + "step": 31380 + }, + { + "epoch": 2.13242288354396, + "grad_norm": 0.3660512864589691, + "learning_rate": 7.335575485799702e-06, + "loss": 4.4729, + "step": 31385 + }, + { + "epoch": 2.1327626036146214, + "grad_norm": 0.6136674880981445, + "learning_rate": 7.335150835711374e-06, + "loss": 4.584, + "step": 31390 + }, + { + "epoch": 2.1331023236852835, + "grad_norm": 0.44805338978767395, + "learning_rate": 7.334726185623047e-06, + "loss": 4.3787, + "step": 31395 + }, + { + "epoch": 2.133442043755945, + "grad_norm": 0.6896963119506836, + "learning_rate": 7.3343015355347204e-06, + "loss": 4.5405, + "step": 31400 + }, + { + "epoch": 2.133781763826607, + "grad_norm": 0.4591418206691742, + "learning_rate": 7.333876885446392e-06, + "loss": 4.3634, + "step": 31405 + }, + { + "epoch": 2.134121483897269, + "grad_norm": 0.31499719619750977, + "learning_rate": 7.333452235358065e-06, + "loss": 4.5158, + "step": 31410 + }, + { + "epoch": 2.1344612039679305, + "grad_norm": 0.5178230404853821, + "learning_rate": 7.333027585269739e-06, + "loss": 4.4291, + "step": 31415 + }, + { + "epoch": 2.134800924038592, + "grad_norm": 0.3615489602088928, + "learning_rate": 7.332602935181411e-06, + "loss": 4.5115, + "step": 31420 + }, + { + "epoch": 2.1351406441092537, + "grad_norm": 0.40835073590278625, + "learning_rate": 7.332178285093084e-06, + "loss": 4.6125, + "step": 31425 + }, + { + "epoch": 2.135480364179916, + "grad_norm": 0.49004870653152466, + "learning_rate": 7.331753635004757e-06, + "loss": 4.5969, + "step": 31430 + }, + { + "epoch": 2.1358200842505775, + "grad_norm": 0.572614312171936, + "learning_rate": 7.331328984916429e-06, + "loss": 4.2454, + "step": 31435 + }, + { + "epoch": 2.136159804321239, + "grad_norm": 0.44841107726097107, + "learning_rate": 7.330904334828102e-06, + "loss": 4.5732, + "step": 31440 + }, + { + "epoch": 2.136499524391901, + "grad_norm": 0.33612576127052307, + "learning_rate": 7.330479684739775e-06, + "loss": 4.3106, + "step": 31445 + }, + { + "epoch": 2.136839244462563, + "grad_norm": 0.4401785433292389, + "learning_rate": 7.330055034651448e-06, + "loss": 4.4456, + "step": 31450 + }, + { + "epoch": 2.1371789645332244, + "grad_norm": 0.38039952516555786, + "learning_rate": 7.32963038456312e-06, + "loss": 4.4703, + "step": 31455 + }, + { + "epoch": 2.1375186846038865, + "grad_norm": 0.34236228466033936, + "learning_rate": 7.329205734474793e-06, + "loss": 4.4049, + "step": 31460 + }, + { + "epoch": 2.137858404674548, + "grad_norm": 0.39476844668388367, + "learning_rate": 7.328781084386466e-06, + "loss": 4.4396, + "step": 31465 + }, + { + "epoch": 2.1381981247452098, + "grad_norm": 0.4526391327381134, + "learning_rate": 7.32835643429814e-06, + "loss": 4.442, + "step": 31470 + }, + { + "epoch": 2.138537844815872, + "grad_norm": 0.4218117594718933, + "learning_rate": 7.327931784209812e-06, + "loss": 4.0837, + "step": 31475 + }, + { + "epoch": 2.1388775648865335, + "grad_norm": 0.3798951804637909, + "learning_rate": 7.327507134121484e-06, + "loss": 4.2133, + "step": 31480 + }, + { + "epoch": 2.139217284957195, + "grad_norm": 0.3937246799468994, + "learning_rate": 7.327082484033158e-06, + "loss": 4.459, + "step": 31485 + }, + { + "epoch": 2.139557005027857, + "grad_norm": 0.5592990517616272, + "learning_rate": 7.32665783394483e-06, + "loss": 4.56, + "step": 31490 + }, + { + "epoch": 2.139896725098519, + "grad_norm": 0.6309366822242737, + "learning_rate": 7.326233183856503e-06, + "loss": 4.5342, + "step": 31495 + }, + { + "epoch": 2.1402364451691804, + "grad_norm": 0.32375723123550415, + "learning_rate": 7.3258085337681765e-06, + "loss": 4.2187, + "step": 31500 + }, + { + "epoch": 2.1405761652398425, + "grad_norm": 0.33367735147476196, + "learning_rate": 7.325383883679848e-06, + "loss": 4.6176, + "step": 31505 + }, + { + "epoch": 2.140915885310504, + "grad_norm": 0.48898711800575256, + "learning_rate": 7.324959233591521e-06, + "loss": 4.5187, + "step": 31510 + }, + { + "epoch": 2.1412556053811658, + "grad_norm": 0.38700738549232483, + "learning_rate": 7.324534583503195e-06, + "loss": 4.4568, + "step": 31515 + }, + { + "epoch": 2.141595325451828, + "grad_norm": 0.39195236563682556, + "learning_rate": 7.324109933414867e-06, + "loss": 4.4185, + "step": 31520 + }, + { + "epoch": 2.1419350455224895, + "grad_norm": 0.4190770983695984, + "learning_rate": 7.32368528332654e-06, + "loss": 4.5094, + "step": 31525 + }, + { + "epoch": 2.142274765593151, + "grad_norm": 0.38271716237068176, + "learning_rate": 7.323260633238212e-06, + "loss": 4.6601, + "step": 31530 + }, + { + "epoch": 2.142614485663813, + "grad_norm": 0.3466336131095886, + "learning_rate": 7.322835983149885e-06, + "loss": 4.4373, + "step": 31535 + }, + { + "epoch": 2.142954205734475, + "grad_norm": 0.33356258273124695, + "learning_rate": 7.322411333061558e-06, + "loss": 4.6304, + "step": 31540 + }, + { + "epoch": 2.1432939258051364, + "grad_norm": 0.45868444442749023, + "learning_rate": 7.321986682973231e-06, + "loss": 4.3815, + "step": 31545 + }, + { + "epoch": 2.1436336458757985, + "grad_norm": 0.2534137964248657, + "learning_rate": 7.321562032884904e-06, + "loss": 4.5815, + "step": 31550 + }, + { + "epoch": 2.14397336594646, + "grad_norm": 0.519102931022644, + "learning_rate": 7.3211373827965756e-06, + "loss": 4.7236, + "step": 31555 + }, + { + "epoch": 2.144313086017122, + "grad_norm": 0.37167462706565857, + "learning_rate": 7.320712732708249e-06, + "loss": 4.4776, + "step": 31560 + }, + { + "epoch": 2.144652806087784, + "grad_norm": 0.5548316240310669, + "learning_rate": 7.320288082619922e-06, + "loss": 4.9059, + "step": 31565 + }, + { + "epoch": 2.1449925261584455, + "grad_norm": 0.3981879651546478, + "learning_rate": 7.319863432531594e-06, + "loss": 4.2685, + "step": 31570 + }, + { + "epoch": 2.145332246229107, + "grad_norm": 0.45142245292663574, + "learning_rate": 7.319438782443268e-06, + "loss": 4.6204, + "step": 31575 + }, + { + "epoch": 2.145671966299769, + "grad_norm": 0.4538729190826416, + "learning_rate": 7.3190141323549404e-06, + "loss": 4.4788, + "step": 31580 + }, + { + "epoch": 2.146011686370431, + "grad_norm": 0.43602532148361206, + "learning_rate": 7.318589482266612e-06, + "loss": 4.2809, + "step": 31585 + }, + { + "epoch": 2.1463514064410925, + "grad_norm": 0.4274493455886841, + "learning_rate": 7.318164832178286e-06, + "loss": 4.5676, + "step": 31590 + }, + { + "epoch": 2.1466911265117545, + "grad_norm": 0.4902375638484955, + "learning_rate": 7.317740182089959e-06, + "loss": 4.4693, + "step": 31595 + }, + { + "epoch": 2.147030846582416, + "grad_norm": 0.3714122772216797, + "learning_rate": 7.317315532001631e-06, + "loss": 4.4266, + "step": 31600 + }, + { + "epoch": 2.147370566653078, + "grad_norm": 0.4895103871822357, + "learning_rate": 7.3168908819133044e-06, + "loss": 4.4374, + "step": 31605 + }, + { + "epoch": 2.14771028672374, + "grad_norm": 0.5124219655990601, + "learning_rate": 7.316466231824977e-06, + "loss": 4.3867, + "step": 31610 + }, + { + "epoch": 2.1480500067944015, + "grad_norm": 0.35056376457214355, + "learning_rate": 7.316041581736649e-06, + "loss": 4.303, + "step": 31615 + }, + { + "epoch": 2.148389726865063, + "grad_norm": 0.356959730386734, + "learning_rate": 7.315616931648323e-06, + "loss": 4.3505, + "step": 31620 + }, + { + "epoch": 2.1487294469357248, + "grad_norm": 0.5258558988571167, + "learning_rate": 7.315192281559995e-06, + "loss": 4.6066, + "step": 31625 + }, + { + "epoch": 2.149069167006387, + "grad_norm": 0.5588876605033875, + "learning_rate": 7.314767631471668e-06, + "loss": 4.2228, + "step": 31630 + }, + { + "epoch": 2.1494088870770485, + "grad_norm": 0.5271003246307373, + "learning_rate": 7.314342981383341e-06, + "loss": 4.4653, + "step": 31635 + }, + { + "epoch": 2.14974860714771, + "grad_norm": 0.46959084272384644, + "learning_rate": 7.313918331295013e-06, + "loss": 4.7491, + "step": 31640 + }, + { + "epoch": 2.150088327218372, + "grad_norm": 0.42539164423942566, + "learning_rate": 7.313493681206686e-06, + "loss": 4.5402, + "step": 31645 + }, + { + "epoch": 2.150428047289034, + "grad_norm": 0.4492422044277191, + "learning_rate": 7.31306903111836e-06, + "loss": 4.715, + "step": 31650 + }, + { + "epoch": 2.1507677673596954, + "grad_norm": 0.3144838809967041, + "learning_rate": 7.312644381030032e-06, + "loss": 4.5357, + "step": 31655 + }, + { + "epoch": 2.1511074874303575, + "grad_norm": 0.3081875145435333, + "learning_rate": 7.312219730941704e-06, + "loss": 4.6753, + "step": 31660 + }, + { + "epoch": 2.151447207501019, + "grad_norm": 0.354669988155365, + "learning_rate": 7.311795080853378e-06, + "loss": 4.5079, + "step": 31665 + }, + { + "epoch": 2.1517869275716808, + "grad_norm": 0.3581621050834656, + "learning_rate": 7.31137043076505e-06, + "loss": 4.5648, + "step": 31670 + }, + { + "epoch": 2.152126647642343, + "grad_norm": 0.4403235912322998, + "learning_rate": 7.310945780676723e-06, + "loss": 4.5422, + "step": 31675 + }, + { + "epoch": 2.1524663677130045, + "grad_norm": 0.456778883934021, + "learning_rate": 7.3105211305883964e-06, + "loss": 4.3155, + "step": 31680 + }, + { + "epoch": 2.152806087783666, + "grad_norm": 0.44174090027809143, + "learning_rate": 7.310096480500068e-06, + "loss": 4.543, + "step": 31685 + }, + { + "epoch": 2.153145807854328, + "grad_norm": 0.387722373008728, + "learning_rate": 7.309671830411741e-06, + "loss": 4.6406, + "step": 31690 + }, + { + "epoch": 2.15348552792499, + "grad_norm": 0.3931795656681061, + "learning_rate": 7.309247180323414e-06, + "loss": 4.2867, + "step": 31695 + }, + { + "epoch": 2.1538252479956514, + "grad_norm": 0.4181809425354004, + "learning_rate": 7.308822530235087e-06, + "loss": 4.5618, + "step": 31700 + }, + { + "epoch": 2.1541649680663135, + "grad_norm": 0.49515172839164734, + "learning_rate": 7.30839788014676e-06, + "loss": 4.514, + "step": 31705 + }, + { + "epoch": 2.154504688136975, + "grad_norm": 0.37443387508392334, + "learning_rate": 7.307973230058432e-06, + "loss": 4.5389, + "step": 31710 + }, + { + "epoch": 2.154844408207637, + "grad_norm": 0.2758277654647827, + "learning_rate": 7.307548579970105e-06, + "loss": 4.285, + "step": 31715 + }, + { + "epoch": 2.155184128278299, + "grad_norm": 0.4259923994541168, + "learning_rate": 7.307123929881777e-06, + "loss": 4.3174, + "step": 31720 + }, + { + "epoch": 2.1555238483489605, + "grad_norm": 0.35332897305488586, + "learning_rate": 7.306699279793451e-06, + "loss": 4.2718, + "step": 31725 + }, + { + "epoch": 2.155863568419622, + "grad_norm": 0.2904186546802521, + "learning_rate": 7.306274629705124e-06, + "loss": 4.5083, + "step": 31730 + }, + { + "epoch": 2.156203288490284, + "grad_norm": 0.5490436553955078, + "learning_rate": 7.3058499796167956e-06, + "loss": 4.2191, + "step": 31735 + }, + { + "epoch": 2.156543008560946, + "grad_norm": 0.44197899103164673, + "learning_rate": 7.305425329528469e-06, + "loss": 4.6859, + "step": 31740 + }, + { + "epoch": 2.1568827286316075, + "grad_norm": 0.32253819704055786, + "learning_rate": 7.305000679440142e-06, + "loss": 4.5648, + "step": 31745 + }, + { + "epoch": 2.1572224487022695, + "grad_norm": 0.39641067385673523, + "learning_rate": 7.304576029351814e-06, + "loss": 4.2243, + "step": 31750 + }, + { + "epoch": 2.157562168772931, + "grad_norm": 0.5026589632034302, + "learning_rate": 7.304236309281153e-06, + "loss": 4.7782, + "step": 31755 + }, + { + "epoch": 2.157901888843593, + "grad_norm": 0.5710040330886841, + "learning_rate": 7.303811659192826e-06, + "loss": 4.5835, + "step": 31760 + }, + { + "epoch": 2.1582416089142544, + "grad_norm": 0.46071597933769226, + "learning_rate": 7.303387009104499e-06, + "loss": 4.4616, + "step": 31765 + }, + { + "epoch": 2.1585813289849165, + "grad_norm": 0.37069442868232727, + "learning_rate": 7.302962359016171e-06, + "loss": 4.5952, + "step": 31770 + }, + { + "epoch": 2.158921049055578, + "grad_norm": 0.4856126308441162, + "learning_rate": 7.302537708927844e-06, + "loss": 4.3033, + "step": 31775 + }, + { + "epoch": 2.1592607691262398, + "grad_norm": 0.34542587399482727, + "learning_rate": 7.302113058839517e-06, + "loss": 4.5103, + "step": 31780 + }, + { + "epoch": 2.159600489196902, + "grad_norm": 0.37260568141937256, + "learning_rate": 7.30168840875119e-06, + "loss": 4.4896, + "step": 31785 + }, + { + "epoch": 2.1599402092675635, + "grad_norm": 0.39199918508529663, + "learning_rate": 7.3012637586628625e-06, + "loss": 4.5735, + "step": 31790 + }, + { + "epoch": 2.160279929338225, + "grad_norm": 0.3567996323108673, + "learning_rate": 7.300839108574535e-06, + "loss": 4.5094, + "step": 31795 + }, + { + "epoch": 2.160619649408887, + "grad_norm": 0.35118502378463745, + "learning_rate": 7.300414458486208e-06, + "loss": 4.2612, + "step": 31800 + }, + { + "epoch": 2.160959369479549, + "grad_norm": 0.5866617560386658, + "learning_rate": 7.29998980839788e-06, + "loss": 4.6815, + "step": 31805 + }, + { + "epoch": 2.1612990895502104, + "grad_norm": 0.33925861120224, + "learning_rate": 7.299565158309554e-06, + "loss": 4.8308, + "step": 31810 + }, + { + "epoch": 2.1616388096208725, + "grad_norm": 0.41139060258865356, + "learning_rate": 7.2991405082212265e-06, + "loss": 4.3603, + "step": 31815 + }, + { + "epoch": 2.161978529691534, + "grad_norm": 0.33483198285102844, + "learning_rate": 7.2987158581328984e-06, + "loss": 4.3672, + "step": 31820 + }, + { + "epoch": 2.1623182497621958, + "grad_norm": 0.39327239990234375, + "learning_rate": 7.298291208044572e-06, + "loss": 4.5406, + "step": 31825 + }, + { + "epoch": 2.162657969832858, + "grad_norm": 0.35845550894737244, + "learning_rate": 7.297866557956245e-06, + "loss": 4.5512, + "step": 31830 + }, + { + "epoch": 2.1629976899035195, + "grad_norm": 0.4754214584827423, + "learning_rate": 7.297441907867917e-06, + "loss": 4.4652, + "step": 31835 + }, + { + "epoch": 2.163337409974181, + "grad_norm": 0.382125586271286, + "learning_rate": 7.2970172577795905e-06, + "loss": 4.6149, + "step": 31840 + }, + { + "epoch": 2.163677130044843, + "grad_norm": 0.4026920199394226, + "learning_rate": 7.296592607691263e-06, + "loss": 4.5778, + "step": 31845 + }, + { + "epoch": 2.164016850115505, + "grad_norm": 0.4150540232658386, + "learning_rate": 7.296167957602935e-06, + "loss": 4.3609, + "step": 31850 + }, + { + "epoch": 2.1643565701861665, + "grad_norm": 0.3187137246131897, + "learning_rate": 7.295743307514609e-06, + "loss": 4.4094, + "step": 31855 + }, + { + "epoch": 2.1646962902568285, + "grad_norm": 0.4462384283542633, + "learning_rate": 7.295318657426282e-06, + "loss": 4.4831, + "step": 31860 + }, + { + "epoch": 2.16503601032749, + "grad_norm": 0.3420651853084564, + "learning_rate": 7.294894007337954e-06, + "loss": 4.6379, + "step": 31865 + }, + { + "epoch": 2.165375730398152, + "grad_norm": 0.37163135409355164, + "learning_rate": 7.294469357249627e-06, + "loss": 4.4872, + "step": 31870 + }, + { + "epoch": 2.165715450468814, + "grad_norm": 0.3762786388397217, + "learning_rate": 7.294044707161299e-06, + "loss": 4.4904, + "step": 31875 + }, + { + "epoch": 2.1660551705394755, + "grad_norm": 0.46364641189575195, + "learning_rate": 7.293620057072972e-06, + "loss": 4.5007, + "step": 31880 + }, + { + "epoch": 2.166394890610137, + "grad_norm": 0.27322229743003845, + "learning_rate": 7.293195406984646e-06, + "loss": 4.5395, + "step": 31885 + }, + { + "epoch": 2.166734610680799, + "grad_norm": 0.41728997230529785, + "learning_rate": 7.292770756896318e-06, + "loss": 4.5276, + "step": 31890 + }, + { + "epoch": 2.167074330751461, + "grad_norm": 0.43520012497901917, + "learning_rate": 7.2923461068079905e-06, + "loss": 4.6951, + "step": 31895 + }, + { + "epoch": 2.1674140508221225, + "grad_norm": 0.39223581552505493, + "learning_rate": 7.291921456719664e-06, + "loss": 4.2402, + "step": 31900 + }, + { + "epoch": 2.1677537708927845, + "grad_norm": 0.3406607210636139, + "learning_rate": 7.291496806631336e-06, + "loss": 4.5186, + "step": 31905 + }, + { + "epoch": 2.168093490963446, + "grad_norm": 0.4589231312274933, + "learning_rate": 7.291072156543009e-06, + "loss": 4.7392, + "step": 31910 + }, + { + "epoch": 2.168433211034108, + "grad_norm": 0.4921623766422272, + "learning_rate": 7.2906475064546825e-06, + "loss": 4.6128, + "step": 31915 + }, + { + "epoch": 2.16877293110477, + "grad_norm": 0.4429367184638977, + "learning_rate": 7.2902228563663545e-06, + "loss": 4.4756, + "step": 31920 + }, + { + "epoch": 2.1691126511754315, + "grad_norm": 0.3490200936794281, + "learning_rate": 7.289798206278027e-06, + "loss": 4.4491, + "step": 31925 + }, + { + "epoch": 2.169452371246093, + "grad_norm": 0.36651068925857544, + "learning_rate": 7.289373556189701e-06, + "loss": 4.3168, + "step": 31930 + }, + { + "epoch": 2.169792091316755, + "grad_norm": 0.38714873790740967, + "learning_rate": 7.288948906101373e-06, + "loss": 4.747, + "step": 31935 + }, + { + "epoch": 2.170131811387417, + "grad_norm": 0.4233246445655823, + "learning_rate": 7.288524256013046e-06, + "loss": 4.4693, + "step": 31940 + }, + { + "epoch": 2.1704715314580785, + "grad_norm": 0.41216927766799927, + "learning_rate": 7.2880996059247185e-06, + "loss": 4.4727, + "step": 31945 + }, + { + "epoch": 2.1708112515287405, + "grad_norm": 0.41921770572662354, + "learning_rate": 7.287674955836391e-06, + "loss": 4.293, + "step": 31950 + }, + { + "epoch": 2.171150971599402, + "grad_norm": 0.5846239328384399, + "learning_rate": 7.287250305748064e-06, + "loss": 4.459, + "step": 31955 + }, + { + "epoch": 2.171490691670064, + "grad_norm": 0.36743220686912537, + "learning_rate": 7.286825655659737e-06, + "loss": 4.3678, + "step": 31960 + }, + { + "epoch": 2.1718304117407254, + "grad_norm": 0.3776685297489166, + "learning_rate": 7.28640100557141e-06, + "loss": 4.5282, + "step": 31965 + }, + { + "epoch": 2.1721701318113875, + "grad_norm": 0.36286887526512146, + "learning_rate": 7.285976355483082e-06, + "loss": 4.4944, + "step": 31970 + }, + { + "epoch": 2.172509851882049, + "grad_norm": 0.5972253084182739, + "learning_rate": 7.285551705394755e-06, + "loss": 4.6382, + "step": 31975 + }, + { + "epoch": 2.172849571952711, + "grad_norm": 0.4926326870918274, + "learning_rate": 7.285127055306428e-06, + "loss": 4.3221, + "step": 31980 + }, + { + "epoch": 2.173189292023373, + "grad_norm": 0.4244426488876343, + "learning_rate": 7.2847024052181e-06, + "loss": 4.3112, + "step": 31985 + }, + { + "epoch": 2.1735290120940345, + "grad_norm": 0.45897096395492554, + "learning_rate": 7.284277755129774e-06, + "loss": 4.2748, + "step": 31990 + }, + { + "epoch": 2.173868732164696, + "grad_norm": 0.5013942718505859, + "learning_rate": 7.2838531050414465e-06, + "loss": 4.5269, + "step": 31995 + }, + { + "epoch": 2.174208452235358, + "grad_norm": 0.34067627787590027, + "learning_rate": 7.283428454953118e-06, + "loss": 4.182, + "step": 32000 + }, + { + "epoch": 2.17454817230602, + "grad_norm": 0.4389899671077728, + "learning_rate": 7.283003804864792e-06, + "loss": 4.622, + "step": 32005 + }, + { + "epoch": 2.1748878923766815, + "grad_norm": 0.44090643525123596, + "learning_rate": 7.282579154776465e-06, + "loss": 4.501, + "step": 32010 + }, + { + "epoch": 2.1752276124473435, + "grad_norm": 0.3146018087863922, + "learning_rate": 7.2821545046881385e-06, + "loss": 4.551, + "step": 32015 + }, + { + "epoch": 2.175567332518005, + "grad_norm": 0.5104544162750244, + "learning_rate": 7.2817298545998105e-06, + "loss": 4.6265, + "step": 32020 + }, + { + "epoch": 2.175907052588667, + "grad_norm": 0.4609287977218628, + "learning_rate": 7.281305204511483e-06, + "loss": 4.5622, + "step": 32025 + }, + { + "epoch": 2.176246772659329, + "grad_norm": 0.46087849140167236, + "learning_rate": 7.280880554423156e-06, + "loss": 4.3672, + "step": 32030 + }, + { + "epoch": 2.1765864927299905, + "grad_norm": 0.709624171257019, + "learning_rate": 7.280455904334829e-06, + "loss": 4.5662, + "step": 32035 + }, + { + "epoch": 2.176926212800652, + "grad_norm": 0.2862081527709961, + "learning_rate": 7.280031254246502e-06, + "loss": 4.1839, + "step": 32040 + }, + { + "epoch": 2.177265932871314, + "grad_norm": 0.3525993227958679, + "learning_rate": 7.2796066041581745e-06, + "loss": 4.3981, + "step": 32045 + }, + { + "epoch": 2.177605652941976, + "grad_norm": 0.4470003843307495, + "learning_rate": 7.279181954069847e-06, + "loss": 4.3874, + "step": 32050 + }, + { + "epoch": 2.1779453730126375, + "grad_norm": 0.4075321555137634, + "learning_rate": 7.278757303981519e-06, + "loss": 4.4858, + "step": 32055 + }, + { + "epoch": 2.1782850930832995, + "grad_norm": 0.3293878138065338, + "learning_rate": 7.278332653893193e-06, + "loss": 4.4746, + "step": 32060 + }, + { + "epoch": 2.178624813153961, + "grad_norm": 0.44205406308174133, + "learning_rate": 7.277908003804866e-06, + "loss": 4.5316, + "step": 32065 + }, + { + "epoch": 2.178964533224623, + "grad_norm": 0.31192171573638916, + "learning_rate": 7.277483353716538e-06, + "loss": 4.7591, + "step": 32070 + }, + { + "epoch": 2.179304253295285, + "grad_norm": 0.4289840757846832, + "learning_rate": 7.277058703628211e-06, + "loss": 4.4515, + "step": 32075 + }, + { + "epoch": 2.1796439733659465, + "grad_norm": 0.3999194800853729, + "learning_rate": 7.276634053539884e-06, + "loss": 4.6273, + "step": 32080 + }, + { + "epoch": 2.179983693436608, + "grad_norm": 0.39931216835975647, + "learning_rate": 7.276209403451556e-06, + "loss": 4.6024, + "step": 32085 + }, + { + "epoch": 2.18032341350727, + "grad_norm": 0.4180415868759155, + "learning_rate": 7.27578475336323e-06, + "loss": 4.0791, + "step": 32090 + }, + { + "epoch": 2.180663133577932, + "grad_norm": 0.49892184138298035, + "learning_rate": 7.2753601032749025e-06, + "loss": 4.4383, + "step": 32095 + }, + { + "epoch": 2.1810028536485935, + "grad_norm": 0.30577850341796875, + "learning_rate": 7.2749354531865744e-06, + "loss": 4.4475, + "step": 32100 + }, + { + "epoch": 2.181342573719255, + "grad_norm": 0.5910438299179077, + "learning_rate": 7.274510803098248e-06, + "loss": 4.4574, + "step": 32105 + }, + { + "epoch": 2.181682293789917, + "grad_norm": 0.5524200201034546, + "learning_rate": 7.274086153009921e-06, + "loss": 4.4237, + "step": 32110 + }, + { + "epoch": 2.182022013860579, + "grad_norm": 0.45853984355926514, + "learning_rate": 7.273661502921593e-06, + "loss": 4.3376, + "step": 32115 + }, + { + "epoch": 2.1823617339312404, + "grad_norm": 0.553009033203125, + "learning_rate": 7.2732368528332665e-06, + "loss": 4.4797, + "step": 32120 + }, + { + "epoch": 2.1827014540019025, + "grad_norm": 0.36353668570518494, + "learning_rate": 7.2728122027449384e-06, + "loss": 4.5523, + "step": 32125 + }, + { + "epoch": 2.183041174072564, + "grad_norm": 0.34307506680488586, + "learning_rate": 7.272387552656611e-06, + "loss": 4.2804, + "step": 32130 + }, + { + "epoch": 2.183380894143226, + "grad_norm": 0.43654361367225647, + "learning_rate": 7.271962902568285e-06, + "loss": 4.4839, + "step": 32135 + }, + { + "epoch": 2.183720614213888, + "grad_norm": 0.3334740102291107, + "learning_rate": 7.271538252479957e-06, + "loss": 4.4672, + "step": 32140 + }, + { + "epoch": 2.1840603342845495, + "grad_norm": 0.35116681456565857, + "learning_rate": 7.27111360239163e-06, + "loss": 4.147, + "step": 32145 + }, + { + "epoch": 2.184400054355211, + "grad_norm": 0.38567739725112915, + "learning_rate": 7.270688952303303e-06, + "loss": 4.3212, + "step": 32150 + }, + { + "epoch": 2.184739774425873, + "grad_norm": 0.5235497951507568, + "learning_rate": 7.270264302214975e-06, + "loss": 4.2636, + "step": 32155 + }, + { + "epoch": 2.185079494496535, + "grad_norm": 0.37254083156585693, + "learning_rate": 7.269839652126648e-06, + "loss": 4.3969, + "step": 32160 + }, + { + "epoch": 2.1854192145671965, + "grad_norm": 0.3496417999267578, + "learning_rate": 7.269415002038322e-06, + "loss": 4.2498, + "step": 32165 + }, + { + "epoch": 2.1857589346378585, + "grad_norm": 0.32791751623153687, + "learning_rate": 7.268990351949994e-06, + "loss": 4.3994, + "step": 32170 + }, + { + "epoch": 2.18609865470852, + "grad_norm": 0.5835317969322205, + "learning_rate": 7.2685657018616664e-06, + "loss": 4.2755, + "step": 32175 + }, + { + "epoch": 2.186438374779182, + "grad_norm": 0.3495549261569977, + "learning_rate": 7.26814105177334e-06, + "loss": 4.4006, + "step": 32180 + }, + { + "epoch": 2.186778094849844, + "grad_norm": 0.397574782371521, + "learning_rate": 7.267716401685012e-06, + "loss": 4.3976, + "step": 32185 + }, + { + "epoch": 2.1871178149205055, + "grad_norm": 0.34778112173080444, + "learning_rate": 7.267291751596685e-06, + "loss": 4.4964, + "step": 32190 + }, + { + "epoch": 2.187457534991167, + "grad_norm": 0.46371400356292725, + "learning_rate": 7.266867101508358e-06, + "loss": 4.4074, + "step": 32195 + }, + { + "epoch": 2.187797255061829, + "grad_norm": 0.4345504641532898, + "learning_rate": 7.2664424514200305e-06, + "loss": 4.1663, + "step": 32200 + }, + { + "epoch": 2.188136975132491, + "grad_norm": 0.7311795353889465, + "learning_rate": 7.266017801331703e-06, + "loss": 4.4515, + "step": 32205 + }, + { + "epoch": 2.1884766952031525, + "grad_norm": 0.35861602425575256, + "learning_rate": 7.265593151243376e-06, + "loss": 4.4242, + "step": 32210 + }, + { + "epoch": 2.1888164152738145, + "grad_norm": 0.4653001129627228, + "learning_rate": 7.265168501155049e-06, + "loss": 4.41, + "step": 32215 + }, + { + "epoch": 2.189156135344476, + "grad_norm": 0.4276869297027588, + "learning_rate": 7.264743851066721e-06, + "loss": 4.4295, + "step": 32220 + }, + { + "epoch": 2.189495855415138, + "grad_norm": 0.3706018626689911, + "learning_rate": 7.2643192009783945e-06, + "loss": 4.4981, + "step": 32225 + }, + { + "epoch": 2.1898355754858, + "grad_norm": 0.59515380859375, + "learning_rate": 7.263894550890067e-06, + "loss": 4.664, + "step": 32230 + }, + { + "epoch": 2.1901752955564615, + "grad_norm": 0.43135198950767517, + "learning_rate": 7.263469900801739e-06, + "loss": 4.5346, + "step": 32235 + }, + { + "epoch": 2.190515015627123, + "grad_norm": 0.3892386853694916, + "learning_rate": 7.263045250713413e-06, + "loss": 4.427, + "step": 32240 + }, + { + "epoch": 2.190854735697785, + "grad_norm": 0.44741904735565186, + "learning_rate": 7.262620600625086e-06, + "loss": 4.2213, + "step": 32245 + }, + { + "epoch": 2.191194455768447, + "grad_norm": 0.31596043705940247, + "learning_rate": 7.262195950536758e-06, + "loss": 4.5596, + "step": 32250 + }, + { + "epoch": 2.1915341758391085, + "grad_norm": 0.37363389134407043, + "learning_rate": 7.261771300448431e-06, + "loss": 4.5463, + "step": 32255 + }, + { + "epoch": 2.1918738959097706, + "grad_norm": 0.3588021397590637, + "learning_rate": 7.261346650360104e-06, + "loss": 4.2993, + "step": 32260 + }, + { + "epoch": 2.192213615980432, + "grad_norm": 0.345658540725708, + "learning_rate": 7.260922000271776e-06, + "loss": 4.7227, + "step": 32265 + }, + { + "epoch": 2.192553336051094, + "grad_norm": 0.3547104597091675, + "learning_rate": 7.26049735018345e-06, + "loss": 4.5999, + "step": 32270 + }, + { + "epoch": 2.192893056121756, + "grad_norm": 0.4759494364261627, + "learning_rate": 7.2600727000951225e-06, + "loss": 4.4416, + "step": 32275 + }, + { + "epoch": 2.1932327761924175, + "grad_norm": 0.5652551651000977, + "learning_rate": 7.259648050006794e-06, + "loss": 4.554, + "step": 32280 + }, + { + "epoch": 2.193572496263079, + "grad_norm": 0.5124533176422119, + "learning_rate": 7.259223399918468e-06, + "loss": 3.9911, + "step": 32285 + }, + { + "epoch": 2.1939122163337412, + "grad_norm": 0.535819947719574, + "learning_rate": 7.25879874983014e-06, + "loss": 4.4782, + "step": 32290 + }, + { + "epoch": 2.194251936404403, + "grad_norm": 0.4384392201900482, + "learning_rate": 7.258374099741813e-06, + "loss": 4.6206, + "step": 32295 + }, + { + "epoch": 2.1945916564750645, + "grad_norm": 0.4829307198524475, + "learning_rate": 7.2579494496534865e-06, + "loss": 4.3977, + "step": 32300 + }, + { + "epoch": 2.194931376545726, + "grad_norm": 0.46190017461776733, + "learning_rate": 7.257524799565158e-06, + "loss": 4.6395, + "step": 32305 + }, + { + "epoch": 2.195271096616388, + "grad_norm": 0.7360871434211731, + "learning_rate": 7.257100149476831e-06, + "loss": 4.2347, + "step": 32310 + }, + { + "epoch": 2.19561081668705, + "grad_norm": 0.5982373952865601, + "learning_rate": 7.256675499388505e-06, + "loss": 4.1499, + "step": 32315 + }, + { + "epoch": 2.1959505367577115, + "grad_norm": 0.5021691918373108, + "learning_rate": 7.256250849300177e-06, + "loss": 4.2283, + "step": 32320 + }, + { + "epoch": 2.1962902568283735, + "grad_norm": 0.34466752409935, + "learning_rate": 7.25582619921185e-06, + "loss": 4.3972, + "step": 32325 + }, + { + "epoch": 2.196629976899035, + "grad_norm": 0.32863232493400574, + "learning_rate": 7.255401549123523e-06, + "loss": 4.2879, + "step": 32330 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.3697986602783203, + "learning_rate": 7.254976899035195e-06, + "loss": 4.383, + "step": 32335 + }, + { + "epoch": 2.197309417040359, + "grad_norm": 0.41532769799232483, + "learning_rate": 7.254552248946868e-06, + "loss": 4.3776, + "step": 32340 + }, + { + "epoch": 2.1976491371110205, + "grad_norm": 0.3520854413509369, + "learning_rate": 7.254127598858542e-06, + "loss": 4.5585, + "step": 32345 + }, + { + "epoch": 2.197988857181682, + "grad_norm": 0.37879839539527893, + "learning_rate": 7.253702948770214e-06, + "loss": 4.786, + "step": 32350 + }, + { + "epoch": 2.198328577252344, + "grad_norm": 0.4488598704338074, + "learning_rate": 7.253278298681887e-06, + "loss": 4.4923, + "step": 32355 + }, + { + "epoch": 2.198668297323006, + "grad_norm": 0.4056428372859955, + "learning_rate": 7.25285364859356e-06, + "loss": 4.4465, + "step": 32360 + }, + { + "epoch": 2.1990080173936675, + "grad_norm": 0.34273162484169006, + "learning_rate": 7.252428998505232e-06, + "loss": 4.4926, + "step": 32365 + }, + { + "epoch": 2.1993477374643295, + "grad_norm": 0.3709791600704193, + "learning_rate": 7.252004348416906e-06, + "loss": 4.3914, + "step": 32370 + }, + { + "epoch": 2.199687457534991, + "grad_norm": 0.3550542891025543, + "learning_rate": 7.251579698328578e-06, + "loss": 4.3516, + "step": 32375 + }, + { + "epoch": 2.200027177605653, + "grad_norm": 0.47366127371788025, + "learning_rate": 7.2511550482402504e-06, + "loss": 4.4981, + "step": 32380 + }, + { + "epoch": 2.200366897676315, + "grad_norm": 0.4283706545829773, + "learning_rate": 7.250730398151924e-06, + "loss": 4.5622, + "step": 32385 + }, + { + "epoch": 2.2007066177469765, + "grad_norm": 0.3306225836277008, + "learning_rate": 7.250305748063596e-06, + "loss": 4.6298, + "step": 32390 + }, + { + "epoch": 2.201046337817638, + "grad_norm": 0.5398788452148438, + "learning_rate": 7.249881097975269e-06, + "loss": 4.3378, + "step": 32395 + }, + { + "epoch": 2.2013860578883, + "grad_norm": 0.5138960480690002, + "learning_rate": 7.2494564478869425e-06, + "loss": 4.4144, + "step": 32400 + }, + { + "epoch": 2.201725777958962, + "grad_norm": 0.415897011756897, + "learning_rate": 7.2490317977986144e-06, + "loss": 4.5069, + "step": 32405 + }, + { + "epoch": 2.2020654980296235, + "grad_norm": 0.4372958242893219, + "learning_rate": 7.248607147710287e-06, + "loss": 4.2133, + "step": 32410 + }, + { + "epoch": 2.2024052181002856, + "grad_norm": 0.29815933108329773, + "learning_rate": 7.248182497621961e-06, + "loss": 4.3488, + "step": 32415 + }, + { + "epoch": 2.202744938170947, + "grad_norm": 0.6743544340133667, + "learning_rate": 7.247757847533633e-06, + "loss": 4.2742, + "step": 32420 + }, + { + "epoch": 2.203084658241609, + "grad_norm": 0.41248154640197754, + "learning_rate": 7.247333197445306e-06, + "loss": 4.4669, + "step": 32425 + }, + { + "epoch": 2.203424378312271, + "grad_norm": 0.4987228512763977, + "learning_rate": 7.246908547356979e-06, + "loss": 4.4298, + "step": 32430 + }, + { + "epoch": 2.2037640983829325, + "grad_norm": 0.389047771692276, + "learning_rate": 7.246483897268651e-06, + "loss": 4.3009, + "step": 32435 + }, + { + "epoch": 2.204103818453594, + "grad_norm": 0.3833167552947998, + "learning_rate": 7.246059247180324e-06, + "loss": 4.6084, + "step": 32440 + }, + { + "epoch": 2.204443538524256, + "grad_norm": 0.34299665689468384, + "learning_rate": 7.245634597091997e-06, + "loss": 4.4859, + "step": 32445 + }, + { + "epoch": 2.204783258594918, + "grad_norm": 0.4009578227996826, + "learning_rate": 7.24520994700367e-06, + "loss": 4.3587, + "step": 32450 + }, + { + "epoch": 2.2051229786655795, + "grad_norm": 0.31722843647003174, + "learning_rate": 7.2447852969153424e-06, + "loss": 4.5056, + "step": 32455 + }, + { + "epoch": 2.205462698736241, + "grad_norm": 0.36645641922950745, + "learning_rate": 7.244360646827015e-06, + "loss": 4.3432, + "step": 32460 + }, + { + "epoch": 2.205802418806903, + "grad_norm": 0.3294338881969452, + "learning_rate": 7.243935996738688e-06, + "loss": 4.4876, + "step": 32465 + }, + { + "epoch": 2.206142138877565, + "grad_norm": 0.3874713182449341, + "learning_rate": 7.24351134665036e-06, + "loss": 4.3735, + "step": 32470 + }, + { + "epoch": 2.2064818589482265, + "grad_norm": 0.4063383638858795, + "learning_rate": 7.243086696562034e-06, + "loss": 4.5832, + "step": 32475 + }, + { + "epoch": 2.2068215790188885, + "grad_norm": 0.8202680945396423, + "learning_rate": 7.2426620464737064e-06, + "loss": 4.2881, + "step": 32480 + }, + { + "epoch": 2.20716129908955, + "grad_norm": 0.33704298734664917, + "learning_rate": 7.242237396385378e-06, + "loss": 4.5241, + "step": 32485 + }, + { + "epoch": 2.207501019160212, + "grad_norm": 0.3720546066761017, + "learning_rate": 7.241812746297052e-06, + "loss": 4.3182, + "step": 32490 + }, + { + "epoch": 2.207840739230874, + "grad_norm": 0.4039842188358307, + "learning_rate": 7.241388096208725e-06, + "loss": 4.513, + "step": 32495 + }, + { + "epoch": 2.2081804593015355, + "grad_norm": 0.4033445715904236, + "learning_rate": 7.240963446120397e-06, + "loss": 4.3083, + "step": 32500 + }, + { + "epoch": 2.208520179372197, + "grad_norm": 0.354000061750412, + "learning_rate": 7.2405387960320705e-06, + "loss": 4.5137, + "step": 32505 + }, + { + "epoch": 2.208859899442859, + "grad_norm": 0.42095062136650085, + "learning_rate": 7.240114145943743e-06, + "loss": 4.4608, + "step": 32510 + }, + { + "epoch": 2.209199619513521, + "grad_norm": 0.4339427649974823, + "learning_rate": 7.239689495855415e-06, + "loss": 4.5946, + "step": 32515 + }, + { + "epoch": 2.2095393395841825, + "grad_norm": 0.508380651473999, + "learning_rate": 7.239264845767089e-06, + "loss": 4.5228, + "step": 32520 + }, + { + "epoch": 2.2098790596548445, + "grad_norm": 0.3851505517959595, + "learning_rate": 7.238840195678762e-06, + "loss": 4.6282, + "step": 32525 + }, + { + "epoch": 2.210218779725506, + "grad_norm": 0.4076378047466278, + "learning_rate": 7.238415545590434e-06, + "loss": 4.3734, + "step": 32530 + }, + { + "epoch": 2.210558499796168, + "grad_norm": 0.3713880181312561, + "learning_rate": 7.237990895502107e-06, + "loss": 4.3694, + "step": 32535 + }, + { + "epoch": 2.21089821986683, + "grad_norm": 0.3694669008255005, + "learning_rate": 7.237566245413779e-06, + "loss": 4.597, + "step": 32540 + }, + { + "epoch": 2.2112379399374915, + "grad_norm": 0.4635324478149414, + "learning_rate": 7.237141595325452e-06, + "loss": 4.515, + "step": 32545 + }, + { + "epoch": 2.211577660008153, + "grad_norm": 0.3770679831504822, + "learning_rate": 7.236716945237126e-06, + "loss": 4.3869, + "step": 32550 + }, + { + "epoch": 2.211917380078815, + "grad_norm": 0.4096917510032654, + "learning_rate": 7.236292295148798e-06, + "loss": 4.4004, + "step": 32555 + }, + { + "epoch": 2.212257100149477, + "grad_norm": 0.40898820757865906, + "learning_rate": 7.23586764506047e-06, + "loss": 4.2468, + "step": 32560 + }, + { + "epoch": 2.2125968202201385, + "grad_norm": 0.35379013419151306, + "learning_rate": 7.235442994972144e-06, + "loss": 4.41, + "step": 32565 + }, + { + "epoch": 2.2129365402908006, + "grad_norm": 0.4118158519268036, + "learning_rate": 7.235018344883816e-06, + "loss": 4.2902, + "step": 32570 + }, + { + "epoch": 2.213276260361462, + "grad_norm": 0.37395381927490234, + "learning_rate": 7.234593694795489e-06, + "loss": 4.2602, + "step": 32575 + }, + { + "epoch": 2.213615980432124, + "grad_norm": 0.3463687300682068, + "learning_rate": 7.2341690447071625e-06, + "loss": 4.3999, + "step": 32580 + }, + { + "epoch": 2.213955700502786, + "grad_norm": 0.3646765947341919, + "learning_rate": 7.233744394618834e-06, + "loss": 4.9575, + "step": 32585 + }, + { + "epoch": 2.2142954205734475, + "grad_norm": 0.36172059178352356, + "learning_rate": 7.233319744530507e-06, + "loss": 4.1311, + "step": 32590 + }, + { + "epoch": 2.214635140644109, + "grad_norm": 0.4263448715209961, + "learning_rate": 7.232895094442181e-06, + "loss": 4.4747, + "step": 32595 + }, + { + "epoch": 2.2149748607147712, + "grad_norm": 0.41511964797973633, + "learning_rate": 7.232470444353853e-06, + "loss": 4.4844, + "step": 32600 + }, + { + "epoch": 2.215314580785433, + "grad_norm": 0.4188733994960785, + "learning_rate": 7.232130724283192e-06, + "loss": 4.5866, + "step": 32605 + }, + { + "epoch": 2.2156543008560945, + "grad_norm": 0.451450914144516, + "learning_rate": 7.2317060741948645e-06, + "loss": 4.2083, + "step": 32610 + }, + { + "epoch": 2.2159940209267566, + "grad_norm": 0.4436188340187073, + "learning_rate": 7.2312814241065365e-06, + "loss": 4.6348, + "step": 32615 + }, + { + "epoch": 2.216333740997418, + "grad_norm": 0.5304457545280457, + "learning_rate": 7.23085677401821e-06, + "loss": 4.4743, + "step": 32620 + }, + { + "epoch": 2.21667346106808, + "grad_norm": 0.7187544703483582, + "learning_rate": 7.230432123929882e-06, + "loss": 4.8353, + "step": 32625 + }, + { + "epoch": 2.217013181138742, + "grad_norm": 0.4199334383010864, + "learning_rate": 7.230007473841555e-06, + "loss": 4.4007, + "step": 32630 + }, + { + "epoch": 2.2173529012094035, + "grad_norm": 0.48227810859680176, + "learning_rate": 7.2295828237532285e-06, + "loss": 4.3145, + "step": 32635 + }, + { + "epoch": 2.217692621280065, + "grad_norm": 0.36826685070991516, + "learning_rate": 7.2291581736649005e-06, + "loss": 4.4255, + "step": 32640 + }, + { + "epoch": 2.218032341350727, + "grad_norm": 0.4067295491695404, + "learning_rate": 7.228733523576573e-06, + "loss": 4.2905, + "step": 32645 + }, + { + "epoch": 2.218372061421389, + "grad_norm": 0.33910539746284485, + "learning_rate": 7.228308873488247e-06, + "loss": 4.3496, + "step": 32650 + }, + { + "epoch": 2.2187117814920505, + "grad_norm": 0.4979385435581207, + "learning_rate": 7.227884223399919e-06, + "loss": 4.529, + "step": 32655 + }, + { + "epoch": 2.219051501562712, + "grad_norm": 0.4139474630355835, + "learning_rate": 7.227459573311592e-06, + "loss": 4.365, + "step": 32660 + }, + { + "epoch": 2.219391221633374, + "grad_norm": 0.4458146095275879, + "learning_rate": 7.227034923223265e-06, + "loss": 4.0418, + "step": 32665 + }, + { + "epoch": 2.219730941704036, + "grad_norm": 0.4658309817314148, + "learning_rate": 7.226610273134937e-06, + "loss": 4.4479, + "step": 32670 + }, + { + "epoch": 2.2200706617746975, + "grad_norm": 0.40257734060287476, + "learning_rate": 7.22618562304661e-06, + "loss": 4.4663, + "step": 32675 + }, + { + "epoch": 2.2204103818453595, + "grad_norm": 0.37089303135871887, + "learning_rate": 7.225760972958284e-06, + "loss": 4.2102, + "step": 32680 + }, + { + "epoch": 2.220750101916021, + "grad_norm": 0.35033029317855835, + "learning_rate": 7.225336322869956e-06, + "loss": 4.2329, + "step": 32685 + }, + { + "epoch": 2.221089821986683, + "grad_norm": 0.5173636674880981, + "learning_rate": 7.2249116727816285e-06, + "loss": 4.3254, + "step": 32690 + }, + { + "epoch": 2.221429542057345, + "grad_norm": 0.4388141930103302, + "learning_rate": 7.224487022693301e-06, + "loss": 4.3654, + "step": 32695 + }, + { + "epoch": 2.2217692621280065, + "grad_norm": 0.3056028485298157, + "learning_rate": 7.224062372604974e-06, + "loss": 4.4591, + "step": 32700 + }, + { + "epoch": 2.222108982198668, + "grad_norm": 0.2996659278869629, + "learning_rate": 7.223637722516647e-06, + "loss": 4.1178, + "step": 32705 + }, + { + "epoch": 2.2224487022693302, + "grad_norm": 0.2992543578147888, + "learning_rate": 7.22321307242832e-06, + "loss": 4.2125, + "step": 32710 + }, + { + "epoch": 2.222788422339992, + "grad_norm": 0.4559740126132965, + "learning_rate": 7.2227884223399925e-06, + "loss": 4.4988, + "step": 32715 + }, + { + "epoch": 2.2231281424106535, + "grad_norm": 0.48042941093444824, + "learning_rate": 7.2223637722516645e-06, + "loss": 4.4233, + "step": 32720 + }, + { + "epoch": 2.2234678624813156, + "grad_norm": 0.37220731377601624, + "learning_rate": 7.221939122163338e-06, + "loss": 4.2833, + "step": 32725 + }, + { + "epoch": 2.223807582551977, + "grad_norm": 0.48920169472694397, + "learning_rate": 7.221514472075011e-06, + "loss": 4.5444, + "step": 32730 + }, + { + "epoch": 2.224147302622639, + "grad_norm": 0.4213607609272003, + "learning_rate": 7.221089821986683e-06, + "loss": 4.4299, + "step": 32735 + }, + { + "epoch": 2.224487022693301, + "grad_norm": 0.42975425720214844, + "learning_rate": 7.2206651718983565e-06, + "loss": 4.5635, + "step": 32740 + }, + { + "epoch": 2.2248267427639625, + "grad_norm": 0.39701783657073975, + "learning_rate": 7.220240521810029e-06, + "loss": 4.3523, + "step": 32745 + }, + { + "epoch": 2.225166462834624, + "grad_norm": 0.7135312557220459, + "learning_rate": 7.219815871721701e-06, + "loss": 4.2614, + "step": 32750 + }, + { + "epoch": 2.2255061829052862, + "grad_norm": 0.4268359839916229, + "learning_rate": 7.219391221633375e-06, + "loss": 4.514, + "step": 32755 + }, + { + "epoch": 2.225845902975948, + "grad_norm": 0.41583189368247986, + "learning_rate": 7.218966571545048e-06, + "loss": 4.3111, + "step": 32760 + }, + { + "epoch": 2.2261856230466095, + "grad_norm": 0.45745861530303955, + "learning_rate": 7.21854192145672e-06, + "loss": 4.272, + "step": 32765 + }, + { + "epoch": 2.2265253431172716, + "grad_norm": 0.41200610995292664, + "learning_rate": 7.218117271368393e-06, + "loss": 4.3715, + "step": 32770 + }, + { + "epoch": 2.226865063187933, + "grad_norm": 0.42366188764572144, + "learning_rate": 7.217692621280066e-06, + "loss": 4.3673, + "step": 32775 + }, + { + "epoch": 2.227204783258595, + "grad_norm": 0.6677181124687195, + "learning_rate": 7.217267971191738e-06, + "loss": 4.5043, + "step": 32780 + }, + { + "epoch": 2.2275445033292565, + "grad_norm": 0.4110129773616791, + "learning_rate": 7.216843321103412e-06, + "loss": 4.2456, + "step": 32785 + }, + { + "epoch": 2.2278842233999185, + "grad_norm": 0.3598437011241913, + "learning_rate": 7.216418671015084e-06, + "loss": 4.2417, + "step": 32790 + }, + { + "epoch": 2.22822394347058, + "grad_norm": 0.37884703278541565, + "learning_rate": 7.2159940209267565e-06, + "loss": 4.585, + "step": 32795 + }, + { + "epoch": 2.228563663541242, + "grad_norm": 0.36246606707572937, + "learning_rate": 7.21556937083843e-06, + "loss": 4.658, + "step": 32800 + }, + { + "epoch": 2.228903383611904, + "grad_norm": 0.3738260865211487, + "learning_rate": 7.215144720750102e-06, + "loss": 4.3392, + "step": 32805 + }, + { + "epoch": 2.2292431036825655, + "grad_norm": 0.5057758092880249, + "learning_rate": 7.214720070661775e-06, + "loss": 4.3753, + "step": 32810 + }, + { + "epoch": 2.229582823753227, + "grad_norm": 0.4581598937511444, + "learning_rate": 7.2142954205734485e-06, + "loss": 4.5255, + "step": 32815 + }, + { + "epoch": 2.229922543823889, + "grad_norm": 0.4579208493232727, + "learning_rate": 7.2138707704851205e-06, + "loss": 4.4722, + "step": 32820 + }, + { + "epoch": 2.230262263894551, + "grad_norm": 0.39192506670951843, + "learning_rate": 7.213446120396793e-06, + "loss": 4.4344, + "step": 32825 + }, + { + "epoch": 2.2306019839652125, + "grad_norm": 0.4229869246482849, + "learning_rate": 7.213021470308467e-06, + "loss": 4.4702, + "step": 32830 + }, + { + "epoch": 2.2309417040358746, + "grad_norm": 0.4275625944137573, + "learning_rate": 7.212596820220139e-06, + "loss": 4.5293, + "step": 32835 + }, + { + "epoch": 2.231281424106536, + "grad_norm": 0.41662243008613586, + "learning_rate": 7.212172170131812e-06, + "loss": 4.345, + "step": 32840 + }, + { + "epoch": 2.231621144177198, + "grad_norm": 0.4890299141407013, + "learning_rate": 7.211747520043485e-06, + "loss": 4.4102, + "step": 32845 + }, + { + "epoch": 2.23196086424786, + "grad_norm": 0.3633269965648651, + "learning_rate": 7.211322869955157e-06, + "loss": 4.4404, + "step": 32850 + }, + { + "epoch": 2.2323005843185215, + "grad_norm": 0.6761294007301331, + "learning_rate": 7.21089821986683e-06, + "loss": 4.2449, + "step": 32855 + }, + { + "epoch": 2.232640304389183, + "grad_norm": 0.37871935963630676, + "learning_rate": 7.210473569778504e-06, + "loss": 4.1542, + "step": 32860 + }, + { + "epoch": 2.2329800244598452, + "grad_norm": 0.44642165303230286, + "learning_rate": 7.210048919690176e-06, + "loss": 4.4445, + "step": 32865 + }, + { + "epoch": 2.233319744530507, + "grad_norm": 0.3482911288738251, + "learning_rate": 7.2096242696018485e-06, + "loss": 4.5023, + "step": 32870 + }, + { + "epoch": 2.2336594646011685, + "grad_norm": 0.4112599492073059, + "learning_rate": 7.209199619513521e-06, + "loss": 4.541, + "step": 32875 + }, + { + "epoch": 2.2339991846718306, + "grad_norm": 0.3974965214729309, + "learning_rate": 7.208774969425194e-06, + "loss": 4.4717, + "step": 32880 + }, + { + "epoch": 2.234338904742492, + "grad_norm": 0.3928911089897156, + "learning_rate": 7.208350319336867e-06, + "loss": 4.4611, + "step": 32885 + }, + { + "epoch": 2.234678624813154, + "grad_norm": 0.37098759412765503, + "learning_rate": 7.20792566924854e-06, + "loss": 4.4394, + "step": 32890 + }, + { + "epoch": 2.235018344883816, + "grad_norm": 0.503211498260498, + "learning_rate": 7.2075010191602125e-06, + "loss": 4.3748, + "step": 32895 + }, + { + "epoch": 2.2353580649544775, + "grad_norm": 0.4968910813331604, + "learning_rate": 7.207076369071886e-06, + "loss": 4.5068, + "step": 32900 + }, + { + "epoch": 2.235697785025139, + "grad_norm": 0.3437946140766144, + "learning_rate": 7.206651718983558e-06, + "loss": 4.3455, + "step": 32905 + }, + { + "epoch": 2.2360375050958012, + "grad_norm": 0.3313203752040863, + "learning_rate": 7.206227068895231e-06, + "loss": 4.6488, + "step": 32910 + }, + { + "epoch": 2.236377225166463, + "grad_norm": 0.5226248502731323, + "learning_rate": 7.2058024188069045e-06, + "loss": 4.5819, + "step": 32915 + }, + { + "epoch": 2.2367169452371245, + "grad_norm": 0.517860472202301, + "learning_rate": 7.2053777687185765e-06, + "loss": 4.3444, + "step": 32920 + }, + { + "epoch": 2.2370566653077866, + "grad_norm": 0.40597453713417053, + "learning_rate": 7.204953118630249e-06, + "loss": 4.611, + "step": 32925 + }, + { + "epoch": 2.237396385378448, + "grad_norm": 0.3683614730834961, + "learning_rate": 7.204528468541923e-06, + "loss": 4.3865, + "step": 32930 + }, + { + "epoch": 2.23773610544911, + "grad_norm": 0.4226249158382416, + "learning_rate": 7.204103818453595e-06, + "loss": 4.4721, + "step": 32935 + }, + { + "epoch": 2.238075825519772, + "grad_norm": 0.7094238996505737, + "learning_rate": 7.203679168365268e-06, + "loss": 4.1555, + "step": 32940 + }, + { + "epoch": 2.2384155455904335, + "grad_norm": 0.42673444747924805, + "learning_rate": 7.2032545182769405e-06, + "loss": 4.6043, + "step": 32945 + }, + { + "epoch": 2.238755265661095, + "grad_norm": 0.28767305612564087, + "learning_rate": 7.202829868188613e-06, + "loss": 4.405, + "step": 32950 + }, + { + "epoch": 2.2390949857317572, + "grad_norm": 0.4389684200286865, + "learning_rate": 7.202405218100286e-06, + "loss": 4.4101, + "step": 32955 + }, + { + "epoch": 2.239434705802419, + "grad_norm": 0.3431914746761322, + "learning_rate": 7.201980568011959e-06, + "loss": 4.3627, + "step": 32960 + }, + { + "epoch": 2.2397744258730805, + "grad_norm": 0.4418516755104065, + "learning_rate": 7.201555917923632e-06, + "loss": 4.3059, + "step": 32965 + }, + { + "epoch": 2.2401141459437426, + "grad_norm": 0.4231242537498474, + "learning_rate": 7.201131267835304e-06, + "loss": 4.4501, + "step": 32970 + }, + { + "epoch": 2.240453866014404, + "grad_norm": 0.41876503825187683, + "learning_rate": 7.200706617746977e-06, + "loss": 4.4081, + "step": 32975 + }, + { + "epoch": 2.240793586085066, + "grad_norm": 0.48064300417900085, + "learning_rate": 7.20028196765865e-06, + "loss": 4.4672, + "step": 32980 + }, + { + "epoch": 2.2411333061557275, + "grad_norm": 0.5171815752983093, + "learning_rate": 7.199857317570322e-06, + "loss": 4.4028, + "step": 32985 + }, + { + "epoch": 2.2414730262263896, + "grad_norm": 0.4632335603237152, + "learning_rate": 7.199432667481996e-06, + "loss": 4.1946, + "step": 32990 + }, + { + "epoch": 2.241812746297051, + "grad_norm": 0.3512803018093109, + "learning_rate": 7.1990080173936685e-06, + "loss": 4.4435, + "step": 32995 + }, + { + "epoch": 2.242152466367713, + "grad_norm": 0.3922803997993469, + "learning_rate": 7.1985833673053405e-06, + "loss": 4.4544, + "step": 33000 + }, + { + "epoch": 2.242492186438375, + "grad_norm": 0.33638831973075867, + "learning_rate": 7.198158717217014e-06, + "loss": 4.2778, + "step": 33005 + }, + { + "epoch": 2.2428319065090365, + "grad_norm": 0.5159732103347778, + "learning_rate": 7.197734067128687e-06, + "loss": 4.2107, + "step": 33010 + }, + { + "epoch": 2.243171626579698, + "grad_norm": 0.48419490456581116, + "learning_rate": 7.197309417040359e-06, + "loss": 4.7036, + "step": 33015 + }, + { + "epoch": 2.2435113466503602, + "grad_norm": 0.3517955541610718, + "learning_rate": 7.1968847669520325e-06, + "loss": 4.4411, + "step": 33020 + }, + { + "epoch": 2.243851066721022, + "grad_norm": 0.3564264178276062, + "learning_rate": 7.196460116863705e-06, + "loss": 4.4715, + "step": 33025 + }, + { + "epoch": 2.2441907867916835, + "grad_norm": 0.4085005223751068, + "learning_rate": 7.196035466775377e-06, + "loss": 4.5145, + "step": 33030 + }, + { + "epoch": 2.2445305068623456, + "grad_norm": 0.43306320905685425, + "learning_rate": 7.195610816687051e-06, + "loss": 4.5635, + "step": 33035 + }, + { + "epoch": 2.244870226933007, + "grad_norm": 0.36951756477355957, + "learning_rate": 7.195186166598723e-06, + "loss": 4.3587, + "step": 33040 + }, + { + "epoch": 2.245209947003669, + "grad_norm": 0.43169474601745605, + "learning_rate": 7.194761516510396e-06, + "loss": 4.3386, + "step": 33045 + }, + { + "epoch": 2.245549667074331, + "grad_norm": 0.3306077718734741, + "learning_rate": 7.194336866422069e-06, + "loss": 4.7415, + "step": 33050 + }, + { + "epoch": 2.2458893871449925, + "grad_norm": 0.40825116634368896, + "learning_rate": 7.193912216333741e-06, + "loss": 4.3706, + "step": 33055 + }, + { + "epoch": 2.246229107215654, + "grad_norm": 0.41613641381263733, + "learning_rate": 7.193487566245414e-06, + "loss": 4.3899, + "step": 33060 + }, + { + "epoch": 2.2465688272863162, + "grad_norm": 0.6278674602508545, + "learning_rate": 7.193062916157088e-06, + "loss": 4.5435, + "step": 33065 + }, + { + "epoch": 2.246908547356978, + "grad_norm": 0.5792927742004395, + "learning_rate": 7.19263826606876e-06, + "loss": 4.381, + "step": 33070 + }, + { + "epoch": 2.2472482674276395, + "grad_norm": 0.36767280101776123, + "learning_rate": 7.1922136159804325e-06, + "loss": 4.6328, + "step": 33075 + }, + { + "epoch": 2.2475879874983016, + "grad_norm": 0.45170608162879944, + "learning_rate": 7.191788965892106e-06, + "loss": 4.3692, + "step": 33080 + }, + { + "epoch": 2.247927707568963, + "grad_norm": 0.4650638997554779, + "learning_rate": 7.191364315803778e-06, + "loss": 4.4898, + "step": 33085 + }, + { + "epoch": 2.248267427639625, + "grad_norm": 0.342094749212265, + "learning_rate": 7.190939665715451e-06, + "loss": 4.0774, + "step": 33090 + }, + { + "epoch": 2.248607147710287, + "grad_norm": 0.3384876251220703, + "learning_rate": 7.1905150156271245e-06, + "loss": 4.1591, + "step": 33095 + }, + { + "epoch": 2.2489468677809485, + "grad_norm": 0.5121850967407227, + "learning_rate": 7.1900903655387965e-06, + "loss": 4.4714, + "step": 33100 + }, + { + "epoch": 2.24928658785161, + "grad_norm": 0.3066790997982025, + "learning_rate": 7.189665715450469e-06, + "loss": 4.2488, + "step": 33105 + }, + { + "epoch": 2.2496263079222723, + "grad_norm": 0.47255656123161316, + "learning_rate": 7.189241065362143e-06, + "loss": 4.2454, + "step": 33110 + }, + { + "epoch": 2.249966027992934, + "grad_norm": 0.4996696412563324, + "learning_rate": 7.188816415273815e-06, + "loss": 4.3467, + "step": 33115 + }, + { + "epoch": 2.2503057480635955, + "grad_norm": 0.5197917222976685, + "learning_rate": 7.188391765185488e-06, + "loss": 4.4247, + "step": 33120 + }, + { + "epoch": 2.250645468134257, + "grad_norm": 0.40494927763938904, + "learning_rate": 7.1879671150971605e-06, + "loss": 4.56, + "step": 33125 + }, + { + "epoch": 2.250985188204919, + "grad_norm": 0.3567459285259247, + "learning_rate": 7.187542465008833e-06, + "loss": 4.5417, + "step": 33130 + }, + { + "epoch": 2.251324908275581, + "grad_norm": 0.4049461781978607, + "learning_rate": 7.187117814920505e-06, + "loss": 4.6451, + "step": 33135 + }, + { + "epoch": 2.2516646283462425, + "grad_norm": 0.602989673614502, + "learning_rate": 7.186693164832179e-06, + "loss": 4.5404, + "step": 33140 + }, + { + "epoch": 2.2520043484169046, + "grad_norm": 0.5790131688117981, + "learning_rate": 7.186268514743852e-06, + "loss": 4.3112, + "step": 33145 + }, + { + "epoch": 2.252344068487566, + "grad_norm": 0.448927640914917, + "learning_rate": 7.185843864655524e-06, + "loss": 4.3511, + "step": 33150 + }, + { + "epoch": 2.252683788558228, + "grad_norm": 0.48438790440559387, + "learning_rate": 7.185419214567197e-06, + "loss": 4.5209, + "step": 33155 + }, + { + "epoch": 2.25302350862889, + "grad_norm": 0.7789647579193115, + "learning_rate": 7.18499456447887e-06, + "loss": 4.4275, + "step": 33160 + }, + { + "epoch": 2.2533632286995515, + "grad_norm": 0.41804736852645874, + "learning_rate": 7.184569914390542e-06, + "loss": 4.6495, + "step": 33165 + }, + { + "epoch": 2.253702948770213, + "grad_norm": 0.5930398106575012, + "learning_rate": 7.184145264302216e-06, + "loss": 4.3138, + "step": 33170 + }, + { + "epoch": 2.2540426688408752, + "grad_norm": 0.35266175866127014, + "learning_rate": 7.1837206142138885e-06, + "loss": 4.3661, + "step": 33175 + }, + { + "epoch": 2.254382388911537, + "grad_norm": 0.40399590134620667, + "learning_rate": 7.1832959641255604e-06, + "loss": 4.2326, + "step": 33180 + }, + { + "epoch": 2.2547221089821985, + "grad_norm": 0.3260570466518402, + "learning_rate": 7.182871314037234e-06, + "loss": 4.4363, + "step": 33185 + }, + { + "epoch": 2.2550618290528606, + "grad_norm": 0.5208240151405334, + "learning_rate": 7.182446663948907e-06, + "loss": 4.4662, + "step": 33190 + }, + { + "epoch": 2.255401549123522, + "grad_norm": 0.393411248922348, + "learning_rate": 7.182022013860579e-06, + "loss": 4.56, + "step": 33195 + }, + { + "epoch": 2.255741269194184, + "grad_norm": 0.38632136583328247, + "learning_rate": 7.1815973637722525e-06, + "loss": 4.3072, + "step": 33200 + }, + { + "epoch": 2.256080989264846, + "grad_norm": 0.3440764248371124, + "learning_rate": 7.181172713683925e-06, + "loss": 4.6655, + "step": 33205 + }, + { + "epoch": 2.2564207093355075, + "grad_norm": 0.4785524606704712, + "learning_rate": 7.180748063595597e-06, + "loss": 4.4695, + "step": 33210 + }, + { + "epoch": 2.256760429406169, + "grad_norm": 0.5240752100944519, + "learning_rate": 7.180323413507271e-06, + "loss": 4.3834, + "step": 33215 + }, + { + "epoch": 2.2571001494768312, + "grad_norm": 0.3842525780200958, + "learning_rate": 7.179898763418943e-06, + "loss": 4.44, + "step": 33220 + }, + { + "epoch": 2.257439869547493, + "grad_norm": 0.604819655418396, + "learning_rate": 7.179474113330616e-06, + "loss": 4.2504, + "step": 33225 + }, + { + "epoch": 2.2577795896181545, + "grad_norm": 0.37720054388046265, + "learning_rate": 7.179049463242289e-06, + "loss": 4.2597, + "step": 33230 + }, + { + "epoch": 2.2581193096888166, + "grad_norm": 0.30733221769332886, + "learning_rate": 7.178624813153961e-06, + "loss": 4.3949, + "step": 33235 + }, + { + "epoch": 2.258459029759478, + "grad_norm": 0.40436258912086487, + "learning_rate": 7.178200163065635e-06, + "loss": 4.4297, + "step": 33240 + }, + { + "epoch": 2.25879874983014, + "grad_norm": 0.5420289039611816, + "learning_rate": 7.177775512977308e-06, + "loss": 4.3582, + "step": 33245 + }, + { + "epoch": 2.259138469900802, + "grad_norm": 0.4603147506713867, + "learning_rate": 7.17735086288898e-06, + "loss": 4.4142, + "step": 33250 + }, + { + "epoch": 2.2594781899714635, + "grad_norm": 0.4361923336982727, + "learning_rate": 7.176926212800653e-06, + "loss": 4.3417, + "step": 33255 + }, + { + "epoch": 2.259817910042125, + "grad_norm": 0.29620370268821716, + "learning_rate": 7.176501562712326e-06, + "loss": 4.3416, + "step": 33260 + }, + { + "epoch": 2.2601576301127873, + "grad_norm": 0.648867666721344, + "learning_rate": 7.176076912623998e-06, + "loss": 4.8496, + "step": 33265 + }, + { + "epoch": 2.260497350183449, + "grad_norm": 0.40837305784225464, + "learning_rate": 7.175652262535672e-06, + "loss": 4.2401, + "step": 33270 + }, + { + "epoch": 2.2608370702541105, + "grad_norm": 0.8358386158943176, + "learning_rate": 7.1752276124473445e-06, + "loss": 4.448, + "step": 33275 + }, + { + "epoch": 2.2611767903247726, + "grad_norm": 0.38066381216049194, + "learning_rate": 7.1748029623590165e-06, + "loss": 4.3344, + "step": 33280 + }, + { + "epoch": 2.261516510395434, + "grad_norm": 0.638357937335968, + "learning_rate": 7.17437831227069e-06, + "loss": 4.4295, + "step": 33285 + }, + { + "epoch": 2.261856230466096, + "grad_norm": 0.3715384006500244, + "learning_rate": 7.173953662182362e-06, + "loss": 4.5088, + "step": 33290 + }, + { + "epoch": 2.262195950536758, + "grad_norm": 0.7866990566253662, + "learning_rate": 7.173529012094035e-06, + "loss": 4.3628, + "step": 33295 + }, + { + "epoch": 2.2625356706074196, + "grad_norm": 0.36633291840553284, + "learning_rate": 7.1731043620057085e-06, + "loss": 4.4153, + "step": 33300 + }, + { + "epoch": 2.262875390678081, + "grad_norm": 0.6137489676475525, + "learning_rate": 7.1726797119173805e-06, + "loss": 4.4153, + "step": 33305 + }, + { + "epoch": 2.2632151107487433, + "grad_norm": 0.42380791902542114, + "learning_rate": 7.172255061829053e-06, + "loss": 4.3639, + "step": 33310 + }, + { + "epoch": 2.263554830819405, + "grad_norm": 0.300591379404068, + "learning_rate": 7.171830411740727e-06, + "loss": 4.5492, + "step": 33315 + }, + { + "epoch": 2.2638945508900665, + "grad_norm": 0.3375454246997833, + "learning_rate": 7.171405761652399e-06, + "loss": 4.3466, + "step": 33320 + }, + { + "epoch": 2.2642342709607286, + "grad_norm": 0.4233975410461426, + "learning_rate": 7.170981111564072e-06, + "loss": 4.2459, + "step": 33325 + }, + { + "epoch": 2.2645739910313902, + "grad_norm": 0.5022677779197693, + "learning_rate": 7.170556461475745e-06, + "loss": 4.3015, + "step": 33330 + }, + { + "epoch": 2.264913711102052, + "grad_norm": 0.34255102276802063, + "learning_rate": 7.170131811387417e-06, + "loss": 4.4196, + "step": 33335 + }, + { + "epoch": 2.2652534311727135, + "grad_norm": 0.3450728952884674, + "learning_rate": 7.16970716129909e-06, + "loss": 4.3961, + "step": 33340 + }, + { + "epoch": 2.2655931512433756, + "grad_norm": 0.42801433801651, + "learning_rate": 7.169282511210764e-06, + "loss": 4.2813, + "step": 33345 + }, + { + "epoch": 2.265932871314037, + "grad_norm": 0.34706321358680725, + "learning_rate": 7.168857861122436e-06, + "loss": 4.3695, + "step": 33350 + }, + { + "epoch": 2.266272591384699, + "grad_norm": 0.4737650454044342, + "learning_rate": 7.1684332110341085e-06, + "loss": 4.6462, + "step": 33355 + }, + { + "epoch": 2.266612311455361, + "grad_norm": 0.33771830797195435, + "learning_rate": 7.168008560945781e-06, + "loss": 4.5793, + "step": 33360 + }, + { + "epoch": 2.2669520315260225, + "grad_norm": 0.4183183014392853, + "learning_rate": 7.167583910857454e-06, + "loss": 4.3871, + "step": 33365 + }, + { + "epoch": 2.267291751596684, + "grad_norm": 0.32711532711982727, + "learning_rate": 7.167159260769127e-06, + "loss": 4.4193, + "step": 33370 + }, + { + "epoch": 2.2676314716673462, + "grad_norm": 0.3450183570384979, + "learning_rate": 7.1667346106808e-06, + "loss": 4.5214, + "step": 33375 + }, + { + "epoch": 2.267971191738008, + "grad_norm": 0.33487921953201294, + "learning_rate": 7.1663099605924725e-06, + "loss": 4.5603, + "step": 33380 + }, + { + "epoch": 2.2683109118086695, + "grad_norm": 0.3392952084541321, + "learning_rate": 7.165885310504144e-06, + "loss": 3.9805, + "step": 33385 + }, + { + "epoch": 2.2686506318793316, + "grad_norm": 0.366892009973526, + "learning_rate": 7.165460660415818e-06, + "loss": 4.3455, + "step": 33390 + }, + { + "epoch": 2.268990351949993, + "grad_norm": 0.3823283016681671, + "learning_rate": 7.165036010327491e-06, + "loss": 4.2372, + "step": 33395 + }, + { + "epoch": 2.269330072020655, + "grad_norm": 0.46551698446273804, + "learning_rate": 7.164611360239163e-06, + "loss": 4.16, + "step": 33400 + }, + { + "epoch": 2.269669792091317, + "grad_norm": 0.4351184666156769, + "learning_rate": 7.1641867101508365e-06, + "loss": 4.2816, + "step": 33405 + }, + { + "epoch": 2.2700095121619785, + "grad_norm": 0.44224271178245544, + "learning_rate": 7.163762060062509e-06, + "loss": 4.4183, + "step": 33410 + }, + { + "epoch": 2.27034923223264, + "grad_norm": 0.3500521779060364, + "learning_rate": 7.163337409974181e-06, + "loss": 4.4942, + "step": 33415 + }, + { + "epoch": 2.2706889523033023, + "grad_norm": 0.39698484539985657, + "learning_rate": 7.162912759885855e-06, + "loss": 4.6194, + "step": 33420 + }, + { + "epoch": 2.271028672373964, + "grad_norm": 0.3945077955722809, + "learning_rate": 7.162488109797528e-06, + "loss": 4.1845, + "step": 33425 + }, + { + "epoch": 2.2713683924446255, + "grad_norm": 0.37782275676727295, + "learning_rate": 7.1620634597092e-06, + "loss": 4.1668, + "step": 33430 + }, + { + "epoch": 2.2717081125152876, + "grad_norm": 0.30212196707725525, + "learning_rate": 7.161638809620873e-06, + "loss": 4.4034, + "step": 33435 + }, + { + "epoch": 2.2720478325859492, + "grad_norm": 0.3328685164451599, + "learning_rate": 7.161214159532546e-06, + "loss": 4.2315, + "step": 33440 + }, + { + "epoch": 2.272387552656611, + "grad_norm": 0.34012505412101746, + "learning_rate": 7.160789509444218e-06, + "loss": 4.2181, + "step": 33445 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.4452834129333496, + "learning_rate": 7.160364859355892e-06, + "loss": 4.2685, + "step": 33450 + }, + { + "epoch": 2.2730669927979346, + "grad_norm": 0.4987432062625885, + "learning_rate": 7.1599402092675645e-06, + "loss": 4.4679, + "step": 33455 + }, + { + "epoch": 2.273406712868596, + "grad_norm": 0.6600987911224365, + "learning_rate": 7.1595155591792364e-06, + "loss": 4.3218, + "step": 33460 + }, + { + "epoch": 2.273746432939258, + "grad_norm": 0.7119587659835815, + "learning_rate": 7.15909090909091e-06, + "loss": 4.3197, + "step": 33465 + }, + { + "epoch": 2.27408615300992, + "grad_norm": 0.5339460968971252, + "learning_rate": 7.158666259002582e-06, + "loss": 4.4691, + "step": 33470 + }, + { + "epoch": 2.2744258730805815, + "grad_norm": 0.4063212275505066, + "learning_rate": 7.158241608914255e-06, + "loss": 4.4529, + "step": 33475 + }, + { + "epoch": 2.274765593151243, + "grad_norm": 0.5760005116462708, + "learning_rate": 7.1578169588259285e-06, + "loss": 4.4424, + "step": 33480 + }, + { + "epoch": 2.2751053132219052, + "grad_norm": 0.36425551772117615, + "learning_rate": 7.1573923087376004e-06, + "loss": 4.2686, + "step": 33485 + }, + { + "epoch": 2.275445033292567, + "grad_norm": 0.5163416862487793, + "learning_rate": 7.156967658649273e-06, + "loss": 4.4066, + "step": 33490 + }, + { + "epoch": 2.2757847533632285, + "grad_norm": 0.37795671820640564, + "learning_rate": 7.156543008560947e-06, + "loss": 4.377, + "step": 33495 + }, + { + "epoch": 2.2761244734338906, + "grad_norm": 0.4474274516105652, + "learning_rate": 7.156118358472619e-06, + "loss": 4.6178, + "step": 33500 + }, + { + "epoch": 2.276464193504552, + "grad_norm": 0.35046613216400146, + "learning_rate": 7.155693708384292e-06, + "loss": 4.4542, + "step": 33505 + }, + { + "epoch": 2.276803913575214, + "grad_norm": 0.4555718004703522, + "learning_rate": 7.155269058295965e-06, + "loss": 4.3767, + "step": 33510 + }, + { + "epoch": 2.277143633645876, + "grad_norm": 0.3633488118648529, + "learning_rate": 7.154844408207637e-06, + "loss": 4.3266, + "step": 33515 + }, + { + "epoch": 2.2774833537165375, + "grad_norm": 0.4696308672428131, + "learning_rate": 7.15441975811931e-06, + "loss": 4.2567, + "step": 33520 + }, + { + "epoch": 2.277823073787199, + "grad_norm": 0.3982876241207123, + "learning_rate": 7.153995108030984e-06, + "loss": 4.253, + "step": 33525 + }, + { + "epoch": 2.2781627938578612, + "grad_norm": 0.4149719774723053, + "learning_rate": 7.153570457942656e-06, + "loss": 4.3607, + "step": 33530 + }, + { + "epoch": 2.278502513928523, + "grad_norm": 0.3638160526752472, + "learning_rate": 7.1531458078543284e-06, + "loss": 4.392, + "step": 33535 + }, + { + "epoch": 2.2788422339991845, + "grad_norm": 0.5365212559700012, + "learning_rate": 7.152721157766001e-06, + "loss": 4.5832, + "step": 33540 + }, + { + "epoch": 2.2791819540698466, + "grad_norm": 0.5103355646133423, + "learning_rate": 7.152296507677674e-06, + "loss": 4.2529, + "step": 33545 + }, + { + "epoch": 2.279521674140508, + "grad_norm": 0.3833523094654083, + "learning_rate": 7.151871857589347e-06, + "loss": 4.3313, + "step": 33550 + }, + { + "epoch": 2.27986139421117, + "grad_norm": 0.7080106139183044, + "learning_rate": 7.15144720750102e-06, + "loss": 4.3995, + "step": 33555 + }, + { + "epoch": 2.280201114281832, + "grad_norm": 0.5381726026535034, + "learning_rate": 7.1510225574126924e-06, + "loss": 4.238, + "step": 33560 + }, + { + "epoch": 2.2805408343524936, + "grad_norm": 0.4983236491680145, + "learning_rate": 7.150597907324364e-06, + "loss": 4.3433, + "step": 33565 + }, + { + "epoch": 2.280880554423155, + "grad_norm": 0.4149067997932434, + "learning_rate": 7.150173257236038e-06, + "loss": 4.2408, + "step": 33570 + }, + { + "epoch": 2.2812202744938173, + "grad_norm": 0.3549051582813263, + "learning_rate": 7.149748607147711e-06, + "loss": 4.4017, + "step": 33575 + }, + { + "epoch": 2.281559994564479, + "grad_norm": 0.4590204954147339, + "learning_rate": 7.1493239570593845e-06, + "loss": 4.3815, + "step": 33580 + }, + { + "epoch": 2.2818997146351405, + "grad_norm": 0.3983209431171417, + "learning_rate": 7.1488993069710565e-06, + "loss": 4.5177, + "step": 33585 + }, + { + "epoch": 2.2822394347058026, + "grad_norm": 0.42768171429634094, + "learning_rate": 7.148474656882729e-06, + "loss": 4.3016, + "step": 33590 + }, + { + "epoch": 2.2825791547764642, + "grad_norm": 0.7063786387443542, + "learning_rate": 7.148050006794403e-06, + "loss": 4.3088, + "step": 33595 + }, + { + "epoch": 2.282918874847126, + "grad_norm": 0.44477298855781555, + "learning_rate": 7.147625356706075e-06, + "loss": 4.316, + "step": 33600 + }, + { + "epoch": 2.283258594917788, + "grad_norm": 0.5871889591217041, + "learning_rate": 7.147200706617748e-06, + "loss": 4.4513, + "step": 33605 + }, + { + "epoch": 2.2835983149884496, + "grad_norm": 0.42215681076049805, + "learning_rate": 7.1467760565294205e-06, + "loss": 4.5734, + "step": 33610 + }, + { + "epoch": 2.283938035059111, + "grad_norm": 0.40783968567848206, + "learning_rate": 7.146351406441093e-06, + "loss": 4.1916, + "step": 33615 + }, + { + "epoch": 2.2842777551297733, + "grad_norm": 0.4759994149208069, + "learning_rate": 7.145926756352766e-06, + "loss": 4.3105, + "step": 33620 + }, + { + "epoch": 2.284617475200435, + "grad_norm": 0.37363579869270325, + "learning_rate": 7.145502106264439e-06, + "loss": 4.4263, + "step": 33625 + }, + { + "epoch": 2.2849571952710965, + "grad_norm": 0.6295708417892456, + "learning_rate": 7.145077456176112e-06, + "loss": 4.4137, + "step": 33630 + }, + { + "epoch": 2.2852969153417586, + "grad_norm": 0.48134276270866394, + "learning_rate": 7.144652806087784e-06, + "loss": 4.3526, + "step": 33635 + }, + { + "epoch": 2.2856366354124202, + "grad_norm": 0.37696337699890137, + "learning_rate": 7.144228155999457e-06, + "loss": 4.5261, + "step": 33640 + }, + { + "epoch": 2.285976355483082, + "grad_norm": 0.5764722228050232, + "learning_rate": 7.14380350591113e-06, + "loss": 4.5097, + "step": 33645 + }, + { + "epoch": 2.286316075553744, + "grad_norm": 0.34308886528015137, + "learning_rate": 7.143378855822802e-06, + "loss": 4.214, + "step": 33650 + }, + { + "epoch": 2.2866557956244056, + "grad_norm": 0.6638505458831787, + "learning_rate": 7.142954205734476e-06, + "loss": 4.5409, + "step": 33655 + }, + { + "epoch": 2.286995515695067, + "grad_norm": 0.3736531138420105, + "learning_rate": 7.1425295556461485e-06, + "loss": 4.2682, + "step": 33660 + }, + { + "epoch": 2.2873352357657293, + "grad_norm": 0.2698637545108795, + "learning_rate": 7.14210490555782e-06, + "loss": 4.3646, + "step": 33665 + }, + { + "epoch": 2.287674955836391, + "grad_norm": 0.3930017054080963, + "learning_rate": 7.141680255469494e-06, + "loss": 4.2483, + "step": 33670 + }, + { + "epoch": 2.2880146759070525, + "grad_norm": 0.44221076369285583, + "learning_rate": 7.141255605381167e-06, + "loss": 4.1508, + "step": 33675 + }, + { + "epoch": 2.288354395977714, + "grad_norm": 0.7043525576591492, + "learning_rate": 7.140830955292839e-06, + "loss": 4.2896, + "step": 33680 + }, + { + "epoch": 2.2886941160483762, + "grad_norm": 0.3910509943962097, + "learning_rate": 7.1404063052045125e-06, + "loss": 4.5471, + "step": 33685 + }, + { + "epoch": 2.289033836119038, + "grad_norm": 0.39486706256866455, + "learning_rate": 7.139981655116185e-06, + "loss": 4.3805, + "step": 33690 + }, + { + "epoch": 2.2893735561896995, + "grad_norm": 0.3982042372226715, + "learning_rate": 7.139557005027857e-06, + "loss": 4.4237, + "step": 33695 + }, + { + "epoch": 2.2897132762603616, + "grad_norm": 0.43772315979003906, + "learning_rate": 7.139132354939531e-06, + "loss": 4.423, + "step": 33700 + }, + { + "epoch": 2.290052996331023, + "grad_norm": 0.2840714156627655, + "learning_rate": 7.138707704851203e-06, + "loss": 4.5274, + "step": 33705 + }, + { + "epoch": 2.290392716401685, + "grad_norm": 0.6183924674987793, + "learning_rate": 7.138283054762876e-06, + "loss": 4.3577, + "step": 33710 + }, + { + "epoch": 2.290732436472347, + "grad_norm": 0.42771589756011963, + "learning_rate": 7.137858404674549e-06, + "loss": 4.2178, + "step": 33715 + }, + { + "epoch": 2.2910721565430086, + "grad_norm": 0.621692955493927, + "learning_rate": 7.137433754586221e-06, + "loss": 4.1998, + "step": 33720 + }, + { + "epoch": 2.29141187661367, + "grad_norm": 0.35099154710769653, + "learning_rate": 7.137009104497894e-06, + "loss": 4.2072, + "step": 33725 + }, + { + "epoch": 2.2917515966843323, + "grad_norm": 0.5455004572868347, + "learning_rate": 7.136584454409568e-06, + "loss": 4.5473, + "step": 33730 + }, + { + "epoch": 2.292091316754994, + "grad_norm": 0.5017917156219482, + "learning_rate": 7.13615980432124e-06, + "loss": 4.4661, + "step": 33735 + }, + { + "epoch": 2.2924310368256555, + "grad_norm": 0.44305941462516785, + "learning_rate": 7.1357351542329124e-06, + "loss": 4.2336, + "step": 33740 + }, + { + "epoch": 2.2927707568963176, + "grad_norm": 0.3239680826663971, + "learning_rate": 7.135310504144586e-06, + "loss": 4.2208, + "step": 33745 + }, + { + "epoch": 2.2931104769669792, + "grad_norm": 0.3638860583305359, + "learning_rate": 7.134885854056258e-06, + "loss": 4.4134, + "step": 33750 + }, + { + "epoch": 2.293450197037641, + "grad_norm": 0.4999406933784485, + "learning_rate": 7.134461203967931e-06, + "loss": 4.5178, + "step": 33755 + }, + { + "epoch": 2.293789917108303, + "grad_norm": 0.4073525667190552, + "learning_rate": 7.1340365538796045e-06, + "loss": 4.146, + "step": 33760 + }, + { + "epoch": 2.2941296371789646, + "grad_norm": 0.3342823386192322, + "learning_rate": 7.1336119037912764e-06, + "loss": 4.3632, + "step": 33765 + }, + { + "epoch": 2.294469357249626, + "grad_norm": 0.6347739100456238, + "learning_rate": 7.133187253702949e-06, + "loss": 4.4359, + "step": 33770 + }, + { + "epoch": 2.2948090773202883, + "grad_norm": 0.3871743083000183, + "learning_rate": 7.132762603614623e-06, + "loss": 4.2382, + "step": 33775 + }, + { + "epoch": 2.29514879739095, + "grad_norm": 0.5166870951652527, + "learning_rate": 7.132337953526295e-06, + "loss": 4.2778, + "step": 33780 + }, + { + "epoch": 2.2954885174616115, + "grad_norm": 0.5923765897750854, + "learning_rate": 7.131913303437968e-06, + "loss": 4.418, + "step": 33785 + }, + { + "epoch": 2.295828237532273, + "grad_norm": 0.4617008864879608, + "learning_rate": 7.1314886533496404e-06, + "loss": 4.628, + "step": 33790 + }, + { + "epoch": 2.2961679576029352, + "grad_norm": 0.5514224171638489, + "learning_rate": 7.131064003261313e-06, + "loss": 4.5543, + "step": 33795 + }, + { + "epoch": 2.296507677673597, + "grad_norm": 0.7923144102096558, + "learning_rate": 7.130639353172985e-06, + "loss": 4.3863, + "step": 33800 + }, + { + "epoch": 2.2968473977442585, + "grad_norm": 0.5227079391479492, + "learning_rate": 7.130214703084659e-06, + "loss": 4.5133, + "step": 33805 + }, + { + "epoch": 2.2971871178149206, + "grad_norm": 0.36155733466148376, + "learning_rate": 7.129790052996332e-06, + "loss": 4.2826, + "step": 33810 + }, + { + "epoch": 2.297526837885582, + "grad_norm": 0.3238295614719391, + "learning_rate": 7.129365402908004e-06, + "loss": 4.3787, + "step": 33815 + }, + { + "epoch": 2.297866557956244, + "grad_norm": 0.47541147470474243, + "learning_rate": 7.128940752819677e-06, + "loss": 4.3057, + "step": 33820 + }, + { + "epoch": 2.298206278026906, + "grad_norm": 0.6469614505767822, + "learning_rate": 7.12851610273135e-06, + "loss": 4.123, + "step": 33825 + }, + { + "epoch": 2.2985459980975675, + "grad_norm": 0.4758502244949341, + "learning_rate": 7.128091452643022e-06, + "loss": 4.466, + "step": 33830 + }, + { + "epoch": 2.298885718168229, + "grad_norm": 0.4647830128669739, + "learning_rate": 7.127666802554696e-06, + "loss": 4.3082, + "step": 33835 + }, + { + "epoch": 2.2992254382388913, + "grad_norm": 0.3933093249797821, + "learning_rate": 7.1272421524663684e-06, + "loss": 4.3169, + "step": 33840 + }, + { + "epoch": 2.299565158309553, + "grad_norm": 0.4585284888744354, + "learning_rate": 7.12681750237804e-06, + "loss": 4.5653, + "step": 33845 + }, + { + "epoch": 2.2999048783802145, + "grad_norm": 0.6442696452140808, + "learning_rate": 7.126392852289714e-06, + "loss": 4.3803, + "step": 33850 + }, + { + "epoch": 2.3002445984508766, + "grad_norm": 0.3582524061203003, + "learning_rate": 7.125968202201387e-06, + "loss": 4.4216, + "step": 33855 + }, + { + "epoch": 2.300584318521538, + "grad_norm": 0.4673219621181488, + "learning_rate": 7.125543552113059e-06, + "loss": 3.9908, + "step": 33860 + }, + { + "epoch": 2.3009240385922, + "grad_norm": 0.36480283737182617, + "learning_rate": 7.1251189020247324e-06, + "loss": 4.3183, + "step": 33865 + }, + { + "epoch": 2.301263758662862, + "grad_norm": 0.41117653250694275, + "learning_rate": 7.124694251936405e-06, + "loss": 4.4648, + "step": 33870 + }, + { + "epoch": 2.3016034787335236, + "grad_norm": 0.34548649191856384, + "learning_rate": 7.124269601848077e-06, + "loss": 4.122, + "step": 33875 + }, + { + "epoch": 2.301943198804185, + "grad_norm": 0.7932968735694885, + "learning_rate": 7.123844951759751e-06, + "loss": 4.2916, + "step": 33880 + }, + { + "epoch": 2.3022829188748473, + "grad_norm": 0.34984201192855835, + "learning_rate": 7.123420301671423e-06, + "loss": 4.4247, + "step": 33885 + }, + { + "epoch": 2.302622638945509, + "grad_norm": 0.3564962148666382, + "learning_rate": 7.122995651583096e-06, + "loss": 4.3409, + "step": 33890 + }, + { + "epoch": 2.3029623590161705, + "grad_norm": 0.3383583724498749, + "learning_rate": 7.122571001494769e-06, + "loss": 4.2446, + "step": 33895 + }, + { + "epoch": 2.3033020790868326, + "grad_norm": 0.49683335423469543, + "learning_rate": 7.122146351406441e-06, + "loss": 4.2747, + "step": 33900 + }, + { + "epoch": 2.3036417991574942, + "grad_norm": 0.477236807346344, + "learning_rate": 7.121721701318114e-06, + "loss": 4.2667, + "step": 33905 + }, + { + "epoch": 2.303981519228156, + "grad_norm": 0.3521973788738251, + "learning_rate": 7.121297051229788e-06, + "loss": 4.2674, + "step": 33910 + }, + { + "epoch": 2.304321239298818, + "grad_norm": 0.5454042553901672, + "learning_rate": 7.12087240114146e-06, + "loss": 4.4786, + "step": 33915 + }, + { + "epoch": 2.3046609593694796, + "grad_norm": 0.4678252041339874, + "learning_rate": 7.120447751053133e-06, + "loss": 4.4169, + "step": 33920 + }, + { + "epoch": 2.305000679440141, + "grad_norm": 0.5776580572128296, + "learning_rate": 7.120023100964806e-06, + "loss": 4.4278, + "step": 33925 + }, + { + "epoch": 2.3053403995108033, + "grad_norm": 0.44096821546554565, + "learning_rate": 7.119598450876478e-06, + "loss": 4.3219, + "step": 33930 + }, + { + "epoch": 2.305680119581465, + "grad_norm": 0.43693873286247253, + "learning_rate": 7.119173800788152e-06, + "loss": 4.3622, + "step": 33935 + }, + { + "epoch": 2.3060198396521265, + "grad_norm": 0.3745706081390381, + "learning_rate": 7.1187491506998245e-06, + "loss": 4.1486, + "step": 33940 + }, + { + "epoch": 2.3063595597227886, + "grad_norm": 0.44974932074546814, + "learning_rate": 7.118324500611496e-06, + "loss": 4.3926, + "step": 33945 + }, + { + "epoch": 2.3066992797934502, + "grad_norm": 0.3735632002353668, + "learning_rate": 7.11789985052317e-06, + "loss": 4.3599, + "step": 33950 + }, + { + "epoch": 2.307038999864112, + "grad_norm": 0.3500899076461792, + "learning_rate": 7.117475200434842e-06, + "loss": 4.1653, + "step": 33955 + }, + { + "epoch": 2.307378719934774, + "grad_norm": 0.6138634085655212, + "learning_rate": 7.117050550346515e-06, + "loss": 4.3763, + "step": 33960 + }, + { + "epoch": 2.3077184400054356, + "grad_norm": 0.3681022822856903, + "learning_rate": 7.1166259002581885e-06, + "loss": 4.2728, + "step": 33965 + }, + { + "epoch": 2.308058160076097, + "grad_norm": 0.696733832359314, + "learning_rate": 7.11620125016986e-06, + "loss": 4.1395, + "step": 33970 + }, + { + "epoch": 2.3083978801467593, + "grad_norm": 0.3098507523536682, + "learning_rate": 7.115776600081533e-06, + "loss": 4.331, + "step": 33975 + }, + { + "epoch": 2.308737600217421, + "grad_norm": 0.40174365043640137, + "learning_rate": 7.115351949993207e-06, + "loss": 4.3294, + "step": 33980 + }, + { + "epoch": 2.3090773202880825, + "grad_norm": 0.4463421106338501, + "learning_rate": 7.114927299904879e-06, + "loss": 4.4688, + "step": 33985 + }, + { + "epoch": 2.3094170403587446, + "grad_norm": 0.46643713116645813, + "learning_rate": 7.114502649816552e-06, + "loss": 4.4023, + "step": 33990 + }, + { + "epoch": 2.3097567604294063, + "grad_norm": 1.0403728485107422, + "learning_rate": 7.114077999728225e-06, + "loss": 4.616, + "step": 33995 + }, + { + "epoch": 2.310096480500068, + "grad_norm": 0.7004113793373108, + "learning_rate": 7.113653349639897e-06, + "loss": 4.5362, + "step": 34000 + }, + { + "epoch": 2.31043620057073, + "grad_norm": 1.4111257791519165, + "learning_rate": 7.11322869955157e-06, + "loss": 4.2638, + "step": 34005 + }, + { + "epoch": 2.3107759206413916, + "grad_norm": 0.44009295105934143, + "learning_rate": 7.112804049463244e-06, + "loss": 4.4432, + "step": 34010 + }, + { + "epoch": 2.311115640712053, + "grad_norm": 0.47537410259246826, + "learning_rate": 7.112379399374916e-06, + "loss": 4.2682, + "step": 34015 + }, + { + "epoch": 2.311455360782715, + "grad_norm": 0.3539891242980957, + "learning_rate": 7.1119547492865884e-06, + "loss": 4.3422, + "step": 34020 + }, + { + "epoch": 2.311795080853377, + "grad_norm": 0.457186222076416, + "learning_rate": 7.111530099198261e-06, + "loss": 4.3243, + "step": 34025 + }, + { + "epoch": 2.3121348009240386, + "grad_norm": 0.35157036781311035, + "learning_rate": 7.111105449109934e-06, + "loss": 4.1802, + "step": 34030 + }, + { + "epoch": 2.3124745209947, + "grad_norm": 0.6110732555389404, + "learning_rate": 7.110680799021607e-06, + "loss": 4.2313, + "step": 34035 + }, + { + "epoch": 2.3128142410653623, + "grad_norm": 0.33452850580215454, + "learning_rate": 7.11025614893328e-06, + "loss": 4.3743, + "step": 34040 + }, + { + "epoch": 2.313153961136024, + "grad_norm": 0.5602484941482544, + "learning_rate": 7.1098314988449524e-06, + "loss": 4.5311, + "step": 34045 + }, + { + "epoch": 2.3134936812066855, + "grad_norm": 0.39967235922813416, + "learning_rate": 7.109406848756624e-06, + "loss": 4.2893, + "step": 34050 + }, + { + "epoch": 2.3138334012773476, + "grad_norm": 0.6493786573410034, + "learning_rate": 7.108982198668298e-06, + "loss": 4.2029, + "step": 34055 + }, + { + "epoch": 2.3141731213480092, + "grad_norm": 0.3152988851070404, + "learning_rate": 7.108557548579971e-06, + "loss": 4.1238, + "step": 34060 + }, + { + "epoch": 2.314512841418671, + "grad_norm": 0.30735695362091064, + "learning_rate": 7.108132898491643e-06, + "loss": 4.344, + "step": 34065 + }, + { + "epoch": 2.314852561489333, + "grad_norm": 0.4066847562789917, + "learning_rate": 7.1077082484033164e-06, + "loss": 4.2237, + "step": 34070 + }, + { + "epoch": 2.3151922815599946, + "grad_norm": 0.5139966607093811, + "learning_rate": 7.107283598314989e-06, + "loss": 4.2795, + "step": 34075 + }, + { + "epoch": 2.315532001630656, + "grad_norm": 0.44104182720184326, + "learning_rate": 7.106858948226661e-06, + "loss": 4.5229, + "step": 34080 + }, + { + "epoch": 2.3158717217013183, + "grad_norm": 0.28239667415618896, + "learning_rate": 7.106434298138335e-06, + "loss": 4.6849, + "step": 34085 + }, + { + "epoch": 2.31621144177198, + "grad_norm": 0.3287760317325592, + "learning_rate": 7.106009648050008e-06, + "loss": 4.4069, + "step": 34090 + }, + { + "epoch": 2.3165511618426415, + "grad_norm": 0.39208558201789856, + "learning_rate": 7.10558499796168e-06, + "loss": 4.437, + "step": 34095 + }, + { + "epoch": 2.3168908819133036, + "grad_norm": 0.32477867603302, + "learning_rate": 7.105160347873353e-06, + "loss": 4.388, + "step": 34100 + }, + { + "epoch": 2.3172306019839652, + "grad_norm": 0.4426877200603485, + "learning_rate": 7.104735697785026e-06, + "loss": 4.1838, + "step": 34105 + }, + { + "epoch": 2.317570322054627, + "grad_norm": 0.4362131655216217, + "learning_rate": 7.104311047696698e-06, + "loss": 4.3935, + "step": 34110 + }, + { + "epoch": 2.317910042125289, + "grad_norm": 0.37305012345314026, + "learning_rate": 7.103886397608372e-06, + "loss": 4.4321, + "step": 34115 + }, + { + "epoch": 2.3182497621959506, + "grad_norm": 0.549552321434021, + "learning_rate": 7.1034617475200444e-06, + "loss": 4.5335, + "step": 34120 + }, + { + "epoch": 2.318589482266612, + "grad_norm": 0.7405218482017517, + "learning_rate": 7.103037097431716e-06, + "loss": 4.2855, + "step": 34125 + }, + { + "epoch": 2.318929202337274, + "grad_norm": 0.6546456217765808, + "learning_rate": 7.10261244734339e-06, + "loss": 4.327, + "step": 34130 + }, + { + "epoch": 2.319268922407936, + "grad_norm": 0.5022596120834351, + "learning_rate": 7.102187797255062e-06, + "loss": 4.442, + "step": 34135 + }, + { + "epoch": 2.3196086424785975, + "grad_norm": 0.29880890250205994, + "learning_rate": 7.101763147166735e-06, + "loss": 4.3944, + "step": 34140 + }, + { + "epoch": 2.319948362549259, + "grad_norm": 0.45438316464424133, + "learning_rate": 7.1013384970784084e-06, + "loss": 4.1849, + "step": 34145 + }, + { + "epoch": 2.3202880826199213, + "grad_norm": 0.3592666685581207, + "learning_rate": 7.10091384699008e-06, + "loss": 4.3529, + "step": 34150 + }, + { + "epoch": 2.320627802690583, + "grad_norm": 0.6546388268470764, + "learning_rate": 7.100489196901753e-06, + "loss": 4.3463, + "step": 34155 + }, + { + "epoch": 2.3209675227612445, + "grad_norm": 0.744382381439209, + "learning_rate": 7.100064546813427e-06, + "loss": 4.2489, + "step": 34160 + }, + { + "epoch": 2.3213072428319066, + "grad_norm": 0.4072486162185669, + "learning_rate": 7.099639896725099e-06, + "loss": 4.0962, + "step": 34165 + }, + { + "epoch": 2.3216469629025682, + "grad_norm": 0.31927210092544556, + "learning_rate": 7.099215246636772e-06, + "loss": 4.1597, + "step": 34170 + }, + { + "epoch": 2.32198668297323, + "grad_norm": 0.3763040006160736, + "learning_rate": 7.098790596548445e-06, + "loss": 4.1295, + "step": 34175 + }, + { + "epoch": 2.322326403043892, + "grad_norm": 0.5428019165992737, + "learning_rate": 7.098365946460117e-06, + "loss": 4.1345, + "step": 34180 + }, + { + "epoch": 2.3226661231145536, + "grad_norm": 0.43125319480895996, + "learning_rate": 7.09794129637179e-06, + "loss": 4.452, + "step": 34185 + }, + { + "epoch": 2.323005843185215, + "grad_norm": 0.4407755434513092, + "learning_rate": 7.097516646283464e-06, + "loss": 4.2709, + "step": 34190 + }, + { + "epoch": 2.3233455632558773, + "grad_norm": 0.3884342610836029, + "learning_rate": 7.097091996195136e-06, + "loss": 4.5624, + "step": 34195 + }, + { + "epoch": 2.323685283326539, + "grad_norm": 0.39861956238746643, + "learning_rate": 7.096667346106808e-06, + "loss": 4.2989, + "step": 34200 + }, + { + "epoch": 2.3240250033972005, + "grad_norm": 0.4839802384376526, + "learning_rate": 7.096242696018481e-06, + "loss": 4.3966, + "step": 34205 + }, + { + "epoch": 2.3243647234678626, + "grad_norm": 0.3200452923774719, + "learning_rate": 7.095818045930154e-06, + "loss": 4.5138, + "step": 34210 + }, + { + "epoch": 2.3247044435385242, + "grad_norm": 0.33982351422309875, + "learning_rate": 7.095393395841827e-06, + "loss": 4.2671, + "step": 34215 + }, + { + "epoch": 2.325044163609186, + "grad_norm": 0.3648662269115448, + "learning_rate": 7.0949687457535e-06, + "loss": 4.4729, + "step": 34220 + }, + { + "epoch": 2.325383883679848, + "grad_norm": 0.4672451615333557, + "learning_rate": 7.094544095665172e-06, + "loss": 4.3326, + "step": 34225 + }, + { + "epoch": 2.3257236037505096, + "grad_norm": 0.6638365387916565, + "learning_rate": 7.094119445576844e-06, + "loss": 4.446, + "step": 34230 + }, + { + "epoch": 2.326063323821171, + "grad_norm": 0.37101808190345764, + "learning_rate": 7.093694795488518e-06, + "loss": 4.2832, + "step": 34235 + }, + { + "epoch": 2.3264030438918333, + "grad_norm": 0.35932523012161255, + "learning_rate": 7.093270145400191e-06, + "loss": 4.274, + "step": 34240 + }, + { + "epoch": 2.326742763962495, + "grad_norm": 0.7753689289093018, + "learning_rate": 7.092845495311863e-06, + "loss": 4.3844, + "step": 34245 + }, + { + "epoch": 2.3270824840331565, + "grad_norm": 0.7822800874710083, + "learning_rate": 7.092420845223536e-06, + "loss": 4.258, + "step": 34250 + }, + { + "epoch": 2.3274222041038186, + "grad_norm": 0.5190069675445557, + "learning_rate": 7.091996195135209e-06, + "loss": 4.4023, + "step": 34255 + }, + { + "epoch": 2.3277619241744802, + "grad_norm": 0.6031309366226196, + "learning_rate": 7.091571545046883e-06, + "loss": 4.3577, + "step": 34260 + }, + { + "epoch": 2.328101644245142, + "grad_norm": 0.416951060295105, + "learning_rate": 7.091146894958555e-06, + "loss": 4.2152, + "step": 34265 + }, + { + "epoch": 2.328441364315804, + "grad_norm": 0.4634447991847992, + "learning_rate": 7.090722244870228e-06, + "loss": 4.3204, + "step": 34270 + }, + { + "epoch": 2.3287810843864656, + "grad_norm": 0.4779868423938751, + "learning_rate": 7.0902975947819e-06, + "loss": 4.1885, + "step": 34275 + }, + { + "epoch": 2.329120804457127, + "grad_norm": 0.49010106921195984, + "learning_rate": 7.089872944693573e-06, + "loss": 4.6314, + "step": 34280 + }, + { + "epoch": 2.3294605245277893, + "grad_norm": 0.35696420073509216, + "learning_rate": 7.089448294605246e-06, + "loss": 4.3257, + "step": 34285 + }, + { + "epoch": 2.329800244598451, + "grad_norm": 0.357413113117218, + "learning_rate": 7.089023644516919e-06, + "loss": 4.4546, + "step": 34290 + }, + { + "epoch": 2.3301399646691126, + "grad_norm": 0.3295603096485138, + "learning_rate": 7.088598994428592e-06, + "loss": 4.3498, + "step": 34295 + }, + { + "epoch": 2.3304796847397746, + "grad_norm": 0.395277738571167, + "learning_rate": 7.0881743443402636e-06, + "loss": 4.3292, + "step": 34300 + }, + { + "epoch": 2.3308194048104363, + "grad_norm": 0.3434000015258789, + "learning_rate": 7.087749694251937e-06, + "loss": 4.5389, + "step": 34305 + }, + { + "epoch": 2.331159124881098, + "grad_norm": 0.8048215508460999, + "learning_rate": 7.08732504416361e-06, + "loss": 4.447, + "step": 34310 + }, + { + "epoch": 2.33149884495176, + "grad_norm": 0.5354920625686646, + "learning_rate": 7.086900394075282e-06, + "loss": 4.4161, + "step": 34315 + }, + { + "epoch": 2.3318385650224216, + "grad_norm": 0.5112811326980591, + "learning_rate": 7.086475743986956e-06, + "loss": 4.1137, + "step": 34320 + }, + { + "epoch": 2.3321782850930832, + "grad_norm": 0.47887375950813293, + "learning_rate": 7.0860510938986284e-06, + "loss": 4.2262, + "step": 34325 + }, + { + "epoch": 2.3325180051637453, + "grad_norm": 0.49267205595970154, + "learning_rate": 7.0856264438103e-06, + "loss": 4.6077, + "step": 34330 + }, + { + "epoch": 2.332857725234407, + "grad_norm": 0.42364731431007385, + "learning_rate": 7.085201793721974e-06, + "loss": 4.2347, + "step": 34335 + }, + { + "epoch": 2.3331974453050686, + "grad_norm": 0.5034579634666443, + "learning_rate": 7.084777143633647e-06, + "loss": 4.2114, + "step": 34340 + }, + { + "epoch": 2.3335371653757306, + "grad_norm": 0.4153178632259369, + "learning_rate": 7.084352493545319e-06, + "loss": 4.23, + "step": 34345 + }, + { + "epoch": 2.3338768854463923, + "grad_norm": 0.3659740388393402, + "learning_rate": 7.0839278434569924e-06, + "loss": 4.4394, + "step": 34350 + }, + { + "epoch": 2.334216605517054, + "grad_norm": 0.6401697397232056, + "learning_rate": 7.083503193368665e-06, + "loss": 4.4233, + "step": 34355 + }, + { + "epoch": 2.3345563255877155, + "grad_norm": 0.39644908905029297, + "learning_rate": 7.083078543280337e-06, + "loss": 4.4317, + "step": 34360 + }, + { + "epoch": 2.3348960456583776, + "grad_norm": 0.4712200164794922, + "learning_rate": 7.082653893192011e-06, + "loss": 4.5973, + "step": 34365 + }, + { + "epoch": 2.3352357657290392, + "grad_norm": 0.5000478029251099, + "learning_rate": 7.082229243103683e-06, + "loss": 4.4032, + "step": 34370 + }, + { + "epoch": 2.335575485799701, + "grad_norm": 0.3628085255622864, + "learning_rate": 7.081804593015356e-06, + "loss": 4.2204, + "step": 34375 + }, + { + "epoch": 2.335915205870363, + "grad_norm": 0.34614965319633484, + "learning_rate": 7.081379942927029e-06, + "loss": 4.515, + "step": 34380 + }, + { + "epoch": 2.3362549259410246, + "grad_norm": 0.48561638593673706, + "learning_rate": 7.080955292838701e-06, + "loss": 4.3617, + "step": 34385 + }, + { + "epoch": 2.336594646011686, + "grad_norm": 0.4515634775161743, + "learning_rate": 7.080530642750374e-06, + "loss": 4.3826, + "step": 34390 + }, + { + "epoch": 2.3369343660823483, + "grad_norm": 0.4465997517108917, + "learning_rate": 7.080105992662048e-06, + "loss": 4.4701, + "step": 34395 + }, + { + "epoch": 2.33727408615301, + "grad_norm": 0.6419448256492615, + "learning_rate": 7.07968134257372e-06, + "loss": 4.3224, + "step": 34400 + }, + { + "epoch": 2.3376138062236715, + "grad_norm": 0.720885694026947, + "learning_rate": 7.079256692485392e-06, + "loss": 4.1977, + "step": 34405 + }, + { + "epoch": 2.3379535262943336, + "grad_norm": 0.5754159688949585, + "learning_rate": 7.078832042397066e-06, + "loss": 4.3982, + "step": 34410 + }, + { + "epoch": 2.3382932463649952, + "grad_norm": 0.7158495187759399, + "learning_rate": 7.078407392308738e-06, + "loss": 4.3402, + "step": 34415 + }, + { + "epoch": 2.338632966435657, + "grad_norm": 0.44336092472076416, + "learning_rate": 7.077982742220411e-06, + "loss": 4.1472, + "step": 34420 + }, + { + "epoch": 2.338972686506319, + "grad_norm": 0.5648946762084961, + "learning_rate": 7.0775580921320844e-06, + "loss": 4.0464, + "step": 34425 + }, + { + "epoch": 2.3393124065769806, + "grad_norm": 0.37563183903694153, + "learning_rate": 7.077133442043756e-06, + "loss": 4.2606, + "step": 34430 + }, + { + "epoch": 2.339652126647642, + "grad_norm": 0.4438725411891937, + "learning_rate": 7.076708791955429e-06, + "loss": 4.2342, + "step": 34435 + }, + { + "epoch": 2.3399918467183043, + "grad_norm": 0.48239555954933167, + "learning_rate": 7.076284141867103e-06, + "loss": 4.3173, + "step": 34440 + }, + { + "epoch": 2.340331566788966, + "grad_norm": 0.44781506061553955, + "learning_rate": 7.075859491778775e-06, + "loss": 4.142, + "step": 34445 + }, + { + "epoch": 2.3406712868596276, + "grad_norm": 0.33603182435035706, + "learning_rate": 7.075434841690448e-06, + "loss": 4.2244, + "step": 34450 + }, + { + "epoch": 2.3410110069302896, + "grad_norm": 0.41960233449935913, + "learning_rate": 7.07501019160212e-06, + "loss": 4.2174, + "step": 34455 + }, + { + "epoch": 2.3413507270009513, + "grad_norm": 1.504639983177185, + "learning_rate": 7.074585541513793e-06, + "loss": 4.5074, + "step": 34460 + }, + { + "epoch": 2.341690447071613, + "grad_norm": 0.42934393882751465, + "learning_rate": 7.074160891425466e-06, + "loss": 4.2584, + "step": 34465 + }, + { + "epoch": 2.3420301671422745, + "grad_norm": 0.46368715167045593, + "learning_rate": 7.073736241337139e-06, + "loss": 4.1447, + "step": 34470 + }, + { + "epoch": 2.3423698872129366, + "grad_norm": 0.7133120894432068, + "learning_rate": 7.073311591248812e-06, + "loss": 4.0975, + "step": 34475 + }, + { + "epoch": 2.3427096072835982, + "grad_norm": 1.2346307039260864, + "learning_rate": 7.0728869411604836e-06, + "loss": 4.3646, + "step": 34480 + }, + { + "epoch": 2.34304932735426, + "grad_norm": 0.3669399619102478, + "learning_rate": 7.072462291072157e-06, + "loss": 4.3482, + "step": 34485 + }, + { + "epoch": 2.343389047424922, + "grad_norm": 0.40239691734313965, + "learning_rate": 7.07203764098383e-06, + "loss": 4.3211, + "step": 34490 + }, + { + "epoch": 2.3437287674955836, + "grad_norm": 0.4815988838672638, + "learning_rate": 7.071612990895502e-06, + "loss": 4.5754, + "step": 34495 + }, + { + "epoch": 2.344068487566245, + "grad_norm": 0.7153031826019287, + "learning_rate": 7.071188340807176e-06, + "loss": 4.0027, + "step": 34500 + }, + { + "epoch": 2.3444082076369073, + "grad_norm": 0.6627721190452576, + "learning_rate": 7.070763690718848e-06, + "loss": 4.3562, + "step": 34505 + }, + { + "epoch": 2.344747927707569, + "grad_norm": 0.6585566997528076, + "learning_rate": 7.07033904063052e-06, + "loss": 4.4332, + "step": 34510 + }, + { + "epoch": 2.3450876477782305, + "grad_norm": 0.6025935411453247, + "learning_rate": 7.069914390542194e-06, + "loss": 4.2609, + "step": 34515 + }, + { + "epoch": 2.3454273678488926, + "grad_norm": 0.3500916361808777, + "learning_rate": 7.069489740453867e-06, + "loss": 4.1337, + "step": 34520 + }, + { + "epoch": 2.3457670879195542, + "grad_norm": 0.3492721915245056, + "learning_rate": 7.069065090365539e-06, + "loss": 4.3811, + "step": 34525 + }, + { + "epoch": 2.346106807990216, + "grad_norm": 0.35751041769981384, + "learning_rate": 7.068640440277212e-06, + "loss": 4.224, + "step": 34530 + }, + { + "epoch": 2.346446528060878, + "grad_norm": 0.4540816843509674, + "learning_rate": 7.068215790188885e-06, + "loss": 4.3201, + "step": 34535 + }, + { + "epoch": 2.3467862481315396, + "grad_norm": 0.4014766812324524, + "learning_rate": 7.067791140100557e-06, + "loss": 4.0706, + "step": 34540 + }, + { + "epoch": 2.347125968202201, + "grad_norm": 0.4012834429740906, + "learning_rate": 7.067366490012231e-06, + "loss": 4.3508, + "step": 34545 + }, + { + "epoch": 2.3474656882728633, + "grad_norm": 0.45141687989234924, + "learning_rate": 7.066941839923903e-06, + "loss": 4.3638, + "step": 34550 + }, + { + "epoch": 2.347805408343525, + "grad_norm": 0.3200184106826782, + "learning_rate": 7.0665171898355756e-06, + "loss": 4.3296, + "step": 34555 + }, + { + "epoch": 2.3481451284141865, + "grad_norm": 0.3548823297023773, + "learning_rate": 7.066092539747249e-06, + "loss": 4.3332, + "step": 34560 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.4506218731403351, + "learning_rate": 7.065667889658921e-06, + "loss": 4.5283, + "step": 34565 + }, + { + "epoch": 2.3488245685555103, + "grad_norm": 0.3359902501106262, + "learning_rate": 7.065243239570594e-06, + "loss": 4.5114, + "step": 34570 + }, + { + "epoch": 2.349164288626172, + "grad_norm": 0.6949692964553833, + "learning_rate": 7.064818589482268e-06, + "loss": 4.3925, + "step": 34575 + }, + { + "epoch": 2.349504008696834, + "grad_norm": 0.4349916875362396, + "learning_rate": 7.0643939393939396e-06, + "loss": 4.4325, + "step": 34580 + }, + { + "epoch": 2.3498437287674956, + "grad_norm": 0.45364537835121155, + "learning_rate": 7.063969289305612e-06, + "loss": 4.1712, + "step": 34585 + }, + { + "epoch": 2.350183448838157, + "grad_norm": 0.381712943315506, + "learning_rate": 7.063544639217286e-06, + "loss": 4.2631, + "step": 34590 + }, + { + "epoch": 2.3505231689088193, + "grad_norm": 0.46431753039360046, + "learning_rate": 7.063119989128958e-06, + "loss": 4.1228, + "step": 34595 + }, + { + "epoch": 2.350862888979481, + "grad_norm": 0.2711656093597412, + "learning_rate": 7.062695339040632e-06, + "loss": 4.179, + "step": 34600 + }, + { + "epoch": 2.3512026090501426, + "grad_norm": 0.6462024450302124, + "learning_rate": 7.062270688952304e-06, + "loss": 4.233, + "step": 34605 + }, + { + "epoch": 2.3515423291208046, + "grad_norm": 0.38282233476638794, + "learning_rate": 7.061846038863976e-06, + "loss": 4.3064, + "step": 34610 + }, + { + "epoch": 2.3518820491914663, + "grad_norm": 0.6912009716033936, + "learning_rate": 7.06142138877565e-06, + "loss": 4.1245, + "step": 34615 + }, + { + "epoch": 2.352221769262128, + "grad_norm": 0.34446486830711365, + "learning_rate": 7.060996738687322e-06, + "loss": 4.0574, + "step": 34620 + }, + { + "epoch": 2.35256148933279, + "grad_norm": 0.2623662054538727, + "learning_rate": 7.060572088598995e-06, + "loss": 4.3235, + "step": 34625 + }, + { + "epoch": 2.3529012094034516, + "grad_norm": 0.38166823983192444, + "learning_rate": 7.0601474385106684e-06, + "loss": 4.198, + "step": 34630 + }, + { + "epoch": 2.3532409294741132, + "grad_norm": 0.5724354386329651, + "learning_rate": 7.05972278842234e-06, + "loss": 4.5117, + "step": 34635 + }, + { + "epoch": 2.3535806495447753, + "grad_norm": 0.5633864402770996, + "learning_rate": 7.059298138334013e-06, + "loss": 4.2872, + "step": 34640 + }, + { + "epoch": 2.353920369615437, + "grad_norm": 0.3134312331676483, + "learning_rate": 7.058873488245687e-06, + "loss": 4.2805, + "step": 34645 + }, + { + "epoch": 2.3542600896860986, + "grad_norm": 0.38145846128463745, + "learning_rate": 7.058448838157359e-06, + "loss": 4.2431, + "step": 34650 + }, + { + "epoch": 2.3545998097567606, + "grad_norm": 0.5862995982170105, + "learning_rate": 7.058024188069032e-06, + "loss": 4.5722, + "step": 34655 + }, + { + "epoch": 2.3549395298274223, + "grad_norm": 0.5103518962860107, + "learning_rate": 7.0576844679983705e-06, + "loss": 4.3209, + "step": 34660 + }, + { + "epoch": 2.355279249898084, + "grad_norm": 0.4031590521335602, + "learning_rate": 7.0572598179100425e-06, + "loss": 4.2762, + "step": 34665 + }, + { + "epoch": 2.355618969968746, + "grad_norm": 0.49873968958854675, + "learning_rate": 7.056835167821715e-06, + "loss": 4.308, + "step": 34670 + }, + { + "epoch": 2.3559586900394076, + "grad_norm": 0.3113182783126831, + "learning_rate": 7.056410517733389e-06, + "loss": 4.1954, + "step": 34675 + }, + { + "epoch": 2.3562984101100692, + "grad_norm": 0.35877037048339844, + "learning_rate": 7.055985867645061e-06, + "loss": 4.3684, + "step": 34680 + }, + { + "epoch": 2.3566381301807313, + "grad_norm": 0.44947633147239685, + "learning_rate": 7.055561217556734e-06, + "loss": 4.2163, + "step": 34685 + }, + { + "epoch": 2.356977850251393, + "grad_norm": 0.3599877953529358, + "learning_rate": 7.055136567468407e-06, + "loss": 4.3715, + "step": 34690 + }, + { + "epoch": 2.3573175703220546, + "grad_norm": 0.3566477596759796, + "learning_rate": 7.054711917380079e-06, + "loss": 4.2943, + "step": 34695 + }, + { + "epoch": 2.357657290392716, + "grad_norm": 0.6335046887397766, + "learning_rate": 7.054287267291752e-06, + "loss": 4.5052, + "step": 34700 + }, + { + "epoch": 2.3579970104633783, + "grad_norm": 0.3779323697090149, + "learning_rate": 7.053862617203425e-06, + "loss": 4.262, + "step": 34705 + }, + { + "epoch": 2.35833673053404, + "grad_norm": 0.3554871082305908, + "learning_rate": 7.053437967115098e-06, + "loss": 3.9436, + "step": 34710 + }, + { + "epoch": 2.3586764506047015, + "grad_norm": 0.3789539933204651, + "learning_rate": 7.0530133170267705e-06, + "loss": 4.1267, + "step": 34715 + }, + { + "epoch": 2.3590161706753636, + "grad_norm": 0.4252624213695526, + "learning_rate": 7.052588666938443e-06, + "loss": 4.3444, + "step": 34720 + }, + { + "epoch": 2.3593558907460253, + "grad_norm": 0.3683748245239258, + "learning_rate": 7.052164016850116e-06, + "loss": 4.3539, + "step": 34725 + }, + { + "epoch": 2.359695610816687, + "grad_norm": 0.5494880676269531, + "learning_rate": 7.051739366761788e-06, + "loss": 4.2146, + "step": 34730 + }, + { + "epoch": 2.360035330887349, + "grad_norm": 0.34506484866142273, + "learning_rate": 7.051314716673462e-06, + "loss": 4.218, + "step": 34735 + }, + { + "epoch": 2.3603750509580106, + "grad_norm": 0.3937227725982666, + "learning_rate": 7.0508900665851345e-06, + "loss": 4.1604, + "step": 34740 + }, + { + "epoch": 2.360714771028672, + "grad_norm": 0.37903350591659546, + "learning_rate": 7.050465416496806e-06, + "loss": 4.2275, + "step": 34745 + }, + { + "epoch": 2.3610544910993343, + "grad_norm": 0.3410090506076813, + "learning_rate": 7.05004076640848e-06, + "loss": 4.2488, + "step": 34750 + }, + { + "epoch": 2.361394211169996, + "grad_norm": 0.45704385638237, + "learning_rate": 7.049616116320153e-06, + "loss": 4.2691, + "step": 34755 + }, + { + "epoch": 2.3617339312406576, + "grad_norm": 0.45557257533073425, + "learning_rate": 7.049191466231825e-06, + "loss": 4.4562, + "step": 34760 + }, + { + "epoch": 2.3620736513113196, + "grad_norm": 0.35926324129104614, + "learning_rate": 7.0487668161434985e-06, + "loss": 4.3735, + "step": 34765 + }, + { + "epoch": 2.3624133713819813, + "grad_norm": 0.5533550381660461, + "learning_rate": 7.048342166055171e-06, + "loss": 4.2001, + "step": 34770 + }, + { + "epoch": 2.362753091452643, + "grad_norm": 0.5624970197677612, + "learning_rate": 7.047917515966843e-06, + "loss": 4.3558, + "step": 34775 + }, + { + "epoch": 2.363092811523305, + "grad_norm": 0.43583568930625916, + "learning_rate": 7.047492865878517e-06, + "loss": 4.2956, + "step": 34780 + }, + { + "epoch": 2.3634325315939666, + "grad_norm": 0.5335540771484375, + "learning_rate": 7.04706821579019e-06, + "loss": 4.4488, + "step": 34785 + }, + { + "epoch": 2.3637722516646282, + "grad_norm": 0.5611393451690674, + "learning_rate": 7.046643565701862e-06, + "loss": 4.5431, + "step": 34790 + }, + { + "epoch": 2.3641119717352903, + "grad_norm": 0.3756476938724518, + "learning_rate": 7.046218915613535e-06, + "loss": 4.2361, + "step": 34795 + }, + { + "epoch": 2.364451691805952, + "grad_norm": 0.7057375907897949, + "learning_rate": 7.045794265525207e-06, + "loss": 4.3483, + "step": 34800 + }, + { + "epoch": 2.3647914118766136, + "grad_norm": 0.5572187900543213, + "learning_rate": 7.045369615436881e-06, + "loss": 4.2527, + "step": 34805 + }, + { + "epoch": 2.365131131947275, + "grad_norm": 0.5023947954177856, + "learning_rate": 7.044944965348554e-06, + "loss": 4.3027, + "step": 34810 + }, + { + "epoch": 2.3654708520179373, + "grad_norm": 0.3814549148082733, + "learning_rate": 7.044520315260226e-06, + "loss": 4.0879, + "step": 34815 + }, + { + "epoch": 2.365810572088599, + "grad_norm": 0.4266946613788605, + "learning_rate": 7.044095665171899e-06, + "loss": 4.056, + "step": 34820 + }, + { + "epoch": 2.3661502921592605, + "grad_norm": 0.3616044819355011, + "learning_rate": 7.043671015083572e-06, + "loss": 4.011, + "step": 34825 + }, + { + "epoch": 2.3664900122299226, + "grad_norm": 0.4305979609489441, + "learning_rate": 7.043246364995244e-06, + "loss": 4.1777, + "step": 34830 + }, + { + "epoch": 2.3668297323005842, + "grad_norm": 0.34583285450935364, + "learning_rate": 7.042821714906918e-06, + "loss": 4.3697, + "step": 34835 + }, + { + "epoch": 2.367169452371246, + "grad_norm": 0.932852566242218, + "learning_rate": 7.0423970648185905e-06, + "loss": 4.5543, + "step": 34840 + }, + { + "epoch": 2.367509172441908, + "grad_norm": 0.41013970971107483, + "learning_rate": 7.0419724147302624e-06, + "loss": 4.3552, + "step": 34845 + }, + { + "epoch": 2.3678488925125696, + "grad_norm": 0.5063344240188599, + "learning_rate": 7.041547764641936e-06, + "loss": 4.2523, + "step": 34850 + }, + { + "epoch": 2.368188612583231, + "grad_norm": 0.5065039396286011, + "learning_rate": 7.041123114553609e-06, + "loss": 4.2644, + "step": 34855 + }, + { + "epoch": 2.3685283326538933, + "grad_norm": 0.47374624013900757, + "learning_rate": 7.040698464465281e-06, + "loss": 4.1823, + "step": 34860 + }, + { + "epoch": 2.368868052724555, + "grad_norm": 0.444771409034729, + "learning_rate": 7.0402738143769545e-06, + "loss": 4.1171, + "step": 34865 + }, + { + "epoch": 2.3692077727952165, + "grad_norm": 0.6805877685546875, + "learning_rate": 7.0398491642886264e-06, + "loss": 4.3993, + "step": 34870 + }, + { + "epoch": 2.3695474928658786, + "grad_norm": 0.48054513335227966, + "learning_rate": 7.039424514200299e-06, + "loss": 4.3273, + "step": 34875 + }, + { + "epoch": 2.3698872129365403, + "grad_norm": 0.49702656269073486, + "learning_rate": 7.038999864111973e-06, + "loss": 4.187, + "step": 34880 + }, + { + "epoch": 2.370226933007202, + "grad_norm": 0.37624430656433105, + "learning_rate": 7.038575214023645e-06, + "loss": 4.3579, + "step": 34885 + }, + { + "epoch": 2.370566653077864, + "grad_norm": 0.5959978103637695, + "learning_rate": 7.038150563935318e-06, + "loss": 4.2767, + "step": 34890 + }, + { + "epoch": 2.3709063731485256, + "grad_norm": 0.43896088004112244, + "learning_rate": 7.037725913846991e-06, + "loss": 4.4221, + "step": 34895 + }, + { + "epoch": 2.3712460932191872, + "grad_norm": 0.3605371415615082, + "learning_rate": 7.037301263758663e-06, + "loss": 4.3426, + "step": 34900 + }, + { + "epoch": 2.3715858132898493, + "grad_norm": 0.41922029852867126, + "learning_rate": 7.036876613670336e-06, + "loss": 4.3431, + "step": 34905 + }, + { + "epoch": 2.371925533360511, + "grad_norm": 0.4203605353832245, + "learning_rate": 7.03645196358201e-06, + "loss": 4.0585, + "step": 34910 + }, + { + "epoch": 2.3722652534311726, + "grad_norm": 0.4072273373603821, + "learning_rate": 7.036027313493682e-06, + "loss": 4.3491, + "step": 34915 + }, + { + "epoch": 2.3726049735018346, + "grad_norm": 0.5133415460586548, + "learning_rate": 7.0356026634053544e-06, + "loss": 4.3628, + "step": 34920 + }, + { + "epoch": 2.3729446935724963, + "grad_norm": 0.4941115081310272, + "learning_rate": 7.035178013317028e-06, + "loss": 4.4355, + "step": 34925 + }, + { + "epoch": 2.373284413643158, + "grad_norm": 0.5122039318084717, + "learning_rate": 7.0347533632287e-06, + "loss": 4.1847, + "step": 34930 + }, + { + "epoch": 2.37362413371382, + "grad_norm": 0.7180203795433044, + "learning_rate": 7.034328713140373e-06, + "loss": 4.4973, + "step": 34935 + }, + { + "epoch": 2.3739638537844816, + "grad_norm": 0.36392924189567566, + "learning_rate": 7.0339040630520465e-06, + "loss": 4.3935, + "step": 34940 + }, + { + "epoch": 2.3743035738551432, + "grad_norm": 0.4989936351776123, + "learning_rate": 7.0334794129637185e-06, + "loss": 4.1588, + "step": 34945 + }, + { + "epoch": 2.3746432939258053, + "grad_norm": 0.45144757628440857, + "learning_rate": 7.033054762875391e-06, + "loss": 4.3397, + "step": 34950 + }, + { + "epoch": 2.374983013996467, + "grad_norm": 0.5984836220741272, + "learning_rate": 7.032630112787064e-06, + "loss": 4.2018, + "step": 34955 + }, + { + "epoch": 2.3753227340671286, + "grad_norm": 0.5682023167610168, + "learning_rate": 7.032205462698737e-06, + "loss": 4.1877, + "step": 34960 + }, + { + "epoch": 2.3756624541377906, + "grad_norm": 0.6194354295730591, + "learning_rate": 7.03178081261041e-06, + "loss": 4.2573, + "step": 34965 + }, + { + "epoch": 2.3760021742084523, + "grad_norm": 0.5221415162086487, + "learning_rate": 7.0313561625220825e-06, + "loss": 4.3733, + "step": 34970 + }, + { + "epoch": 2.376341894279114, + "grad_norm": 0.3878774046897888, + "learning_rate": 7.030931512433755e-06, + "loss": 4.2585, + "step": 34975 + }, + { + "epoch": 2.376681614349776, + "grad_norm": 0.5028988718986511, + "learning_rate": 7.030506862345427e-06, + "loss": 4.1332, + "step": 34980 + }, + { + "epoch": 2.3770213344204376, + "grad_norm": 0.4511859714984894, + "learning_rate": 7.030082212257101e-06, + "loss": 4.2892, + "step": 34985 + }, + { + "epoch": 2.3773610544910992, + "grad_norm": 0.4024587869644165, + "learning_rate": 7.029657562168774e-06, + "loss": 4.4618, + "step": 34990 + }, + { + "epoch": 2.3777007745617613, + "grad_norm": 0.3432411253452301, + "learning_rate": 7.029232912080446e-06, + "loss": 4.274, + "step": 34995 + }, + { + "epoch": 2.378040494632423, + "grad_norm": 0.3883092403411865, + "learning_rate": 7.028808261992119e-06, + "loss": 4.4072, + "step": 35000 + }, + { + "epoch": 2.3783802147030846, + "grad_norm": 0.43110623955726624, + "learning_rate": 7.028383611903792e-06, + "loss": 4.3223, + "step": 35005 + }, + { + "epoch": 2.3787199347737467, + "grad_norm": 0.3685179352760315, + "learning_rate": 7.027958961815464e-06, + "loss": 4.2064, + "step": 35010 + }, + { + "epoch": 2.3790596548444083, + "grad_norm": 0.36750099062919617, + "learning_rate": 7.027534311727138e-06, + "loss": 4.1342, + "step": 35015 + }, + { + "epoch": 2.37939937491507, + "grad_norm": 0.32379239797592163, + "learning_rate": 7.0271096616388105e-06, + "loss": 4.4161, + "step": 35020 + }, + { + "epoch": 2.379739094985732, + "grad_norm": 0.5672815442085266, + "learning_rate": 7.026685011550482e-06, + "loss": 4.4024, + "step": 35025 + }, + { + "epoch": 2.3800788150563936, + "grad_norm": 0.45889103412628174, + "learning_rate": 7.026260361462156e-06, + "loss": 4.2953, + "step": 35030 + }, + { + "epoch": 2.3804185351270553, + "grad_norm": 0.6201569437980652, + "learning_rate": 7.025835711373829e-06, + "loss": 4.1685, + "step": 35035 + }, + { + "epoch": 2.380758255197717, + "grad_norm": 0.6332656145095825, + "learning_rate": 7.025411061285501e-06, + "loss": 4.4365, + "step": 35040 + }, + { + "epoch": 2.381097975268379, + "grad_norm": 0.6116868257522583, + "learning_rate": 7.0249864111971745e-06, + "loss": 4.2197, + "step": 35045 + }, + { + "epoch": 2.3814376953390406, + "grad_norm": 0.5331189632415771, + "learning_rate": 7.024561761108846e-06, + "loss": 3.863, + "step": 35050 + }, + { + "epoch": 2.3817774154097022, + "grad_norm": 0.5806018114089966, + "learning_rate": 7.024137111020519e-06, + "loss": 4.1773, + "step": 35055 + }, + { + "epoch": 2.3821171354803643, + "grad_norm": 0.5849651098251343, + "learning_rate": 7.023712460932193e-06, + "loss": 4.3412, + "step": 35060 + }, + { + "epoch": 2.382456855551026, + "grad_norm": 0.3896999657154083, + "learning_rate": 7.023287810843865e-06, + "loss": 4.2068, + "step": 35065 + }, + { + "epoch": 2.3827965756216876, + "grad_norm": 0.44954347610473633, + "learning_rate": 7.022863160755538e-06, + "loss": 4.2607, + "step": 35070 + }, + { + "epoch": 2.3831362956923496, + "grad_norm": 0.5043298006057739, + "learning_rate": 7.022438510667211e-06, + "loss": 4.2335, + "step": 35075 + }, + { + "epoch": 2.3834760157630113, + "grad_norm": 0.37869811058044434, + "learning_rate": 7.022013860578883e-06, + "loss": 4.2608, + "step": 35080 + }, + { + "epoch": 2.383815735833673, + "grad_norm": 0.5145981907844543, + "learning_rate": 7.021589210490556e-06, + "loss": 4.2308, + "step": 35085 + }, + { + "epoch": 2.384155455904335, + "grad_norm": 0.3250845968723297, + "learning_rate": 7.02116456040223e-06, + "loss": 4.1428, + "step": 35090 + }, + { + "epoch": 2.3844951759749966, + "grad_norm": 0.36652061343193054, + "learning_rate": 7.020739910313902e-06, + "loss": 4.5272, + "step": 35095 + }, + { + "epoch": 2.3848348960456582, + "grad_norm": 0.31951960921287537, + "learning_rate": 7.0203152602255744e-06, + "loss": 4.3483, + "step": 35100 + }, + { + "epoch": 2.3851746161163203, + "grad_norm": 0.28225433826446533, + "learning_rate": 7.019890610137248e-06, + "loss": 4.4146, + "step": 35105 + }, + { + "epoch": 2.385514336186982, + "grad_norm": 0.3470754027366638, + "learning_rate": 7.01946596004892e-06, + "loss": 4.3068, + "step": 35110 + }, + { + "epoch": 2.3858540562576436, + "grad_norm": 0.41153043508529663, + "learning_rate": 7.019041309960593e-06, + "loss": 4.2244, + "step": 35115 + }, + { + "epoch": 2.3861937763283056, + "grad_norm": 0.5401992201805115, + "learning_rate": 7.018616659872266e-06, + "loss": 4.5432, + "step": 35120 + }, + { + "epoch": 2.3865334963989673, + "grad_norm": 0.4837358593940735, + "learning_rate": 7.0181920097839384e-06, + "loss": 4.3324, + "step": 35125 + }, + { + "epoch": 2.386873216469629, + "grad_norm": 0.3192752003669739, + "learning_rate": 7.017767359695611e-06, + "loss": 4.4367, + "step": 35130 + }, + { + "epoch": 2.387212936540291, + "grad_norm": 0.4251634180545807, + "learning_rate": 7.017342709607284e-06, + "loss": 4.2869, + "step": 35135 + }, + { + "epoch": 2.3875526566109526, + "grad_norm": 0.444132000207901, + "learning_rate": 7.016918059518957e-06, + "loss": 4.255, + "step": 35140 + }, + { + "epoch": 2.3878923766816142, + "grad_norm": 0.316942036151886, + "learning_rate": 7.0164934094306305e-06, + "loss": 4.4252, + "step": 35145 + }, + { + "epoch": 2.388232096752276, + "grad_norm": 0.5019255876541138, + "learning_rate": 7.0160687593423024e-06, + "loss": 4.2628, + "step": 35150 + }, + { + "epoch": 2.388571816822938, + "grad_norm": 0.29652106761932373, + "learning_rate": 7.015644109253975e-06, + "loss": 4.1682, + "step": 35155 + }, + { + "epoch": 2.3889115368935996, + "grad_norm": 0.2735545337200165, + "learning_rate": 7.015219459165649e-06, + "loss": 4.2882, + "step": 35160 + }, + { + "epoch": 2.389251256964261, + "grad_norm": 0.464465469121933, + "learning_rate": 7.014794809077321e-06, + "loss": 4.4236, + "step": 35165 + }, + { + "epoch": 2.3895909770349233, + "grad_norm": 0.3931169807910919, + "learning_rate": 7.014370158988994e-06, + "loss": 4.0971, + "step": 35170 + }, + { + "epoch": 2.389930697105585, + "grad_norm": 0.4551200568675995, + "learning_rate": 7.013945508900667e-06, + "loss": 4.2523, + "step": 35175 + }, + { + "epoch": 2.3902704171762466, + "grad_norm": 0.40311959385871887, + "learning_rate": 7.013520858812339e-06, + "loss": 4.2397, + "step": 35180 + }, + { + "epoch": 2.3906101372469086, + "grad_norm": 0.4280718266963959, + "learning_rate": 7.013096208724012e-06, + "loss": 4.2305, + "step": 35185 + }, + { + "epoch": 2.3909498573175703, + "grad_norm": 0.37887969613075256, + "learning_rate": 7.012671558635686e-06, + "loss": 4.1851, + "step": 35190 + }, + { + "epoch": 2.391289577388232, + "grad_norm": 0.4644564390182495, + "learning_rate": 7.012246908547358e-06, + "loss": 4.3735, + "step": 35195 + }, + { + "epoch": 2.391629297458894, + "grad_norm": 0.3660288155078888, + "learning_rate": 7.0118222584590304e-06, + "loss": 4.4507, + "step": 35200 + }, + { + "epoch": 2.3919690175295556, + "grad_norm": 0.4610377848148346, + "learning_rate": 7.011397608370703e-06, + "loss": 3.9655, + "step": 35205 + }, + { + "epoch": 2.3923087376002172, + "grad_norm": 0.46473050117492676, + "learning_rate": 7.010972958282376e-06, + "loss": 4.5631, + "step": 35210 + }, + { + "epoch": 2.3926484576708793, + "grad_norm": 0.4599817097187042, + "learning_rate": 7.010548308194048e-06, + "loss": 4.2847, + "step": 35215 + }, + { + "epoch": 2.392988177741541, + "grad_norm": 0.53372722864151, + "learning_rate": 7.010123658105722e-06, + "loss": 4.3881, + "step": 35220 + }, + { + "epoch": 2.3933278978122026, + "grad_norm": 0.49602678418159485, + "learning_rate": 7.0096990080173944e-06, + "loss": 4.3798, + "step": 35225 + }, + { + "epoch": 2.3936676178828646, + "grad_norm": 0.48847779631614685, + "learning_rate": 7.009274357929066e-06, + "loss": 4.3702, + "step": 35230 + }, + { + "epoch": 2.3940073379535263, + "grad_norm": 0.4330179691314697, + "learning_rate": 7.00884970784074e-06, + "loss": 4.189, + "step": 35235 + }, + { + "epoch": 2.394347058024188, + "grad_norm": 0.7085963487625122, + "learning_rate": 7.008425057752413e-06, + "loss": 4.4767, + "step": 35240 + }, + { + "epoch": 2.39468677809485, + "grad_norm": 0.4738622307777405, + "learning_rate": 7.008000407664085e-06, + "loss": 3.9422, + "step": 35245 + }, + { + "epoch": 2.3950264981655116, + "grad_norm": 0.4067685902118683, + "learning_rate": 7.0075757575757585e-06, + "loss": 4.134, + "step": 35250 + }, + { + "epoch": 2.3953662182361732, + "grad_norm": 0.2806808650493622, + "learning_rate": 7.007151107487431e-06, + "loss": 4.2125, + "step": 35255 + }, + { + "epoch": 2.3957059383068353, + "grad_norm": 0.41629940271377563, + "learning_rate": 7.006726457399103e-06, + "loss": 4.1885, + "step": 35260 + }, + { + "epoch": 2.396045658377497, + "grad_norm": 0.3609860837459564, + "learning_rate": 7.006301807310777e-06, + "loss": 4.4313, + "step": 35265 + }, + { + "epoch": 2.3963853784481586, + "grad_norm": 0.385345995426178, + "learning_rate": 7.00587715722245e-06, + "loss": 4.3388, + "step": 35270 + }, + { + "epoch": 2.3967250985188207, + "grad_norm": 0.25193923711776733, + "learning_rate": 7.005452507134122e-06, + "loss": 4.2577, + "step": 35275 + }, + { + "epoch": 2.3970648185894823, + "grad_norm": 0.28347912430763245, + "learning_rate": 7.005027857045795e-06, + "loss": 4.3183, + "step": 35280 + }, + { + "epoch": 2.397404538660144, + "grad_norm": 0.32703036069869995, + "learning_rate": 7.004603206957468e-06, + "loss": 4.1956, + "step": 35285 + }, + { + "epoch": 2.397744258730806, + "grad_norm": 0.32998037338256836, + "learning_rate": 7.00417855686914e-06, + "loss": 4.4643, + "step": 35290 + }, + { + "epoch": 2.3980839788014676, + "grad_norm": 0.48410236835479736, + "learning_rate": 7.003753906780814e-06, + "loss": 4.3659, + "step": 35295 + }, + { + "epoch": 2.3984236988721293, + "grad_norm": 0.5160664916038513, + "learning_rate": 7.003329256692486e-06, + "loss": 4.3364, + "step": 35300 + }, + { + "epoch": 2.3987634189427913, + "grad_norm": 0.33377525210380554, + "learning_rate": 7.002904606604158e-06, + "loss": 4.2586, + "step": 35305 + }, + { + "epoch": 2.399103139013453, + "grad_norm": 1.0428448915481567, + "learning_rate": 7.002479956515832e-06, + "loss": 4.2205, + "step": 35310 + }, + { + "epoch": 2.3994428590841146, + "grad_norm": 0.4219854772090912, + "learning_rate": 7.002055306427504e-06, + "loss": 4.1824, + "step": 35315 + }, + { + "epoch": 2.3997825791547767, + "grad_norm": 0.5887892246246338, + "learning_rate": 7.001630656339177e-06, + "loss": 4.1453, + "step": 35320 + }, + { + "epoch": 2.4001222992254383, + "grad_norm": 0.3568934500217438, + "learning_rate": 7.0012060062508505e-06, + "loss": 4.2745, + "step": 35325 + }, + { + "epoch": 2.4004620192961, + "grad_norm": 0.3426426947116852, + "learning_rate": 7.000781356162522e-06, + "loss": 4.1526, + "step": 35330 + }, + { + "epoch": 2.400801739366762, + "grad_norm": 0.2901548147201538, + "learning_rate": 7.000356706074195e-06, + "loss": 4.2978, + "step": 35335 + }, + { + "epoch": 2.4011414594374236, + "grad_norm": 0.4710932672023773, + "learning_rate": 6.999932055985869e-06, + "loss": 4.2916, + "step": 35340 + }, + { + "epoch": 2.4014811795080853, + "grad_norm": 0.4534684121608734, + "learning_rate": 6.999507405897541e-06, + "loss": 4.3668, + "step": 35345 + }, + { + "epoch": 2.4018208995787473, + "grad_norm": 0.46978580951690674, + "learning_rate": 6.999082755809214e-06, + "loss": 4.1133, + "step": 35350 + }, + { + "epoch": 2.402160619649409, + "grad_norm": 0.6146649718284607, + "learning_rate": 6.998658105720887e-06, + "loss": 4.1068, + "step": 35355 + }, + { + "epoch": 2.4025003397200706, + "grad_norm": 0.516800045967102, + "learning_rate": 6.998233455632559e-06, + "loss": 4.3245, + "step": 35360 + }, + { + "epoch": 2.4028400597907327, + "grad_norm": 0.4772646129131317, + "learning_rate": 6.997808805544232e-06, + "loss": 4.1802, + "step": 35365 + }, + { + "epoch": 2.4031797798613943, + "grad_norm": 0.30559033155441284, + "learning_rate": 6.997384155455905e-06, + "loss": 4.3468, + "step": 35370 + }, + { + "epoch": 2.403519499932056, + "grad_norm": 0.30944404006004333, + "learning_rate": 6.996959505367578e-06, + "loss": 4.0353, + "step": 35375 + }, + { + "epoch": 2.4038592200027176, + "grad_norm": 0.3446901738643646, + "learning_rate": 6.9965348552792504e-06, + "loss": 4.1039, + "step": 35380 + }, + { + "epoch": 2.4041989400733796, + "grad_norm": 0.3303765654563904, + "learning_rate": 6.996110205190923e-06, + "loss": 4.0844, + "step": 35385 + }, + { + "epoch": 2.4045386601440413, + "grad_norm": 0.290471613407135, + "learning_rate": 6.995685555102596e-06, + "loss": 4.2425, + "step": 35390 + }, + { + "epoch": 2.404878380214703, + "grad_norm": 0.38800254464149475, + "learning_rate": 6.995260905014268e-06, + "loss": 4.1985, + "step": 35395 + }, + { + "epoch": 2.405218100285365, + "grad_norm": 0.4162693917751312, + "learning_rate": 6.994836254925942e-06, + "loss": 4.218, + "step": 35400 + }, + { + "epoch": 2.4055578203560266, + "grad_norm": 0.5339469909667969, + "learning_rate": 6.9944116048376144e-06, + "loss": 4.4172, + "step": 35405 + }, + { + "epoch": 2.4058975404266882, + "grad_norm": 0.40970438718795776, + "learning_rate": 6.993986954749286e-06, + "loss": 4.1605, + "step": 35410 + }, + { + "epoch": 2.4062372604973503, + "grad_norm": 0.31579673290252686, + "learning_rate": 6.99356230466096e-06, + "loss": 3.9623, + "step": 35415 + }, + { + "epoch": 2.406576980568012, + "grad_norm": 0.2573685646057129, + "learning_rate": 6.993137654572633e-06, + "loss": 4.2062, + "step": 35420 + }, + { + "epoch": 2.4069167006386736, + "grad_norm": 0.44836071133613586, + "learning_rate": 6.992713004484305e-06, + "loss": 4.3867, + "step": 35425 + }, + { + "epoch": 2.4072564207093357, + "grad_norm": 0.5958397388458252, + "learning_rate": 6.9922883543959784e-06, + "loss": 4.2628, + "step": 35430 + }, + { + "epoch": 2.4075961407799973, + "grad_norm": 0.36126530170440674, + "learning_rate": 6.991863704307651e-06, + "loss": 4.302, + "step": 35435 + }, + { + "epoch": 2.407935860850659, + "grad_norm": 0.42365291714668274, + "learning_rate": 6.991439054219323e-06, + "loss": 4.063, + "step": 35440 + }, + { + "epoch": 2.408275580921321, + "grad_norm": 0.6827090382575989, + "learning_rate": 6.991014404130997e-06, + "loss": 4.3586, + "step": 35445 + }, + { + "epoch": 2.4086153009919826, + "grad_norm": 0.36001577973365784, + "learning_rate": 6.99058975404267e-06, + "loss": 4.6152, + "step": 35450 + }, + { + "epoch": 2.4089550210626443, + "grad_norm": 0.3242202401161194, + "learning_rate": 6.990165103954342e-06, + "loss": 4.3104, + "step": 35455 + }, + { + "epoch": 2.4092947411333063, + "grad_norm": 0.44872230291366577, + "learning_rate": 6.989740453866015e-06, + "loss": 4.418, + "step": 35460 + }, + { + "epoch": 2.409634461203968, + "grad_norm": 0.5282010436058044, + "learning_rate": 6.989315803777687e-06, + "loss": 4.158, + "step": 35465 + }, + { + "epoch": 2.4099741812746296, + "grad_norm": 0.34259462356567383, + "learning_rate": 6.98889115368936e-06, + "loss": 4.1795, + "step": 35470 + }, + { + "epoch": 2.4103139013452917, + "grad_norm": 0.28671520948410034, + "learning_rate": 6.988466503601034e-06, + "loss": 4.3098, + "step": 35475 + }, + { + "epoch": 2.4106536214159533, + "grad_norm": 0.5190542340278625, + "learning_rate": 6.988041853512706e-06, + "loss": 4.1476, + "step": 35480 + }, + { + "epoch": 2.410993341486615, + "grad_norm": 0.48552045226097107, + "learning_rate": 6.987617203424379e-06, + "loss": 4.3919, + "step": 35485 + }, + { + "epoch": 2.4113330615572766, + "grad_norm": 0.7488798499107361, + "learning_rate": 6.987192553336052e-06, + "loss": 4.3971, + "step": 35490 + }, + { + "epoch": 2.4116727816279386, + "grad_norm": 0.4266529679298401, + "learning_rate": 6.986767903247724e-06, + "loss": 4.4021, + "step": 35495 + }, + { + "epoch": 2.4120125016986003, + "grad_norm": 0.5169879198074341, + "learning_rate": 6.986343253159398e-06, + "loss": 4.4126, + "step": 35500 + }, + { + "epoch": 2.412352221769262, + "grad_norm": 0.39245060086250305, + "learning_rate": 6.9859186030710704e-06, + "loss": 3.9504, + "step": 35505 + }, + { + "epoch": 2.412691941839924, + "grad_norm": 0.3968804180622101, + "learning_rate": 6.985493952982742e-06, + "loss": 4.2393, + "step": 35510 + }, + { + "epoch": 2.4130316619105856, + "grad_norm": 0.2879389524459839, + "learning_rate": 6.985069302894416e-06, + "loss": 4.2236, + "step": 35515 + }, + { + "epoch": 2.4133713819812472, + "grad_norm": 0.652403712272644, + "learning_rate": 6.984644652806089e-06, + "loss": 4.7265, + "step": 35520 + }, + { + "epoch": 2.4137111020519093, + "grad_norm": 0.3784124553203583, + "learning_rate": 6.984220002717761e-06, + "loss": 4.2047, + "step": 35525 + }, + { + "epoch": 2.414050822122571, + "grad_norm": 0.4855320155620575, + "learning_rate": 6.9837953526294344e-06, + "loss": 4.2832, + "step": 35530 + }, + { + "epoch": 2.4143905421932326, + "grad_norm": 0.3182666599750519, + "learning_rate": 6.983370702541107e-06, + "loss": 4.4627, + "step": 35535 + }, + { + "epoch": 2.4147302622638946, + "grad_norm": 0.3483329117298126, + "learning_rate": 6.982946052452779e-06, + "loss": 4.0975, + "step": 35540 + }, + { + "epoch": 2.4150699823345563, + "grad_norm": 0.3709977865219116, + "learning_rate": 6.982521402364453e-06, + "loss": 4.2978, + "step": 35545 + }, + { + "epoch": 2.415409702405218, + "grad_norm": 0.4416140913963318, + "learning_rate": 6.982096752276125e-06, + "loss": 4.1028, + "step": 35550 + }, + { + "epoch": 2.41574942247588, + "grad_norm": 0.3846432864665985, + "learning_rate": 6.981672102187798e-06, + "loss": 4.3931, + "step": 35555 + }, + { + "epoch": 2.4160891425465416, + "grad_norm": 0.4515693187713623, + "learning_rate": 6.981247452099471e-06, + "loss": 4.3667, + "step": 35560 + }, + { + "epoch": 2.4164288626172032, + "grad_norm": 0.3684602975845337, + "learning_rate": 6.980822802011143e-06, + "loss": 4.0311, + "step": 35565 + }, + { + "epoch": 2.4167685826878653, + "grad_norm": 0.3970387578010559, + "learning_rate": 6.980398151922816e-06, + "loss": 4.2789, + "step": 35570 + }, + { + "epoch": 2.417108302758527, + "grad_norm": 0.25439003109931946, + "learning_rate": 6.97997350183449e-06, + "loss": 4.1768, + "step": 35575 + }, + { + "epoch": 2.4174480228291886, + "grad_norm": 0.6306053996086121, + "learning_rate": 6.979548851746162e-06, + "loss": 4.228, + "step": 35580 + }, + { + "epoch": 2.4177877428998507, + "grad_norm": 0.32920873165130615, + "learning_rate": 6.979124201657834e-06, + "loss": 4.3113, + "step": 35585 + }, + { + "epoch": 2.4181274629705123, + "grad_norm": 0.39823609590530396, + "learning_rate": 6.978699551569508e-06, + "loss": 4.2981, + "step": 35590 + }, + { + "epoch": 2.418467183041174, + "grad_norm": 0.570845901966095, + "learning_rate": 6.97827490148118e-06, + "loss": 4.5745, + "step": 35595 + }, + { + "epoch": 2.418806903111836, + "grad_norm": 0.43629512190818787, + "learning_rate": 6.977850251392853e-06, + "loss": 4.3915, + "step": 35600 + }, + { + "epoch": 2.4191466231824976, + "grad_norm": 0.3331105709075928, + "learning_rate": 6.9774256013045265e-06, + "loss": 4.1256, + "step": 35605 + }, + { + "epoch": 2.4194863432531593, + "grad_norm": 0.45428818464279175, + "learning_rate": 6.977000951216198e-06, + "loss": 4.1137, + "step": 35610 + }, + { + "epoch": 2.4198260633238213, + "grad_norm": 0.447391152381897, + "learning_rate": 6.976576301127871e-06, + "loss": 4.3284, + "step": 35615 + }, + { + "epoch": 2.420165783394483, + "grad_norm": 0.35817837715148926, + "learning_rate": 6.976151651039544e-06, + "loss": 4.4382, + "step": 35620 + }, + { + "epoch": 2.4205055034651446, + "grad_norm": 0.36021214723587036, + "learning_rate": 6.975727000951217e-06, + "loss": 4.2006, + "step": 35625 + }, + { + "epoch": 2.4208452235358067, + "grad_norm": 0.3702952563762665, + "learning_rate": 6.97530235086289e-06, + "loss": 4.2143, + "step": 35630 + }, + { + "epoch": 2.4211849436064683, + "grad_norm": 0.2792362868785858, + "learning_rate": 6.974877700774562e-06, + "loss": 4.3259, + "step": 35635 + }, + { + "epoch": 2.42152466367713, + "grad_norm": 0.5753705501556396, + "learning_rate": 6.974453050686235e-06, + "loss": 3.968, + "step": 35640 + }, + { + "epoch": 2.421864383747792, + "grad_norm": 0.3737744987010956, + "learning_rate": 6.974028400597907e-06, + "loss": 4.1309, + "step": 35645 + }, + { + "epoch": 2.4222041038184536, + "grad_norm": 0.43114620447158813, + "learning_rate": 6.973603750509581e-06, + "loss": 4.0301, + "step": 35650 + }, + { + "epoch": 2.4225438238891153, + "grad_norm": 0.3548325002193451, + "learning_rate": 6.973179100421254e-06, + "loss": 4.1128, + "step": 35655 + }, + { + "epoch": 2.4228835439597773, + "grad_norm": 0.6952446699142456, + "learning_rate": 6.9727544503329256e-06, + "loss": 4.3516, + "step": 35660 + }, + { + "epoch": 2.423223264030439, + "grad_norm": 0.3984607756137848, + "learning_rate": 6.972329800244599e-06, + "loss": 4.3601, + "step": 35665 + }, + { + "epoch": 2.4235629841011006, + "grad_norm": 0.34441372752189636, + "learning_rate": 6.971905150156272e-06, + "loss": 4.2434, + "step": 35670 + }, + { + "epoch": 2.4239027041717627, + "grad_norm": 0.5078010559082031, + "learning_rate": 6.971480500067944e-06, + "loss": 4.0518, + "step": 35675 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.36100059747695923, + "learning_rate": 6.971055849979618e-06, + "loss": 4.3376, + "step": 35680 + }, + { + "epoch": 2.424582144313086, + "grad_norm": 0.5867305397987366, + "learning_rate": 6.9706311998912904e-06, + "loss": 4.1509, + "step": 35685 + }, + { + "epoch": 2.424921864383748, + "grad_norm": 0.3339923024177551, + "learning_rate": 6.970206549802962e-06, + "loss": 4.2058, + "step": 35690 + }, + { + "epoch": 2.4252615844544096, + "grad_norm": 0.3760300576686859, + "learning_rate": 6.969781899714636e-06, + "loss": 3.9956, + "step": 35695 + }, + { + "epoch": 2.4256013045250713, + "grad_norm": 0.3555503487586975, + "learning_rate": 6.969357249626309e-06, + "loss": 4.1252, + "step": 35700 + }, + { + "epoch": 2.4259410245957334, + "grad_norm": 0.45288148522377014, + "learning_rate": 6.968932599537981e-06, + "loss": 3.8603, + "step": 35705 + }, + { + "epoch": 2.426280744666395, + "grad_norm": 0.40410587191581726, + "learning_rate": 6.9685079494496544e-06, + "loss": 4.3651, + "step": 35710 + }, + { + "epoch": 2.4266204647370566, + "grad_norm": 0.38152480125427246, + "learning_rate": 6.968083299361326e-06, + "loss": 4.0761, + "step": 35715 + }, + { + "epoch": 2.4269601848077182, + "grad_norm": 0.3714294135570526, + "learning_rate": 6.967658649272999e-06, + "loss": 4.1328, + "step": 35720 + }, + { + "epoch": 2.4272999048783803, + "grad_norm": 0.30633461475372314, + "learning_rate": 6.967233999184673e-06, + "loss": 4.209, + "step": 35725 + }, + { + "epoch": 2.427639624949042, + "grad_norm": 0.4202348589897156, + "learning_rate": 6.966809349096345e-06, + "loss": 4.4539, + "step": 35730 + }, + { + "epoch": 2.4279793450197036, + "grad_norm": 0.26358407735824585, + "learning_rate": 6.966384699008018e-06, + "loss": 4.0982, + "step": 35735 + }, + { + "epoch": 2.4283190650903657, + "grad_norm": 0.2791978716850281, + "learning_rate": 6.965960048919691e-06, + "loss": 4.0262, + "step": 35740 + }, + { + "epoch": 2.4286587851610273, + "grad_norm": 0.5089686512947083, + "learning_rate": 6.965535398831363e-06, + "loss": 4.0173, + "step": 35745 + }, + { + "epoch": 2.428998505231689, + "grad_norm": 0.38691744208335876, + "learning_rate": 6.965110748743036e-06, + "loss": 4.3103, + "step": 35750 + }, + { + "epoch": 2.429338225302351, + "grad_norm": 0.4253362715244293, + "learning_rate": 6.96468609865471e-06, + "loss": 4.1134, + "step": 35755 + }, + { + "epoch": 2.4296779453730126, + "grad_norm": 0.2849259376525879, + "learning_rate": 6.964261448566382e-06, + "loss": 4.4914, + "step": 35760 + }, + { + "epoch": 2.4300176654436743, + "grad_norm": 0.41366419196128845, + "learning_rate": 6.963836798478054e-06, + "loss": 4.2478, + "step": 35765 + }, + { + "epoch": 2.4303573855143363, + "grad_norm": 0.3734844923019409, + "learning_rate": 6.963412148389728e-06, + "loss": 4.4759, + "step": 35770 + }, + { + "epoch": 2.430697105584998, + "grad_norm": 0.3955242335796356, + "learning_rate": 6.9629874983014e-06, + "loss": 4.3395, + "step": 35775 + }, + { + "epoch": 2.4310368256556596, + "grad_norm": 0.27540943026542664, + "learning_rate": 6.962562848213073e-06, + "loss": 4.2026, + "step": 35780 + }, + { + "epoch": 2.4313765457263217, + "grad_norm": 0.6206026077270508, + "learning_rate": 6.962138198124746e-06, + "loss": 4.2502, + "step": 35785 + }, + { + "epoch": 2.4317162657969833, + "grad_norm": 0.2986454963684082, + "learning_rate": 6.961713548036418e-06, + "loss": 4.0782, + "step": 35790 + }, + { + "epoch": 2.432055985867645, + "grad_norm": 0.4250440001487732, + "learning_rate": 6.961288897948091e-06, + "loss": 4.4568, + "step": 35795 + }, + { + "epoch": 2.432395705938307, + "grad_norm": 0.31960564851760864, + "learning_rate": 6.960864247859764e-06, + "loss": 4.4261, + "step": 35800 + }, + { + "epoch": 2.4327354260089686, + "grad_norm": 0.5433169603347778, + "learning_rate": 6.960439597771437e-06, + "loss": 4.1192, + "step": 35805 + }, + { + "epoch": 2.4330751460796303, + "grad_norm": 0.40922895073890686, + "learning_rate": 6.960014947683109e-06, + "loss": 4.3818, + "step": 35810 + }, + { + "epoch": 2.4334148661502923, + "grad_norm": 0.48675182461738586, + "learning_rate": 6.959590297594782e-06, + "loss": 4.1541, + "step": 35815 + }, + { + "epoch": 2.433754586220954, + "grad_norm": 0.4037831425666809, + "learning_rate": 6.959165647506455e-06, + "loss": 4.3696, + "step": 35820 + }, + { + "epoch": 2.4340943062916156, + "grad_norm": 0.8699174523353577, + "learning_rate": 6.958740997418129e-06, + "loss": 4.3793, + "step": 35825 + }, + { + "epoch": 2.4344340263622772, + "grad_norm": 0.2647295892238617, + "learning_rate": 6.958316347329801e-06, + "loss": 4.1519, + "step": 35830 + }, + { + "epoch": 2.4347737464329393, + "grad_norm": 0.5983976721763611, + "learning_rate": 6.957891697241474e-06, + "loss": 4.107, + "step": 35835 + }, + { + "epoch": 2.435113466503601, + "grad_norm": 0.4281734824180603, + "learning_rate": 6.957467047153147e-06, + "loss": 4.3771, + "step": 35840 + }, + { + "epoch": 2.4354531865742626, + "grad_norm": 0.4759114384651184, + "learning_rate": 6.957042397064819e-06, + "loss": 4.1601, + "step": 35845 + }, + { + "epoch": 2.4357929066449246, + "grad_norm": 0.4050826132297516, + "learning_rate": 6.956617746976492e-06, + "loss": 4.28, + "step": 35850 + }, + { + "epoch": 2.4361326267155863, + "grad_norm": 0.40374118089675903, + "learning_rate": 6.956193096888166e-06, + "loss": 4.104, + "step": 35855 + }, + { + "epoch": 2.436472346786248, + "grad_norm": 0.3601893484592438, + "learning_rate": 6.955768446799838e-06, + "loss": 4.4741, + "step": 35860 + }, + { + "epoch": 2.43681206685691, + "grad_norm": 0.4277615547180176, + "learning_rate": 6.95534379671151e-06, + "loss": 4.1944, + "step": 35865 + }, + { + "epoch": 2.4371517869275716, + "grad_norm": 0.3660428822040558, + "learning_rate": 6.954919146623183e-06, + "loss": 4.22, + "step": 35870 + }, + { + "epoch": 2.4374915069982332, + "grad_norm": 0.36325177550315857, + "learning_rate": 6.954494496534856e-06, + "loss": 4.3375, + "step": 35875 + }, + { + "epoch": 2.4378312270688953, + "grad_norm": 0.3560912609100342, + "learning_rate": 6.954069846446528e-06, + "loss": 4.1812, + "step": 35880 + }, + { + "epoch": 2.438170947139557, + "grad_norm": 0.5900121331214905, + "learning_rate": 6.953645196358202e-06, + "loss": 3.9655, + "step": 35885 + }, + { + "epoch": 2.4385106672102186, + "grad_norm": 0.38739311695098877, + "learning_rate": 6.953220546269874e-06, + "loss": 4.4923, + "step": 35890 + }, + { + "epoch": 2.4388503872808807, + "grad_norm": 0.27723658084869385, + "learning_rate": 6.952795896181546e-06, + "loss": 4.1021, + "step": 35895 + }, + { + "epoch": 2.4391901073515423, + "grad_norm": 0.39515143632888794, + "learning_rate": 6.95237124609322e-06, + "loss": 4.0969, + "step": 35900 + }, + { + "epoch": 2.439529827422204, + "grad_norm": 0.40817007422447205, + "learning_rate": 6.951946596004893e-06, + "loss": 4.2254, + "step": 35905 + }, + { + "epoch": 2.439869547492866, + "grad_norm": 0.321604460477829, + "learning_rate": 6.951521945916565e-06, + "loss": 4.3908, + "step": 35910 + }, + { + "epoch": 2.4402092675635276, + "grad_norm": 0.44734281301498413, + "learning_rate": 6.951097295828238e-06, + "loss": 4.2528, + "step": 35915 + }, + { + "epoch": 2.4405489876341893, + "grad_norm": 0.32795053720474243, + "learning_rate": 6.950672645739911e-06, + "loss": 4.2305, + "step": 35920 + }, + { + "epoch": 2.4408887077048513, + "grad_norm": 0.5339715480804443, + "learning_rate": 6.950247995651583e-06, + "loss": 4.0729, + "step": 35925 + }, + { + "epoch": 2.441228427775513, + "grad_norm": 0.4844982624053955, + "learning_rate": 6.949823345563257e-06, + "loss": 4.3522, + "step": 35930 + }, + { + "epoch": 2.4415681478461746, + "grad_norm": 0.34197792410850525, + "learning_rate": 6.94939869547493e-06, + "loss": 4.3201, + "step": 35935 + }, + { + "epoch": 2.4419078679168367, + "grad_norm": 0.38802847266197205, + "learning_rate": 6.9489740453866016e-06, + "loss": 4.187, + "step": 35940 + }, + { + "epoch": 2.4422475879874983, + "grad_norm": 0.36813265085220337, + "learning_rate": 6.948549395298275e-06, + "loss": 4.1664, + "step": 35945 + }, + { + "epoch": 2.44258730805816, + "grad_norm": 0.47474777698516846, + "learning_rate": 6.948124745209948e-06, + "loss": 4.1738, + "step": 35950 + }, + { + "epoch": 2.442927028128822, + "grad_norm": 0.37859490513801575, + "learning_rate": 6.94770009512162e-06, + "loss": 4.2609, + "step": 35955 + }, + { + "epoch": 2.4432667481994836, + "grad_norm": 0.30630043148994446, + "learning_rate": 6.947275445033294e-06, + "loss": 4.0486, + "step": 35960 + }, + { + "epoch": 2.4436064682701453, + "grad_norm": 0.38810911774635315, + "learning_rate": 6.9468507949449656e-06, + "loss": 4.18, + "step": 35965 + }, + { + "epoch": 2.4439461883408073, + "grad_norm": 0.39352279901504517, + "learning_rate": 6.946426144856638e-06, + "loss": 3.9531, + "step": 35970 + }, + { + "epoch": 2.444285908411469, + "grad_norm": 0.3728518486022949, + "learning_rate": 6.946001494768312e-06, + "loss": 4.2407, + "step": 35975 + }, + { + "epoch": 2.4446256284821306, + "grad_norm": 0.42894747853279114, + "learning_rate": 6.945576844679984e-06, + "loss": 3.8989, + "step": 35980 + }, + { + "epoch": 2.4449653485527927, + "grad_norm": 0.32973867654800415, + "learning_rate": 6.945152194591657e-06, + "loss": 4.0661, + "step": 35985 + }, + { + "epoch": 2.4453050686234543, + "grad_norm": 0.3630887269973755, + "learning_rate": 6.9447275445033304e-06, + "loss": 4.4196, + "step": 35990 + }, + { + "epoch": 2.445644788694116, + "grad_norm": 0.3682801127433777, + "learning_rate": 6.944302894415002e-06, + "loss": 4.1106, + "step": 35995 + }, + { + "epoch": 2.445984508764778, + "grad_norm": 0.45667633414268494, + "learning_rate": 6.943878244326675e-06, + "loss": 4.404, + "step": 36000 + }, + { + "epoch": 2.4463242288354397, + "grad_norm": 0.3920568525791168, + "learning_rate": 6.943453594238349e-06, + "loss": 4.3989, + "step": 36005 + }, + { + "epoch": 2.4466639489061013, + "grad_norm": 0.6913067102432251, + "learning_rate": 6.943028944150021e-06, + "loss": 4.3145, + "step": 36010 + }, + { + "epoch": 2.4470036689767634, + "grad_norm": 0.3142201602458954, + "learning_rate": 6.942604294061694e-06, + "loss": 4.2666, + "step": 36015 + }, + { + "epoch": 2.447343389047425, + "grad_norm": 0.3988335132598877, + "learning_rate": 6.942179643973367e-06, + "loss": 3.99, + "step": 36020 + }, + { + "epoch": 2.4476831091180866, + "grad_norm": 0.3660919666290283, + "learning_rate": 6.941754993885039e-06, + "loss": 4.2455, + "step": 36025 + }, + { + "epoch": 2.4480228291887487, + "grad_norm": 0.31114307045936584, + "learning_rate": 6.941330343796712e-06, + "loss": 4.1282, + "step": 36030 + }, + { + "epoch": 2.4483625492594103, + "grad_norm": 0.3269686996936798, + "learning_rate": 6.940905693708385e-06, + "loss": 4.3748, + "step": 36035 + }, + { + "epoch": 2.448702269330072, + "grad_norm": 0.5575769543647766, + "learning_rate": 6.940481043620058e-06, + "loss": 4.1496, + "step": 36040 + }, + { + "epoch": 2.449041989400734, + "grad_norm": 0.3239764869213104, + "learning_rate": 6.94005639353173e-06, + "loss": 4.3719, + "step": 36045 + }, + { + "epoch": 2.4493817094713957, + "grad_norm": 0.2777802050113678, + "learning_rate": 6.939631743443403e-06, + "loss": 4.41, + "step": 36050 + }, + { + "epoch": 2.4497214295420573, + "grad_norm": 0.3630428910255432, + "learning_rate": 6.939207093355076e-06, + "loss": 4.0977, + "step": 36055 + }, + { + "epoch": 2.4500611496127194, + "grad_norm": 0.6045559048652649, + "learning_rate": 6.938782443266748e-06, + "loss": 4.354, + "step": 36060 + }, + { + "epoch": 2.450400869683381, + "grad_norm": 0.5899837017059326, + "learning_rate": 6.938357793178422e-06, + "loss": 4.1844, + "step": 36065 + }, + { + "epoch": 2.4507405897540426, + "grad_norm": 0.40947744250297546, + "learning_rate": 6.937933143090094e-06, + "loss": 4.212, + "step": 36070 + }, + { + "epoch": 2.4510803098247043, + "grad_norm": 0.5364617705345154, + "learning_rate": 6.937508493001766e-06, + "loss": 4.276, + "step": 36075 + }, + { + "epoch": 2.4514200298953663, + "grad_norm": 0.3568486273288727, + "learning_rate": 6.93708384291344e-06, + "loss": 4.2697, + "step": 36080 + }, + { + "epoch": 2.451759749966028, + "grad_norm": 0.31792065501213074, + "learning_rate": 6.936659192825113e-06, + "loss": 4.3903, + "step": 36085 + }, + { + "epoch": 2.4520994700366896, + "grad_norm": 0.3287280201911926, + "learning_rate": 6.936234542736785e-06, + "loss": 4.0783, + "step": 36090 + }, + { + "epoch": 2.4524391901073517, + "grad_norm": 0.340667724609375, + "learning_rate": 6.935809892648458e-06, + "loss": 4.2016, + "step": 36095 + }, + { + "epoch": 2.4527789101780133, + "grad_norm": 0.5110222697257996, + "learning_rate": 6.935385242560131e-06, + "loss": 4.1127, + "step": 36100 + }, + { + "epoch": 2.453118630248675, + "grad_norm": 0.28724655508995056, + "learning_rate": 6.934960592471803e-06, + "loss": 4.1212, + "step": 36105 + }, + { + "epoch": 2.453458350319337, + "grad_norm": 0.31718504428863525, + "learning_rate": 6.934535942383477e-06, + "loss": 4.218, + "step": 36110 + }, + { + "epoch": 2.4537980703899986, + "grad_norm": 0.2689264714717865, + "learning_rate": 6.93411129229515e-06, + "loss": 4.3377, + "step": 36115 + }, + { + "epoch": 2.4541377904606603, + "grad_norm": 0.6539926528930664, + "learning_rate": 6.9336866422068215e-06, + "loss": 4.3254, + "step": 36120 + }, + { + "epoch": 2.4544775105313223, + "grad_norm": 0.34671762585639954, + "learning_rate": 6.933261992118495e-06, + "loss": 4.2194, + "step": 36125 + }, + { + "epoch": 2.454817230601984, + "grad_norm": 0.30101263523101807, + "learning_rate": 6.932837342030167e-06, + "loss": 4.1339, + "step": 36130 + }, + { + "epoch": 2.4551569506726456, + "grad_norm": 0.41111865639686584, + "learning_rate": 6.93241269194184e-06, + "loss": 4.3277, + "step": 36135 + }, + { + "epoch": 2.4554966707433077, + "grad_norm": 0.697486162185669, + "learning_rate": 6.931988041853514e-06, + "loss": 4.1463, + "step": 36140 + }, + { + "epoch": 2.4558363908139693, + "grad_norm": 0.4155782461166382, + "learning_rate": 6.9315633917651856e-06, + "loss": 4.5612, + "step": 36145 + }, + { + "epoch": 2.456176110884631, + "grad_norm": 0.5253707766532898, + "learning_rate": 6.931138741676858e-06, + "loss": 4.3761, + "step": 36150 + }, + { + "epoch": 2.456515830955293, + "grad_norm": 0.31343576312065125, + "learning_rate": 6.930714091588532e-06, + "loss": 4.3279, + "step": 36155 + }, + { + "epoch": 2.4568555510259547, + "grad_norm": 0.5968754291534424, + "learning_rate": 6.930289441500204e-06, + "loss": 4.4423, + "step": 36160 + }, + { + "epoch": 2.4571952710966163, + "grad_norm": 0.3112012445926666, + "learning_rate": 6.929864791411878e-06, + "loss": 4.2467, + "step": 36165 + }, + { + "epoch": 2.457534991167278, + "grad_norm": 0.3368610441684723, + "learning_rate": 6.92944014132355e-06, + "loss": 4.1506, + "step": 36170 + }, + { + "epoch": 2.45787471123794, + "grad_norm": 0.2995799481868744, + "learning_rate": 6.929015491235222e-06, + "loss": 4.1869, + "step": 36175 + }, + { + "epoch": 2.4582144313086016, + "grad_norm": 0.5281543731689453, + "learning_rate": 6.928590841146896e-06, + "loss": 4.3539, + "step": 36180 + }, + { + "epoch": 2.4585541513792633, + "grad_norm": 0.3811488747596741, + "learning_rate": 6.928166191058569e-06, + "loss": 4.2027, + "step": 36185 + }, + { + "epoch": 2.4588938714499253, + "grad_norm": 0.4819504916667938, + "learning_rate": 6.927741540970241e-06, + "loss": 4.3243, + "step": 36190 + }, + { + "epoch": 2.459233591520587, + "grad_norm": 0.33272233605384827, + "learning_rate": 6.927316890881914e-06, + "loss": 4.1898, + "step": 36195 + }, + { + "epoch": 2.4595733115912486, + "grad_norm": 0.3780234456062317, + "learning_rate": 6.926892240793587e-06, + "loss": 4.1277, + "step": 36200 + }, + { + "epoch": 2.4599130316619107, + "grad_norm": 0.45078209042549133, + "learning_rate": 6.926467590705259e-06, + "loss": 4.7166, + "step": 36205 + }, + { + "epoch": 2.4602527517325723, + "grad_norm": 0.34085333347320557, + "learning_rate": 6.926042940616933e-06, + "loss": 4.2016, + "step": 36210 + }, + { + "epoch": 2.460592471803234, + "grad_norm": 0.5397926568984985, + "learning_rate": 6.925618290528605e-06, + "loss": 4.275, + "step": 36215 + }, + { + "epoch": 2.460932191873896, + "grad_norm": 0.4616122543811798, + "learning_rate": 6.9251936404402776e-06, + "loss": 4.3326, + "step": 36220 + }, + { + "epoch": 2.4612719119445576, + "grad_norm": 0.3516686260700226, + "learning_rate": 6.924768990351951e-06, + "loss": 4.0433, + "step": 36225 + }, + { + "epoch": 2.4616116320152193, + "grad_norm": 0.3521251976490021, + "learning_rate": 6.924344340263623e-06, + "loss": 4.4752, + "step": 36230 + }, + { + "epoch": 2.4619513520858813, + "grad_norm": 0.29344701766967773, + "learning_rate": 6.923919690175296e-06, + "loss": 4.039, + "step": 36235 + }, + { + "epoch": 2.462291072156543, + "grad_norm": 0.3871248662471771, + "learning_rate": 6.92349504008697e-06, + "loss": 4.3139, + "step": 36240 + }, + { + "epoch": 2.4626307922272046, + "grad_norm": 0.311301052570343, + "learning_rate": 6.9230703899986416e-06, + "loss": 4.1227, + "step": 36245 + }, + { + "epoch": 2.4629705122978667, + "grad_norm": 0.42481377720832825, + "learning_rate": 6.922645739910314e-06, + "loss": 4.1503, + "step": 36250 + }, + { + "epoch": 2.4633102323685283, + "grad_norm": 0.4961177706718445, + "learning_rate": 6.922221089821988e-06, + "loss": 4.2083, + "step": 36255 + }, + { + "epoch": 2.46364995243919, + "grad_norm": 0.36435073614120483, + "learning_rate": 6.92179643973366e-06, + "loss": 4.0456, + "step": 36260 + }, + { + "epoch": 2.463989672509852, + "grad_norm": 0.37691161036491394, + "learning_rate": 6.921371789645333e-06, + "loss": 4.337, + "step": 36265 + }, + { + "epoch": 2.4643293925805136, + "grad_norm": 0.46803170442581177, + "learning_rate": 6.920947139557006e-06, + "loss": 4.383, + "step": 36270 + }, + { + "epoch": 2.4646691126511753, + "grad_norm": 0.4507039487361908, + "learning_rate": 6.920522489468678e-06, + "loss": 4.1422, + "step": 36275 + }, + { + "epoch": 2.4650088327218374, + "grad_norm": 0.4261987805366516, + "learning_rate": 6.920097839380351e-06, + "loss": 4.1661, + "step": 36280 + }, + { + "epoch": 2.465348552792499, + "grad_norm": 0.33731600642204285, + "learning_rate": 6.919673189292024e-06, + "loss": 4.1246, + "step": 36285 + }, + { + "epoch": 2.4656882728631606, + "grad_norm": 0.49429240822792053, + "learning_rate": 6.919248539203697e-06, + "loss": 4.2331, + "step": 36290 + }, + { + "epoch": 2.4660279929338227, + "grad_norm": 0.48718640208244324, + "learning_rate": 6.91882388911537e-06, + "loss": 3.9476, + "step": 36295 + }, + { + "epoch": 2.4663677130044843, + "grad_norm": 0.41819339990615845, + "learning_rate": 6.918399239027042e-06, + "loss": 4.1872, + "step": 36300 + }, + { + "epoch": 2.466707433075146, + "grad_norm": 0.28965362906455994, + "learning_rate": 6.917974588938715e-06, + "loss": 4.1187, + "step": 36305 + }, + { + "epoch": 2.467047153145808, + "grad_norm": 0.3665783703327179, + "learning_rate": 6.917549938850387e-06, + "loss": 4.3715, + "step": 36310 + }, + { + "epoch": 2.4673868732164697, + "grad_norm": 0.25535616278648376, + "learning_rate": 6.917125288762061e-06, + "loss": 4.2889, + "step": 36315 + }, + { + "epoch": 2.4677265932871313, + "grad_norm": 0.3305842876434326, + "learning_rate": 6.916700638673734e-06, + "loss": 4.3482, + "step": 36320 + }, + { + "epoch": 2.4680663133577934, + "grad_norm": 0.36750543117523193, + "learning_rate": 6.9162759885854055e-06, + "loss": 4.1448, + "step": 36325 + }, + { + "epoch": 2.468406033428455, + "grad_norm": 0.30002957582473755, + "learning_rate": 6.915851338497079e-06, + "loss": 4.0204, + "step": 36330 + }, + { + "epoch": 2.4687457534991166, + "grad_norm": 0.3331378102302551, + "learning_rate": 6.915426688408752e-06, + "loss": 4.2493, + "step": 36335 + }, + { + "epoch": 2.4690854735697787, + "grad_norm": 0.42510974407196045, + "learning_rate": 6.915002038320424e-06, + "loss": 3.9563, + "step": 36340 + }, + { + "epoch": 2.4694251936404403, + "grad_norm": 0.3744131624698639, + "learning_rate": 6.914577388232098e-06, + "loss": 4.1877, + "step": 36345 + }, + { + "epoch": 2.469764913711102, + "grad_norm": 0.3998700976371765, + "learning_rate": 6.91415273814377e-06, + "loss": 4.46, + "step": 36350 + }, + { + "epoch": 2.470104633781764, + "grad_norm": 0.4230731725692749, + "learning_rate": 6.913728088055442e-06, + "loss": 4.0207, + "step": 36355 + }, + { + "epoch": 2.4704443538524257, + "grad_norm": 0.6566610336303711, + "learning_rate": 6.913303437967116e-06, + "loss": 4.0493, + "step": 36360 + }, + { + "epoch": 2.4707840739230873, + "grad_norm": 0.30805012583732605, + "learning_rate": 6.912878787878789e-06, + "loss": 4.4299, + "step": 36365 + }, + { + "epoch": 2.4711237939937494, + "grad_norm": 0.3820827305316925, + "learning_rate": 6.912454137790461e-06, + "loss": 4.2487, + "step": 36370 + }, + { + "epoch": 2.471463514064411, + "grad_norm": 0.3237666189670563, + "learning_rate": 6.912029487702134e-06, + "loss": 4.0278, + "step": 36375 + }, + { + "epoch": 2.4718032341350726, + "grad_norm": 0.5699762105941772, + "learning_rate": 6.911604837613806e-06, + "loss": 4.3355, + "step": 36380 + }, + { + "epoch": 2.4721429542057347, + "grad_norm": 0.33514031767845154, + "learning_rate": 6.911180187525479e-06, + "loss": 4.275, + "step": 36385 + }, + { + "epoch": 2.4724826742763963, + "grad_norm": 0.5163676142692566, + "learning_rate": 6.910755537437153e-06, + "loss": 4.114, + "step": 36390 + }, + { + "epoch": 2.472822394347058, + "grad_norm": 0.38304412364959717, + "learning_rate": 6.910330887348825e-06, + "loss": 4.1262, + "step": 36395 + }, + { + "epoch": 2.47316211441772, + "grad_norm": 0.36253926157951355, + "learning_rate": 6.9099062372604975e-06, + "loss": 4.1734, + "step": 36400 + }, + { + "epoch": 2.4735018344883817, + "grad_norm": 0.3099172115325928, + "learning_rate": 6.909481587172171e-06, + "loss": 4.2845, + "step": 36405 + }, + { + "epoch": 2.4738415545590433, + "grad_norm": 0.2637651562690735, + "learning_rate": 6.909056937083843e-06, + "loss": 4.161, + "step": 36410 + }, + { + "epoch": 2.474181274629705, + "grad_norm": 0.4856361448764801, + "learning_rate": 6.908632286995516e-06, + "loss": 4.0229, + "step": 36415 + }, + { + "epoch": 2.474520994700367, + "grad_norm": 0.5347649455070496, + "learning_rate": 6.90820763690719e-06, + "loss": 4.205, + "step": 36420 + }, + { + "epoch": 2.4748607147710286, + "grad_norm": 0.3525322377681732, + "learning_rate": 6.9077829868188615e-06, + "loss": 4.1571, + "step": 36425 + }, + { + "epoch": 2.4752004348416903, + "grad_norm": 0.31760141253471375, + "learning_rate": 6.907358336730534e-06, + "loss": 4.1624, + "step": 36430 + }, + { + "epoch": 2.4755401549123524, + "grad_norm": 0.5642611980438232, + "learning_rate": 6.906933686642208e-06, + "loss": 4.1111, + "step": 36435 + }, + { + "epoch": 2.475879874983014, + "grad_norm": 0.44251522421836853, + "learning_rate": 6.90650903655388e-06, + "loss": 4.3226, + "step": 36440 + }, + { + "epoch": 2.4762195950536756, + "grad_norm": 0.3315935432910919, + "learning_rate": 6.906084386465553e-06, + "loss": 4.4146, + "step": 36445 + }, + { + "epoch": 2.4765593151243377, + "grad_norm": 0.20589157938957214, + "learning_rate": 6.9056597363772256e-06, + "loss": 4.2235, + "step": 36450 + }, + { + "epoch": 2.4768990351949993, + "grad_norm": 0.6778982877731323, + "learning_rate": 6.905235086288898e-06, + "loss": 4.4174, + "step": 36455 + }, + { + "epoch": 2.477238755265661, + "grad_norm": 0.3128318190574646, + "learning_rate": 6.904810436200571e-06, + "loss": 4.2948, + "step": 36460 + }, + { + "epoch": 2.477578475336323, + "grad_norm": 0.3537037670612335, + "learning_rate": 6.904385786112244e-06, + "loss": 4.2617, + "step": 36465 + }, + { + "epoch": 2.4779181954069847, + "grad_norm": 0.45705047249794006, + "learning_rate": 6.903961136023917e-06, + "loss": 4.0819, + "step": 36470 + }, + { + "epoch": 2.4782579154776463, + "grad_norm": 0.3218440115451813, + "learning_rate": 6.903536485935589e-06, + "loss": 4.2604, + "step": 36475 + }, + { + "epoch": 2.4785976355483084, + "grad_norm": 0.4102398455142975, + "learning_rate": 6.903111835847262e-06, + "loss": 4.279, + "step": 36480 + }, + { + "epoch": 2.47893735561897, + "grad_norm": 0.359722763299942, + "learning_rate": 6.902687185758935e-06, + "loss": 4.1678, + "step": 36485 + }, + { + "epoch": 2.4792770756896316, + "grad_norm": 0.42945125699043274, + "learning_rate": 6.902262535670607e-06, + "loss": 4.2118, + "step": 36490 + }, + { + "epoch": 2.4796167957602937, + "grad_norm": 0.40288785099983215, + "learning_rate": 6.901837885582281e-06, + "loss": 4.1741, + "step": 36495 + }, + { + "epoch": 2.4799565158309553, + "grad_norm": 0.3604901432991028, + "learning_rate": 6.9014132354939536e-06, + "loss": 4.2149, + "step": 36500 + }, + { + "epoch": 2.480296235901617, + "grad_norm": 0.27973756194114685, + "learning_rate": 6.900988585405627e-06, + "loss": 4.0526, + "step": 36505 + }, + { + "epoch": 2.4806359559722786, + "grad_norm": 0.2651127278804779, + "learning_rate": 6.900563935317299e-06, + "loss": 4.1908, + "step": 36510 + }, + { + "epoch": 2.4809756760429407, + "grad_norm": 0.48289451003074646, + "learning_rate": 6.900139285228972e-06, + "loss": 4.1346, + "step": 36515 + }, + { + "epoch": 2.4813153961136023, + "grad_norm": 0.42716163396835327, + "learning_rate": 6.899714635140646e-06, + "loss": 4.026, + "step": 36520 + }, + { + "epoch": 2.481655116184264, + "grad_norm": 0.48720791935920715, + "learning_rate": 6.8992899850523176e-06, + "loss": 4.3007, + "step": 36525 + }, + { + "epoch": 2.481994836254926, + "grad_norm": 0.5638366937637329, + "learning_rate": 6.89886533496399e-06, + "loss": 4.2347, + "step": 36530 + }, + { + "epoch": 2.4823345563255876, + "grad_norm": 0.32325872778892517, + "learning_rate": 6.898440684875663e-06, + "loss": 4.1801, + "step": 36535 + }, + { + "epoch": 2.4826742763962493, + "grad_norm": 0.26659107208251953, + "learning_rate": 6.898016034787336e-06, + "loss": 4.1389, + "step": 36540 + }, + { + "epoch": 2.4830139964669113, + "grad_norm": 0.5796077847480774, + "learning_rate": 6.897591384699009e-06, + "loss": 4.2335, + "step": 36545 + }, + { + "epoch": 2.483353716537573, + "grad_norm": 0.45213526487350464, + "learning_rate": 6.8971667346106816e-06, + "loss": 4.0017, + "step": 36550 + }, + { + "epoch": 2.4836934366082346, + "grad_norm": 0.3512641191482544, + "learning_rate": 6.896742084522354e-06, + "loss": 4.1149, + "step": 36555 + }, + { + "epoch": 2.4840331566788967, + "grad_norm": 0.5226696729660034, + "learning_rate": 6.896317434434026e-06, + "loss": 4.1259, + "step": 36560 + }, + { + "epoch": 2.4843728767495583, + "grad_norm": 0.39468422532081604, + "learning_rate": 6.8958927843457e-06, + "loss": 4.0691, + "step": 36565 + }, + { + "epoch": 2.48471259682022, + "grad_norm": 0.4285683333873749, + "learning_rate": 6.895468134257373e-06, + "loss": 4.1917, + "step": 36570 + }, + { + "epoch": 2.485052316890882, + "grad_norm": 0.3660343885421753, + "learning_rate": 6.895043484169045e-06, + "loss": 4.3538, + "step": 36575 + }, + { + "epoch": 2.4853920369615436, + "grad_norm": 0.4041832983493805, + "learning_rate": 6.894618834080718e-06, + "loss": 4.4443, + "step": 36580 + }, + { + "epoch": 2.4857317570322053, + "grad_norm": 0.337655246257782, + "learning_rate": 6.894194183992391e-06, + "loss": 4.217, + "step": 36585 + }, + { + "epoch": 2.4860714771028674, + "grad_norm": 0.3065522313117981, + "learning_rate": 6.893769533904063e-06, + "loss": 4.3115, + "step": 36590 + }, + { + "epoch": 2.486411197173529, + "grad_norm": 0.3836512565612793, + "learning_rate": 6.893344883815737e-06, + "loss": 4.0399, + "step": 36595 + }, + { + "epoch": 2.4867509172441906, + "grad_norm": 0.6738848090171814, + "learning_rate": 6.89292023372741e-06, + "loss": 4.3031, + "step": 36600 + }, + { + "epoch": 2.4870906373148527, + "grad_norm": 0.43367278575897217, + "learning_rate": 6.8924955836390815e-06, + "loss": 3.9992, + "step": 36605 + }, + { + "epoch": 2.4874303573855143, + "grad_norm": 0.49371337890625, + "learning_rate": 6.892070933550755e-06, + "loss": 4.3723, + "step": 36610 + }, + { + "epoch": 2.487770077456176, + "grad_norm": 0.4025973975658417, + "learning_rate": 6.891646283462428e-06, + "loss": 4.3751, + "step": 36615 + }, + { + "epoch": 2.488109797526838, + "grad_norm": 0.39580878615379333, + "learning_rate": 6.8912216333741e-06, + "loss": 4.3617, + "step": 36620 + }, + { + "epoch": 2.4884495175974997, + "grad_norm": 0.29623833298683167, + "learning_rate": 6.890796983285774e-06, + "loss": 4.1893, + "step": 36625 + }, + { + "epoch": 2.4887892376681613, + "grad_norm": 0.4238971769809723, + "learning_rate": 6.8903723331974455e-06, + "loss": 4.0004, + "step": 36630 + }, + { + "epoch": 2.4891289577388234, + "grad_norm": 0.3258155882358551, + "learning_rate": 6.889947683109118e-06, + "loss": 4.1391, + "step": 36635 + }, + { + "epoch": 2.489468677809485, + "grad_norm": 0.34543728828430176, + "learning_rate": 6.889523033020792e-06, + "loss": 4.0929, + "step": 36640 + }, + { + "epoch": 2.4898083978801466, + "grad_norm": 0.42513078451156616, + "learning_rate": 6.889098382932464e-06, + "loss": 4.1937, + "step": 36645 + }, + { + "epoch": 2.4901481179508087, + "grad_norm": 0.4687199294567108, + "learning_rate": 6.888673732844137e-06, + "loss": 4.159, + "step": 36650 + }, + { + "epoch": 2.4904878380214703, + "grad_norm": 0.392683207988739, + "learning_rate": 6.88824908275581e-06, + "loss": 4.4877, + "step": 36655 + }, + { + "epoch": 2.490827558092132, + "grad_norm": 0.5319814085960388, + "learning_rate": 6.887824432667482e-06, + "loss": 4.0436, + "step": 36660 + }, + { + "epoch": 2.491167278162794, + "grad_norm": 0.3852959871292114, + "learning_rate": 6.887399782579155e-06, + "loss": 4.1783, + "step": 36665 + }, + { + "epoch": 2.4915069982334557, + "grad_norm": 0.35050275921821594, + "learning_rate": 6.886975132490829e-06, + "loss": 4.0705, + "step": 36670 + }, + { + "epoch": 2.4918467183041173, + "grad_norm": 0.287328839302063, + "learning_rate": 6.886550482402501e-06, + "loss": 3.7662, + "step": 36675 + }, + { + "epoch": 2.4921864383747794, + "grad_norm": 0.5556873083114624, + "learning_rate": 6.8861258323141735e-06, + "loss": 4.1495, + "step": 36680 + }, + { + "epoch": 2.492526158445441, + "grad_norm": 0.34081923961639404, + "learning_rate": 6.885701182225847e-06, + "loss": 4.0557, + "step": 36685 + }, + { + "epoch": 2.4928658785161026, + "grad_norm": 0.3566417098045349, + "learning_rate": 6.885276532137519e-06, + "loss": 4.0549, + "step": 36690 + }, + { + "epoch": 2.4932055985867647, + "grad_norm": 0.29073256254196167, + "learning_rate": 6.884851882049192e-06, + "loss": 4.1649, + "step": 36695 + }, + { + "epoch": 2.4935453186574263, + "grad_norm": 0.7008209228515625, + "learning_rate": 6.884427231960865e-06, + "loss": 4.1837, + "step": 36700 + }, + { + "epoch": 2.493885038728088, + "grad_norm": 0.41210493445396423, + "learning_rate": 6.8840025818725375e-06, + "loss": 4.033, + "step": 36705 + }, + { + "epoch": 2.49422475879875, + "grad_norm": 0.4007455110549927, + "learning_rate": 6.88357793178421e-06, + "loss": 4.2169, + "step": 36710 + }, + { + "epoch": 2.4945644788694117, + "grad_norm": 0.43020099401474, + "learning_rate": 6.883153281695883e-06, + "loss": 4.2953, + "step": 36715 + }, + { + "epoch": 2.4949041989400733, + "grad_norm": 0.3737272024154663, + "learning_rate": 6.882728631607556e-06, + "loss": 4.1196, + "step": 36720 + }, + { + "epoch": 2.4952439190107354, + "grad_norm": 0.47279244661331177, + "learning_rate": 6.882303981519228e-06, + "loss": 4.2626, + "step": 36725 + }, + { + "epoch": 2.495583639081397, + "grad_norm": 0.3295413851737976, + "learning_rate": 6.8818793314309015e-06, + "loss": 4.2372, + "step": 36730 + }, + { + "epoch": 2.4959233591520587, + "grad_norm": 0.4434325098991394, + "learning_rate": 6.881454681342574e-06, + "loss": 4.3175, + "step": 36735 + }, + { + "epoch": 2.4962630792227207, + "grad_norm": 0.6940736770629883, + "learning_rate": 6.881030031254246e-06, + "loss": 4.3367, + "step": 36740 + }, + { + "epoch": 2.4966027992933824, + "grad_norm": 0.6637571454048157, + "learning_rate": 6.88060538116592e-06, + "loss": 4.1967, + "step": 36745 + }, + { + "epoch": 2.496942519364044, + "grad_norm": 0.3561144471168518, + "learning_rate": 6.880180731077593e-06, + "loss": 4.2827, + "step": 36750 + }, + { + "epoch": 2.4972822394347056, + "grad_norm": 0.4187156856060028, + "learning_rate": 6.879756080989265e-06, + "loss": 4.0667, + "step": 36755 + }, + { + "epoch": 2.4976219595053677, + "grad_norm": 0.3994169235229492, + "learning_rate": 6.879331430900938e-06, + "loss": 4.0161, + "step": 36760 + }, + { + "epoch": 2.4979616795760293, + "grad_norm": 0.35404449701309204, + "learning_rate": 6.878906780812611e-06, + "loss": 4.028, + "step": 36765 + }, + { + "epoch": 2.498301399646691, + "grad_norm": 0.3170236647129059, + "learning_rate": 6.878482130724283e-06, + "loss": 4.2158, + "step": 36770 + }, + { + "epoch": 2.498641119717353, + "grad_norm": 0.439581960439682, + "learning_rate": 6.878057480635957e-06, + "loss": 4.2001, + "step": 36775 + }, + { + "epoch": 2.4989808397880147, + "grad_norm": 0.4561178386211395, + "learning_rate": 6.8776328305476296e-06, + "loss": 4.1607, + "step": 36780 + }, + { + "epoch": 2.4993205598586763, + "grad_norm": 0.30842289328575134, + "learning_rate": 6.8772081804593015e-06, + "loss": 4.1234, + "step": 36785 + }, + { + "epoch": 2.4996602799293384, + "grad_norm": 0.33449748158454895, + "learning_rate": 6.876783530370975e-06, + "loss": 4.0951, + "step": 36790 + }, + { + "epoch": 2.5, + "grad_norm": 0.3575311303138733, + "learning_rate": 6.876358880282647e-06, + "loss": 4.0064, + "step": 36795 + }, + { + "epoch": 2.5003397200706616, + "grad_norm": 0.3853917717933655, + "learning_rate": 6.87593423019432e-06, + "loss": 4.3671, + "step": 36800 + }, + { + "epoch": 2.5006794401413237, + "grad_norm": 0.5558829307556152, + "learning_rate": 6.8755095801059936e-06, + "loss": 4.3126, + "step": 36805 + }, + { + "epoch": 2.5010191602119853, + "grad_norm": 0.39568760991096497, + "learning_rate": 6.8750849300176655e-06, + "loss": 4.3459, + "step": 36810 + }, + { + "epoch": 2.501358880282647, + "grad_norm": 0.32255345582962036, + "learning_rate": 6.874660279929338e-06, + "loss": 4.1886, + "step": 36815 + }, + { + "epoch": 2.5016986003533086, + "grad_norm": 0.3413611054420471, + "learning_rate": 6.874235629841012e-06, + "loss": 4.3042, + "step": 36820 + }, + { + "epoch": 2.5020383204239707, + "grad_norm": 0.369412362575531, + "learning_rate": 6.873810979752684e-06, + "loss": 4.4049, + "step": 36825 + }, + { + "epoch": 2.5023780404946323, + "grad_norm": 0.5605555176734924, + "learning_rate": 6.873386329664357e-06, + "loss": 4.1706, + "step": 36830 + }, + { + "epoch": 2.502717760565294, + "grad_norm": 0.3512830138206482, + "learning_rate": 6.87296167957603e-06, + "loss": 4.0088, + "step": 36835 + }, + { + "epoch": 2.503057480635956, + "grad_norm": 0.25003138184547424, + "learning_rate": 6.872537029487702e-06, + "loss": 4.1548, + "step": 36840 + }, + { + "epoch": 2.5033972007066176, + "grad_norm": 0.42777472734451294, + "learning_rate": 6.872112379399375e-06, + "loss": 4.403, + "step": 36845 + }, + { + "epoch": 2.5037369207772793, + "grad_norm": 0.33780717849731445, + "learning_rate": 6.871687729311049e-06, + "loss": 4.154, + "step": 36850 + }, + { + "epoch": 2.5040766408479413, + "grad_norm": 0.4462963938713074, + "learning_rate": 6.871263079222721e-06, + "loss": 4.1767, + "step": 36855 + }, + { + "epoch": 2.504416360918603, + "grad_norm": 0.3130565583705902, + "learning_rate": 6.870838429134394e-06, + "loss": 4.4218, + "step": 36860 + }, + { + "epoch": 2.5047560809892646, + "grad_norm": 0.4175531566143036, + "learning_rate": 6.870413779046067e-06, + "loss": 4.2112, + "step": 36865 + }, + { + "epoch": 2.5050958010599267, + "grad_norm": 0.294901967048645, + "learning_rate": 6.869989128957739e-06, + "loss": 4.005, + "step": 36870 + }, + { + "epoch": 2.5054355211305883, + "grad_norm": 0.3155161142349243, + "learning_rate": 6.869564478869413e-06, + "loss": 4.2191, + "step": 36875 + }, + { + "epoch": 2.50577524120125, + "grad_norm": 0.6071690320968628, + "learning_rate": 6.869139828781085e-06, + "loss": 4.2275, + "step": 36880 + }, + { + "epoch": 2.506114961271912, + "grad_norm": 0.3851169943809509, + "learning_rate": 6.8687151786927575e-06, + "loss": 3.9133, + "step": 36885 + }, + { + "epoch": 2.5064546813425737, + "grad_norm": 0.35326358675956726, + "learning_rate": 6.868290528604431e-06, + "loss": 4.1353, + "step": 36890 + }, + { + "epoch": 2.5067944014132353, + "grad_norm": 0.4010222852230072, + "learning_rate": 6.867865878516103e-06, + "loss": 4.123, + "step": 36895 + }, + { + "epoch": 2.5071341214838974, + "grad_norm": 0.39302846789360046, + "learning_rate": 6.867441228427776e-06, + "loss": 4.0839, + "step": 36900 + }, + { + "epoch": 2.507473841554559, + "grad_norm": 0.3035372197628021, + "learning_rate": 6.86701657833945e-06, + "loss": 4.1332, + "step": 36905 + }, + { + "epoch": 2.5078135616252206, + "grad_norm": 0.2695900499820709, + "learning_rate": 6.8665919282511215e-06, + "loss": 3.9656, + "step": 36910 + }, + { + "epoch": 2.5081532816958827, + "grad_norm": 0.6487801671028137, + "learning_rate": 6.866167278162794e-06, + "loss": 4.2642, + "step": 36915 + }, + { + "epoch": 2.5084930017665443, + "grad_norm": 0.3451882004737854, + "learning_rate": 6.865742628074468e-06, + "loss": 4.0561, + "step": 36920 + }, + { + "epoch": 2.508832721837206, + "grad_norm": 0.4102044105529785, + "learning_rate": 6.86531797798614e-06, + "loss": 4.2358, + "step": 36925 + }, + { + "epoch": 2.509172441907868, + "grad_norm": 0.2809232771396637, + "learning_rate": 6.864893327897813e-06, + "loss": 3.9956, + "step": 36930 + }, + { + "epoch": 2.5095121619785297, + "grad_norm": 0.4230904281139374, + "learning_rate": 6.864468677809486e-06, + "loss": 4.3686, + "step": 36935 + }, + { + "epoch": 2.5098518820491913, + "grad_norm": 0.2672853469848633, + "learning_rate": 6.864044027721158e-06, + "loss": 4.1703, + "step": 36940 + }, + { + "epoch": 2.5101916021198534, + "grad_norm": 0.3171551823616028, + "learning_rate": 6.863619377632831e-06, + "loss": 4.0098, + "step": 36945 + }, + { + "epoch": 2.510531322190515, + "grad_norm": 0.4196888506412506, + "learning_rate": 6.863194727544504e-06, + "loss": 4.2471, + "step": 36950 + }, + { + "epoch": 2.5108710422611766, + "grad_norm": 0.4035492539405823, + "learning_rate": 6.862770077456177e-06, + "loss": 4.1629, + "step": 36955 + }, + { + "epoch": 2.5112107623318387, + "grad_norm": 0.4023241698741913, + "learning_rate": 6.8623454273678495e-06, + "loss": 4.1881, + "step": 36960 + }, + { + "epoch": 2.5115504824025003, + "grad_norm": 0.34987673163414, + "learning_rate": 6.861920777279522e-06, + "loss": 4.0484, + "step": 36965 + }, + { + "epoch": 2.511890202473162, + "grad_norm": 0.37471386790275574, + "learning_rate": 6.861496127191195e-06, + "loss": 4.235, + "step": 36970 + }, + { + "epoch": 2.512229922543824, + "grad_norm": 0.2839449942111969, + "learning_rate": 6.861071477102867e-06, + "loss": 4.2102, + "step": 36975 + }, + { + "epoch": 2.5125696426144857, + "grad_norm": 0.3965788185596466, + "learning_rate": 6.860646827014541e-06, + "loss": 4.2045, + "step": 36980 + }, + { + "epoch": 2.5129093626851473, + "grad_norm": 0.324291855096817, + "learning_rate": 6.8602221769262135e-06, + "loss": 4.0645, + "step": 36985 + }, + { + "epoch": 2.5132490827558094, + "grad_norm": 0.3948322832584381, + "learning_rate": 6.8597975268378855e-06, + "loss": 4.2628, + "step": 36990 + }, + { + "epoch": 2.513588802826471, + "grad_norm": 0.2915910482406616, + "learning_rate": 6.859372876749559e-06, + "loss": 4.3394, + "step": 36995 + }, + { + "epoch": 2.5139285228971326, + "grad_norm": 0.4871560037136078, + "learning_rate": 6.858948226661232e-06, + "loss": 4.2101, + "step": 37000 + }, + { + "epoch": 2.5142682429677947, + "grad_norm": 0.2927365005016327, + "learning_rate": 6.858523576572904e-06, + "loss": 4.508, + "step": 37005 + }, + { + "epoch": 2.5146079630384564, + "grad_norm": 0.32833242416381836, + "learning_rate": 6.8580989264845775e-06, + "loss": 3.9434, + "step": 37010 + }, + { + "epoch": 2.514947683109118, + "grad_norm": 0.5123338103294373, + "learning_rate": 6.85767427639625e-06, + "loss": 4.1464, + "step": 37015 + }, + { + "epoch": 2.51528740317978, + "grad_norm": 0.2841409742832184, + "learning_rate": 6.857249626307922e-06, + "loss": 4.1409, + "step": 37020 + }, + { + "epoch": 2.5156271232504417, + "grad_norm": 0.34386545419692993, + "learning_rate": 6.856824976219596e-06, + "loss": 4.1472, + "step": 37025 + }, + { + "epoch": 2.5159668433211033, + "grad_norm": 0.33741435408592224, + "learning_rate": 6.856400326131269e-06, + "loss": 4.0727, + "step": 37030 + }, + { + "epoch": 2.5163065633917654, + "grad_norm": 0.3846331238746643, + "learning_rate": 6.855975676042941e-06, + "loss": 3.96, + "step": 37035 + }, + { + "epoch": 2.516646283462427, + "grad_norm": 0.7532127499580383, + "learning_rate": 6.855551025954614e-06, + "loss": 4.0986, + "step": 37040 + }, + { + "epoch": 2.5169860035330887, + "grad_norm": 0.4751978814601898, + "learning_rate": 6.855126375866286e-06, + "loss": 4.0297, + "step": 37045 + }, + { + "epoch": 2.5173257236037507, + "grad_norm": 0.42432770133018494, + "learning_rate": 6.854701725777959e-06, + "loss": 4.2303, + "step": 37050 + }, + { + "epoch": 2.5176654436744124, + "grad_norm": 0.389350026845932, + "learning_rate": 6.854277075689633e-06, + "loss": 4.2117, + "step": 37055 + }, + { + "epoch": 2.518005163745074, + "grad_norm": 0.6003375053405762, + "learning_rate": 6.853852425601305e-06, + "loss": 4.1914, + "step": 37060 + }, + { + "epoch": 2.518344883815736, + "grad_norm": 0.3597167432308197, + "learning_rate": 6.8534277755129775e-06, + "loss": 4.1486, + "step": 37065 + }, + { + "epoch": 2.5186846038863977, + "grad_norm": 0.3167712390422821, + "learning_rate": 6.853003125424651e-06, + "loss": 4.3104, + "step": 37070 + }, + { + "epoch": 2.5190243239570593, + "grad_norm": 0.2838726341724396, + "learning_rate": 6.852578475336323e-06, + "loss": 3.791, + "step": 37075 + }, + { + "epoch": 2.5193640440277214, + "grad_norm": 0.459964781999588, + "learning_rate": 6.852153825247996e-06, + "loss": 4.358, + "step": 37080 + }, + { + "epoch": 2.519703764098383, + "grad_norm": 0.28936418890953064, + "learning_rate": 6.8517291751596696e-06, + "loss": 4.06, + "step": 37085 + }, + { + "epoch": 2.5200434841690447, + "grad_norm": 0.2701174318790436, + "learning_rate": 6.8513045250713415e-06, + "loss": 4.0458, + "step": 37090 + }, + { + "epoch": 2.5203832042397067, + "grad_norm": 0.4081540107727051, + "learning_rate": 6.850879874983014e-06, + "loss": 4.4129, + "step": 37095 + }, + { + "epoch": 2.5207229243103684, + "grad_norm": 0.3368125557899475, + "learning_rate": 6.850455224894688e-06, + "loss": 4.1434, + "step": 37100 + }, + { + "epoch": 2.52106264438103, + "grad_norm": 0.3462597727775574, + "learning_rate": 6.85003057480636e-06, + "loss": 3.9542, + "step": 37105 + }, + { + "epoch": 2.521402364451692, + "grad_norm": 0.3538810610771179, + "learning_rate": 6.849605924718033e-06, + "loss": 4.3041, + "step": 37110 + }, + { + "epoch": 2.5217420845223537, + "grad_norm": 0.4081428050994873, + "learning_rate": 6.849181274629706e-06, + "loss": 4.198, + "step": 37115 + }, + { + "epoch": 2.5220818045930153, + "grad_norm": 0.33884650468826294, + "learning_rate": 6.848756624541378e-06, + "loss": 4.2589, + "step": 37120 + }, + { + "epoch": 2.522421524663677, + "grad_norm": 0.38238853216171265, + "learning_rate": 6.848331974453051e-06, + "loss": 4.0953, + "step": 37125 + }, + { + "epoch": 2.522761244734339, + "grad_norm": 0.48426833748817444, + "learning_rate": 6.847907324364724e-06, + "loss": 4.2053, + "step": 37130 + }, + { + "epoch": 2.5231009648050007, + "grad_norm": 0.31601276993751526, + "learning_rate": 6.847482674276397e-06, + "loss": 4.2683, + "step": 37135 + }, + { + "epoch": 2.5234406848756623, + "grad_norm": 0.2451317310333252, + "learning_rate": 6.847058024188069e-06, + "loss": 4.2814, + "step": 37140 + }, + { + "epoch": 2.5237804049463244, + "grad_norm": 0.3949219584465027, + "learning_rate": 6.846633374099742e-06, + "loss": 3.9759, + "step": 37145 + }, + { + "epoch": 2.524120125016986, + "grad_norm": 0.3375106155872345, + "learning_rate": 6.846208724011415e-06, + "loss": 4.2397, + "step": 37150 + }, + { + "epoch": 2.5244598450876476, + "grad_norm": 0.40398693084716797, + "learning_rate": 6.845784073923087e-06, + "loss": 4.225, + "step": 37155 + }, + { + "epoch": 2.5247995651583093, + "grad_norm": 0.47169235348701477, + "learning_rate": 6.845359423834761e-06, + "loss": 4.0327, + "step": 37160 + }, + { + "epoch": 2.5251392852289714, + "grad_norm": 0.48615744709968567, + "learning_rate": 6.8449347737464335e-06, + "loss": 4.4179, + "step": 37165 + }, + { + "epoch": 2.525479005299633, + "grad_norm": 0.363066703081131, + "learning_rate": 6.8445101236581055e-06, + "loss": 4.2586, + "step": 37170 + }, + { + "epoch": 2.5258187253702946, + "grad_norm": 0.4523671269416809, + "learning_rate": 6.844085473569779e-06, + "loss": 4.2722, + "step": 37175 + }, + { + "epoch": 2.5261584454409567, + "grad_norm": 0.42143553495407104, + "learning_rate": 6.843660823481452e-06, + "loss": 4.0954, + "step": 37180 + }, + { + "epoch": 2.5264981655116183, + "grad_norm": 0.3428216874599457, + "learning_rate": 6.843236173393124e-06, + "loss": 3.9931, + "step": 37185 + }, + { + "epoch": 2.52683788558228, + "grad_norm": 0.2598542273044586, + "learning_rate": 6.8428115233047975e-06, + "loss": 4.0306, + "step": 37190 + }, + { + "epoch": 2.527177605652942, + "grad_norm": 0.37398067116737366, + "learning_rate": 6.84238687321647e-06, + "loss": 4.3304, + "step": 37195 + }, + { + "epoch": 2.5275173257236037, + "grad_norm": 0.26748496294021606, + "learning_rate": 6.841962223128143e-06, + "loss": 4.1586, + "step": 37200 + }, + { + "epoch": 2.5278570457942653, + "grad_norm": 0.29368430376052856, + "learning_rate": 6.841537573039816e-06, + "loss": 4.1018, + "step": 37205 + }, + { + "epoch": 2.5281967658649274, + "grad_norm": 0.34825843572616577, + "learning_rate": 6.841112922951489e-06, + "loss": 4.2945, + "step": 37210 + }, + { + "epoch": 2.528536485935589, + "grad_norm": 0.3958490490913391, + "learning_rate": 6.8406882728631615e-06, + "loss": 4.036, + "step": 37215 + }, + { + "epoch": 2.5288762060062506, + "grad_norm": 0.30095067620277405, + "learning_rate": 6.840263622774834e-06, + "loss": 4.2459, + "step": 37220 + }, + { + "epoch": 2.5292159260769127, + "grad_norm": 0.5035551190376282, + "learning_rate": 6.839838972686506e-06, + "loss": 4.2247, + "step": 37225 + }, + { + "epoch": 2.5295556461475743, + "grad_norm": 0.3856772184371948, + "learning_rate": 6.83941432259818e-06, + "loss": 4.1511, + "step": 37230 + }, + { + "epoch": 2.529895366218236, + "grad_norm": 0.6888595223426819, + "learning_rate": 6.838989672509853e-06, + "loss": 4.0864, + "step": 37235 + }, + { + "epoch": 2.530235086288898, + "grad_norm": 0.28464290499687195, + "learning_rate": 6.838565022421525e-06, + "loss": 4.1709, + "step": 37240 + }, + { + "epoch": 2.5305748063595597, + "grad_norm": 0.5372489094734192, + "learning_rate": 6.838140372333198e-06, + "loss": 4.1765, + "step": 37245 + }, + { + "epoch": 2.5309145264302213, + "grad_norm": 0.4783514440059662, + "learning_rate": 6.837715722244871e-06, + "loss": 4.1673, + "step": 37250 + }, + { + "epoch": 2.5312542465008834, + "grad_norm": 0.3384709358215332, + "learning_rate": 6.837291072156543e-06, + "loss": 4.2747, + "step": 37255 + }, + { + "epoch": 2.531593966571545, + "grad_norm": 0.36078065633773804, + "learning_rate": 6.836866422068217e-06, + "loss": 4.1453, + "step": 37260 + }, + { + "epoch": 2.5319336866422066, + "grad_norm": 0.4193362891674042, + "learning_rate": 6.8364417719798895e-06, + "loss": 4.0079, + "step": 37265 + }, + { + "epoch": 2.5322734067128687, + "grad_norm": 0.3206990957260132, + "learning_rate": 6.8360171218915615e-06, + "loss": 4.0008, + "step": 37270 + }, + { + "epoch": 2.5326131267835303, + "grad_norm": 0.30331525206565857, + "learning_rate": 6.835592471803235e-06, + "loss": 4.1227, + "step": 37275 + }, + { + "epoch": 2.532952846854192, + "grad_norm": 0.43999865651130676, + "learning_rate": 6.835167821714908e-06, + "loss": 4.0284, + "step": 37280 + }, + { + "epoch": 2.533292566924854, + "grad_norm": 0.40794962644577026, + "learning_rate": 6.83474317162658e-06, + "loss": 4.2088, + "step": 37285 + }, + { + "epoch": 2.5336322869955157, + "grad_norm": 0.32012373208999634, + "learning_rate": 6.8343185215382535e-06, + "loss": 4.1328, + "step": 37290 + }, + { + "epoch": 2.5339720070661773, + "grad_norm": 0.4264039993286133, + "learning_rate": 6.8338938714499255e-06, + "loss": 4.1086, + "step": 37295 + }, + { + "epoch": 2.5343117271368394, + "grad_norm": 0.6505863666534424, + "learning_rate": 6.833469221361598e-06, + "loss": 4.2265, + "step": 37300 + }, + { + "epoch": 2.534651447207501, + "grad_norm": 0.3429872691631317, + "learning_rate": 6.833044571273272e-06, + "loss": 4.2201, + "step": 37305 + }, + { + "epoch": 2.5349911672781626, + "grad_norm": 0.5625879168510437, + "learning_rate": 6.832619921184944e-06, + "loss": 4.0894, + "step": 37310 + }, + { + "epoch": 2.5353308873488247, + "grad_norm": 0.4105376601219177, + "learning_rate": 6.832195271096617e-06, + "loss": 4.2168, + "step": 37315 + }, + { + "epoch": 2.5356706074194864, + "grad_norm": 0.3278648257255554, + "learning_rate": 6.831855551025956e-06, + "loss": 4.2818, + "step": 37320 + }, + { + "epoch": 2.536010327490148, + "grad_norm": 0.4403693974018097, + "learning_rate": 6.8314309009376276e-06, + "loss": 4.1437, + "step": 37325 + }, + { + "epoch": 2.53635004756081, + "grad_norm": 0.2727735936641693, + "learning_rate": 6.8310062508493e-06, + "loss": 4.234, + "step": 37330 + }, + { + "epoch": 2.5366897676314717, + "grad_norm": 0.3853159248828888, + "learning_rate": 6.830581600760974e-06, + "loss": 4.0994, + "step": 37335 + }, + { + "epoch": 2.5370294877021333, + "grad_norm": 0.3881276845932007, + "learning_rate": 6.830156950672646e-06, + "loss": 4.1504, + "step": 37340 + }, + { + "epoch": 2.5373692077727954, + "grad_norm": 0.36169537901878357, + "learning_rate": 6.829732300584319e-06, + "loss": 4.4925, + "step": 37345 + }, + { + "epoch": 2.537708927843457, + "grad_norm": 0.4202854633331299, + "learning_rate": 6.829307650495992e-06, + "loss": 4.0001, + "step": 37350 + }, + { + "epoch": 2.5380486479141187, + "grad_norm": 0.4988122582435608, + "learning_rate": 6.828883000407664e-06, + "loss": 4.2813, + "step": 37355 + }, + { + "epoch": 2.5383883679847807, + "grad_norm": 0.32741454243659973, + "learning_rate": 6.828458350319337e-06, + "loss": 4.1379, + "step": 37360 + }, + { + "epoch": 2.5387280880554424, + "grad_norm": 0.46169647574424744, + "learning_rate": 6.828033700231011e-06, + "loss": 3.9966, + "step": 37365 + }, + { + "epoch": 2.539067808126104, + "grad_norm": 1.0299102067947388, + "learning_rate": 6.827609050142683e-06, + "loss": 4.2887, + "step": 37370 + }, + { + "epoch": 2.539407528196766, + "grad_norm": 0.32287317514419556, + "learning_rate": 6.827184400054356e-06, + "loss": 4.1988, + "step": 37375 + }, + { + "epoch": 2.5397472482674277, + "grad_norm": 0.3120463788509369, + "learning_rate": 6.826759749966028e-06, + "loss": 4.1742, + "step": 37380 + }, + { + "epoch": 2.5400869683380893, + "grad_norm": 0.28978055715560913, + "learning_rate": 6.826335099877701e-06, + "loss": 4.1518, + "step": 37385 + }, + { + "epoch": 2.5404266884087514, + "grad_norm": 0.3583746552467346, + "learning_rate": 6.825910449789374e-06, + "loss": 4.1855, + "step": 37390 + }, + { + "epoch": 2.540766408479413, + "grad_norm": 0.28926098346710205, + "learning_rate": 6.825485799701047e-06, + "loss": 4.1276, + "step": 37395 + }, + { + "epoch": 2.5411061285500747, + "grad_norm": 0.3415997624397278, + "learning_rate": 6.82506114961272e-06, + "loss": 4.4592, + "step": 37400 + }, + { + "epoch": 2.5414458486207367, + "grad_norm": 0.27502113580703735, + "learning_rate": 6.824636499524393e-06, + "loss": 3.9932, + "step": 37405 + }, + { + "epoch": 2.5417855686913984, + "grad_norm": 0.4109880328178406, + "learning_rate": 6.824211849436065e-06, + "loss": 4.2292, + "step": 37410 + }, + { + "epoch": 2.54212528876206, + "grad_norm": 0.6910778284072876, + "learning_rate": 6.823787199347738e-06, + "loss": 4.2813, + "step": 37415 + }, + { + "epoch": 2.542465008832722, + "grad_norm": 0.4207935929298401, + "learning_rate": 6.823362549259412e-06, + "loss": 4.5163, + "step": 37420 + }, + { + "epoch": 2.5428047289033837, + "grad_norm": 0.32702237367630005, + "learning_rate": 6.822937899171084e-06, + "loss": 4.1993, + "step": 37425 + }, + { + "epoch": 2.5431444489740453, + "grad_norm": 0.2534087598323822, + "learning_rate": 6.822513249082756e-06, + "loss": 4.1715, + "step": 37430 + }, + { + "epoch": 2.5434841690447074, + "grad_norm": 0.37478166818618774, + "learning_rate": 6.82208859899443e-06, + "loss": 4.3344, + "step": 37435 + }, + { + "epoch": 2.543823889115369, + "grad_norm": 0.41540735960006714, + "learning_rate": 6.821663948906102e-06, + "loss": 4.1201, + "step": 37440 + }, + { + "epoch": 2.5441636091860307, + "grad_norm": 0.4385698437690735, + "learning_rate": 6.821239298817775e-06, + "loss": 4.1093, + "step": 37445 + }, + { + "epoch": 2.5445033292566928, + "grad_norm": 0.30061084032058716, + "learning_rate": 6.820814648729448e-06, + "loss": 4.068, + "step": 37450 + }, + { + "epoch": 2.5448430493273544, + "grad_norm": 0.5555351376533508, + "learning_rate": 6.82038999864112e-06, + "loss": 4.2604, + "step": 37455 + }, + { + "epoch": 2.545182769398016, + "grad_norm": 0.3514902591705322, + "learning_rate": 6.819965348552793e-06, + "loss": 4.3622, + "step": 37460 + }, + { + "epoch": 2.5455224894686777, + "grad_norm": 0.3303162753582001, + "learning_rate": 6.819540698464466e-06, + "loss": 4.0135, + "step": 37465 + }, + { + "epoch": 2.5458622095393397, + "grad_norm": 0.4428766369819641, + "learning_rate": 6.819116048376139e-06, + "loss": 4.1143, + "step": 37470 + }, + { + "epoch": 2.5462019296100014, + "grad_norm": 0.4063127934932709, + "learning_rate": 6.818691398287811e-06, + "loss": 4.1712, + "step": 37475 + }, + { + "epoch": 2.546541649680663, + "grad_norm": 0.3315499722957611, + "learning_rate": 6.818266748199484e-06, + "loss": 4.1713, + "step": 37480 + }, + { + "epoch": 2.546881369751325, + "grad_norm": 0.3159313499927521, + "learning_rate": 6.817842098111157e-06, + "loss": 4.1777, + "step": 37485 + }, + { + "epoch": 2.5472210898219867, + "grad_norm": 0.6620209813117981, + "learning_rate": 6.817417448022829e-06, + "loss": 4.3814, + "step": 37490 + }, + { + "epoch": 2.5475608098926483, + "grad_norm": 0.3396749794483185, + "learning_rate": 6.816992797934503e-06, + "loss": 4.2236, + "step": 37495 + }, + { + "epoch": 2.54790052996331, + "grad_norm": 0.3682197034358978, + "learning_rate": 6.816568147846176e-06, + "loss": 4.1609, + "step": 37500 + }, + { + "epoch": 2.548240250033972, + "grad_norm": 0.31669288873672485, + "learning_rate": 6.8161434977578476e-06, + "loss": 4.1325, + "step": 37505 + }, + { + "epoch": 2.5485799701046337, + "grad_norm": 0.3932414948940277, + "learning_rate": 6.815718847669521e-06, + "loss": 4.2232, + "step": 37510 + }, + { + "epoch": 2.5489196901752953, + "grad_norm": 0.3921944499015808, + "learning_rate": 6.815294197581194e-06, + "loss": 4.116, + "step": 37515 + }, + { + "epoch": 2.5492594102459574, + "grad_norm": 0.3309331238269806, + "learning_rate": 6.814869547492866e-06, + "loss": 3.922, + "step": 37520 + }, + { + "epoch": 2.549599130316619, + "grad_norm": 0.5038282871246338, + "learning_rate": 6.81444489740454e-06, + "loss": 4.2806, + "step": 37525 + }, + { + "epoch": 2.5499388503872806, + "grad_norm": 0.3009478747844696, + "learning_rate": 6.814020247316212e-06, + "loss": 4.0401, + "step": 37530 + }, + { + "epoch": 2.5502785704579427, + "grad_norm": 0.298850417137146, + "learning_rate": 6.813595597227884e-06, + "loss": 4.3175, + "step": 37535 + }, + { + "epoch": 2.5506182905286043, + "grad_norm": 0.3018946349620819, + "learning_rate": 6.813170947139558e-06, + "loss": 3.9744, + "step": 37540 + }, + { + "epoch": 2.550958010599266, + "grad_norm": 0.4951340854167938, + "learning_rate": 6.81274629705123e-06, + "loss": 4.2052, + "step": 37545 + }, + { + "epoch": 2.551297730669928, + "grad_norm": 0.559816300868988, + "learning_rate": 6.812321646962903e-06, + "loss": 4.1482, + "step": 37550 + }, + { + "epoch": 2.5516374507405897, + "grad_norm": 0.36621132493019104, + "learning_rate": 6.811896996874576e-06, + "loss": 3.8134, + "step": 37555 + }, + { + "epoch": 2.5519771708112513, + "grad_norm": 0.2474052906036377, + "learning_rate": 6.811472346786248e-06, + "loss": 3.8122, + "step": 37560 + }, + { + "epoch": 2.5523168908819134, + "grad_norm": 0.6689944863319397, + "learning_rate": 6.811047696697921e-06, + "loss": 4.0488, + "step": 37565 + }, + { + "epoch": 2.552656610952575, + "grad_norm": 0.4370094835758209, + "learning_rate": 6.810623046609595e-06, + "loss": 4.2444, + "step": 37570 + }, + { + "epoch": 2.5529963310232366, + "grad_norm": 0.3385678231716156, + "learning_rate": 6.810198396521267e-06, + "loss": 3.9226, + "step": 37575 + }, + { + "epoch": 2.5533360510938987, + "grad_norm": 0.3725952208042145, + "learning_rate": 6.8097737464329396e-06, + "loss": 4.3335, + "step": 37580 + }, + { + "epoch": 2.5536757711645603, + "grad_norm": 0.4271313548088074, + "learning_rate": 6.809349096344613e-06, + "loss": 4.0231, + "step": 37585 + }, + { + "epoch": 2.554015491235222, + "grad_norm": 0.34776946902275085, + "learning_rate": 6.808924446256285e-06, + "loss": 4.1256, + "step": 37590 + }, + { + "epoch": 2.554355211305884, + "grad_norm": 0.4323490560054779, + "learning_rate": 6.808499796167958e-06, + "loss": 4.1341, + "step": 37595 + }, + { + "epoch": 2.5546949313765457, + "grad_norm": 0.2933393120765686, + "learning_rate": 6.808075146079632e-06, + "loss": 3.9303, + "step": 37600 + }, + { + "epoch": 2.5550346514472073, + "grad_norm": 0.2554745078086853, + "learning_rate": 6.8076504959913036e-06, + "loss": 4.1077, + "step": 37605 + }, + { + "epoch": 2.5553743715178694, + "grad_norm": 0.41753003001213074, + "learning_rate": 6.807225845902976e-06, + "loss": 4.214, + "step": 37610 + }, + { + "epoch": 2.555714091588531, + "grad_norm": 0.4462239444255829, + "learning_rate": 6.80680119581465e-06, + "loss": 3.7771, + "step": 37615 + }, + { + "epoch": 2.5560538116591927, + "grad_norm": 0.31463098526000977, + "learning_rate": 6.806376545726322e-06, + "loss": 4.1501, + "step": 37620 + }, + { + "epoch": 2.5563935317298547, + "grad_norm": 0.5680791139602661, + "learning_rate": 6.805951895637995e-06, + "loss": 4.1884, + "step": 37625 + }, + { + "epoch": 2.5567332518005164, + "grad_norm": 0.3342342972755432, + "learning_rate": 6.8055272455496676e-06, + "loss": 4.0489, + "step": 37630 + }, + { + "epoch": 2.557072971871178, + "grad_norm": 0.3202345073223114, + "learning_rate": 6.80510259546134e-06, + "loss": 4.3044, + "step": 37635 + }, + { + "epoch": 2.55741269194184, + "grad_norm": 0.3891322910785675, + "learning_rate": 6.804677945373012e-06, + "loss": 4.0977, + "step": 37640 + }, + { + "epoch": 2.5577524120125017, + "grad_norm": 0.36560705304145813, + "learning_rate": 6.804253295284686e-06, + "loss": 4.2803, + "step": 37645 + }, + { + "epoch": 2.5580921320831633, + "grad_norm": 0.3537317216396332, + "learning_rate": 6.803828645196359e-06, + "loss": 4.1671, + "step": 37650 + }, + { + "epoch": 2.5584318521538254, + "grad_norm": 0.3117149770259857, + "learning_rate": 6.803403995108031e-06, + "loss": 4.1841, + "step": 37655 + }, + { + "epoch": 2.558771572224487, + "grad_norm": 0.31655237078666687, + "learning_rate": 6.802979345019704e-06, + "loss": 4.1653, + "step": 37660 + }, + { + "epoch": 2.5591112922951487, + "grad_norm": 0.33758240938186646, + "learning_rate": 6.802554694931377e-06, + "loss": 4.0043, + "step": 37665 + }, + { + "epoch": 2.5594510123658107, + "grad_norm": 0.5309095978736877, + "learning_rate": 6.802130044843049e-06, + "loss": 4.1633, + "step": 37670 + }, + { + "epoch": 2.5597907324364724, + "grad_norm": 0.32251784205436707, + "learning_rate": 6.801705394754723e-06, + "loss": 4.1796, + "step": 37675 + }, + { + "epoch": 2.560130452507134, + "grad_norm": 0.4265591502189636, + "learning_rate": 6.801280744666396e-06, + "loss": 4.185, + "step": 37680 + }, + { + "epoch": 2.560470172577796, + "grad_norm": 0.49463605880737305, + "learning_rate": 6.8008560945780675e-06, + "loss": 4.1519, + "step": 37685 + }, + { + "epoch": 2.5608098926484577, + "grad_norm": 0.4732853174209595, + "learning_rate": 6.800431444489741e-06, + "loss": 4.332, + "step": 37690 + }, + { + "epoch": 2.5611496127191193, + "grad_norm": 0.4222233295440674, + "learning_rate": 6.800006794401414e-06, + "loss": 4.1931, + "step": 37695 + }, + { + "epoch": 2.5614893327897814, + "grad_norm": 0.3005584180355072, + "learning_rate": 6.799582144313086e-06, + "loss": 4.093, + "step": 37700 + }, + { + "epoch": 2.561829052860443, + "grad_norm": 0.36606067419052124, + "learning_rate": 6.79915749422476e-06, + "loss": 4.1328, + "step": 37705 + }, + { + "epoch": 2.5621687729311047, + "grad_norm": 0.3546493351459503, + "learning_rate": 6.798732844136432e-06, + "loss": 4.2012, + "step": 37710 + }, + { + "epoch": 2.5625084930017668, + "grad_norm": 0.30156344175338745, + "learning_rate": 6.798308194048104e-06, + "loss": 4.1142, + "step": 37715 + }, + { + "epoch": 2.5628482130724284, + "grad_norm": 0.4071204364299774, + "learning_rate": 6.797883543959778e-06, + "loss": 4.1342, + "step": 37720 + }, + { + "epoch": 2.56318793314309, + "grad_norm": 0.3975578844547272, + "learning_rate": 6.79745889387145e-06, + "loss": 4.0151, + "step": 37725 + }, + { + "epoch": 2.563527653213752, + "grad_norm": 0.2948942184448242, + "learning_rate": 6.797034243783123e-06, + "loss": 4.2601, + "step": 37730 + }, + { + "epoch": 2.5638673732844137, + "grad_norm": 0.340171217918396, + "learning_rate": 6.796609593694796e-06, + "loss": 4.1341, + "step": 37735 + }, + { + "epoch": 2.5642070933550754, + "grad_norm": 0.27321919798851013, + "learning_rate": 6.796184943606468e-06, + "loss": 4.199, + "step": 37740 + }, + { + "epoch": 2.5645468134257374, + "grad_norm": 0.2698524594306946, + "learning_rate": 6.795760293518142e-06, + "loss": 4.3997, + "step": 37745 + }, + { + "epoch": 2.564886533496399, + "grad_norm": 0.3421938121318817, + "learning_rate": 6.795335643429815e-06, + "loss": 4.0153, + "step": 37750 + }, + { + "epoch": 2.5652262535670607, + "grad_norm": 0.3406151831150055, + "learning_rate": 6.794910993341487e-06, + "loss": 4.1141, + "step": 37755 + }, + { + "epoch": 2.5655659736377228, + "grad_norm": 0.5083059072494507, + "learning_rate": 6.79448634325316e-06, + "loss": 4.1577, + "step": 37760 + }, + { + "epoch": 2.5659056937083844, + "grad_norm": 0.284342885017395, + "learning_rate": 6.794061693164833e-06, + "loss": 4.0484, + "step": 37765 + }, + { + "epoch": 2.566245413779046, + "grad_norm": 0.4189382493495941, + "learning_rate": 6.793637043076505e-06, + "loss": 4.0656, + "step": 37770 + }, + { + "epoch": 2.566585133849708, + "grad_norm": 0.3725147247314453, + "learning_rate": 6.793212392988179e-06, + "loss": 4.4473, + "step": 37775 + }, + { + "epoch": 2.5669248539203697, + "grad_norm": 0.24698889255523682, + "learning_rate": 6.792787742899852e-06, + "loss": 4.1205, + "step": 37780 + }, + { + "epoch": 2.5672645739910314, + "grad_norm": 0.3689253330230713, + "learning_rate": 6.7923630928115235e-06, + "loss": 4.1401, + "step": 37785 + }, + { + "epoch": 2.5676042940616934, + "grad_norm": 0.2919285297393799, + "learning_rate": 6.791938442723197e-06, + "loss": 4.4269, + "step": 37790 + }, + { + "epoch": 2.567944014132355, + "grad_norm": 0.303272008895874, + "learning_rate": 6.791513792634869e-06, + "loss": 4.0974, + "step": 37795 + }, + { + "epoch": 2.5682837342030167, + "grad_norm": 0.40735942125320435, + "learning_rate": 6.791089142546542e-06, + "loss": 4.2322, + "step": 37800 + }, + { + "epoch": 2.5686234542736783, + "grad_norm": 0.3152822256088257, + "learning_rate": 6.790664492458216e-06, + "loss": 4.2035, + "step": 37805 + }, + { + "epoch": 2.5689631743443404, + "grad_norm": 0.2747667729854584, + "learning_rate": 6.7902398423698876e-06, + "loss": 4.2474, + "step": 37810 + }, + { + "epoch": 2.569302894415002, + "grad_norm": 0.25096356868743896, + "learning_rate": 6.78981519228156e-06, + "loss": 4.0099, + "step": 37815 + }, + { + "epoch": 2.5696426144856637, + "grad_norm": 0.44224825501441956, + "learning_rate": 6.789390542193234e-06, + "loss": 4.2757, + "step": 37820 + }, + { + "epoch": 2.5699823345563257, + "grad_norm": 0.31776323914527893, + "learning_rate": 6.788965892104906e-06, + "loss": 4.2364, + "step": 37825 + }, + { + "epoch": 2.5703220546269874, + "grad_norm": 0.2880792021751404, + "learning_rate": 6.788541242016579e-06, + "loss": 3.9654, + "step": 37830 + }, + { + "epoch": 2.570661774697649, + "grad_norm": 0.3319544792175293, + "learning_rate": 6.788116591928252e-06, + "loss": 3.9455, + "step": 37835 + }, + { + "epoch": 2.5710014947683106, + "grad_norm": 0.3132636845111847, + "learning_rate": 6.787691941839924e-06, + "loss": 4.2179, + "step": 37840 + }, + { + "epoch": 2.5713412148389727, + "grad_norm": 0.34838807582855225, + "learning_rate": 6.787267291751597e-06, + "loss": 4.3299, + "step": 37845 + }, + { + "epoch": 2.5716809349096343, + "grad_norm": 0.535422146320343, + "learning_rate": 6.786842641663271e-06, + "loss": 4.4445, + "step": 37850 + }, + { + "epoch": 2.572020654980296, + "grad_norm": 0.27361029386520386, + "learning_rate": 6.786417991574943e-06, + "loss": 4.4096, + "step": 37855 + }, + { + "epoch": 2.572360375050958, + "grad_norm": 0.3659472167491913, + "learning_rate": 6.7859933414866156e-06, + "loss": 4.2279, + "step": 37860 + }, + { + "epoch": 2.5727000951216197, + "grad_norm": 0.3309955298900604, + "learning_rate": 6.785568691398288e-06, + "loss": 4.3118, + "step": 37865 + }, + { + "epoch": 2.5730398151922813, + "grad_norm": 0.4107820987701416, + "learning_rate": 6.785144041309961e-06, + "loss": 4.0961, + "step": 37870 + }, + { + "epoch": 2.5733795352629434, + "grad_norm": 0.41429996490478516, + "learning_rate": 6.784719391221634e-06, + "loss": 4.4424, + "step": 37875 + }, + { + "epoch": 2.573719255333605, + "grad_norm": 0.40793707966804504, + "learning_rate": 6.784294741133307e-06, + "loss": 4.0476, + "step": 37880 + }, + { + "epoch": 2.5740589754042666, + "grad_norm": 0.46910208463668823, + "learning_rate": 6.7838700910449796e-06, + "loss": 4.1267, + "step": 37885 + }, + { + "epoch": 2.5743986954749287, + "grad_norm": 0.2645167410373688, + "learning_rate": 6.7834454409566515e-06, + "loss": 4.2361, + "step": 37890 + }, + { + "epoch": 2.5747384155455904, + "grad_norm": 0.38534238934516907, + "learning_rate": 6.783020790868325e-06, + "loss": 4.2249, + "step": 37895 + }, + { + "epoch": 2.575078135616252, + "grad_norm": 0.3999253213405609, + "learning_rate": 6.782596140779998e-06, + "loss": 4.3539, + "step": 37900 + }, + { + "epoch": 2.575417855686914, + "grad_norm": 0.4490685760974884, + "learning_rate": 6.78217149069167e-06, + "loss": 4.3597, + "step": 37905 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.4979700744152069, + "learning_rate": 6.7817468406033436e-06, + "loss": 4.1659, + "step": 37910 + }, + { + "epoch": 2.5760972958282373, + "grad_norm": 0.38500159978866577, + "learning_rate": 6.781322190515016e-06, + "loss": 4.1321, + "step": 37915 + }, + { + "epoch": 2.5764370158988994, + "grad_norm": 0.27842825651168823, + "learning_rate": 6.780897540426688e-06, + "loss": 3.922, + "step": 37920 + }, + { + "epoch": 2.576776735969561, + "grad_norm": 0.3450664281845093, + "learning_rate": 6.780472890338362e-06, + "loss": 4.2453, + "step": 37925 + }, + { + "epoch": 2.5771164560402227, + "grad_norm": 0.48085683584213257, + "learning_rate": 6.780048240250035e-06, + "loss": 4.2127, + "step": 37930 + }, + { + "epoch": 2.5774561761108847, + "grad_norm": 0.3480854630470276, + "learning_rate": 6.779623590161707e-06, + "loss": 3.9708, + "step": 37935 + }, + { + "epoch": 2.5777958961815464, + "grad_norm": 0.2864130735397339, + "learning_rate": 6.77919894007338e-06, + "loss": 3.8722, + "step": 37940 + }, + { + "epoch": 2.578135616252208, + "grad_norm": 0.32901009917259216, + "learning_rate": 6.778774289985053e-06, + "loss": 4.2271, + "step": 37945 + }, + { + "epoch": 2.57847533632287, + "grad_norm": 0.3517287075519562, + "learning_rate": 6.778349639896725e-06, + "loss": 4.1144, + "step": 37950 + }, + { + "epoch": 2.5788150563935317, + "grad_norm": 0.31421682238578796, + "learning_rate": 6.777924989808399e-06, + "loss": 4.153, + "step": 37955 + }, + { + "epoch": 2.5791547764641933, + "grad_norm": 0.3754502236843109, + "learning_rate": 6.7775003397200716e-06, + "loss": 4.0458, + "step": 37960 + }, + { + "epoch": 2.5794944965348554, + "grad_norm": 0.2918601930141449, + "learning_rate": 6.7770756896317435e-06, + "loss": 4.2332, + "step": 37965 + }, + { + "epoch": 2.579834216605517, + "grad_norm": 0.6740968227386475, + "learning_rate": 6.776651039543417e-06, + "loss": 4.1661, + "step": 37970 + }, + { + "epoch": 2.5801739366761787, + "grad_norm": 0.38108953833580017, + "learning_rate": 6.776226389455089e-06, + "loss": 4.1429, + "step": 37975 + }, + { + "epoch": 2.5805136567468407, + "grad_norm": 0.33148297667503357, + "learning_rate": 6.775801739366762e-06, + "loss": 4.0965, + "step": 37980 + }, + { + "epoch": 2.5808533768175024, + "grad_norm": 0.3363499641418457, + "learning_rate": 6.775377089278436e-06, + "loss": 3.997, + "step": 37985 + }, + { + "epoch": 2.581193096888164, + "grad_norm": 0.5406308770179749, + "learning_rate": 6.7749524391901075e-06, + "loss": 4.2817, + "step": 37990 + }, + { + "epoch": 2.581532816958826, + "grad_norm": 0.2931038737297058, + "learning_rate": 6.77452778910178e-06, + "loss": 4.4234, + "step": 37995 + }, + { + "epoch": 2.5818725370294877, + "grad_norm": 0.4439951181411743, + "learning_rate": 6.774103139013454e-06, + "loss": 4.0569, + "step": 38000 + }, + { + "epoch": 2.5822122571001493, + "grad_norm": 0.3937043845653534, + "learning_rate": 6.773678488925126e-06, + "loss": 3.9593, + "step": 38005 + }, + { + "epoch": 2.5825519771708114, + "grad_norm": 0.2884989380836487, + "learning_rate": 6.773253838836799e-06, + "loss": 4.2771, + "step": 38010 + }, + { + "epoch": 2.582891697241473, + "grad_norm": 0.49802273511886597, + "learning_rate": 6.772829188748472e-06, + "loss": 4.2509, + "step": 38015 + }, + { + "epoch": 2.5832314173121347, + "grad_norm": 0.27343541383743286, + "learning_rate": 6.772404538660144e-06, + "loss": 4.312, + "step": 38020 + }, + { + "epoch": 2.5835711373827968, + "grad_norm": 0.25631392002105713, + "learning_rate": 6.771979888571817e-06, + "loss": 4.0634, + "step": 38025 + }, + { + "epoch": 2.5839108574534584, + "grad_norm": 0.3234986364841461, + "learning_rate": 6.771555238483491e-06, + "loss": 4.0759, + "step": 38030 + }, + { + "epoch": 2.58425057752412, + "grad_norm": 0.4364822506904602, + "learning_rate": 6.771130588395163e-06, + "loss": 4.0243, + "step": 38035 + }, + { + "epoch": 2.584590297594782, + "grad_norm": 0.35175684094429016, + "learning_rate": 6.7707059383068355e-06, + "loss": 4.3028, + "step": 38040 + }, + { + "epoch": 2.5849300176654437, + "grad_norm": 0.3207109272480011, + "learning_rate": 6.770281288218508e-06, + "loss": 4.1898, + "step": 38045 + }, + { + "epoch": 2.5852697377361054, + "grad_norm": 0.34515392780303955, + "learning_rate": 6.769856638130181e-06, + "loss": 3.9445, + "step": 38050 + }, + { + "epoch": 2.5856094578067674, + "grad_norm": 0.38879168033599854, + "learning_rate": 6.769431988041854e-06, + "loss": 4.2535, + "step": 38055 + }, + { + "epoch": 2.585949177877429, + "grad_norm": 0.41972893476486206, + "learning_rate": 6.769007337953527e-06, + "loss": 4.225, + "step": 38060 + }, + { + "epoch": 2.5862888979480907, + "grad_norm": 0.34500765800476074, + "learning_rate": 6.7685826878651995e-06, + "loss": 4.344, + "step": 38065 + }, + { + "epoch": 2.5866286180187528, + "grad_norm": 0.22636574506759644, + "learning_rate": 6.7681580377768715e-06, + "loss": 4.0631, + "step": 38070 + }, + { + "epoch": 2.5869683380894144, + "grad_norm": 0.344661146402359, + "learning_rate": 6.767733387688545e-06, + "loss": 4.215, + "step": 38075 + }, + { + "epoch": 2.587308058160076, + "grad_norm": 0.35417041182518005, + "learning_rate": 6.767308737600218e-06, + "loss": 4.0757, + "step": 38080 + }, + { + "epoch": 2.587647778230738, + "grad_norm": 0.5130562782287598, + "learning_rate": 6.766884087511892e-06, + "loss": 4.1507, + "step": 38085 + }, + { + "epoch": 2.5879874983013997, + "grad_norm": 0.3247440755367279, + "learning_rate": 6.7664594374235635e-06, + "loss": 4.1936, + "step": 38090 + }, + { + "epoch": 2.5883272183720614, + "grad_norm": 0.39389851689338684, + "learning_rate": 6.766034787335236e-06, + "loss": 4.2756, + "step": 38095 + }, + { + "epoch": 2.5886669384427234, + "grad_norm": 0.21554917097091675, + "learning_rate": 6.76561013724691e-06, + "loss": 4.093, + "step": 38100 + }, + { + "epoch": 2.589006658513385, + "grad_norm": 0.36542919278144836, + "learning_rate": 6.765185487158582e-06, + "loss": 4.4987, + "step": 38105 + }, + { + "epoch": 2.5893463785840467, + "grad_norm": 0.43230271339416504, + "learning_rate": 6.764760837070255e-06, + "loss": 4.4071, + "step": 38110 + }, + { + "epoch": 2.589686098654709, + "grad_norm": 0.5451820492744446, + "learning_rate": 6.7643361869819276e-06, + "loss": 4.1308, + "step": 38115 + }, + { + "epoch": 2.5900258187253704, + "grad_norm": 0.7166210412979126, + "learning_rate": 6.7639115368936e-06, + "loss": 4.1169, + "step": 38120 + }, + { + "epoch": 2.590365538796032, + "grad_norm": 0.2992180585861206, + "learning_rate": 6.763486886805273e-06, + "loss": 4.2239, + "step": 38125 + }, + { + "epoch": 2.590705258866694, + "grad_norm": 0.40230345726013184, + "learning_rate": 6.763062236716946e-06, + "loss": 4.0403, + "step": 38130 + }, + { + "epoch": 2.5910449789373557, + "grad_norm": 0.3417702615261078, + "learning_rate": 6.762637586628619e-06, + "loss": 4.2418, + "step": 38135 + }, + { + "epoch": 2.5913846990080174, + "grad_norm": 0.38939228653907776, + "learning_rate": 6.762212936540291e-06, + "loss": 4.0736, + "step": 38140 + }, + { + "epoch": 2.591724419078679, + "grad_norm": 0.36426207423210144, + "learning_rate": 6.761788286451964e-06, + "loss": 4.316, + "step": 38145 + }, + { + "epoch": 2.592064139149341, + "grad_norm": 0.44256144762039185, + "learning_rate": 6.761363636363637e-06, + "loss": 4.1032, + "step": 38150 + }, + { + "epoch": 2.5924038592200027, + "grad_norm": 0.2772302031517029, + "learning_rate": 6.760938986275309e-06, + "loss": 4.0743, + "step": 38155 + }, + { + "epoch": 2.5927435792906643, + "grad_norm": 0.3509073555469513, + "learning_rate": 6.760514336186983e-06, + "loss": 4.1609, + "step": 38160 + }, + { + "epoch": 2.5930832993613264, + "grad_norm": 1.015260934829712, + "learning_rate": 6.7600896860986556e-06, + "loss": 4.0069, + "step": 38165 + }, + { + "epoch": 2.593423019431988, + "grad_norm": 0.4968920648097992, + "learning_rate": 6.7596650360103275e-06, + "loss": 4.2479, + "step": 38170 + }, + { + "epoch": 2.5937627395026497, + "grad_norm": 0.2795296013355255, + "learning_rate": 6.759240385922001e-06, + "loss": 4.1356, + "step": 38175 + }, + { + "epoch": 2.5941024595733113, + "grad_norm": 0.29582375288009644, + "learning_rate": 6.758815735833674e-06, + "loss": 4.1064, + "step": 38180 + }, + { + "epoch": 2.5944421796439734, + "grad_norm": 0.3246244490146637, + "learning_rate": 6.758391085745346e-06, + "loss": 4.0867, + "step": 38185 + }, + { + "epoch": 2.594781899714635, + "grad_norm": 0.4927613437175751, + "learning_rate": 6.7579664356570196e-06, + "loss": 3.9897, + "step": 38190 + }, + { + "epoch": 2.5951216197852967, + "grad_norm": 0.3474627733230591, + "learning_rate": 6.757541785568692e-06, + "loss": 4.4145, + "step": 38195 + }, + { + "epoch": 2.5954613398559587, + "grad_norm": 0.3140102028846741, + "learning_rate": 6.757117135480364e-06, + "loss": 4.1217, + "step": 38200 + }, + { + "epoch": 2.5958010599266204, + "grad_norm": 0.33956700563430786, + "learning_rate": 6.756692485392038e-06, + "loss": 4.1547, + "step": 38205 + }, + { + "epoch": 2.596140779997282, + "grad_norm": 0.26971590518951416, + "learning_rate": 6.75626783530371e-06, + "loss": 4.2215, + "step": 38210 + }, + { + "epoch": 2.596480500067944, + "grad_norm": 0.4348074197769165, + "learning_rate": 6.755843185215383e-06, + "loss": 4.1674, + "step": 38215 + }, + { + "epoch": 2.5968202201386057, + "grad_norm": 0.3795050382614136, + "learning_rate": 6.755418535127056e-06, + "loss": 4.1128, + "step": 38220 + }, + { + "epoch": 2.5971599402092673, + "grad_norm": 0.5211541056632996, + "learning_rate": 6.754993885038728e-06, + "loss": 4.1527, + "step": 38225 + }, + { + "epoch": 2.5974996602799294, + "grad_norm": 0.29646241664886475, + "learning_rate": 6.754569234950401e-06, + "loss": 4.2029, + "step": 38230 + }, + { + "epoch": 2.597839380350591, + "grad_norm": 0.37814775109291077, + "learning_rate": 6.754144584862075e-06, + "loss": 4.2408, + "step": 38235 + }, + { + "epoch": 2.5981791004212527, + "grad_norm": 0.3728523254394531, + "learning_rate": 6.753719934773747e-06, + "loss": 3.9628, + "step": 38240 + }, + { + "epoch": 2.5985188204919147, + "grad_norm": 0.3472331464290619, + "learning_rate": 6.7532952846854195e-06, + "loss": 4.28, + "step": 38245 + }, + { + "epoch": 2.5988585405625764, + "grad_norm": 0.3734162449836731, + "learning_rate": 6.752870634597093e-06, + "loss": 4.3801, + "step": 38250 + }, + { + "epoch": 2.599198260633238, + "grad_norm": 0.3382417857646942, + "learning_rate": 6.752445984508765e-06, + "loss": 4.1777, + "step": 38255 + }, + { + "epoch": 2.5995379807039, + "grad_norm": 0.8395103216171265, + "learning_rate": 6.752021334420438e-06, + "loss": 4.2592, + "step": 38260 + }, + { + "epoch": 2.5998777007745617, + "grad_norm": 0.35761064291000366, + "learning_rate": 6.7515966843321116e-06, + "loss": 3.9591, + "step": 38265 + }, + { + "epoch": 2.6002174208452233, + "grad_norm": 0.3734990060329437, + "learning_rate": 6.7511720342437835e-06, + "loss": 4.3455, + "step": 38270 + }, + { + "epoch": 2.6005571409158854, + "grad_norm": 0.300179660320282, + "learning_rate": 6.750747384155456e-06, + "loss": 4.053, + "step": 38275 + }, + { + "epoch": 2.600896860986547, + "grad_norm": 0.33842650055885315, + "learning_rate": 6.75032273406713e-06, + "loss": 4.0418, + "step": 38280 + }, + { + "epoch": 2.6012365810572087, + "grad_norm": 0.4006973206996918, + "learning_rate": 6.749898083978802e-06, + "loss": 4.342, + "step": 38285 + }, + { + "epoch": 2.6015763011278707, + "grad_norm": 0.36134207248687744, + "learning_rate": 6.749473433890475e-06, + "loss": 4.1354, + "step": 38290 + }, + { + "epoch": 2.6019160211985324, + "grad_norm": 0.33959949016571045, + "learning_rate": 6.7490487838021475e-06, + "loss": 4.185, + "step": 38295 + }, + { + "epoch": 2.602255741269194, + "grad_norm": 0.4892849624156952, + "learning_rate": 6.74862413371382e-06, + "loss": 4.2191, + "step": 38300 + }, + { + "epoch": 2.602595461339856, + "grad_norm": 0.5397498607635498, + "learning_rate": 6.748199483625492e-06, + "loss": 4.0873, + "step": 38305 + }, + { + "epoch": 2.6029351814105177, + "grad_norm": 0.415086030960083, + "learning_rate": 6.747774833537166e-06, + "loss": 4.1037, + "step": 38310 + }, + { + "epoch": 2.6032749014811793, + "grad_norm": 0.7865170836448669, + "learning_rate": 6.747350183448839e-06, + "loss": 4.3196, + "step": 38315 + }, + { + "epoch": 2.6036146215518414, + "grad_norm": 0.3150462210178375, + "learning_rate": 6.746925533360511e-06, + "loss": 4.1304, + "step": 38320 + }, + { + "epoch": 2.603954341622503, + "grad_norm": 0.2930243909358978, + "learning_rate": 6.746500883272184e-06, + "loss": 4.4674, + "step": 38325 + }, + { + "epoch": 2.6042940616931647, + "grad_norm": 0.37386950850486755, + "learning_rate": 6.746076233183857e-06, + "loss": 4.2004, + "step": 38330 + }, + { + "epoch": 2.6046337817638268, + "grad_norm": 0.4516938328742981, + "learning_rate": 6.745651583095529e-06, + "loss": 4.3043, + "step": 38335 + }, + { + "epoch": 2.6049735018344884, + "grad_norm": 0.44299188256263733, + "learning_rate": 6.745226933007203e-06, + "loss": 4.249, + "step": 38340 + }, + { + "epoch": 2.60531322190515, + "grad_norm": 0.29151731729507446, + "learning_rate": 6.7448022829188755e-06, + "loss": 4.0635, + "step": 38345 + }, + { + "epoch": 2.605652941975812, + "grad_norm": 0.4381179213523865, + "learning_rate": 6.7443776328305475e-06, + "loss": 3.9559, + "step": 38350 + }, + { + "epoch": 2.6059926620464737, + "grad_norm": 0.38241350650787354, + "learning_rate": 6.743952982742221e-06, + "loss": 3.9986, + "step": 38355 + }, + { + "epoch": 2.6063323821171354, + "grad_norm": 0.518024742603302, + "learning_rate": 6.743528332653894e-06, + "loss": 4.0453, + "step": 38360 + }, + { + "epoch": 2.6066721021877974, + "grad_norm": 0.33338966965675354, + "learning_rate": 6.743103682565566e-06, + "loss": 3.9025, + "step": 38365 + }, + { + "epoch": 2.607011822258459, + "grad_norm": 0.24900001287460327, + "learning_rate": 6.7426790324772395e-06, + "loss": 4.1205, + "step": 38370 + }, + { + "epoch": 2.6073515423291207, + "grad_norm": 0.49257412552833557, + "learning_rate": 6.742254382388912e-06, + "loss": 4.4167, + "step": 38375 + }, + { + "epoch": 2.6076912623997828, + "grad_norm": 0.2926067113876343, + "learning_rate": 6.741829732300584e-06, + "loss": 4.3312, + "step": 38380 + }, + { + "epoch": 2.6080309824704444, + "grad_norm": 0.39882394671440125, + "learning_rate": 6.741405082212258e-06, + "loss": 4.2288, + "step": 38385 + }, + { + "epoch": 2.608370702541106, + "grad_norm": 0.3636932373046875, + "learning_rate": 6.74098043212393e-06, + "loss": 4.1981, + "step": 38390 + }, + { + "epoch": 2.608710422611768, + "grad_norm": 0.2999427616596222, + "learning_rate": 6.740555782035603e-06, + "loss": 4.1972, + "step": 38395 + }, + { + "epoch": 2.6090501426824297, + "grad_norm": 0.22584567964076996, + "learning_rate": 6.740131131947276e-06, + "loss": 4.1554, + "step": 38400 + }, + { + "epoch": 2.6093898627530914, + "grad_norm": 0.365165650844574, + "learning_rate": 6.739706481858948e-06, + "loss": 3.9864, + "step": 38405 + }, + { + "epoch": 2.6097295828237534, + "grad_norm": 0.358200341463089, + "learning_rate": 6.739281831770621e-06, + "loss": 4.1996, + "step": 38410 + }, + { + "epoch": 2.610069302894415, + "grad_norm": 0.3097511827945709, + "learning_rate": 6.738857181682295e-06, + "loss": 4.2469, + "step": 38415 + }, + { + "epoch": 2.6104090229650767, + "grad_norm": 0.2847420573234558, + "learning_rate": 6.738432531593967e-06, + "loss": 4.1199, + "step": 38420 + }, + { + "epoch": 2.610748743035739, + "grad_norm": 0.33157774806022644, + "learning_rate": 6.73800788150564e-06, + "loss": 3.9651, + "step": 38425 + }, + { + "epoch": 2.6110884631064004, + "grad_norm": 0.5345950722694397, + "learning_rate": 6.737583231417313e-06, + "loss": 4.0117, + "step": 38430 + }, + { + "epoch": 2.611428183177062, + "grad_norm": 0.48201093077659607, + "learning_rate": 6.737158581328985e-06, + "loss": 4.0468, + "step": 38435 + }, + { + "epoch": 2.611767903247724, + "grad_norm": 0.3962993621826172, + "learning_rate": 6.736733931240659e-06, + "loss": 4.2897, + "step": 38440 + }, + { + "epoch": 2.6121076233183858, + "grad_norm": 0.7635703682899475, + "learning_rate": 6.7363092811523316e-06, + "loss": 4.2139, + "step": 38445 + }, + { + "epoch": 2.6124473433890474, + "grad_norm": 0.26518458127975464, + "learning_rate": 6.7358846310640035e-06, + "loss": 4.0357, + "step": 38450 + }, + { + "epoch": 2.6127870634597095, + "grad_norm": 0.3020807206630707, + "learning_rate": 6.735459980975677e-06, + "loss": 4.0185, + "step": 38455 + }, + { + "epoch": 2.613126783530371, + "grad_norm": 0.35880452394485474, + "learning_rate": 6.735035330887349e-06, + "loss": 4.1775, + "step": 38460 + }, + { + "epoch": 2.6134665036010327, + "grad_norm": 0.27939531207084656, + "learning_rate": 6.734610680799022e-06, + "loss": 4.204, + "step": 38465 + }, + { + "epoch": 2.613806223671695, + "grad_norm": 0.6022292375564575, + "learning_rate": 6.7341860307106956e-06, + "loss": 4.0797, + "step": 38470 + }, + { + "epoch": 2.6141459437423564, + "grad_norm": 0.2802383005619049, + "learning_rate": 6.7337613806223675e-06, + "loss": 4.0104, + "step": 38475 + }, + { + "epoch": 2.614485663813018, + "grad_norm": 0.4449191391468048, + "learning_rate": 6.73333673053404e-06, + "loss": 4.2632, + "step": 38480 + }, + { + "epoch": 2.6148253838836797, + "grad_norm": 0.3262802064418793, + "learning_rate": 6.732912080445714e-06, + "loss": 4.0349, + "step": 38485 + }, + { + "epoch": 2.6151651039543418, + "grad_norm": 0.46141695976257324, + "learning_rate": 6.732487430357386e-06, + "loss": 3.8099, + "step": 38490 + }, + { + "epoch": 2.6155048240250034, + "grad_norm": 0.35503438115119934, + "learning_rate": 6.732062780269059e-06, + "loss": 4.1894, + "step": 38495 + }, + { + "epoch": 2.615844544095665, + "grad_norm": 0.4400692284107208, + "learning_rate": 6.731638130180732e-06, + "loss": 4.0493, + "step": 38500 + }, + { + "epoch": 2.616184264166327, + "grad_norm": 0.46316975355148315, + "learning_rate": 6.731213480092404e-06, + "loss": 4.0357, + "step": 38505 + }, + { + "epoch": 2.6165239842369887, + "grad_norm": 0.5304751992225647, + "learning_rate": 6.730788830004077e-06, + "loss": 4.1445, + "step": 38510 + }, + { + "epoch": 2.6168637043076504, + "grad_norm": 0.6302080154418945, + "learning_rate": 6.730364179915751e-06, + "loss": 4.1268, + "step": 38515 + }, + { + "epoch": 2.617203424378312, + "grad_norm": 0.35809099674224854, + "learning_rate": 6.729939529827423e-06, + "loss": 4.133, + "step": 38520 + }, + { + "epoch": 2.617543144448974, + "grad_norm": 0.3490724265575409, + "learning_rate": 6.7295148797390955e-06, + "loss": 4.0289, + "step": 38525 + }, + { + "epoch": 2.6178828645196357, + "grad_norm": 0.40917059779167175, + "learning_rate": 6.729090229650769e-06, + "loss": 4.0828, + "step": 38530 + }, + { + "epoch": 2.6182225845902973, + "grad_norm": 0.2892396152019501, + "learning_rate": 6.728665579562441e-06, + "loss": 3.8101, + "step": 38535 + }, + { + "epoch": 2.6185623046609594, + "grad_norm": 0.5291091203689575, + "learning_rate": 6.728240929474114e-06, + "loss": 4.1112, + "step": 38540 + }, + { + "epoch": 2.618902024731621, + "grad_norm": 0.29931822419166565, + "learning_rate": 6.727816279385787e-06, + "loss": 4.0549, + "step": 38545 + }, + { + "epoch": 2.6192417448022827, + "grad_norm": 0.3024238348007202, + "learning_rate": 6.7273916292974595e-06, + "loss": 4.1154, + "step": 38550 + }, + { + "epoch": 2.6195814648729447, + "grad_norm": 0.317971795797348, + "learning_rate": 6.7269669792091315e-06, + "loss": 4.1991, + "step": 38555 + }, + { + "epoch": 2.6199211849436064, + "grad_norm": 0.3924446702003479, + "learning_rate": 6.726542329120805e-06, + "loss": 4.0431, + "step": 38560 + }, + { + "epoch": 2.620260905014268, + "grad_norm": 0.39212965965270996, + "learning_rate": 6.726117679032478e-06, + "loss": 4.3613, + "step": 38565 + }, + { + "epoch": 2.62060062508493, + "grad_norm": 0.2746898829936981, + "learning_rate": 6.72569302894415e-06, + "loss": 4.3204, + "step": 38570 + }, + { + "epoch": 2.6209403451555917, + "grad_norm": 0.3293813169002533, + "learning_rate": 6.7252683788558235e-06, + "loss": 4.1074, + "step": 38575 + }, + { + "epoch": 2.6212800652262533, + "grad_norm": 0.4049656391143799, + "learning_rate": 6.724843728767496e-06, + "loss": 4.1519, + "step": 38580 + }, + { + "epoch": 2.6216197852969154, + "grad_norm": 0.39926695823669434, + "learning_rate": 6.724419078679168e-06, + "loss": 4.2069, + "step": 38585 + }, + { + "epoch": 2.621959505367577, + "grad_norm": 0.3393886089324951, + "learning_rate": 6.723994428590842e-06, + "loss": 4.1573, + "step": 38590 + }, + { + "epoch": 2.6222992254382387, + "grad_norm": 0.4688952565193176, + "learning_rate": 6.723569778502515e-06, + "loss": 4.2806, + "step": 38595 + }, + { + "epoch": 2.6226389455089008, + "grad_norm": 0.5177536010742188, + "learning_rate": 6.723145128414187e-06, + "loss": 4.1408, + "step": 38600 + }, + { + "epoch": 2.6229786655795624, + "grad_norm": 0.2817637324333191, + "learning_rate": 6.72272047832586e-06, + "loss": 4.2792, + "step": 38605 + }, + { + "epoch": 2.623318385650224, + "grad_norm": 0.46889638900756836, + "learning_rate": 6.722295828237533e-06, + "loss": 4.1056, + "step": 38610 + }, + { + "epoch": 2.623658105720886, + "grad_norm": 0.29860466718673706, + "learning_rate": 6.721871178149205e-06, + "loss": 4.0835, + "step": 38615 + }, + { + "epoch": 2.6239978257915477, + "grad_norm": 0.5351051688194275, + "learning_rate": 6.721446528060879e-06, + "loss": 4.2096, + "step": 38620 + }, + { + "epoch": 2.6243375458622094, + "grad_norm": 0.360824853181839, + "learning_rate": 6.7210218779725515e-06, + "loss": 4.343, + "step": 38625 + }, + { + "epoch": 2.6246772659328714, + "grad_norm": 0.36170488595962524, + "learning_rate": 6.7205972278842235e-06, + "loss": 4.2098, + "step": 38630 + }, + { + "epoch": 2.625016986003533, + "grad_norm": 0.3334933817386627, + "learning_rate": 6.720172577795897e-06, + "loss": 4.4607, + "step": 38635 + }, + { + "epoch": 2.6253567060741947, + "grad_norm": 0.36238986253738403, + "learning_rate": 6.719747927707569e-06, + "loss": 4.2737, + "step": 38640 + }, + { + "epoch": 2.6256964261448568, + "grad_norm": 0.3085700571537018, + "learning_rate": 6.719323277619242e-06, + "loss": 4.0726, + "step": 38645 + }, + { + "epoch": 2.6260361462155184, + "grad_norm": 0.32380396127700806, + "learning_rate": 6.7188986275309155e-06, + "loss": 4.1219, + "step": 38650 + }, + { + "epoch": 2.62637586628618, + "grad_norm": 0.4737776517868042, + "learning_rate": 6.7184739774425875e-06, + "loss": 4.0731, + "step": 38655 + }, + { + "epoch": 2.626715586356842, + "grad_norm": 0.5313569903373718, + "learning_rate": 6.71804932735426e-06, + "loss": 4.1273, + "step": 38660 + }, + { + "epoch": 2.6270553064275037, + "grad_norm": 0.24456320703029633, + "learning_rate": 6.717624677265934e-06, + "loss": 3.9501, + "step": 38665 + }, + { + "epoch": 2.6273950264981654, + "grad_norm": 0.3956949710845947, + "learning_rate": 6.717200027177606e-06, + "loss": 4.217, + "step": 38670 + }, + { + "epoch": 2.6277347465688274, + "grad_norm": 0.41084951162338257, + "learning_rate": 6.716775377089279e-06, + "loss": 4.1383, + "step": 38675 + }, + { + "epoch": 2.628074466639489, + "grad_norm": 0.8330075740814209, + "learning_rate": 6.716350727000952e-06, + "loss": 4.2144, + "step": 38680 + }, + { + "epoch": 2.6284141867101507, + "grad_norm": 0.2605981230735779, + "learning_rate": 6.715926076912624e-06, + "loss": 4.3145, + "step": 38685 + }, + { + "epoch": 2.6287539067808128, + "grad_norm": 0.294364333152771, + "learning_rate": 6.715501426824297e-06, + "loss": 4.2923, + "step": 38690 + }, + { + "epoch": 2.6290936268514744, + "grad_norm": 0.37934234738349915, + "learning_rate": 6.715076776735971e-06, + "loss": 4.1444, + "step": 38695 + }, + { + "epoch": 2.629433346922136, + "grad_norm": 0.4295320212841034, + "learning_rate": 6.714652126647643e-06, + "loss": 4.0115, + "step": 38700 + }, + { + "epoch": 2.629773066992798, + "grad_norm": 0.37135395407676697, + "learning_rate": 6.7142274765593155e-06, + "loss": 4.1164, + "step": 38705 + }, + { + "epoch": 2.6301127870634597, + "grad_norm": 0.4428029954433441, + "learning_rate": 6.713802826470988e-06, + "loss": 4.2337, + "step": 38710 + }, + { + "epoch": 2.6304525071341214, + "grad_norm": 0.4637139141559601, + "learning_rate": 6.713378176382661e-06, + "loss": 4.2509, + "step": 38715 + }, + { + "epoch": 2.6307922272047835, + "grad_norm": 0.4115109443664551, + "learning_rate": 6.712953526294334e-06, + "loss": 4.3423, + "step": 38720 + }, + { + "epoch": 2.631131947275445, + "grad_norm": 0.29154086112976074, + "learning_rate": 6.712528876206007e-06, + "loss": 4.1004, + "step": 38725 + }, + { + "epoch": 2.6314716673461067, + "grad_norm": 0.44212979078292847, + "learning_rate": 6.7121042261176795e-06, + "loss": 4.2186, + "step": 38730 + }, + { + "epoch": 2.631811387416769, + "grad_norm": 0.300873339176178, + "learning_rate": 6.7116795760293515e-06, + "loss": 4.2222, + "step": 38735 + }, + { + "epoch": 2.6321511074874304, + "grad_norm": 0.33419981598854065, + "learning_rate": 6.711254925941025e-06, + "loss": 4.0523, + "step": 38740 + }, + { + "epoch": 2.632490827558092, + "grad_norm": 0.3947445750236511, + "learning_rate": 6.710830275852698e-06, + "loss": 4.2316, + "step": 38745 + }, + { + "epoch": 2.632830547628754, + "grad_norm": 0.2724277079105377, + "learning_rate": 6.71040562576437e-06, + "loss": 4.2129, + "step": 38750 + }, + { + "epoch": 2.6331702676994158, + "grad_norm": 0.5130965113639832, + "learning_rate": 6.7099809756760435e-06, + "loss": 4.1317, + "step": 38755 + }, + { + "epoch": 2.6335099877700774, + "grad_norm": 0.29559192061424255, + "learning_rate": 6.709556325587716e-06, + "loss": 4.291, + "step": 38760 + }, + { + "epoch": 2.6338497078407395, + "grad_norm": 0.3030453324317932, + "learning_rate": 6.70913167549939e-06, + "loss": 3.9617, + "step": 38765 + }, + { + "epoch": 2.634189427911401, + "grad_norm": 0.46160024404525757, + "learning_rate": 6.708707025411062e-06, + "loss": 4.4142, + "step": 38770 + }, + { + "epoch": 2.6345291479820627, + "grad_norm": 0.34282591938972473, + "learning_rate": 6.708282375322735e-06, + "loss": 3.9242, + "step": 38775 + }, + { + "epoch": 2.634868868052725, + "grad_norm": 0.25396668910980225, + "learning_rate": 6.7078577252344075e-06, + "loss": 4.2807, + "step": 38780 + }, + { + "epoch": 2.6352085881233864, + "grad_norm": 0.4103064239025116, + "learning_rate": 6.70743307514608e-06, + "loss": 4.4365, + "step": 38785 + }, + { + "epoch": 2.635548308194048, + "grad_norm": 0.42015308141708374, + "learning_rate": 6.707008425057753e-06, + "loss": 4.4184, + "step": 38790 + }, + { + "epoch": 2.63588802826471, + "grad_norm": 0.35178667306900024, + "learning_rate": 6.706583774969426e-06, + "loss": 4.1979, + "step": 38795 + }, + { + "epoch": 2.6362277483353718, + "grad_norm": 0.365976482629776, + "learning_rate": 6.706159124881099e-06, + "loss": 4.3468, + "step": 38800 + }, + { + "epoch": 2.6365674684060334, + "grad_norm": 0.4179193377494812, + "learning_rate": 6.705734474792771e-06, + "loss": 4.0704, + "step": 38805 + }, + { + "epoch": 2.6369071884766955, + "grad_norm": 0.9803059101104736, + "learning_rate": 6.705309824704444e-06, + "loss": 4.1754, + "step": 38810 + }, + { + "epoch": 2.637246908547357, + "grad_norm": 0.40046679973602295, + "learning_rate": 6.704885174616117e-06, + "loss": 4.0036, + "step": 38815 + }, + { + "epoch": 2.6375866286180187, + "grad_norm": 0.39722761511802673, + "learning_rate": 6.704460524527789e-06, + "loss": 4.4443, + "step": 38820 + }, + { + "epoch": 2.6379263486886804, + "grad_norm": 0.42546388506889343, + "learning_rate": 6.704035874439463e-06, + "loss": 4.1983, + "step": 38825 + }, + { + "epoch": 2.6382660687593424, + "grad_norm": 0.3214744031429291, + "learning_rate": 6.7036112243511355e-06, + "loss": 4.0634, + "step": 38830 + }, + { + "epoch": 2.638605788830004, + "grad_norm": 0.3488433063030243, + "learning_rate": 6.7031865742628075e-06, + "loss": 4.2643, + "step": 38835 + }, + { + "epoch": 2.6389455089006657, + "grad_norm": 0.3113909065723419, + "learning_rate": 6.702761924174481e-06, + "loss": 3.9649, + "step": 38840 + }, + { + "epoch": 2.639285228971328, + "grad_norm": 0.23682314157485962, + "learning_rate": 6.702337274086154e-06, + "loss": 4.0301, + "step": 38845 + }, + { + "epoch": 2.6396249490419894, + "grad_norm": 0.37908896803855896, + "learning_rate": 6.701912623997826e-06, + "loss": 4.1163, + "step": 38850 + }, + { + "epoch": 2.639964669112651, + "grad_norm": 0.3080543279647827, + "learning_rate": 6.7014879739094995e-06, + "loss": 4.2606, + "step": 38855 + }, + { + "epoch": 2.640304389183313, + "grad_norm": 0.29367074370384216, + "learning_rate": 6.701063323821172e-06, + "loss": 4.2524, + "step": 38860 + }, + { + "epoch": 2.6406441092539747, + "grad_norm": 0.3549099862575531, + "learning_rate": 6.700638673732844e-06, + "loss": 4.0664, + "step": 38865 + }, + { + "epoch": 2.6409838293246364, + "grad_norm": 0.34124064445495605, + "learning_rate": 6.700214023644518e-06, + "loss": 3.8966, + "step": 38870 + }, + { + "epoch": 2.641323549395298, + "grad_norm": 0.33671361207962036, + "learning_rate": 6.69978937355619e-06, + "loss": 4.3414, + "step": 38875 + }, + { + "epoch": 2.64166326946596, + "grad_norm": 0.30759304761886597, + "learning_rate": 6.699364723467863e-06, + "loss": 4.0222, + "step": 38880 + }, + { + "epoch": 2.6420029895366217, + "grad_norm": 0.31631699204444885, + "learning_rate": 6.698940073379536e-06, + "loss": 4.26, + "step": 38885 + }, + { + "epoch": 2.6423427096072833, + "grad_norm": 0.3236902952194214, + "learning_rate": 6.698515423291208e-06, + "loss": 4.1138, + "step": 38890 + }, + { + "epoch": 2.6426824296779454, + "grad_norm": 0.4035790264606476, + "learning_rate": 6.698090773202881e-06, + "loss": 3.9735, + "step": 38895 + }, + { + "epoch": 2.643022149748607, + "grad_norm": 0.30679309368133545, + "learning_rate": 6.697666123114555e-06, + "loss": 3.837, + "step": 38900 + }, + { + "epoch": 2.6433618698192687, + "grad_norm": 0.9124605655670166, + "learning_rate": 6.697241473026227e-06, + "loss": 4.1645, + "step": 38905 + }, + { + "epoch": 2.6437015898899308, + "grad_norm": 0.2658267021179199, + "learning_rate": 6.6968168229378995e-06, + "loss": 3.9261, + "step": 38910 + }, + { + "epoch": 2.6440413099605924, + "grad_norm": 0.39138364791870117, + "learning_rate": 6.696392172849573e-06, + "loss": 4.0463, + "step": 38915 + }, + { + "epoch": 2.644381030031254, + "grad_norm": 0.3205065429210663, + "learning_rate": 6.695967522761245e-06, + "loss": 4.123, + "step": 38920 + }, + { + "epoch": 2.644720750101916, + "grad_norm": 0.27888768911361694, + "learning_rate": 6.695542872672918e-06, + "loss": 3.9571, + "step": 38925 + }, + { + "epoch": 2.6450604701725777, + "grad_norm": 0.32972633838653564, + "learning_rate": 6.6951182225845915e-06, + "loss": 4.0339, + "step": 38930 + }, + { + "epoch": 2.6454001902432394, + "grad_norm": 0.4414174258708954, + "learning_rate": 6.6946935724962635e-06, + "loss": 3.9888, + "step": 38935 + }, + { + "epoch": 2.6457399103139014, + "grad_norm": 0.27780869603157043, + "learning_rate": 6.694268922407936e-06, + "loss": 3.9727, + "step": 38940 + }, + { + "epoch": 2.646079630384563, + "grad_norm": 0.3791099786758423, + "learning_rate": 6.69384427231961e-06, + "loss": 4.1713, + "step": 38945 + }, + { + "epoch": 2.6464193504552247, + "grad_norm": 0.37285172939300537, + "learning_rate": 6.693419622231282e-06, + "loss": 4.2134, + "step": 38950 + }, + { + "epoch": 2.6467590705258868, + "grad_norm": 0.3111470937728882, + "learning_rate": 6.692994972142955e-06, + "loss": 4.0422, + "step": 38955 + }, + { + "epoch": 2.6470987905965484, + "grad_norm": 0.3461996018886566, + "learning_rate": 6.6925703220546275e-06, + "loss": 3.8765, + "step": 38960 + }, + { + "epoch": 2.64743851066721, + "grad_norm": 0.45890411734580994, + "learning_rate": 6.6921456719663e-06, + "loss": 4.197, + "step": 38965 + }, + { + "epoch": 2.647778230737872, + "grad_norm": 0.3446623682975769, + "learning_rate": 6.691721021877973e-06, + "loss": 4.1364, + "step": 38970 + }, + { + "epoch": 2.6481179508085337, + "grad_norm": 0.39477041363716125, + "learning_rate": 6.691296371789646e-06, + "loss": 4.0952, + "step": 38975 + }, + { + "epoch": 2.6484576708791954, + "grad_norm": 0.3934609889984131, + "learning_rate": 6.690871721701319e-06, + "loss": 4.2792, + "step": 38980 + }, + { + "epoch": 2.6487973909498574, + "grad_norm": 0.3061703145503998, + "learning_rate": 6.690447071612991e-06, + "loss": 3.9237, + "step": 38985 + }, + { + "epoch": 2.649137111020519, + "grad_norm": 0.47294020652770996, + "learning_rate": 6.690022421524664e-06, + "loss": 3.9337, + "step": 38990 + }, + { + "epoch": 2.6494768310911807, + "grad_norm": 0.3760506808757782, + "learning_rate": 6.689597771436337e-06, + "loss": 4.0855, + "step": 38995 + }, + { + "epoch": 2.649816551161843, + "grad_norm": 0.4326598048210144, + "learning_rate": 6.689173121348009e-06, + "loss": 4.3657, + "step": 39000 + }, + { + "epoch": 2.6501562712325044, + "grad_norm": 0.34479042887687683, + "learning_rate": 6.688748471259683e-06, + "loss": 4.2156, + "step": 39005 + }, + { + "epoch": 2.650495991303166, + "grad_norm": 0.34586524963378906, + "learning_rate": 6.6883238211713555e-06, + "loss": 4.1891, + "step": 39010 + }, + { + "epoch": 2.650835711373828, + "grad_norm": 0.35405492782592773, + "learning_rate": 6.6878991710830275e-06, + "loss": 4.2041, + "step": 39015 + }, + { + "epoch": 2.6511754314444897, + "grad_norm": 0.39074963331222534, + "learning_rate": 6.687474520994701e-06, + "loss": 3.9293, + "step": 39020 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.2980903089046478, + "learning_rate": 6.687049870906374e-06, + "loss": 4.2048, + "step": 39025 + }, + { + "epoch": 2.6518548715858135, + "grad_norm": 0.30705976486206055, + "learning_rate": 6.686625220818046e-06, + "loss": 4.1381, + "step": 39030 + }, + { + "epoch": 2.652194591656475, + "grad_norm": 0.3068237602710724, + "learning_rate": 6.6862005707297195e-06, + "loss": 4.0556, + "step": 39035 + }, + { + "epoch": 2.6525343117271367, + "grad_norm": 0.32068315148353577, + "learning_rate": 6.685775920641392e-06, + "loss": 4.1543, + "step": 39040 + }, + { + "epoch": 2.652874031797799, + "grad_norm": 0.3279162645339966, + "learning_rate": 6.685351270553064e-06, + "loss": 4.1877, + "step": 39045 + }, + { + "epoch": 2.6532137518684604, + "grad_norm": 0.32492268085479736, + "learning_rate": 6.684926620464738e-06, + "loss": 4.0554, + "step": 39050 + }, + { + "epoch": 2.653553471939122, + "grad_norm": 0.5427375435829163, + "learning_rate": 6.68450197037641e-06, + "loss": 3.9674, + "step": 39055 + }, + { + "epoch": 2.653893192009784, + "grad_norm": 0.5234894156455994, + "learning_rate": 6.684077320288083e-06, + "loss": 4.1177, + "step": 39060 + }, + { + "epoch": 2.6542329120804458, + "grad_norm": 0.3171158730983734, + "learning_rate": 6.683652670199756e-06, + "loss": 3.9315, + "step": 39065 + }, + { + "epoch": 2.6545726321511074, + "grad_norm": 0.4060606062412262, + "learning_rate": 6.683228020111428e-06, + "loss": 4.2462, + "step": 39070 + }, + { + "epoch": 2.6549123522217695, + "grad_norm": 0.9558091759681702, + "learning_rate": 6.682803370023101e-06, + "loss": 4.1119, + "step": 39075 + }, + { + "epoch": 2.655252072292431, + "grad_norm": 0.34979358315467834, + "learning_rate": 6.682378719934775e-06, + "loss": 4.1021, + "step": 39080 + }, + { + "epoch": 2.6555917923630927, + "grad_norm": 0.3092152178287506, + "learning_rate": 6.681954069846447e-06, + "loss": 4.1378, + "step": 39085 + }, + { + "epoch": 2.655931512433755, + "grad_norm": 0.2618195116519928, + "learning_rate": 6.6815294197581195e-06, + "loss": 4.3472, + "step": 39090 + }, + { + "epoch": 2.6562712325044164, + "grad_norm": 0.2847874164581299, + "learning_rate": 6.681104769669793e-06, + "loss": 4.2228, + "step": 39095 + }, + { + "epoch": 2.656610952575078, + "grad_norm": 0.7326367497444153, + "learning_rate": 6.680680119581465e-06, + "loss": 4.1973, + "step": 39100 + }, + { + "epoch": 2.65695067264574, + "grad_norm": 0.2928914427757263, + "learning_rate": 6.680255469493139e-06, + "loss": 3.9823, + "step": 39105 + }, + { + "epoch": 2.6572903927164018, + "grad_norm": 0.5436951518058777, + "learning_rate": 6.6798308194048115e-06, + "loss": 3.9971, + "step": 39110 + }, + { + "epoch": 2.6576301127870634, + "grad_norm": 0.48338642716407776, + "learning_rate": 6.6794061693164835e-06, + "loss": 4.0515, + "step": 39115 + }, + { + "epoch": 2.6579698328577255, + "grad_norm": 0.31968605518341064, + "learning_rate": 6.678981519228157e-06, + "loss": 4.2518, + "step": 39120 + }, + { + "epoch": 2.658309552928387, + "grad_norm": 0.29551050066947937, + "learning_rate": 6.678556869139829e-06, + "loss": 3.7433, + "step": 39125 + }, + { + "epoch": 2.6586492729990487, + "grad_norm": 0.3499673008918762, + "learning_rate": 6.678132219051502e-06, + "loss": 3.9953, + "step": 39130 + }, + { + "epoch": 2.658988993069711, + "grad_norm": 0.3823378086090088, + "learning_rate": 6.6777075689631755e-06, + "loss": 4.2902, + "step": 39135 + }, + { + "epoch": 2.6593287131403724, + "grad_norm": 0.5979027152061462, + "learning_rate": 6.6772829188748475e-06, + "loss": 4.3996, + "step": 39140 + }, + { + "epoch": 2.659668433211034, + "grad_norm": 0.2998010218143463, + "learning_rate": 6.67685826878652e-06, + "loss": 3.7935, + "step": 39145 + }, + { + "epoch": 2.660008153281696, + "grad_norm": 0.3237876296043396, + "learning_rate": 6.676433618698194e-06, + "loss": 4.4019, + "step": 39150 + }, + { + "epoch": 2.660347873352358, + "grad_norm": 0.2518341839313507, + "learning_rate": 6.676008968609866e-06, + "loss": 4.4584, + "step": 39155 + }, + { + "epoch": 2.6606875934230194, + "grad_norm": 0.375280499458313, + "learning_rate": 6.675584318521539e-06, + "loss": 3.9638, + "step": 39160 + }, + { + "epoch": 2.661027313493681, + "grad_norm": 0.4081980586051941, + "learning_rate": 6.675159668433212e-06, + "loss": 4.2573, + "step": 39165 + }, + { + "epoch": 2.661367033564343, + "grad_norm": 0.3064493238925934, + "learning_rate": 6.674735018344884e-06, + "loss": 4.2003, + "step": 39170 + }, + { + "epoch": 2.6617067536350048, + "grad_norm": 0.31146538257598877, + "learning_rate": 6.674310368256557e-06, + "loss": 3.9699, + "step": 39175 + }, + { + "epoch": 2.6620464737056664, + "grad_norm": 0.3683062493801117, + "learning_rate": 6.673885718168231e-06, + "loss": 3.9817, + "step": 39180 + }, + { + "epoch": 2.6623861937763285, + "grad_norm": 0.3268360197544098, + "learning_rate": 6.673461068079903e-06, + "loss": 3.9446, + "step": 39185 + }, + { + "epoch": 2.66272591384699, + "grad_norm": 0.5861287117004395, + "learning_rate": 6.6730364179915755e-06, + "loss": 4.0976, + "step": 39190 + }, + { + "epoch": 2.6630656339176517, + "grad_norm": 0.3208427429199219, + "learning_rate": 6.672611767903249e-06, + "loss": 4.3286, + "step": 39195 + }, + { + "epoch": 2.663405353988314, + "grad_norm": 0.3104759156703949, + "learning_rate": 6.672187117814921e-06, + "loss": 4.2906, + "step": 39200 + }, + { + "epoch": 2.6637450740589754, + "grad_norm": 0.548774242401123, + "learning_rate": 6.671762467726594e-06, + "loss": 4.1303, + "step": 39205 + }, + { + "epoch": 2.664084794129637, + "grad_norm": 0.2852970063686371, + "learning_rate": 6.671337817638267e-06, + "loss": 4.0211, + "step": 39210 + }, + { + "epoch": 2.6644245142002987, + "grad_norm": 0.3995794355869293, + "learning_rate": 6.6709131675499395e-06, + "loss": 4.0626, + "step": 39215 + }, + { + "epoch": 2.6647642342709608, + "grad_norm": 0.37964674830436707, + "learning_rate": 6.6704885174616114e-06, + "loss": 4.0527, + "step": 39220 + }, + { + "epoch": 2.6651039543416224, + "grad_norm": 0.2608977258205414, + "learning_rate": 6.670063867373285e-06, + "loss": 4.1968, + "step": 39225 + }, + { + "epoch": 2.665443674412284, + "grad_norm": 0.36069533228874207, + "learning_rate": 6.669639217284958e-06, + "loss": 3.9925, + "step": 39230 + }, + { + "epoch": 2.665783394482946, + "grad_norm": 0.24680276215076447, + "learning_rate": 6.66921456719663e-06, + "loss": 4.1434, + "step": 39235 + }, + { + "epoch": 2.6661231145536077, + "grad_norm": 0.29570409655570984, + "learning_rate": 6.6687899171083035e-06, + "loss": 4.1757, + "step": 39240 + }, + { + "epoch": 2.6664628346242694, + "grad_norm": 0.3648737668991089, + "learning_rate": 6.668365267019976e-06, + "loss": 4.2232, + "step": 39245 + }, + { + "epoch": 2.6668025546949314, + "grad_norm": 0.31175458431243896, + "learning_rate": 6.667940616931648e-06, + "loss": 4.2844, + "step": 39250 + }, + { + "epoch": 2.667142274765593, + "grad_norm": 0.3059222996234894, + "learning_rate": 6.667515966843322e-06, + "loss": 3.944, + "step": 39255 + }, + { + "epoch": 2.6674819948362547, + "grad_norm": 0.47973719239234924, + "learning_rate": 6.667091316754995e-06, + "loss": 4.4748, + "step": 39260 + }, + { + "epoch": 2.6678217149069168, + "grad_norm": 0.3574087619781494, + "learning_rate": 6.666666666666667e-06, + "loss": 4.0692, + "step": 39265 + }, + { + "epoch": 2.6681614349775784, + "grad_norm": 0.3886382579803467, + "learning_rate": 6.66624201657834e-06, + "loss": 4.0433, + "step": 39270 + }, + { + "epoch": 2.66850115504824, + "grad_norm": 0.3564925491809845, + "learning_rate": 6.665817366490013e-06, + "loss": 4.3567, + "step": 39275 + }, + { + "epoch": 2.668840875118902, + "grad_norm": 0.37383362650871277, + "learning_rate": 6.665392716401685e-06, + "loss": 4.1692, + "step": 39280 + }, + { + "epoch": 2.6691805951895637, + "grad_norm": 0.4097398817539215, + "learning_rate": 6.664968066313359e-06, + "loss": 4.2432, + "step": 39285 + }, + { + "epoch": 2.6695203152602254, + "grad_norm": 0.397940456867218, + "learning_rate": 6.6645434162250315e-06, + "loss": 4.1097, + "step": 39290 + }, + { + "epoch": 2.6698600353308874, + "grad_norm": 0.3166559040546417, + "learning_rate": 6.6641187661367035e-06, + "loss": 4.1883, + "step": 39295 + }, + { + "epoch": 2.670199755401549, + "grad_norm": 0.33351895213127136, + "learning_rate": 6.663694116048377e-06, + "loss": 4.2122, + "step": 39300 + }, + { + "epoch": 2.6705394754722107, + "grad_norm": 0.3101691007614136, + "learning_rate": 6.663269465960049e-06, + "loss": 4.0579, + "step": 39305 + }, + { + "epoch": 2.670879195542873, + "grad_norm": 0.326925665140152, + "learning_rate": 6.662844815871722e-06, + "loss": 4.029, + "step": 39310 + }, + { + "epoch": 2.6712189156135344, + "grad_norm": 0.3547525703907013, + "learning_rate": 6.6624201657833955e-06, + "loss": 4.145, + "step": 39315 + }, + { + "epoch": 2.671558635684196, + "grad_norm": 0.4806784689426422, + "learning_rate": 6.6619955156950675e-06, + "loss": 4.6406, + "step": 39320 + }, + { + "epoch": 2.671898355754858, + "grad_norm": 0.2779713571071625, + "learning_rate": 6.66157086560674e-06, + "loss": 4.034, + "step": 39325 + }, + { + "epoch": 2.6722380758255198, + "grad_norm": 0.25950300693511963, + "learning_rate": 6.661146215518414e-06, + "loss": 4.1623, + "step": 39330 + }, + { + "epoch": 2.6725777958961814, + "grad_norm": 0.30047327280044556, + "learning_rate": 6.660721565430086e-06, + "loss": 3.9996, + "step": 39335 + }, + { + "epoch": 2.6729175159668435, + "grad_norm": 0.5525081753730774, + "learning_rate": 6.660296915341759e-06, + "loss": 4.1898, + "step": 39340 + }, + { + "epoch": 2.673257236037505, + "grad_norm": 0.40754425525665283, + "learning_rate": 6.659872265253432e-06, + "loss": 4.2862, + "step": 39345 + }, + { + "epoch": 2.6735969561081667, + "grad_norm": 0.4303596019744873, + "learning_rate": 6.659447615165104e-06, + "loss": 4.1463, + "step": 39350 + }, + { + "epoch": 2.673936676178829, + "grad_norm": 0.4013802409172058, + "learning_rate": 6.659022965076777e-06, + "loss": 4.3703, + "step": 39355 + }, + { + "epoch": 2.6742763962494904, + "grad_norm": 0.234235480427742, + "learning_rate": 6.658598314988451e-06, + "loss": 3.7989, + "step": 39360 + }, + { + "epoch": 2.674616116320152, + "grad_norm": 0.3332996070384979, + "learning_rate": 6.658173664900123e-06, + "loss": 4.0326, + "step": 39365 + }, + { + "epoch": 2.674955836390814, + "grad_norm": 0.3312223255634308, + "learning_rate": 6.6577490148117955e-06, + "loss": 4.1676, + "step": 39370 + }, + { + "epoch": 2.6752955564614758, + "grad_norm": 0.4888524115085602, + "learning_rate": 6.657324364723468e-06, + "loss": 4.2279, + "step": 39375 + }, + { + "epoch": 2.6756352765321374, + "grad_norm": 0.2918905019760132, + "learning_rate": 6.656899714635141e-06, + "loss": 4.1464, + "step": 39380 + }, + { + "epoch": 2.6759749966027995, + "grad_norm": 0.33031007647514343, + "learning_rate": 6.656475064546814e-06, + "loss": 4.2963, + "step": 39385 + }, + { + "epoch": 2.676314716673461, + "grad_norm": 0.2967909872531891, + "learning_rate": 6.656050414458487e-06, + "loss": 4.1488, + "step": 39390 + }, + { + "epoch": 2.6766544367441227, + "grad_norm": 0.273360937833786, + "learning_rate": 6.6556257643701595e-06, + "loss": 4.172, + "step": 39395 + }, + { + "epoch": 2.676994156814785, + "grad_norm": 0.49071651697158813, + "learning_rate": 6.655201114281831e-06, + "loss": 4.3612, + "step": 39400 + }, + { + "epoch": 2.6773338768854464, + "grad_norm": 0.46733152866363525, + "learning_rate": 6.654776464193505e-06, + "loss": 4.2092, + "step": 39405 + }, + { + "epoch": 2.677673596956108, + "grad_norm": 0.3339482843875885, + "learning_rate": 6.654351814105178e-06, + "loss": 4.1172, + "step": 39410 + }, + { + "epoch": 2.67801331702677, + "grad_norm": 0.3177696466445923, + "learning_rate": 6.65392716401685e-06, + "loss": 4.4347, + "step": 39415 + }, + { + "epoch": 2.6783530370974318, + "grad_norm": 0.6356673836708069, + "learning_rate": 6.6535025139285235e-06, + "loss": 4.07, + "step": 39420 + }, + { + "epoch": 2.6786927571680934, + "grad_norm": 0.3351656496524811, + "learning_rate": 6.653077863840196e-06, + "loss": 4.2466, + "step": 39425 + }, + { + "epoch": 2.6790324772387555, + "grad_norm": 0.4029470682144165, + "learning_rate": 6.652653213751868e-06, + "loss": 4.2937, + "step": 39430 + }, + { + "epoch": 2.679372197309417, + "grad_norm": 0.411004900932312, + "learning_rate": 6.652228563663542e-06, + "loss": 4.048, + "step": 39435 + }, + { + "epoch": 2.6797119173800787, + "grad_norm": 0.25222229957580566, + "learning_rate": 6.651803913575215e-06, + "loss": 4.1183, + "step": 39440 + }, + { + "epoch": 2.680051637450741, + "grad_norm": 0.3030124008655548, + "learning_rate": 6.6513792634868875e-06, + "loss": 4.1738, + "step": 39445 + }, + { + "epoch": 2.6803913575214025, + "grad_norm": 0.25874149799346924, + "learning_rate": 6.65095461339856e-06, + "loss": 4.1871, + "step": 39450 + }, + { + "epoch": 2.680731077592064, + "grad_norm": 0.37370553612709045, + "learning_rate": 6.650529963310233e-06, + "loss": 4.2636, + "step": 39455 + }, + { + "epoch": 2.681070797662726, + "grad_norm": 0.28812533617019653, + "learning_rate": 6.650105313221906e-06, + "loss": 4.054, + "step": 39460 + }, + { + "epoch": 2.681410517733388, + "grad_norm": 0.34674957394599915, + "learning_rate": 6.649680663133579e-06, + "loss": 4.1802, + "step": 39465 + }, + { + "epoch": 2.6817502378040494, + "grad_norm": 0.25673815608024597, + "learning_rate": 6.649256013045251e-06, + "loss": 4.0676, + "step": 39470 + }, + { + "epoch": 2.6820899578747115, + "grad_norm": 0.26782727241516113, + "learning_rate": 6.648831362956924e-06, + "loss": 4.0273, + "step": 39475 + }, + { + "epoch": 2.682429677945373, + "grad_norm": 0.2750261127948761, + "learning_rate": 6.648406712868597e-06, + "loss": 4.0474, + "step": 39480 + }, + { + "epoch": 2.6827693980160348, + "grad_norm": 0.40227553248405457, + "learning_rate": 6.647982062780269e-06, + "loss": 4.4434, + "step": 39485 + }, + { + "epoch": 2.683109118086697, + "grad_norm": 0.32917726039886475, + "learning_rate": 6.647557412691943e-06, + "loss": 4.2161, + "step": 39490 + }, + { + "epoch": 2.6834488381573585, + "grad_norm": 0.3321444094181061, + "learning_rate": 6.6471327626036155e-06, + "loss": 4.1418, + "step": 39495 + }, + { + "epoch": 2.68378855822802, + "grad_norm": 0.5243711471557617, + "learning_rate": 6.6467081125152874e-06, + "loss": 4.1003, + "step": 39500 + }, + { + "epoch": 2.6841282782986817, + "grad_norm": 0.4531053900718689, + "learning_rate": 6.646283462426961e-06, + "loss": 4.1869, + "step": 39505 + }, + { + "epoch": 2.684467998369344, + "grad_norm": 0.2693192958831787, + "learning_rate": 6.645858812338634e-06, + "loss": 4.1779, + "step": 39510 + }, + { + "epoch": 2.6848077184400054, + "grad_norm": 0.4508264660835266, + "learning_rate": 6.645434162250306e-06, + "loss": 4.1162, + "step": 39515 + }, + { + "epoch": 2.685147438510667, + "grad_norm": 0.3300711214542389, + "learning_rate": 6.6450095121619795e-06, + "loss": 3.9761, + "step": 39520 + }, + { + "epoch": 2.685487158581329, + "grad_norm": 0.33268117904663086, + "learning_rate": 6.644584862073652e-06, + "loss": 3.9594, + "step": 39525 + }, + { + "epoch": 2.6858268786519908, + "grad_norm": 0.3620127737522125, + "learning_rate": 6.644160211985324e-06, + "loss": 4.1436, + "step": 39530 + }, + { + "epoch": 2.6861665987226524, + "grad_norm": 0.2890022099018097, + "learning_rate": 6.643735561896998e-06, + "loss": 4.2173, + "step": 39535 + }, + { + "epoch": 2.6865063187933145, + "grad_norm": 0.35372021794319153, + "learning_rate": 6.643310911808671e-06, + "loss": 4.1345, + "step": 39540 + }, + { + "epoch": 2.686846038863976, + "grad_norm": 0.47303012013435364, + "learning_rate": 6.642886261720343e-06, + "loss": 3.796, + "step": 39545 + }, + { + "epoch": 2.6871857589346377, + "grad_norm": 0.5342762470245361, + "learning_rate": 6.642461611632016e-06, + "loss": 4.2096, + "step": 39550 + }, + { + "epoch": 2.6875254790052994, + "grad_norm": 0.35702964663505554, + "learning_rate": 6.642036961543688e-06, + "loss": 4.0987, + "step": 39555 + }, + { + "epoch": 2.6878651990759614, + "grad_norm": 0.38810044527053833, + "learning_rate": 6.641612311455361e-06, + "loss": 4.2045, + "step": 39560 + }, + { + "epoch": 2.688204919146623, + "grad_norm": 0.31039097905158997, + "learning_rate": 6.641187661367035e-06, + "loss": 4.1692, + "step": 39565 + }, + { + "epoch": 2.6885446392172847, + "grad_norm": 0.5087475776672363, + "learning_rate": 6.640763011278707e-06, + "loss": 4.1689, + "step": 39570 + }, + { + "epoch": 2.688884359287947, + "grad_norm": 0.488471657037735, + "learning_rate": 6.6403383611903794e-06, + "loss": 4.0654, + "step": 39575 + }, + { + "epoch": 2.6892240793586084, + "grad_norm": 0.4577271342277527, + "learning_rate": 6.639913711102053e-06, + "loss": 4.1397, + "step": 39580 + }, + { + "epoch": 2.68956379942927, + "grad_norm": 0.399454265832901, + "learning_rate": 6.639489061013725e-06, + "loss": 4.3452, + "step": 39585 + }, + { + "epoch": 2.689903519499932, + "grad_norm": 0.5403753519058228, + "learning_rate": 6.639064410925398e-06, + "loss": 4.2576, + "step": 39590 + }, + { + "epoch": 2.6902432395705937, + "grad_norm": 0.28242436051368713, + "learning_rate": 6.6386397608370715e-06, + "loss": 4.0852, + "step": 39595 + }, + { + "epoch": 2.6905829596412554, + "grad_norm": 0.45061755180358887, + "learning_rate": 6.6382151107487435e-06, + "loss": 4.074, + "step": 39600 + }, + { + "epoch": 2.6909226797119175, + "grad_norm": 0.27290046215057373, + "learning_rate": 6.637790460660416e-06, + "loss": 3.9334, + "step": 39605 + }, + { + "epoch": 2.691262399782579, + "grad_norm": 0.36286044120788574, + "learning_rate": 6.63736581057209e-06, + "loss": 4.0793, + "step": 39610 + }, + { + "epoch": 2.6916021198532407, + "grad_norm": 0.36956867575645447, + "learning_rate": 6.636941160483762e-06, + "loss": 4.272, + "step": 39615 + }, + { + "epoch": 2.691941839923903, + "grad_norm": 0.27383890748023987, + "learning_rate": 6.636516510395435e-06, + "loss": 4.0224, + "step": 39620 + }, + { + "epoch": 2.6922815599945644, + "grad_norm": 0.4535904824733734, + "learning_rate": 6.6360918603071075e-06, + "loss": 4.2385, + "step": 39625 + }, + { + "epoch": 2.692621280065226, + "grad_norm": 0.33067578077316284, + "learning_rate": 6.63566721021878e-06, + "loss": 4.0333, + "step": 39630 + }, + { + "epoch": 2.692961000135888, + "grad_norm": 0.3665585219860077, + "learning_rate": 6.635242560130453e-06, + "loss": 4.0918, + "step": 39635 + }, + { + "epoch": 2.6933007202065498, + "grad_norm": 0.5784291625022888, + "learning_rate": 6.634817910042126e-06, + "loss": 4.1807, + "step": 39640 + }, + { + "epoch": 2.6936404402772114, + "grad_norm": 0.47805914282798767, + "learning_rate": 6.634393259953799e-06, + "loss": 3.937, + "step": 39645 + }, + { + "epoch": 2.6939801603478735, + "grad_norm": 0.46894124150276184, + "learning_rate": 6.633968609865471e-06, + "loss": 4.092, + "step": 39650 + }, + { + "epoch": 2.694319880418535, + "grad_norm": 0.3160810172557831, + "learning_rate": 6.633543959777144e-06, + "loss": 4.136, + "step": 39655 + }, + { + "epoch": 2.6946596004891967, + "grad_norm": 0.4944882392883301, + "learning_rate": 6.633119309688817e-06, + "loss": 3.9422, + "step": 39660 + }, + { + "epoch": 2.694999320559859, + "grad_norm": 0.30301806330680847, + "learning_rate": 6.632694659600489e-06, + "loss": 4.2187, + "step": 39665 + }, + { + "epoch": 2.6953390406305204, + "grad_norm": 0.274935245513916, + "learning_rate": 6.632270009512163e-06, + "loss": 3.9452, + "step": 39670 + }, + { + "epoch": 2.695678760701182, + "grad_norm": 0.37892279028892517, + "learning_rate": 6.6318453594238355e-06, + "loss": 3.9869, + "step": 39675 + }, + { + "epoch": 2.696018480771844, + "grad_norm": 0.2719658315181732, + "learning_rate": 6.631420709335507e-06, + "loss": 4.2037, + "step": 39680 + }, + { + "epoch": 2.6963582008425058, + "grad_norm": 0.36001405119895935, + "learning_rate": 6.630996059247181e-06, + "loss": 4.0655, + "step": 39685 + }, + { + "epoch": 2.6966979209131674, + "grad_norm": 0.38067522644996643, + "learning_rate": 6.630571409158854e-06, + "loss": 4.1438, + "step": 39690 + }, + { + "epoch": 2.6970376409838295, + "grad_norm": 0.400176465511322, + "learning_rate": 6.630146759070526e-06, + "loss": 4.0299, + "step": 39695 + }, + { + "epoch": 2.697377361054491, + "grad_norm": 0.25322380661964417, + "learning_rate": 6.6297221089821995e-06, + "loss": 4.0772, + "step": 39700 + }, + { + "epoch": 2.6977170811251527, + "grad_norm": 0.2491844743490219, + "learning_rate": 6.629297458893872e-06, + "loss": 4.0715, + "step": 39705 + }, + { + "epoch": 2.698056801195815, + "grad_norm": 0.30528348684310913, + "learning_rate": 6.628872808805544e-06, + "loss": 4.2992, + "step": 39710 + }, + { + "epoch": 2.6983965212664764, + "grad_norm": 0.4517967700958252, + "learning_rate": 6.628448158717218e-06, + "loss": 4.1611, + "step": 39715 + }, + { + "epoch": 2.698736241337138, + "grad_norm": 0.3538734018802643, + "learning_rate": 6.62802350862889e-06, + "loss": 4.3375, + "step": 39720 + }, + { + "epoch": 2.6990759614078, + "grad_norm": 0.3512342870235443, + "learning_rate": 6.627598858540563e-06, + "loss": 3.9946, + "step": 39725 + }, + { + "epoch": 2.699415681478462, + "grad_norm": 0.26214003562927246, + "learning_rate": 6.627174208452236e-06, + "loss": 4.0388, + "step": 39730 + }, + { + "epoch": 2.6997554015491234, + "grad_norm": 0.3953142762184143, + "learning_rate": 6.626749558363908e-06, + "loss": 4.1206, + "step": 39735 + }, + { + "epoch": 2.7000951216197855, + "grad_norm": 0.2603701055049896, + "learning_rate": 6.626324908275581e-06, + "loss": 3.9436, + "step": 39740 + }, + { + "epoch": 2.700434841690447, + "grad_norm": 0.31308743357658386, + "learning_rate": 6.625900258187255e-06, + "loss": 4.104, + "step": 39745 + }, + { + "epoch": 2.7007745617611087, + "grad_norm": 0.39022567868232727, + "learning_rate": 6.625475608098927e-06, + "loss": 4.1522, + "step": 39750 + }, + { + "epoch": 2.701114281831771, + "grad_norm": 0.2476404309272766, + "learning_rate": 6.6250509580105994e-06, + "loss": 4.0205, + "step": 39755 + }, + { + "epoch": 2.7014540019024325, + "grad_norm": 0.3329305350780487, + "learning_rate": 6.624626307922273e-06, + "loss": 4.1656, + "step": 39760 + }, + { + "epoch": 2.701793721973094, + "grad_norm": 0.28941717743873596, + "learning_rate": 6.624201657833945e-06, + "loss": 4.1865, + "step": 39765 + }, + { + "epoch": 2.702133442043756, + "grad_norm": 0.4306187331676483, + "learning_rate": 6.623777007745618e-06, + "loss": 4.1353, + "step": 39770 + }, + { + "epoch": 2.702473162114418, + "grad_norm": 0.5013722777366638, + "learning_rate": 6.6233523576572915e-06, + "loss": 3.9632, + "step": 39775 + }, + { + "epoch": 2.7028128821850794, + "grad_norm": 0.3415372669696808, + "learning_rate": 6.6229277075689634e-06, + "loss": 4.0159, + "step": 39780 + }, + { + "epoch": 2.7031526022557415, + "grad_norm": 0.3275023102760315, + "learning_rate": 6.622503057480637e-06, + "loss": 4.0345, + "step": 39785 + }, + { + "epoch": 2.703492322326403, + "grad_norm": 0.3017248213291168, + "learning_rate": 6.622078407392309e-06, + "loss": 4.0423, + "step": 39790 + }, + { + "epoch": 2.7038320423970648, + "grad_norm": 0.5028504133224487, + "learning_rate": 6.621653757303982e-06, + "loss": 4.245, + "step": 39795 + }, + { + "epoch": 2.704171762467727, + "grad_norm": 0.5394537448883057, + "learning_rate": 6.6212291072156555e-06, + "loss": 4.088, + "step": 39800 + }, + { + "epoch": 2.7045114825383885, + "grad_norm": 0.3951258659362793, + "learning_rate": 6.6208044571273274e-06, + "loss": 4.1459, + "step": 39805 + }, + { + "epoch": 2.70485120260905, + "grad_norm": 0.295487642288208, + "learning_rate": 6.620379807039e-06, + "loss": 4.0921, + "step": 39810 + }, + { + "epoch": 2.705190922679712, + "grad_norm": 0.3151441514492035, + "learning_rate": 6.619955156950674e-06, + "loss": 4.1049, + "step": 39815 + }, + { + "epoch": 2.705530642750374, + "grad_norm": 0.44371628761291504, + "learning_rate": 6.619530506862346e-06, + "loss": 4.1835, + "step": 39820 + }, + { + "epoch": 2.7058703628210354, + "grad_norm": 0.4183436930179596, + "learning_rate": 6.619105856774019e-06, + "loss": 4.1041, + "step": 39825 + }, + { + "epoch": 2.7062100828916975, + "grad_norm": 0.7643747329711914, + "learning_rate": 6.618681206685692e-06, + "loss": 4.183, + "step": 39830 + }, + { + "epoch": 2.706549802962359, + "grad_norm": 0.2964249849319458, + "learning_rate": 6.618256556597364e-06, + "loss": 4.1819, + "step": 39835 + }, + { + "epoch": 2.7068895230330208, + "grad_norm": 0.35286960005760193, + "learning_rate": 6.617831906509037e-06, + "loss": 4.0108, + "step": 39840 + }, + { + "epoch": 2.7072292431036824, + "grad_norm": 0.25064539909362793, + "learning_rate": 6.617407256420711e-06, + "loss": 3.9376, + "step": 39845 + }, + { + "epoch": 2.7075689631743445, + "grad_norm": 0.3800380229949951, + "learning_rate": 6.616982606332383e-06, + "loss": 4.072, + "step": 39850 + }, + { + "epoch": 2.707908683245006, + "grad_norm": 0.5181752443313599, + "learning_rate": 6.6165579562440554e-06, + "loss": 4.1933, + "step": 39855 + }, + { + "epoch": 2.7082484033156677, + "grad_norm": 0.35623979568481445, + "learning_rate": 6.616133306155729e-06, + "loss": 3.9529, + "step": 39860 + }, + { + "epoch": 2.70858812338633, + "grad_norm": 0.5029263496398926, + "learning_rate": 6.615708656067401e-06, + "loss": 4.2592, + "step": 39865 + }, + { + "epoch": 2.7089278434569914, + "grad_norm": 0.4120001196861267, + "learning_rate": 6.615284005979074e-06, + "loss": 4.2389, + "step": 39870 + }, + { + "epoch": 2.709267563527653, + "grad_norm": 0.30792301893234253, + "learning_rate": 6.614859355890747e-06, + "loss": 4.1117, + "step": 39875 + }, + { + "epoch": 2.709607283598315, + "grad_norm": 0.45433351397514343, + "learning_rate": 6.6144347058024194e-06, + "loss": 4.0931, + "step": 39880 + }, + { + "epoch": 2.709947003668977, + "grad_norm": 0.40293845534324646, + "learning_rate": 6.614010055714092e-06, + "loss": 4.2457, + "step": 39885 + }, + { + "epoch": 2.7102867237396384, + "grad_norm": 0.2651796340942383, + "learning_rate": 6.613585405625765e-06, + "loss": 3.9868, + "step": 39890 + }, + { + "epoch": 2.7106264438103, + "grad_norm": 0.3302900493144989, + "learning_rate": 6.613160755537438e-06, + "loss": 4.1901, + "step": 39895 + }, + { + "epoch": 2.710966163880962, + "grad_norm": 0.29363900423049927, + "learning_rate": 6.61273610544911e-06, + "loss": 4.1468, + "step": 39900 + }, + { + "epoch": 2.7113058839516238, + "grad_norm": 0.41276612877845764, + "learning_rate": 6.6123114553607835e-06, + "loss": 4.2427, + "step": 39905 + }, + { + "epoch": 2.7116456040222854, + "grad_norm": 0.33308717608451843, + "learning_rate": 6.611886805272456e-06, + "loss": 4.1861, + "step": 39910 + }, + { + "epoch": 2.7119853240929475, + "grad_norm": 0.4225836396217346, + "learning_rate": 6.611462155184128e-06, + "loss": 4.0044, + "step": 39915 + }, + { + "epoch": 2.712325044163609, + "grad_norm": 0.25438883900642395, + "learning_rate": 6.611037505095802e-06, + "loss": 4.2031, + "step": 39920 + }, + { + "epoch": 2.7126647642342707, + "grad_norm": 0.3444686532020569, + "learning_rate": 6.610612855007475e-06, + "loss": 4.1336, + "step": 39925 + }, + { + "epoch": 2.713004484304933, + "grad_norm": 0.3694024384021759, + "learning_rate": 6.610188204919147e-06, + "loss": 4.2466, + "step": 39930 + }, + { + "epoch": 2.7133442043755944, + "grad_norm": 0.5037913918495178, + "learning_rate": 6.60976355483082e-06, + "loss": 4.2782, + "step": 39935 + }, + { + "epoch": 2.713683924446256, + "grad_norm": 0.27582213282585144, + "learning_rate": 6.609338904742493e-06, + "loss": 4.0026, + "step": 39940 + }, + { + "epoch": 2.714023644516918, + "grad_norm": 0.3182463049888611, + "learning_rate": 6.608914254654165e-06, + "loss": 3.9058, + "step": 39945 + }, + { + "epoch": 2.7143633645875798, + "grad_norm": 0.5513135194778442, + "learning_rate": 6.608489604565839e-06, + "loss": 3.8808, + "step": 39950 + }, + { + "epoch": 2.7147030846582414, + "grad_norm": 0.29741719365119934, + "learning_rate": 6.6080649544775115e-06, + "loss": 4.0986, + "step": 39955 + }, + { + "epoch": 2.7150428047289035, + "grad_norm": 0.6656172275543213, + "learning_rate": 6.607640304389183e-06, + "loss": 4.1911, + "step": 39960 + }, + { + "epoch": 2.715382524799565, + "grad_norm": 0.28425395488739014, + "learning_rate": 6.607215654300857e-06, + "loss": 4.0945, + "step": 39965 + }, + { + "epoch": 2.7157222448702267, + "grad_norm": 0.3732162415981293, + "learning_rate": 6.606791004212529e-06, + "loss": 3.7992, + "step": 39970 + }, + { + "epoch": 2.716061964940889, + "grad_norm": 0.4671967923641205, + "learning_rate": 6.606366354124202e-06, + "loss": 4.2663, + "step": 39975 + }, + { + "epoch": 2.7164016850115504, + "grad_norm": 0.4990939199924469, + "learning_rate": 6.6059417040358755e-06, + "loss": 4.0089, + "step": 39980 + }, + { + "epoch": 2.716741405082212, + "grad_norm": 0.37401410937309265, + "learning_rate": 6.605517053947547e-06, + "loss": 4.1698, + "step": 39985 + }, + { + "epoch": 2.717081125152874, + "grad_norm": 0.31578922271728516, + "learning_rate": 6.60509240385922e-06, + "loss": 4.0706, + "step": 39990 + }, + { + "epoch": 2.7174208452235358, + "grad_norm": 0.35960131883621216, + "learning_rate": 6.604667753770894e-06, + "loss": 4.1249, + "step": 39995 + }, + { + "epoch": 2.7177605652941974, + "grad_norm": 0.26087266206741333, + "learning_rate": 6.604243103682566e-06, + "loss": 4.0944, + "step": 40000 + }, + { + "epoch": 2.7181002853648595, + "grad_norm": 0.428239107131958, + "learning_rate": 6.603818453594239e-06, + "loss": 4.2181, + "step": 40005 + }, + { + "epoch": 2.718440005435521, + "grad_norm": 0.3016834557056427, + "learning_rate": 6.603393803505912e-06, + "loss": 4.0216, + "step": 40010 + }, + { + "epoch": 2.7187797255061827, + "grad_norm": 0.29002639651298523, + "learning_rate": 6.602969153417584e-06, + "loss": 3.9563, + "step": 40015 + }, + { + "epoch": 2.719119445576845, + "grad_norm": 0.4254847466945648, + "learning_rate": 6.602544503329257e-06, + "loss": 4.3502, + "step": 40020 + }, + { + "epoch": 2.7194591656475064, + "grad_norm": 0.33559754490852356, + "learning_rate": 6.602119853240931e-06, + "loss": 4.1375, + "step": 40025 + }, + { + "epoch": 2.719798885718168, + "grad_norm": 0.351316899061203, + "learning_rate": 6.601695203152603e-06, + "loss": 4.2239, + "step": 40030 + }, + { + "epoch": 2.72013860578883, + "grad_norm": 0.29378294944763184, + "learning_rate": 6.6012705530642754e-06, + "loss": 3.9533, + "step": 40035 + }, + { + "epoch": 2.720478325859492, + "grad_norm": 0.3415873944759369, + "learning_rate": 6.600845902975948e-06, + "loss": 4.2006, + "step": 40040 + }, + { + "epoch": 2.7208180459301534, + "grad_norm": 0.5907360911369324, + "learning_rate": 6.600421252887621e-06, + "loss": 4.2259, + "step": 40045 + }, + { + "epoch": 2.7211577660008155, + "grad_norm": 0.28662747144699097, + "learning_rate": 6.599996602799294e-06, + "loss": 4.0402, + "step": 40050 + }, + { + "epoch": 2.721497486071477, + "grad_norm": 0.3799734115600586, + "learning_rate": 6.599571952710967e-06, + "loss": 4.1338, + "step": 40055 + }, + { + "epoch": 2.7218372061421388, + "grad_norm": 0.43560153245925903, + "learning_rate": 6.5991473026226394e-06, + "loss": 3.9572, + "step": 40060 + }, + { + "epoch": 2.722176926212801, + "grad_norm": 0.3412218391895294, + "learning_rate": 6.598722652534311e-06, + "loss": 4.1622, + "step": 40065 + }, + { + "epoch": 2.7225166462834625, + "grad_norm": 0.308470219373703, + "learning_rate": 6.598298002445985e-06, + "loss": 4.1504, + "step": 40070 + }, + { + "epoch": 2.722856366354124, + "grad_norm": 0.4385407567024231, + "learning_rate": 6.597873352357658e-06, + "loss": 4.0901, + "step": 40075 + }, + { + "epoch": 2.723196086424786, + "grad_norm": 0.5000472068786621, + "learning_rate": 6.59744870226933e-06, + "loss": 4.0618, + "step": 40080 + }, + { + "epoch": 2.723535806495448, + "grad_norm": 0.29166439175605774, + "learning_rate": 6.5970240521810034e-06, + "loss": 4.253, + "step": 40085 + }, + { + "epoch": 2.7238755265661094, + "grad_norm": 0.6906698942184448, + "learning_rate": 6.596599402092676e-06, + "loss": 4.2675, + "step": 40090 + }, + { + "epoch": 2.7242152466367715, + "grad_norm": 0.28645265102386475, + "learning_rate": 6.596174752004348e-06, + "loss": 4.2241, + "step": 40095 + }, + { + "epoch": 2.724554966707433, + "grad_norm": 0.3507741391658783, + "learning_rate": 6.595750101916022e-06, + "loss": 4.1748, + "step": 40100 + }, + { + "epoch": 2.7248946867780948, + "grad_norm": 0.35526540875434875, + "learning_rate": 6.595325451827695e-06, + "loss": 4.0459, + "step": 40105 + }, + { + "epoch": 2.725234406848757, + "grad_norm": 0.4359420835971832, + "learning_rate": 6.594900801739367e-06, + "loss": 4.3408, + "step": 40110 + }, + { + "epoch": 2.7255741269194185, + "grad_norm": 0.3280409276485443, + "learning_rate": 6.59447615165104e-06, + "loss": 4.1404, + "step": 40115 + }, + { + "epoch": 2.72591384699008, + "grad_norm": 0.4268851578235626, + "learning_rate": 6.594051501562713e-06, + "loss": 3.9363, + "step": 40120 + }, + { + "epoch": 2.726253567060742, + "grad_norm": 0.31047096848487854, + "learning_rate": 6.593626851474386e-06, + "loss": 3.918, + "step": 40125 + }, + { + "epoch": 2.726593287131404, + "grad_norm": 0.29556259512901306, + "learning_rate": 6.593202201386059e-06, + "loss": 4.0607, + "step": 40130 + }, + { + "epoch": 2.7269330072020654, + "grad_norm": 0.376481294631958, + "learning_rate": 6.592777551297731e-06, + "loss": 4.1049, + "step": 40135 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.2927439510822296, + "learning_rate": 6.592352901209404e-06, + "loss": 3.8935, + "step": 40140 + }, + { + "epoch": 2.727612447343389, + "grad_norm": 0.45183074474334717, + "learning_rate": 6.591928251121077e-06, + "loss": 4.1984, + "step": 40145 + }, + { + "epoch": 2.7279521674140508, + "grad_norm": 0.28457123041152954, + "learning_rate": 6.591503601032749e-06, + "loss": 4.0153, + "step": 40150 + }, + { + "epoch": 2.728291887484713, + "grad_norm": 0.20604689419269562, + "learning_rate": 6.591078950944423e-06, + "loss": 4.0565, + "step": 40155 + }, + { + "epoch": 2.7286316075553745, + "grad_norm": 0.2271767258644104, + "learning_rate": 6.5906543008560954e-06, + "loss": 3.9838, + "step": 40160 + }, + { + "epoch": 2.728971327626036, + "grad_norm": 0.5177909731864929, + "learning_rate": 6.590229650767767e-06, + "loss": 4.3724, + "step": 40165 + }, + { + "epoch": 2.729311047696698, + "grad_norm": 0.378028929233551, + "learning_rate": 6.589805000679441e-06, + "loss": 4.1166, + "step": 40170 + }, + { + "epoch": 2.72965076776736, + "grad_norm": 0.2704724073410034, + "learning_rate": 6.589380350591114e-06, + "loss": 4.1584, + "step": 40175 + }, + { + "epoch": 2.7299904878380215, + "grad_norm": 0.4293934404850006, + "learning_rate": 6.588955700502786e-06, + "loss": 4.3554, + "step": 40180 + }, + { + "epoch": 2.730330207908683, + "grad_norm": 0.32194390892982483, + "learning_rate": 6.5885310504144594e-06, + "loss": 4.106, + "step": 40185 + }, + { + "epoch": 2.730669927979345, + "grad_norm": 0.3023357391357422, + "learning_rate": 6.588106400326132e-06, + "loss": 4.105, + "step": 40190 + }, + { + "epoch": 2.731009648050007, + "grad_norm": 0.26209431886672974, + "learning_rate": 6.587681750237804e-06, + "loss": 4.0673, + "step": 40195 + }, + { + "epoch": 2.7313493681206684, + "grad_norm": 0.3299480974674225, + "learning_rate": 6.587257100149478e-06, + "loss": 4.3784, + "step": 40200 + }, + { + "epoch": 2.7316890881913305, + "grad_norm": 0.3384230136871338, + "learning_rate": 6.586832450061151e-06, + "loss": 3.9479, + "step": 40205 + }, + { + "epoch": 2.732028808261992, + "grad_norm": NaN, + "learning_rate": 6.586492729990488e-06, + "loss": 4.1056, + "step": 40210 + }, + { + "epoch": 2.7323685283326538, + "grad_norm": 0.33695515990257263, + "learning_rate": 6.5860680799021615e-06, + "loss": 4.2479, + "step": 40215 + }, + { + "epoch": 2.732708248403316, + "grad_norm": 0.35180920362472534, + "learning_rate": 6.5856434298138335e-06, + "loss": 4.0407, + "step": 40220 + }, + { + "epoch": 2.7330479684739775, + "grad_norm": 0.31118693947792053, + "learning_rate": 6.585218779725506e-06, + "loss": 4.2155, + "step": 40225 + }, + { + "epoch": 2.733387688544639, + "grad_norm": 0.4839710295200348, + "learning_rate": 6.58479412963718e-06, + "loss": 4.3697, + "step": 40230 + }, + { + "epoch": 2.7337274086153007, + "grad_norm": 0.3233281970024109, + "learning_rate": 6.584369479548852e-06, + "loss": 4.0409, + "step": 40235 + }, + { + "epoch": 2.734067128685963, + "grad_norm": 0.31194278597831726, + "learning_rate": 6.583944829460525e-06, + "loss": 4.1948, + "step": 40240 + }, + { + "epoch": 2.7344068487566244, + "grad_norm": 0.35404860973358154, + "learning_rate": 6.583520179372198e-06, + "loss": 3.9856, + "step": 40245 + }, + { + "epoch": 2.734746568827286, + "grad_norm": 0.37028440833091736, + "learning_rate": 6.58309552928387e-06, + "loss": 4.2539, + "step": 40250 + }, + { + "epoch": 2.735086288897948, + "grad_norm": 0.35931214690208435, + "learning_rate": 6.582670879195543e-06, + "loss": 4.2792, + "step": 40255 + }, + { + "epoch": 2.7354260089686098, + "grad_norm": 0.24276648461818695, + "learning_rate": 6.582246229107217e-06, + "loss": 4.1906, + "step": 40260 + }, + { + "epoch": 2.7357657290392714, + "grad_norm": 0.4568244218826294, + "learning_rate": 6.581821579018889e-06, + "loss": 3.9824, + "step": 40265 + }, + { + "epoch": 2.7361054491099335, + "grad_norm": 0.2779233753681183, + "learning_rate": 6.5813969289305615e-06, + "loss": 4.1035, + "step": 40270 + }, + { + "epoch": 2.736445169180595, + "grad_norm": 0.42505770921707153, + "learning_rate": 6.580972278842235e-06, + "loss": 4.2929, + "step": 40275 + }, + { + "epoch": 2.7367848892512567, + "grad_norm": 0.3877238631248474, + "learning_rate": 6.580547628753907e-06, + "loss": 4.069, + "step": 40280 + }, + { + "epoch": 2.737124609321919, + "grad_norm": 0.37477636337280273, + "learning_rate": 6.58012297866558e-06, + "loss": 4.0765, + "step": 40285 + }, + { + "epoch": 2.7374643293925804, + "grad_norm": 0.2806493937969208, + "learning_rate": 6.579698328577253e-06, + "loss": 3.9662, + "step": 40290 + }, + { + "epoch": 2.737804049463242, + "grad_norm": 0.343660831451416, + "learning_rate": 6.5792736784889255e-06, + "loss": 3.9605, + "step": 40295 + }, + { + "epoch": 2.738143769533904, + "grad_norm": 0.3905579149723053, + "learning_rate": 6.578849028400598e-06, + "loss": 3.9775, + "step": 40300 + }, + { + "epoch": 2.738483489604566, + "grad_norm": 0.4009062349796295, + "learning_rate": 6.578424378312271e-06, + "loss": 4.165, + "step": 40305 + }, + { + "epoch": 2.7388232096752274, + "grad_norm": 0.398764044046402, + "learning_rate": 6.577999728223944e-06, + "loss": 4.1889, + "step": 40310 + }, + { + "epoch": 2.7391629297458895, + "grad_norm": 0.3101148307323456, + "learning_rate": 6.577575078135616e-06, + "loss": 4.223, + "step": 40315 + }, + { + "epoch": 2.739502649816551, + "grad_norm": 0.30949461460113525, + "learning_rate": 6.5771504280472895e-06, + "loss": 3.7496, + "step": 40320 + }, + { + "epoch": 2.7398423698872127, + "grad_norm": 0.3401767313480377, + "learning_rate": 6.576725777958962e-06, + "loss": 4.3156, + "step": 40325 + }, + { + "epoch": 2.740182089957875, + "grad_norm": 0.7485408186912537, + "learning_rate": 6.576301127870636e-06, + "loss": 4.1054, + "step": 40330 + }, + { + "epoch": 2.7405218100285365, + "grad_norm": 0.30509519577026367, + "learning_rate": 6.575876477782308e-06, + "loss": 4.0462, + "step": 40335 + }, + { + "epoch": 2.740861530099198, + "grad_norm": 0.31714117527008057, + "learning_rate": 6.575451827693981e-06, + "loss": 4.0684, + "step": 40340 + }, + { + "epoch": 2.74120125016986, + "grad_norm": 0.35790690779685974, + "learning_rate": 6.575027177605654e-06, + "loss": 4.2242, + "step": 40345 + }, + { + "epoch": 2.741540970240522, + "grad_norm": 0.39336830377578735, + "learning_rate": 6.574602527517326e-06, + "loss": 3.8385, + "step": 40350 + }, + { + "epoch": 2.7418806903111834, + "grad_norm": 0.32411783933639526, + "learning_rate": 6.574177877428999e-06, + "loss": 3.9569, + "step": 40355 + }, + { + "epoch": 2.7422204103818455, + "grad_norm": 0.4037640392780304, + "learning_rate": 6.573753227340673e-06, + "loss": 4.1632, + "step": 40360 + }, + { + "epoch": 2.742560130452507, + "grad_norm": 0.41352614760398865, + "learning_rate": 6.573328577252345e-06, + "loss": 4.3087, + "step": 40365 + }, + { + "epoch": 2.7428998505231688, + "grad_norm": 0.3419109284877777, + "learning_rate": 6.5729039271640175e-06, + "loss": 4.1905, + "step": 40370 + }, + { + "epoch": 2.743239570593831, + "grad_norm": 0.48968008160591125, + "learning_rate": 6.57247927707569e-06, + "loss": 4.2196, + "step": 40375 + }, + { + "epoch": 2.7435792906644925, + "grad_norm": 0.348448246717453, + "learning_rate": 6.572054626987363e-06, + "loss": 4.2021, + "step": 40380 + }, + { + "epoch": 2.743919010735154, + "grad_norm": 0.34363222122192383, + "learning_rate": 6.571629976899036e-06, + "loss": 4.2483, + "step": 40385 + }, + { + "epoch": 2.744258730805816, + "grad_norm": 0.31402841210365295, + "learning_rate": 6.571205326810709e-06, + "loss": 4.212, + "step": 40390 + }, + { + "epoch": 2.744598450876478, + "grad_norm": 0.3818187415599823, + "learning_rate": 6.5707806767223815e-06, + "loss": 4.0114, + "step": 40395 + }, + { + "epoch": 2.7449381709471394, + "grad_norm": 0.35296106338500977, + "learning_rate": 6.5703560266340535e-06, + "loss": 4.0566, + "step": 40400 + }, + { + "epoch": 2.7452778910178015, + "grad_norm": 0.34302622079849243, + "learning_rate": 6.569931376545727e-06, + "loss": 4.3627, + "step": 40405 + }, + { + "epoch": 2.745617611088463, + "grad_norm": 0.26439765095710754, + "learning_rate": 6.5695067264574e-06, + "loss": 4.179, + "step": 40410 + }, + { + "epoch": 2.7459573311591248, + "grad_norm": 0.3086088001728058, + "learning_rate": 6.569082076369072e-06, + "loss": 4.1283, + "step": 40415 + }, + { + "epoch": 2.746297051229787, + "grad_norm": 0.2918623089790344, + "learning_rate": 6.5686574262807455e-06, + "loss": 4.0901, + "step": 40420 + }, + { + "epoch": 2.7466367713004485, + "grad_norm": 0.26636096835136414, + "learning_rate": 6.568232776192418e-06, + "loss": 4.2486, + "step": 40425 + }, + { + "epoch": 2.74697649137111, + "grad_norm": 0.2812598645687103, + "learning_rate": 6.56780812610409e-06, + "loss": 3.8846, + "step": 40430 + }, + { + "epoch": 2.747316211441772, + "grad_norm": 0.32431191205978394, + "learning_rate": 6.567383476015764e-06, + "loss": 3.9644, + "step": 40435 + }, + { + "epoch": 2.747655931512434, + "grad_norm": 0.4229033291339874, + "learning_rate": 6.566958825927437e-06, + "loss": 4.1817, + "step": 40440 + }, + { + "epoch": 2.7479956515830954, + "grad_norm": 0.28657451272010803, + "learning_rate": 6.566534175839109e-06, + "loss": 4.1452, + "step": 40445 + }, + { + "epoch": 2.7483353716537575, + "grad_norm": 0.37803933024406433, + "learning_rate": 6.566109525750782e-06, + "loss": 4.2785, + "step": 40450 + }, + { + "epoch": 2.748675091724419, + "grad_norm": 0.2910180389881134, + "learning_rate": 6.565684875662455e-06, + "loss": 4.0472, + "step": 40455 + }, + { + "epoch": 2.749014811795081, + "grad_norm": 0.37396034598350525, + "learning_rate": 6.565260225574127e-06, + "loss": 4.2043, + "step": 40460 + }, + { + "epoch": 2.749354531865743, + "grad_norm": 0.3752279281616211, + "learning_rate": 6.564835575485801e-06, + "loss": 4.2058, + "step": 40465 + }, + { + "epoch": 2.7496942519364045, + "grad_norm": 0.40476617217063904, + "learning_rate": 6.564410925397473e-06, + "loss": 3.8934, + "step": 40470 + }, + { + "epoch": 2.750033972007066, + "grad_norm": 0.27723994851112366, + "learning_rate": 6.5639862753091455e-06, + "loss": 4.1648, + "step": 40475 + }, + { + "epoch": 2.750373692077728, + "grad_norm": 0.3147490620613098, + "learning_rate": 6.563561625220819e-06, + "loss": 4.1509, + "step": 40480 + }, + { + "epoch": 2.75071341214839, + "grad_norm": 0.3334410488605499, + "learning_rate": 6.563136975132491e-06, + "loss": 4.0866, + "step": 40485 + }, + { + "epoch": 2.7510531322190515, + "grad_norm": 0.2923220694065094, + "learning_rate": 6.562712325044164e-06, + "loss": 3.966, + "step": 40490 + }, + { + "epoch": 2.7513928522897135, + "grad_norm": 0.25867560505867004, + "learning_rate": 6.5622876749558375e-06, + "loss": 4.2224, + "step": 40495 + }, + { + "epoch": 2.751732572360375, + "grad_norm": 0.3239818215370178, + "learning_rate": 6.5618630248675095e-06, + "loss": 4.189, + "step": 40500 + }, + { + "epoch": 2.752072292431037, + "grad_norm": 0.40131115913391113, + "learning_rate": 6.561438374779182e-06, + "loss": 3.9691, + "step": 40505 + }, + { + "epoch": 2.752412012501699, + "grad_norm": 0.283601313829422, + "learning_rate": 6.561013724690856e-06, + "loss": 4.1728, + "step": 40510 + }, + { + "epoch": 2.7527517325723605, + "grad_norm": 0.3626449406147003, + "learning_rate": 6.560589074602528e-06, + "loss": 4.0965, + "step": 40515 + }, + { + "epoch": 2.753091452643022, + "grad_norm": 0.35478103160858154, + "learning_rate": 6.560164424514201e-06, + "loss": 4.1175, + "step": 40520 + }, + { + "epoch": 2.7534311727136838, + "grad_norm": 0.5701117515563965, + "learning_rate": 6.559739774425874e-06, + "loss": 4.097, + "step": 40525 + }, + { + "epoch": 2.753770892784346, + "grad_norm": 0.3020617365837097, + "learning_rate": 6.559315124337546e-06, + "loss": 4.1595, + "step": 40530 + }, + { + "epoch": 2.7541106128550075, + "grad_norm": 0.21835920214653015, + "learning_rate": 6.558890474249219e-06, + "loss": 4.1469, + "step": 40535 + }, + { + "epoch": 2.754450332925669, + "grad_norm": 0.6604756116867065, + "learning_rate": 6.558465824160892e-06, + "loss": 4.1207, + "step": 40540 + }, + { + "epoch": 2.754790052996331, + "grad_norm": 0.3091903328895569, + "learning_rate": 6.558041174072565e-06, + "loss": 4.1493, + "step": 40545 + }, + { + "epoch": 2.755129773066993, + "grad_norm": 0.3306977450847626, + "learning_rate": 6.5576165239842375e-06, + "loss": 4.3031, + "step": 40550 + }, + { + "epoch": 2.7554694931376544, + "grad_norm": 0.4263104498386383, + "learning_rate": 6.55719187389591e-06, + "loss": 4.0748, + "step": 40555 + }, + { + "epoch": 2.7558092132083165, + "grad_norm": 0.2660503685474396, + "learning_rate": 6.556767223807583e-06, + "loss": 3.9851, + "step": 40560 + }, + { + "epoch": 2.756148933278978, + "grad_norm": 0.3380272090435028, + "learning_rate": 6.556342573719255e-06, + "loss": 3.8868, + "step": 40565 + }, + { + "epoch": 2.7564886533496398, + "grad_norm": 0.4604250192642212, + "learning_rate": 6.555917923630929e-06, + "loss": 4.0186, + "step": 40570 + }, + { + "epoch": 2.7568283734203014, + "grad_norm": 0.3188706040382385, + "learning_rate": 6.5554932735426015e-06, + "loss": 4.1263, + "step": 40575 + }, + { + "epoch": 2.7571680934909635, + "grad_norm": 0.4087993800640106, + "learning_rate": 6.5550686234542734e-06, + "loss": 4.3446, + "step": 40580 + }, + { + "epoch": 2.757507813561625, + "grad_norm": 0.42784175276756287, + "learning_rate": 6.554643973365947e-06, + "loss": 4.0064, + "step": 40585 + }, + { + "epoch": 2.7578475336322867, + "grad_norm": 0.30888593196868896, + "learning_rate": 6.55421932327762e-06, + "loss": 4.2324, + "step": 40590 + }, + { + "epoch": 2.758187253702949, + "grad_norm": 0.40869829058647156, + "learning_rate": 6.553794673189292e-06, + "loss": 4.1583, + "step": 40595 + }, + { + "epoch": 2.7585269737736104, + "grad_norm": 0.29036423563957214, + "learning_rate": 6.5533700231009655e-06, + "loss": 4.1257, + "step": 40600 + }, + { + "epoch": 2.758866693844272, + "grad_norm": 0.372175395488739, + "learning_rate": 6.552945373012638e-06, + "loss": 4.1764, + "step": 40605 + }, + { + "epoch": 2.759206413914934, + "grad_norm": 0.36870667338371277, + "learning_rate": 6.55252072292431e-06, + "loss": 4.3268, + "step": 40610 + }, + { + "epoch": 2.759546133985596, + "grad_norm": 0.35814252495765686, + "learning_rate": 6.552096072835984e-06, + "loss": 3.7197, + "step": 40615 + }, + { + "epoch": 2.7598858540562574, + "grad_norm": 0.40360137820243835, + "learning_rate": 6.551671422747657e-06, + "loss": 4.2411, + "step": 40620 + }, + { + "epoch": 2.7602255741269195, + "grad_norm": 0.27730628848075867, + "learning_rate": 6.551246772659329e-06, + "loss": 4.1099, + "step": 40625 + }, + { + "epoch": 2.760565294197581, + "grad_norm": 0.32394474744796753, + "learning_rate": 6.550822122571002e-06, + "loss": 3.9982, + "step": 40630 + }, + { + "epoch": 2.7609050142682428, + "grad_norm": 0.32294386625289917, + "learning_rate": 6.550397472482674e-06, + "loss": 4.013, + "step": 40635 + }, + { + "epoch": 2.761244734338905, + "grad_norm": 0.24551056325435638, + "learning_rate": 6.549972822394347e-06, + "loss": 3.6635, + "step": 40640 + }, + { + "epoch": 2.7615844544095665, + "grad_norm": 0.4011462330818176, + "learning_rate": 6.549548172306021e-06, + "loss": 4.2927, + "step": 40645 + }, + { + "epoch": 2.761924174480228, + "grad_norm": 0.4966874420642853, + "learning_rate": 6.549123522217693e-06, + "loss": 4.0842, + "step": 40650 + }, + { + "epoch": 2.76226389455089, + "grad_norm": 0.3603767156600952, + "learning_rate": 6.5486988721293655e-06, + "loss": 4.0014, + "step": 40655 + }, + { + "epoch": 2.762603614621552, + "grad_norm": 0.32976391911506653, + "learning_rate": 6.548274222041039e-06, + "loss": 4.2055, + "step": 40660 + }, + { + "epoch": 2.7629433346922134, + "grad_norm": 0.2782934606075287, + "learning_rate": 6.547849571952711e-06, + "loss": 4.0628, + "step": 40665 + }, + { + "epoch": 2.7632830547628755, + "grad_norm": 0.39980459213256836, + "learning_rate": 6.547424921864385e-06, + "loss": 3.9997, + "step": 40670 + }, + { + "epoch": 2.763622774833537, + "grad_norm": 0.3328931927680969, + "learning_rate": 6.5470002717760575e-06, + "loss": 4.0339, + "step": 40675 + }, + { + "epoch": 2.7639624949041988, + "grad_norm": 0.2725696563720703, + "learning_rate": 6.5465756216877295e-06, + "loss": 3.9087, + "step": 40680 + }, + { + "epoch": 2.764302214974861, + "grad_norm": 0.605735719203949, + "learning_rate": 6.546150971599403e-06, + "loss": 4.3512, + "step": 40685 + }, + { + "epoch": 2.7646419350455225, + "grad_norm": 0.3247482478618622, + "learning_rate": 6.545726321511076e-06, + "loss": 4.2099, + "step": 40690 + }, + { + "epoch": 2.764981655116184, + "grad_norm": 0.3562101721763611, + "learning_rate": 6.545301671422748e-06, + "loss": 4.0889, + "step": 40695 + }, + { + "epoch": 2.765321375186846, + "grad_norm": 0.3902135491371155, + "learning_rate": 6.5448770213344215e-06, + "loss": 4.1631, + "step": 40700 + }, + { + "epoch": 2.765661095257508, + "grad_norm": 0.28717878460884094, + "learning_rate": 6.544452371246094e-06, + "loss": 4.1224, + "step": 40705 + }, + { + "epoch": 2.7660008153281694, + "grad_norm": 0.4536729156970978, + "learning_rate": 6.544027721157766e-06, + "loss": 4.0718, + "step": 40710 + }, + { + "epoch": 2.7663405353988315, + "grad_norm": 0.3534712791442871, + "learning_rate": 6.54360307106944e-06, + "loss": 4.0014, + "step": 40715 + }, + { + "epoch": 2.766680255469493, + "grad_norm": 0.4792826175689697, + "learning_rate": 6.543178420981112e-06, + "loss": 4.0449, + "step": 40720 + }, + { + "epoch": 2.7670199755401548, + "grad_norm": 0.3105972111225128, + "learning_rate": 6.542753770892785e-06, + "loss": 4.0964, + "step": 40725 + }, + { + "epoch": 2.767359695610817, + "grad_norm": 0.3485959470272064, + "learning_rate": 6.542329120804458e-06, + "loss": 4.1538, + "step": 40730 + }, + { + "epoch": 2.7676994156814785, + "grad_norm": 0.23795086145401, + "learning_rate": 6.54190447071613e-06, + "loss": 4.1211, + "step": 40735 + }, + { + "epoch": 2.76803913575214, + "grad_norm": 0.2615280747413635, + "learning_rate": 6.541479820627803e-06, + "loss": 4.0381, + "step": 40740 + }, + { + "epoch": 2.768378855822802, + "grad_norm": 0.2733452022075653, + "learning_rate": 6.541055170539477e-06, + "loss": 4.0881, + "step": 40745 + }, + { + "epoch": 2.768718575893464, + "grad_norm": 0.3240598142147064, + "learning_rate": 6.540630520451149e-06, + "loss": 4.3132, + "step": 40750 + }, + { + "epoch": 2.7690582959641254, + "grad_norm": 0.4202045202255249, + "learning_rate": 6.5402058703628215e-06, + "loss": 4.0912, + "step": 40755 + }, + { + "epoch": 2.7693980160347875, + "grad_norm": 0.29146525263786316, + "learning_rate": 6.539781220274495e-06, + "loss": 4.1058, + "step": 40760 + }, + { + "epoch": 2.769737736105449, + "grad_norm": 0.41031092405319214, + "learning_rate": 6.539356570186167e-06, + "loss": 4.1301, + "step": 40765 + }, + { + "epoch": 2.770077456176111, + "grad_norm": 0.27595001459121704, + "learning_rate": 6.53893192009784e-06, + "loss": 4.1859, + "step": 40770 + }, + { + "epoch": 2.770417176246773, + "grad_norm": 0.33034035563468933, + "learning_rate": 6.5385072700095135e-06, + "loss": 4.4638, + "step": 40775 + }, + { + "epoch": 2.7707568963174345, + "grad_norm": 0.3120898902416229, + "learning_rate": 6.5380826199211855e-06, + "loss": 4.184, + "step": 40780 + }, + { + "epoch": 2.771096616388096, + "grad_norm": 0.35349681973457336, + "learning_rate": 6.537657969832858e-06, + "loss": 3.9436, + "step": 40785 + }, + { + "epoch": 2.771436336458758, + "grad_norm": 0.37676161527633667, + "learning_rate": 6.537233319744531e-06, + "loss": 4.0736, + "step": 40790 + }, + { + "epoch": 2.77177605652942, + "grad_norm": 0.6028788685798645, + "learning_rate": 6.536808669656204e-06, + "loss": 4.159, + "step": 40795 + }, + { + "epoch": 2.7721157766000815, + "grad_norm": 0.30615437030792236, + "learning_rate": 6.536384019567877e-06, + "loss": 4.1522, + "step": 40800 + }, + { + "epoch": 2.7724554966707435, + "grad_norm": 0.3882412612438202, + "learning_rate": 6.5359593694795495e-06, + "loss": 4.0514, + "step": 40805 + }, + { + "epoch": 2.772795216741405, + "grad_norm": 0.3876357078552246, + "learning_rate": 6.535534719391222e-06, + "loss": 4.1025, + "step": 40810 + }, + { + "epoch": 2.773134936812067, + "grad_norm": 0.525871217250824, + "learning_rate": 6.535110069302894e-06, + "loss": 4.2327, + "step": 40815 + }, + { + "epoch": 2.773474656882729, + "grad_norm": 0.2507377862930298, + "learning_rate": 6.534685419214568e-06, + "loss": 4.1911, + "step": 40820 + }, + { + "epoch": 2.7738143769533905, + "grad_norm": 0.3485431373119354, + "learning_rate": 6.534260769126241e-06, + "loss": 4.1544, + "step": 40825 + }, + { + "epoch": 2.774154097024052, + "grad_norm": 0.5149925351142883, + "learning_rate": 6.533836119037913e-06, + "loss": 4.1478, + "step": 40830 + }, + { + "epoch": 2.774493817094714, + "grad_norm": 0.2841576039791107, + "learning_rate": 6.533411468949586e-06, + "loss": 4.1119, + "step": 40835 + }, + { + "epoch": 2.774833537165376, + "grad_norm": 0.2453017383813858, + "learning_rate": 6.532986818861259e-06, + "loss": 4.1846, + "step": 40840 + }, + { + "epoch": 2.7751732572360375, + "grad_norm": 0.29038771986961365, + "learning_rate": 6.532562168772931e-06, + "loss": 4.2473, + "step": 40845 + }, + { + "epoch": 2.7755129773066995, + "grad_norm": 0.3550018072128296, + "learning_rate": 6.532137518684605e-06, + "loss": 4.0125, + "step": 40850 + }, + { + "epoch": 2.775852697377361, + "grad_norm": 0.2900540828704834, + "learning_rate": 6.5317128685962775e-06, + "loss": 3.9602, + "step": 40855 + }, + { + "epoch": 2.776192417448023, + "grad_norm": 0.2955409586429596, + "learning_rate": 6.5312882185079494e-06, + "loss": 4.1427, + "step": 40860 + }, + { + "epoch": 2.7765321375186844, + "grad_norm": 0.30911171436309814, + "learning_rate": 6.530863568419623e-06, + "loss": 4.2884, + "step": 40865 + }, + { + "epoch": 2.7768718575893465, + "grad_norm": 0.3757115602493286, + "learning_rate": 6.530438918331296e-06, + "loss": 4.0592, + "step": 40870 + }, + { + "epoch": 2.777211577660008, + "grad_norm": 0.34608155488967896, + "learning_rate": 6.530014268242968e-06, + "loss": 4.1406, + "step": 40875 + }, + { + "epoch": 2.7775512977306698, + "grad_norm": 0.44818419218063354, + "learning_rate": 6.5295896181546415e-06, + "loss": 4.0964, + "step": 40880 + }, + { + "epoch": 2.777891017801332, + "grad_norm": 0.3895784020423889, + "learning_rate": 6.5291649680663134e-06, + "loss": 4.1265, + "step": 40885 + }, + { + "epoch": 2.7782307378719935, + "grad_norm": 0.4222285747528076, + "learning_rate": 6.528740317977986e-06, + "loss": 4.3288, + "step": 40890 + }, + { + "epoch": 2.778570457942655, + "grad_norm": 0.288204550743103, + "learning_rate": 6.52831566788966e-06, + "loss": 4.2877, + "step": 40895 + }, + { + "epoch": 2.778910178013317, + "grad_norm": 0.48147881031036377, + "learning_rate": 6.527891017801332e-06, + "loss": 3.9342, + "step": 40900 + }, + { + "epoch": 2.779249898083979, + "grad_norm": 0.3172706663608551, + "learning_rate": 6.527466367713005e-06, + "loss": 4.1871, + "step": 40905 + }, + { + "epoch": 2.7795896181546405, + "grad_norm": 0.3536725342273712, + "learning_rate": 6.527041717624678e-06, + "loss": 4.0848, + "step": 40910 + }, + { + "epoch": 2.779929338225302, + "grad_norm": 0.41935378313064575, + "learning_rate": 6.52661706753635e-06, + "loss": 4.3156, + "step": 40915 + }, + { + "epoch": 2.780269058295964, + "grad_norm": 0.4103308618068695, + "learning_rate": 6.526192417448023e-06, + "loss": 4.0271, + "step": 40920 + }, + { + "epoch": 2.780608778366626, + "grad_norm": 0.28172051906585693, + "learning_rate": 6.525767767359697e-06, + "loss": 4.3022, + "step": 40925 + }, + { + "epoch": 2.7809484984372874, + "grad_norm": 0.31169593334198, + "learning_rate": 6.525343117271369e-06, + "loss": 4.0319, + "step": 40930 + }, + { + "epoch": 2.7812882185079495, + "grad_norm": 0.2680776119232178, + "learning_rate": 6.5249184671830414e-06, + "loss": 4.0593, + "step": 40935 + }, + { + "epoch": 2.781627938578611, + "grad_norm": 0.27079126238822937, + "learning_rate": 6.524493817094715e-06, + "loss": 4.2075, + "step": 40940 + }, + { + "epoch": 2.7819676586492728, + "grad_norm": 0.3220539093017578, + "learning_rate": 6.524069167006387e-06, + "loss": 4.1266, + "step": 40945 + }, + { + "epoch": 2.782307378719935, + "grad_norm": 0.40749838948249817, + "learning_rate": 6.52364451691806e-06, + "loss": 4.1871, + "step": 40950 + }, + { + "epoch": 2.7826470987905965, + "grad_norm": 0.41903096437454224, + "learning_rate": 6.523219866829733e-06, + "loss": 4.2349, + "step": 40955 + }, + { + "epoch": 2.782986818861258, + "grad_norm": 0.4190886914730072, + "learning_rate": 6.5227952167414055e-06, + "loss": 4.1601, + "step": 40960 + }, + { + "epoch": 2.78332653893192, + "grad_norm": 0.31770333647727966, + "learning_rate": 6.522370566653078e-06, + "loss": 4.204, + "step": 40965 + }, + { + "epoch": 2.783666259002582, + "grad_norm": 0.25473058223724365, + "learning_rate": 6.521945916564751e-06, + "loss": 4.0446, + "step": 40970 + }, + { + "epoch": 2.7840059790732434, + "grad_norm": 0.32070574164390564, + "learning_rate": 6.521521266476424e-06, + "loss": 4.1104, + "step": 40975 + }, + { + "epoch": 2.7843456991439055, + "grad_norm": 0.4107213318347931, + "learning_rate": 6.521096616388096e-06, + "loss": 4.2067, + "step": 40980 + }, + { + "epoch": 2.784685419214567, + "grad_norm": 0.2469772845506668, + "learning_rate": 6.5206719662997695e-06, + "loss": 3.9806, + "step": 40985 + }, + { + "epoch": 2.7850251392852288, + "grad_norm": 0.37261050939559937, + "learning_rate": 6.520247316211442e-06, + "loss": 4.1533, + "step": 40990 + }, + { + "epoch": 2.785364859355891, + "grad_norm": 0.2577108144760132, + "learning_rate": 6.519822666123114e-06, + "loss": 4.0709, + "step": 40995 + }, + { + "epoch": 2.7857045794265525, + "grad_norm": 0.3051903545856476, + "learning_rate": 6.519398016034788e-06, + "loss": 3.8869, + "step": 41000 + }, + { + "epoch": 2.786044299497214, + "grad_norm": 0.26225975155830383, + "learning_rate": 6.518973365946461e-06, + "loss": 4.2956, + "step": 41005 + }, + { + "epoch": 2.786384019567876, + "grad_norm": 0.3143223226070404, + "learning_rate": 6.518548715858134e-06, + "loss": 4.1508, + "step": 41010 + }, + { + "epoch": 2.786723739638538, + "grad_norm": 0.23781424760818481, + "learning_rate": 6.518124065769806e-06, + "loss": 4.1739, + "step": 41015 + }, + { + "epoch": 2.7870634597091994, + "grad_norm": 0.3551914393901825, + "learning_rate": 6.517699415681479e-06, + "loss": 4.0804, + "step": 41020 + }, + { + "epoch": 2.7874031797798615, + "grad_norm": 0.3556143045425415, + "learning_rate": 6.517274765593153e-06, + "loss": 4.1854, + "step": 41025 + }, + { + "epoch": 2.787742899850523, + "grad_norm": 0.9574588537216187, + "learning_rate": 6.516850115504825e-06, + "loss": 4.043, + "step": 41030 + }, + { + "epoch": 2.788082619921185, + "grad_norm": 0.26953643560409546, + "learning_rate": 6.5164254654164975e-06, + "loss": 4.1914, + "step": 41035 + }, + { + "epoch": 2.788422339991847, + "grad_norm": 0.31123778223991394, + "learning_rate": 6.51600081532817e-06, + "loss": 4.0423, + "step": 41040 + }, + { + "epoch": 2.7887620600625085, + "grad_norm": 0.2565614879131317, + "learning_rate": 6.515576165239843e-06, + "loss": 3.8418, + "step": 41045 + }, + { + "epoch": 2.78910178013317, + "grad_norm": 0.8591545224189758, + "learning_rate": 6.515151515151516e-06, + "loss": 3.8625, + "step": 41050 + }, + { + "epoch": 2.789441500203832, + "grad_norm": 0.24228675663471222, + "learning_rate": 6.514726865063189e-06, + "loss": 4.1365, + "step": 41055 + }, + { + "epoch": 2.789781220274494, + "grad_norm": 0.25766777992248535, + "learning_rate": 6.5143022149748615e-06, + "loss": 4.1062, + "step": 41060 + }, + { + "epoch": 2.7901209403451555, + "grad_norm": 0.29100626707077026, + "learning_rate": 6.513877564886533e-06, + "loss": 4.184, + "step": 41065 + }, + { + "epoch": 2.7904606604158175, + "grad_norm": 0.32050395011901855, + "learning_rate": 6.513452914798207e-06, + "loss": 3.9531, + "step": 41070 + }, + { + "epoch": 2.790800380486479, + "grad_norm": 0.4427286982536316, + "learning_rate": 6.51302826470988e-06, + "loss": 4.0443, + "step": 41075 + }, + { + "epoch": 2.791140100557141, + "grad_norm": 0.42837056517601013, + "learning_rate": 6.512603614621552e-06, + "loss": 4.1043, + "step": 41080 + }, + { + "epoch": 2.791479820627803, + "grad_norm": 0.3664741814136505, + "learning_rate": 6.5121789645332255e-06, + "loss": 4.0821, + "step": 41085 + }, + { + "epoch": 2.7918195406984645, + "grad_norm": 0.34884393215179443, + "learning_rate": 6.511754314444898e-06, + "loss": 4.1258, + "step": 41090 + }, + { + "epoch": 2.792159260769126, + "grad_norm": 0.3043070137500763, + "learning_rate": 6.51132966435657e-06, + "loss": 4.1771, + "step": 41095 + }, + { + "epoch": 2.792498980839788, + "grad_norm": 0.3892734944820404, + "learning_rate": 6.510905014268244e-06, + "loss": 4.1644, + "step": 41100 + }, + { + "epoch": 2.79283870091045, + "grad_norm": 0.23416945338249207, + "learning_rate": 6.510480364179917e-06, + "loss": 4.126, + "step": 41105 + }, + { + "epoch": 2.7931784209811115, + "grad_norm": 0.2660585939884186, + "learning_rate": 6.510055714091589e-06, + "loss": 4.2188, + "step": 41110 + }, + { + "epoch": 2.7935181410517735, + "grad_norm": 0.2684452533721924, + "learning_rate": 6.509631064003262e-06, + "loss": 4.1646, + "step": 41115 + }, + { + "epoch": 2.793857861122435, + "grad_norm": 0.26179543137550354, + "learning_rate": 6.509206413914935e-06, + "loss": 4.1318, + "step": 41120 + }, + { + "epoch": 2.794197581193097, + "grad_norm": 0.26719391345977783, + "learning_rate": 6.508781763826607e-06, + "loss": 4.1052, + "step": 41125 + }, + { + "epoch": 2.794537301263759, + "grad_norm": 0.2986716628074646, + "learning_rate": 6.508357113738281e-06, + "loss": 3.8193, + "step": 41130 + }, + { + "epoch": 2.7948770213344205, + "grad_norm": 0.24594907462596893, + "learning_rate": 6.507932463649953e-06, + "loss": 4.0467, + "step": 41135 + }, + { + "epoch": 2.795216741405082, + "grad_norm": 0.551154375076294, + "learning_rate": 6.5075078135616254e-06, + "loss": 4.1008, + "step": 41140 + }, + { + "epoch": 2.795556461475744, + "grad_norm": 1.1099376678466797, + "learning_rate": 6.507083163473299e-06, + "loss": 3.9375, + "step": 41145 + }, + { + "epoch": 2.795896181546406, + "grad_norm": 0.5078096389770508, + "learning_rate": 6.506658513384971e-06, + "loss": 4.1748, + "step": 41150 + }, + { + "epoch": 2.7962359016170675, + "grad_norm": 0.3605741858482361, + "learning_rate": 6.506233863296644e-06, + "loss": 3.9958, + "step": 41155 + }, + { + "epoch": 2.7965756216877296, + "grad_norm": 0.2795321047306061, + "learning_rate": 6.5058092132083175e-06, + "loss": 3.9938, + "step": 41160 + }, + { + "epoch": 2.796915341758391, + "grad_norm": 0.41945600509643555, + "learning_rate": 6.5053845631199894e-06, + "loss": 4.15, + "step": 41165 + }, + { + "epoch": 2.797255061829053, + "grad_norm": 0.2636597454547882, + "learning_rate": 6.504959913031662e-06, + "loss": 4.0847, + "step": 41170 + }, + { + "epoch": 2.797594781899715, + "grad_norm": 0.28942301869392395, + "learning_rate": 6.504535262943336e-06, + "loss": 4.0066, + "step": 41175 + }, + { + "epoch": 2.7979345019703765, + "grad_norm": 0.2904748320579529, + "learning_rate": 6.504110612855008e-06, + "loss": 4.1558, + "step": 41180 + }, + { + "epoch": 2.798274222041038, + "grad_norm": 0.36455708742141724, + "learning_rate": 6.503685962766681e-06, + "loss": 3.9521, + "step": 41185 + }, + { + "epoch": 2.7986139421117002, + "grad_norm": 0.2958689332008362, + "learning_rate": 6.503261312678354e-06, + "loss": 4.1347, + "step": 41190 + }, + { + "epoch": 2.798953662182362, + "grad_norm": 0.20812848210334778, + "learning_rate": 6.502836662590026e-06, + "loss": 4.2476, + "step": 41195 + }, + { + "epoch": 2.7992933822530235, + "grad_norm": 0.3249000012874603, + "learning_rate": 6.502412012501699e-06, + "loss": 4.0126, + "step": 41200 + }, + { + "epoch": 2.799633102323685, + "grad_norm": 0.27895423769950867, + "learning_rate": 6.501987362413372e-06, + "loss": 4.0565, + "step": 41205 + }, + { + "epoch": 2.799972822394347, + "grad_norm": 0.2610771059989929, + "learning_rate": 6.501562712325045e-06, + "loss": 4.2705, + "step": 41210 + }, + { + "epoch": 2.800312542465009, + "grad_norm": 0.35712090134620667, + "learning_rate": 6.5011380622367174e-06, + "loss": 3.7318, + "step": 41215 + }, + { + "epoch": 2.8006522625356705, + "grad_norm": 0.3229202926158905, + "learning_rate": 6.50071341214839e-06, + "loss": 3.9883, + "step": 41220 + }, + { + "epoch": 2.8009919826063325, + "grad_norm": 0.29122644662857056, + "learning_rate": 6.500288762060063e-06, + "loss": 3.9008, + "step": 41225 + }, + { + "epoch": 2.801331702676994, + "grad_norm": 0.304330438375473, + "learning_rate": 6.499864111971735e-06, + "loss": 4.2311, + "step": 41230 + }, + { + "epoch": 2.801671422747656, + "grad_norm": 0.28193148970603943, + "learning_rate": 6.499439461883409e-06, + "loss": 3.9938, + "step": 41235 + }, + { + "epoch": 2.802011142818318, + "grad_norm": 0.34241577982902527, + "learning_rate": 6.4990148117950814e-06, + "loss": 4.3776, + "step": 41240 + }, + { + "epoch": 2.8023508628889795, + "grad_norm": 0.37679532170295715, + "learning_rate": 6.498590161706753e-06, + "loss": 4.2054, + "step": 41245 + }, + { + "epoch": 2.802690582959641, + "grad_norm": 0.2954711318016052, + "learning_rate": 6.498165511618427e-06, + "loss": 3.9258, + "step": 41250 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.36875686049461365, + "learning_rate": 6.4977408615301e-06, + "loss": 4.1818, + "step": 41255 + }, + { + "epoch": 2.803370023100965, + "grad_norm": 0.3375855088233948, + "learning_rate": 6.497316211441772e-06, + "loss": 4.4658, + "step": 41260 + }, + { + "epoch": 2.8037097431716265, + "grad_norm": 0.6453655362129211, + "learning_rate": 6.4968915613534455e-06, + "loss": 4.1511, + "step": 41265 + }, + { + "epoch": 2.804049463242288, + "grad_norm": 0.4580519199371338, + "learning_rate": 6.496466911265118e-06, + "loss": 4.1397, + "step": 41270 + }, + { + "epoch": 2.80438918331295, + "grad_norm": 0.3058185875415802, + "learning_rate": 6.49604226117679e-06, + "loss": 3.8505, + "step": 41275 + }, + { + "epoch": 2.804728903383612, + "grad_norm": 0.26483091711997986, + "learning_rate": 6.495617611088464e-06, + "loss": 3.9847, + "step": 41280 + }, + { + "epoch": 2.8050686234542734, + "grad_norm": 0.3417665362358093, + "learning_rate": 6.495192961000137e-06, + "loss": 3.7971, + "step": 41285 + }, + { + "epoch": 2.8054083435249355, + "grad_norm": 0.3601803779602051, + "learning_rate": 6.494768310911809e-06, + "loss": 4.2027, + "step": 41290 + }, + { + "epoch": 2.805748063595597, + "grad_norm": 0.5200850963592529, + "learning_rate": 6.494343660823482e-06, + "loss": 3.9471, + "step": 41295 + }, + { + "epoch": 2.8060877836662588, + "grad_norm": 0.30718758702278137, + "learning_rate": 6.493919010735154e-06, + "loss": 3.9438, + "step": 41300 + }, + { + "epoch": 2.806427503736921, + "grad_norm": 0.6830807328224182, + "learning_rate": 6.493494360646827e-06, + "loss": 4.0661, + "step": 41305 + }, + { + "epoch": 2.8067672238075825, + "grad_norm": 0.273107647895813, + "learning_rate": 6.493069710558501e-06, + "loss": 4.0217, + "step": 41310 + }, + { + "epoch": 2.807106943878244, + "grad_norm": 0.5004891157150269, + "learning_rate": 6.492645060470173e-06, + "loss": 4.0077, + "step": 41315 + }, + { + "epoch": 2.807446663948906, + "grad_norm": 0.6858990788459778, + "learning_rate": 6.492220410381845e-06, + "loss": 4.0934, + "step": 41320 + }, + { + "epoch": 2.807786384019568, + "grad_norm": 0.3009529113769531, + "learning_rate": 6.491795760293519e-06, + "loss": 4.1468, + "step": 41325 + }, + { + "epoch": 2.8081261040902294, + "grad_norm": 0.4834187626838684, + "learning_rate": 6.491371110205191e-06, + "loss": 4.1629, + "step": 41330 + }, + { + "epoch": 2.8084658241608915, + "grad_norm": 0.3518901765346527, + "learning_rate": 6.490946460116864e-06, + "loss": 4.0041, + "step": 41335 + }, + { + "epoch": 2.808805544231553, + "grad_norm": 0.47075918316841125, + "learning_rate": 6.4905218100285375e-06, + "loss": 3.8973, + "step": 41340 + }, + { + "epoch": 2.809145264302215, + "grad_norm": 0.3038744330406189, + "learning_rate": 6.490097159940209e-06, + "loss": 4.1226, + "step": 41345 + }, + { + "epoch": 2.809484984372877, + "grad_norm": 0.3337230980396271, + "learning_rate": 6.489672509851883e-06, + "loss": 4.0038, + "step": 41350 + }, + { + "epoch": 2.8098247044435385, + "grad_norm": 0.2716892659664154, + "learning_rate": 6.489247859763556e-06, + "loss": 4.1009, + "step": 41355 + }, + { + "epoch": 2.8101644245142, + "grad_norm": 0.2990915775299072, + "learning_rate": 6.488823209675228e-06, + "loss": 3.9929, + "step": 41360 + }, + { + "epoch": 2.810504144584862, + "grad_norm": 0.4546178877353668, + "learning_rate": 6.4883985595869015e-06, + "loss": 4.0275, + "step": 41365 + }, + { + "epoch": 2.810843864655524, + "grad_norm": 0.34583836793899536, + "learning_rate": 6.487973909498574e-06, + "loss": 4.2623, + "step": 41370 + }, + { + "epoch": 2.8111835847261855, + "grad_norm": 0.41667842864990234, + "learning_rate": 6.487549259410246e-06, + "loss": 4.2088, + "step": 41375 + }, + { + "epoch": 2.8115233047968475, + "grad_norm": 0.27664169669151306, + "learning_rate": 6.48712460932192e-06, + "loss": 4.1575, + "step": 41380 + }, + { + "epoch": 2.811863024867509, + "grad_norm": 0.424713134765625, + "learning_rate": 6.486699959233592e-06, + "loss": 4.1523, + "step": 41385 + }, + { + "epoch": 2.812202744938171, + "grad_norm": 0.265350341796875, + "learning_rate": 6.486275309145265e-06, + "loss": 4.0609, + "step": 41390 + }, + { + "epoch": 2.812542465008833, + "grad_norm": 0.48881956934928894, + "learning_rate": 6.485850659056938e-06, + "loss": 4.1412, + "step": 41395 + }, + { + "epoch": 2.8128821850794945, + "grad_norm": 0.23839648067951202, + "learning_rate": 6.48542600896861e-06, + "loss": 4.2068, + "step": 41400 + }, + { + "epoch": 2.813221905150156, + "grad_norm": 0.2722378671169281, + "learning_rate": 6.485001358880283e-06, + "loss": 4.2177, + "step": 41405 + }, + { + "epoch": 2.813561625220818, + "grad_norm": 0.32912740111351013, + "learning_rate": 6.484576708791957e-06, + "loss": 4.0181, + "step": 41410 + }, + { + "epoch": 2.81390134529148, + "grad_norm": 0.34942764043807983, + "learning_rate": 6.484152058703629e-06, + "loss": 4.0931, + "step": 41415 + }, + { + "epoch": 2.8142410653621415, + "grad_norm": 0.4283919930458069, + "learning_rate": 6.4837274086153014e-06, + "loss": 4.515, + "step": 41420 + }, + { + "epoch": 2.8145807854328035, + "grad_norm": 0.34248000383377075, + "learning_rate": 6.483302758526975e-06, + "loss": 4.0835, + "step": 41425 + }, + { + "epoch": 2.814920505503465, + "grad_norm": 0.3675915002822876, + "learning_rate": 6.482878108438647e-06, + "loss": 4.0966, + "step": 41430 + }, + { + "epoch": 2.815260225574127, + "grad_norm": 0.25485798716545105, + "learning_rate": 6.48245345835032e-06, + "loss": 3.894, + "step": 41435 + }, + { + "epoch": 2.815599945644789, + "grad_norm": 0.34521788358688354, + "learning_rate": 6.4820288082619935e-06, + "loss": 4.0661, + "step": 41440 + }, + { + "epoch": 2.8159396657154505, + "grad_norm": 0.3178851902484894, + "learning_rate": 6.4816041581736654e-06, + "loss": 4.0812, + "step": 41445 + }, + { + "epoch": 2.816279385786112, + "grad_norm": 0.28821924328804016, + "learning_rate": 6.481179508085338e-06, + "loss": 4.2706, + "step": 41450 + }, + { + "epoch": 2.816619105856774, + "grad_norm": 0.285999059677124, + "learning_rate": 6.480754857997011e-06, + "loss": 4.0809, + "step": 41455 + }, + { + "epoch": 2.816958825927436, + "grad_norm": 0.27764391899108887, + "learning_rate": 6.480330207908684e-06, + "loss": 4.1874, + "step": 41460 + }, + { + "epoch": 2.8172985459980975, + "grad_norm": 0.5513471364974976, + "learning_rate": 6.479905557820357e-06, + "loss": 4.052, + "step": 41465 + }, + { + "epoch": 2.8176382660687596, + "grad_norm": 0.315391480922699, + "learning_rate": 6.4794809077320294e-06, + "loss": 4.1363, + "step": 41470 + }, + { + "epoch": 2.817977986139421, + "grad_norm": 0.2702782154083252, + "learning_rate": 6.479056257643702e-06, + "loss": 4.1421, + "step": 41475 + }, + { + "epoch": 2.818317706210083, + "grad_norm": 0.24309012293815613, + "learning_rate": 6.478631607555374e-06, + "loss": 4.0233, + "step": 41480 + }, + { + "epoch": 2.818657426280745, + "grad_norm": 0.45458388328552246, + "learning_rate": 6.478206957467048e-06, + "loss": 4.371, + "step": 41485 + }, + { + "epoch": 2.8189971463514065, + "grad_norm": 0.35038936138153076, + "learning_rate": 6.477782307378721e-06, + "loss": 4.0966, + "step": 41490 + }, + { + "epoch": 2.819336866422068, + "grad_norm": 0.30450960993766785, + "learning_rate": 6.477357657290393e-06, + "loss": 4.1821, + "step": 41495 + }, + { + "epoch": 2.8196765864927302, + "grad_norm": 0.4624302387237549, + "learning_rate": 6.476933007202066e-06, + "loss": 3.9279, + "step": 41500 + }, + { + "epoch": 2.820016306563392, + "grad_norm": 0.27472448348999023, + "learning_rate": 6.476508357113739e-06, + "loss": 4.1571, + "step": 41505 + }, + { + "epoch": 2.8203560266340535, + "grad_norm": 0.3202635943889618, + "learning_rate": 6.476083707025411e-06, + "loss": 4.1155, + "step": 41510 + }, + { + "epoch": 2.8206957467047156, + "grad_norm": 0.4354565441608429, + "learning_rate": 6.475659056937085e-06, + "loss": 4.0221, + "step": 41515 + }, + { + "epoch": 2.821035466775377, + "grad_norm": 0.44975003600120544, + "learning_rate": 6.4752344068487574e-06, + "loss": 4.3103, + "step": 41520 + }, + { + "epoch": 2.821375186846039, + "grad_norm": 0.22301846742630005, + "learning_rate": 6.474809756760429e-06, + "loss": 4.2753, + "step": 41525 + }, + { + "epoch": 2.821714906916701, + "grad_norm": 0.25090667605400085, + "learning_rate": 6.474385106672103e-06, + "loss": 4.1258, + "step": 41530 + }, + { + "epoch": 2.8220546269873625, + "grad_norm": 0.20047533512115479, + "learning_rate": 6.473960456583776e-06, + "loss": 4.0816, + "step": 41535 + }, + { + "epoch": 2.822394347058024, + "grad_norm": 0.2773591876029968, + "learning_rate": 6.473535806495448e-06, + "loss": 4.1315, + "step": 41540 + }, + { + "epoch": 2.822734067128686, + "grad_norm": 0.2299867421388626, + "learning_rate": 6.4731111564071214e-06, + "loss": 3.9684, + "step": 41545 + }, + { + "epoch": 2.823073787199348, + "grad_norm": 0.31686297059059143, + "learning_rate": 6.472686506318793e-06, + "loss": 4.1192, + "step": 41550 + }, + { + "epoch": 2.8234135072700095, + "grad_norm": 0.32373812794685364, + "learning_rate": 6.472261856230466e-06, + "loss": 4.2999, + "step": 41555 + }, + { + "epoch": 2.823753227340671, + "grad_norm": 0.39868423342704773, + "learning_rate": 6.47183720614214e-06, + "loss": 4.1707, + "step": 41560 + }, + { + "epoch": 2.824092947411333, + "grad_norm": 0.3730676472187042, + "learning_rate": 6.471412556053812e-06, + "loss": 4.2665, + "step": 41565 + }, + { + "epoch": 2.824432667481995, + "grad_norm": 0.3323011100292206, + "learning_rate": 6.470987905965485e-06, + "loss": 4.1842, + "step": 41570 + }, + { + "epoch": 2.8247723875526565, + "grad_norm": 0.48396191000938416, + "learning_rate": 6.470563255877158e-06, + "loss": 4.2983, + "step": 41575 + }, + { + "epoch": 2.8251121076233185, + "grad_norm": 0.1971474587917328, + "learning_rate": 6.47013860578883e-06, + "loss": 3.9596, + "step": 41580 + }, + { + "epoch": 2.82545182769398, + "grad_norm": 0.5708956122398376, + "learning_rate": 6.469713955700503e-06, + "loss": 4.3643, + "step": 41585 + }, + { + "epoch": 2.825791547764642, + "grad_norm": 0.3037988841533661, + "learning_rate": 6.469289305612177e-06, + "loss": 4.298, + "step": 41590 + }, + { + "epoch": 2.8261312678353034, + "grad_norm": 0.4654460549354553, + "learning_rate": 6.468864655523849e-06, + "loss": 4.0086, + "step": 41595 + }, + { + "epoch": 2.8264709879059655, + "grad_norm": 0.3027285039424896, + "learning_rate": 6.468440005435521e-06, + "loss": 4.1662, + "step": 41600 + }, + { + "epoch": 2.826810707976627, + "grad_norm": 0.28670066595077515, + "learning_rate": 6.468015355347195e-06, + "loss": 4.1918, + "step": 41605 + }, + { + "epoch": 2.8271504280472888, + "grad_norm": 0.3219994008541107, + "learning_rate": 6.467590705258867e-06, + "loss": 4.0629, + "step": 41610 + }, + { + "epoch": 2.827490148117951, + "grad_norm": 0.3279687762260437, + "learning_rate": 6.46716605517054e-06, + "loss": 4.1866, + "step": 41615 + }, + { + "epoch": 2.8278298681886125, + "grad_norm": 0.2945742905139923, + "learning_rate": 6.4667414050822135e-06, + "loss": 4.0053, + "step": 41620 + }, + { + "epoch": 2.828169588259274, + "grad_norm": 0.34169718623161316, + "learning_rate": 6.466316754993885e-06, + "loss": 4.0755, + "step": 41625 + }, + { + "epoch": 2.828509308329936, + "grad_norm": 0.2381933480501175, + "learning_rate": 6.465892104905558e-06, + "loss": 4.2559, + "step": 41630 + }, + { + "epoch": 2.828849028400598, + "grad_norm": 0.21106138825416565, + "learning_rate": 6.465467454817231e-06, + "loss": 4.1567, + "step": 41635 + }, + { + "epoch": 2.8291887484712595, + "grad_norm": 0.21962076425552368, + "learning_rate": 6.465042804728904e-06, + "loss": 3.9751, + "step": 41640 + }, + { + "epoch": 2.8295284685419215, + "grad_norm": 0.33323177695274353, + "learning_rate": 6.464618154640576e-06, + "loss": 4.2671, + "step": 41645 + }, + { + "epoch": 2.829868188612583, + "grad_norm": 0.2800276577472687, + "learning_rate": 6.464193504552249e-06, + "loss": 4.6653, + "step": 41650 + }, + { + "epoch": 2.830207908683245, + "grad_norm": 0.6058720946311951, + "learning_rate": 6.463768854463922e-06, + "loss": 4.1491, + "step": 41655 + }, + { + "epoch": 2.830547628753907, + "grad_norm": 0.34607264399528503, + "learning_rate": 6.463344204375594e-06, + "loss": 3.9849, + "step": 41660 + }, + { + "epoch": 2.8308873488245685, + "grad_norm": 0.3953573703765869, + "learning_rate": 6.462919554287268e-06, + "loss": 3.854, + "step": 41665 + }, + { + "epoch": 2.83122706889523, + "grad_norm": 0.44648247957229614, + "learning_rate": 6.462494904198941e-06, + "loss": 4.1143, + "step": 41670 + }, + { + "epoch": 2.831566788965892, + "grad_norm": 0.4026210606098175, + "learning_rate": 6.4620702541106126e-06, + "loss": 3.9369, + "step": 41675 + }, + { + "epoch": 2.831906509036554, + "grad_norm": 0.32553112506866455, + "learning_rate": 6.461645604022286e-06, + "loss": 4.1026, + "step": 41680 + }, + { + "epoch": 2.8322462291072155, + "grad_norm": 0.40719082951545715, + "learning_rate": 6.461220953933959e-06, + "loss": 4.0959, + "step": 41685 + }, + { + "epoch": 2.8325859491778775, + "grad_norm": 0.2600876986980438, + "learning_rate": 6.460796303845633e-06, + "loss": 4.1274, + "step": 41690 + }, + { + "epoch": 2.832925669248539, + "grad_norm": 0.3184675872325897, + "learning_rate": 6.460371653757305e-06, + "loss": 4.2523, + "step": 41695 + }, + { + "epoch": 2.833265389319201, + "grad_norm": 0.34947389364242554, + "learning_rate": 6.459947003668977e-06, + "loss": 4.1694, + "step": 41700 + }, + { + "epoch": 2.833605109389863, + "grad_norm": 0.29491332173347473, + "learning_rate": 6.45952235358065e-06, + "loss": 4.311, + "step": 41705 + }, + { + "epoch": 2.8339448294605245, + "grad_norm": 0.29844823479652405, + "learning_rate": 6.459097703492323e-06, + "loss": 4.2069, + "step": 41710 + }, + { + "epoch": 2.834284549531186, + "grad_norm": 0.34419703483581543, + "learning_rate": 6.458673053403996e-06, + "loss": 4.0324, + "step": 41715 + }, + { + "epoch": 2.834624269601848, + "grad_norm": 0.2820223569869995, + "learning_rate": 6.458248403315669e-06, + "loss": 4.1264, + "step": 41720 + }, + { + "epoch": 2.83496398967251, + "grad_norm": 0.2536083459854126, + "learning_rate": 6.4578237532273414e-06, + "loss": 4.0127, + "step": 41725 + }, + { + "epoch": 2.8353037097431715, + "grad_norm": 0.32996928691864014, + "learning_rate": 6.457399103139013e-06, + "loss": 4.4302, + "step": 41730 + }, + { + "epoch": 2.8356434298138335, + "grad_norm": 0.4045233726501465, + "learning_rate": 6.456974453050687e-06, + "loss": 4.1602, + "step": 41735 + }, + { + "epoch": 2.835983149884495, + "grad_norm": 0.24872156977653503, + "learning_rate": 6.45654980296236e-06, + "loss": 4.0754, + "step": 41740 + }, + { + "epoch": 2.836322869955157, + "grad_norm": 0.3002725839614868, + "learning_rate": 6.456125152874032e-06, + "loss": 4.4075, + "step": 41745 + }, + { + "epoch": 2.836662590025819, + "grad_norm": 0.29633742570877075, + "learning_rate": 6.4557005027857054e-06, + "loss": 4.2061, + "step": 41750 + }, + { + "epoch": 2.8370023100964805, + "grad_norm": 0.27808430790901184, + "learning_rate": 6.455275852697378e-06, + "loss": 3.9713, + "step": 41755 + }, + { + "epoch": 2.837342030167142, + "grad_norm": 0.3200647532939911, + "learning_rate": 6.45485120260905e-06, + "loss": 4.3136, + "step": 41760 + }, + { + "epoch": 2.8376817502378042, + "grad_norm": 0.2772790789604187, + "learning_rate": 6.454426552520724e-06, + "loss": 4.0678, + "step": 41765 + }, + { + "epoch": 2.838021470308466, + "grad_norm": 0.3898334801197052, + "learning_rate": 6.454001902432397e-06, + "loss": 4.4331, + "step": 41770 + }, + { + "epoch": 2.8383611903791275, + "grad_norm": 0.3410170376300812, + "learning_rate": 6.453577252344069e-06, + "loss": 4.0782, + "step": 41775 + }, + { + "epoch": 2.8387009104497896, + "grad_norm": 0.22574065625667572, + "learning_rate": 6.453152602255742e-06, + "loss": 4.0904, + "step": 41780 + }, + { + "epoch": 2.839040630520451, + "grad_norm": 0.2685375511646271, + "learning_rate": 6.452727952167415e-06, + "loss": 4.0282, + "step": 41785 + }, + { + "epoch": 2.839380350591113, + "grad_norm": 0.268327921628952, + "learning_rate": 6.452303302079087e-06, + "loss": 4.0427, + "step": 41790 + }, + { + "epoch": 2.839720070661775, + "grad_norm": 0.44473668932914734, + "learning_rate": 6.451878651990761e-06, + "loss": 4.1474, + "step": 41795 + }, + { + "epoch": 2.8400597907324365, + "grad_norm": 0.3504163324832916, + "learning_rate": 6.451454001902433e-06, + "loss": 3.9718, + "step": 41800 + }, + { + "epoch": 2.840399510803098, + "grad_norm": 0.2782386839389801, + "learning_rate": 6.451029351814105e-06, + "loss": 4.2079, + "step": 41805 + }, + { + "epoch": 2.8407392308737602, + "grad_norm": 0.3616240322589874, + "learning_rate": 6.450604701725779e-06, + "loss": 4.302, + "step": 41810 + }, + { + "epoch": 2.841078950944422, + "grad_norm": 0.2817832827568054, + "learning_rate": 6.450180051637451e-06, + "loss": 4.3245, + "step": 41815 + }, + { + "epoch": 2.8414186710150835, + "grad_norm": 0.39887920022010803, + "learning_rate": 6.449755401549124e-06, + "loss": 4.1074, + "step": 41820 + }, + { + "epoch": 2.8417583910857456, + "grad_norm": 0.3616105616092682, + "learning_rate": 6.4493307514607974e-06, + "loss": 4.1411, + "step": 41825 + }, + { + "epoch": 2.842098111156407, + "grad_norm": 0.3289375603199005, + "learning_rate": 6.448906101372469e-06, + "loss": 3.993, + "step": 41830 + }, + { + "epoch": 2.842437831227069, + "grad_norm": 0.3417339324951172, + "learning_rate": 6.448481451284142e-06, + "loss": 3.9536, + "step": 41835 + }, + { + "epoch": 2.842777551297731, + "grad_norm": 0.275409996509552, + "learning_rate": 6.448056801195816e-06, + "loss": 4.0662, + "step": 41840 + }, + { + "epoch": 2.8431172713683925, + "grad_norm": 0.40745681524276733, + "learning_rate": 6.447632151107488e-06, + "loss": 4.2694, + "step": 41845 + }, + { + "epoch": 2.843456991439054, + "grad_norm": 0.46364763379096985, + "learning_rate": 6.447207501019161e-06, + "loss": 4.1746, + "step": 41850 + }, + { + "epoch": 2.8437967115097162, + "grad_norm": 0.2971811592578888, + "learning_rate": 6.446782850930834e-06, + "loss": 4.0336, + "step": 41855 + }, + { + "epoch": 2.844136431580378, + "grad_norm": 0.30380553007125854, + "learning_rate": 6.446358200842506e-06, + "loss": 3.836, + "step": 41860 + }, + { + "epoch": 2.8444761516510395, + "grad_norm": 0.3201640546321869, + "learning_rate": 6.445933550754179e-06, + "loss": 3.9889, + "step": 41865 + }, + { + "epoch": 2.8448158717217016, + "grad_norm": 0.39686325192451477, + "learning_rate": 6.445508900665852e-06, + "loss": 3.8194, + "step": 41870 + }, + { + "epoch": 2.845155591792363, + "grad_norm": 0.39310595393180847, + "learning_rate": 6.445084250577525e-06, + "loss": 4.0151, + "step": 41875 + }, + { + "epoch": 2.845495311863025, + "grad_norm": 0.31132426857948303, + "learning_rate": 6.444659600489197e-06, + "loss": 3.9963, + "step": 41880 + }, + { + "epoch": 2.8458350319336865, + "grad_norm": 0.3837707042694092, + "learning_rate": 6.44423495040087e-06, + "loss": 4.1845, + "step": 41885 + }, + { + "epoch": 2.8461747520043486, + "grad_norm": 0.2652003765106201, + "learning_rate": 6.443810300312543e-06, + "loss": 4.2869, + "step": 41890 + }, + { + "epoch": 2.84651447207501, + "grad_norm": 0.3600050210952759, + "learning_rate": 6.443385650224215e-06, + "loss": 4.3397, + "step": 41895 + }, + { + "epoch": 2.846854192145672, + "grad_norm": 0.33764761686325073, + "learning_rate": 6.442961000135889e-06, + "loss": 4.1465, + "step": 41900 + }, + { + "epoch": 2.847193912216334, + "grad_norm": 0.3559412956237793, + "learning_rate": 6.442536350047561e-06, + "loss": 4.0559, + "step": 41905 + }, + { + "epoch": 2.8475336322869955, + "grad_norm": 0.2569819688796997, + "learning_rate": 6.442111699959233e-06, + "loss": 3.9715, + "step": 41910 + }, + { + "epoch": 2.847873352357657, + "grad_norm": 0.2931400239467621, + "learning_rate": 6.441687049870907e-06, + "loss": 4.2594, + "step": 41915 + }, + { + "epoch": 2.8482130724283192, + "grad_norm": 0.3394862413406372, + "learning_rate": 6.44126239978258e-06, + "loss": 4.285, + "step": 41920 + }, + { + "epoch": 2.848552792498981, + "grad_norm": 0.3899845778942108, + "learning_rate": 6.440837749694252e-06, + "loss": 4.1735, + "step": 41925 + }, + { + "epoch": 2.8488925125696425, + "grad_norm": 0.28829941153526306, + "learning_rate": 6.440413099605925e-06, + "loss": 3.8718, + "step": 41930 + }, + { + "epoch": 2.849232232640304, + "grad_norm": 0.3743482828140259, + "learning_rate": 6.439988449517598e-06, + "loss": 4.16, + "step": 41935 + }, + { + "epoch": 2.849571952710966, + "grad_norm": 0.736650288105011, + "learning_rate": 6.43956379942927e-06, + "loss": 4.0429, + "step": 41940 + }, + { + "epoch": 2.849911672781628, + "grad_norm": 0.2710595726966858, + "learning_rate": 6.439139149340944e-06, + "loss": 3.9751, + "step": 41945 + }, + { + "epoch": 2.8502513928522895, + "grad_norm": 0.45623284578323364, + "learning_rate": 6.438714499252617e-06, + "loss": 3.8607, + "step": 41950 + }, + { + "epoch": 2.8505911129229515, + "grad_norm": 0.2620748281478882, + "learning_rate": 6.4382898491642886e-06, + "loss": 4.2059, + "step": 41955 + }, + { + "epoch": 2.850930832993613, + "grad_norm": 0.2542775869369507, + "learning_rate": 6.437865199075962e-06, + "loss": 4.1724, + "step": 41960 + }, + { + "epoch": 2.851270553064275, + "grad_norm": 0.3610950708389282, + "learning_rate": 6.437440548987635e-06, + "loss": 3.9368, + "step": 41965 + }, + { + "epoch": 2.851610273134937, + "grad_norm": 0.20491011440753937, + "learning_rate": 6.437015898899307e-06, + "loss": 3.9632, + "step": 41970 + }, + { + "epoch": 2.8519499932055985, + "grad_norm": 0.34427592158317566, + "learning_rate": 6.436591248810981e-06, + "loss": 4.1747, + "step": 41975 + }, + { + "epoch": 2.85228971327626, + "grad_norm": 0.30006980895996094, + "learning_rate": 6.4361665987226526e-06, + "loss": 4.0352, + "step": 41980 + }, + { + "epoch": 2.852629433346922, + "grad_norm": 0.35184627771377563, + "learning_rate": 6.435741948634325e-06, + "loss": 4.3257, + "step": 41985 + }, + { + "epoch": 2.852969153417584, + "grad_norm": 0.5730725526809692, + "learning_rate": 6.435317298545999e-06, + "loss": 4.1263, + "step": 41990 + }, + { + "epoch": 2.8533088734882455, + "grad_norm": 0.4099743664264679, + "learning_rate": 6.434892648457671e-06, + "loss": 4.1105, + "step": 41995 + }, + { + "epoch": 2.8536485935589075, + "grad_norm": 0.2780665457248688, + "learning_rate": 6.434467998369344e-06, + "loss": 4.2432, + "step": 42000 + }, + { + "epoch": 2.853988313629569, + "grad_norm": 0.3514252007007599, + "learning_rate": 6.434043348281017e-06, + "loss": 4.248, + "step": 42005 + }, + { + "epoch": 2.854328033700231, + "grad_norm": 0.3659090995788574, + "learning_rate": 6.433618698192689e-06, + "loss": 3.9506, + "step": 42010 + }, + { + "epoch": 2.854667753770893, + "grad_norm": 0.32115209102630615, + "learning_rate": 6.433194048104362e-06, + "loss": 4.2113, + "step": 42015 + }, + { + "epoch": 2.8550074738415545, + "grad_norm": 0.3105934262275696, + "learning_rate": 6.432769398016036e-06, + "loss": 4.1757, + "step": 42020 + }, + { + "epoch": 2.855347193912216, + "grad_norm": 0.3725217282772064, + "learning_rate": 6.432344747927708e-06, + "loss": 4.3942, + "step": 42025 + }, + { + "epoch": 2.855686913982878, + "grad_norm": 0.2510318458080292, + "learning_rate": 6.4319200978393814e-06, + "loss": 4.1098, + "step": 42030 + }, + { + "epoch": 2.85602663405354, + "grad_norm": 0.3421857953071594, + "learning_rate": 6.431495447751054e-06, + "loss": 3.919, + "step": 42035 + }, + { + "epoch": 2.8563663541242015, + "grad_norm": 0.37200650572776794, + "learning_rate": 6.431070797662726e-06, + "loss": 4.0973, + "step": 42040 + }, + { + "epoch": 2.8567060741948636, + "grad_norm": 0.6406606435775757, + "learning_rate": 6.4306461475744e-06, + "loss": 4.3181, + "step": 42045 + }, + { + "epoch": 2.857045794265525, + "grad_norm": 0.5849560499191284, + "learning_rate": 6.430221497486072e-06, + "loss": 4.2274, + "step": 42050 + }, + { + "epoch": 2.857385514336187, + "grad_norm": 0.45109912753105164, + "learning_rate": 6.429796847397745e-06, + "loss": 4.2815, + "step": 42055 + }, + { + "epoch": 2.857725234406849, + "grad_norm": 0.24690964818000793, + "learning_rate": 6.429372197309418e-06, + "loss": 3.9324, + "step": 42060 + }, + { + "epoch": 2.8580649544775105, + "grad_norm": 0.29374203085899353, + "learning_rate": 6.42894754722109e-06, + "loss": 4.2426, + "step": 42065 + }, + { + "epoch": 2.858404674548172, + "grad_norm": 0.3530105948448181, + "learning_rate": 6.428522897132763e-06, + "loss": 4.1065, + "step": 42070 + }, + { + "epoch": 2.8587443946188342, + "grad_norm": 0.30857574939727783, + "learning_rate": 6.428098247044437e-06, + "loss": 4.0489, + "step": 42075 + }, + { + "epoch": 2.859084114689496, + "grad_norm": 0.2629433572292328, + "learning_rate": 6.427673596956109e-06, + "loss": 3.9699, + "step": 42080 + }, + { + "epoch": 2.8594238347601575, + "grad_norm": 0.36264124512672424, + "learning_rate": 6.427248946867781e-06, + "loss": 3.9253, + "step": 42085 + }, + { + "epoch": 2.8597635548308196, + "grad_norm": 0.32324787974357605, + "learning_rate": 6.426824296779455e-06, + "loss": 4.0967, + "step": 42090 + }, + { + "epoch": 2.860103274901481, + "grad_norm": 0.44548243284225464, + "learning_rate": 6.426399646691127e-06, + "loss": 4.1116, + "step": 42095 + }, + { + "epoch": 2.860442994972143, + "grad_norm": 0.3310277760028839, + "learning_rate": 6.4259749966028e-06, + "loss": 4.1553, + "step": 42100 + }, + { + "epoch": 2.860782715042805, + "grad_norm": 0.24289549887180328, + "learning_rate": 6.4255503465144734e-06, + "loss": 3.7803, + "step": 42105 + }, + { + "epoch": 2.8611224351134665, + "grad_norm": 0.28473860025405884, + "learning_rate": 6.425125696426145e-06, + "loss": 4.2304, + "step": 42110 + }, + { + "epoch": 2.861462155184128, + "grad_norm": 0.670924961566925, + "learning_rate": 6.424701046337818e-06, + "loss": 4.0079, + "step": 42115 + }, + { + "epoch": 2.8618018752547902, + "grad_norm": 0.3114093542098999, + "learning_rate": 6.424276396249491e-06, + "loss": 4.2228, + "step": 42120 + }, + { + "epoch": 2.862141595325452, + "grad_norm": 0.4421844482421875, + "learning_rate": 6.423851746161164e-06, + "loss": 3.9899, + "step": 42125 + }, + { + "epoch": 2.8624813153961135, + "grad_norm": 0.31747445464134216, + "learning_rate": 6.423427096072837e-06, + "loss": 4.1204, + "step": 42130 + }, + { + "epoch": 2.8628210354667756, + "grad_norm": 0.5292691588401794, + "learning_rate": 6.423002445984509e-06, + "loss": 4.2063, + "step": 42135 + }, + { + "epoch": 2.863160755537437, + "grad_norm": 0.30083709955215454, + "learning_rate": 6.422577795896182e-06, + "loss": 4.0958, + "step": 42140 + }, + { + "epoch": 2.863500475608099, + "grad_norm": 0.34690824151039124, + "learning_rate": 6.422153145807854e-06, + "loss": 4.1007, + "step": 42145 + }, + { + "epoch": 2.863840195678761, + "grad_norm": 0.40418586134910583, + "learning_rate": 6.421728495719528e-06, + "loss": 4.1166, + "step": 42150 + }, + { + "epoch": 2.8641799157494225, + "grad_norm": 0.3235832750797272, + "learning_rate": 6.421303845631201e-06, + "loss": 4.1791, + "step": 42155 + }, + { + "epoch": 2.864519635820084, + "grad_norm": 0.2356172651052475, + "learning_rate": 6.4208791955428726e-06, + "loss": 4.0145, + "step": 42160 + }, + { + "epoch": 2.8648593558907463, + "grad_norm": 0.294516384601593, + "learning_rate": 6.420454545454546e-06, + "loss": 3.7721, + "step": 42165 + }, + { + "epoch": 2.865199075961408, + "grad_norm": 0.3770972788333893, + "learning_rate": 6.420029895366219e-06, + "loss": 4.124, + "step": 42170 + }, + { + "epoch": 2.8655387960320695, + "grad_norm": 0.22412535548210144, + "learning_rate": 6.419605245277891e-06, + "loss": 4.0167, + "step": 42175 + }, + { + "epoch": 2.8658785161027316, + "grad_norm": 0.3137924373149872, + "learning_rate": 6.419180595189565e-06, + "loss": 4.2652, + "step": 42180 + }, + { + "epoch": 2.866218236173393, + "grad_norm": 0.2928006649017334, + "learning_rate": 6.418755945101237e-06, + "loss": 4.0498, + "step": 42185 + }, + { + "epoch": 2.866557956244055, + "grad_norm": 0.21806754171848297, + "learning_rate": 6.418331295012909e-06, + "loss": 4.1182, + "step": 42190 + }, + { + "epoch": 2.866897676314717, + "grad_norm": 0.3609802722930908, + "learning_rate": 6.417906644924583e-06, + "loss": 4.1194, + "step": 42195 + }, + { + "epoch": 2.8672373963853786, + "grad_norm": 0.3618173599243164, + "learning_rate": 6.417481994836256e-06, + "loss": 4.1761, + "step": 42200 + }, + { + "epoch": 2.86757711645604, + "grad_norm": 0.3195483684539795, + "learning_rate": 6.417057344747928e-06, + "loss": 4.1904, + "step": 42205 + }, + { + "epoch": 2.8679168365267023, + "grad_norm": 0.378964364528656, + "learning_rate": 6.416632694659601e-06, + "loss": 4.2376, + "step": 42210 + }, + { + "epoch": 2.868256556597364, + "grad_norm": 0.20957531034946442, + "learning_rate": 6.416208044571273e-06, + "loss": 4.0505, + "step": 42215 + }, + { + "epoch": 2.8685962766680255, + "grad_norm": 0.3087986409664154, + "learning_rate": 6.415783394482946e-06, + "loss": 3.7406, + "step": 42220 + }, + { + "epoch": 2.868935996738687, + "grad_norm": 0.3314252495765686, + "learning_rate": 6.41535874439462e-06, + "loss": 4.2465, + "step": 42225 + }, + { + "epoch": 2.8692757168093492, + "grad_norm": 0.4389788806438446, + "learning_rate": 6.414934094306292e-06, + "loss": 4.152, + "step": 42230 + }, + { + "epoch": 2.869615436880011, + "grad_norm": 0.316606342792511, + "learning_rate": 6.4145094442179646e-06, + "loss": 4.2867, + "step": 42235 + }, + { + "epoch": 2.8699551569506725, + "grad_norm": 0.28960350155830383, + "learning_rate": 6.414084794129638e-06, + "loss": 4.2366, + "step": 42240 + }, + { + "epoch": 2.8702948770213346, + "grad_norm": 0.4808342456817627, + "learning_rate": 6.41366014404131e-06, + "loss": 4.1788, + "step": 42245 + }, + { + "epoch": 2.870634597091996, + "grad_norm": 0.2924494743347168, + "learning_rate": 6.413235493952983e-06, + "loss": 3.9726, + "step": 42250 + }, + { + "epoch": 2.870974317162658, + "grad_norm": 0.32907408475875854, + "learning_rate": 6.412810843864657e-06, + "loss": 4.1722, + "step": 42255 + }, + { + "epoch": 2.87131403723332, + "grad_norm": 0.28066232800483704, + "learning_rate": 6.4123861937763286e-06, + "loss": 3.9372, + "step": 42260 + }, + { + "epoch": 2.8716537573039815, + "grad_norm": 0.3001053035259247, + "learning_rate": 6.411961543688001e-06, + "loss": 3.9434, + "step": 42265 + }, + { + "epoch": 2.871993477374643, + "grad_norm": 0.551072359085083, + "learning_rate": 6.411536893599675e-06, + "loss": 3.9694, + "step": 42270 + }, + { + "epoch": 2.872333197445305, + "grad_norm": 0.3060532510280609, + "learning_rate": 6.411112243511347e-06, + "loss": 4.1321, + "step": 42275 + }, + { + "epoch": 2.872672917515967, + "grad_norm": 0.251531183719635, + "learning_rate": 6.41068759342302e-06, + "loss": 4.0329, + "step": 42280 + }, + { + "epoch": 2.8730126375866285, + "grad_norm": 0.44055870175361633, + "learning_rate": 6.410262943334693e-06, + "loss": 3.9396, + "step": 42285 + }, + { + "epoch": 2.87335235765729, + "grad_norm": 0.3444724977016449, + "learning_rate": 6.409838293246365e-06, + "loss": 4.2731, + "step": 42290 + }, + { + "epoch": 2.873692077727952, + "grad_norm": 0.41253402829170227, + "learning_rate": 6.409413643158038e-06, + "loss": 3.953, + "step": 42295 + }, + { + "epoch": 2.874031797798614, + "grad_norm": 0.29783254861831665, + "learning_rate": 6.408988993069711e-06, + "loss": 4.1752, + "step": 42300 + }, + { + "epoch": 2.8743715178692755, + "grad_norm": 0.3744717836380005, + "learning_rate": 6.408564342981384e-06, + "loss": 4.2189, + "step": 42305 + }, + { + "epoch": 2.8747112379399375, + "grad_norm": 0.3748668134212494, + "learning_rate": 6.408139692893056e-06, + "loss": 4.1174, + "step": 42310 + }, + { + "epoch": 2.875050958010599, + "grad_norm": 0.30083778500556946, + "learning_rate": 6.407715042804729e-06, + "loss": 4.1974, + "step": 42315 + }, + { + "epoch": 2.875390678081261, + "grad_norm": 0.4292962849140167, + "learning_rate": 6.407290392716402e-06, + "loss": 4.1952, + "step": 42320 + }, + { + "epoch": 2.875730398151923, + "grad_norm": 0.26118651032447815, + "learning_rate": 6.406865742628074e-06, + "loss": 4.0772, + "step": 42325 + }, + { + "epoch": 2.8760701182225845, + "grad_norm": 0.2666637599468231, + "learning_rate": 6.406441092539748e-06, + "loss": 4.0533, + "step": 42330 + }, + { + "epoch": 2.876409838293246, + "grad_norm": 0.2921749949455261, + "learning_rate": 6.406016442451421e-06, + "loss": 4.0839, + "step": 42335 + }, + { + "epoch": 2.876749558363908, + "grad_norm": 0.28033825755119324, + "learning_rate": 6.4055917923630925e-06, + "loss": 4.0701, + "step": 42340 + }, + { + "epoch": 2.87708927843457, + "grad_norm": 0.34317097067832947, + "learning_rate": 6.405167142274766e-06, + "loss": 4.1255, + "step": 42345 + }, + { + "epoch": 2.8774289985052315, + "grad_norm": 0.27558326721191406, + "learning_rate": 6.404742492186439e-06, + "loss": 4.1596, + "step": 42350 + }, + { + "epoch": 2.8777687185758936, + "grad_norm": 0.26915106177330017, + "learning_rate": 6.404317842098111e-06, + "loss": 4.1303, + "step": 42355 + }, + { + "epoch": 2.878108438646555, + "grad_norm": 0.30805206298828125, + "learning_rate": 6.403893192009785e-06, + "loss": 3.9729, + "step": 42360 + }, + { + "epoch": 2.878448158717217, + "grad_norm": 0.27782538533210754, + "learning_rate": 6.403468541921457e-06, + "loss": 3.9361, + "step": 42365 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.38913753628730774, + "learning_rate": 6.40304389183313e-06, + "loss": 4.4941, + "step": 42370 + }, + { + "epoch": 2.8791275988585405, + "grad_norm": 0.33893612027168274, + "learning_rate": 6.402619241744803e-06, + "loss": 4.3389, + "step": 42375 + }, + { + "epoch": 2.879467318929202, + "grad_norm": 0.34503743052482605, + "learning_rate": 6.402194591656476e-06, + "loss": 4.0931, + "step": 42380 + }, + { + "epoch": 2.8798070389998642, + "grad_norm": 0.38275617361068726, + "learning_rate": 6.401769941568149e-06, + "loss": 3.8917, + "step": 42385 + }, + { + "epoch": 2.880146759070526, + "grad_norm": 0.2712876498699188, + "learning_rate": 6.401345291479821e-06, + "loss": 3.9267, + "step": 42390 + }, + { + "epoch": 2.8804864791411875, + "grad_norm": 0.3440549671649933, + "learning_rate": 6.400920641391493e-06, + "loss": 4.1309, + "step": 42395 + }, + { + "epoch": 2.8808261992118496, + "grad_norm": 0.46206632256507874, + "learning_rate": 6.400580921320832e-06, + "loss": 4.2703, + "step": 42400 + }, + { + "epoch": 2.881165919282511, + "grad_norm": 0.3127560615539551, + "learning_rate": 6.400156271232505e-06, + "loss": 4.0544, + "step": 42405 + }, + { + "epoch": 2.881505639353173, + "grad_norm": 0.3424752652645111, + "learning_rate": 6.399731621144177e-06, + "loss": 4.1031, + "step": 42410 + }, + { + "epoch": 2.881845359423835, + "grad_norm": 0.2852213680744171, + "learning_rate": 6.399306971055851e-06, + "loss": 4.1663, + "step": 42415 + }, + { + "epoch": 2.8821850794944965, + "grad_norm": 0.40229013562202454, + "learning_rate": 6.3988823209675235e-06, + "loss": 4.3872, + "step": 42420 + }, + { + "epoch": 2.882524799565158, + "grad_norm": 0.34777840971946716, + "learning_rate": 6.398457670879195e-06, + "loss": 3.8744, + "step": 42425 + }, + { + "epoch": 2.8828645196358202, + "grad_norm": 0.33459916710853577, + "learning_rate": 6.398033020790869e-06, + "loss": 4.0188, + "step": 42430 + }, + { + "epoch": 2.883204239706482, + "grad_norm": 0.3728928565979004, + "learning_rate": 6.397608370702542e-06, + "loss": 4.0975, + "step": 42435 + }, + { + "epoch": 2.8835439597771435, + "grad_norm": 0.27201610803604126, + "learning_rate": 6.397183720614214e-06, + "loss": 4.2608, + "step": 42440 + }, + { + "epoch": 2.8838836798478056, + "grad_norm": 0.37054258584976196, + "learning_rate": 6.3967590705258875e-06, + "loss": 4.3486, + "step": 42445 + }, + { + "epoch": 2.884223399918467, + "grad_norm": 0.6325844526290894, + "learning_rate": 6.39633442043756e-06, + "loss": 4.0739, + "step": 42450 + }, + { + "epoch": 2.884563119989129, + "grad_norm": 0.2717214822769165, + "learning_rate": 6.395909770349232e-06, + "loss": 4.0698, + "step": 42455 + }, + { + "epoch": 2.884902840059791, + "grad_norm": 0.25826191902160645, + "learning_rate": 6.395485120260906e-06, + "loss": 4.0604, + "step": 42460 + }, + { + "epoch": 2.8852425601304525, + "grad_norm": 0.223573237657547, + "learning_rate": 6.395060470172579e-06, + "loss": 3.9665, + "step": 42465 + }, + { + "epoch": 2.885582280201114, + "grad_norm": 0.35503754019737244, + "learning_rate": 6.394635820084251e-06, + "loss": 3.983, + "step": 42470 + }, + { + "epoch": 2.8859220002717763, + "grad_norm": 0.31285256147384644, + "learning_rate": 6.394211169995924e-06, + "loss": 4.1952, + "step": 42475 + }, + { + "epoch": 2.886261720342438, + "grad_norm": 0.3307328224182129, + "learning_rate": 6.393786519907596e-06, + "loss": 4.0366, + "step": 42480 + }, + { + "epoch": 2.8866014404130995, + "grad_norm": 0.2760362923145294, + "learning_rate": 6.393361869819269e-06, + "loss": 4.0441, + "step": 42485 + }, + { + "epoch": 2.8869411604837616, + "grad_norm": 0.3433591425418854, + "learning_rate": 6.392937219730943e-06, + "loss": 4.0537, + "step": 42490 + }, + { + "epoch": 2.8872808805544232, + "grad_norm": 0.320799320936203, + "learning_rate": 6.392512569642615e-06, + "loss": 3.982, + "step": 42495 + }, + { + "epoch": 2.887620600625085, + "grad_norm": 0.3191812336444855, + "learning_rate": 6.3920879195542874e-06, + "loss": 3.9829, + "step": 42500 + }, + { + "epoch": 2.887960320695747, + "grad_norm": 0.2445913404226303, + "learning_rate": 6.391663269465961e-06, + "loss": 4.0946, + "step": 42505 + }, + { + "epoch": 2.8883000407664086, + "grad_norm": 0.5783270001411438, + "learning_rate": 6.391238619377633e-06, + "loss": 4.1853, + "step": 42510 + }, + { + "epoch": 2.88863976083707, + "grad_norm": 0.3551919758319855, + "learning_rate": 6.390813969289306e-06, + "loss": 4.0927, + "step": 42515 + }, + { + "epoch": 2.8889794809077323, + "grad_norm": 0.35858210921287537, + "learning_rate": 6.3903893192009795e-06, + "loss": 4.3938, + "step": 42520 + }, + { + "epoch": 2.889319200978394, + "grad_norm": 0.29119181632995605, + "learning_rate": 6.3899646691126514e-06, + "loss": 4.1392, + "step": 42525 + }, + { + "epoch": 2.8896589210490555, + "grad_norm": 0.2517675757408142, + "learning_rate": 6.389540019024324e-06, + "loss": 4.2422, + "step": 42530 + }, + { + "epoch": 2.8899986411197176, + "grad_norm": 0.3057129681110382, + "learning_rate": 6.389115368935998e-06, + "loss": 3.9852, + "step": 42535 + }, + { + "epoch": 2.8903383611903792, + "grad_norm": 0.34089726209640503, + "learning_rate": 6.38869071884767e-06, + "loss": 4.269, + "step": 42540 + }, + { + "epoch": 2.890678081261041, + "grad_norm": 0.2873436510562897, + "learning_rate": 6.388266068759343e-06, + "loss": 3.9765, + "step": 42545 + }, + { + "epoch": 2.891017801331703, + "grad_norm": 0.3692763149738312, + "learning_rate": 6.3878414186710154e-06, + "loss": 3.9261, + "step": 42550 + }, + { + "epoch": 2.8913575214023646, + "grad_norm": 0.38115188479423523, + "learning_rate": 6.387416768582688e-06, + "loss": 3.8813, + "step": 42555 + }, + { + "epoch": 2.891697241473026, + "grad_norm": 0.28370919823646545, + "learning_rate": 6.386992118494361e-06, + "loss": 4.0506, + "step": 42560 + }, + { + "epoch": 2.8920369615436883, + "grad_norm": 0.4117698669433594, + "learning_rate": 6.386567468406034e-06, + "loss": 4.2956, + "step": 42565 + }, + { + "epoch": 2.89237668161435, + "grad_norm": 0.42097344994544983, + "learning_rate": 6.386142818317707e-06, + "loss": 4.101, + "step": 42570 + }, + { + "epoch": 2.8927164016850115, + "grad_norm": 0.301704078912735, + "learning_rate": 6.38571816822938e-06, + "loss": 3.9508, + "step": 42575 + }, + { + "epoch": 2.893056121755673, + "grad_norm": 0.3290724456310272, + "learning_rate": 6.385293518141052e-06, + "loss": 3.9259, + "step": 42580 + }, + { + "epoch": 2.8933958418263352, + "grad_norm": 0.22816696763038635, + "learning_rate": 6.384868868052725e-06, + "loss": 4.0171, + "step": 42585 + }, + { + "epoch": 2.893735561896997, + "grad_norm": 0.34376707673072815, + "learning_rate": 6.384444217964399e-06, + "loss": 4.2672, + "step": 42590 + }, + { + "epoch": 2.8940752819676585, + "grad_norm": 0.32635244727134705, + "learning_rate": 6.384019567876071e-06, + "loss": 4.1207, + "step": 42595 + }, + { + "epoch": 2.8944150020383206, + "grad_norm": 0.4039771854877472, + "learning_rate": 6.3835949177877434e-06, + "loss": 4.2512, + "step": 42600 + }, + { + "epoch": 2.894754722108982, + "grad_norm": 0.42529335618019104, + "learning_rate": 6.383170267699417e-06, + "loss": 3.9052, + "step": 42605 + }, + { + "epoch": 2.895094442179644, + "grad_norm": 0.292140394449234, + "learning_rate": 6.382745617611089e-06, + "loss": 3.9993, + "step": 42610 + }, + { + "epoch": 2.8954341622503055, + "grad_norm": 0.4306875765323639, + "learning_rate": 6.382320967522762e-06, + "loss": 3.9798, + "step": 42615 + }, + { + "epoch": 2.8957738823209676, + "grad_norm": 0.5602509379386902, + "learning_rate": 6.381896317434435e-06, + "loss": 4.1497, + "step": 42620 + }, + { + "epoch": 2.896113602391629, + "grad_norm": 0.31910187005996704, + "learning_rate": 6.3814716673461074e-06, + "loss": 4.0501, + "step": 42625 + }, + { + "epoch": 2.896453322462291, + "grad_norm": 0.5980179309844971, + "learning_rate": 6.38104701725778e-06, + "loss": 4.0365, + "step": 42630 + }, + { + "epoch": 2.896793042532953, + "grad_norm": 0.30951839685440063, + "learning_rate": 6.380622367169453e-06, + "loss": 4.0887, + "step": 42635 + }, + { + "epoch": 2.8971327626036145, + "grad_norm": 0.36909204721450806, + "learning_rate": 6.380197717081126e-06, + "loss": 4.1217, + "step": 42640 + }, + { + "epoch": 2.897472482674276, + "grad_norm": 0.6410396695137024, + "learning_rate": 6.379773066992798e-06, + "loss": 4.2589, + "step": 42645 + }, + { + "epoch": 2.8978122027449382, + "grad_norm": 0.2473197728395462, + "learning_rate": 6.3793484169044715e-06, + "loss": 4.0434, + "step": 42650 + }, + { + "epoch": 2.8981519228156, + "grad_norm": 0.3823889195919037, + "learning_rate": 6.378923766816144e-06, + "loss": 4.1263, + "step": 42655 + }, + { + "epoch": 2.8984916428862615, + "grad_norm": 0.4339233338832855, + "learning_rate": 6.378499116727816e-06, + "loss": 4.1066, + "step": 42660 + }, + { + "epoch": 2.8988313629569236, + "grad_norm": 0.27151355147361755, + "learning_rate": 6.37807446663949e-06, + "loss": 4.1322, + "step": 42665 + }, + { + "epoch": 2.899171083027585, + "grad_norm": 0.4769507348537445, + "learning_rate": 6.377649816551163e-06, + "loss": 3.9152, + "step": 42670 + }, + { + "epoch": 2.899510803098247, + "grad_norm": 0.2696497142314911, + "learning_rate": 6.377225166462835e-06, + "loss": 4.0871, + "step": 42675 + }, + { + "epoch": 2.899850523168909, + "grad_norm": 0.2987005412578583, + "learning_rate": 6.376800516374508e-06, + "loss": 3.956, + "step": 42680 + }, + { + "epoch": 2.9001902432395705, + "grad_norm": 0.3164271116256714, + "learning_rate": 6.376375866286181e-06, + "loss": 4.0118, + "step": 42685 + }, + { + "epoch": 2.900529963310232, + "grad_norm": 0.4091396927833557, + "learning_rate": 6.375951216197853e-06, + "loss": 4.178, + "step": 42690 + }, + { + "epoch": 2.9008696833808942, + "grad_norm": 0.2488265335559845, + "learning_rate": 6.375526566109527e-06, + "loss": 4.0835, + "step": 42695 + }, + { + "epoch": 2.901209403451556, + "grad_norm": 0.2463270127773285, + "learning_rate": 6.3751019160211995e-06, + "loss": 4.1874, + "step": 42700 + }, + { + "epoch": 2.9015491235222175, + "grad_norm": 0.3618469536304474, + "learning_rate": 6.374677265932871e-06, + "loss": 4.4261, + "step": 42705 + }, + { + "epoch": 2.9018888435928796, + "grad_norm": 0.3385120928287506, + "learning_rate": 6.374252615844545e-06, + "loss": 4.1036, + "step": 42710 + }, + { + "epoch": 2.902228563663541, + "grad_norm": 0.2847059965133667, + "learning_rate": 6.373827965756217e-06, + "loss": 4.2517, + "step": 42715 + }, + { + "epoch": 2.902568283734203, + "grad_norm": 0.2726222276687622, + "learning_rate": 6.37340331566789e-06, + "loss": 4.066, + "step": 42720 + }, + { + "epoch": 2.902908003804865, + "grad_norm": 0.36315685510635376, + "learning_rate": 6.3729786655795635e-06, + "loss": 4.2532, + "step": 42725 + }, + { + "epoch": 2.9032477238755265, + "grad_norm": 0.36893677711486816, + "learning_rate": 6.372554015491235e-06, + "loss": 4.2687, + "step": 42730 + }, + { + "epoch": 2.903587443946188, + "grad_norm": 0.2418612539768219, + "learning_rate": 6.372129365402908e-06, + "loss": 3.9141, + "step": 42735 + }, + { + "epoch": 2.9039271640168502, + "grad_norm": 0.43880072236061096, + "learning_rate": 6.371704715314582e-06, + "loss": 4.1567, + "step": 42740 + }, + { + "epoch": 2.904266884087512, + "grad_norm": 0.2646597921848297, + "learning_rate": 6.371280065226254e-06, + "loss": 3.9716, + "step": 42745 + }, + { + "epoch": 2.9046066041581735, + "grad_norm": 0.28964748978614807, + "learning_rate": 6.370855415137927e-06, + "loss": 3.9233, + "step": 42750 + }, + { + "epoch": 2.9049463242288356, + "grad_norm": 0.3322697877883911, + "learning_rate": 6.3704307650496e-06, + "loss": 4.1216, + "step": 42755 + }, + { + "epoch": 2.905286044299497, + "grad_norm": 0.326271116733551, + "learning_rate": 6.370006114961272e-06, + "loss": 4.0194, + "step": 42760 + }, + { + "epoch": 2.905625764370159, + "grad_norm": 0.3460148274898529, + "learning_rate": 6.369581464872945e-06, + "loss": 4.1147, + "step": 42765 + }, + { + "epoch": 2.905965484440821, + "grad_norm": 0.5189924240112305, + "learning_rate": 6.369156814784619e-06, + "loss": 4.1278, + "step": 42770 + }, + { + "epoch": 2.9063052045114826, + "grad_norm": 0.24260658025741577, + "learning_rate": 6.368732164696291e-06, + "loss": 4.3083, + "step": 42775 + }, + { + "epoch": 2.906644924582144, + "grad_norm": 0.31056010723114014, + "learning_rate": 6.3683075146079634e-06, + "loss": 4.1312, + "step": 42780 + }, + { + "epoch": 2.9069846446528063, + "grad_norm": 0.33253660798072815, + "learning_rate": 6.367882864519637e-06, + "loss": 3.8499, + "step": 42785 + }, + { + "epoch": 2.907324364723468, + "grad_norm": 0.3686317503452301, + "learning_rate": 6.367458214431309e-06, + "loss": 4.3329, + "step": 42790 + }, + { + "epoch": 2.9076640847941295, + "grad_norm": 0.3141219913959503, + "learning_rate": 6.367033564342982e-06, + "loss": 3.8309, + "step": 42795 + }, + { + "epoch": 2.9080038048647916, + "grad_norm": 0.34582585096359253, + "learning_rate": 6.366608914254655e-06, + "loss": 4.0951, + "step": 42800 + }, + { + "epoch": 2.9083435249354532, + "grad_norm": 0.42529913783073425, + "learning_rate": 6.3661842641663274e-06, + "loss": 3.9916, + "step": 42805 + }, + { + "epoch": 2.908683245006115, + "grad_norm": 0.27189674973487854, + "learning_rate": 6.365759614077999e-06, + "loss": 4.2266, + "step": 42810 + }, + { + "epoch": 2.909022965076777, + "grad_norm": 0.4491163492202759, + "learning_rate": 6.365334963989673e-06, + "loss": 4.0611, + "step": 42815 + }, + { + "epoch": 2.9093626851474386, + "grad_norm": 0.3023031949996948, + "learning_rate": 6.364910313901346e-06, + "loss": 4.2225, + "step": 42820 + }, + { + "epoch": 2.9097024052181, + "grad_norm": 0.2881506383419037, + "learning_rate": 6.364485663813018e-06, + "loss": 4.2609, + "step": 42825 + }, + { + "epoch": 2.9100421252887623, + "grad_norm": 0.241195946931839, + "learning_rate": 6.3640610137246914e-06, + "loss": 3.9891, + "step": 42830 + }, + { + "epoch": 2.910381845359424, + "grad_norm": 0.32480311393737793, + "learning_rate": 6.363636363636364e-06, + "loss": 4.3125, + "step": 42835 + }, + { + "epoch": 2.9107215654300855, + "grad_norm": 0.26577386260032654, + "learning_rate": 6.363211713548036e-06, + "loss": 4.1026, + "step": 42840 + }, + { + "epoch": 2.9110612855007476, + "grad_norm": 0.2686423659324646, + "learning_rate": 6.36278706345971e-06, + "loss": 3.9595, + "step": 42845 + }, + { + "epoch": 2.9114010055714092, + "grad_norm": 0.28422924876213074, + "learning_rate": 6.362362413371383e-06, + "loss": 4.0141, + "step": 42850 + }, + { + "epoch": 2.911740725642071, + "grad_norm": 0.38845258951187134, + "learning_rate": 6.361937763283055e-06, + "loss": 4.0369, + "step": 42855 + }, + { + "epoch": 2.912080445712733, + "grad_norm": 0.2912499010562897, + "learning_rate": 6.361513113194728e-06, + "loss": 4.2073, + "step": 42860 + }, + { + "epoch": 2.9124201657833946, + "grad_norm": 0.6535465717315674, + "learning_rate": 6.361088463106401e-06, + "loss": 4.1151, + "step": 42865 + }, + { + "epoch": 2.912759885854056, + "grad_norm": 0.3158765137195587, + "learning_rate": 6.360663813018073e-06, + "loss": 4.1173, + "step": 42870 + }, + { + "epoch": 2.9130996059247183, + "grad_norm": 0.3394230306148529, + "learning_rate": 6.360239162929747e-06, + "loss": 3.9743, + "step": 42875 + }, + { + "epoch": 2.91343932599538, + "grad_norm": 0.2757450342178345, + "learning_rate": 6.3598145128414194e-06, + "loss": 4.1207, + "step": 42880 + }, + { + "epoch": 2.9137790460660415, + "grad_norm": 0.44292372465133667, + "learning_rate": 6.359389862753091e-06, + "loss": 4.3128, + "step": 42885 + }, + { + "epoch": 2.9141187661367036, + "grad_norm": 0.264527827501297, + "learning_rate": 6.358965212664765e-06, + "loss": 3.9903, + "step": 42890 + }, + { + "epoch": 2.9144584862073653, + "grad_norm": 0.4943791627883911, + "learning_rate": 6.358540562576437e-06, + "loss": 4.1954, + "step": 42895 + }, + { + "epoch": 2.914798206278027, + "grad_norm": 0.28047290444374084, + "learning_rate": 6.35811591248811e-06, + "loss": 4.0184, + "step": 42900 + }, + { + "epoch": 2.915137926348689, + "grad_norm": 0.2498064637184143, + "learning_rate": 6.3576912623997834e-06, + "loss": 4.2743, + "step": 42905 + }, + { + "epoch": 2.9154776464193506, + "grad_norm": 0.6271222233772278, + "learning_rate": 6.357266612311455e-06, + "loss": 4.0749, + "step": 42910 + }, + { + "epoch": 2.915817366490012, + "grad_norm": 0.28547903895378113, + "learning_rate": 6.356841962223129e-06, + "loss": 4.074, + "step": 42915 + }, + { + "epoch": 2.916157086560674, + "grad_norm": 0.3509618639945984, + "learning_rate": 6.356417312134802e-06, + "loss": 3.9426, + "step": 42920 + }, + { + "epoch": 2.916496806631336, + "grad_norm": 0.29753991961479187, + "learning_rate": 6.355992662046474e-06, + "loss": 4.0809, + "step": 42925 + }, + { + "epoch": 2.9168365267019976, + "grad_norm": 0.28455495834350586, + "learning_rate": 6.3555680119581474e-06, + "loss": 4.148, + "step": 42930 + }, + { + "epoch": 2.917176246772659, + "grad_norm": 0.3871593773365021, + "learning_rate": 6.35514336186982e-06, + "loss": 4.1891, + "step": 42935 + }, + { + "epoch": 2.9175159668433213, + "grad_norm": 0.2603963315486908, + "learning_rate": 6.354718711781492e-06, + "loss": 4.263, + "step": 42940 + }, + { + "epoch": 2.917855686913983, + "grad_norm": 0.4238588511943817, + "learning_rate": 6.354294061693166e-06, + "loss": 4.2281, + "step": 42945 + }, + { + "epoch": 2.9181954069846445, + "grad_norm": 0.3313015401363373, + "learning_rate": 6.353869411604839e-06, + "loss": 4.0941, + "step": 42950 + }, + { + "epoch": 2.918535127055306, + "grad_norm": 0.37152737379074097, + "learning_rate": 6.353444761516511e-06, + "loss": 3.9219, + "step": 42955 + }, + { + "epoch": 2.9188748471259682, + "grad_norm": 0.3061894476413727, + "learning_rate": 6.353020111428184e-06, + "loss": 4.2374, + "step": 42960 + }, + { + "epoch": 2.91921456719663, + "grad_norm": 0.20706769824028015, + "learning_rate": 6.352595461339856e-06, + "loss": 4.0699, + "step": 42965 + }, + { + "epoch": 2.9195542872672915, + "grad_norm": 0.339542418718338, + "learning_rate": 6.352170811251529e-06, + "loss": 4.2804, + "step": 42970 + }, + { + "epoch": 2.9198940073379536, + "grad_norm": 0.23551052808761597, + "learning_rate": 6.351746161163203e-06, + "loss": 4.0547, + "step": 42975 + }, + { + "epoch": 2.920233727408615, + "grad_norm": 0.26381954550743103, + "learning_rate": 6.351321511074875e-06, + "loss": 4.1462, + "step": 42980 + }, + { + "epoch": 2.920573447479277, + "grad_norm": 0.5274572372436523, + "learning_rate": 6.350896860986547e-06, + "loss": 3.9349, + "step": 42985 + }, + { + "epoch": 2.920913167549939, + "grad_norm": 0.5261923670768738, + "learning_rate": 6.350472210898221e-06, + "loss": 4.0217, + "step": 42990 + }, + { + "epoch": 2.9212528876206005, + "grad_norm": 0.33922716975212097, + "learning_rate": 6.350047560809893e-06, + "loss": 3.9607, + "step": 42995 + }, + { + "epoch": 2.921592607691262, + "grad_norm": 0.4076475501060486, + "learning_rate": 6.349622910721566e-06, + "loss": 3.9481, + "step": 43000 + }, + { + "epoch": 2.9219323277619242, + "grad_norm": 0.27840110659599304, + "learning_rate": 6.3491982606332395e-06, + "loss": 4.2341, + "step": 43005 + }, + { + "epoch": 2.922272047832586, + "grad_norm": 0.22469346225261688, + "learning_rate": 6.348773610544911e-06, + "loss": 3.9575, + "step": 43010 + }, + { + "epoch": 2.9226117679032475, + "grad_norm": 0.6440495252609253, + "learning_rate": 6.348348960456584e-06, + "loss": 3.8789, + "step": 43015 + }, + { + "epoch": 2.9229514879739096, + "grad_norm": 0.4102027118206024, + "learning_rate": 6.347924310368258e-06, + "loss": 4.2516, + "step": 43020 + }, + { + "epoch": 2.923291208044571, + "grad_norm": 0.3294235169887543, + "learning_rate": 6.34749966027993e-06, + "loss": 4.2886, + "step": 43025 + }, + { + "epoch": 2.923630928115233, + "grad_norm": 0.3171224892139435, + "learning_rate": 6.347075010191603e-06, + "loss": 4.1003, + "step": 43030 + }, + { + "epoch": 2.923970648185895, + "grad_norm": 0.23230689764022827, + "learning_rate": 6.346650360103276e-06, + "loss": 4.0886, + "step": 43035 + }, + { + "epoch": 2.9243103682565565, + "grad_norm": 0.2410319447517395, + "learning_rate": 6.346225710014948e-06, + "loss": 4.0132, + "step": 43040 + }, + { + "epoch": 2.924650088327218, + "grad_norm": 0.28125080466270447, + "learning_rate": 6.345801059926621e-06, + "loss": 4.13, + "step": 43045 + }, + { + "epoch": 2.9249898083978803, + "grad_norm": 0.27088236808776855, + "learning_rate": 6.345376409838294e-06, + "loss": 4.0646, + "step": 43050 + }, + { + "epoch": 2.925329528468542, + "grad_norm": 0.2675643265247345, + "learning_rate": 6.344951759749967e-06, + "loss": 3.9828, + "step": 43055 + }, + { + "epoch": 2.9256692485392035, + "grad_norm": 0.2789582312107086, + "learning_rate": 6.3445271096616386e-06, + "loss": 4.0231, + "step": 43060 + }, + { + "epoch": 2.9260089686098656, + "grad_norm": 0.29446470737457275, + "learning_rate": 6.344102459573312e-06, + "loss": 4.1167, + "step": 43065 + }, + { + "epoch": 2.926348688680527, + "grad_norm": 0.4355533719062805, + "learning_rate": 6.343677809484985e-06, + "loss": 4.1129, + "step": 43070 + }, + { + "epoch": 2.926688408751189, + "grad_norm": 0.5557639598846436, + "learning_rate": 6.343253159396657e-06, + "loss": 4.1245, + "step": 43075 + }, + { + "epoch": 2.927028128821851, + "grad_norm": 0.40928739309310913, + "learning_rate": 6.342828509308331e-06, + "loss": 4.1312, + "step": 43080 + }, + { + "epoch": 2.9273678488925126, + "grad_norm": 0.34902825951576233, + "learning_rate": 6.3424038592200034e-06, + "loss": 4.127, + "step": 43085 + }, + { + "epoch": 2.927707568963174, + "grad_norm": 0.26595014333724976, + "learning_rate": 6.341979209131675e-06, + "loss": 4.0492, + "step": 43090 + }, + { + "epoch": 2.9280472890338363, + "grad_norm": 0.2596486508846283, + "learning_rate": 6.341554559043349e-06, + "loss": 3.7503, + "step": 43095 + }, + { + "epoch": 2.928387009104498, + "grad_norm": 0.3171564042568207, + "learning_rate": 6.341129908955022e-06, + "loss": 3.856, + "step": 43100 + }, + { + "epoch": 2.9287267291751595, + "grad_norm": 0.30073684453964233, + "learning_rate": 6.340705258866694e-06, + "loss": 3.9943, + "step": 43105 + }, + { + "epoch": 2.9290664492458216, + "grad_norm": 0.2815317213535309, + "learning_rate": 6.3402806087783674e-06, + "loss": 3.98, + "step": 43110 + }, + { + "epoch": 2.9294061693164832, + "grad_norm": 0.25870445370674133, + "learning_rate": 6.33985595869004e-06, + "loss": 4.1844, + "step": 43115 + }, + { + "epoch": 2.929745889387145, + "grad_norm": 0.36205750703811646, + "learning_rate": 6.339431308601712e-06, + "loss": 4.0303, + "step": 43120 + }, + { + "epoch": 2.930085609457807, + "grad_norm": 0.32378852367401123, + "learning_rate": 6.339006658513386e-06, + "loss": 4.1203, + "step": 43125 + }, + { + "epoch": 2.9304253295284686, + "grad_norm": 0.3523716628551483, + "learning_rate": 6.338582008425059e-06, + "loss": 4.1796, + "step": 43130 + }, + { + "epoch": 2.93076504959913, + "grad_norm": 0.35119402408599854, + "learning_rate": 6.338157358336731e-06, + "loss": 3.8226, + "step": 43135 + }, + { + "epoch": 2.9311047696697923, + "grad_norm": 0.35893985629081726, + "learning_rate": 6.337732708248404e-06, + "loss": 4.1062, + "step": 43140 + }, + { + "epoch": 2.931444489740454, + "grad_norm": 0.29170453548431396, + "learning_rate": 6.337308058160076e-06, + "loss": 4.1754, + "step": 43145 + }, + { + "epoch": 2.9317842098111155, + "grad_norm": 0.2747843265533447, + "learning_rate": 6.336883408071749e-06, + "loss": 4.3513, + "step": 43150 + }, + { + "epoch": 2.9321239298817776, + "grad_norm": 0.3248627483844757, + "learning_rate": 6.336458757983423e-06, + "loss": 4.1338, + "step": 43155 + }, + { + "epoch": 2.9324636499524392, + "grad_norm": 0.3403521478176117, + "learning_rate": 6.336034107895095e-06, + "loss": 3.9706, + "step": 43160 + }, + { + "epoch": 2.932803370023101, + "grad_norm": 0.3379804790019989, + "learning_rate": 6.335609457806767e-06, + "loss": 4.0958, + "step": 43165 + }, + { + "epoch": 2.933143090093763, + "grad_norm": 0.2971228361129761, + "learning_rate": 6.335184807718441e-06, + "loss": 3.9004, + "step": 43170 + }, + { + "epoch": 2.9334828101644246, + "grad_norm": 0.38082921504974365, + "learning_rate": 6.334760157630113e-06, + "loss": 3.8922, + "step": 43175 + }, + { + "epoch": 2.933822530235086, + "grad_norm": 0.5505419373512268, + "learning_rate": 6.334335507541786e-06, + "loss": 4.2194, + "step": 43180 + }, + { + "epoch": 2.9341622503057483, + "grad_norm": 0.24593666195869446, + "learning_rate": 6.3339108574534594e-06, + "loss": 3.7986, + "step": 43185 + }, + { + "epoch": 2.93450197037641, + "grad_norm": 0.3311125636100769, + "learning_rate": 6.333486207365131e-06, + "loss": 4.0308, + "step": 43190 + }, + { + "epoch": 2.9348416904470715, + "grad_norm": 0.3622412085533142, + "learning_rate": 6.333061557276804e-06, + "loss": 4.3095, + "step": 43195 + }, + { + "epoch": 2.9351814105177336, + "grad_norm": 0.5853675603866577, + "learning_rate": 6.332636907188478e-06, + "loss": 4.123, + "step": 43200 + }, + { + "epoch": 2.9355211305883953, + "grad_norm": 0.5935848951339722, + "learning_rate": 6.33221225710015e-06, + "loss": 4.1251, + "step": 43205 + }, + { + "epoch": 2.935860850659057, + "grad_norm": 0.2521975040435791, + "learning_rate": 6.331787607011823e-06, + "loss": 4.0345, + "step": 43210 + }, + { + "epoch": 2.936200570729719, + "grad_norm": 0.32313427329063416, + "learning_rate": 6.331362956923495e-06, + "loss": 3.9485, + "step": 43215 + }, + { + "epoch": 2.9365402908003806, + "grad_norm": 0.2605019807815552, + "learning_rate": 6.330938306835168e-06, + "loss": 4.0544, + "step": 43220 + }, + { + "epoch": 2.9368800108710422, + "grad_norm": 0.3263035714626312, + "learning_rate": 6.330513656746841e-06, + "loss": 3.9483, + "step": 43225 + }, + { + "epoch": 2.9372197309417043, + "grad_norm": 0.2872108519077301, + "learning_rate": 6.330089006658514e-06, + "loss": 3.9895, + "step": 43230 + }, + { + "epoch": 2.937559451012366, + "grad_norm": 0.5384459495544434, + "learning_rate": 6.329664356570187e-06, + "loss": 4.0441, + "step": 43235 + }, + { + "epoch": 2.9378991710830276, + "grad_norm": 0.32951459288597107, + "learning_rate": 6.3292397064818586e-06, + "loss": 4.0157, + "step": 43240 + }, + { + "epoch": 2.9382388911536896, + "grad_norm": 0.25895506143569946, + "learning_rate": 6.328815056393532e-06, + "loss": 3.9784, + "step": 43245 + }, + { + "epoch": 2.9385786112243513, + "grad_norm": 0.3005218207836151, + "learning_rate": 6.328390406305205e-06, + "loss": 4.1469, + "step": 43250 + }, + { + "epoch": 2.938918331295013, + "grad_norm": 0.24618183076381683, + "learning_rate": 6.327965756216879e-06, + "loss": 4.2136, + "step": 43255 + }, + { + "epoch": 2.9392580513656745, + "grad_norm": 0.37768813967704773, + "learning_rate": 6.327541106128551e-06, + "loss": 4.1329, + "step": 43260 + }, + { + "epoch": 2.9395977714363366, + "grad_norm": 0.32328081130981445, + "learning_rate": 6.327116456040223e-06, + "loss": 3.9361, + "step": 43265 + }, + { + "epoch": 2.9399374915069982, + "grad_norm": 0.26396480202674866, + "learning_rate": 6.326691805951897e-06, + "loss": 3.8276, + "step": 43270 + }, + { + "epoch": 2.94027721157766, + "grad_norm": 0.25044941902160645, + "learning_rate": 6.326267155863569e-06, + "loss": 3.8566, + "step": 43275 + }, + { + "epoch": 2.940616931648322, + "grad_norm": 0.5395330786705017, + "learning_rate": 6.325842505775242e-06, + "loss": 4.0759, + "step": 43280 + }, + { + "epoch": 2.9409566517189836, + "grad_norm": 0.3089270293712616, + "learning_rate": 6.325417855686915e-06, + "loss": 4.1113, + "step": 43285 + }, + { + "epoch": 2.941296371789645, + "grad_norm": 0.21163295209407806, + "learning_rate": 6.324993205598587e-06, + "loss": 4.0854, + "step": 43290 + }, + { + "epoch": 2.941636091860307, + "grad_norm": 0.37303218245506287, + "learning_rate": 6.32456855551026e-06, + "loss": 4.2331, + "step": 43295 + }, + { + "epoch": 2.941975811930969, + "grad_norm": 0.461417019367218, + "learning_rate": 6.324143905421933e-06, + "loss": 4.2071, + "step": 43300 + }, + { + "epoch": 2.9423155320016305, + "grad_norm": 0.30671894550323486, + "learning_rate": 6.323719255333606e-06, + "loss": 3.8123, + "step": 43305 + }, + { + "epoch": 2.942655252072292, + "grad_norm": 0.2538086175918579, + "learning_rate": 6.323294605245278e-06, + "loss": 3.995, + "step": 43310 + }, + { + "epoch": 2.9429949721429542, + "grad_norm": 0.34166252613067627, + "learning_rate": 6.322869955156951e-06, + "loss": 3.9192, + "step": 43315 + }, + { + "epoch": 2.943334692213616, + "grad_norm": 0.34327417612075806, + "learning_rate": 6.322445305068624e-06, + "loss": 4.244, + "step": 43320 + }, + { + "epoch": 2.9436744122842775, + "grad_norm": 0.42860302329063416, + "learning_rate": 6.322020654980296e-06, + "loss": 4.2157, + "step": 43325 + }, + { + "epoch": 2.9440141323549396, + "grad_norm": 0.2628140151500702, + "learning_rate": 6.32159600489197e-06, + "loss": 4.1262, + "step": 43330 + }, + { + "epoch": 2.944353852425601, + "grad_norm": 0.3307388722896576, + "learning_rate": 6.321171354803643e-06, + "loss": 3.9159, + "step": 43335 + }, + { + "epoch": 2.944693572496263, + "grad_norm": 0.2376973032951355, + "learning_rate": 6.3207467047153146e-06, + "loss": 4.1478, + "step": 43340 + }, + { + "epoch": 2.945033292566925, + "grad_norm": 0.3448496460914612, + "learning_rate": 6.320322054626988e-06, + "loss": 4.2218, + "step": 43345 + }, + { + "epoch": 2.9453730126375866, + "grad_norm": 0.2639380395412445, + "learning_rate": 6.319897404538661e-06, + "loss": 4.2487, + "step": 43350 + }, + { + "epoch": 2.945712732708248, + "grad_norm": 0.5351875424385071, + "learning_rate": 6.319472754450333e-06, + "loss": 4.2797, + "step": 43355 + }, + { + "epoch": 2.9460524527789103, + "grad_norm": 0.2673819959163666, + "learning_rate": 6.319048104362007e-06, + "loss": 4.0927, + "step": 43360 + }, + { + "epoch": 2.946392172849572, + "grad_norm": 0.3677598237991333, + "learning_rate": 6.318623454273679e-06, + "loss": 3.9524, + "step": 43365 + }, + { + "epoch": 2.9467318929202335, + "grad_norm": 0.48718687891960144, + "learning_rate": 6.318198804185351e-06, + "loss": 4.3509, + "step": 43370 + }, + { + "epoch": 2.9470716129908956, + "grad_norm": 0.43906304240226746, + "learning_rate": 6.317774154097025e-06, + "loss": 4.1838, + "step": 43375 + }, + { + "epoch": 2.9474113330615572, + "grad_norm": 0.3158840537071228, + "learning_rate": 6.317349504008697e-06, + "loss": 4.0799, + "step": 43380 + }, + { + "epoch": 2.947751053132219, + "grad_norm": 0.31031402945518494, + "learning_rate": 6.31692485392037e-06, + "loss": 4.0144, + "step": 43385 + }, + { + "epoch": 2.948090773202881, + "grad_norm": 0.29075607657432556, + "learning_rate": 6.3165002038320434e-06, + "loss": 4.2201, + "step": 43390 + }, + { + "epoch": 2.9484304932735426, + "grad_norm": 0.34213224053382874, + "learning_rate": 6.316075553743715e-06, + "loss": 4.0497, + "step": 43395 + }, + { + "epoch": 2.948770213344204, + "grad_norm": 0.5086959600448608, + "learning_rate": 6.315650903655388e-06, + "loss": 4.0964, + "step": 43400 + }, + { + "epoch": 2.9491099334148663, + "grad_norm": 0.3811047673225403, + "learning_rate": 6.315226253567062e-06, + "loss": 3.8493, + "step": 43405 + }, + { + "epoch": 2.949449653485528, + "grad_norm": 0.3746865391731262, + "learning_rate": 6.314801603478734e-06, + "loss": 4.0156, + "step": 43410 + }, + { + "epoch": 2.9497893735561895, + "grad_norm": 0.27244123816490173, + "learning_rate": 6.314376953390407e-06, + "loss": 3.8507, + "step": 43415 + }, + { + "epoch": 2.9501290936268516, + "grad_norm": 0.30728039145469666, + "learning_rate": 6.31395230330208e-06, + "loss": 4.1295, + "step": 43420 + }, + { + "epoch": 2.9504688136975132, + "grad_norm": 0.30517303943634033, + "learning_rate": 6.313527653213752e-06, + "loss": 4.271, + "step": 43425 + }, + { + "epoch": 2.950808533768175, + "grad_norm": 0.4214308261871338, + "learning_rate": 6.313103003125425e-06, + "loss": 4.2798, + "step": 43430 + }, + { + "epoch": 2.951148253838837, + "grad_norm": 0.3395228981971741, + "learning_rate": 6.312678353037099e-06, + "loss": 4.3957, + "step": 43435 + }, + { + "epoch": 2.9514879739094986, + "grad_norm": 0.2723403573036194, + "learning_rate": 6.312253702948771e-06, + "loss": 3.9939, + "step": 43440 + }, + { + "epoch": 2.95182769398016, + "grad_norm": 0.33211326599121094, + "learning_rate": 6.311829052860443e-06, + "loss": 4.2632, + "step": 43445 + }, + { + "epoch": 2.9521674140508223, + "grad_norm": 0.32624638080596924, + "learning_rate": 6.311404402772117e-06, + "loss": 3.9967, + "step": 43450 + }, + { + "epoch": 2.952507134121484, + "grad_norm": 0.6192296147346497, + "learning_rate": 6.310979752683789e-06, + "loss": 4.1049, + "step": 43455 + }, + { + "epoch": 2.9528468541921455, + "grad_norm": 0.4219388961791992, + "learning_rate": 6.310555102595462e-06, + "loss": 3.9055, + "step": 43460 + }, + { + "epoch": 2.9531865742628076, + "grad_norm": 0.40859392285346985, + "learning_rate": 6.310130452507135e-06, + "loss": 4.0364, + "step": 43465 + }, + { + "epoch": 2.9535262943334692, + "grad_norm": 0.3691442310810089, + "learning_rate": 6.309705802418807e-06, + "loss": 4.2726, + "step": 43470 + }, + { + "epoch": 2.953866014404131, + "grad_norm": 0.3743135333061218, + "learning_rate": 6.30928115233048e-06, + "loss": 3.9129, + "step": 43475 + }, + { + "epoch": 2.954205734474793, + "grad_norm": 0.37138256430625916, + "learning_rate": 6.308856502242153e-06, + "loss": 4.2704, + "step": 43480 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.33514323830604553, + "learning_rate": 6.308431852153826e-06, + "loss": 4.2379, + "step": 43485 + }, + { + "epoch": 2.954885174616116, + "grad_norm": 0.29256439208984375, + "learning_rate": 6.308007202065498e-06, + "loss": 4.1559, + "step": 43490 + }, + { + "epoch": 2.9552248946867783, + "grad_norm": 0.23323631286621094, + "learning_rate": 6.307582551977171e-06, + "loss": 3.8662, + "step": 43495 + }, + { + "epoch": 2.95556461475744, + "grad_norm": 0.31112828850746155, + "learning_rate": 6.307157901888844e-06, + "loss": 3.9512, + "step": 43500 + }, + { + "epoch": 2.9559043348281016, + "grad_norm": 0.2828062176704407, + "learning_rate": 6.306733251800516e-06, + "loss": 4.0839, + "step": 43505 + }, + { + "epoch": 2.9562440548987636, + "grad_norm": 0.3542782962322235, + "learning_rate": 6.30630860171219e-06, + "loss": 4.2006, + "step": 43510 + }, + { + "epoch": 2.9565837749694253, + "grad_norm": 0.35601890087127686, + "learning_rate": 6.305883951623863e-06, + "loss": 3.8868, + "step": 43515 + }, + { + "epoch": 2.956923495040087, + "grad_norm": 0.3291551172733307, + "learning_rate": 6.3054593015355346e-06, + "loss": 4.0365, + "step": 43520 + }, + { + "epoch": 2.957263215110749, + "grad_norm": 0.30709579586982727, + "learning_rate": 6.305034651447208e-06, + "loss": 4.1693, + "step": 43525 + }, + { + "epoch": 2.9576029351814106, + "grad_norm": 0.6585543155670166, + "learning_rate": 6.304610001358881e-06, + "loss": 4.1053, + "step": 43530 + }, + { + "epoch": 2.9579426552520722, + "grad_norm": 0.394031822681427, + "learning_rate": 6.304185351270553e-06, + "loss": 3.9403, + "step": 43535 + }, + { + "epoch": 2.9582823753227343, + "grad_norm": 0.3095897436141968, + "learning_rate": 6.303760701182227e-06, + "loss": 3.8894, + "step": 43540 + }, + { + "epoch": 2.958622095393396, + "grad_norm": 0.2911556661128998, + "learning_rate": 6.303336051093899e-06, + "loss": 3.9435, + "step": 43545 + }, + { + "epoch": 2.9589618154640576, + "grad_norm": 0.33159229159355164, + "learning_rate": 6.302911401005571e-06, + "loss": 4.2154, + "step": 43550 + }, + { + "epoch": 2.9593015355347196, + "grad_norm": 0.34602659940719604, + "learning_rate": 6.302486750917245e-06, + "loss": 4.158, + "step": 43555 + }, + { + "epoch": 2.9596412556053813, + "grad_norm": 0.28838878870010376, + "learning_rate": 6.302062100828917e-06, + "loss": 4.089, + "step": 43560 + }, + { + "epoch": 2.959980975676043, + "grad_norm": 0.3839625120162964, + "learning_rate": 6.30163745074059e-06, + "loss": 3.7825, + "step": 43565 + }, + { + "epoch": 2.960320695746705, + "grad_norm": 0.5654308199882507, + "learning_rate": 6.301212800652263e-06, + "loss": 3.9394, + "step": 43570 + }, + { + "epoch": 2.9606604158173666, + "grad_norm": 0.34526941180229187, + "learning_rate": 6.300788150563935e-06, + "loss": 4.0666, + "step": 43575 + }, + { + "epoch": 2.9610001358880282, + "grad_norm": 0.4189952611923218, + "learning_rate": 6.300363500475608e-06, + "loss": 4.054, + "step": 43580 + }, + { + "epoch": 2.9613398559586903, + "grad_norm": 0.37871304154396057, + "learning_rate": 6.299938850387282e-06, + "loss": 4.1435, + "step": 43585 + }, + { + "epoch": 2.961679576029352, + "grad_norm": 0.46341875195503235, + "learning_rate": 6.299514200298954e-06, + "loss": 4.1616, + "step": 43590 + }, + { + "epoch": 2.9620192961000136, + "grad_norm": 0.3164316415786743, + "learning_rate": 6.299089550210627e-06, + "loss": 3.9858, + "step": 43595 + }, + { + "epoch": 2.962359016170675, + "grad_norm": 0.26309722661972046, + "learning_rate": 6.2986649001223e-06, + "loss": 4.0522, + "step": 43600 + }, + { + "epoch": 2.9626987362413373, + "grad_norm": 0.22533772885799408, + "learning_rate": 6.298240250033972e-06, + "loss": 3.9801, + "step": 43605 + }, + { + "epoch": 2.963038456311999, + "grad_norm": 0.3124501407146454, + "learning_rate": 6.297815599945646e-06, + "loss": 4.0322, + "step": 43610 + }, + { + "epoch": 2.9633781763826605, + "grad_norm": 0.30194178223609924, + "learning_rate": 6.297390949857319e-06, + "loss": 3.9941, + "step": 43615 + }, + { + "epoch": 2.9637178964533226, + "grad_norm": 0.3565506041049957, + "learning_rate": 6.2969662997689906e-06, + "loss": 4.0885, + "step": 43620 + }, + { + "epoch": 2.9640576165239843, + "grad_norm": 0.2765846848487854, + "learning_rate": 6.296541649680664e-06, + "loss": 4.0533, + "step": 43625 + }, + { + "epoch": 2.964397336594646, + "grad_norm": 0.37145519256591797, + "learning_rate": 6.296116999592336e-06, + "loss": 4.041, + "step": 43630 + }, + { + "epoch": 2.9647370566653075, + "grad_norm": 0.4361737370491028, + "learning_rate": 6.295692349504009e-06, + "loss": 4.244, + "step": 43635 + }, + { + "epoch": 2.9650767767359696, + "grad_norm": 0.3525587022304535, + "learning_rate": 6.295267699415683e-06, + "loss": 4.1674, + "step": 43640 + }, + { + "epoch": 2.965416496806631, + "grad_norm": 0.29799845814704895, + "learning_rate": 6.2948430493273546e-06, + "loss": 4.1559, + "step": 43645 + }, + { + "epoch": 2.965756216877293, + "grad_norm": 0.4038304090499878, + "learning_rate": 6.294418399239027e-06, + "loss": 4.1303, + "step": 43650 + }, + { + "epoch": 2.966095936947955, + "grad_norm": 0.3031052052974701, + "learning_rate": 6.293993749150701e-06, + "loss": 4.2569, + "step": 43655 + }, + { + "epoch": 2.9664356570186166, + "grad_norm": 0.30260953307151794, + "learning_rate": 6.293569099062373e-06, + "loss": 4.3104, + "step": 43660 + }, + { + "epoch": 2.966775377089278, + "grad_norm": 0.2947876751422882, + "learning_rate": 6.293144448974046e-06, + "loss": 4.1014, + "step": 43665 + }, + { + "epoch": 2.9671150971599403, + "grad_norm": 0.33060359954833984, + "learning_rate": 6.292719798885719e-06, + "loss": 4.0944, + "step": 43670 + }, + { + "epoch": 2.967454817230602, + "grad_norm": 0.2806376814842224, + "learning_rate": 6.292295148797391e-06, + "loss": 3.9575, + "step": 43675 + }, + { + "epoch": 2.9677945373012635, + "grad_norm": 0.33356592059135437, + "learning_rate": 6.291870498709064e-06, + "loss": 4.1315, + "step": 43680 + }, + { + "epoch": 2.9681342573719256, + "grad_norm": 0.2942105233669281, + "learning_rate": 6.291445848620738e-06, + "loss": 4.1899, + "step": 43685 + }, + { + "epoch": 2.9684739774425872, + "grad_norm": 0.2908257246017456, + "learning_rate": 6.29102119853241e-06, + "loss": 4.126, + "step": 43690 + }, + { + "epoch": 2.968813697513249, + "grad_norm": 0.3760777711868286, + "learning_rate": 6.290596548444083e-06, + "loss": 4.3331, + "step": 43695 + }, + { + "epoch": 2.969153417583911, + "grad_norm": 0.3736666738986969, + "learning_rate": 6.290171898355756e-06, + "loss": 3.9975, + "step": 43700 + }, + { + "epoch": 2.9694931376545726, + "grad_norm": 0.4146619141101837, + "learning_rate": 6.289747248267428e-06, + "loss": 4.2739, + "step": 43705 + }, + { + "epoch": 2.969832857725234, + "grad_norm": 0.30634409189224243, + "learning_rate": 6.289322598179101e-06, + "loss": 4.0048, + "step": 43710 + }, + { + "epoch": 2.9701725777958963, + "grad_norm": 0.429054319858551, + "learning_rate": 6.288897948090774e-06, + "loss": 3.9246, + "step": 43715 + }, + { + "epoch": 2.970512297866558, + "grad_norm": 0.27849021553993225, + "learning_rate": 6.288473298002447e-06, + "loss": 3.9042, + "step": 43720 + }, + { + "epoch": 2.9708520179372195, + "grad_norm": 0.41190120577812195, + "learning_rate": 6.2880486479141185e-06, + "loss": 3.9708, + "step": 43725 + }, + { + "epoch": 2.9711917380078816, + "grad_norm": 0.36131376028060913, + "learning_rate": 6.287623997825792e-06, + "loss": 4.2649, + "step": 43730 + }, + { + "epoch": 2.9715314580785432, + "grad_norm": 0.2664549648761749, + "learning_rate": 6.287199347737465e-06, + "loss": 4.1712, + "step": 43735 + }, + { + "epoch": 2.971871178149205, + "grad_norm": 0.2856772541999817, + "learning_rate": 6.286774697649137e-06, + "loss": 4.0061, + "step": 43740 + }, + { + "epoch": 2.972210898219867, + "grad_norm": 0.31025755405426025, + "learning_rate": 6.286350047560811e-06, + "loss": 4.0669, + "step": 43745 + }, + { + "epoch": 2.9725506182905286, + "grad_norm": 0.2779572904109955, + "learning_rate": 6.285925397472483e-06, + "loss": 4.1324, + "step": 43750 + }, + { + "epoch": 2.97289033836119, + "grad_norm": 0.31596237421035767, + "learning_rate": 6.285500747384155e-06, + "loss": 4.14, + "step": 43755 + }, + { + "epoch": 2.9732300584318523, + "grad_norm": 0.2971198558807373, + "learning_rate": 6.285076097295829e-06, + "loss": 4.3909, + "step": 43760 + }, + { + "epoch": 2.973569778502514, + "grad_norm": 0.39623475074768066, + "learning_rate": 6.284651447207502e-06, + "loss": 4.0119, + "step": 43765 + }, + { + "epoch": 2.9739094985731755, + "grad_norm": 0.4044703543186188, + "learning_rate": 6.284226797119174e-06, + "loss": 4.039, + "step": 43770 + }, + { + "epoch": 2.9742492186438376, + "grad_norm": 0.38135719299316406, + "learning_rate": 6.283802147030847e-06, + "loss": 3.9061, + "step": 43775 + }, + { + "epoch": 2.9745889387144993, + "grad_norm": 0.24628795683383942, + "learning_rate": 6.28337749694252e-06, + "loss": 4.0814, + "step": 43780 + }, + { + "epoch": 2.974928658785161, + "grad_norm": 0.24091151356697083, + "learning_rate": 6.282952846854192e-06, + "loss": 3.8653, + "step": 43785 + }, + { + "epoch": 2.975268378855823, + "grad_norm": 0.3957355320453644, + "learning_rate": 6.282528196765866e-06, + "loss": 4.0585, + "step": 43790 + }, + { + "epoch": 2.9756080989264846, + "grad_norm": 0.4173252582550049, + "learning_rate": 6.282103546677539e-06, + "loss": 4.0713, + "step": 43795 + }, + { + "epoch": 2.975947818997146, + "grad_norm": 0.25496500730514526, + "learning_rate": 6.2816788965892105e-06, + "loss": 4.0322, + "step": 43800 + }, + { + "epoch": 2.9762875390678083, + "grad_norm": 0.26601874828338623, + "learning_rate": 6.281254246500884e-06, + "loss": 4.1253, + "step": 43805 + }, + { + "epoch": 2.97662725913847, + "grad_norm": 0.40158531069755554, + "learning_rate": 6.280829596412556e-06, + "loss": 3.9861, + "step": 43810 + }, + { + "epoch": 2.9769669792091316, + "grad_norm": 0.3283807635307312, + "learning_rate": 6.280404946324229e-06, + "loss": 4.353, + "step": 43815 + }, + { + "epoch": 2.9773066992797936, + "grad_norm": 0.28900378942489624, + "learning_rate": 6.279980296235903e-06, + "loss": 4.0796, + "step": 43820 + }, + { + "epoch": 2.9776464193504553, + "grad_norm": 0.23478113114833832, + "learning_rate": 6.2795556461475746e-06, + "loss": 4.0898, + "step": 43825 + }, + { + "epoch": 2.977986139421117, + "grad_norm": 0.2784697711467743, + "learning_rate": 6.279130996059247e-06, + "loss": 4.1697, + "step": 43830 + }, + { + "epoch": 2.978325859491779, + "grad_norm": 0.48136094212532043, + "learning_rate": 6.278706345970921e-06, + "loss": 3.9419, + "step": 43835 + }, + { + "epoch": 2.9786655795624406, + "grad_norm": 0.24101845920085907, + "learning_rate": 6.278281695882593e-06, + "loss": 4.013, + "step": 43840 + }, + { + "epoch": 2.9790052996331022, + "grad_norm": 0.2526110112667084, + "learning_rate": 6.277857045794266e-06, + "loss": 4.1022, + "step": 43845 + }, + { + "epoch": 2.9793450197037643, + "grad_norm": 0.4208475649356842, + "learning_rate": 6.277432395705939e-06, + "loss": 4.1489, + "step": 43850 + }, + { + "epoch": 2.979684739774426, + "grad_norm": 0.21480311453342438, + "learning_rate": 6.277007745617611e-06, + "loss": 3.9789, + "step": 43855 + }, + { + "epoch": 2.9800244598450876, + "grad_norm": 0.4159485995769501, + "learning_rate": 6.276583095529284e-06, + "loss": 4.2109, + "step": 43860 + }, + { + "epoch": 2.9803641799157496, + "grad_norm": 0.35946932435035706, + "learning_rate": 6.276158445440958e-06, + "loss": 3.9807, + "step": 43865 + }, + { + "epoch": 2.9807038999864113, + "grad_norm": 0.31180545687675476, + "learning_rate": 6.27573379535263e-06, + "loss": 4.0668, + "step": 43870 + }, + { + "epoch": 2.981043620057073, + "grad_norm": 0.3808722198009491, + "learning_rate": 6.2753091452643026e-06, + "loss": 3.9625, + "step": 43875 + }, + { + "epoch": 2.981383340127735, + "grad_norm": 0.2877368628978729, + "learning_rate": 6.274884495175975e-06, + "loss": 4.254, + "step": 43880 + }, + { + "epoch": 2.9817230601983966, + "grad_norm": 0.5847010016441345, + "learning_rate": 6.274459845087648e-06, + "loss": 3.9397, + "step": 43885 + }, + { + "epoch": 2.9820627802690582, + "grad_norm": 0.28909972310066223, + "learning_rate": 6.274035194999321e-06, + "loss": 3.8547, + "step": 43890 + }, + { + "epoch": 2.9824025003397203, + "grad_norm": 0.39949941635131836, + "learning_rate": 6.273610544910994e-06, + "loss": 4.4354, + "step": 43895 + }, + { + "epoch": 2.982742220410382, + "grad_norm": 0.368864506483078, + "learning_rate": 6.2731858948226666e-06, + "loss": 4.0137, + "step": 43900 + }, + { + "epoch": 2.9830819404810436, + "grad_norm": 0.3257102072238922, + "learning_rate": 6.2727612447343385e-06, + "loss": 4.1219, + "step": 43905 + }, + { + "epoch": 2.9834216605517057, + "grad_norm": 0.259562224149704, + "learning_rate": 6.272336594646012e-06, + "loss": 4.1075, + "step": 43910 + }, + { + "epoch": 2.9837613806223673, + "grad_norm": 0.29542359709739685, + "learning_rate": 6.271911944557685e-06, + "loss": 4.1055, + "step": 43915 + }, + { + "epoch": 2.984101100693029, + "grad_norm": 0.30996477603912354, + "learning_rate": 6.271487294469357e-06, + "loss": 3.9595, + "step": 43920 + }, + { + "epoch": 2.984440820763691, + "grad_norm": 0.31693974137306213, + "learning_rate": 6.2710626443810306e-06, + "loss": 4.1876, + "step": 43925 + }, + { + "epoch": 2.9847805408343526, + "grad_norm": 0.3592541217803955, + "learning_rate": 6.270637994292703e-06, + "loss": 3.9678, + "step": 43930 + }, + { + "epoch": 2.9851202609050143, + "grad_norm": 0.4020242393016815, + "learning_rate": 6.270213344204377e-06, + "loss": 3.688, + "step": 43935 + }, + { + "epoch": 2.985459980975676, + "grad_norm": 0.39278990030288696, + "learning_rate": 6.269788694116049e-06, + "loss": 4.1044, + "step": 43940 + }, + { + "epoch": 2.985799701046338, + "grad_norm": 0.3139536380767822, + "learning_rate": 6.269364044027722e-06, + "loss": 3.8511, + "step": 43945 + }, + { + "epoch": 2.9861394211169996, + "grad_norm": 0.2882060110569, + "learning_rate": 6.2689393939393946e-06, + "loss": 4.1092, + "step": 43950 + }, + { + "epoch": 2.9864791411876612, + "grad_norm": 0.3116571307182312, + "learning_rate": 6.268514743851067e-06, + "loss": 4.0181, + "step": 43955 + }, + { + "epoch": 2.9868188612583233, + "grad_norm": 0.3733536899089813, + "learning_rate": 6.26809009376274e-06, + "loss": 4.2017, + "step": 43960 + }, + { + "epoch": 2.987158581328985, + "grad_norm": 0.38130536675453186, + "learning_rate": 6.267665443674413e-06, + "loss": 4.1239, + "step": 43965 + }, + { + "epoch": 2.9874983013996466, + "grad_norm": 0.27369993925094604, + "learning_rate": 6.267240793586086e-06, + "loss": 4.1691, + "step": 43970 + }, + { + "epoch": 2.987838021470308, + "grad_norm": 0.2536536455154419, + "learning_rate": 6.266816143497758e-06, + "loss": 4.0093, + "step": 43975 + }, + { + "epoch": 2.9881777415409703, + "grad_norm": 0.25461527705192566, + "learning_rate": 6.266391493409431e-06, + "loss": 3.982, + "step": 43980 + }, + { + "epoch": 2.988517461611632, + "grad_norm": 0.24422451853752136, + "learning_rate": 6.265966843321104e-06, + "loss": 3.8644, + "step": 43985 + }, + { + "epoch": 2.9888571816822935, + "grad_norm": 0.5754057168960571, + "learning_rate": 6.265542193232776e-06, + "loss": 4.3306, + "step": 43990 + }, + { + "epoch": 2.9891969017529556, + "grad_norm": 0.38818761706352234, + "learning_rate": 6.26511754314445e-06, + "loss": 4.312, + "step": 43995 + }, + { + "epoch": 2.9895366218236172, + "grad_norm": 0.3518742024898529, + "learning_rate": 6.264692893056123e-06, + "loss": 3.9536, + "step": 44000 + }, + { + "epoch": 2.989876341894279, + "grad_norm": 0.370165079832077, + "learning_rate": 6.2642682429677945e-06, + "loss": 4.257, + "step": 44005 + }, + { + "epoch": 2.990216061964941, + "grad_norm": 0.39717039465904236, + "learning_rate": 6.263843592879468e-06, + "loss": 3.8386, + "step": 44010 + }, + { + "epoch": 2.9905557820356026, + "grad_norm": 0.25895291566848755, + "learning_rate": 6.263418942791141e-06, + "loss": 4.1914, + "step": 44015 + }, + { + "epoch": 2.990895502106264, + "grad_norm": 0.2554032802581787, + "learning_rate": 6.262994292702813e-06, + "loss": 3.9653, + "step": 44020 + }, + { + "epoch": 2.9912352221769263, + "grad_norm": 0.25651687383651733, + "learning_rate": 6.262569642614487e-06, + "loss": 3.9725, + "step": 44025 + }, + { + "epoch": 2.991574942247588, + "grad_norm": 0.30608808994293213, + "learning_rate": 6.262144992526159e-06, + "loss": 4.4216, + "step": 44030 + }, + { + "epoch": 2.9919146623182495, + "grad_norm": 0.30543282628059387, + "learning_rate": 6.261720342437831e-06, + "loss": 4.0837, + "step": 44035 + }, + { + "epoch": 2.9922543823889116, + "grad_norm": 0.2745923697948456, + "learning_rate": 6.261295692349505e-06, + "loss": 3.9986, + "step": 44040 + }, + { + "epoch": 2.9925941024595732, + "grad_norm": 0.24451331794261932, + "learning_rate": 6.260871042261178e-06, + "loss": 4.2115, + "step": 44045 + }, + { + "epoch": 2.992933822530235, + "grad_norm": 0.5734487771987915, + "learning_rate": 6.26044639217285e-06, + "loss": 4.2437, + "step": 44050 + }, + { + "epoch": 2.993273542600897, + "grad_norm": 0.5262256264686584, + "learning_rate": 6.260021742084523e-06, + "loss": 4.3311, + "step": 44055 + }, + { + "epoch": 2.9936132626715586, + "grad_norm": 0.33301812410354614, + "learning_rate": 6.259597091996195e-06, + "loss": 4.2536, + "step": 44060 + }, + { + "epoch": 2.99395298274222, + "grad_norm": 0.3570989668369293, + "learning_rate": 6.259172441907868e-06, + "loss": 3.8111, + "step": 44065 + }, + { + "epoch": 2.9942927028128823, + "grad_norm": 0.26334863901138306, + "learning_rate": 6.258747791819542e-06, + "loss": 4.0691, + "step": 44070 + }, + { + "epoch": 2.994632422883544, + "grad_norm": 0.5909995436668396, + "learning_rate": 6.258323141731214e-06, + "loss": 4.0272, + "step": 44075 + }, + { + "epoch": 2.9949721429542056, + "grad_norm": 0.3457583487033844, + "learning_rate": 6.2578984916428865e-06, + "loss": 4.1478, + "step": 44080 + }, + { + "epoch": 2.9953118630248676, + "grad_norm": 0.380750834941864, + "learning_rate": 6.25747384155456e-06, + "loss": 4.2725, + "step": 44085 + }, + { + "epoch": 2.9956515830955293, + "grad_norm": 0.5014374256134033, + "learning_rate": 6.257049191466232e-06, + "loss": 4.037, + "step": 44090 + }, + { + "epoch": 2.995991303166191, + "grad_norm": 0.28197816014289856, + "learning_rate": 6.256624541377905e-06, + "loss": 4.1473, + "step": 44095 + }, + { + "epoch": 2.996331023236853, + "grad_norm": 0.29361692070961, + "learning_rate": 6.256199891289579e-06, + "loss": 4.1502, + "step": 44100 + }, + { + "epoch": 2.9966707433075146, + "grad_norm": 0.28540802001953125, + "learning_rate": 6.2557752412012505e-06, + "loss": 4.1884, + "step": 44105 + }, + { + "epoch": 2.9970104633781762, + "grad_norm": 0.29913339018821716, + "learning_rate": 6.255350591112923e-06, + "loss": 4.0891, + "step": 44110 + }, + { + "epoch": 2.9973501834488383, + "grad_norm": 0.34914451837539673, + "learning_rate": 6.254925941024597e-06, + "loss": 3.943, + "step": 44115 + }, + { + "epoch": 2.9976899035195, + "grad_norm": 0.4796534478664398, + "learning_rate": 6.254501290936269e-06, + "loss": 4.1178, + "step": 44120 + }, + { + "epoch": 2.9980296235901616, + "grad_norm": 0.26901787519454956, + "learning_rate": 6.254076640847942e-06, + "loss": 4.1302, + "step": 44125 + }, + { + "epoch": 2.9983693436608236, + "grad_norm": 0.43198761343955994, + "learning_rate": 6.2536519907596146e-06, + "loss": 4.1601, + "step": 44130 + }, + { + "epoch": 2.9987090637314853, + "grad_norm": 0.4135466516017914, + "learning_rate": 6.253227340671287e-06, + "loss": 4.0228, + "step": 44135 + }, + { + "epoch": 2.999048783802147, + "grad_norm": 0.3002406060695648, + "learning_rate": 6.25280269058296e-06, + "loss": 4.125, + "step": 44140 + }, + { + "epoch": 2.999388503872809, + "grad_norm": 0.2931835651397705, + "learning_rate": 6.252378040494633e-06, + "loss": 3.9671, + "step": 44145 + }, + { + "epoch": 2.9997282239434706, + "grad_norm": 0.28410419821739197, + "learning_rate": 6.251953390406306e-06, + "loss": 3.9487, + "step": 44150 + }, + { + "epoch": 3.0, + "eval_bertscore": { + "f1": 0.8212621033578686, + "precision": 0.8179725552266023, + "recall": 0.8259640674233402 + }, + "eval_bleu_4": 0.0038861388123395436, + "eval_exact_match": 0.0, + "eval_loss": 3.8639140129089355, + "eval_meteor": 0.06735385040939107, + "eval_rouge": { + "rouge1": 0.1055211996780733, + "rouge2": 0.008603980092310573, + "rougeL": 0.08514021717169461, + "rougeLsum": 0.08519111945492262 + }, + "eval_runtime": 274.7613, + "eval_samples_per_second": 37.556, + "eval_steps_per_second": 4.695, + "step": 44154 + }, + { + "epoch": 3.0000679440141322, + "grad_norm": 0.3501611649990082, + "learning_rate": 6.251528740317978e-06, + "loss": 4.0481, + "step": 44155 + }, + { + "epoch": 3.0004076640847943, + "grad_norm": 0.30670157074928284, + "learning_rate": 6.251104090229651e-06, + "loss": 4.0616, + "step": 44160 + }, + { + "epoch": 3.000747384155456, + "grad_norm": 0.3260643184185028, + "learning_rate": 6.250679440141324e-06, + "loss": 4.1239, + "step": 44165 + }, + { + "epoch": 3.0010871042261176, + "grad_norm": 0.32072293758392334, + "learning_rate": 6.250254790052996e-06, + "loss": 3.9435, + "step": 44170 + }, + { + "epoch": 3.0014268242967796, + "grad_norm": 0.37347063422203064, + "learning_rate": 6.24983013996467e-06, + "loss": 4.0716, + "step": 44175 + }, + { + "epoch": 3.0017665443674413, + "grad_norm": 0.3204927444458008, + "learning_rate": 6.2494054898763426e-06, + "loss": 4.1721, + "step": 44180 + }, + { + "epoch": 3.002106264438103, + "grad_norm": 0.4598839282989502, + "learning_rate": 6.2489808397880145e-06, + "loss": 3.7673, + "step": 44185 + }, + { + "epoch": 3.002445984508765, + "grad_norm": 0.2597476541996002, + "learning_rate": 6.248556189699688e-06, + "loss": 4.176, + "step": 44190 + }, + { + "epoch": 3.0027857045794266, + "grad_norm": 0.34709542989730835, + "learning_rate": 6.248131539611361e-06, + "loss": 4.0058, + "step": 44195 + }, + { + "epoch": 3.0031254246500882, + "grad_norm": 0.27033472061157227, + "learning_rate": 6.247706889523033e-06, + "loss": 3.8855, + "step": 44200 + }, + { + "epoch": 3.0034651447207503, + "grad_norm": 0.46271035075187683, + "learning_rate": 6.2472822394347066e-06, + "loss": 4.1531, + "step": 44205 + }, + { + "epoch": 3.003804864791412, + "grad_norm": 0.29825592041015625, + "learning_rate": 6.246857589346379e-06, + "loss": 4.2472, + "step": 44210 + }, + { + "epoch": 3.0041445848620736, + "grad_norm": 0.2901274561882019, + "learning_rate": 6.246432939258051e-06, + "loss": 4.0098, + "step": 44215 + }, + { + "epoch": 3.004484304932735, + "grad_norm": 0.2834618389606476, + "learning_rate": 6.246008289169725e-06, + "loss": 4.038, + "step": 44220 + }, + { + "epoch": 3.0048240250033973, + "grad_norm": 0.3435004651546478, + "learning_rate": 6.245583639081397e-06, + "loss": 4.1739, + "step": 44225 + }, + { + "epoch": 3.005163745074059, + "grad_norm": 0.31433942914009094, + "learning_rate": 6.24515898899307e-06, + "loss": 3.9456, + "step": 44230 + }, + { + "epoch": 3.0055034651447206, + "grad_norm": 0.5816453099250793, + "learning_rate": 6.244734338904743e-06, + "loss": 4.0438, + "step": 44235 + }, + { + "epoch": 3.0058431852153826, + "grad_norm": 0.3467179536819458, + "learning_rate": 6.244309688816415e-06, + "loss": 4.0306, + "step": 44240 + }, + { + "epoch": 3.0061829052860443, + "grad_norm": 0.47570016980171204, + "learning_rate": 6.243885038728088e-06, + "loss": 4.1874, + "step": 44245 + }, + { + "epoch": 3.006522625356706, + "grad_norm": 0.42170873284339905, + "learning_rate": 6.243460388639762e-06, + "loss": 4.078, + "step": 44250 + }, + { + "epoch": 3.006862345427368, + "grad_norm": 0.3147519826889038, + "learning_rate": 6.243035738551434e-06, + "loss": 4.0023, + "step": 44255 + }, + { + "epoch": 3.0072020654980296, + "grad_norm": 0.3976505696773529, + "learning_rate": 6.2426110884631065e-06, + "loss": 4.2383, + "step": 44260 + }, + { + "epoch": 3.0075417855686912, + "grad_norm": 0.31064289808273315, + "learning_rate": 6.24218643837478e-06, + "loss": 4.0696, + "step": 44265 + }, + { + "epoch": 3.0078815056393533, + "grad_norm": 0.328553169965744, + "learning_rate": 6.241761788286452e-06, + "loss": 3.9082, + "step": 44270 + }, + { + "epoch": 3.008221225710015, + "grad_norm": 0.4793787896633148, + "learning_rate": 6.241337138198125e-06, + "loss": 4.0139, + "step": 44275 + }, + { + "epoch": 3.0085609457806766, + "grad_norm": 0.39482876658439636, + "learning_rate": 6.2409124881097986e-06, + "loss": 4.0148, + "step": 44280 + }, + { + "epoch": 3.0089006658513386, + "grad_norm": 0.38112467527389526, + "learning_rate": 6.2404878380214705e-06, + "loss": 3.8827, + "step": 44285 + }, + { + "epoch": 3.0092403859220003, + "grad_norm": 0.8114576935768127, + "learning_rate": 6.240063187933144e-06, + "loss": 3.8405, + "step": 44290 + }, + { + "epoch": 3.009580105992662, + "grad_norm": 0.2691807746887207, + "learning_rate": 6.239638537844816e-06, + "loss": 4.1092, + "step": 44295 + }, + { + "epoch": 3.009919826063324, + "grad_norm": 0.2958657741546631, + "learning_rate": 6.239213887756489e-06, + "loss": 3.9337, + "step": 44300 + }, + { + "epoch": 3.0102595461339856, + "grad_norm": 0.38561728596687317, + "learning_rate": 6.238789237668163e-06, + "loss": 3.957, + "step": 44305 + }, + { + "epoch": 3.0105992662046472, + "grad_norm": 0.38338804244995117, + "learning_rate": 6.2383645875798345e-06, + "loss": 4.2641, + "step": 44310 + }, + { + "epoch": 3.0109389862753093, + "grad_norm": 0.3289751410484314, + "learning_rate": 6.237939937491507e-06, + "loss": 4.1482, + "step": 44315 + }, + { + "epoch": 3.011278706345971, + "grad_norm": 0.3209465444087982, + "learning_rate": 6.237515287403181e-06, + "loss": 4.0468, + "step": 44320 + }, + { + "epoch": 3.0116184264166326, + "grad_norm": 0.2747717797756195, + "learning_rate": 6.237090637314853e-06, + "loss": 4.0792, + "step": 44325 + }, + { + "epoch": 3.0119581464872947, + "grad_norm": 0.3187781870365143, + "learning_rate": 6.236665987226526e-06, + "loss": 4.1814, + "step": 44330 + }, + { + "epoch": 3.0122978665579563, + "grad_norm": 0.28947877883911133, + "learning_rate": 6.236241337138199e-06, + "loss": 4.0557, + "step": 44335 + }, + { + "epoch": 3.012637586628618, + "grad_norm": 0.3333393633365631, + "learning_rate": 6.235816687049871e-06, + "loss": 3.9614, + "step": 44340 + }, + { + "epoch": 3.01297730669928, + "grad_norm": 0.421230286359787, + "learning_rate": 6.235392036961544e-06, + "loss": 4.1539, + "step": 44345 + }, + { + "epoch": 3.0133170267699416, + "grad_norm": 0.31205061078071594, + "learning_rate": 6.234967386873218e-06, + "loss": 4.0584, + "step": 44350 + }, + { + "epoch": 3.0136567468406033, + "grad_norm": 0.3109869062900543, + "learning_rate": 6.23454273678489e-06, + "loss": 4.0653, + "step": 44355 + }, + { + "epoch": 3.0139964669112653, + "grad_norm": 0.2536885142326355, + "learning_rate": 6.2341180866965625e-06, + "loss": 4.1655, + "step": 44360 + }, + { + "epoch": 3.014336186981927, + "grad_norm": 0.23751765489578247, + "learning_rate": 6.233693436608236e-06, + "loss": 3.8646, + "step": 44365 + }, + { + "epoch": 3.0146759070525886, + "grad_norm": 0.32765355706214905, + "learning_rate": 6.233268786519908e-06, + "loss": 3.9171, + "step": 44370 + }, + { + "epoch": 3.01501562712325, + "grad_norm": 0.2551412582397461, + "learning_rate": 6.232844136431581e-06, + "loss": 4.2346, + "step": 44375 + }, + { + "epoch": 3.0153553471939123, + "grad_norm": 0.3150400221347809, + "learning_rate": 6.232419486343254e-06, + "loss": 3.9068, + "step": 44380 + }, + { + "epoch": 3.015695067264574, + "grad_norm": 0.32944291830062866, + "learning_rate": 6.2319948362549265e-06, + "loss": 4.1634, + "step": 44385 + }, + { + "epoch": 3.0160347873352356, + "grad_norm": 0.33226704597473145, + "learning_rate": 6.231570186166599e-06, + "loss": 4.1022, + "step": 44390 + }, + { + "epoch": 3.0163745074058976, + "grad_norm": 0.26700177788734436, + "learning_rate": 6.231145536078272e-06, + "loss": 3.9544, + "step": 44395 + }, + { + "epoch": 3.0167142274765593, + "grad_norm": 0.5916354060173035, + "learning_rate": 6.230720885989945e-06, + "loss": 4.2728, + "step": 44400 + }, + { + "epoch": 3.017053947547221, + "grad_norm": 0.2901079058647156, + "learning_rate": 6.230296235901617e-06, + "loss": 3.9118, + "step": 44405 + }, + { + "epoch": 3.017393667617883, + "grad_norm": 0.285693496465683, + "learning_rate": 6.2298715858132905e-06, + "loss": 3.88, + "step": 44410 + }, + { + "epoch": 3.0177333876885446, + "grad_norm": 0.31521594524383545, + "learning_rate": 6.229446935724963e-06, + "loss": 4.1902, + "step": 44415 + }, + { + "epoch": 3.0180731077592062, + "grad_norm": 0.3587312400341034, + "learning_rate": 6.229022285636635e-06, + "loss": 4.1411, + "step": 44420 + }, + { + "epoch": 3.0184128278298683, + "grad_norm": 0.33299165964126587, + "learning_rate": 6.228597635548309e-06, + "loss": 4.2555, + "step": 44425 + }, + { + "epoch": 3.01875254790053, + "grad_norm": 0.2522968649864197, + "learning_rate": 6.228172985459982e-06, + "loss": 4.1785, + "step": 44430 + }, + { + "epoch": 3.0190922679711916, + "grad_norm": 0.2582986354827881, + "learning_rate": 6.227748335371654e-06, + "loss": 4.0084, + "step": 44435 + }, + { + "epoch": 3.0194319880418536, + "grad_norm": 0.34414437413215637, + "learning_rate": 6.227323685283327e-06, + "loss": 4.2732, + "step": 44440 + }, + { + "epoch": 3.0197717081125153, + "grad_norm": 0.2930225729942322, + "learning_rate": 6.226899035195e-06, + "loss": 4.1831, + "step": 44445 + }, + { + "epoch": 3.020111428183177, + "grad_norm": 0.338127464056015, + "learning_rate": 6.226474385106672e-06, + "loss": 4.2252, + "step": 44450 + }, + { + "epoch": 3.020451148253839, + "grad_norm": 0.3261042833328247, + "learning_rate": 6.226049735018346e-06, + "loss": 3.9738, + "step": 44455 + }, + { + "epoch": 3.0207908683245006, + "grad_norm": 0.331245094537735, + "learning_rate": 6.2256250849300186e-06, + "loss": 3.9457, + "step": 44460 + }, + { + "epoch": 3.0211305883951622, + "grad_norm": 0.3300244212150574, + "learning_rate": 6.2252004348416905e-06, + "loss": 4.1857, + "step": 44465 + }, + { + "epoch": 3.0214703084658243, + "grad_norm": 0.387504905462265, + "learning_rate": 6.224775784753364e-06, + "loss": 4.2861, + "step": 44470 + }, + { + "epoch": 3.021810028536486, + "grad_norm": 0.2643010914325714, + "learning_rate": 6.224351134665036e-06, + "loss": 4.1047, + "step": 44475 + }, + { + "epoch": 3.0221497486071476, + "grad_norm": 0.400867223739624, + "learning_rate": 6.223926484576709e-06, + "loss": 4.135, + "step": 44480 + }, + { + "epoch": 3.0224894686778097, + "grad_norm": 0.3217700719833374, + "learning_rate": 6.2235018344883826e-06, + "loss": 4.3079, + "step": 44485 + }, + { + "epoch": 3.0228291887484713, + "grad_norm": 0.24836939573287964, + "learning_rate": 6.2230771844000545e-06, + "loss": 3.8196, + "step": 44490 + }, + { + "epoch": 3.023168908819133, + "grad_norm": 0.3005322515964508, + "learning_rate": 6.222652534311727e-06, + "loss": 4.1674, + "step": 44495 + }, + { + "epoch": 3.023508628889795, + "grad_norm": 0.2432934194803238, + "learning_rate": 6.222227884223401e-06, + "loss": 4.055, + "step": 44500 + }, + { + "epoch": 3.0238483489604566, + "grad_norm": 0.44044914841651917, + "learning_rate": 6.221888164152738e-06, + "loss": 4.1292, + "step": 44505 + }, + { + "epoch": 3.0241880690311183, + "grad_norm": 0.6052452325820923, + "learning_rate": 6.221463514064412e-06, + "loss": 4.2037, + "step": 44510 + }, + { + "epoch": 3.0245277891017803, + "grad_norm": 0.2959815561771393, + "learning_rate": 6.221038863976085e-06, + "loss": 4.0531, + "step": 44515 + }, + { + "epoch": 3.024867509172442, + "grad_norm": 0.3820519745349884, + "learning_rate": 6.220614213887757e-06, + "loss": 4.0743, + "step": 44520 + }, + { + "epoch": 3.0252072292431036, + "grad_norm": 0.8502155542373657, + "learning_rate": 6.22018956379943e-06, + "loss": 4.207, + "step": 44525 + }, + { + "epoch": 3.0255469493137657, + "grad_norm": 0.3065190017223358, + "learning_rate": 6.219764913711103e-06, + "loss": 4.2524, + "step": 44530 + }, + { + "epoch": 3.0258866693844273, + "grad_norm": 0.3913699984550476, + "learning_rate": 6.219340263622775e-06, + "loss": 3.9845, + "step": 44535 + }, + { + "epoch": 3.026226389455089, + "grad_norm": 0.21215897798538208, + "learning_rate": 6.218915613534449e-06, + "loss": 4.4199, + "step": 44540 + }, + { + "epoch": 3.026566109525751, + "grad_norm": 0.30598339438438416, + "learning_rate": 6.2184909634461214e-06, + "loss": 3.9566, + "step": 44545 + }, + { + "epoch": 3.0269058295964126, + "grad_norm": 0.44269633293151855, + "learning_rate": 6.218066313357793e-06, + "loss": 4.0788, + "step": 44550 + }, + { + "epoch": 3.0272455496670743, + "grad_norm": 0.3247036933898926, + "learning_rate": 6.217641663269467e-06, + "loss": 3.9908, + "step": 44555 + }, + { + "epoch": 3.027585269737736, + "grad_norm": 0.3331852853298187, + "learning_rate": 6.217217013181139e-06, + "loss": 3.9922, + "step": 44560 + }, + { + "epoch": 3.027924989808398, + "grad_norm": 0.33676204085350037, + "learning_rate": 6.216792363092812e-06, + "loss": 4.007, + "step": 44565 + }, + { + "epoch": 3.0282647098790596, + "grad_norm": 0.3165758550167084, + "learning_rate": 6.2163677130044854e-06, + "loss": 3.9665, + "step": 44570 + }, + { + "epoch": 3.0286044299497212, + "grad_norm": 0.2706071436405182, + "learning_rate": 6.215943062916157e-06, + "loss": 3.989, + "step": 44575 + }, + { + "epoch": 3.0289441500203833, + "grad_norm": 0.3612626791000366, + "learning_rate": 6.21551841282783e-06, + "loss": 3.9087, + "step": 44580 + }, + { + "epoch": 3.029283870091045, + "grad_norm": 0.6768820285797119, + "learning_rate": 6.215093762739504e-06, + "loss": 4.0611, + "step": 44585 + }, + { + "epoch": 3.0296235901617066, + "grad_norm": 0.3488738536834717, + "learning_rate": 6.214669112651176e-06, + "loss": 4.0477, + "step": 44590 + }, + { + "epoch": 3.0299633102323686, + "grad_norm": 0.34495773911476135, + "learning_rate": 6.214244462562849e-06, + "loss": 3.9629, + "step": 44595 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.3170544505119324, + "learning_rate": 6.213819812474522e-06, + "loss": 4.0684, + "step": 44600 + }, + { + "epoch": 3.030642750373692, + "grad_norm": 0.3294026255607605, + "learning_rate": 6.213395162386194e-06, + "loss": 3.9822, + "step": 44605 + }, + { + "epoch": 3.030982470444354, + "grad_norm": 0.2644574046134949, + "learning_rate": 6.212970512297867e-06, + "loss": 4.0001, + "step": 44610 + }, + { + "epoch": 3.0313221905150156, + "grad_norm": 0.4590366780757904, + "learning_rate": 6.212545862209541e-06, + "loss": 4.1051, + "step": 44615 + }, + { + "epoch": 3.0316619105856772, + "grad_norm": 0.33929404616355896, + "learning_rate": 6.212121212121213e-06, + "loss": 3.9965, + "step": 44620 + }, + { + "epoch": 3.0320016306563393, + "grad_norm": 0.3620201349258423, + "learning_rate": 6.211696562032885e-06, + "loss": 4.224, + "step": 44625 + }, + { + "epoch": 3.032341350727001, + "grad_norm": 0.388290673494339, + "learning_rate": 6.211271911944558e-06, + "loss": 4.225, + "step": 44630 + }, + { + "epoch": 3.0326810707976626, + "grad_norm": 0.3392608165740967, + "learning_rate": 6.210847261856231e-06, + "loss": 4.0269, + "step": 44635 + }, + { + "epoch": 3.0330207908683247, + "grad_norm": 0.3799680471420288, + "learning_rate": 6.210422611767904e-06, + "loss": 3.5759, + "step": 44640 + }, + { + "epoch": 3.0333605109389863, + "grad_norm": 0.3201201856136322, + "learning_rate": 6.209997961679577e-06, + "loss": 4.1607, + "step": 44645 + }, + { + "epoch": 3.033700231009648, + "grad_norm": 0.32567763328552246, + "learning_rate": 6.209573311591249e-06, + "loss": 3.7951, + "step": 44650 + }, + { + "epoch": 3.03403995108031, + "grad_norm": 0.34542590379714966, + "learning_rate": 6.209148661502921e-06, + "loss": 3.9398, + "step": 44655 + }, + { + "epoch": 3.0343796711509716, + "grad_norm": 0.30021020770072937, + "learning_rate": 6.208724011414595e-06, + "loss": 3.9248, + "step": 44660 + }, + { + "epoch": 3.0347193912216333, + "grad_norm": 0.5526005625724792, + "learning_rate": 6.208299361326268e-06, + "loss": 4.2011, + "step": 44665 + }, + { + "epoch": 3.0350591112922953, + "grad_norm": 0.3613058626651764, + "learning_rate": 6.20787471123794e-06, + "loss": 3.7475, + "step": 44670 + }, + { + "epoch": 3.035398831362957, + "grad_norm": 0.2487121969461441, + "learning_rate": 6.207450061149613e-06, + "loss": 3.9886, + "step": 44675 + }, + { + "epoch": 3.0357385514336186, + "grad_norm": 0.3573257029056549, + "learning_rate": 6.207025411061286e-06, + "loss": 4.0675, + "step": 44680 + }, + { + "epoch": 3.0360782715042807, + "grad_norm": 0.29942286014556885, + "learning_rate": 6.206600760972958e-06, + "loss": 4.1118, + "step": 44685 + }, + { + "epoch": 3.0364179915749423, + "grad_norm": 0.3751446604728699, + "learning_rate": 6.206176110884632e-06, + "loss": 3.941, + "step": 44690 + }, + { + "epoch": 3.036757711645604, + "grad_norm": 0.5862172245979309, + "learning_rate": 6.205751460796305e-06, + "loss": 3.9164, + "step": 44695 + }, + { + "epoch": 3.037097431716266, + "grad_norm": 0.49230897426605225, + "learning_rate": 6.2053268107079766e-06, + "loss": 4.0655, + "step": 44700 + }, + { + "epoch": 3.0374371517869276, + "grad_norm": 0.45832693576812744, + "learning_rate": 6.20490216061965e-06, + "loss": 3.8844, + "step": 44705 + }, + { + "epoch": 3.0377768718575893, + "grad_norm": 0.2463107705116272, + "learning_rate": 6.204477510531323e-06, + "loss": 4.1757, + "step": 44710 + }, + { + "epoch": 3.038116591928251, + "grad_norm": 0.6024778485298157, + "learning_rate": 6.204052860442995e-06, + "loss": 4.0491, + "step": 44715 + }, + { + "epoch": 3.038456311998913, + "grad_norm": 0.28856441378593445, + "learning_rate": 6.203628210354669e-06, + "loss": 4.0132, + "step": 44720 + }, + { + "epoch": 3.0387960320695746, + "grad_norm": 0.3073683977127075, + "learning_rate": 6.2032035602663406e-06, + "loss": 3.8135, + "step": 44725 + }, + { + "epoch": 3.0391357521402362, + "grad_norm": 0.3565156161785126, + "learning_rate": 6.202778910178013e-06, + "loss": 4.1629, + "step": 44730 + }, + { + "epoch": 3.0394754722108983, + "grad_norm": 0.2753666043281555, + "learning_rate": 6.202354260089687e-06, + "loss": 4.2969, + "step": 44735 + }, + { + "epoch": 3.03981519228156, + "grad_norm": 0.5140369534492493, + "learning_rate": 6.201929610001359e-06, + "loss": 3.8859, + "step": 44740 + }, + { + "epoch": 3.0401549123522216, + "grad_norm": 0.44430863857269287, + "learning_rate": 6.201504959913032e-06, + "loss": 4.176, + "step": 44745 + }, + { + "epoch": 3.0404946324228836, + "grad_norm": 0.20257627964019775, + "learning_rate": 6.2010803098247054e-06, + "loss": 4.0779, + "step": 44750 + }, + { + "epoch": 3.0408343524935453, + "grad_norm": 0.2673858106136322, + "learning_rate": 6.200655659736377e-06, + "loss": 3.7562, + "step": 44755 + }, + { + "epoch": 3.041174072564207, + "grad_norm": 0.3111318051815033, + "learning_rate": 6.20023100964805e-06, + "loss": 3.9616, + "step": 44760 + }, + { + "epoch": 3.041513792634869, + "grad_norm": 0.28266939520835876, + "learning_rate": 6.199806359559724e-06, + "loss": 3.9387, + "step": 44765 + }, + { + "epoch": 3.0418535127055306, + "grad_norm": 0.5092897415161133, + "learning_rate": 6.199381709471396e-06, + "loss": 3.7054, + "step": 44770 + }, + { + "epoch": 3.0421932327761922, + "grad_norm": 0.4218597114086151, + "learning_rate": 6.198957059383069e-06, + "loss": 4.0916, + "step": 44775 + }, + { + "epoch": 3.0425329528468543, + "grad_norm": 0.38128697872161865, + "learning_rate": 6.198532409294742e-06, + "loss": 3.9179, + "step": 44780 + }, + { + "epoch": 3.042872672917516, + "grad_norm": 0.29522812366485596, + "learning_rate": 6.198107759206414e-06, + "loss": 3.9593, + "step": 44785 + }, + { + "epoch": 3.0432123929881776, + "grad_norm": 0.3035309910774231, + "learning_rate": 6.197683109118087e-06, + "loss": 3.661, + "step": 44790 + }, + { + "epoch": 3.0435521130588397, + "grad_norm": 0.2757706344127655, + "learning_rate": 6.19725845902976e-06, + "loss": 4.0267, + "step": 44795 + }, + { + "epoch": 3.0438918331295013, + "grad_norm": 0.35891374945640564, + "learning_rate": 6.196833808941433e-06, + "loss": 4.2299, + "step": 44800 + }, + { + "epoch": 3.044231553200163, + "grad_norm": 0.3745458424091339, + "learning_rate": 6.196409158853105e-06, + "loss": 4.0275, + "step": 44805 + }, + { + "epoch": 3.044571273270825, + "grad_norm": 0.24983344972133636, + "learning_rate": 6.195984508764778e-06, + "loss": 4.0973, + "step": 44810 + }, + { + "epoch": 3.0449109933414866, + "grad_norm": 0.2988947927951813, + "learning_rate": 6.195559858676451e-06, + "loss": 4.2541, + "step": 44815 + }, + { + "epoch": 3.0452507134121483, + "grad_norm": 0.3634129464626312, + "learning_rate": 6.195135208588123e-06, + "loss": 4.146, + "step": 44820 + }, + { + "epoch": 3.0455904334828103, + "grad_norm": 0.22377733886241913, + "learning_rate": 6.194710558499797e-06, + "loss": 4.0597, + "step": 44825 + }, + { + "epoch": 3.045930153553472, + "grad_norm": 0.33179551362991333, + "learning_rate": 6.194285908411469e-06, + "loss": 4.3088, + "step": 44830 + }, + { + "epoch": 3.0462698736241336, + "grad_norm": 0.3004407584667206, + "learning_rate": 6.193861258323143e-06, + "loss": 4.0753, + "step": 44835 + }, + { + "epoch": 3.0466095936947957, + "grad_norm": 0.5144370794296265, + "learning_rate": 6.193436608234815e-06, + "loss": 3.9024, + "step": 44840 + }, + { + "epoch": 3.0469493137654573, + "grad_norm": 0.28277653455734253, + "learning_rate": 6.193011958146488e-06, + "loss": 3.9298, + "step": 44845 + }, + { + "epoch": 3.047289033836119, + "grad_norm": 0.24418668448925018, + "learning_rate": 6.1925873080581614e-06, + "loss": 4.08, + "step": 44850 + }, + { + "epoch": 3.047628753906781, + "grad_norm": 0.24452583491802216, + "learning_rate": 6.192162657969833e-06, + "loss": 3.9492, + "step": 44855 + }, + { + "epoch": 3.0479684739774426, + "grad_norm": 0.2727963924407959, + "learning_rate": 6.191738007881506e-06, + "loss": 4.0633, + "step": 44860 + }, + { + "epoch": 3.0483081940481043, + "grad_norm": 0.561783492565155, + "learning_rate": 6.19131335779318e-06, + "loss": 4.2622, + "step": 44865 + }, + { + "epoch": 3.0486479141187663, + "grad_norm": 0.2605483829975128, + "learning_rate": 6.190888707704852e-06, + "loss": 3.9267, + "step": 44870 + }, + { + "epoch": 3.048987634189428, + "grad_norm": 0.37209683656692505, + "learning_rate": 6.190464057616525e-06, + "loss": 3.8945, + "step": 44875 + }, + { + "epoch": 3.0493273542600896, + "grad_norm": 0.5564342737197876, + "learning_rate": 6.190039407528197e-06, + "loss": 4.0057, + "step": 44880 + }, + { + "epoch": 3.0496670743307517, + "grad_norm": 0.3533412218093872, + "learning_rate": 6.18961475743987e-06, + "loss": 4.1479, + "step": 44885 + }, + { + "epoch": 3.0500067944014133, + "grad_norm": 0.32777127623558044, + "learning_rate": 6.189190107351543e-06, + "loss": 4.1817, + "step": 44890 + }, + { + "epoch": 3.050346514472075, + "grad_norm": 0.28981542587280273, + "learning_rate": 6.188765457263216e-06, + "loss": 3.9214, + "step": 44895 + }, + { + "epoch": 3.0506862345427366, + "grad_norm": 0.3344077467918396, + "learning_rate": 6.188340807174889e-06, + "loss": 4.1357, + "step": 44900 + }, + { + "epoch": 3.0510259546133986, + "grad_norm": 0.25108227133750916, + "learning_rate": 6.1879161570865606e-06, + "loss": 3.7962, + "step": 44905 + }, + { + "epoch": 3.0513656746840603, + "grad_norm": 0.32419681549072266, + "learning_rate": 6.187491506998234e-06, + "loss": 4.0012, + "step": 44910 + }, + { + "epoch": 3.051705394754722, + "grad_norm": 0.25206562876701355, + "learning_rate": 6.187066856909907e-06, + "loss": 4.0533, + "step": 44915 + }, + { + "epoch": 3.052045114825384, + "grad_norm": 0.25081220269203186, + "learning_rate": 6.186642206821579e-06, + "loss": 4.1552, + "step": 44920 + }, + { + "epoch": 3.0523848348960456, + "grad_norm": 0.289016991853714, + "learning_rate": 6.186217556733253e-06, + "loss": 4.0436, + "step": 44925 + }, + { + "epoch": 3.0527245549667072, + "grad_norm": 0.3089035153388977, + "learning_rate": 6.185792906644925e-06, + "loss": 4.1648, + "step": 44930 + }, + { + "epoch": 3.0530642750373693, + "grad_norm": 0.3340117633342743, + "learning_rate": 6.185368256556597e-06, + "loss": 3.817, + "step": 44935 + }, + { + "epoch": 3.053403995108031, + "grad_norm": 0.3157886862754822, + "learning_rate": 6.184943606468271e-06, + "loss": 3.9082, + "step": 44940 + }, + { + "epoch": 3.0537437151786926, + "grad_norm": 0.2616835832595825, + "learning_rate": 6.184518956379944e-06, + "loss": 4.1285, + "step": 44945 + }, + { + "epoch": 3.0540834352493547, + "grad_norm": 0.27244555950164795, + "learning_rate": 6.184094306291616e-06, + "loss": 4.0464, + "step": 44950 + }, + { + "epoch": 3.0544231553200163, + "grad_norm": 0.3075932562351227, + "learning_rate": 6.183669656203289e-06, + "loss": 4.068, + "step": 44955 + }, + { + "epoch": 3.054762875390678, + "grad_norm": 0.20183423161506653, + "learning_rate": 6.183245006114962e-06, + "loss": 4.0956, + "step": 44960 + }, + { + "epoch": 3.05510259546134, + "grad_norm": 0.3452366292476654, + "learning_rate": 6.182820356026634e-06, + "loss": 4.105, + "step": 44965 + }, + { + "epoch": 3.0554423155320016, + "grad_norm": 0.33364948630332947, + "learning_rate": 6.182395705938308e-06, + "loss": 3.884, + "step": 44970 + }, + { + "epoch": 3.0557820356026633, + "grad_norm": 0.34618860483169556, + "learning_rate": 6.18197105584998e-06, + "loss": 4.2741, + "step": 44975 + }, + { + "epoch": 3.0561217556733253, + "grad_norm": 0.28193557262420654, + "learning_rate": 6.1815464057616526e-06, + "loss": 4.1907, + "step": 44980 + }, + { + "epoch": 3.056461475743987, + "grad_norm": 0.26008450984954834, + "learning_rate": 6.181121755673326e-06, + "loss": 4.0904, + "step": 44985 + }, + { + "epoch": 3.0568011958146486, + "grad_norm": 0.24840299785137177, + "learning_rate": 6.180697105584998e-06, + "loss": 4.0821, + "step": 44990 + }, + { + "epoch": 3.0571409158853107, + "grad_norm": 0.3647990822792053, + "learning_rate": 6.180272455496671e-06, + "loss": 3.8641, + "step": 44995 + }, + { + "epoch": 3.0574806359559723, + "grad_norm": 0.25318145751953125, + "learning_rate": 6.179847805408345e-06, + "loss": 4.1341, + "step": 45000 + }, + { + "epoch": 3.057820356026634, + "grad_norm": 0.25981736183166504, + "learning_rate": 6.1794231553200166e-06, + "loss": 3.9375, + "step": 45005 + }, + { + "epoch": 3.058160076097296, + "grad_norm": 0.30152028799057007, + "learning_rate": 6.178998505231689e-06, + "loss": 3.8337, + "step": 45010 + }, + { + "epoch": 3.0584997961679576, + "grad_norm": 0.2315729707479477, + "learning_rate": 6.178573855143363e-06, + "loss": 4.1147, + "step": 45015 + }, + { + "epoch": 3.0588395162386193, + "grad_norm": 0.3193207085132599, + "learning_rate": 6.178149205055035e-06, + "loss": 3.9444, + "step": 45020 + }, + { + "epoch": 3.0591792363092813, + "grad_norm": 0.3436634838581085, + "learning_rate": 6.177724554966708e-06, + "loss": 4.0526, + "step": 45025 + }, + { + "epoch": 3.059518956379943, + "grad_norm": 0.30699828267097473, + "learning_rate": 6.177299904878381e-06, + "loss": 4.1707, + "step": 45030 + }, + { + "epoch": 3.0598586764506046, + "grad_norm": 0.36400994658470154, + "learning_rate": 6.176875254790053e-06, + "loss": 4.1855, + "step": 45035 + }, + { + "epoch": 3.0601983965212667, + "grad_norm": 0.4154624938964844, + "learning_rate": 6.176450604701726e-06, + "loss": 4.0704, + "step": 45040 + }, + { + "epoch": 3.0605381165919283, + "grad_norm": 0.30780676007270813, + "learning_rate": 6.176025954613399e-06, + "loss": 3.7317, + "step": 45045 + }, + { + "epoch": 3.06087783666259, + "grad_norm": 0.32011619210243225, + "learning_rate": 6.175601304525072e-06, + "loss": 4.1009, + "step": 45050 + }, + { + "epoch": 3.0612175567332516, + "grad_norm": 0.28661608695983887, + "learning_rate": 6.175176654436745e-06, + "loss": 3.9352, + "step": 45055 + }, + { + "epoch": 3.0615572768039137, + "grad_norm": 0.3274683952331543, + "learning_rate": 6.174752004348417e-06, + "loss": 4.1327, + "step": 45060 + }, + { + "epoch": 3.0618969968745753, + "grad_norm": 0.3365553915500641, + "learning_rate": 6.17432735426009e-06, + "loss": 4.0288, + "step": 45065 + }, + { + "epoch": 3.062236716945237, + "grad_norm": 0.47228965163230896, + "learning_rate": 6.173902704171762e-06, + "loss": 4.1317, + "step": 45070 + }, + { + "epoch": 3.062576437015899, + "grad_norm": 0.309785932302475, + "learning_rate": 6.173478054083436e-06, + "loss": 4.1873, + "step": 45075 + }, + { + "epoch": 3.0629161570865606, + "grad_norm": 0.27704954147338867, + "learning_rate": 6.173053403995109e-06, + "loss": 3.9611, + "step": 45080 + }, + { + "epoch": 3.0632558771572223, + "grad_norm": 0.2946705222129822, + "learning_rate": 6.1726287539067805e-06, + "loss": 4.3659, + "step": 45085 + }, + { + "epoch": 3.0635955972278843, + "grad_norm": 0.4489670693874359, + "learning_rate": 6.172204103818454e-06, + "loss": 4.1706, + "step": 45090 + }, + { + "epoch": 3.063935317298546, + "grad_norm": 0.24512355029582977, + "learning_rate": 6.171779453730127e-06, + "loss": 3.9493, + "step": 45095 + }, + { + "epoch": 3.0642750373692076, + "grad_norm": 0.33992043137550354, + "learning_rate": 6.171354803641799e-06, + "loss": 4.1707, + "step": 45100 + }, + { + "epoch": 3.0646147574398697, + "grad_norm": 0.37474751472473145, + "learning_rate": 6.170930153553473e-06, + "loss": 4.0786, + "step": 45105 + }, + { + "epoch": 3.0649544775105313, + "grad_norm": 0.291967511177063, + "learning_rate": 6.170505503465145e-06, + "loss": 4.2654, + "step": 45110 + }, + { + "epoch": 3.065294197581193, + "grad_norm": 0.3526178300380707, + "learning_rate": 6.170080853376817e-06, + "loss": 3.9046, + "step": 45115 + }, + { + "epoch": 3.065633917651855, + "grad_norm": 0.2575472295284271, + "learning_rate": 6.169656203288491e-06, + "loss": 4.145, + "step": 45120 + }, + { + "epoch": 3.0659736377225166, + "grad_norm": 0.32188278436660767, + "learning_rate": 6.169231553200164e-06, + "loss": 3.915, + "step": 45125 + }, + { + "epoch": 3.0663133577931783, + "grad_norm": 0.350328266620636, + "learning_rate": 6.168806903111836e-06, + "loss": 4.0183, + "step": 45130 + }, + { + "epoch": 3.0666530778638403, + "grad_norm": 0.3540175259113312, + "learning_rate": 6.168382253023509e-06, + "loss": 4.0188, + "step": 45135 + }, + { + "epoch": 3.066992797934502, + "grad_norm": 0.39142778515815735, + "learning_rate": 6.167957602935181e-06, + "loss": 4.2332, + "step": 45140 + }, + { + "epoch": 3.0673325180051636, + "grad_norm": 0.34079059958457947, + "learning_rate": 6.167532952846854e-06, + "loss": 3.894, + "step": 45145 + }, + { + "epoch": 3.0676722380758257, + "grad_norm": 0.31508392095565796, + "learning_rate": 6.167108302758528e-06, + "loss": 4.0386, + "step": 45150 + }, + { + "epoch": 3.0680119581464873, + "grad_norm": 0.42288345098495483, + "learning_rate": 6.1666836526702e-06, + "loss": 4.0923, + "step": 45155 + }, + { + "epoch": 3.068351678217149, + "grad_norm": 0.2698819041252136, + "learning_rate": 6.1662590025818725e-06, + "loss": 3.9584, + "step": 45160 + }, + { + "epoch": 3.068691398287811, + "grad_norm": 0.32553228735923767, + "learning_rate": 6.165834352493546e-06, + "loss": 4.0327, + "step": 45165 + }, + { + "epoch": 3.0690311183584726, + "grad_norm": 0.47816938161849976, + "learning_rate": 6.165409702405218e-06, + "loss": 4.3192, + "step": 45170 + }, + { + "epoch": 3.0693708384291343, + "grad_norm": 0.35380133986473083, + "learning_rate": 6.164985052316892e-06, + "loss": 4.0464, + "step": 45175 + }, + { + "epoch": 3.0697105584997963, + "grad_norm": 0.4149528741836548, + "learning_rate": 6.164560402228565e-06, + "loss": 4.1393, + "step": 45180 + }, + { + "epoch": 3.070050278570458, + "grad_norm": 0.32884082198143005, + "learning_rate": 6.1641357521402365e-06, + "loss": 4.2665, + "step": 45185 + }, + { + "epoch": 3.0703899986411196, + "grad_norm": 0.396927148103714, + "learning_rate": 6.16371110205191e-06, + "loss": 4.0625, + "step": 45190 + }, + { + "epoch": 3.0707297187117817, + "grad_norm": 0.39789026975631714, + "learning_rate": 6.163286451963583e-06, + "loss": 3.9207, + "step": 45195 + }, + { + "epoch": 3.0710694387824433, + "grad_norm": 0.3192601799964905, + "learning_rate": 6.162861801875255e-06, + "loss": 4.1008, + "step": 45200 + }, + { + "epoch": 3.071409158853105, + "grad_norm": 0.445340096950531, + "learning_rate": 6.162437151786929e-06, + "loss": 3.9616, + "step": 45205 + }, + { + "epoch": 3.071748878923767, + "grad_norm": 0.3681430220603943, + "learning_rate": 6.162012501698601e-06, + "loss": 4.0337, + "step": 45210 + }, + { + "epoch": 3.0720885989944287, + "grad_norm": 0.3458237051963806, + "learning_rate": 6.161587851610273e-06, + "loss": 4.1023, + "step": 45215 + }, + { + "epoch": 3.0724283190650903, + "grad_norm": 0.44518452882766724, + "learning_rate": 6.161163201521947e-06, + "loss": 4.0171, + "step": 45220 + }, + { + "epoch": 3.0727680391357524, + "grad_norm": 0.2585323452949524, + "learning_rate": 6.160738551433619e-06, + "loss": 4.0811, + "step": 45225 + }, + { + "epoch": 3.073107759206414, + "grad_norm": 0.30787238478660583, + "learning_rate": 6.160313901345292e-06, + "loss": 3.8733, + "step": 45230 + }, + { + "epoch": 3.0734474792770756, + "grad_norm": 0.5151342749595642, + "learning_rate": 6.159889251256965e-06, + "loss": 3.7901, + "step": 45235 + }, + { + "epoch": 3.0737871993477373, + "grad_norm": 0.28946051001548767, + "learning_rate": 6.159464601168637e-06, + "loss": 3.9341, + "step": 45240 + }, + { + "epoch": 3.0741269194183993, + "grad_norm": 0.3847355544567108, + "learning_rate": 6.15903995108031e-06, + "loss": 4.2488, + "step": 45245 + }, + { + "epoch": 3.074466639489061, + "grad_norm": 0.3739905059337616, + "learning_rate": 6.158615300991984e-06, + "loss": 4.4327, + "step": 45250 + }, + { + "epoch": 3.0748063595597226, + "grad_norm": 0.2758461833000183, + "learning_rate": 6.158190650903656e-06, + "loss": 4.0057, + "step": 45255 + }, + { + "epoch": 3.0751460796303847, + "grad_norm": 0.3115977942943573, + "learning_rate": 6.1577660008153286e-06, + "loss": 4.1736, + "step": 45260 + }, + { + "epoch": 3.0754857997010463, + "grad_norm": 0.33170321583747864, + "learning_rate": 6.157341350727002e-06, + "loss": 4.021, + "step": 45265 + }, + { + "epoch": 3.075825519771708, + "grad_norm": 0.4229245185852051, + "learning_rate": 6.156916700638674e-06, + "loss": 4.1432, + "step": 45270 + }, + { + "epoch": 3.07616523984237, + "grad_norm": 0.30959928035736084, + "learning_rate": 6.156492050550347e-06, + "loss": 4.1296, + "step": 45275 + }, + { + "epoch": 3.0765049599130316, + "grad_norm": 0.3186522126197815, + "learning_rate": 6.156067400462021e-06, + "loss": 3.9487, + "step": 45280 + }, + { + "epoch": 3.0768446799836933, + "grad_norm": 0.2706434428691864, + "learning_rate": 6.1556427503736926e-06, + "loss": 4.154, + "step": 45285 + }, + { + "epoch": 3.0771844000543553, + "grad_norm": 0.25024092197418213, + "learning_rate": 6.155218100285365e-06, + "loss": 4.1583, + "step": 45290 + }, + { + "epoch": 3.077524120125017, + "grad_norm": 0.40901148319244385, + "learning_rate": 6.154793450197038e-06, + "loss": 4.0423, + "step": 45295 + }, + { + "epoch": 3.0778638401956786, + "grad_norm": 0.26574358344078064, + "learning_rate": 6.154368800108711e-06, + "loss": 3.9956, + "step": 45300 + }, + { + "epoch": 3.0782035602663407, + "grad_norm": 0.274812787771225, + "learning_rate": 6.153944150020384e-06, + "loss": 3.9649, + "step": 45305 + }, + { + "epoch": 3.0785432803370023, + "grad_norm": 0.3986132740974426, + "learning_rate": 6.1535194999320566e-06, + "loss": 4.0902, + "step": 45310 + }, + { + "epoch": 3.078883000407664, + "grad_norm": 0.2684265375137329, + "learning_rate": 6.153094849843729e-06, + "loss": 4.0151, + "step": 45315 + }, + { + "epoch": 3.079222720478326, + "grad_norm": 0.26901066303253174, + "learning_rate": 6.152670199755401e-06, + "loss": 3.906, + "step": 45320 + }, + { + "epoch": 3.0795624405489876, + "grad_norm": 0.29834192991256714, + "learning_rate": 6.152245549667075e-06, + "loss": 3.9039, + "step": 45325 + }, + { + "epoch": 3.0799021606196493, + "grad_norm": 0.31592440605163574, + "learning_rate": 6.151820899578748e-06, + "loss": 4.0387, + "step": 45330 + }, + { + "epoch": 3.0802418806903114, + "grad_norm": 0.27636170387268066, + "learning_rate": 6.15139624949042e-06, + "loss": 4.1026, + "step": 45335 + }, + { + "epoch": 3.080581600760973, + "grad_norm": 0.2955247461795807, + "learning_rate": 6.150971599402093e-06, + "loss": 3.9522, + "step": 45340 + }, + { + "epoch": 3.0809213208316346, + "grad_norm": 0.25864332914352417, + "learning_rate": 6.150546949313766e-06, + "loss": 4.2827, + "step": 45345 + }, + { + "epoch": 3.0812610409022967, + "grad_norm": 0.33150264620780945, + "learning_rate": 6.150122299225438e-06, + "loss": 4.0116, + "step": 45350 + }, + { + "epoch": 3.0816007609729583, + "grad_norm": 0.5191275477409363, + "learning_rate": 6.149697649137112e-06, + "loss": 4.2664, + "step": 45355 + }, + { + "epoch": 3.08194048104362, + "grad_norm": 0.4212564527988434, + "learning_rate": 6.149272999048785e-06, + "loss": 4.0946, + "step": 45360 + }, + { + "epoch": 3.082280201114282, + "grad_norm": 0.4377128779888153, + "learning_rate": 6.1488483489604565e-06, + "loss": 4.105, + "step": 45365 + }, + { + "epoch": 3.0826199211849437, + "grad_norm": 0.33359581232070923, + "learning_rate": 6.14842369887213e-06, + "loss": 4.0296, + "step": 45370 + }, + { + "epoch": 3.0829596412556053, + "grad_norm": 0.4333612620830536, + "learning_rate": 6.147999048783803e-06, + "loss": 3.8338, + "step": 45375 + }, + { + "epoch": 3.0832993613262674, + "grad_norm": 0.39010730385780334, + "learning_rate": 6.147574398695475e-06, + "loss": 4.0711, + "step": 45380 + }, + { + "epoch": 3.083639081396929, + "grad_norm": 0.24163539707660675, + "learning_rate": 6.147149748607149e-06, + "loss": 3.8932, + "step": 45385 + }, + { + "epoch": 3.0839788014675906, + "grad_norm": 0.47997987270355225, + "learning_rate": 6.1467250985188205e-06, + "loss": 4.174, + "step": 45390 + }, + { + "epoch": 3.0843185215382523, + "grad_norm": 0.4625537097454071, + "learning_rate": 6.146300448430493e-06, + "loss": 4.2038, + "step": 45395 + }, + { + "epoch": 3.0846582416089143, + "grad_norm": 0.3253684937953949, + "learning_rate": 6.145875798342167e-06, + "loss": 4.104, + "step": 45400 + }, + { + "epoch": 3.084997961679576, + "grad_norm": 0.5627989768981934, + "learning_rate": 6.145451148253839e-06, + "loss": 3.9999, + "step": 45405 + }, + { + "epoch": 3.0853376817502376, + "grad_norm": 0.40988603234291077, + "learning_rate": 6.145026498165512e-06, + "loss": 4.094, + "step": 45410 + }, + { + "epoch": 3.0856774018208997, + "grad_norm": 0.32651352882385254, + "learning_rate": 6.144601848077185e-06, + "loss": 4.184, + "step": 45415 + }, + { + "epoch": 3.0860171218915613, + "grad_norm": 0.3080008625984192, + "learning_rate": 6.144177197988857e-06, + "loss": 3.9984, + "step": 45420 + }, + { + "epoch": 3.086356841962223, + "grad_norm": 0.4208436906337738, + "learning_rate": 6.14375254790053e-06, + "loss": 3.9986, + "step": 45425 + }, + { + "epoch": 3.086696562032885, + "grad_norm": 0.3058658540248871, + "learning_rate": 6.143327897812204e-06, + "loss": 4.0908, + "step": 45430 + }, + { + "epoch": 3.0870362821035466, + "grad_norm": 0.2319546639919281, + "learning_rate": 6.142903247723876e-06, + "loss": 4.0923, + "step": 45435 + }, + { + "epoch": 3.0873760021742083, + "grad_norm": 0.2223261296749115, + "learning_rate": 6.1424785976355485e-06, + "loss": 3.9683, + "step": 45440 + }, + { + "epoch": 3.0877157222448703, + "grad_norm": 0.3855245113372803, + "learning_rate": 6.142053947547222e-06, + "loss": 3.9886, + "step": 45445 + }, + { + "epoch": 3.088055442315532, + "grad_norm": 0.33222290873527527, + "learning_rate": 6.141629297458894e-06, + "loss": 4.3723, + "step": 45450 + }, + { + "epoch": 3.0883951623861936, + "grad_norm": 0.48143360018730164, + "learning_rate": 6.141204647370567e-06, + "loss": 3.8619, + "step": 45455 + }, + { + "epoch": 3.0887348824568557, + "grad_norm": 0.30359652638435364, + "learning_rate": 6.14077999728224e-06, + "loss": 3.912, + "step": 45460 + }, + { + "epoch": 3.0890746025275173, + "grad_norm": 0.4757236838340759, + "learning_rate": 6.1403553471939125e-06, + "loss": 4.0506, + "step": 45465 + }, + { + "epoch": 3.089414322598179, + "grad_norm": 0.3015609681606293, + "learning_rate": 6.139930697105585e-06, + "loss": 4.1082, + "step": 45470 + }, + { + "epoch": 3.089754042668841, + "grad_norm": 0.30534598231315613, + "learning_rate": 6.139506047017258e-06, + "loss": 3.9161, + "step": 45475 + }, + { + "epoch": 3.0900937627395026, + "grad_norm": 0.3156137764453888, + "learning_rate": 6.139081396928931e-06, + "loss": 4.0587, + "step": 45480 + }, + { + "epoch": 3.0904334828101643, + "grad_norm": 0.2953508496284485, + "learning_rate": 6.138656746840603e-06, + "loss": 3.8756, + "step": 45485 + }, + { + "epoch": 3.0907732028808264, + "grad_norm": 0.3507824242115021, + "learning_rate": 6.1382320967522765e-06, + "loss": 4.0282, + "step": 45490 + }, + { + "epoch": 3.091112922951488, + "grad_norm": 0.3079257309436798, + "learning_rate": 6.137807446663949e-06, + "loss": 4.0145, + "step": 45495 + }, + { + "epoch": 3.0914526430221496, + "grad_norm": 0.41376766562461853, + "learning_rate": 6.137382796575621e-06, + "loss": 4.0383, + "step": 45500 + }, + { + "epoch": 3.0917923630928117, + "grad_norm": 0.32969194650650024, + "learning_rate": 6.136958146487295e-06, + "loss": 3.9858, + "step": 45505 + }, + { + "epoch": 3.0921320831634733, + "grad_norm": 0.40564101934432983, + "learning_rate": 6.136533496398968e-06, + "loss": 4.2529, + "step": 45510 + }, + { + "epoch": 3.092471803234135, + "grad_norm": 0.28160223364830017, + "learning_rate": 6.136108846310641e-06, + "loss": 4.0862, + "step": 45515 + }, + { + "epoch": 3.092811523304797, + "grad_norm": 0.24162788689136505, + "learning_rate": 6.135684196222313e-06, + "loss": 4.1285, + "step": 45520 + }, + { + "epoch": 3.0931512433754587, + "grad_norm": 0.3072498142719269, + "learning_rate": 6.135259546133986e-06, + "loss": 4.0815, + "step": 45525 + }, + { + "epoch": 3.0934909634461203, + "grad_norm": 0.3921680748462677, + "learning_rate": 6.13483489604566e-06, + "loss": 4.15, + "step": 45530 + }, + { + "epoch": 3.0938306835167824, + "grad_norm": 0.41272222995758057, + "learning_rate": 6.134410245957332e-06, + "loss": 4.2163, + "step": 45535 + }, + { + "epoch": 3.094170403587444, + "grad_norm": 0.29948362708091736, + "learning_rate": 6.1339855958690046e-06, + "loss": 4.2079, + "step": 45540 + }, + { + "epoch": 3.0945101236581056, + "grad_norm": 0.2685193121433258, + "learning_rate": 6.133560945780677e-06, + "loss": 4.3168, + "step": 45545 + }, + { + "epoch": 3.0948498437287677, + "grad_norm": 0.3208426535129547, + "learning_rate": 6.13313629569235e-06, + "loss": 3.9806, + "step": 45550 + }, + { + "epoch": 3.0951895637994293, + "grad_norm": 0.2845105528831482, + "learning_rate": 6.132711645604023e-06, + "loss": 3.9827, + "step": 45555 + }, + { + "epoch": 3.095529283870091, + "grad_norm": 0.2643812298774719, + "learning_rate": 6.132286995515696e-06, + "loss": 4.2197, + "step": 45560 + }, + { + "epoch": 3.095869003940753, + "grad_norm": 0.335784912109375, + "learning_rate": 6.1318623454273686e-06, + "loss": 4.0124, + "step": 45565 + }, + { + "epoch": 3.0962087240114147, + "grad_norm": 0.3181460201740265, + "learning_rate": 6.1314376953390405e-06, + "loss": 3.8958, + "step": 45570 + }, + { + "epoch": 3.0965484440820763, + "grad_norm": 0.2805441617965698, + "learning_rate": 6.131013045250714e-06, + "loss": 3.8173, + "step": 45575 + }, + { + "epoch": 3.096888164152738, + "grad_norm": 0.32550036907196045, + "learning_rate": 6.130588395162387e-06, + "loss": 3.8659, + "step": 45580 + }, + { + "epoch": 3.0972278842234, + "grad_norm": 0.2539219856262207, + "learning_rate": 6.130163745074059e-06, + "loss": 4.1439, + "step": 45585 + }, + { + "epoch": 3.0975676042940616, + "grad_norm": 0.40947988629341125, + "learning_rate": 6.1297390949857326e-06, + "loss": 3.9997, + "step": 45590 + }, + { + "epoch": 3.0979073243647233, + "grad_norm": 0.3214080333709717, + "learning_rate": 6.129314444897405e-06, + "loss": 4.0109, + "step": 45595 + }, + { + "epoch": 3.0982470444353853, + "grad_norm": 0.5252742171287537, + "learning_rate": 6.128889794809077e-06, + "loss": 3.8933, + "step": 45600 + }, + { + "epoch": 3.098586764506047, + "grad_norm": 0.19226281344890594, + "learning_rate": 6.128465144720751e-06, + "loss": 3.9437, + "step": 45605 + }, + { + "epoch": 3.0989264845767086, + "grad_norm": 0.29697856307029724, + "learning_rate": 6.128040494632424e-06, + "loss": 3.8878, + "step": 45610 + }, + { + "epoch": 3.0992662046473707, + "grad_norm": 0.42898011207580566, + "learning_rate": 6.127615844544096e-06, + "loss": 3.835, + "step": 45615 + }, + { + "epoch": 3.0996059247180323, + "grad_norm": 0.23624323308467865, + "learning_rate": 6.127191194455769e-06, + "loss": 4.1721, + "step": 45620 + }, + { + "epoch": 3.099945644788694, + "grad_norm": 0.4295189380645752, + "learning_rate": 6.126766544367442e-06, + "loss": 4.215, + "step": 45625 + }, + { + "epoch": 3.100285364859356, + "grad_norm": 0.33971264958381653, + "learning_rate": 6.126341894279114e-06, + "loss": 3.869, + "step": 45630 + }, + { + "epoch": 3.1006250849300176, + "grad_norm": 0.28942644596099854, + "learning_rate": 6.125917244190788e-06, + "loss": 4.132, + "step": 45635 + }, + { + "epoch": 3.1009648050006793, + "grad_norm": 0.2798900306224823, + "learning_rate": 6.12549259410246e-06, + "loss": 3.6961, + "step": 45640 + }, + { + "epoch": 3.1013045250713414, + "grad_norm": 0.7082122564315796, + "learning_rate": 6.1250679440141325e-06, + "loss": 4.136, + "step": 45645 + }, + { + "epoch": 3.101644245142003, + "grad_norm": 0.39356738328933716, + "learning_rate": 6.124643293925806e-06, + "loss": 3.8637, + "step": 45650 + }, + { + "epoch": 3.1019839652126646, + "grad_norm": 0.2991182208061218, + "learning_rate": 6.124218643837478e-06, + "loss": 3.9143, + "step": 45655 + }, + { + "epoch": 3.1023236852833267, + "grad_norm": 0.37739279866218567, + "learning_rate": 6.123793993749151e-06, + "loss": 4.2621, + "step": 45660 + }, + { + "epoch": 3.1026634053539883, + "grad_norm": 0.39426323771476746, + "learning_rate": 6.123369343660825e-06, + "loss": 3.9075, + "step": 45665 + }, + { + "epoch": 3.10300312542465, + "grad_norm": 0.35651642084121704, + "learning_rate": 6.1229446935724965e-06, + "loss": 4.023, + "step": 45670 + }, + { + "epoch": 3.103342845495312, + "grad_norm": 0.2850717008113861, + "learning_rate": 6.122520043484169e-06, + "loss": 4.1063, + "step": 45675 + }, + { + "epoch": 3.1036825655659737, + "grad_norm": 0.3377261459827423, + "learning_rate": 6.122095393395843e-06, + "loss": 4.1438, + "step": 45680 + }, + { + "epoch": 3.1040222856366353, + "grad_norm": 0.22573018074035645, + "learning_rate": 6.121670743307515e-06, + "loss": 4.2106, + "step": 45685 + }, + { + "epoch": 3.1043620057072974, + "grad_norm": 0.3714926838874817, + "learning_rate": 6.121246093219188e-06, + "loss": 4.2204, + "step": 45690 + }, + { + "epoch": 3.104701725777959, + "grad_norm": 0.4607090950012207, + "learning_rate": 6.120821443130861e-06, + "loss": 4.1367, + "step": 45695 + }, + { + "epoch": 3.1050414458486206, + "grad_norm": 0.360028475522995, + "learning_rate": 6.120396793042533e-06, + "loss": 4.2596, + "step": 45700 + }, + { + "epoch": 3.1053811659192827, + "grad_norm": 0.39575397968292236, + "learning_rate": 6.119972142954206e-06, + "loss": 4.0889, + "step": 45705 + }, + { + "epoch": 3.1057208859899443, + "grad_norm": 0.33402690291404724, + "learning_rate": 6.119547492865879e-06, + "loss": 4.1501, + "step": 45710 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.3084780275821686, + "learning_rate": 6.119122842777552e-06, + "loss": 3.8726, + "step": 45715 + }, + { + "epoch": 3.106400326131268, + "grad_norm": 0.2837364077568054, + "learning_rate": 6.1186981926892245e-06, + "loss": 3.8831, + "step": 45720 + }, + { + "epoch": 3.1067400462019297, + "grad_norm": 0.2237514704465866, + "learning_rate": 6.118273542600897e-06, + "loss": 4.2191, + "step": 45725 + }, + { + "epoch": 3.1070797662725913, + "grad_norm": 0.29260873794555664, + "learning_rate": 6.11784889251257e-06, + "loss": 3.9325, + "step": 45730 + }, + { + "epoch": 3.107419486343253, + "grad_norm": 0.3160875141620636, + "learning_rate": 6.117424242424242e-06, + "loss": 3.9225, + "step": 45735 + }, + { + "epoch": 3.107759206413915, + "grad_norm": 0.456554114818573, + "learning_rate": 6.116999592335916e-06, + "loss": 4.1324, + "step": 45740 + }, + { + "epoch": 3.1080989264845766, + "grad_norm": 0.3221472501754761, + "learning_rate": 6.1165749422475885e-06, + "loss": 4.2296, + "step": 45745 + }, + { + "epoch": 3.1084386465552383, + "grad_norm": 0.4002121686935425, + "learning_rate": 6.1161502921592605e-06, + "loss": 4.2445, + "step": 45750 + }, + { + "epoch": 3.1087783666259003, + "grad_norm": 0.3191263973712921, + "learning_rate": 6.115725642070934e-06, + "loss": 3.8553, + "step": 45755 + }, + { + "epoch": 3.109118086696562, + "grad_norm": 0.5329229831695557, + "learning_rate": 6.115300991982607e-06, + "loss": 4.0496, + "step": 45760 + }, + { + "epoch": 3.1094578067672236, + "grad_norm": 0.326102077960968, + "learning_rate": 6.114876341894279e-06, + "loss": 4.0184, + "step": 45765 + }, + { + "epoch": 3.1097975268378857, + "grad_norm": 0.3364032208919525, + "learning_rate": 6.1144516918059525e-06, + "loss": 4.1009, + "step": 45770 + }, + { + "epoch": 3.1101372469085473, + "grad_norm": 0.3396734297275543, + "learning_rate": 6.114027041717625e-06, + "loss": 4.0477, + "step": 45775 + }, + { + "epoch": 3.110476966979209, + "grad_norm": 0.21736203134059906, + "learning_rate": 6.113602391629297e-06, + "loss": 4.0381, + "step": 45780 + }, + { + "epoch": 3.110816687049871, + "grad_norm": 0.3899634778499603, + "learning_rate": 6.113177741540971e-06, + "loss": 4.143, + "step": 45785 + }, + { + "epoch": 3.1111564071205327, + "grad_norm": 0.291987806558609, + "learning_rate": 6.112753091452644e-06, + "loss": 4.2588, + "step": 45790 + }, + { + "epoch": 3.1114961271911943, + "grad_norm": 0.4886905252933502, + "learning_rate": 6.112328441364316e-06, + "loss": 4.2731, + "step": 45795 + }, + { + "epoch": 3.1118358472618564, + "grad_norm": 0.3504076302051544, + "learning_rate": 6.111903791275989e-06, + "loss": 4.2658, + "step": 45800 + }, + { + "epoch": 3.112175567332518, + "grad_norm": 0.2666422426700592, + "learning_rate": 6.111479141187661e-06, + "loss": 4.2687, + "step": 45805 + }, + { + "epoch": 3.1125152874031796, + "grad_norm": 0.28810638189315796, + "learning_rate": 6.111054491099334e-06, + "loss": 4.0057, + "step": 45810 + }, + { + "epoch": 3.1128550074738417, + "grad_norm": 0.29785484075546265, + "learning_rate": 6.110629841011008e-06, + "loss": 4.0407, + "step": 45815 + }, + { + "epoch": 3.1131947275445033, + "grad_norm": 0.33200690150260925, + "learning_rate": 6.11020519092268e-06, + "loss": 4.1931, + "step": 45820 + }, + { + "epoch": 3.113534447615165, + "grad_norm": 0.37364527583122253, + "learning_rate": 6.1097805408343525e-06, + "loss": 3.9095, + "step": 45825 + }, + { + "epoch": 3.113874167685827, + "grad_norm": 0.21655714511871338, + "learning_rate": 6.109355890746026e-06, + "loss": 4.2129, + "step": 45830 + }, + { + "epoch": 3.1142138877564887, + "grad_norm": 0.28328800201416016, + "learning_rate": 6.108931240657698e-06, + "loss": 4.0733, + "step": 45835 + }, + { + "epoch": 3.1145536078271503, + "grad_norm": 0.30544495582580566, + "learning_rate": 6.108506590569371e-06, + "loss": 4.2618, + "step": 45840 + }, + { + "epoch": 3.1148933278978124, + "grad_norm": 0.29669755697250366, + "learning_rate": 6.1080819404810446e-06, + "loss": 4.0501, + "step": 45845 + }, + { + "epoch": 3.115233047968474, + "grad_norm": 0.3130665421485901, + "learning_rate": 6.1076572903927165e-06, + "loss": 4.0152, + "step": 45850 + }, + { + "epoch": 3.1155727680391356, + "grad_norm": 0.3375866115093231, + "learning_rate": 6.10723264030439e-06, + "loss": 4.2403, + "step": 45855 + }, + { + "epoch": 3.1159124881097977, + "grad_norm": 0.32667845487594604, + "learning_rate": 6.106807990216063e-06, + "loss": 3.8439, + "step": 45860 + }, + { + "epoch": 3.1162522081804593, + "grad_norm": 0.24027979373931885, + "learning_rate": 6.106383340127735e-06, + "loss": 4.1662, + "step": 45865 + }, + { + "epoch": 3.116591928251121, + "grad_norm": 0.3576103448867798, + "learning_rate": 6.1059586900394086e-06, + "loss": 3.9135, + "step": 45870 + }, + { + "epoch": 3.116931648321783, + "grad_norm": 0.2369670867919922, + "learning_rate": 6.105534039951081e-06, + "loss": 3.9125, + "step": 45875 + }, + { + "epoch": 3.1172713683924447, + "grad_norm": 0.3938811123371124, + "learning_rate": 6.105109389862753e-06, + "loss": 4.149, + "step": 45880 + }, + { + "epoch": 3.1176110884631063, + "grad_norm": 0.3464202880859375, + "learning_rate": 6.104684739774427e-06, + "loss": 4.005, + "step": 45885 + }, + { + "epoch": 3.1179508085337684, + "grad_norm": 0.3154645264148712, + "learning_rate": 6.104260089686099e-06, + "loss": 4.36, + "step": 45890 + }, + { + "epoch": 3.11829052860443, + "grad_norm": 0.30196699500083923, + "learning_rate": 6.103835439597772e-06, + "loss": 3.8778, + "step": 45895 + }, + { + "epoch": 3.1186302486750916, + "grad_norm": 0.49348849058151245, + "learning_rate": 6.103410789509445e-06, + "loss": 4.1981, + "step": 45900 + }, + { + "epoch": 3.1189699687457537, + "grad_norm": 0.243957981467247, + "learning_rate": 6.102986139421117e-06, + "loss": 4.0615, + "step": 45905 + }, + { + "epoch": 3.1193096888164153, + "grad_norm": 0.4616289436817169, + "learning_rate": 6.10256148933279e-06, + "loss": 4.1689, + "step": 45910 + }, + { + "epoch": 3.119649408887077, + "grad_norm": 0.31531503796577454, + "learning_rate": 6.102136839244464e-06, + "loss": 4.0908, + "step": 45915 + }, + { + "epoch": 3.1199891289577386, + "grad_norm": 0.29298776388168335, + "learning_rate": 6.101712189156136e-06, + "loss": 4.0112, + "step": 45920 + }, + { + "epoch": 3.1203288490284007, + "grad_norm": 0.28685855865478516, + "learning_rate": 6.1012875390678085e-06, + "loss": 3.8207, + "step": 45925 + }, + { + "epoch": 3.1206685690990623, + "grad_norm": 0.5733582973480225, + "learning_rate": 6.100862888979482e-06, + "loss": 3.9378, + "step": 45930 + }, + { + "epoch": 3.121008289169724, + "grad_norm": 0.3146815001964569, + "learning_rate": 6.100438238891154e-06, + "loss": 3.9642, + "step": 45935 + }, + { + "epoch": 3.121348009240386, + "grad_norm": 0.36934590339660645, + "learning_rate": 6.100013588802827e-06, + "loss": 4.0995, + "step": 45940 + }, + { + "epoch": 3.1216877293110477, + "grad_norm": 0.3306015133857727, + "learning_rate": 6.0995889387145006e-06, + "loss": 4.1659, + "step": 45945 + }, + { + "epoch": 3.1220274493817093, + "grad_norm": 0.36321529746055603, + "learning_rate": 6.0991642886261725e-06, + "loss": 4.135, + "step": 45950 + }, + { + "epoch": 3.1223671694523714, + "grad_norm": 0.2740051746368408, + "learning_rate": 6.098739638537845e-06, + "loss": 3.844, + "step": 45955 + }, + { + "epoch": 3.122706889523033, + "grad_norm": 0.32132259011268616, + "learning_rate": 6.098314988449518e-06, + "loss": 4.2151, + "step": 45960 + }, + { + "epoch": 3.1230466095936946, + "grad_norm": 0.23851442337036133, + "learning_rate": 6.097890338361191e-06, + "loss": 3.9941, + "step": 45965 + }, + { + "epoch": 3.1233863296643567, + "grad_norm": 0.48225414752960205, + "learning_rate": 6.097465688272864e-06, + "loss": 3.9778, + "step": 45970 + }, + { + "epoch": 3.1237260497350183, + "grad_norm": 0.38841691613197327, + "learning_rate": 6.0970410381845365e-06, + "loss": 4.1727, + "step": 45975 + }, + { + "epoch": 3.12406576980568, + "grad_norm": 0.28789564967155457, + "learning_rate": 6.096616388096209e-06, + "loss": 3.8668, + "step": 45980 + }, + { + "epoch": 3.124405489876342, + "grad_norm": 0.3073010742664337, + "learning_rate": 6.096191738007881e-06, + "loss": 4.2257, + "step": 45985 + }, + { + "epoch": 3.1247452099470037, + "grad_norm": 0.27173906564712524, + "learning_rate": 6.095767087919555e-06, + "loss": 3.7093, + "step": 45990 + }, + { + "epoch": 3.1250849300176653, + "grad_norm": 0.24422723054885864, + "learning_rate": 6.095342437831228e-06, + "loss": 4.1286, + "step": 45995 + }, + { + "epoch": 3.1254246500883274, + "grad_norm": 0.30739688873291016, + "learning_rate": 6.0949177877429e-06, + "loss": 4.2927, + "step": 46000 + }, + { + "epoch": 3.125764370158989, + "grad_norm": 0.28265467286109924, + "learning_rate": 6.094493137654573e-06, + "loss": 4.0554, + "step": 46005 + }, + { + "epoch": 3.1261040902296506, + "grad_norm": 0.2692340612411499, + "learning_rate": 6.094068487566246e-06, + "loss": 4.0536, + "step": 46010 + }, + { + "epoch": 3.1264438103003127, + "grad_norm": 0.3006051480770111, + "learning_rate": 6.093643837477918e-06, + "loss": 3.8814, + "step": 46015 + }, + { + "epoch": 3.1267835303709743, + "grad_norm": 0.28886112570762634, + "learning_rate": 6.093219187389592e-06, + "loss": 4.1693, + "step": 46020 + }, + { + "epoch": 3.127123250441636, + "grad_norm": 0.23291821777820587, + "learning_rate": 6.0927945373012645e-06, + "loss": 3.9215, + "step": 46025 + }, + { + "epoch": 3.127462970512298, + "grad_norm": 0.37482431530952454, + "learning_rate": 6.0923698872129365e-06, + "loss": 4.0418, + "step": 46030 + }, + { + "epoch": 3.1278026905829597, + "grad_norm": 0.36613699793815613, + "learning_rate": 6.09194523712461e-06, + "loss": 4.0121, + "step": 46035 + }, + { + "epoch": 3.1281424106536213, + "grad_norm": 0.3637827932834625, + "learning_rate": 6.091520587036283e-06, + "loss": 4.112, + "step": 46040 + }, + { + "epoch": 3.1284821307242834, + "grad_norm": 0.3140016198158264, + "learning_rate": 6.091095936947955e-06, + "loss": 3.8185, + "step": 46045 + }, + { + "epoch": 3.128821850794945, + "grad_norm": 0.38199394941329956, + "learning_rate": 6.0906712868596285e-06, + "loss": 4.0936, + "step": 46050 + }, + { + "epoch": 3.1291615708656066, + "grad_norm": 0.2770847678184509, + "learning_rate": 6.0902466367713005e-06, + "loss": 3.9635, + "step": 46055 + }, + { + "epoch": 3.1295012909362687, + "grad_norm": 0.2585768699645996, + "learning_rate": 6.089821986682973e-06, + "loss": 3.9797, + "step": 46060 + }, + { + "epoch": 3.1298410110069304, + "grad_norm": 0.3813711404800415, + "learning_rate": 6.089397336594647e-06, + "loss": 4.2109, + "step": 46065 + }, + { + "epoch": 3.130180731077592, + "grad_norm": 0.28975680470466614, + "learning_rate": 6.088972686506319e-06, + "loss": 4.2708, + "step": 46070 + }, + { + "epoch": 3.1305204511482536, + "grad_norm": 0.3838997483253479, + "learning_rate": 6.088548036417992e-06, + "loss": 4.0385, + "step": 46075 + }, + { + "epoch": 3.1308601712189157, + "grad_norm": 0.32199200987815857, + "learning_rate": 6.088123386329665e-06, + "loss": 4.0758, + "step": 46080 + }, + { + "epoch": 3.1311998912895773, + "grad_norm": 0.34878459572792053, + "learning_rate": 6.087698736241337e-06, + "loss": 3.8766, + "step": 46085 + }, + { + "epoch": 3.131539611360239, + "grad_norm": 0.2767389416694641, + "learning_rate": 6.08727408615301e-06, + "loss": 4.1984, + "step": 46090 + }, + { + "epoch": 3.131879331430901, + "grad_norm": 0.2320900857448578, + "learning_rate": 6.086849436064684e-06, + "loss": 4.1529, + "step": 46095 + }, + { + "epoch": 3.1322190515015627, + "grad_norm": 0.3046022951602936, + "learning_rate": 6.086424785976356e-06, + "loss": 4.129, + "step": 46100 + }, + { + "epoch": 3.1325587715722243, + "grad_norm": 0.4521959125995636, + "learning_rate": 6.0860001358880285e-06, + "loss": 4.3658, + "step": 46105 + }, + { + "epoch": 3.1328984916428864, + "grad_norm": 0.28329893946647644, + "learning_rate": 6.085575485799702e-06, + "loss": 4.0799, + "step": 46110 + }, + { + "epoch": 3.133238211713548, + "grad_norm": 0.36158281564712524, + "learning_rate": 6.085150835711374e-06, + "loss": 4.2144, + "step": 46115 + }, + { + "epoch": 3.1335779317842096, + "grad_norm": 0.3086854815483093, + "learning_rate": 6.084726185623047e-06, + "loss": 3.9845, + "step": 46120 + }, + { + "epoch": 3.1339176518548717, + "grad_norm": 0.27334946393966675, + "learning_rate": 6.0843015355347206e-06, + "loss": 4.1892, + "step": 46125 + }, + { + "epoch": 3.1342573719255333, + "grad_norm": 0.2833721339702606, + "learning_rate": 6.0838768854463925e-06, + "loss": 3.6568, + "step": 46130 + }, + { + "epoch": 3.134597091996195, + "grad_norm": 0.3293377757072449, + "learning_rate": 6.083452235358065e-06, + "loss": 3.8771, + "step": 46135 + }, + { + "epoch": 3.134936812066857, + "grad_norm": 0.23518423736095428, + "learning_rate": 6.083027585269738e-06, + "loss": 3.7635, + "step": 46140 + }, + { + "epoch": 3.1352765321375187, + "grad_norm": 0.435173362493515, + "learning_rate": 6.082602935181411e-06, + "loss": 4.1099, + "step": 46145 + }, + { + "epoch": 3.1356162522081803, + "grad_norm": 0.2966592609882355, + "learning_rate": 6.082178285093083e-06, + "loss": 4.0237, + "step": 46150 + }, + { + "epoch": 3.1359559722788424, + "grad_norm": 0.23657195270061493, + "learning_rate": 6.0817536350047565e-06, + "loss": 4.1584, + "step": 46155 + }, + { + "epoch": 3.136295692349504, + "grad_norm": 0.31759005784988403, + "learning_rate": 6.081328984916429e-06, + "loss": 3.9037, + "step": 46160 + }, + { + "epoch": 3.1366354124201656, + "grad_norm": 0.3835636079311371, + "learning_rate": 6.080904334828101e-06, + "loss": 4.2443, + "step": 46165 + }, + { + "epoch": 3.1369751324908277, + "grad_norm": 0.30922722816467285, + "learning_rate": 6.080479684739775e-06, + "loss": 3.9236, + "step": 46170 + }, + { + "epoch": 3.1373148525614893, + "grad_norm": 0.32796627283096313, + "learning_rate": 6.080055034651448e-06, + "loss": 4.1056, + "step": 46175 + }, + { + "epoch": 3.137654572632151, + "grad_norm": 0.3680817782878876, + "learning_rate": 6.07963038456312e-06, + "loss": 3.9834, + "step": 46180 + }, + { + "epoch": 3.137994292702813, + "grad_norm": 0.2920980453491211, + "learning_rate": 6.079205734474793e-06, + "loss": 4.0275, + "step": 46185 + }, + { + "epoch": 3.1383340127734747, + "grad_norm": 0.3657244145870209, + "learning_rate": 6.078781084386466e-06, + "loss": 4.3655, + "step": 46190 + }, + { + "epoch": 3.1386737328441363, + "grad_norm": 0.2495344579219818, + "learning_rate": 6.07835643429814e-06, + "loss": 4.3319, + "step": 46195 + }, + { + "epoch": 3.1390134529147984, + "grad_norm": 0.39514485001564026, + "learning_rate": 6.077931784209812e-06, + "loss": 4.0222, + "step": 46200 + }, + { + "epoch": 3.13935317298546, + "grad_norm": 0.3249465525150299, + "learning_rate": 6.0775071341214845e-06, + "loss": 4.0394, + "step": 46205 + }, + { + "epoch": 3.1396928930561216, + "grad_norm": 0.634479284286499, + "learning_rate": 6.077082484033157e-06, + "loss": 4.0156, + "step": 46210 + }, + { + "epoch": 3.1400326131267837, + "grad_norm": 0.24345381557941437, + "learning_rate": 6.07665783394483e-06, + "loss": 4.2662, + "step": 46215 + }, + { + "epoch": 3.1403723331974454, + "grad_norm": 0.24775828421115875, + "learning_rate": 6.076233183856503e-06, + "loss": 4.1338, + "step": 46220 + }, + { + "epoch": 3.140712053268107, + "grad_norm": 0.5506097078323364, + "learning_rate": 6.075808533768176e-06, + "loss": 4.1123, + "step": 46225 + }, + { + "epoch": 3.141051773338769, + "grad_norm": 0.3346982002258301, + "learning_rate": 6.0753838836798485e-06, + "loss": 3.9326, + "step": 46230 + }, + { + "epoch": 3.1413914934094307, + "grad_norm": 0.35018277168273926, + "learning_rate": 6.0749592335915205e-06, + "loss": 4.0617, + "step": 46235 + }, + { + "epoch": 3.1417312134800923, + "grad_norm": 0.24525035917758942, + "learning_rate": 6.074534583503194e-06, + "loss": 4.0972, + "step": 46240 + }, + { + "epoch": 3.1420709335507544, + "grad_norm": 0.4471696615219116, + "learning_rate": 6.074109933414867e-06, + "loss": 4.179, + "step": 46245 + }, + { + "epoch": 3.142410653621416, + "grad_norm": 0.25572818517684937, + "learning_rate": 6.073685283326539e-06, + "loss": 3.971, + "step": 46250 + }, + { + "epoch": 3.1427503736920777, + "grad_norm": 0.3723585605621338, + "learning_rate": 6.0732606332382125e-06, + "loss": 4.0823, + "step": 46255 + }, + { + "epoch": 3.1430900937627397, + "grad_norm": 0.47143688797950745, + "learning_rate": 6.072835983149885e-06, + "loss": 4.0959, + "step": 46260 + }, + { + "epoch": 3.1434298138334014, + "grad_norm": 0.35523539781570435, + "learning_rate": 6.072411333061557e-06, + "loss": 4.1719, + "step": 46265 + }, + { + "epoch": 3.143769533904063, + "grad_norm": 0.4373340606689453, + "learning_rate": 6.071986682973231e-06, + "loss": 4.203, + "step": 46270 + }, + { + "epoch": 3.1441092539747246, + "grad_norm": 0.38786590099334717, + "learning_rate": 6.071562032884904e-06, + "loss": 4.2191, + "step": 46275 + }, + { + "epoch": 3.1444489740453867, + "grad_norm": 0.40505579113960266, + "learning_rate": 6.071137382796576e-06, + "loss": 4.0537, + "step": 46280 + }, + { + "epoch": 3.1447886941160483, + "grad_norm": 0.28554418683052063, + "learning_rate": 6.070712732708249e-06, + "loss": 4.0858, + "step": 46285 + }, + { + "epoch": 3.14512841418671, + "grad_norm": 0.2794850170612335, + "learning_rate": 6.070288082619922e-06, + "loss": 3.9335, + "step": 46290 + }, + { + "epoch": 3.145468134257372, + "grad_norm": 0.35189297795295715, + "learning_rate": 6.069863432531594e-06, + "loss": 4.0824, + "step": 46295 + }, + { + "epoch": 3.1458078543280337, + "grad_norm": 0.2948751151561737, + "learning_rate": 6.069438782443268e-06, + "loss": 4.2262, + "step": 46300 + }, + { + "epoch": 3.1461475743986953, + "grad_norm": 0.3572220802307129, + "learning_rate": 6.06901413235494e-06, + "loss": 4.2928, + "step": 46305 + }, + { + "epoch": 3.1464872944693574, + "grad_norm": 0.3152468502521515, + "learning_rate": 6.0685894822666125e-06, + "loss": 4.0948, + "step": 46310 + }, + { + "epoch": 3.146827014540019, + "grad_norm": 0.3382044732570648, + "learning_rate": 6.068164832178286e-06, + "loss": 3.8674, + "step": 46315 + }, + { + "epoch": 3.1471667346106806, + "grad_norm": 0.32020553946495056, + "learning_rate": 6.067740182089958e-06, + "loss": 4.0059, + "step": 46320 + }, + { + "epoch": 3.1475064546813427, + "grad_norm": 0.4638337790966034, + "learning_rate": 6.067315532001631e-06, + "loss": 4.3339, + "step": 46325 + }, + { + "epoch": 3.1478461747520043, + "grad_norm": 0.35201719403266907, + "learning_rate": 6.0668908819133045e-06, + "loss": 4.2243, + "step": 46330 + }, + { + "epoch": 3.148185894822666, + "grad_norm": 0.4641876220703125, + "learning_rate": 6.0664662318249765e-06, + "loss": 4.1242, + "step": 46335 + }, + { + "epoch": 3.148525614893328, + "grad_norm": 0.3560083508491516, + "learning_rate": 6.066041581736649e-06, + "loss": 4.0833, + "step": 46340 + }, + { + "epoch": 3.1488653349639897, + "grad_norm": 1.2962263822555542, + "learning_rate": 6.065616931648323e-06, + "loss": 4.1419, + "step": 46345 + }, + { + "epoch": 3.1492050550346513, + "grad_norm": 0.24191705882549286, + "learning_rate": 6.065192281559995e-06, + "loss": 3.9654, + "step": 46350 + }, + { + "epoch": 3.1495447751053134, + "grad_norm": 0.45394518971443176, + "learning_rate": 6.064767631471668e-06, + "loss": 3.9403, + "step": 46355 + }, + { + "epoch": 3.149884495175975, + "grad_norm": 0.3193761110305786, + "learning_rate": 6.064342981383341e-06, + "loss": 4.1576, + "step": 46360 + }, + { + "epoch": 3.1502242152466366, + "grad_norm": 0.2999456226825714, + "learning_rate": 6.063918331295013e-06, + "loss": 4.246, + "step": 46365 + }, + { + "epoch": 3.1505639353172987, + "grad_norm": 0.3980537950992584, + "learning_rate": 6.063493681206686e-06, + "loss": 3.9688, + "step": 46370 + }, + { + "epoch": 3.1509036553879604, + "grad_norm": 0.2707633674144745, + "learning_rate": 6.063069031118359e-06, + "loss": 3.8408, + "step": 46375 + }, + { + "epoch": 3.151243375458622, + "grad_norm": 0.2854677140712738, + "learning_rate": 6.062644381030032e-06, + "loss": 3.853, + "step": 46380 + }, + { + "epoch": 3.151583095529284, + "grad_norm": 0.31402742862701416, + "learning_rate": 6.0622197309417045e-06, + "loss": 4.0292, + "step": 46385 + }, + { + "epoch": 3.1519228155999457, + "grad_norm": 0.24182772636413574, + "learning_rate": 6.061795080853377e-06, + "loss": 3.8857, + "step": 46390 + }, + { + "epoch": 3.1522625356706073, + "grad_norm": 0.29004010558128357, + "learning_rate": 6.06137043076505e-06, + "loss": 4.1459, + "step": 46395 + }, + { + "epoch": 3.1526022557412694, + "grad_norm": 0.29065775871276855, + "learning_rate": 6.060945780676722e-06, + "loss": 4.3005, + "step": 46400 + }, + { + "epoch": 3.152941975811931, + "grad_norm": 0.45037955045700073, + "learning_rate": 6.060521130588396e-06, + "loss": 4.1598, + "step": 46405 + }, + { + "epoch": 3.1532816958825927, + "grad_norm": 0.18361927568912506, + "learning_rate": 6.0600964805000685e-06, + "loss": 3.9157, + "step": 46410 + }, + { + "epoch": 3.1536214159532543, + "grad_norm": 0.29154857993125916, + "learning_rate": 6.0596718304117405e-06, + "loss": 3.9973, + "step": 46415 + }, + { + "epoch": 3.1539611360239164, + "grad_norm": 0.33548232913017273, + "learning_rate": 6.059247180323414e-06, + "loss": 4.0149, + "step": 46420 + }, + { + "epoch": 3.154300856094578, + "grad_norm": 0.2702288031578064, + "learning_rate": 6.058822530235087e-06, + "loss": 4.0733, + "step": 46425 + }, + { + "epoch": 3.1546405761652396, + "grad_norm": 0.311455100774765, + "learning_rate": 6.058397880146759e-06, + "loss": 4.1051, + "step": 46430 + }, + { + "epoch": 3.1549802962359017, + "grad_norm": 0.3021794855594635, + "learning_rate": 6.0579732300584325e-06, + "loss": 3.9381, + "step": 46435 + }, + { + "epoch": 3.1553200163065633, + "grad_norm": 0.40814369916915894, + "learning_rate": 6.057548579970105e-06, + "loss": 4.0702, + "step": 46440 + }, + { + "epoch": 3.155659736377225, + "grad_norm": 0.23970481753349304, + "learning_rate": 6.057123929881777e-06, + "loss": 3.9912, + "step": 46445 + }, + { + "epoch": 3.155999456447887, + "grad_norm": 0.2388405203819275, + "learning_rate": 6.056699279793451e-06, + "loss": 4.2381, + "step": 46450 + }, + { + "epoch": 3.1563391765185487, + "grad_norm": 0.37573546171188354, + "learning_rate": 6.056274629705124e-06, + "loss": 4.2595, + "step": 46455 + }, + { + "epoch": 3.1566788965892103, + "grad_norm": 0.5083688497543335, + "learning_rate": 6.055849979616796e-06, + "loss": 3.916, + "step": 46460 + }, + { + "epoch": 3.1570186166598724, + "grad_norm": 0.3001059591770172, + "learning_rate": 6.055425329528469e-06, + "loss": 3.9536, + "step": 46465 + }, + { + "epoch": 3.157358336730534, + "grad_norm": 0.32184237241744995, + "learning_rate": 6.055000679440142e-06, + "loss": 4.2972, + "step": 46470 + }, + { + "epoch": 3.1576980568011956, + "grad_norm": 0.3685143291950226, + "learning_rate": 6.054576029351814e-06, + "loss": 3.8412, + "step": 46475 + }, + { + "epoch": 3.1580377768718577, + "grad_norm": 0.2776133418083191, + "learning_rate": 6.054151379263488e-06, + "loss": 4.0107, + "step": 46480 + }, + { + "epoch": 3.1583774969425193, + "grad_norm": 0.2152249962091446, + "learning_rate": 6.05372672917516e-06, + "loss": 4.0772, + "step": 46485 + }, + { + "epoch": 3.158717217013181, + "grad_norm": 0.3097594380378723, + "learning_rate": 6.0533020790868325e-06, + "loss": 4.0338, + "step": 46490 + }, + { + "epoch": 3.159056937083843, + "grad_norm": 0.40025919675827026, + "learning_rate": 6.052877428998506e-06, + "loss": 4.228, + "step": 46495 + }, + { + "epoch": 3.1593966571545047, + "grad_norm": 0.2564457058906555, + "learning_rate": 6.052452778910178e-06, + "loss": 3.724, + "step": 46500 + }, + { + "epoch": 3.1597363772251663, + "grad_norm": 0.3179915249347687, + "learning_rate": 6.052028128821851e-06, + "loss": 4.0108, + "step": 46505 + }, + { + "epoch": 3.1600760972958284, + "grad_norm": 0.3770314157009125, + "learning_rate": 6.0516034787335245e-06, + "loss": 4.0759, + "step": 46510 + }, + { + "epoch": 3.16041581736649, + "grad_norm": 0.6061219573020935, + "learning_rate": 6.0511788286451965e-06, + "loss": 4.2996, + "step": 46515 + }, + { + "epoch": 3.1607555374371517, + "grad_norm": 0.27109402418136597, + "learning_rate": 6.050754178556869e-06, + "loss": 4.2721, + "step": 46520 + }, + { + "epoch": 3.1610952575078137, + "grad_norm": 0.38595980405807495, + "learning_rate": 6.050329528468543e-06, + "loss": 3.9802, + "step": 46525 + }, + { + "epoch": 3.1614349775784754, + "grad_norm": 0.24952258169651031, + "learning_rate": 6.049904878380215e-06, + "loss": 4.0493, + "step": 46530 + }, + { + "epoch": 3.161774697649137, + "grad_norm": 0.277462899684906, + "learning_rate": 6.0494802282918885e-06, + "loss": 4.0495, + "step": 46535 + }, + { + "epoch": 3.162114417719799, + "grad_norm": 0.3551154136657715, + "learning_rate": 6.049055578203561e-06, + "loss": 4.0778, + "step": 46540 + }, + { + "epoch": 3.1624541377904607, + "grad_norm": 0.803044319152832, + "learning_rate": 6.048630928115233e-06, + "loss": 3.9764, + "step": 46545 + }, + { + "epoch": 3.1627938578611223, + "grad_norm": 0.22652699053287506, + "learning_rate": 6.048206278026907e-06, + "loss": 4.0801, + "step": 46550 + }, + { + "epoch": 3.1631335779317844, + "grad_norm": 0.3443916141986847, + "learning_rate": 6.047781627938579e-06, + "loss": 4.1464, + "step": 46555 + }, + { + "epoch": 3.163473298002446, + "grad_norm": 0.3590533435344696, + "learning_rate": 6.047356977850252e-06, + "loss": 3.9883, + "step": 46560 + }, + { + "epoch": 3.1638130180731077, + "grad_norm": 0.20975321531295776, + "learning_rate": 6.046932327761925e-06, + "loss": 3.9996, + "step": 46565 + }, + { + "epoch": 3.1641527381437697, + "grad_norm": 0.3570464253425598, + "learning_rate": 6.046507677673597e-06, + "loss": 3.9192, + "step": 46570 + }, + { + "epoch": 3.1644924582144314, + "grad_norm": 0.386894553899765, + "learning_rate": 6.04608302758527e-06, + "loss": 4.1796, + "step": 46575 + }, + { + "epoch": 3.164832178285093, + "grad_norm": 0.2684537172317505, + "learning_rate": 6.045658377496944e-06, + "loss": 4.2697, + "step": 46580 + }, + { + "epoch": 3.165171898355755, + "grad_norm": 0.3984907865524292, + "learning_rate": 6.045233727408616e-06, + "loss": 4.1891, + "step": 46585 + }, + { + "epoch": 3.1655116184264167, + "grad_norm": 0.33606743812561035, + "learning_rate": 6.0448090773202885e-06, + "loss": 4.0855, + "step": 46590 + }, + { + "epoch": 3.1658513384970783, + "grad_norm": 0.2688981592655182, + "learning_rate": 6.044384427231962e-06, + "loss": 4.1719, + "step": 46595 + }, + { + "epoch": 3.1661910585677404, + "grad_norm": 0.29919710755348206, + "learning_rate": 6.043959777143634e-06, + "loss": 4.1743, + "step": 46600 + }, + { + "epoch": 3.166530778638402, + "grad_norm": 0.30408573150634766, + "learning_rate": 6.043535127055307e-06, + "loss": 4.1083, + "step": 46605 + }, + { + "epoch": 3.1668704987090637, + "grad_norm": 0.7565528154373169, + "learning_rate": 6.0431104769669805e-06, + "loss": 3.8263, + "step": 46610 + }, + { + "epoch": 3.1672102187797253, + "grad_norm": 0.3811274468898773, + "learning_rate": 6.0426858268786525e-06, + "loss": 4.077, + "step": 46615 + }, + { + "epoch": 3.1675499388503874, + "grad_norm": 0.3226150572299957, + "learning_rate": 6.042261176790325e-06, + "loss": 4.1622, + "step": 46620 + }, + { + "epoch": 3.167889658921049, + "grad_norm": 0.29676541686058044, + "learning_rate": 6.041836526701998e-06, + "loss": 4.2738, + "step": 46625 + }, + { + "epoch": 3.1682293789917106, + "grad_norm": 0.33935222029685974, + "learning_rate": 6.041411876613671e-06, + "loss": 4.0407, + "step": 46630 + }, + { + "epoch": 3.1685690990623727, + "grad_norm": 0.4268118441104889, + "learning_rate": 6.040987226525344e-06, + "loss": 3.9886, + "step": 46635 + }, + { + "epoch": 3.1689088191330343, + "grad_norm": 0.2823053002357483, + "learning_rate": 6.0405625764370165e-06, + "loss": 4.0761, + "step": 46640 + }, + { + "epoch": 3.169248539203696, + "grad_norm": 0.34163540601730347, + "learning_rate": 6.040137926348689e-06, + "loss": 4.0122, + "step": 46645 + }, + { + "epoch": 3.169588259274358, + "grad_norm": 0.3901447057723999, + "learning_rate": 6.039713276260361e-06, + "loss": 4.2714, + "step": 46650 + }, + { + "epoch": 3.1699279793450197, + "grad_norm": 0.29569724202156067, + "learning_rate": 6.039288626172035e-06, + "loss": 3.9874, + "step": 46655 + }, + { + "epoch": 3.1702676994156813, + "grad_norm": 0.2569231688976288, + "learning_rate": 6.038863976083708e-06, + "loss": 4.3208, + "step": 46660 + }, + { + "epoch": 3.1706074194863434, + "grad_norm": 0.2534581124782562, + "learning_rate": 6.03843932599538e-06, + "loss": 3.9424, + "step": 46665 + }, + { + "epoch": 3.170947139557005, + "grad_norm": 0.3112891912460327, + "learning_rate": 6.038014675907053e-06, + "loss": 3.9966, + "step": 46670 + }, + { + "epoch": 3.1712868596276667, + "grad_norm": 0.4235370457172394, + "learning_rate": 6.037590025818726e-06, + "loss": 4.2602, + "step": 46675 + }, + { + "epoch": 3.1716265796983287, + "grad_norm": 0.3195633888244629, + "learning_rate": 6.037165375730398e-06, + "loss": 4.1697, + "step": 46680 + }, + { + "epoch": 3.1719662997689904, + "grad_norm": 0.2976992130279541, + "learning_rate": 6.036740725642072e-06, + "loss": 3.9429, + "step": 46685 + }, + { + "epoch": 3.172306019839652, + "grad_norm": 0.4838637709617615, + "learning_rate": 6.0363160755537445e-06, + "loss": 4.4154, + "step": 46690 + }, + { + "epoch": 3.172645739910314, + "grad_norm": 0.30286216735839844, + "learning_rate": 6.0358914254654165e-06, + "loss": 4.1447, + "step": 46695 + }, + { + "epoch": 3.1729854599809757, + "grad_norm": 0.3411317765712738, + "learning_rate": 6.03546677537709e-06, + "loss": 4.1148, + "step": 46700 + }, + { + "epoch": 3.1733251800516373, + "grad_norm": 0.2981995940208435, + "learning_rate": 6.035042125288763e-06, + "loss": 3.9029, + "step": 46705 + }, + { + "epoch": 3.1736649001222994, + "grad_norm": 0.4368768334388733, + "learning_rate": 6.034617475200435e-06, + "loss": 4.0031, + "step": 46710 + }, + { + "epoch": 3.174004620192961, + "grad_norm": 0.5406789183616638, + "learning_rate": 6.0341928251121085e-06, + "loss": 4.367, + "step": 46715 + }, + { + "epoch": 3.1743443402636227, + "grad_norm": 0.2712922692298889, + "learning_rate": 6.0337681750237805e-06, + "loss": 3.9398, + "step": 46720 + }, + { + "epoch": 3.1746840603342847, + "grad_norm": 0.2895439863204956, + "learning_rate": 6.033343524935453e-06, + "loss": 3.9877, + "step": 46725 + }, + { + "epoch": 3.1750237804049464, + "grad_norm": 0.28965774178504944, + "learning_rate": 6.032918874847127e-06, + "loss": 4.0746, + "step": 46730 + }, + { + "epoch": 3.175363500475608, + "grad_norm": 0.271637499332428, + "learning_rate": 6.032494224758799e-06, + "loss": 3.9935, + "step": 46735 + }, + { + "epoch": 3.17570322054627, + "grad_norm": 0.473802775144577, + "learning_rate": 6.032069574670472e-06, + "loss": 4.2261, + "step": 46740 + }, + { + "epoch": 3.1760429406169317, + "grad_norm": 0.31820568442344666, + "learning_rate": 6.031644924582145e-06, + "loss": 4.2846, + "step": 46745 + }, + { + "epoch": 3.1763826606875933, + "grad_norm": 0.2774612307548523, + "learning_rate": 6.031220274493817e-06, + "loss": 4.0244, + "step": 46750 + }, + { + "epoch": 3.176722380758255, + "grad_norm": 0.31489211320877075, + "learning_rate": 6.03079562440549e-06, + "loss": 3.7151, + "step": 46755 + }, + { + "epoch": 3.177062100828917, + "grad_norm": 0.36877813935279846, + "learning_rate": 6.030370974317164e-06, + "loss": 4.197, + "step": 46760 + }, + { + "epoch": 3.1774018208995787, + "grad_norm": 0.239251509308815, + "learning_rate": 6.029946324228836e-06, + "loss": 3.9557, + "step": 46765 + }, + { + "epoch": 3.1777415409702403, + "grad_norm": 0.33314186334609985, + "learning_rate": 6.0295216741405085e-06, + "loss": 4.0782, + "step": 46770 + }, + { + "epoch": 3.1780812610409024, + "grad_norm": 0.3178296983242035, + "learning_rate": 6.029097024052182e-06, + "loss": 4.0111, + "step": 46775 + }, + { + "epoch": 3.178420981111564, + "grad_norm": 0.3044586777687073, + "learning_rate": 6.028672373963854e-06, + "loss": 4.0826, + "step": 46780 + }, + { + "epoch": 3.1787607011822256, + "grad_norm": 0.3291226923465729, + "learning_rate": 6.028247723875527e-06, + "loss": 3.938, + "step": 46785 + }, + { + "epoch": 3.1791004212528877, + "grad_norm": 0.37319624423980713, + "learning_rate": 6.0278230737872005e-06, + "loss": 4.2408, + "step": 46790 + }, + { + "epoch": 3.1794401413235494, + "grad_norm": 0.5436403155326843, + "learning_rate": 6.0273984236988725e-06, + "loss": 4.0254, + "step": 46795 + }, + { + "epoch": 3.179779861394211, + "grad_norm": 0.38046711683273315, + "learning_rate": 6.026973773610545e-06, + "loss": 3.8032, + "step": 46800 + }, + { + "epoch": 3.180119581464873, + "grad_norm": 0.22828029096126556, + "learning_rate": 6.026549123522218e-06, + "loss": 3.826, + "step": 46805 + }, + { + "epoch": 3.1804593015355347, + "grad_norm": 0.3844408392906189, + "learning_rate": 6.026124473433891e-06, + "loss": 4.1899, + "step": 46810 + }, + { + "epoch": 3.1807990216061963, + "grad_norm": 0.28157299757003784, + "learning_rate": 6.025699823345563e-06, + "loss": 3.7191, + "step": 46815 + }, + { + "epoch": 3.1811387416768584, + "grad_norm": 0.28697097301483154, + "learning_rate": 6.0252751732572365e-06, + "loss": 3.9116, + "step": 46820 + }, + { + "epoch": 3.18147846174752, + "grad_norm": 0.253659188747406, + "learning_rate": 6.024850523168909e-06, + "loss": 3.9822, + "step": 46825 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.27708762884140015, + "learning_rate": 6.024425873080581e-06, + "loss": 3.9917, + "step": 46830 + }, + { + "epoch": 3.1821579018888437, + "grad_norm": 0.28928548097610474, + "learning_rate": 6.024001222992255e-06, + "loss": 4.0643, + "step": 46835 + }, + { + "epoch": 3.1824976219595054, + "grad_norm": 0.471868634223938, + "learning_rate": 6.023576572903928e-06, + "loss": 3.8417, + "step": 46840 + }, + { + "epoch": 3.182837342030167, + "grad_norm": 0.28691565990448, + "learning_rate": 6.0231519228156e-06, + "loss": 3.9817, + "step": 46845 + }, + { + "epoch": 3.183177062100829, + "grad_norm": 0.2905040681362152, + "learning_rate": 6.022727272727273e-06, + "loss": 4.097, + "step": 46850 + }, + { + "epoch": 3.1835167821714907, + "grad_norm": 0.25732219219207764, + "learning_rate": 6.022302622638946e-06, + "loss": 4.0299, + "step": 46855 + }, + { + "epoch": 3.1838565022421523, + "grad_norm": 0.3615959882736206, + "learning_rate": 6.021877972550618e-06, + "loss": 4.4031, + "step": 46860 + }, + { + "epoch": 3.1841962223128144, + "grad_norm": 0.3516290783882141, + "learning_rate": 6.021453322462292e-06, + "loss": 4.1516, + "step": 46865 + }, + { + "epoch": 3.184535942383476, + "grad_norm": 0.2645806670188904, + "learning_rate": 6.0210286723739645e-06, + "loss": 4.1197, + "step": 46870 + }, + { + "epoch": 3.1848756624541377, + "grad_norm": 0.33176544308662415, + "learning_rate": 6.020604022285637e-06, + "loss": 4.2194, + "step": 46875 + }, + { + "epoch": 3.1852153825247997, + "grad_norm": 0.2658992111682892, + "learning_rate": 6.02017937219731e-06, + "loss": 4.0268, + "step": 46880 + }, + { + "epoch": 3.1855551025954614, + "grad_norm": 0.27949580550193787, + "learning_rate": 6.019754722108983e-06, + "loss": 4.175, + "step": 46885 + }, + { + "epoch": 3.185894822666123, + "grad_norm": 0.4469769299030304, + "learning_rate": 6.019330072020656e-06, + "loss": 3.981, + "step": 46890 + }, + { + "epoch": 3.186234542736785, + "grad_norm": 0.29245656728744507, + "learning_rate": 6.0189054219323285e-06, + "loss": 4.1179, + "step": 46895 + }, + { + "epoch": 3.1865742628074467, + "grad_norm": 0.41357603669166565, + "learning_rate": 6.0184807718440004e-06, + "loss": 4.1643, + "step": 46900 + }, + { + "epoch": 3.1869139828781083, + "grad_norm": 0.3503984212875366, + "learning_rate": 6.018056121755674e-06, + "loss": 3.9906, + "step": 46905 + }, + { + "epoch": 3.1872537029487704, + "grad_norm": 0.32327479124069214, + "learning_rate": 6.017631471667347e-06, + "loss": 4.2165, + "step": 46910 + }, + { + "epoch": 3.187593423019432, + "grad_norm": 0.23909492790699005, + "learning_rate": 6.017206821579019e-06, + "loss": 3.9201, + "step": 46915 + }, + { + "epoch": 3.1879331430900937, + "grad_norm": 0.4146348536014557, + "learning_rate": 6.0167821714906925e-06, + "loss": 4.0942, + "step": 46920 + }, + { + "epoch": 3.1882728631607558, + "grad_norm": 0.38652557134628296, + "learning_rate": 6.016357521402365e-06, + "loss": 4.2332, + "step": 46925 + }, + { + "epoch": 3.1886125832314174, + "grad_norm": 0.2831554710865021, + "learning_rate": 6.015932871314037e-06, + "loss": 4.1564, + "step": 46930 + }, + { + "epoch": 3.188952303302079, + "grad_norm": 0.41044050455093384, + "learning_rate": 6.015508221225711e-06, + "loss": 4.0597, + "step": 46935 + }, + { + "epoch": 3.189292023372741, + "grad_norm": 0.4240272045135498, + "learning_rate": 6.015083571137384e-06, + "loss": 4.0807, + "step": 46940 + }, + { + "epoch": 3.1896317434434027, + "grad_norm": 0.28247636556625366, + "learning_rate": 6.014658921049056e-06, + "loss": 3.931, + "step": 46945 + }, + { + "epoch": 3.1899714635140644, + "grad_norm": 0.24953696131706238, + "learning_rate": 6.014234270960729e-06, + "loss": 4.0157, + "step": 46950 + }, + { + "epoch": 3.190311183584726, + "grad_norm": 0.35950085520744324, + "learning_rate": 6.013809620872402e-06, + "loss": 4.0807, + "step": 46955 + }, + { + "epoch": 3.190650903655388, + "grad_norm": 0.24691011011600494, + "learning_rate": 6.013384970784074e-06, + "loss": 3.8621, + "step": 46960 + }, + { + "epoch": 3.1909906237260497, + "grad_norm": 0.3735734522342682, + "learning_rate": 6.012960320695748e-06, + "loss": 3.9764, + "step": 46965 + }, + { + "epoch": 3.1913303437967113, + "grad_norm": 0.4453348219394684, + "learning_rate": 6.01253567060742e-06, + "loss": 3.8671, + "step": 46970 + }, + { + "epoch": 3.1916700638673734, + "grad_norm": 0.3302174508571625, + "learning_rate": 6.0121110205190924e-06, + "loss": 4.2755, + "step": 46975 + }, + { + "epoch": 3.192009783938035, + "grad_norm": 0.2863055169582367, + "learning_rate": 6.011686370430766e-06, + "loss": 4.0387, + "step": 46980 + }, + { + "epoch": 3.1923495040086967, + "grad_norm": 0.2806362509727478, + "learning_rate": 6.011261720342438e-06, + "loss": 3.9082, + "step": 46985 + }, + { + "epoch": 3.1926892240793587, + "grad_norm": 0.31178581714630127, + "learning_rate": 6.010837070254111e-06, + "loss": 3.9045, + "step": 46990 + }, + { + "epoch": 3.1930289441500204, + "grad_norm": 0.2511047422885895, + "learning_rate": 6.0104124201657845e-06, + "loss": 3.9754, + "step": 46995 + }, + { + "epoch": 3.193368664220682, + "grad_norm": 0.32045143842697144, + "learning_rate": 6.0099877700774565e-06, + "loss": 3.9179, + "step": 47000 + }, + { + "epoch": 3.193708384291344, + "grad_norm": 0.22985930740833282, + "learning_rate": 6.009563119989129e-06, + "loss": 4.0949, + "step": 47005 + }, + { + "epoch": 3.1940481043620057, + "grad_norm": 0.4178199768066406, + "learning_rate": 6.009138469900803e-06, + "loss": 3.8778, + "step": 47010 + }, + { + "epoch": 3.1943878244326673, + "grad_norm": 0.3715893030166626, + "learning_rate": 6.008713819812475e-06, + "loss": 4.0526, + "step": 47015 + }, + { + "epoch": 3.1947275445033294, + "grad_norm": 0.29986754059791565, + "learning_rate": 6.008289169724148e-06, + "loss": 3.9441, + "step": 47020 + }, + { + "epoch": 3.195067264573991, + "grad_norm": 0.2715075612068176, + "learning_rate": 6.007864519635821e-06, + "loss": 4.0219, + "step": 47025 + }, + { + "epoch": 3.1954069846446527, + "grad_norm": 0.3709012269973755, + "learning_rate": 6.007439869547493e-06, + "loss": 3.9673, + "step": 47030 + }, + { + "epoch": 3.1957467047153147, + "grad_norm": 0.316460520029068, + "learning_rate": 6.007015219459166e-06, + "loss": 3.966, + "step": 47035 + }, + { + "epoch": 3.1960864247859764, + "grad_norm": 0.4124718904495239, + "learning_rate": 6.00659056937084e-06, + "loss": 4.136, + "step": 47040 + }, + { + "epoch": 3.196426144856638, + "grad_norm": 0.2598228454589844, + "learning_rate": 6.006165919282512e-06, + "loss": 3.9622, + "step": 47045 + }, + { + "epoch": 3.1967658649273, + "grad_norm": 0.44463783502578735, + "learning_rate": 6.0057412691941845e-06, + "loss": 3.8997, + "step": 47050 + }, + { + "epoch": 3.1971055849979617, + "grad_norm": 0.2737605571746826, + "learning_rate": 6.005316619105857e-06, + "loss": 4.0693, + "step": 47055 + }, + { + "epoch": 3.1974453050686233, + "grad_norm": 0.35720738768577576, + "learning_rate": 6.00489196901753e-06, + "loss": 4.0447, + "step": 47060 + }, + { + "epoch": 3.1977850251392854, + "grad_norm": 0.3026759624481201, + "learning_rate": 6.004467318929202e-06, + "loss": 4.2714, + "step": 47065 + }, + { + "epoch": 3.198124745209947, + "grad_norm": 0.3616412580013275, + "learning_rate": 6.004042668840876e-06, + "loss": 3.8685, + "step": 47070 + }, + { + "epoch": 3.1984644652806087, + "grad_norm": 0.3006216883659363, + "learning_rate": 6.0036180187525485e-06, + "loss": 4.1428, + "step": 47075 + }, + { + "epoch": 3.1988041853512708, + "grad_norm": 0.3454624116420746, + "learning_rate": 6.00319336866422e-06, + "loss": 3.9667, + "step": 47080 + }, + { + "epoch": 3.1991439054219324, + "grad_norm": 0.33032920956611633, + "learning_rate": 6.002768718575894e-06, + "loss": 4.1023, + "step": 47085 + }, + { + "epoch": 3.199483625492594, + "grad_norm": 0.9447365999221802, + "learning_rate": 6.002344068487567e-06, + "loss": 4.0665, + "step": 47090 + }, + { + "epoch": 3.1998233455632556, + "grad_norm": 0.3912559449672699, + "learning_rate": 6.001919418399239e-06, + "loss": 4.0507, + "step": 47095 + }, + { + "epoch": 3.2001630656339177, + "grad_norm": 0.2895233631134033, + "learning_rate": 6.0014947683109125e-06, + "loss": 3.8809, + "step": 47100 + }, + { + "epoch": 3.2005027857045794, + "grad_norm": 0.28558605909347534, + "learning_rate": 6.001070118222585e-06, + "loss": 3.8229, + "step": 47105 + }, + { + "epoch": 3.200842505775241, + "grad_norm": NaN, + "learning_rate": 6.000730398151924e-06, + "loss": 4.4749, + "step": 47110 + }, + { + "epoch": 3.201182225845903, + "grad_norm": 0.2926326096057892, + "learning_rate": 6.000305748063596e-06, + "loss": 4.0777, + "step": 47115 + }, + { + "epoch": 3.2015219459165647, + "grad_norm": 0.30932843685150146, + "learning_rate": 5.999881097975269e-06, + "loss": 4.153, + "step": 47120 + }, + { + "epoch": 3.2018616659872263, + "grad_norm": 0.28694087266921997, + "learning_rate": 5.999456447886942e-06, + "loss": 3.8232, + "step": 47125 + }, + { + "epoch": 3.2022013860578884, + "grad_norm": 0.363930344581604, + "learning_rate": 5.9990317977986145e-06, + "loss": 4.0895, + "step": 47130 + }, + { + "epoch": 3.20254110612855, + "grad_norm": 0.39173007011413574, + "learning_rate": 5.998607147710287e-06, + "loss": 4.2587, + "step": 47135 + }, + { + "epoch": 3.2028808261992117, + "grad_norm": 0.31689003109931946, + "learning_rate": 5.99818249762196e-06, + "loss": 4.036, + "step": 47140 + }, + { + "epoch": 3.2032205462698737, + "grad_norm": 0.22115390002727509, + "learning_rate": 5.997757847533633e-06, + "loss": 3.9583, + "step": 47145 + }, + { + "epoch": 3.2035602663405354, + "grad_norm": 0.34654438495635986, + "learning_rate": 5.997333197445305e-06, + "loss": 4.0356, + "step": 47150 + }, + { + "epoch": 3.203899986411197, + "grad_norm": 0.2534022629261017, + "learning_rate": 5.9969085473569785e-06, + "loss": 4.1814, + "step": 47155 + }, + { + "epoch": 3.204239706481859, + "grad_norm": 0.38446900248527527, + "learning_rate": 5.996483897268651e-06, + "loss": 4.2581, + "step": 47160 + }, + { + "epoch": 3.2045794265525207, + "grad_norm": 0.2818875312805176, + "learning_rate": 5.996059247180323e-06, + "loss": 4.0461, + "step": 47165 + }, + { + "epoch": 3.2049191466231823, + "grad_norm": 0.32841262221336365, + "learning_rate": 5.995634597091997e-06, + "loss": 4.1583, + "step": 47170 + }, + { + "epoch": 3.2052588666938444, + "grad_norm": 0.24600352346897125, + "learning_rate": 5.99520994700367e-06, + "loss": 3.928, + "step": 47175 + }, + { + "epoch": 3.205598586764506, + "grad_norm": 0.2585042715072632, + "learning_rate": 5.994785296915342e-06, + "loss": 4.1435, + "step": 47180 + }, + { + "epoch": 3.2059383068351677, + "grad_norm": 0.31214165687561035, + "learning_rate": 5.994360646827015e-06, + "loss": 3.8539, + "step": 47185 + }, + { + "epoch": 3.2062780269058297, + "grad_norm": 0.3984734117984772, + "learning_rate": 5.993935996738688e-06, + "loss": 4.1951, + "step": 47190 + }, + { + "epoch": 3.2066177469764914, + "grad_norm": 0.3729524314403534, + "learning_rate": 5.99351134665036e-06, + "loss": 3.9584, + "step": 47195 + }, + { + "epoch": 3.206957467047153, + "grad_norm": 0.25654616951942444, + "learning_rate": 5.993086696562034e-06, + "loss": 3.7956, + "step": 47200 + }, + { + "epoch": 3.207297187117815, + "grad_norm": 0.37056052684783936, + "learning_rate": 5.9926620464737066e-06, + "loss": 4.1795, + "step": 47205 + }, + { + "epoch": 3.2076369071884767, + "grad_norm": 0.26016995310783386, + "learning_rate": 5.9922373963853785e-06, + "loss": 4.0585, + "step": 47210 + }, + { + "epoch": 3.2079766272591383, + "grad_norm": 0.2788388133049011, + "learning_rate": 5.991812746297052e-06, + "loss": 3.8988, + "step": 47215 + }, + { + "epoch": 3.2083163473298004, + "grad_norm": 0.3213461935520172, + "learning_rate": 5.991388096208724e-06, + "loss": 3.928, + "step": 47220 + }, + { + "epoch": 3.208656067400462, + "grad_norm": 0.2899187207221985, + "learning_rate": 5.990963446120397e-06, + "loss": 4.0635, + "step": 47225 + }, + { + "epoch": 3.2089957874711237, + "grad_norm": 0.4170101583003998, + "learning_rate": 5.9905387960320706e-06, + "loss": 3.9823, + "step": 47230 + }, + { + "epoch": 3.2093355075417858, + "grad_norm": 0.40719088912010193, + "learning_rate": 5.9901141459437425e-06, + "loss": 4.2658, + "step": 47235 + }, + { + "epoch": 3.2096752276124474, + "grad_norm": 0.309490829706192, + "learning_rate": 5.989689495855415e-06, + "loss": 4.0472, + "step": 47240 + }, + { + "epoch": 3.210014947683109, + "grad_norm": 0.28298085927963257, + "learning_rate": 5.989264845767089e-06, + "loss": 4.1003, + "step": 47245 + }, + { + "epoch": 3.210354667753771, + "grad_norm": 0.2944099009037018, + "learning_rate": 5.988840195678761e-06, + "loss": 4.1777, + "step": 47250 + }, + { + "epoch": 3.2106943878244327, + "grad_norm": 0.22726662456989288, + "learning_rate": 5.988415545590434e-06, + "loss": 3.9806, + "step": 47255 + }, + { + "epoch": 3.2110341078950944, + "grad_norm": 0.5024898052215576, + "learning_rate": 5.987990895502107e-06, + "loss": 4.1701, + "step": 47260 + }, + { + "epoch": 3.2113738279657564, + "grad_norm": 0.34713873267173767, + "learning_rate": 5.987566245413779e-06, + "loss": 4.1418, + "step": 47265 + }, + { + "epoch": 3.211713548036418, + "grad_norm": 0.2590649724006653, + "learning_rate": 5.987141595325452e-06, + "loss": 4.0552, + "step": 47270 + }, + { + "epoch": 3.2120532681070797, + "grad_norm": 0.4963878095149994, + "learning_rate": 5.986716945237126e-06, + "loss": 4.1624, + "step": 47275 + }, + { + "epoch": 3.2123929881777418, + "grad_norm": 0.4012317657470703, + "learning_rate": 5.986292295148798e-06, + "loss": 4.1132, + "step": 47280 + }, + { + "epoch": 3.2127327082484034, + "grad_norm": 0.2412136197090149, + "learning_rate": 5.9858676450604705e-06, + "loss": 4.032, + "step": 47285 + }, + { + "epoch": 3.213072428319065, + "grad_norm": 0.4545312821865082, + "learning_rate": 5.985442994972144e-06, + "loss": 4.0166, + "step": 47290 + }, + { + "epoch": 3.2134121483897267, + "grad_norm": 0.25688132643699646, + "learning_rate": 5.985018344883816e-06, + "loss": 3.9494, + "step": 47295 + }, + { + "epoch": 3.2137518684603887, + "grad_norm": 0.2655623257160187, + "learning_rate": 5.984593694795489e-06, + "loss": 4.2334, + "step": 47300 + }, + { + "epoch": 3.2140915885310504, + "grad_norm": 0.24376720190048218, + "learning_rate": 5.984169044707162e-06, + "loss": 3.7492, + "step": 47305 + }, + { + "epoch": 3.214431308601712, + "grad_norm": 0.25937262177467346, + "learning_rate": 5.9837443946188345e-06, + "loss": 3.8274, + "step": 47310 + }, + { + "epoch": 3.214771028672374, + "grad_norm": 0.2934010624885559, + "learning_rate": 5.9833197445305065e-06, + "loss": 4.1009, + "step": 47315 + }, + { + "epoch": 3.2151107487430357, + "grad_norm": 0.22598396241664886, + "learning_rate": 5.98289509444218e-06, + "loss": 4.0547, + "step": 47320 + }, + { + "epoch": 3.2154504688136973, + "grad_norm": 0.2570714056491852, + "learning_rate": 5.982470444353853e-06, + "loss": 4.1379, + "step": 47325 + }, + { + "epoch": 3.2157901888843594, + "grad_norm": 0.40738415718078613, + "learning_rate": 5.982045794265525e-06, + "loss": 3.8087, + "step": 47330 + }, + { + "epoch": 3.216129908955021, + "grad_norm": 0.34570643305778503, + "learning_rate": 5.9816211441771985e-06, + "loss": 3.8816, + "step": 47335 + }, + { + "epoch": 3.2164696290256827, + "grad_norm": 0.39296942949295044, + "learning_rate": 5.981196494088871e-06, + "loss": 3.9005, + "step": 47340 + }, + { + "epoch": 3.2168093490963448, + "grad_norm": 0.3740856349468231, + "learning_rate": 5.980771844000543e-06, + "loss": 3.9616, + "step": 47345 + }, + { + "epoch": 3.2171490691670064, + "grad_norm": 0.390924870967865, + "learning_rate": 5.980347193912217e-06, + "loss": 3.9583, + "step": 47350 + }, + { + "epoch": 3.217488789237668, + "grad_norm": 0.2824489176273346, + "learning_rate": 5.97992254382389e-06, + "loss": 3.9286, + "step": 47355 + }, + { + "epoch": 3.21782850930833, + "grad_norm": 0.288730651140213, + "learning_rate": 5.979497893735562e-06, + "loss": 4.2083, + "step": 47360 + }, + { + "epoch": 3.2181682293789917, + "grad_norm": 0.3102990388870239, + "learning_rate": 5.979073243647235e-06, + "loss": 3.8818, + "step": 47365 + }, + { + "epoch": 3.2185079494496533, + "grad_norm": 0.4957708418369293, + "learning_rate": 5.978648593558908e-06, + "loss": 4.1609, + "step": 47370 + }, + { + "epoch": 3.2188476695203154, + "grad_norm": 0.27642351388931274, + "learning_rate": 5.97822394347058e-06, + "loss": 4.06, + "step": 47375 + }, + { + "epoch": 3.219187389590977, + "grad_norm": 0.22341741621494293, + "learning_rate": 5.977799293382254e-06, + "loss": 4.1986, + "step": 47380 + }, + { + "epoch": 3.2195271096616387, + "grad_norm": 0.38961857557296753, + "learning_rate": 5.9773746432939265e-06, + "loss": 3.6599, + "step": 47385 + }, + { + "epoch": 3.2198668297323008, + "grad_norm": 0.2675381302833557, + "learning_rate": 5.9769499932055985e-06, + "loss": 4.0825, + "step": 47390 + }, + { + "epoch": 3.2202065498029624, + "grad_norm": 0.5348509550094604, + "learning_rate": 5.976525343117272e-06, + "loss": 4.1797, + "step": 47395 + }, + { + "epoch": 3.220546269873624, + "grad_norm": 0.25729021430015564, + "learning_rate": 5.976100693028944e-06, + "loss": 4.0107, + "step": 47400 + }, + { + "epoch": 3.220885989944286, + "grad_norm": 0.3510497510433197, + "learning_rate": 5.975676042940617e-06, + "loss": 4.0865, + "step": 47405 + }, + { + "epoch": 3.2212257100149477, + "grad_norm": 0.2663924992084503, + "learning_rate": 5.9752513928522905e-06, + "loss": 4.2448, + "step": 47410 + }, + { + "epoch": 3.2215654300856094, + "grad_norm": 0.2968381345272064, + "learning_rate": 5.9748267427639625e-06, + "loss": 4.2325, + "step": 47415 + }, + { + "epoch": 3.2219051501562714, + "grad_norm": 0.48573988676071167, + "learning_rate": 5.974402092675636e-06, + "loss": 3.7887, + "step": 47420 + }, + { + "epoch": 3.222244870226933, + "grad_norm": 0.29097139835357666, + "learning_rate": 5.973977442587309e-06, + "loss": 3.8948, + "step": 47425 + }, + { + "epoch": 3.2225845902975947, + "grad_norm": 0.5112706422805786, + "learning_rate": 5.973552792498981e-06, + "loss": 3.8809, + "step": 47430 + }, + { + "epoch": 3.2229243103682563, + "grad_norm": 0.3903740644454956, + "learning_rate": 5.9731281424106545e-06, + "loss": 4.0924, + "step": 47435 + }, + { + "epoch": 3.2232640304389184, + "grad_norm": 0.451642781496048, + "learning_rate": 5.972703492322327e-06, + "loss": 4.1318, + "step": 47440 + }, + { + "epoch": 3.22360375050958, + "grad_norm": 0.28888726234436035, + "learning_rate": 5.972278842233999e-06, + "loss": 4.0605, + "step": 47445 + }, + { + "epoch": 3.2239434705802417, + "grad_norm": 0.30446314811706543, + "learning_rate": 5.971854192145673e-06, + "loss": 4.0384, + "step": 47450 + }, + { + "epoch": 3.2242831906509037, + "grad_norm": 0.22266867756843567, + "learning_rate": 5.971429542057346e-06, + "loss": 4.1629, + "step": 47455 + }, + { + "epoch": 3.2246229107215654, + "grad_norm": 0.40371257066726685, + "learning_rate": 5.971004891969018e-06, + "loss": 4.031, + "step": 47460 + }, + { + "epoch": 3.224962630792227, + "grad_norm": 0.3603185713291168, + "learning_rate": 5.970580241880691e-06, + "loss": 4.0637, + "step": 47465 + }, + { + "epoch": 3.225302350862889, + "grad_norm": 0.33402249217033386, + "learning_rate": 5.970155591792363e-06, + "loss": 4.0952, + "step": 47470 + }, + { + "epoch": 3.2256420709335507, + "grad_norm": 0.2507516145706177, + "learning_rate": 5.969730941704036e-06, + "loss": 3.8455, + "step": 47475 + }, + { + "epoch": 3.2259817910042123, + "grad_norm": 0.30824702978134155, + "learning_rate": 5.96930629161571e-06, + "loss": 4.0103, + "step": 47480 + }, + { + "epoch": 3.2263215110748744, + "grad_norm": 0.23992577195167542, + "learning_rate": 5.968881641527382e-06, + "loss": 4.0866, + "step": 47485 + }, + { + "epoch": 3.226661231145536, + "grad_norm": 0.2970588505268097, + "learning_rate": 5.9684569914390545e-06, + "loss": 4.1221, + "step": 47490 + }, + { + "epoch": 3.2270009512161977, + "grad_norm": 0.4057115912437439, + "learning_rate": 5.968032341350728e-06, + "loss": 3.8083, + "step": 47495 + }, + { + "epoch": 3.2273406712868598, + "grad_norm": 0.29899826645851135, + "learning_rate": 5.9676076912624e-06, + "loss": 3.8146, + "step": 47500 + }, + { + "epoch": 3.2276803913575214, + "grad_norm": 0.28845998644828796, + "learning_rate": 5.967183041174073e-06, + "loss": 4.1665, + "step": 47505 + }, + { + "epoch": 3.228020111428183, + "grad_norm": 0.236780047416687, + "learning_rate": 5.9667583910857466e-06, + "loss": 3.965, + "step": 47510 + }, + { + "epoch": 3.228359831498845, + "grad_norm": 0.2802571654319763, + "learning_rate": 5.9663337409974185e-06, + "loss": 4.2232, + "step": 47515 + }, + { + "epoch": 3.2286995515695067, + "grad_norm": 0.28556978702545166, + "learning_rate": 5.965909090909091e-06, + "loss": 3.7085, + "step": 47520 + }, + { + "epoch": 3.2290392716401684, + "grad_norm": 0.33216506242752075, + "learning_rate": 5.965484440820765e-06, + "loss": 4.0227, + "step": 47525 + }, + { + "epoch": 3.2293789917108304, + "grad_norm": 0.2543140649795532, + "learning_rate": 5.965059790732437e-06, + "loss": 4.0699, + "step": 47530 + }, + { + "epoch": 3.229718711781492, + "grad_norm": 0.2976633906364441, + "learning_rate": 5.96463514064411e-06, + "loss": 3.771, + "step": 47535 + }, + { + "epoch": 3.2300584318521537, + "grad_norm": 0.29037824273109436, + "learning_rate": 5.964210490555783e-06, + "loss": 4.029, + "step": 47540 + }, + { + "epoch": 3.2303981519228158, + "grad_norm": 0.2521169185638428, + "learning_rate": 5.963785840467455e-06, + "loss": 3.9811, + "step": 47545 + }, + { + "epoch": 3.2307378719934774, + "grad_norm": 0.5264061689376831, + "learning_rate": 5.963361190379128e-06, + "loss": 3.9464, + "step": 47550 + }, + { + "epoch": 3.231077592064139, + "grad_norm": 0.4325396716594696, + "learning_rate": 5.962936540290801e-06, + "loss": 4.2122, + "step": 47555 + }, + { + "epoch": 3.231417312134801, + "grad_norm": 0.3936866223812103, + "learning_rate": 5.962511890202474e-06, + "loss": 3.9076, + "step": 47560 + }, + { + "epoch": 3.2317570322054627, + "grad_norm": 0.2952384352684021, + "learning_rate": 5.962087240114146e-06, + "loss": 4.0796, + "step": 47565 + }, + { + "epoch": 3.2320967522761244, + "grad_norm": 0.26153069734573364, + "learning_rate": 5.961662590025819e-06, + "loss": 4.029, + "step": 47570 + }, + { + "epoch": 3.2324364723467864, + "grad_norm": 0.3280096650123596, + "learning_rate": 5.961237939937492e-06, + "loss": 4.1502, + "step": 47575 + }, + { + "epoch": 3.232776192417448, + "grad_norm": 0.5072812438011169, + "learning_rate": 5.960813289849164e-06, + "loss": 4.1162, + "step": 47580 + }, + { + "epoch": 3.2331159124881097, + "grad_norm": 0.4478359818458557, + "learning_rate": 5.960388639760838e-06, + "loss": 4.2637, + "step": 47585 + }, + { + "epoch": 3.2334556325587718, + "grad_norm": 0.22487495839595795, + "learning_rate": 5.9599639896725105e-06, + "loss": 4.0975, + "step": 47590 + }, + { + "epoch": 3.2337953526294334, + "grad_norm": 0.3217003345489502, + "learning_rate": 5.9595393395841825e-06, + "loss": 3.945, + "step": 47595 + }, + { + "epoch": 3.234135072700095, + "grad_norm": 0.38764193654060364, + "learning_rate": 5.959114689495856e-06, + "loss": 4.2197, + "step": 47600 + }, + { + "epoch": 3.234474792770757, + "grad_norm": 0.3707970380783081, + "learning_rate": 5.958690039407529e-06, + "loss": 4.3429, + "step": 47605 + }, + { + "epoch": 3.2348145128414187, + "grad_norm": 0.27156415581703186, + "learning_rate": 5.958265389319201e-06, + "loss": 3.9154, + "step": 47610 + }, + { + "epoch": 3.2351542329120804, + "grad_norm": 0.3654709458351135, + "learning_rate": 5.9578407392308745e-06, + "loss": 3.988, + "step": 47615 + }, + { + "epoch": 3.2354939529827424, + "grad_norm": 0.2729306221008301, + "learning_rate": 5.957416089142547e-06, + "loss": 4.1372, + "step": 47620 + }, + { + "epoch": 3.235833673053404, + "grad_norm": 0.37874913215637207, + "learning_rate": 5.956991439054219e-06, + "loss": 3.8175, + "step": 47625 + }, + { + "epoch": 3.2361733931240657, + "grad_norm": 0.26023417711257935, + "learning_rate": 5.956566788965893e-06, + "loss": 4.0289, + "step": 47630 + }, + { + "epoch": 3.2365131131947273, + "grad_norm": 0.24045619368553162, + "learning_rate": 5.956142138877566e-06, + "loss": 4.082, + "step": 47635 + }, + { + "epoch": 3.2368528332653894, + "grad_norm": 0.32119300961494446, + "learning_rate": 5.955717488789238e-06, + "loss": 4.0239, + "step": 47640 + }, + { + "epoch": 3.237192553336051, + "grad_norm": 0.3581959307193756, + "learning_rate": 5.955292838700911e-06, + "loss": 4.0878, + "step": 47645 + }, + { + "epoch": 3.2375322734067127, + "grad_norm": 0.37661054730415344, + "learning_rate": 5.954868188612583e-06, + "loss": 3.9528, + "step": 47650 + }, + { + "epoch": 3.2378719934773748, + "grad_norm": 0.28434082865715027, + "learning_rate": 5.954443538524256e-06, + "loss": 4.0042, + "step": 47655 + }, + { + "epoch": 3.2382117135480364, + "grad_norm": 0.2402709722518921, + "learning_rate": 5.95401888843593e-06, + "loss": 4.1475, + "step": 47660 + }, + { + "epoch": 3.238551433618698, + "grad_norm": 0.4182557165622711, + "learning_rate": 5.953594238347602e-06, + "loss": 4.0766, + "step": 47665 + }, + { + "epoch": 3.23889115368936, + "grad_norm": 0.23285336792469025, + "learning_rate": 5.9531695882592745e-06, + "loss": 4.0218, + "step": 47670 + }, + { + "epoch": 3.2392308737600217, + "grad_norm": 0.3388868570327759, + "learning_rate": 5.952744938170948e-06, + "loss": 3.9953, + "step": 47675 + }, + { + "epoch": 3.2395705938306834, + "grad_norm": 0.25619202852249146, + "learning_rate": 5.95232028808262e-06, + "loss": 3.9655, + "step": 47680 + }, + { + "epoch": 3.2399103139013454, + "grad_norm": 0.3065642714500427, + "learning_rate": 5.951895637994293e-06, + "loss": 4.2494, + "step": 47685 + }, + { + "epoch": 3.240250033972007, + "grad_norm": 0.3053908944129944, + "learning_rate": 5.9514709879059665e-06, + "loss": 4.1686, + "step": 47690 + }, + { + "epoch": 3.2405897540426687, + "grad_norm": 0.42545586824417114, + "learning_rate": 5.9510463378176385e-06, + "loss": 4.3784, + "step": 47695 + }, + { + "epoch": 3.2409294741133308, + "grad_norm": 0.27250802516937256, + "learning_rate": 5.950621687729311e-06, + "loss": 4.067, + "step": 47700 + }, + { + "epoch": 3.2412691941839924, + "grad_norm": 0.25772401690483093, + "learning_rate": 5.950197037640985e-06, + "loss": 4.1084, + "step": 47705 + }, + { + "epoch": 3.241608914254654, + "grad_norm": 0.3090958297252655, + "learning_rate": 5.949772387552657e-06, + "loss": 4.133, + "step": 47710 + }, + { + "epoch": 3.241948634325316, + "grad_norm": 0.3169165849685669, + "learning_rate": 5.94934773746433e-06, + "loss": 3.884, + "step": 47715 + }, + { + "epoch": 3.2422883543959777, + "grad_norm": 0.42463263869285583, + "learning_rate": 5.9489230873760025e-06, + "loss": 4.0857, + "step": 47720 + }, + { + "epoch": 3.2426280744666394, + "grad_norm": 0.2617664337158203, + "learning_rate": 5.948498437287675e-06, + "loss": 3.9142, + "step": 47725 + }, + { + "epoch": 3.2429677945373014, + "grad_norm": 0.3677845001220703, + "learning_rate": 5.948073787199348e-06, + "loss": 3.9562, + "step": 47730 + }, + { + "epoch": 3.243307514607963, + "grad_norm": 0.2871178388595581, + "learning_rate": 5.947649137111021e-06, + "loss": 4.1974, + "step": 47735 + }, + { + "epoch": 3.2436472346786247, + "grad_norm": 0.30062976479530334, + "learning_rate": 5.947224487022694e-06, + "loss": 4.0316, + "step": 47740 + }, + { + "epoch": 3.2439869547492868, + "grad_norm": 0.31647419929504395, + "learning_rate": 5.946799836934366e-06, + "loss": 3.9868, + "step": 47745 + }, + { + "epoch": 3.2443266748199484, + "grad_norm": 0.3251648247241974, + "learning_rate": 5.946375186846039e-06, + "loss": 3.9639, + "step": 47750 + }, + { + "epoch": 3.24466639489061, + "grad_norm": 0.3603740334510803, + "learning_rate": 5.945950536757712e-06, + "loss": 4.2948, + "step": 47755 + }, + { + "epoch": 3.245006114961272, + "grad_norm": 0.3180083930492401, + "learning_rate": 5.945525886669386e-06, + "loss": 4.1813, + "step": 47760 + }, + { + "epoch": 3.2453458350319337, + "grad_norm": 0.266731321811676, + "learning_rate": 5.945101236581058e-06, + "loss": 4.135, + "step": 47765 + }, + { + "epoch": 3.2456855551025954, + "grad_norm": 0.2382948100566864, + "learning_rate": 5.9446765864927305e-06, + "loss": 4.08, + "step": 47770 + }, + { + "epoch": 3.246025275173257, + "grad_norm": 0.2641947865486145, + "learning_rate": 5.944251936404404e-06, + "loss": 4.0099, + "step": 47775 + }, + { + "epoch": 3.246364995243919, + "grad_norm": 0.32125264406204224, + "learning_rate": 5.943827286316076e-06, + "loss": 3.9395, + "step": 47780 + }, + { + "epoch": 3.2467047153145807, + "grad_norm": 0.25527215003967285, + "learning_rate": 5.943402636227749e-06, + "loss": 3.9145, + "step": 47785 + }, + { + "epoch": 3.2470444353852423, + "grad_norm": 0.3021070659160614, + "learning_rate": 5.942977986139422e-06, + "loss": 3.9702, + "step": 47790 + }, + { + "epoch": 3.2473841554559044, + "grad_norm": 0.3865714371204376, + "learning_rate": 5.9425533360510945e-06, + "loss": 3.9458, + "step": 47795 + }, + { + "epoch": 3.247723875526566, + "grad_norm": 0.32005882263183594, + "learning_rate": 5.942128685962767e-06, + "loss": 4.0803, + "step": 47800 + }, + { + "epoch": 3.2480635955972277, + "grad_norm": 0.7842370271682739, + "learning_rate": 5.94170403587444e-06, + "loss": 4.0229, + "step": 47805 + }, + { + "epoch": 3.2484033156678898, + "grad_norm": 0.30374762415885925, + "learning_rate": 5.941279385786113e-06, + "loss": 4.2069, + "step": 47810 + }, + { + "epoch": 3.2487430357385514, + "grad_norm": 0.3352510929107666, + "learning_rate": 5.940854735697785e-06, + "loss": 4.003, + "step": 47815 + }, + { + "epoch": 3.249082755809213, + "grad_norm": 0.22856800258159637, + "learning_rate": 5.9404300856094585e-06, + "loss": 4.3915, + "step": 47820 + }, + { + "epoch": 3.249422475879875, + "grad_norm": 0.36949288845062256, + "learning_rate": 5.940005435521131e-06, + "loss": 3.9398, + "step": 47825 + }, + { + "epoch": 3.2497621959505367, + "grad_norm": 0.2802063524723053, + "learning_rate": 5.939580785432803e-06, + "loss": 3.8852, + "step": 47830 + }, + { + "epoch": 3.2501019160211984, + "grad_norm": 0.48607969284057617, + "learning_rate": 5.939156135344477e-06, + "loss": 4.0678, + "step": 47835 + }, + { + "epoch": 3.2504416360918604, + "grad_norm": 0.3017584979534149, + "learning_rate": 5.93873148525615e-06, + "loss": 4.1214, + "step": 47840 + }, + { + "epoch": 3.250781356162522, + "grad_norm": 0.24220700562000275, + "learning_rate": 5.938306835167822e-06, + "loss": 4.0326, + "step": 47845 + }, + { + "epoch": 3.2511210762331837, + "grad_norm": 0.2977551519870758, + "learning_rate": 5.937882185079495e-06, + "loss": 4.2178, + "step": 47850 + }, + { + "epoch": 3.2514607963038458, + "grad_norm": 0.3031715154647827, + "learning_rate": 5.937457534991168e-06, + "loss": 4.1212, + "step": 47855 + }, + { + "epoch": 3.2518005163745074, + "grad_norm": 0.2709047794342041, + "learning_rate": 5.93703288490284e-06, + "loss": 4.0784, + "step": 47860 + }, + { + "epoch": 3.252140236445169, + "grad_norm": 0.2918964922428131, + "learning_rate": 5.936608234814514e-06, + "loss": 3.7979, + "step": 47865 + }, + { + "epoch": 3.252479956515831, + "grad_norm": 0.40077582001686096, + "learning_rate": 5.9361835847261865e-06, + "loss": 4.0082, + "step": 47870 + }, + { + "epoch": 3.2528196765864927, + "grad_norm": 0.3380339443683624, + "learning_rate": 5.9357589346378585e-06, + "loss": 4.009, + "step": 47875 + }, + { + "epoch": 3.2531593966571544, + "grad_norm": 0.538092851638794, + "learning_rate": 5.935334284549532e-06, + "loss": 4.1895, + "step": 47880 + }, + { + "epoch": 3.2534991167278164, + "grad_norm": 0.3104609549045563, + "learning_rate": 5.934909634461204e-06, + "loss": 3.9677, + "step": 47885 + }, + { + "epoch": 3.253838836798478, + "grad_norm": 0.27262094616889954, + "learning_rate": 5.934484984372877e-06, + "loss": 4.1283, + "step": 47890 + }, + { + "epoch": 3.2541785568691397, + "grad_norm": 0.38348135352134705, + "learning_rate": 5.9340603342845505e-06, + "loss": 4.0667, + "step": 47895 + }, + { + "epoch": 3.254518276939802, + "grad_norm": 0.3190543055534363, + "learning_rate": 5.9336356841962225e-06, + "loss": 4.0448, + "step": 47900 + }, + { + "epoch": 3.2548579970104634, + "grad_norm": 0.3094339966773987, + "learning_rate": 5.933211034107895e-06, + "loss": 4.0904, + "step": 47905 + }, + { + "epoch": 3.255197717081125, + "grad_norm": 0.2967469394207001, + "learning_rate": 5.932786384019569e-06, + "loss": 3.9173, + "step": 47910 + }, + { + "epoch": 3.255537437151787, + "grad_norm": 0.24838033318519592, + "learning_rate": 5.932361733931241e-06, + "loss": 3.8551, + "step": 47915 + }, + { + "epoch": 3.2558771572224487, + "grad_norm": 0.27587953209877014, + "learning_rate": 5.931937083842914e-06, + "loss": 4.326, + "step": 47920 + }, + { + "epoch": 3.2562168772931104, + "grad_norm": 0.30312013626098633, + "learning_rate": 5.931512433754587e-06, + "loss": 4.0099, + "step": 47925 + }, + { + "epoch": 3.2565565973637725, + "grad_norm": 0.4507500231266022, + "learning_rate": 5.931087783666259e-06, + "loss": 4.2085, + "step": 47930 + }, + { + "epoch": 3.256896317434434, + "grad_norm": 0.29066896438598633, + "learning_rate": 5.930663133577932e-06, + "loss": 4.2855, + "step": 47935 + }, + { + "epoch": 3.2572360375050957, + "grad_norm": 0.35309872031211853, + "learning_rate": 5.930238483489606e-06, + "loss": 3.8681, + "step": 47940 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.30962327122688293, + "learning_rate": 5.929813833401278e-06, + "loss": 3.9473, + "step": 47945 + }, + { + "epoch": 3.2579154776464194, + "grad_norm": 0.23932930827140808, + "learning_rate": 5.9293891833129505e-06, + "loss": 3.991, + "step": 47950 + }, + { + "epoch": 3.258255197717081, + "grad_norm": 0.26234933733940125, + "learning_rate": 5.928964533224624e-06, + "loss": 4.0477, + "step": 47955 + }, + { + "epoch": 3.258594917787743, + "grad_norm": 0.42338618636131287, + "learning_rate": 5.928539883136296e-06, + "loss": 4.2147, + "step": 47960 + }, + { + "epoch": 3.2589346378584048, + "grad_norm": 0.3628242015838623, + "learning_rate": 5.928115233047969e-06, + "loss": 3.9255, + "step": 47965 + }, + { + "epoch": 3.2592743579290664, + "grad_norm": 0.28050696849823, + "learning_rate": 5.927690582959642e-06, + "loss": 4.0123, + "step": 47970 + }, + { + "epoch": 3.2596140779997285, + "grad_norm": 0.4036614000797272, + "learning_rate": 5.9272659328713145e-06, + "loss": 3.9106, + "step": 47975 + }, + { + "epoch": 3.25995379807039, + "grad_norm": 0.25527679920196533, + "learning_rate": 5.926841282782987e-06, + "loss": 3.9178, + "step": 47980 + }, + { + "epoch": 3.2602935181410517, + "grad_norm": 0.2467799037694931, + "learning_rate": 5.92641663269466e-06, + "loss": 4.0296, + "step": 47985 + }, + { + "epoch": 3.2606332382117134, + "grad_norm": 0.2848129868507385, + "learning_rate": 5.925991982606333e-06, + "loss": 3.9131, + "step": 47990 + }, + { + "epoch": 3.2609729582823754, + "grad_norm": 0.41317644715309143, + "learning_rate": 5.925567332518005e-06, + "loss": 4.0263, + "step": 47995 + }, + { + "epoch": 3.261312678353037, + "grad_norm": 0.24646703898906708, + "learning_rate": 5.9251426824296785e-06, + "loss": 4.166, + "step": 48000 + }, + { + "epoch": 3.2616523984236987, + "grad_norm": 0.2687589228153229, + "learning_rate": 5.924718032341351e-06, + "loss": 3.9829, + "step": 48005 + }, + { + "epoch": 3.2619921184943608, + "grad_norm": 0.2908027768135071, + "learning_rate": 5.924293382253023e-06, + "loss": 4.1627, + "step": 48010 + }, + { + "epoch": 3.2623318385650224, + "grad_norm": 0.43088895082473755, + "learning_rate": 5.923868732164697e-06, + "loss": 4.2178, + "step": 48015 + }, + { + "epoch": 3.262671558635684, + "grad_norm": 0.3325897455215454, + "learning_rate": 5.92344408207637e-06, + "loss": 4.0308, + "step": 48020 + }, + { + "epoch": 3.263011278706346, + "grad_norm": 0.28967922925949097, + "learning_rate": 5.923019431988042e-06, + "loss": 4.2448, + "step": 48025 + }, + { + "epoch": 3.2633509987770077, + "grad_norm": 0.21907471120357513, + "learning_rate": 5.922594781899715e-06, + "loss": 4.139, + "step": 48030 + }, + { + "epoch": 3.2636907188476694, + "grad_norm": 0.27018260955810547, + "learning_rate": 5.922170131811388e-06, + "loss": 4.0709, + "step": 48035 + }, + { + "epoch": 3.2640304389183314, + "grad_norm": 0.22781725227832794, + "learning_rate": 5.92174548172306e-06, + "loss": 4.1389, + "step": 48040 + }, + { + "epoch": 3.264370158988993, + "grad_norm": 0.3692578077316284, + "learning_rate": 5.921320831634734e-06, + "loss": 4.0179, + "step": 48045 + }, + { + "epoch": 3.2647098790596547, + "grad_norm": 0.29886648058891296, + "learning_rate": 5.9208961815464065e-06, + "loss": 4.3226, + "step": 48050 + }, + { + "epoch": 3.265049599130317, + "grad_norm": 0.38111627101898193, + "learning_rate": 5.9204715314580785e-06, + "loss": 4.0102, + "step": 48055 + }, + { + "epoch": 3.2653893192009784, + "grad_norm": 0.4260112941265106, + "learning_rate": 5.920046881369752e-06, + "loss": 3.9775, + "step": 48060 + }, + { + "epoch": 3.26572903927164, + "grad_norm": 0.29897767305374146, + "learning_rate": 5.919622231281424e-06, + "loss": 4.012, + "step": 48065 + }, + { + "epoch": 3.266068759342302, + "grad_norm": 0.43545013666152954, + "learning_rate": 5.919197581193097e-06, + "loss": 3.9749, + "step": 48070 + }, + { + "epoch": 3.2664084794129638, + "grad_norm": 0.23756611347198486, + "learning_rate": 5.9187729311047705e-06, + "loss": 4.0377, + "step": 48075 + }, + { + "epoch": 3.2667481994836254, + "grad_norm": 0.309381365776062, + "learning_rate": 5.9183482810164425e-06, + "loss": 4.004, + "step": 48080 + }, + { + "epoch": 3.2670879195542875, + "grad_norm": 0.4515659809112549, + "learning_rate": 5.917923630928115e-06, + "loss": 3.9801, + "step": 48085 + }, + { + "epoch": 3.267427639624949, + "grad_norm": 0.42163464426994324, + "learning_rate": 5.917498980839789e-06, + "loss": 4.0809, + "step": 48090 + }, + { + "epoch": 3.2677673596956107, + "grad_norm": 0.31766796112060547, + "learning_rate": 5.917074330751461e-06, + "loss": 4.0082, + "step": 48095 + }, + { + "epoch": 3.2681070797662723, + "grad_norm": 0.2608053386211395, + "learning_rate": 5.9166496806631345e-06, + "loss": 4.1727, + "step": 48100 + }, + { + "epoch": 3.2684467998369344, + "grad_norm": 0.27596524357795715, + "learning_rate": 5.916225030574807e-06, + "loss": 3.9886, + "step": 48105 + }, + { + "epoch": 3.268786519907596, + "grad_norm": 0.3747252821922302, + "learning_rate": 5.915800380486479e-06, + "loss": 4.0535, + "step": 48110 + }, + { + "epoch": 3.2691262399782577, + "grad_norm": 0.2663266062736511, + "learning_rate": 5.915375730398153e-06, + "loss": 4.1207, + "step": 48115 + }, + { + "epoch": 3.2694659600489198, + "grad_norm": 0.2129511833190918, + "learning_rate": 5.914951080309826e-06, + "loss": 4.0481, + "step": 48120 + }, + { + "epoch": 3.2698056801195814, + "grad_norm": 0.666419267654419, + "learning_rate": 5.914526430221498e-06, + "loss": 4.2988, + "step": 48125 + }, + { + "epoch": 3.270145400190243, + "grad_norm": 0.3113200068473816, + "learning_rate": 5.914101780133171e-06, + "loss": 3.836, + "step": 48130 + }, + { + "epoch": 3.270485120260905, + "grad_norm": 0.36652886867523193, + "learning_rate": 5.913677130044843e-06, + "loss": 3.9731, + "step": 48135 + }, + { + "epoch": 3.2708248403315667, + "grad_norm": 0.30222275853157043, + "learning_rate": 5.913252479956516e-06, + "loss": 3.9299, + "step": 48140 + }, + { + "epoch": 3.2711645604022284, + "grad_norm": 0.3599853813648224, + "learning_rate": 5.91282782986819e-06, + "loss": 4.1133, + "step": 48145 + }, + { + "epoch": 3.2715042804728904, + "grad_norm": 0.2172679901123047, + "learning_rate": 5.912403179779862e-06, + "loss": 4.2162, + "step": 48150 + }, + { + "epoch": 3.271844000543552, + "grad_norm": 0.3298075497150421, + "learning_rate": 5.9119785296915345e-06, + "loss": 4.1329, + "step": 48155 + }, + { + "epoch": 3.2721837206142137, + "grad_norm": 0.2818569242954254, + "learning_rate": 5.911553879603208e-06, + "loss": 3.9262, + "step": 48160 + }, + { + "epoch": 3.2725234406848758, + "grad_norm": 0.38638192415237427, + "learning_rate": 5.91112922951488e-06, + "loss": 4.2137, + "step": 48165 + }, + { + "epoch": 3.2728631607555374, + "grad_norm": 0.26767203211784363, + "learning_rate": 5.910704579426553e-06, + "loss": 3.9908, + "step": 48170 + }, + { + "epoch": 3.273202880826199, + "grad_norm": 0.4739445745944977, + "learning_rate": 5.9102799293382265e-06, + "loss": 3.9307, + "step": 48175 + }, + { + "epoch": 3.273542600896861, + "grad_norm": 0.2911924719810486, + "learning_rate": 5.9098552792498985e-06, + "loss": 3.9521, + "step": 48180 + }, + { + "epoch": 3.2738823209675227, + "grad_norm": 0.4992022216320038, + "learning_rate": 5.909430629161571e-06, + "loss": 4.1038, + "step": 48185 + }, + { + "epoch": 3.2742220410381844, + "grad_norm": 0.3770259618759155, + "learning_rate": 5.909005979073245e-06, + "loss": 4.0887, + "step": 48190 + }, + { + "epoch": 3.2745617611088464, + "grad_norm": 0.32382458448410034, + "learning_rate": 5.908581328984917e-06, + "loss": 4.1215, + "step": 48195 + }, + { + "epoch": 3.274901481179508, + "grad_norm": 0.4542224705219269, + "learning_rate": 5.90815667889659e-06, + "loss": 4.0353, + "step": 48200 + }, + { + "epoch": 3.2752412012501697, + "grad_norm": 0.38104280829429626, + "learning_rate": 5.907732028808263e-06, + "loss": 4.0544, + "step": 48205 + }, + { + "epoch": 3.275580921320832, + "grad_norm": 0.36089277267456055, + "learning_rate": 5.907307378719935e-06, + "loss": 4.3164, + "step": 48210 + }, + { + "epoch": 3.2759206413914934, + "grad_norm": 0.2553955614566803, + "learning_rate": 5.906882728631608e-06, + "loss": 4.1053, + "step": 48215 + }, + { + "epoch": 3.276260361462155, + "grad_norm": 0.2601340413093567, + "learning_rate": 5.906458078543281e-06, + "loss": 4.0325, + "step": 48220 + }, + { + "epoch": 3.276600081532817, + "grad_norm": 0.2954120635986328, + "learning_rate": 5.906033428454954e-06, + "loss": 4.0935, + "step": 48225 + }, + { + "epoch": 3.2769398016034788, + "grad_norm": 0.3561007082462311, + "learning_rate": 5.905608778366626e-06, + "loss": 4.0754, + "step": 48230 + }, + { + "epoch": 3.2772795216741404, + "grad_norm": 0.36926111578941345, + "learning_rate": 5.905184128278299e-06, + "loss": 4.1502, + "step": 48235 + }, + { + "epoch": 3.2776192417448025, + "grad_norm": 0.2775425612926483, + "learning_rate": 5.904759478189972e-06, + "loss": 4.1182, + "step": 48240 + }, + { + "epoch": 3.277958961815464, + "grad_norm": 0.30058640241622925, + "learning_rate": 5.904334828101644e-06, + "loss": 4.3307, + "step": 48245 + }, + { + "epoch": 3.2782986818861257, + "grad_norm": 0.2570943832397461, + "learning_rate": 5.903910178013318e-06, + "loss": 4.062, + "step": 48250 + }, + { + "epoch": 3.278638401956788, + "grad_norm": 0.39881980419158936, + "learning_rate": 5.9034855279249905e-06, + "loss": 3.8593, + "step": 48255 + }, + { + "epoch": 3.2789781220274494, + "grad_norm": 0.5231386423110962, + "learning_rate": 5.9030608778366624e-06, + "loss": 4.0597, + "step": 48260 + }, + { + "epoch": 3.279317842098111, + "grad_norm": 0.44525644183158875, + "learning_rate": 5.902636227748336e-06, + "loss": 3.8959, + "step": 48265 + }, + { + "epoch": 3.279657562168773, + "grad_norm": 0.3088377118110657, + "learning_rate": 5.902211577660009e-06, + "loss": 3.9044, + "step": 48270 + }, + { + "epoch": 3.2799972822394348, + "grad_norm": 0.33920544385910034, + "learning_rate": 5.901786927571681e-06, + "loss": 3.9285, + "step": 48275 + }, + { + "epoch": 3.2803370023100964, + "grad_norm": 0.3551734685897827, + "learning_rate": 5.9013622774833545e-06, + "loss": 3.9596, + "step": 48280 + }, + { + "epoch": 3.2806767223807585, + "grad_norm": 0.328427255153656, + "learning_rate": 5.900937627395027e-06, + "loss": 3.9087, + "step": 48285 + }, + { + "epoch": 3.28101644245142, + "grad_norm": 0.3425564467906952, + "learning_rate": 5.900512977306699e-06, + "loss": 3.9659, + "step": 48290 + }, + { + "epoch": 3.2813561625220817, + "grad_norm": 0.23244278132915497, + "learning_rate": 5.900088327218373e-06, + "loss": 4.0951, + "step": 48295 + }, + { + "epoch": 3.281695882592744, + "grad_norm": 0.351144015789032, + "learning_rate": 5.899663677130046e-06, + "loss": 3.9019, + "step": 48300 + }, + { + "epoch": 3.2820356026634054, + "grad_norm": 0.6079340577125549, + "learning_rate": 5.899239027041718e-06, + "loss": 4.1105, + "step": 48305 + }, + { + "epoch": 3.282375322734067, + "grad_norm": 0.2786935567855835, + "learning_rate": 5.898814376953391e-06, + "loss": 4.1222, + "step": 48310 + }, + { + "epoch": 3.282715042804729, + "grad_norm": 0.41275927424430847, + "learning_rate": 5.898389726865063e-06, + "loss": 4.0452, + "step": 48315 + }, + { + "epoch": 3.2830547628753908, + "grad_norm": 0.4398971498012543, + "learning_rate": 5.897965076776736e-06, + "loss": 4.2928, + "step": 48320 + }, + { + "epoch": 3.2833944829460524, + "grad_norm": 0.4377080798149109, + "learning_rate": 5.89754042668841e-06, + "loss": 4.1678, + "step": 48325 + }, + { + "epoch": 3.283734203016714, + "grad_norm": 0.2820020914077759, + "learning_rate": 5.897115776600082e-06, + "loss": 4.0863, + "step": 48330 + }, + { + "epoch": 3.284073923087376, + "grad_norm": 0.25943395495414734, + "learning_rate": 5.8966911265117544e-06, + "loss": 3.9081, + "step": 48335 + }, + { + "epoch": 3.2844136431580377, + "grad_norm": 0.23188821971416473, + "learning_rate": 5.896266476423428e-06, + "loss": 3.9903, + "step": 48340 + }, + { + "epoch": 3.2847533632286994, + "grad_norm": 0.26532208919525146, + "learning_rate": 5.8958418263351e-06, + "loss": 3.8202, + "step": 48345 + }, + { + "epoch": 3.2850930832993614, + "grad_norm": 0.2713851034641266, + "learning_rate": 5.895417176246773e-06, + "loss": 4.1872, + "step": 48350 + }, + { + "epoch": 3.285432803370023, + "grad_norm": 0.31640955805778503, + "learning_rate": 5.8949925261584465e-06, + "loss": 3.9322, + "step": 48355 + }, + { + "epoch": 3.2857725234406847, + "grad_norm": 0.2694891691207886, + "learning_rate": 5.8945678760701185e-06, + "loss": 3.9883, + "step": 48360 + }, + { + "epoch": 3.286112243511347, + "grad_norm": 0.5694528222084045, + "learning_rate": 5.894143225981791e-06, + "loss": 4.0584, + "step": 48365 + }, + { + "epoch": 3.2864519635820084, + "grad_norm": 0.2583724558353424, + "learning_rate": 5.893718575893465e-06, + "loss": 4.0494, + "step": 48370 + }, + { + "epoch": 3.28679168365267, + "grad_norm": 0.2848065495491028, + "learning_rate": 5.893293925805137e-06, + "loss": 4.1253, + "step": 48375 + }, + { + "epoch": 3.287131403723332, + "grad_norm": 0.25390052795410156, + "learning_rate": 5.89286927571681e-06, + "loss": 4.1376, + "step": 48380 + }, + { + "epoch": 3.2874711237939938, + "grad_norm": 0.2464405596256256, + "learning_rate": 5.8924446256284825e-06, + "loss": 4.0772, + "step": 48385 + }, + { + "epoch": 3.2878108438646554, + "grad_norm": 0.7973275780677795, + "learning_rate": 5.892019975540155e-06, + "loss": 3.9557, + "step": 48390 + }, + { + "epoch": 3.2881505639353175, + "grad_norm": 0.2698051631450653, + "learning_rate": 5.891595325451828e-06, + "loss": 3.9001, + "step": 48395 + }, + { + "epoch": 3.288490284005979, + "grad_norm": 0.22637058794498444, + "learning_rate": 5.891170675363501e-06, + "loss": 3.9815, + "step": 48400 + }, + { + "epoch": 3.2888300040766407, + "grad_norm": 0.2921884059906006, + "learning_rate": 5.890746025275174e-06, + "loss": 4.2965, + "step": 48405 + }, + { + "epoch": 3.289169724147303, + "grad_norm": 0.24339447915554047, + "learning_rate": 5.890321375186846e-06, + "loss": 4.1135, + "step": 48410 + }, + { + "epoch": 3.2895094442179644, + "grad_norm": 0.31387194991111755, + "learning_rate": 5.889896725098519e-06, + "loss": 3.8931, + "step": 48415 + }, + { + "epoch": 3.289849164288626, + "grad_norm": 0.9024649858474731, + "learning_rate": 5.889472075010192e-06, + "loss": 3.8976, + "step": 48420 + }, + { + "epoch": 3.290188884359288, + "grad_norm": 0.26096203923225403, + "learning_rate": 5.889047424921864e-06, + "loss": 3.8895, + "step": 48425 + }, + { + "epoch": 3.2905286044299498, + "grad_norm": 0.2772884666919708, + "learning_rate": 5.888622774833538e-06, + "loss": 4.0798, + "step": 48430 + }, + { + "epoch": 3.2908683245006114, + "grad_norm": 0.24132102727890015, + "learning_rate": 5.8881981247452105e-06, + "loss": 3.9172, + "step": 48435 + }, + { + "epoch": 3.291208044571273, + "grad_norm": 0.3768388628959656, + "learning_rate": 5.887773474656884e-06, + "loss": 4.3156, + "step": 48440 + }, + { + "epoch": 3.291547764641935, + "grad_norm": 0.27647560834884644, + "learning_rate": 5.887348824568556e-06, + "loss": 4.2297, + "step": 48445 + }, + { + "epoch": 3.2918874847125967, + "grad_norm": 0.23813460767269135, + "learning_rate": 5.886924174480229e-06, + "loss": 4.3164, + "step": 48450 + }, + { + "epoch": 3.2922272047832584, + "grad_norm": 0.27734917402267456, + "learning_rate": 5.886499524391902e-06, + "loss": 3.8098, + "step": 48455 + }, + { + "epoch": 3.2925669248539204, + "grad_norm": 0.3735520839691162, + "learning_rate": 5.8860748743035745e-06, + "loss": 3.7883, + "step": 48460 + }, + { + "epoch": 3.292906644924582, + "grad_norm": 0.2918664813041687, + "learning_rate": 5.885650224215247e-06, + "loss": 3.9982, + "step": 48465 + }, + { + "epoch": 3.2932463649952437, + "grad_norm": 0.5686084032058716, + "learning_rate": 5.88522557412692e-06, + "loss": 3.9484, + "step": 48470 + }, + { + "epoch": 3.2935860850659058, + "grad_norm": 0.3323687016963959, + "learning_rate": 5.884800924038593e-06, + "loss": 3.8914, + "step": 48475 + }, + { + "epoch": 3.2939258051365674, + "grad_norm": 0.3598771393299103, + "learning_rate": 5.884376273950265e-06, + "loss": 4.0476, + "step": 48480 + }, + { + "epoch": 3.294265525207229, + "grad_norm": 0.27686697244644165, + "learning_rate": 5.8839516238619385e-06, + "loss": 4.0687, + "step": 48485 + }, + { + "epoch": 3.294605245277891, + "grad_norm": 0.2870088815689087, + "learning_rate": 5.883526973773611e-06, + "loss": 4.0821, + "step": 48490 + }, + { + "epoch": 3.2949449653485527, + "grad_norm": 0.29564669728279114, + "learning_rate": 5.883102323685283e-06, + "loss": 3.921, + "step": 48495 + }, + { + "epoch": 3.2952846854192144, + "grad_norm": 0.2894650399684906, + "learning_rate": 5.882677673596957e-06, + "loss": 3.9663, + "step": 48500 + }, + { + "epoch": 3.2956244054898765, + "grad_norm": 0.21810325980186462, + "learning_rate": 5.88225302350863e-06, + "loss": 4.1572, + "step": 48505 + }, + { + "epoch": 3.295964125560538, + "grad_norm": 0.323368638753891, + "learning_rate": 5.881828373420302e-06, + "loss": 3.9647, + "step": 48510 + }, + { + "epoch": 3.2963038456311997, + "grad_norm": 0.28397008776664734, + "learning_rate": 5.881403723331975e-06, + "loss": 3.9886, + "step": 48515 + }, + { + "epoch": 3.296643565701862, + "grad_norm": 0.2780940532684326, + "learning_rate": 5.880979073243648e-06, + "loss": 3.9082, + "step": 48520 + }, + { + "epoch": 3.2969832857725234, + "grad_norm": 0.2801942527294159, + "learning_rate": 5.88055442315532e-06, + "loss": 3.8823, + "step": 48525 + }, + { + "epoch": 3.297323005843185, + "grad_norm": 0.308383971452713, + "learning_rate": 5.880129773066994e-06, + "loss": 3.9522, + "step": 48530 + }, + { + "epoch": 3.297662725913847, + "grad_norm": 0.2277589738368988, + "learning_rate": 5.8797051229786665e-06, + "loss": 3.8068, + "step": 48535 + }, + { + "epoch": 3.2980024459845088, + "grad_norm": 0.4139375686645508, + "learning_rate": 5.8792804728903384e-06, + "loss": 3.8617, + "step": 48540 + }, + { + "epoch": 3.2983421660551704, + "grad_norm": 0.32531601190567017, + "learning_rate": 5.878855822802012e-06, + "loss": 4.0723, + "step": 48545 + }, + { + "epoch": 3.2986818861258325, + "grad_norm": 0.3303651809692383, + "learning_rate": 5.878431172713685e-06, + "loss": 4.0518, + "step": 48550 + }, + { + "epoch": 3.299021606196494, + "grad_norm": 0.315654456615448, + "learning_rate": 5.878006522625357e-06, + "loss": 4.1192, + "step": 48555 + }, + { + "epoch": 3.2993613262671557, + "grad_norm": 0.31341439485549927, + "learning_rate": 5.8775818725370305e-06, + "loss": 4.2055, + "step": 48560 + }, + { + "epoch": 3.299701046337818, + "grad_norm": 0.3542352616786957, + "learning_rate": 5.8771572224487024e-06, + "loss": 3.7819, + "step": 48565 + }, + { + "epoch": 3.3000407664084794, + "grad_norm": 0.261854887008667, + "learning_rate": 5.876732572360375e-06, + "loss": 4.1147, + "step": 48570 + }, + { + "epoch": 3.300380486479141, + "grad_norm": 0.31554922461509705, + "learning_rate": 5.876307922272049e-06, + "loss": 4.187, + "step": 48575 + }, + { + "epoch": 3.300720206549803, + "grad_norm": 0.21891064941883087, + "learning_rate": 5.875883272183721e-06, + "loss": 4.1389, + "step": 48580 + }, + { + "epoch": 3.3010599266204648, + "grad_norm": 0.41900935769081116, + "learning_rate": 5.875458622095394e-06, + "loss": 4.0593, + "step": 48585 + }, + { + "epoch": 3.3013996466911264, + "grad_norm": 0.3940296769142151, + "learning_rate": 5.875033972007067e-06, + "loss": 4.0202, + "step": 48590 + }, + { + "epoch": 3.3017393667617885, + "grad_norm": 0.47269096970558167, + "learning_rate": 5.874609321918739e-06, + "loss": 4.0236, + "step": 48595 + }, + { + "epoch": 3.30207908683245, + "grad_norm": 0.3278716802597046, + "learning_rate": 5.874184671830412e-06, + "loss": 4.0439, + "step": 48600 + }, + { + "epoch": 3.3024188069031117, + "grad_norm": 0.3517068326473236, + "learning_rate": 5.873760021742086e-06, + "loss": 4.0017, + "step": 48605 + }, + { + "epoch": 3.302758526973774, + "grad_norm": 0.3133341670036316, + "learning_rate": 5.873335371653758e-06, + "loss": 4.1066, + "step": 48610 + }, + { + "epoch": 3.3030982470444354, + "grad_norm": 0.3408801853656769, + "learning_rate": 5.8729107215654304e-06, + "loss": 4.2437, + "step": 48615 + }, + { + "epoch": 3.303437967115097, + "grad_norm": 0.3186608850955963, + "learning_rate": 5.872486071477104e-06, + "loss": 4.1163, + "step": 48620 + }, + { + "epoch": 3.303777687185759, + "grad_norm": 0.5826367139816284, + "learning_rate": 5.872061421388776e-06, + "loss": 3.9929, + "step": 48625 + }, + { + "epoch": 3.304117407256421, + "grad_norm": 0.3876701891422272, + "learning_rate": 5.871636771300449e-06, + "loss": 3.9346, + "step": 48630 + }, + { + "epoch": 3.3044571273270824, + "grad_norm": 0.25978538393974304, + "learning_rate": 5.871212121212122e-06, + "loss": 4.1817, + "step": 48635 + }, + { + "epoch": 3.3047968473977445, + "grad_norm": 0.3252239227294922, + "learning_rate": 5.8707874711237944e-06, + "loss": 4.0957, + "step": 48640 + }, + { + "epoch": 3.305136567468406, + "grad_norm": 0.3758350908756256, + "learning_rate": 5.870362821035467e-06, + "loss": 4.0805, + "step": 48645 + }, + { + "epoch": 3.3054762875390677, + "grad_norm": 0.2839273512363434, + "learning_rate": 5.86993817094714e-06, + "loss": 4.2037, + "step": 48650 + }, + { + "epoch": 3.30581600760973, + "grad_norm": 0.22312912344932556, + "learning_rate": 5.869513520858813e-06, + "loss": 4.0106, + "step": 48655 + }, + { + "epoch": 3.3061557276803915, + "grad_norm": 0.25193941593170166, + "learning_rate": 5.869088870770485e-06, + "loss": 4.0813, + "step": 48660 + }, + { + "epoch": 3.306495447751053, + "grad_norm": 0.3437483012676239, + "learning_rate": 5.8686642206821585e-06, + "loss": 3.8795, + "step": 48665 + }, + { + "epoch": 3.3068351678217147, + "grad_norm": 0.2962421774864197, + "learning_rate": 5.868239570593831e-06, + "loss": 4.1193, + "step": 48670 + }, + { + "epoch": 3.307174887892377, + "grad_norm": 0.26960495114326477, + "learning_rate": 5.867814920505503e-06, + "loss": 4.0248, + "step": 48675 + }, + { + "epoch": 3.3075146079630384, + "grad_norm": 0.750188946723938, + "learning_rate": 5.867390270417177e-06, + "loss": 3.7253, + "step": 48680 + }, + { + "epoch": 3.3078543280337, + "grad_norm": 0.3600695729255676, + "learning_rate": 5.86696562032885e-06, + "loss": 3.9446, + "step": 48685 + }, + { + "epoch": 3.308194048104362, + "grad_norm": 0.30295318365097046, + "learning_rate": 5.866540970240522e-06, + "loss": 3.9169, + "step": 48690 + }, + { + "epoch": 3.3085337681750238, + "grad_norm": 0.41391903162002563, + "learning_rate": 5.866116320152195e-06, + "loss": 4.0727, + "step": 48695 + }, + { + "epoch": 3.3088734882456854, + "grad_norm": 0.288821816444397, + "learning_rate": 5.865691670063868e-06, + "loss": 4.098, + "step": 48700 + }, + { + "epoch": 3.3092132083163475, + "grad_norm": 0.2799849212169647, + "learning_rate": 5.86526701997554e-06, + "loss": 3.7794, + "step": 48705 + }, + { + "epoch": 3.309552928387009, + "grad_norm": 0.265115886926651, + "learning_rate": 5.864842369887214e-06, + "loss": 4.0174, + "step": 48710 + }, + { + "epoch": 3.3098926484576707, + "grad_norm": 0.29870492219924927, + "learning_rate": 5.8644177197988865e-06, + "loss": 3.8771, + "step": 48715 + }, + { + "epoch": 3.310232368528333, + "grad_norm": 0.2304045557975769, + "learning_rate": 5.863993069710558e-06, + "loss": 3.8516, + "step": 48720 + }, + { + "epoch": 3.3105720885989944, + "grad_norm": 0.33860355615615845, + "learning_rate": 5.863568419622232e-06, + "loss": 4.2126, + "step": 48725 + }, + { + "epoch": 3.310911808669656, + "grad_norm": 0.2892429232597351, + "learning_rate": 5.863143769533904e-06, + "loss": 3.9043, + "step": 48730 + }, + { + "epoch": 3.311251528740318, + "grad_norm": 0.5555552840232849, + "learning_rate": 5.862719119445577e-06, + "loss": 4.173, + "step": 48735 + }, + { + "epoch": 3.3115912488109798, + "grad_norm": 0.2580050230026245, + "learning_rate": 5.8622944693572505e-06, + "loss": 4.1389, + "step": 48740 + }, + { + "epoch": 3.3119309688816414, + "grad_norm": 0.2774181663990021, + "learning_rate": 5.861869819268922e-06, + "loss": 3.9066, + "step": 48745 + }, + { + "epoch": 3.3122706889523035, + "grad_norm": 0.2567768394947052, + "learning_rate": 5.861445169180595e-06, + "loss": 4.1397, + "step": 48750 + }, + { + "epoch": 3.312610409022965, + "grad_norm": 0.25892728567123413, + "learning_rate": 5.861020519092269e-06, + "loss": 3.7113, + "step": 48755 + }, + { + "epoch": 3.3129501290936267, + "grad_norm": 0.252722829580307, + "learning_rate": 5.860595869003941e-06, + "loss": 3.8504, + "step": 48760 + }, + { + "epoch": 3.313289849164289, + "grad_norm": 0.39385294914245605, + "learning_rate": 5.860171218915614e-06, + "loss": 3.9717, + "step": 48765 + }, + { + "epoch": 3.3136295692349504, + "grad_norm": 0.6236310005187988, + "learning_rate": 5.859746568827287e-06, + "loss": 3.9056, + "step": 48770 + }, + { + "epoch": 3.313969289305612, + "grad_norm": 0.3683835566043854, + "learning_rate": 5.859321918738959e-06, + "loss": 4.0147, + "step": 48775 + }, + { + "epoch": 3.3143090093762737, + "grad_norm": 0.33160531520843506, + "learning_rate": 5.858897268650633e-06, + "loss": 4.1382, + "step": 48780 + }, + { + "epoch": 3.314648729446936, + "grad_norm": 0.2662808895111084, + "learning_rate": 5.858472618562306e-06, + "loss": 3.9999, + "step": 48785 + }, + { + "epoch": 3.3149884495175974, + "grad_norm": 0.2619406580924988, + "learning_rate": 5.858047968473978e-06, + "loss": 4.1343, + "step": 48790 + }, + { + "epoch": 3.315328169588259, + "grad_norm": 0.280244380235672, + "learning_rate": 5.857623318385651e-06, + "loss": 4.0995, + "step": 48795 + }, + { + "epoch": 3.315667889658921, + "grad_norm": 0.32745084166526794, + "learning_rate": 5.857198668297323e-06, + "loss": 3.9607, + "step": 48800 + }, + { + "epoch": 3.3160076097295828, + "grad_norm": 0.3293471932411194, + "learning_rate": 5.856774018208996e-06, + "loss": 4.1062, + "step": 48805 + }, + { + "epoch": 3.3163473298002444, + "grad_norm": 0.24793143570423126, + "learning_rate": 5.85634936812067e-06, + "loss": 3.9459, + "step": 48810 + }, + { + "epoch": 3.3166870498709065, + "grad_norm": 0.3380812108516693, + "learning_rate": 5.855924718032342e-06, + "loss": 4.1969, + "step": 48815 + }, + { + "epoch": 3.317026769941568, + "grad_norm": 0.3023262023925781, + "learning_rate": 5.8555000679440144e-06, + "loss": 3.9902, + "step": 48820 + }, + { + "epoch": 3.3173664900122297, + "grad_norm": 0.5365413427352905, + "learning_rate": 5.855075417855688e-06, + "loss": 4.0551, + "step": 48825 + }, + { + "epoch": 3.317706210082892, + "grad_norm": 0.32727542519569397, + "learning_rate": 5.85465076776736e-06, + "loss": 3.8649, + "step": 48830 + }, + { + "epoch": 3.3180459301535534, + "grad_norm": 0.2561405599117279, + "learning_rate": 5.854226117679033e-06, + "loss": 4.337, + "step": 48835 + }, + { + "epoch": 3.318385650224215, + "grad_norm": 0.3102800250053406, + "learning_rate": 5.8538014675907065e-06, + "loss": 4.2229, + "step": 48840 + }, + { + "epoch": 3.318725370294877, + "grad_norm": 0.2913373112678528, + "learning_rate": 5.8533768175023784e-06, + "loss": 4.1776, + "step": 48845 + }, + { + "epoch": 3.3190650903655388, + "grad_norm": 0.2943929433822632, + "learning_rate": 5.852952167414051e-06, + "loss": 3.7588, + "step": 48850 + }, + { + "epoch": 3.3194048104362004, + "grad_norm": 0.31193339824676514, + "learning_rate": 5.852527517325725e-06, + "loss": 4.1658, + "step": 48855 + }, + { + "epoch": 3.3197445305068625, + "grad_norm": 0.30158731341362, + "learning_rate": 5.852102867237397e-06, + "loss": 3.8531, + "step": 48860 + }, + { + "epoch": 3.320084250577524, + "grad_norm": 0.3404352068901062, + "learning_rate": 5.85167821714907e-06, + "loss": 3.9144, + "step": 48865 + }, + { + "epoch": 3.3204239706481857, + "grad_norm": 0.41701191663742065, + "learning_rate": 5.851253567060743e-06, + "loss": 4.0845, + "step": 48870 + }, + { + "epoch": 3.320763690718848, + "grad_norm": 0.3000173270702362, + "learning_rate": 5.850828916972415e-06, + "loss": 3.9603, + "step": 48875 + }, + { + "epoch": 3.3211034107895094, + "grad_norm": 0.44793450832366943, + "learning_rate": 5.850404266884088e-06, + "loss": 3.9651, + "step": 48880 + }, + { + "epoch": 3.321443130860171, + "grad_norm": 0.48373571038246155, + "learning_rate": 5.849979616795761e-06, + "loss": 4.0123, + "step": 48885 + }, + { + "epoch": 3.321782850930833, + "grad_norm": 0.3048192262649536, + "learning_rate": 5.849554966707434e-06, + "loss": 3.9836, + "step": 48890 + }, + { + "epoch": 3.3221225710014948, + "grad_norm": 0.2731519341468811, + "learning_rate": 5.8491303166191064e-06, + "loss": 4.1156, + "step": 48895 + }, + { + "epoch": 3.3224622910721564, + "grad_norm": 0.27263858914375305, + "learning_rate": 5.848705666530779e-06, + "loss": 4.1194, + "step": 48900 + }, + { + "epoch": 3.3228020111428185, + "grad_norm": 0.3053271174430847, + "learning_rate": 5.848281016442452e-06, + "loss": 4.0825, + "step": 48905 + }, + { + "epoch": 3.32314173121348, + "grad_norm": 0.23112119734287262, + "learning_rate": 5.847856366354124e-06, + "loss": 4.184, + "step": 48910 + }, + { + "epoch": 3.3234814512841417, + "grad_norm": 0.3195740580558777, + "learning_rate": 5.847431716265798e-06, + "loss": 4.1806, + "step": 48915 + }, + { + "epoch": 3.323821171354804, + "grad_norm": 0.6740862131118774, + "learning_rate": 5.8470070661774704e-06, + "loss": 4.1056, + "step": 48920 + }, + { + "epoch": 3.3241608914254654, + "grad_norm": 0.27427974343299866, + "learning_rate": 5.846582416089142e-06, + "loss": 4.1717, + "step": 48925 + }, + { + "epoch": 3.324500611496127, + "grad_norm": 0.4961708188056946, + "learning_rate": 5.846157766000816e-06, + "loss": 4.1633, + "step": 48930 + }, + { + "epoch": 3.324840331566789, + "grad_norm": 0.3204471170902252, + "learning_rate": 5.845733115912489e-06, + "loss": 3.997, + "step": 48935 + }, + { + "epoch": 3.325180051637451, + "grad_norm": 0.34869685769081116, + "learning_rate": 5.845308465824161e-06, + "loss": 4.1032, + "step": 48940 + }, + { + "epoch": 3.3255197717081124, + "grad_norm": 0.43987852334976196, + "learning_rate": 5.8448838157358344e-06, + "loss": 4.0704, + "step": 48945 + }, + { + "epoch": 3.3258594917787745, + "grad_norm": 0.358053058385849, + "learning_rate": 5.844459165647507e-06, + "loss": 4.1662, + "step": 48950 + }, + { + "epoch": 3.326199211849436, + "grad_norm": 0.24400851130485535, + "learning_rate": 5.844034515559179e-06, + "loss": 4.1319, + "step": 48955 + }, + { + "epoch": 3.3265389319200978, + "grad_norm": 0.24922378361225128, + "learning_rate": 5.843609865470853e-06, + "loss": 3.9617, + "step": 48960 + }, + { + "epoch": 3.32687865199076, + "grad_norm": 0.3551463782787323, + "learning_rate": 5.843185215382526e-06, + "loss": 4.2576, + "step": 48965 + }, + { + "epoch": 3.3272183720614215, + "grad_norm": 0.26700958609580994, + "learning_rate": 5.842760565294198e-06, + "loss": 3.8926, + "step": 48970 + }, + { + "epoch": 3.327558092132083, + "grad_norm": 0.3206445872783661, + "learning_rate": 5.842335915205871e-06, + "loss": 3.7062, + "step": 48975 + }, + { + "epoch": 3.327897812202745, + "grad_norm": 0.27124831080436707, + "learning_rate": 5.841911265117543e-06, + "loss": 4.0772, + "step": 48980 + }, + { + "epoch": 3.328237532273407, + "grad_norm": 0.28647279739379883, + "learning_rate": 5.841486615029216e-06, + "loss": 3.8903, + "step": 48985 + }, + { + "epoch": 3.3285772523440684, + "grad_norm": 0.48883646726608276, + "learning_rate": 5.84106196494089e-06, + "loss": 3.9505, + "step": 48990 + }, + { + "epoch": 3.3289169724147305, + "grad_norm": 0.3748396039009094, + "learning_rate": 5.840637314852562e-06, + "loss": 4.0297, + "step": 48995 + }, + { + "epoch": 3.329256692485392, + "grad_norm": 0.34053587913513184, + "learning_rate": 5.840212664764234e-06, + "loss": 4.221, + "step": 49000 + }, + { + "epoch": 3.3295964125560538, + "grad_norm": 0.39139682054519653, + "learning_rate": 5.839788014675908e-06, + "loss": 4.1211, + "step": 49005 + }, + { + "epoch": 3.3299361326267154, + "grad_norm": 0.34260016679763794, + "learning_rate": 5.83936336458758e-06, + "loss": 4.1374, + "step": 49010 + }, + { + "epoch": 3.3302758526973775, + "grad_norm": 0.3717816472053528, + "learning_rate": 5.838938714499253e-06, + "loss": 4.0504, + "step": 49015 + }, + { + "epoch": 3.330615572768039, + "grad_norm": 0.47663915157318115, + "learning_rate": 5.8385140644109265e-06, + "loss": 3.9223, + "step": 49020 + }, + { + "epoch": 3.3309552928387007, + "grad_norm": 0.3580418527126312, + "learning_rate": 5.838089414322598e-06, + "loss": 4.123, + "step": 49025 + }, + { + "epoch": 3.331295012909363, + "grad_norm": 0.30824515223503113, + "learning_rate": 5.837664764234271e-06, + "loss": 4.1367, + "step": 49030 + }, + { + "epoch": 3.3316347329800244, + "grad_norm": 0.2360360473394394, + "learning_rate": 5.837240114145945e-06, + "loss": 4.0093, + "step": 49035 + }, + { + "epoch": 3.331974453050686, + "grad_norm": 0.3630819022655487, + "learning_rate": 5.836815464057617e-06, + "loss": 4.0263, + "step": 49040 + }, + { + "epoch": 3.332314173121348, + "grad_norm": 0.25539788603782654, + "learning_rate": 5.83639081396929e-06, + "loss": 3.9639, + "step": 49045 + }, + { + "epoch": 3.3326538931920098, + "grad_norm": 0.3716889023780823, + "learning_rate": 5.835966163880962e-06, + "loss": 4.1446, + "step": 49050 + }, + { + "epoch": 3.3329936132626714, + "grad_norm": 0.27138248085975647, + "learning_rate": 5.835541513792635e-06, + "loss": 4.025, + "step": 49055 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.2604251205921173, + "learning_rate": 5.835116863704308e-06, + "loss": 4.2645, + "step": 49060 + }, + { + "epoch": 3.333673053403995, + "grad_norm": 0.3656865656375885, + "learning_rate": 5.834692213615981e-06, + "loss": 3.7472, + "step": 49065 + }, + { + "epoch": 3.3340127734746567, + "grad_norm": 0.4201490581035614, + "learning_rate": 5.834267563527654e-06, + "loss": 3.8257, + "step": 49070 + }, + { + "epoch": 3.334352493545319, + "grad_norm": 0.2558007538318634, + "learning_rate": 5.8338429134393256e-06, + "loss": 4.0109, + "step": 49075 + }, + { + "epoch": 3.3346922136159804, + "grad_norm": 0.2902700901031494, + "learning_rate": 5.833418263350999e-06, + "loss": 3.8697, + "step": 49080 + }, + { + "epoch": 3.335031933686642, + "grad_norm": 0.3329278826713562, + "learning_rate": 5.832993613262672e-06, + "loss": 4.0062, + "step": 49085 + }, + { + "epoch": 3.335371653757304, + "grad_norm": 0.34280499815940857, + "learning_rate": 5.832568963174344e-06, + "loss": 3.8541, + "step": 49090 + }, + { + "epoch": 3.335711373827966, + "grad_norm": 0.23452430963516235, + "learning_rate": 5.832144313086018e-06, + "loss": 4.3177, + "step": 49095 + }, + { + "epoch": 3.3360510938986274, + "grad_norm": 0.2632144093513489, + "learning_rate": 5.8317196629976904e-06, + "loss": 3.9816, + "step": 49100 + }, + { + "epoch": 3.3363908139692895, + "grad_norm": 0.31428688764572144, + "learning_rate": 5.831295012909362e-06, + "loss": 3.9723, + "step": 49105 + }, + { + "epoch": 3.336730534039951, + "grad_norm": 0.28372693061828613, + "learning_rate": 5.830870362821036e-06, + "loss": 4.2112, + "step": 49110 + }, + { + "epoch": 3.3370702541106128, + "grad_norm": 0.37998858094215393, + "learning_rate": 5.830445712732709e-06, + "loss": 4.3567, + "step": 49115 + }, + { + "epoch": 3.3374099741812744, + "grad_norm": 0.3106271028518677, + "learning_rate": 5.8300210626443825e-06, + "loss": 3.9416, + "step": 49120 + }, + { + "epoch": 3.3377496942519365, + "grad_norm": 0.5500901937484741, + "learning_rate": 5.8295964125560544e-06, + "loss": 4.0522, + "step": 49125 + }, + { + "epoch": 3.338089414322598, + "grad_norm": 0.27650487422943115, + "learning_rate": 5.829171762467727e-06, + "loss": 4.0219, + "step": 49130 + }, + { + "epoch": 3.3384291343932597, + "grad_norm": 0.2701644003391266, + "learning_rate": 5.8287471123794e-06, + "loss": 4.1141, + "step": 49135 + }, + { + "epoch": 3.338768854463922, + "grad_norm": 0.3922502398490906, + "learning_rate": 5.828322462291073e-06, + "loss": 4.3035, + "step": 49140 + }, + { + "epoch": 3.3391085745345834, + "grad_norm": 0.7363902926445007, + "learning_rate": 5.827897812202745e-06, + "loss": 4.1603, + "step": 49145 + }, + { + "epoch": 3.339448294605245, + "grad_norm": 0.20986801385879517, + "learning_rate": 5.8274731621144184e-06, + "loss": 3.9438, + "step": 49150 + }, + { + "epoch": 3.339788014675907, + "grad_norm": 0.4328341782093048, + "learning_rate": 5.827048512026091e-06, + "loss": 4.1339, + "step": 49155 + }, + { + "epoch": 3.3401277347465688, + "grad_norm": 0.2991235852241516, + "learning_rate": 5.826623861937763e-06, + "loss": 3.951, + "step": 49160 + }, + { + "epoch": 3.3404674548172304, + "grad_norm": NaN, + "learning_rate": 5.826284141867102e-06, + "loss": 4.1123, + "step": 49165 + }, + { + "epoch": 3.3408071748878925, + "grad_norm": 0.302746057510376, + "learning_rate": 5.825859491778775e-06, + "loss": 4.0676, + "step": 49170 + }, + { + "epoch": 3.341146894958554, + "grad_norm": 0.3075489401817322, + "learning_rate": 5.825434841690447e-06, + "loss": 4.1511, + "step": 49175 + }, + { + "epoch": 3.3414866150292157, + "grad_norm": 0.4298730194568634, + "learning_rate": 5.8250101916021205e-06, + "loss": 3.8897, + "step": 49180 + }, + { + "epoch": 3.341826335099878, + "grad_norm": 0.4662380814552307, + "learning_rate": 5.824585541513793e-06, + "loss": 3.8463, + "step": 49185 + }, + { + "epoch": 3.3421660551705394, + "grad_norm": 0.3443109393119812, + "learning_rate": 5.824160891425465e-06, + "loss": 3.9662, + "step": 49190 + }, + { + "epoch": 3.342505775241201, + "grad_norm": 0.30497801303863525, + "learning_rate": 5.823736241337139e-06, + "loss": 4.1759, + "step": 49195 + }, + { + "epoch": 3.342845495311863, + "grad_norm": 0.3073280453681946, + "learning_rate": 5.823311591248812e-06, + "loss": 3.9045, + "step": 49200 + }, + { + "epoch": 3.3431852153825248, + "grad_norm": 0.27480435371398926, + "learning_rate": 5.822886941160484e-06, + "loss": 4.1971, + "step": 49205 + }, + { + "epoch": 3.3435249354531864, + "grad_norm": 0.4186271131038666, + "learning_rate": 5.822462291072157e-06, + "loss": 4.238, + "step": 49210 + }, + { + "epoch": 3.3438646555238485, + "grad_norm": 0.3470136821269989, + "learning_rate": 5.82203764098383e-06, + "loss": 3.9941, + "step": 49215 + }, + { + "epoch": 3.34420437559451, + "grad_norm": 0.3153388500213623, + "learning_rate": 5.821612990895502e-06, + "loss": 3.9295, + "step": 49220 + }, + { + "epoch": 3.3445440956651717, + "grad_norm": 0.32270631194114685, + "learning_rate": 5.821188340807176e-06, + "loss": 4.0648, + "step": 49225 + }, + { + "epoch": 3.344883815735834, + "grad_norm": 0.28502681851387024, + "learning_rate": 5.820763690718848e-06, + "loss": 4.1769, + "step": 49230 + }, + { + "epoch": 3.3452235358064955, + "grad_norm": 0.31680986285209656, + "learning_rate": 5.8203390406305205e-06, + "loss": 3.5295, + "step": 49235 + }, + { + "epoch": 3.345563255877157, + "grad_norm": 0.5002680420875549, + "learning_rate": 5.819914390542194e-06, + "loss": 4.0769, + "step": 49240 + }, + { + "epoch": 3.345902975947819, + "grad_norm": 0.30211353302001953, + "learning_rate": 5.819489740453866e-06, + "loss": 3.9761, + "step": 49245 + }, + { + "epoch": 3.346242696018481, + "grad_norm": 0.46942514181137085, + "learning_rate": 5.819065090365539e-06, + "loss": 4.0973, + "step": 49250 + }, + { + "epoch": 3.3465824160891424, + "grad_norm": 0.4532061517238617, + "learning_rate": 5.8186404402772125e-06, + "loss": 3.9737, + "step": 49255 + }, + { + "epoch": 3.3469221361598045, + "grad_norm": 0.35940346121788025, + "learning_rate": 5.8182157901888845e-06, + "loss": 4.0985, + "step": 49260 + }, + { + "epoch": 3.347261856230466, + "grad_norm": 0.3929024338722229, + "learning_rate": 5.817791140100557e-06, + "loss": 4.2971, + "step": 49265 + }, + { + "epoch": 3.3476015763011278, + "grad_norm": 0.3364627957344055, + "learning_rate": 5.817366490012231e-06, + "loss": 4.0352, + "step": 49270 + }, + { + "epoch": 3.34794129637179, + "grad_norm": 0.47645092010498047, + "learning_rate": 5.816941839923903e-06, + "loss": 4.1413, + "step": 49275 + }, + { + "epoch": 3.3482810164424515, + "grad_norm": 0.3193295896053314, + "learning_rate": 5.816517189835576e-06, + "loss": 4.1634, + "step": 49280 + }, + { + "epoch": 3.348620736513113, + "grad_norm": 0.2325139194726944, + "learning_rate": 5.816092539747249e-06, + "loss": 3.9223, + "step": 49285 + }, + { + "epoch": 3.348960456583775, + "grad_norm": 0.3454229533672333, + "learning_rate": 5.815667889658921e-06, + "loss": 4.0479, + "step": 49290 + }, + { + "epoch": 3.349300176654437, + "grad_norm": 0.2697758376598358, + "learning_rate": 5.815243239570594e-06, + "loss": 3.997, + "step": 49295 + }, + { + "epoch": 3.3496398967250984, + "grad_norm": 0.28677093982696533, + "learning_rate": 5.814818589482267e-06, + "loss": 4.0152, + "step": 49300 + }, + { + "epoch": 3.3499796167957605, + "grad_norm": 0.4098311960697174, + "learning_rate": 5.81439393939394e-06, + "loss": 3.9945, + "step": 49305 + }, + { + "epoch": 3.350319336866422, + "grad_norm": 0.28086045384407043, + "learning_rate": 5.8139692893056125e-06, + "loss": 3.9062, + "step": 49310 + }, + { + "epoch": 3.3506590569370838, + "grad_norm": 0.23341646790504456, + "learning_rate": 5.813544639217285e-06, + "loss": 4.1221, + "step": 49315 + }, + { + "epoch": 3.350998777007746, + "grad_norm": 0.2581154406070709, + "learning_rate": 5.813119989128958e-06, + "loss": 4.0464, + "step": 49320 + }, + { + "epoch": 3.3513384970784075, + "grad_norm": 0.3258393704891205, + "learning_rate": 5.812695339040632e-06, + "loss": 3.8758, + "step": 49325 + }, + { + "epoch": 3.351678217149069, + "grad_norm": 0.2680785059928894, + "learning_rate": 5.812270688952304e-06, + "loss": 3.8481, + "step": 49330 + }, + { + "epoch": 3.352017937219731, + "grad_norm": 0.2963819205760956, + "learning_rate": 5.8118460388639765e-06, + "loss": 3.9592, + "step": 49335 + }, + { + "epoch": 3.352357657290393, + "grad_norm": 0.27159106731414795, + "learning_rate": 5.81142138877565e-06, + "loss": 3.9518, + "step": 49340 + }, + { + "epoch": 3.3526973773610544, + "grad_norm": 0.3081188201904297, + "learning_rate": 5.810996738687322e-06, + "loss": 3.8909, + "step": 49345 + }, + { + "epoch": 3.353037097431716, + "grad_norm": 0.34198734164237976, + "learning_rate": 5.810572088598995e-06, + "loss": 3.8605, + "step": 49350 + }, + { + "epoch": 3.353376817502378, + "grad_norm": 0.28934288024902344, + "learning_rate": 5.8101474385106685e-06, + "loss": 3.6238, + "step": 49355 + }, + { + "epoch": 3.35371653757304, + "grad_norm": 0.29449793696403503, + "learning_rate": 5.8097227884223405e-06, + "loss": 3.9982, + "step": 49360 + }, + { + "epoch": 3.3540562576437014, + "grad_norm": 0.3041863441467285, + "learning_rate": 5.809298138334013e-06, + "loss": 4.032, + "step": 49365 + }, + { + "epoch": 3.3543959777143635, + "grad_norm": 0.2594981789588928, + "learning_rate": 5.808873488245687e-06, + "loss": 4.1597, + "step": 49370 + }, + { + "epoch": 3.354735697785025, + "grad_norm": 0.25791555643081665, + "learning_rate": 5.808448838157359e-06, + "loss": 4.0028, + "step": 49375 + }, + { + "epoch": 3.3550754178556867, + "grad_norm": 0.2856009900569916, + "learning_rate": 5.808024188069032e-06, + "loss": 3.9037, + "step": 49380 + }, + { + "epoch": 3.355415137926349, + "grad_norm": 0.35309743881225586, + "learning_rate": 5.8075995379807045e-06, + "loss": 4.0259, + "step": 49385 + }, + { + "epoch": 3.3557548579970105, + "grad_norm": 0.25184279680252075, + "learning_rate": 5.807174887892377e-06, + "loss": 3.7821, + "step": 49390 + }, + { + "epoch": 3.356094578067672, + "grad_norm": 0.27855417132377625, + "learning_rate": 5.80675023780405e-06, + "loss": 4.2396, + "step": 49395 + }, + { + "epoch": 3.356434298138334, + "grad_norm": 0.2885570526123047, + "learning_rate": 5.806325587715723e-06, + "loss": 4.041, + "step": 49400 + }, + { + "epoch": 3.356774018208996, + "grad_norm": 0.3166770339012146, + "learning_rate": 5.805900937627396e-06, + "loss": 3.8866, + "step": 49405 + }, + { + "epoch": 3.3571137382796574, + "grad_norm": 0.2921448051929474, + "learning_rate": 5.805476287539068e-06, + "loss": 4.3495, + "step": 49410 + }, + { + "epoch": 3.3574534583503195, + "grad_norm": 0.2258954644203186, + "learning_rate": 5.805051637450741e-06, + "loss": 4.0906, + "step": 49415 + }, + { + "epoch": 3.357793178420981, + "grad_norm": 0.3758501410484314, + "learning_rate": 5.804626987362414e-06, + "loss": 4.0676, + "step": 49420 + }, + { + "epoch": 3.3581328984916428, + "grad_norm": 0.30045104026794434, + "learning_rate": 5.804202337274086e-06, + "loss": 3.9896, + "step": 49425 + }, + { + "epoch": 3.358472618562305, + "grad_norm": 0.24537909030914307, + "learning_rate": 5.80377768718576e-06, + "loss": 4.062, + "step": 49430 + }, + { + "epoch": 3.3588123386329665, + "grad_norm": 0.2840781509876251, + "learning_rate": 5.8033530370974325e-06, + "loss": 3.9326, + "step": 49435 + }, + { + "epoch": 3.359152058703628, + "grad_norm": 0.3450201451778412, + "learning_rate": 5.8029283870091045e-06, + "loss": 3.8227, + "step": 49440 + }, + { + "epoch": 3.35949177877429, + "grad_norm": 0.26058077812194824, + "learning_rate": 5.802503736920778e-06, + "loss": 3.8933, + "step": 49445 + }, + { + "epoch": 3.359831498844952, + "grad_norm": 0.32814666628837585, + "learning_rate": 5.802079086832451e-06, + "loss": 3.8307, + "step": 49450 + }, + { + "epoch": 3.3601712189156134, + "grad_norm": 0.3112383782863617, + "learning_rate": 5.801654436744123e-06, + "loss": 4.3037, + "step": 49455 + }, + { + "epoch": 3.360510938986275, + "grad_norm": 0.3673069179058075, + "learning_rate": 5.8012297866557965e-06, + "loss": 4.1833, + "step": 49460 + }, + { + "epoch": 3.360850659056937, + "grad_norm": 0.3933025002479553, + "learning_rate": 5.800805136567469e-06, + "loss": 3.8915, + "step": 49465 + }, + { + "epoch": 3.3611903791275988, + "grad_norm": 0.3118894398212433, + "learning_rate": 5.800380486479141e-06, + "loss": 4.1283, + "step": 49470 + }, + { + "epoch": 3.3615300991982604, + "grad_norm": 0.3445534110069275, + "learning_rate": 5.799955836390815e-06, + "loss": 3.9795, + "step": 49475 + }, + { + "epoch": 3.3618698192689225, + "grad_norm": 0.28083881735801697, + "learning_rate": 5.799531186302487e-06, + "loss": 3.8525, + "step": 49480 + }, + { + "epoch": 3.362209539339584, + "grad_norm": 0.2909643054008484, + "learning_rate": 5.79910653621416e-06, + "loss": 4.0059, + "step": 49485 + }, + { + "epoch": 3.3625492594102457, + "grad_norm": 0.33581000566482544, + "learning_rate": 5.798681886125833e-06, + "loss": 4.1893, + "step": 49490 + }, + { + "epoch": 3.362888979480908, + "grad_norm": 0.3221350908279419, + "learning_rate": 5.798257236037505e-06, + "loss": 4.1756, + "step": 49495 + }, + { + "epoch": 3.3632286995515694, + "grad_norm": 0.2871928811073303, + "learning_rate": 5.797832585949178e-06, + "loss": 3.896, + "step": 49500 + }, + { + "epoch": 3.363568419622231, + "grad_norm": 0.27062299847602844, + "learning_rate": 5.797407935860852e-06, + "loss": 3.8049, + "step": 49505 + }, + { + "epoch": 3.363908139692893, + "grad_norm": 0.31927716732025146, + "learning_rate": 5.796983285772524e-06, + "loss": 4.023, + "step": 49510 + }, + { + "epoch": 3.364247859763555, + "grad_norm": 0.3061186373233795, + "learning_rate": 5.7965586356841965e-06, + "loss": 4.2427, + "step": 49515 + }, + { + "epoch": 3.3645875798342164, + "grad_norm": 0.427229106426239, + "learning_rate": 5.79613398559587e-06, + "loss": 4.345, + "step": 49520 + }, + { + "epoch": 3.3649272999048785, + "grad_norm": 0.38682442903518677, + "learning_rate": 5.795709335507542e-06, + "loss": 3.973, + "step": 49525 + }, + { + "epoch": 3.36526701997554, + "grad_norm": 0.311367392539978, + "learning_rate": 5.795284685419215e-06, + "loss": 4.1099, + "step": 49530 + }, + { + "epoch": 3.3656067400462018, + "grad_norm": 0.25745657086372375, + "learning_rate": 5.7948600353308885e-06, + "loss": 4.2954, + "step": 49535 + }, + { + "epoch": 3.365946460116864, + "grad_norm": 0.2724379003047943, + "learning_rate": 5.7944353852425605e-06, + "loss": 4.1756, + "step": 49540 + }, + { + "epoch": 3.3662861801875255, + "grad_norm": 0.6098592281341553, + "learning_rate": 5.794010735154233e-06, + "loss": 4.0639, + "step": 49545 + }, + { + "epoch": 3.366625900258187, + "grad_norm": 0.2255047708749771, + "learning_rate": 5.793586085065906e-06, + "loss": 4.0821, + "step": 49550 + }, + { + "epoch": 3.366965620328849, + "grad_norm": 0.36313438415527344, + "learning_rate": 5.793161434977579e-06, + "loss": 4.0826, + "step": 49555 + }, + { + "epoch": 3.367305340399511, + "grad_norm": 0.3098818063735962, + "learning_rate": 5.792736784889252e-06, + "loss": 3.9455, + "step": 49560 + }, + { + "epoch": 3.3676450604701724, + "grad_norm": 0.4830585718154907, + "learning_rate": 5.7923121348009245e-06, + "loss": 4.0553, + "step": 49565 + }, + { + "epoch": 3.3679847805408345, + "grad_norm": 0.35559988021850586, + "learning_rate": 5.791887484712597e-06, + "loss": 4.0833, + "step": 49570 + }, + { + "epoch": 3.368324500611496, + "grad_norm": 0.288711279630661, + "learning_rate": 5.791462834624269e-06, + "loss": 4.1356, + "step": 49575 + }, + { + "epoch": 3.3686642206821578, + "grad_norm": 0.41589513421058655, + "learning_rate": 5.791038184535943e-06, + "loss": 4.4493, + "step": 49580 + }, + { + "epoch": 3.36900394075282, + "grad_norm": 0.45195072889328003, + "learning_rate": 5.790613534447616e-06, + "loss": 3.8801, + "step": 49585 + }, + { + "epoch": 3.3693436608234815, + "grad_norm": 0.22553063929080963, + "learning_rate": 5.790188884359288e-06, + "loss": 3.8292, + "step": 49590 + }, + { + "epoch": 3.369683380894143, + "grad_norm": 0.2843762934207916, + "learning_rate": 5.789764234270961e-06, + "loss": 4.0254, + "step": 49595 + }, + { + "epoch": 3.370023100964805, + "grad_norm": 0.36058005690574646, + "learning_rate": 5.789339584182634e-06, + "loss": 4.0314, + "step": 49600 + }, + { + "epoch": 3.370362821035467, + "grad_norm": 0.24768976867198944, + "learning_rate": 5.788914934094306e-06, + "loss": 4.2056, + "step": 49605 + }, + { + "epoch": 3.3707025411061284, + "grad_norm": 0.27258041501045227, + "learning_rate": 5.78849028400598e-06, + "loss": 3.9363, + "step": 49610 + }, + { + "epoch": 3.3710422611767905, + "grad_norm": 0.2757612466812134, + "learning_rate": 5.7880656339176525e-06, + "loss": 3.8691, + "step": 49615 + }, + { + "epoch": 3.371381981247452, + "grad_norm": 0.3652002513408661, + "learning_rate": 5.7876409838293244e-06, + "loss": 4.1341, + "step": 49620 + }, + { + "epoch": 3.3717217013181138, + "grad_norm": 0.49607306718826294, + "learning_rate": 5.787216333740998e-06, + "loss": 4.0499, + "step": 49625 + }, + { + "epoch": 3.372061421388776, + "grad_norm": 0.29398706555366516, + "learning_rate": 5.786791683652671e-06, + "loss": 4.0914, + "step": 49630 + }, + { + "epoch": 3.3724011414594375, + "grad_norm": 0.2936544418334961, + "learning_rate": 5.786367033564343e-06, + "loss": 3.9485, + "step": 49635 + }, + { + "epoch": 3.372740861530099, + "grad_norm": 0.2652987837791443, + "learning_rate": 5.7859423834760165e-06, + "loss": 3.9988, + "step": 49640 + }, + { + "epoch": 3.373080581600761, + "grad_norm": 0.3365686535835266, + "learning_rate": 5.7855177333876884e-06, + "loss": 4.0458, + "step": 49645 + }, + { + "epoch": 3.373420301671423, + "grad_norm": 0.2142159789800644, + "learning_rate": 5.785093083299361e-06, + "loss": 3.9953, + "step": 49650 + }, + { + "epoch": 3.3737600217420844, + "grad_norm": 0.3154142200946808, + "learning_rate": 5.784668433211035e-06, + "loss": 4.0414, + "step": 49655 + }, + { + "epoch": 3.3740997418127465, + "grad_norm": 0.3708847761154175, + "learning_rate": 5.784243783122707e-06, + "loss": 4.0649, + "step": 49660 + }, + { + "epoch": 3.374439461883408, + "grad_norm": 0.264597624540329, + "learning_rate": 5.7838191330343805e-06, + "loss": 4.0668, + "step": 49665 + }, + { + "epoch": 3.37477918195407, + "grad_norm": 0.4278242290019989, + "learning_rate": 5.783394482946053e-06, + "loss": 4.142, + "step": 49670 + }, + { + "epoch": 3.375118902024732, + "grad_norm": 0.2909790873527527, + "learning_rate": 5.782969832857725e-06, + "loss": 4.018, + "step": 49675 + }, + { + "epoch": 3.3754586220953935, + "grad_norm": 0.291695237159729, + "learning_rate": 5.782545182769399e-06, + "loss": 3.9921, + "step": 49680 + }, + { + "epoch": 3.375798342166055, + "grad_norm": 0.2735567092895508, + "learning_rate": 5.782120532681072e-06, + "loss": 3.9532, + "step": 49685 + }, + { + "epoch": 3.3761380622367168, + "grad_norm": 0.3820851445198059, + "learning_rate": 5.781695882592744e-06, + "loss": 4.2374, + "step": 49690 + }, + { + "epoch": 3.376477782307379, + "grad_norm": 0.326675683259964, + "learning_rate": 5.781271232504417e-06, + "loss": 4.1617, + "step": 49695 + }, + { + "epoch": 3.3768175023780405, + "grad_norm": 0.2548367381095886, + "learning_rate": 5.78084658241609e-06, + "loss": 3.9095, + "step": 49700 + }, + { + "epoch": 3.377157222448702, + "grad_norm": 0.42928192019462585, + "learning_rate": 5.780421932327762e-06, + "loss": 3.8334, + "step": 49705 + }, + { + "epoch": 3.377496942519364, + "grad_norm": 0.41699737310409546, + "learning_rate": 5.779997282239436e-06, + "loss": 4.1493, + "step": 49710 + }, + { + "epoch": 3.377836662590026, + "grad_norm": 0.24956318736076355, + "learning_rate": 5.7795726321511085e-06, + "loss": 3.8193, + "step": 49715 + }, + { + "epoch": 3.3781763826606874, + "grad_norm": 0.2644082009792328, + "learning_rate": 5.7791479820627805e-06, + "loss": 4.1338, + "step": 49720 + }, + { + "epoch": 3.3785161027313495, + "grad_norm": 0.326916366815567, + "learning_rate": 5.778723331974454e-06, + "loss": 3.8562, + "step": 49725 + }, + { + "epoch": 3.378855822802011, + "grad_norm": 0.2606109380722046, + "learning_rate": 5.778298681886126e-06, + "loss": 4.0431, + "step": 49730 + }, + { + "epoch": 3.3791955428726728, + "grad_norm": 0.2671799659729004, + "learning_rate": 5.777874031797799e-06, + "loss": 4.1386, + "step": 49735 + }, + { + "epoch": 3.379535262943335, + "grad_norm": 0.31563669443130493, + "learning_rate": 5.7774493817094725e-06, + "loss": 4.0726, + "step": 49740 + }, + { + "epoch": 3.3798749830139965, + "grad_norm": 0.3676195740699768, + "learning_rate": 5.7770247316211445e-06, + "loss": 4.0162, + "step": 49745 + }, + { + "epoch": 3.380214703084658, + "grad_norm": 0.26362621784210205, + "learning_rate": 5.776600081532817e-06, + "loss": 3.8954, + "step": 49750 + }, + { + "epoch": 3.38055442315532, + "grad_norm": 0.30669310688972473, + "learning_rate": 5.776175431444491e-06, + "loss": 3.9598, + "step": 49755 + }, + { + "epoch": 3.380894143225982, + "grad_norm": 0.5126946568489075, + "learning_rate": 5.775750781356163e-06, + "loss": 3.9512, + "step": 49760 + }, + { + "epoch": 3.3812338632966434, + "grad_norm": 0.3639022707939148, + "learning_rate": 5.775326131267836e-06, + "loss": 4.0084, + "step": 49765 + }, + { + "epoch": 3.3815735833673055, + "grad_norm": 0.41495174169540405, + "learning_rate": 5.774901481179509e-06, + "loss": 3.8724, + "step": 49770 + }, + { + "epoch": 3.381913303437967, + "grad_norm": 0.2849728763103485, + "learning_rate": 5.774476831091181e-06, + "loss": 4.0721, + "step": 49775 + }, + { + "epoch": 3.3822530235086288, + "grad_norm": 0.2961200773715973, + "learning_rate": 5.774052181002854e-06, + "loss": 4.2138, + "step": 49780 + }, + { + "epoch": 3.382592743579291, + "grad_norm": 0.2233978658914566, + "learning_rate": 5.773627530914528e-06, + "loss": 4.2637, + "step": 49785 + }, + { + "epoch": 3.3829324636499525, + "grad_norm": 0.2691737115383148, + "learning_rate": 5.7732028808262e-06, + "loss": 4.1646, + "step": 49790 + }, + { + "epoch": 3.383272183720614, + "grad_norm": 0.2904474139213562, + "learning_rate": 5.7727782307378725e-06, + "loss": 4.062, + "step": 49795 + }, + { + "epoch": 3.3836119037912757, + "grad_norm": 0.32670462131500244, + "learning_rate": 5.772353580649545e-06, + "loss": 3.9597, + "step": 49800 + }, + { + "epoch": 3.383951623861938, + "grad_norm": 0.24486733973026276, + "learning_rate": 5.771928930561218e-06, + "loss": 3.6609, + "step": 49805 + }, + { + "epoch": 3.3842913439325994, + "grad_norm": 0.2583079934120178, + "learning_rate": 5.771504280472891e-06, + "loss": 4.0963, + "step": 49810 + }, + { + "epoch": 3.384631064003261, + "grad_norm": 0.26040035486221313, + "learning_rate": 5.771079630384564e-06, + "loss": 4.0345, + "step": 49815 + }, + { + "epoch": 3.384970784073923, + "grad_norm": 0.3137873113155365, + "learning_rate": 5.7706549802962365e-06, + "loss": 3.963, + "step": 49820 + }, + { + "epoch": 3.385310504144585, + "grad_norm": 0.27895739674568176, + "learning_rate": 5.770230330207908e-06, + "loss": 3.9487, + "step": 49825 + }, + { + "epoch": 3.3856502242152464, + "grad_norm": 0.2933403551578522, + "learning_rate": 5.769805680119582e-06, + "loss": 4.2084, + "step": 49830 + }, + { + "epoch": 3.3859899442859085, + "grad_norm": 0.42277956008911133, + "learning_rate": 5.769381030031255e-06, + "loss": 4.3256, + "step": 49835 + }, + { + "epoch": 3.38632966435657, + "grad_norm": 0.3687208294868469, + "learning_rate": 5.768956379942927e-06, + "loss": 3.9271, + "step": 49840 + }, + { + "epoch": 3.3866693844272318, + "grad_norm": 0.46295762062072754, + "learning_rate": 5.7685317298546005e-06, + "loss": 3.9421, + "step": 49845 + }, + { + "epoch": 3.387009104497894, + "grad_norm": 0.26642096042633057, + "learning_rate": 5.768107079766273e-06, + "loss": 4.0762, + "step": 49850 + }, + { + "epoch": 3.3873488245685555, + "grad_norm": 0.2895353138446808, + "learning_rate": 5.767682429677945e-06, + "loss": 4.0419, + "step": 49855 + }, + { + "epoch": 3.387688544639217, + "grad_norm": 0.35539817810058594, + "learning_rate": 5.767257779589619e-06, + "loss": 4.0288, + "step": 49860 + }, + { + "epoch": 3.388028264709879, + "grad_norm": 0.2786482870578766, + "learning_rate": 5.766833129501292e-06, + "loss": 4.0873, + "step": 49865 + }, + { + "epoch": 3.388367984780541, + "grad_norm": 0.2862595021724701, + "learning_rate": 5.766408479412964e-06, + "loss": 3.9616, + "step": 49870 + }, + { + "epoch": 3.3887077048512024, + "grad_norm": 0.5261958837509155, + "learning_rate": 5.765983829324637e-06, + "loss": 4.0774, + "step": 49875 + }, + { + "epoch": 3.3890474249218645, + "grad_norm": 0.26770490407943726, + "learning_rate": 5.76555917923631e-06, + "loss": 4.048, + "step": 49880 + }, + { + "epoch": 3.389387144992526, + "grad_norm": 0.34076544642448425, + "learning_rate": 5.765134529147982e-06, + "loss": 3.8031, + "step": 49885 + }, + { + "epoch": 3.3897268650631878, + "grad_norm": 0.3427274227142334, + "learning_rate": 5.764709879059656e-06, + "loss": 3.8804, + "step": 49890 + }, + { + "epoch": 3.39006658513385, + "grad_norm": 0.3003857433795929, + "learning_rate": 5.764285228971328e-06, + "loss": 4.0382, + "step": 49895 + }, + { + "epoch": 3.3904063052045115, + "grad_norm": 0.3213597238063812, + "learning_rate": 5.7638605788830004e-06, + "loss": 4.2775, + "step": 49900 + }, + { + "epoch": 3.390746025275173, + "grad_norm": 0.34288865327835083, + "learning_rate": 5.763435928794674e-06, + "loss": 4.0005, + "step": 49905 + }, + { + "epoch": 3.391085745345835, + "grad_norm": 0.2857915759086609, + "learning_rate": 5.763011278706346e-06, + "loss": 4.056, + "step": 49910 + }, + { + "epoch": 3.391425465416497, + "grad_norm": 0.9582070708274841, + "learning_rate": 5.762586628618019e-06, + "loss": 4.0121, + "step": 49915 + }, + { + "epoch": 3.3917651854871584, + "grad_norm": 0.35124924778938293, + "learning_rate": 5.7621619785296925e-06, + "loss": 4.0047, + "step": 49920 + }, + { + "epoch": 3.3921049055578205, + "grad_norm": 0.3114834427833557, + "learning_rate": 5.7617373284413644e-06, + "loss": 4.1728, + "step": 49925 + }, + { + "epoch": 3.392444625628482, + "grad_norm": 0.24048225581645966, + "learning_rate": 5.761312678353037e-06, + "loss": 4.0718, + "step": 49930 + }, + { + "epoch": 3.3927843456991438, + "grad_norm": 0.2879497706890106, + "learning_rate": 5.760888028264711e-06, + "loss": 4.1299, + "step": 49935 + }, + { + "epoch": 3.393124065769806, + "grad_norm": 0.5126955509185791, + "learning_rate": 5.760463378176383e-06, + "loss": 3.9509, + "step": 49940 + }, + { + "epoch": 3.3934637858404675, + "grad_norm": 0.2762451767921448, + "learning_rate": 5.760038728088056e-06, + "loss": 4.1541, + "step": 49945 + }, + { + "epoch": 3.393803505911129, + "grad_norm": 0.33361464738845825, + "learning_rate": 5.759614077999729e-06, + "loss": 4.079, + "step": 49950 + }, + { + "epoch": 3.394143225981791, + "grad_norm": 0.22472944855690002, + "learning_rate": 5.759189427911401e-06, + "loss": 4.0807, + "step": 49955 + }, + { + "epoch": 3.394482946052453, + "grad_norm": 0.2266189008951187, + "learning_rate": 5.758764777823074e-06, + "loss": 3.9331, + "step": 49960 + }, + { + "epoch": 3.3948226661231145, + "grad_norm": 0.5355797410011292, + "learning_rate": 5.758340127734748e-06, + "loss": 4.1016, + "step": 49965 + }, + { + "epoch": 3.3951623861937765, + "grad_norm": 0.2652139961719513, + "learning_rate": 5.75791547764642e-06, + "loss": 3.9692, + "step": 49970 + }, + { + "epoch": 3.395502106264438, + "grad_norm": 0.21829281747341156, + "learning_rate": 5.7574908275580924e-06, + "loss": 4.1438, + "step": 49975 + }, + { + "epoch": 3.3958418263351, + "grad_norm": 0.2755884528160095, + "learning_rate": 5.757066177469765e-06, + "loss": 4.0204, + "step": 49980 + }, + { + "epoch": 3.396181546405762, + "grad_norm": 0.2275608777999878, + "learning_rate": 5.756641527381438e-06, + "loss": 3.9841, + "step": 49985 + }, + { + "epoch": 3.3965212664764235, + "grad_norm": 0.323946475982666, + "learning_rate": 5.75621687729311e-06, + "loss": 4.0223, + "step": 49990 + }, + { + "epoch": 3.396860986547085, + "grad_norm": 0.22395983338356018, + "learning_rate": 5.755792227204784e-06, + "loss": 3.7128, + "step": 49995 + }, + { + "epoch": 3.397200706617747, + "grad_norm": 0.3150942325592041, + "learning_rate": 5.7553675771164564e-06, + "loss": 3.9552, + "step": 50000 + }, + { + "epoch": 3.397540426688409, + "grad_norm": 0.3048819601535797, + "learning_rate": 5.75494292702813e-06, + "loss": 4.0854, + "step": 50005 + }, + { + "epoch": 3.3978801467590705, + "grad_norm": 0.3486061990261078, + "learning_rate": 5.754518276939802e-06, + "loss": 4.1077, + "step": 50010 + }, + { + "epoch": 3.3982198668297325, + "grad_norm": 0.3210917115211487, + "learning_rate": 5.754093626851475e-06, + "loss": 4.0535, + "step": 50015 + }, + { + "epoch": 3.398559586900394, + "grad_norm": 0.29433754086494446, + "learning_rate": 5.7536689767631485e-06, + "loss": 3.9816, + "step": 50020 + }, + { + "epoch": 3.398899306971056, + "grad_norm": 0.28897571563720703, + "learning_rate": 5.7532443266748205e-06, + "loss": 3.9816, + "step": 50025 + }, + { + "epoch": 3.3992390270417174, + "grad_norm": 0.3000413775444031, + "learning_rate": 5.752819676586493e-06, + "loss": 3.7744, + "step": 50030 + }, + { + "epoch": 3.3995787471123795, + "grad_norm": 0.30204150080680847, + "learning_rate": 5.752395026498167e-06, + "loss": 4.0775, + "step": 50035 + }, + { + "epoch": 3.399918467183041, + "grad_norm": 0.30830809473991394, + "learning_rate": 5.751970376409839e-06, + "loss": 4.1726, + "step": 50040 + }, + { + "epoch": 3.4002581872537028, + "grad_norm": 0.5264266133308411, + "learning_rate": 5.751545726321512e-06, + "loss": 3.9693, + "step": 50045 + }, + { + "epoch": 3.400597907324365, + "grad_norm": 0.27183979749679565, + "learning_rate": 5.7511210762331845e-06, + "loss": 4.1232, + "step": 50050 + }, + { + "epoch": 3.4009376273950265, + "grad_norm": 0.3374366760253906, + "learning_rate": 5.750696426144857e-06, + "loss": 3.9438, + "step": 50055 + }, + { + "epoch": 3.401277347465688, + "grad_norm": 0.29805949330329895, + "learning_rate": 5.75027177605653e-06, + "loss": 3.8041, + "step": 50060 + }, + { + "epoch": 3.40161706753635, + "grad_norm": 0.4331541359424591, + "learning_rate": 5.749847125968203e-06, + "loss": 4.0688, + "step": 50065 + }, + { + "epoch": 3.401956787607012, + "grad_norm": 0.30009570717811584, + "learning_rate": 5.749422475879876e-06, + "loss": 4.0989, + "step": 50070 + }, + { + "epoch": 3.4022965076776734, + "grad_norm": 0.2998616099357605, + "learning_rate": 5.748997825791548e-06, + "loss": 4.1088, + "step": 50075 + }, + { + "epoch": 3.4026362277483355, + "grad_norm": 0.36682775616645813, + "learning_rate": 5.748573175703221e-06, + "loss": 3.8851, + "step": 50080 + }, + { + "epoch": 3.402975947818997, + "grad_norm": 0.24856485426425934, + "learning_rate": 5.748148525614894e-06, + "loss": 4.1585, + "step": 50085 + }, + { + "epoch": 3.403315667889659, + "grad_norm": 0.3053820729255676, + "learning_rate": 5.747723875526566e-06, + "loss": 4.0521, + "step": 50090 + }, + { + "epoch": 3.403655387960321, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.74729922543824e-06, + "loss": 4.2048, + "step": 50095 + }, + { + "epoch": 3.4039951080309825, + "grad_norm": 0.23905649781227112, + "learning_rate": 5.7468745753499125e-06, + "loss": 3.9631, + "step": 50100 + }, + { + "epoch": 3.404334828101644, + "grad_norm": 0.3559301197528839, + "learning_rate": 5.746449925261584e-06, + "loss": 4.0561, + "step": 50105 + }, + { + "epoch": 3.404674548172306, + "grad_norm": 0.24546809494495392, + "learning_rate": 5.746025275173258e-06, + "loss": 4.0852, + "step": 50110 + }, + { + "epoch": 3.405014268242968, + "grad_norm": 0.28223729133605957, + "learning_rate": 5.745600625084931e-06, + "loss": 4.0626, + "step": 50115 + }, + { + "epoch": 3.4053539883136295, + "grad_norm": 0.29206112027168274, + "learning_rate": 5.745175974996603e-06, + "loss": 4.1275, + "step": 50120 + }, + { + "epoch": 3.4056937083842915, + "grad_norm": 0.3563416302204132, + "learning_rate": 5.7447513249082765e-06, + "loss": 4.171, + "step": 50125 + }, + { + "epoch": 3.406033428454953, + "grad_norm": 0.25415751338005066, + "learning_rate": 5.744326674819949e-06, + "loss": 4.1912, + "step": 50130 + }, + { + "epoch": 3.406373148525615, + "grad_norm": 0.2478543519973755, + "learning_rate": 5.743902024731621e-06, + "loss": 3.8877, + "step": 50135 + }, + { + "epoch": 3.4067128685962764, + "grad_norm": 0.31974348425865173, + "learning_rate": 5.743477374643295e-06, + "loss": 4.0748, + "step": 50140 + }, + { + "epoch": 3.4070525886669385, + "grad_norm": 0.27180492877960205, + "learning_rate": 5.743052724554967e-06, + "loss": 3.9628, + "step": 50145 + }, + { + "epoch": 3.4073923087376, + "grad_norm": 0.43642741441726685, + "learning_rate": 5.74262807446664e-06, + "loss": 3.9125, + "step": 50150 + }, + { + "epoch": 3.4077320288082618, + "grad_norm": 0.3064809739589691, + "learning_rate": 5.742203424378313e-06, + "loss": 4.0877, + "step": 50155 + }, + { + "epoch": 3.408071748878924, + "grad_norm": 0.25625303387641907, + "learning_rate": 5.741778774289985e-06, + "loss": 4.0735, + "step": 50160 + }, + { + "epoch": 3.4084114689495855, + "grad_norm": 0.2545959949493408, + "learning_rate": 5.741354124201658e-06, + "loss": 4.1638, + "step": 50165 + }, + { + "epoch": 3.408751189020247, + "grad_norm": 0.4601030647754669, + "learning_rate": 5.740929474113332e-06, + "loss": 3.9111, + "step": 50170 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.23212260007858276, + "learning_rate": 5.740504824025004e-06, + "loss": 3.7993, + "step": 50175 + }, + { + "epoch": 3.409430629161571, + "grad_norm": 0.4517466723918915, + "learning_rate": 5.7400801739366764e-06, + "loss": 4.0958, + "step": 50180 + }, + { + "epoch": 3.4097703492322324, + "grad_norm": 0.375510573387146, + "learning_rate": 5.73965552384835e-06, + "loss": 3.9539, + "step": 50185 + }, + { + "epoch": 3.4101100693028945, + "grad_norm": 0.32319843769073486, + "learning_rate": 5.739230873760022e-06, + "loss": 3.926, + "step": 50190 + }, + { + "epoch": 3.410449789373556, + "grad_norm": 0.3014265298843384, + "learning_rate": 5.738806223671695e-06, + "loss": 3.6996, + "step": 50195 + }, + { + "epoch": 3.4107895094442178, + "grad_norm": 0.39693236351013184, + "learning_rate": 5.7383815735833685e-06, + "loss": 4.276, + "step": 50200 + }, + { + "epoch": 3.41112922951488, + "grad_norm": 0.33890143036842346, + "learning_rate": 5.7379569234950404e-06, + "loss": 4.048, + "step": 50205 + }, + { + "epoch": 3.4114689495855415, + "grad_norm": 0.29113566875457764, + "learning_rate": 5.737532273406713e-06, + "loss": 3.7552, + "step": 50210 + }, + { + "epoch": 3.411808669656203, + "grad_norm": 0.33550888299942017, + "learning_rate": 5.737107623318386e-06, + "loss": 4.2679, + "step": 50215 + }, + { + "epoch": 3.412148389726865, + "grad_norm": 0.29679369926452637, + "learning_rate": 5.736682973230059e-06, + "loss": 4.0289, + "step": 50220 + }, + { + "epoch": 3.412488109797527, + "grad_norm": 0.3140623867511749, + "learning_rate": 5.736258323141732e-06, + "loss": 3.7608, + "step": 50225 + }, + { + "epoch": 3.4128278298681884, + "grad_norm": 0.37629640102386475, + "learning_rate": 5.7358336730534044e-06, + "loss": 3.9419, + "step": 50230 + }, + { + "epoch": 3.4131675499388505, + "grad_norm": 0.44006043672561646, + "learning_rate": 5.735409022965077e-06, + "loss": 3.8978, + "step": 50235 + }, + { + "epoch": 3.413507270009512, + "grad_norm": 0.23007243871688843, + "learning_rate": 5.734984372876749e-06, + "loss": 4.1027, + "step": 50240 + }, + { + "epoch": 3.413846990080174, + "grad_norm": 0.30914705991744995, + "learning_rate": 5.734559722788423e-06, + "loss": 3.888, + "step": 50245 + }, + { + "epoch": 3.414186710150836, + "grad_norm": 0.2895198166370392, + "learning_rate": 5.734135072700096e-06, + "loss": 4.1885, + "step": 50250 + }, + { + "epoch": 3.4145264302214975, + "grad_norm": 0.34291014075279236, + "learning_rate": 5.733710422611768e-06, + "loss": 3.9652, + "step": 50255 + }, + { + "epoch": 3.414866150292159, + "grad_norm": 0.3352106213569641, + "learning_rate": 5.733285772523441e-06, + "loss": 4.1327, + "step": 50260 + }, + { + "epoch": 3.415205870362821, + "grad_norm": 0.27466046810150146, + "learning_rate": 5.732861122435114e-06, + "loss": 4.0903, + "step": 50265 + }, + { + "epoch": 3.415545590433483, + "grad_norm": 0.358784556388855, + "learning_rate": 5.732436472346786e-06, + "loss": 4.2497, + "step": 50270 + }, + { + "epoch": 3.4158853105041445, + "grad_norm": 0.44881531596183777, + "learning_rate": 5.73201182225846e-06, + "loss": 4.0273, + "step": 50275 + }, + { + "epoch": 3.4162250305748065, + "grad_norm": 0.27184534072875977, + "learning_rate": 5.7315871721701324e-06, + "loss": 4.1785, + "step": 50280 + }, + { + "epoch": 3.416564750645468, + "grad_norm": 0.346109002828598, + "learning_rate": 5.731162522081804e-06, + "loss": 4.0629, + "step": 50285 + }, + { + "epoch": 3.41690447071613, + "grad_norm": 0.24961036443710327, + "learning_rate": 5.730737871993478e-06, + "loss": 4.2228, + "step": 50290 + }, + { + "epoch": 3.417244190786792, + "grad_norm": 0.43375536799430847, + "learning_rate": 5.730313221905151e-06, + "loss": 4.0792, + "step": 50295 + }, + { + "epoch": 3.4175839108574535, + "grad_norm": 0.27998659014701843, + "learning_rate": 5.729888571816823e-06, + "loss": 3.8875, + "step": 50300 + }, + { + "epoch": 3.417923630928115, + "grad_norm": 0.21733282506465912, + "learning_rate": 5.7294639217284964e-06, + "loss": 3.8575, + "step": 50305 + }, + { + "epoch": 3.418263350998777, + "grad_norm": 0.26236119866371155, + "learning_rate": 5.729039271640168e-06, + "loss": 3.9946, + "step": 50310 + }, + { + "epoch": 3.418603071069439, + "grad_norm": 0.2944495975971222, + "learning_rate": 5.728614621551841e-06, + "loss": 3.9332, + "step": 50315 + }, + { + "epoch": 3.4189427911401005, + "grad_norm": 0.27222251892089844, + "learning_rate": 5.728189971463515e-06, + "loss": 4.1305, + "step": 50320 + }, + { + "epoch": 3.4192825112107625, + "grad_norm": 0.24894575774669647, + "learning_rate": 5.727765321375187e-06, + "loss": 3.9654, + "step": 50325 + }, + { + "epoch": 3.419622231281424, + "grad_norm": 0.28768494725227356, + "learning_rate": 5.72734067128686e-06, + "loss": 4.1773, + "step": 50330 + }, + { + "epoch": 3.419961951352086, + "grad_norm": 0.24787110090255737, + "learning_rate": 5.726916021198533e-06, + "loss": 3.9152, + "step": 50335 + }, + { + "epoch": 3.420301671422748, + "grad_norm": 0.48772507905960083, + "learning_rate": 5.726491371110205e-06, + "loss": 4.2389, + "step": 50340 + }, + { + "epoch": 3.4206413914934095, + "grad_norm": 0.34224793314933777, + "learning_rate": 5.726066721021879e-06, + "loss": 4.015, + "step": 50345 + }, + { + "epoch": 3.420981111564071, + "grad_norm": 0.24963001906871796, + "learning_rate": 5.725642070933552e-06, + "loss": 4.0607, + "step": 50350 + }, + { + "epoch": 3.421320831634733, + "grad_norm": 0.26717549562454224, + "learning_rate": 5.725217420845224e-06, + "loss": 4.1253, + "step": 50355 + }, + { + "epoch": 3.421660551705395, + "grad_norm": 0.32828137278556824, + "learning_rate": 5.724792770756897e-06, + "loss": 3.7921, + "step": 50360 + }, + { + "epoch": 3.4220002717760565, + "grad_norm": 0.23739206790924072, + "learning_rate": 5.72436812066857e-06, + "loss": 4.0888, + "step": 50365 + }, + { + "epoch": 3.422339991846718, + "grad_norm": 0.344870001077652, + "learning_rate": 5.723943470580242e-06, + "loss": 4.0224, + "step": 50370 + }, + { + "epoch": 3.42267971191738, + "grad_norm": 0.3273894488811493, + "learning_rate": 5.723518820491916e-06, + "loss": 4.0813, + "step": 50375 + }, + { + "epoch": 3.423019431988042, + "grad_norm": 0.36302441358566284, + "learning_rate": 5.7230941704035885e-06, + "loss": 4.1322, + "step": 50380 + }, + { + "epoch": 3.4233591520587034, + "grad_norm": 0.3365080654621124, + "learning_rate": 5.72266952031526e-06, + "loss": 4.1877, + "step": 50385 + }, + { + "epoch": 3.4236988721293655, + "grad_norm": 0.4313608407974243, + "learning_rate": 5.722244870226934e-06, + "loss": 4.0432, + "step": 50390 + }, + { + "epoch": 3.424038592200027, + "grad_norm": 0.26801612973213196, + "learning_rate": 5.721820220138606e-06, + "loss": 4.037, + "step": 50395 + }, + { + "epoch": 3.424378312270689, + "grad_norm": 0.25503993034362793, + "learning_rate": 5.721395570050279e-06, + "loss": 3.8382, + "step": 50400 + }, + { + "epoch": 3.424718032341351, + "grad_norm": 0.4948146939277649, + "learning_rate": 5.7209709199619525e-06, + "loss": 3.8238, + "step": 50405 + }, + { + "epoch": 3.4250577524120125, + "grad_norm": 0.24333715438842773, + "learning_rate": 5.720546269873624e-06, + "loss": 3.9579, + "step": 50410 + }, + { + "epoch": 3.425397472482674, + "grad_norm": 0.4092772305011749, + "learning_rate": 5.720121619785297e-06, + "loss": 3.9427, + "step": 50415 + }, + { + "epoch": 3.425737192553336, + "grad_norm": 0.31199461221694946, + "learning_rate": 5.719696969696971e-06, + "loss": 3.9744, + "step": 50420 + }, + { + "epoch": 3.426076912623998, + "grad_norm": 0.25000303983688354, + "learning_rate": 5.719272319608643e-06, + "loss": 4.0246, + "step": 50425 + }, + { + "epoch": 3.4264166326946595, + "grad_norm": 0.26896020770072937, + "learning_rate": 5.718847669520316e-06, + "loss": 4.2203, + "step": 50430 + }, + { + "epoch": 3.4267563527653215, + "grad_norm": 0.26538437604904175, + "learning_rate": 5.718423019431989e-06, + "loss": 3.7514, + "step": 50435 + }, + { + "epoch": 3.427096072835983, + "grad_norm": 0.32169246673583984, + "learning_rate": 5.717998369343661e-06, + "loss": 3.9363, + "step": 50440 + }, + { + "epoch": 3.427435792906645, + "grad_norm": 0.20996150374412537, + "learning_rate": 5.717573719255334e-06, + "loss": 4.051, + "step": 50445 + }, + { + "epoch": 3.427775512977307, + "grad_norm": 0.26288697123527527, + "learning_rate": 5.717149069167008e-06, + "loss": 3.8131, + "step": 50450 + }, + { + "epoch": 3.4281152330479685, + "grad_norm": 0.2545327842235565, + "learning_rate": 5.71672441907868e-06, + "loss": 3.9171, + "step": 50455 + }, + { + "epoch": 3.42845495311863, + "grad_norm": 0.2957732081413269, + "learning_rate": 5.716299768990352e-06, + "loss": 4.2051, + "step": 50460 + }, + { + "epoch": 3.428794673189292, + "grad_norm": 0.28833064436912537, + "learning_rate": 5.715875118902025e-06, + "loss": 3.6877, + "step": 50465 + }, + { + "epoch": 3.429134393259954, + "grad_norm": 0.21887071430683136, + "learning_rate": 5.715450468813698e-06, + "loss": 4.0176, + "step": 50470 + }, + { + "epoch": 3.4294741133306155, + "grad_norm": 0.2977769076824188, + "learning_rate": 5.715025818725371e-06, + "loss": 3.9774, + "step": 50475 + }, + { + "epoch": 3.429813833401277, + "grad_norm": 0.26704782247543335, + "learning_rate": 5.714601168637044e-06, + "loss": 4.0544, + "step": 50480 + }, + { + "epoch": 3.430153553471939, + "grad_norm": 0.2924327254295349, + "learning_rate": 5.7141765185487164e-06, + "loss": 4.1315, + "step": 50485 + }, + { + "epoch": 3.430493273542601, + "grad_norm": 0.20490004122257233, + "learning_rate": 5.713751868460388e-06, + "loss": 3.8758, + "step": 50490 + }, + { + "epoch": 3.4308329936132624, + "grad_norm": 0.4692337214946747, + "learning_rate": 5.713327218372062e-06, + "loss": 3.9372, + "step": 50495 + }, + { + "epoch": 3.4311727136839245, + "grad_norm": 0.2555513083934784, + "learning_rate": 5.712902568283735e-06, + "loss": 4.1367, + "step": 50500 + }, + { + "epoch": 3.431512433754586, + "grad_norm": 0.24138827621936798, + "learning_rate": 5.712477918195407e-06, + "loss": 4.0371, + "step": 50505 + }, + { + "epoch": 3.4318521538252478, + "grad_norm": 0.27402952313423157, + "learning_rate": 5.7120532681070804e-06, + "loss": 3.8102, + "step": 50510 + }, + { + "epoch": 3.43219187389591, + "grad_norm": 0.35548681020736694, + "learning_rate": 5.711628618018753e-06, + "loss": 4.0238, + "step": 50515 + }, + { + "epoch": 3.4325315939665715, + "grad_norm": 0.34779754281044006, + "learning_rate": 5.711203967930425e-06, + "loss": 4.072, + "step": 50520 + }, + { + "epoch": 3.432871314037233, + "grad_norm": 0.36383235454559326, + "learning_rate": 5.710779317842099e-06, + "loss": 3.7813, + "step": 50525 + }, + { + "epoch": 3.433211034107895, + "grad_norm": 0.22932852804660797, + "learning_rate": 5.710354667753772e-06, + "loss": 4.003, + "step": 50530 + }, + { + "epoch": 3.433550754178557, + "grad_norm": 0.3892970383167267, + "learning_rate": 5.709930017665444e-06, + "loss": 4.1171, + "step": 50535 + }, + { + "epoch": 3.4338904742492184, + "grad_norm": 0.4620884358882904, + "learning_rate": 5.709505367577117e-06, + "loss": 4.2534, + "step": 50540 + }, + { + "epoch": 3.4342301943198805, + "grad_norm": 0.3107282817363739, + "learning_rate": 5.70908071748879e-06, + "loss": 3.9443, + "step": 50545 + }, + { + "epoch": 3.434569914390542, + "grad_norm": 0.2737424075603485, + "learning_rate": 5.708656067400462e-06, + "loss": 4.1017, + "step": 50550 + }, + { + "epoch": 3.434909634461204, + "grad_norm": 0.3488514721393585, + "learning_rate": 5.708231417312136e-06, + "loss": 3.9112, + "step": 50555 + }, + { + "epoch": 3.435249354531866, + "grad_norm": 0.31540369987487793, + "learning_rate": 5.707806767223808e-06, + "loss": 4.1973, + "step": 50560 + }, + { + "epoch": 3.4355890746025275, + "grad_norm": 0.43529167771339417, + "learning_rate": 5.70738211713548e-06, + "loss": 3.981, + "step": 50565 + }, + { + "epoch": 3.435928794673189, + "grad_norm": 0.22116999328136444, + "learning_rate": 5.706957467047154e-06, + "loss": 3.9, + "step": 50570 + }, + { + "epoch": 3.436268514743851, + "grad_norm": 0.2404497116804123, + "learning_rate": 5.706532816958826e-06, + "loss": 3.7656, + "step": 50575 + }, + { + "epoch": 3.436608234814513, + "grad_norm": 0.38021138310432434, + "learning_rate": 5.706108166870499e-06, + "loss": 4.0964, + "step": 50580 + }, + { + "epoch": 3.4369479548851745, + "grad_norm": 0.3856101334095001, + "learning_rate": 5.7056835167821724e-06, + "loss": 4.2136, + "step": 50585 + }, + { + "epoch": 3.4372876749558365, + "grad_norm": 0.5118966102600098, + "learning_rate": 5.705258866693844e-06, + "loss": 4.1554, + "step": 50590 + }, + { + "epoch": 3.437627395026498, + "grad_norm": 0.32763320207595825, + "learning_rate": 5.704834216605517e-06, + "loss": 4.2075, + "step": 50595 + }, + { + "epoch": 3.43796711509716, + "grad_norm": 0.2615818381309509, + "learning_rate": 5.704409566517191e-06, + "loss": 4.011, + "step": 50600 + }, + { + "epoch": 3.438306835167822, + "grad_norm": 0.2707517743110657, + "learning_rate": 5.703984916428863e-06, + "loss": 3.986, + "step": 50605 + }, + { + "epoch": 3.4386465552384835, + "grad_norm": 0.33716586232185364, + "learning_rate": 5.703560266340536e-06, + "loss": 3.9515, + "step": 50610 + }, + { + "epoch": 3.438986275309145, + "grad_norm": 0.2559202015399933, + "learning_rate": 5.703135616252209e-06, + "loss": 3.7597, + "step": 50615 + }, + { + "epoch": 3.439325995379807, + "grad_norm": 0.2899017035961151, + "learning_rate": 5.702710966163881e-06, + "loss": 4.06, + "step": 50620 + }, + { + "epoch": 3.439665715450469, + "grad_norm": 0.32001322507858276, + "learning_rate": 5.702286316075554e-06, + "loss": 4.056, + "step": 50625 + }, + { + "epoch": 3.4400054355211305, + "grad_norm": 0.3615940511226654, + "learning_rate": 5.701861665987228e-06, + "loss": 4.2111, + "step": 50630 + }, + { + "epoch": 3.4403451555917925, + "grad_norm": 0.31494325399398804, + "learning_rate": 5.7014370158989e-06, + "loss": 3.8717, + "step": 50635 + }, + { + "epoch": 3.440684875662454, + "grad_norm": 0.39977797865867615, + "learning_rate": 5.701012365810572e-06, + "loss": 4.0975, + "step": 50640 + }, + { + "epoch": 3.441024595733116, + "grad_norm": 0.357051283121109, + "learning_rate": 5.700587715722245e-06, + "loss": 4.3337, + "step": 50645 + }, + { + "epoch": 3.441364315803778, + "grad_norm": 0.2914546728134155, + "learning_rate": 5.700163065633918e-06, + "loss": 3.9776, + "step": 50650 + }, + { + "epoch": 3.4417040358744395, + "grad_norm": 0.4462736248970032, + "learning_rate": 5.69973841554559e-06, + "loss": 4.1824, + "step": 50655 + }, + { + "epoch": 3.442043755945101, + "grad_norm": 0.26905861496925354, + "learning_rate": 5.699313765457264e-06, + "loss": 3.981, + "step": 50660 + }, + { + "epoch": 3.442383476015763, + "grad_norm": 0.2304588258266449, + "learning_rate": 5.698889115368936e-06, + "loss": 4.0802, + "step": 50665 + }, + { + "epoch": 3.442723196086425, + "grad_norm": 0.29586291313171387, + "learning_rate": 5.698464465280608e-06, + "loss": 4.0555, + "step": 50670 + }, + { + "epoch": 3.4430629161570865, + "grad_norm": 0.3251632750034332, + "learning_rate": 5.698039815192282e-06, + "loss": 3.9285, + "step": 50675 + }, + { + "epoch": 3.4434026362277486, + "grad_norm": 0.4333968460559845, + "learning_rate": 5.697615165103955e-06, + "loss": 4.2162, + "step": 50680 + }, + { + "epoch": 3.44374235629841, + "grad_norm": 0.2850123941898346, + "learning_rate": 5.6971905150156285e-06, + "loss": 4.0979, + "step": 50685 + }, + { + "epoch": 3.444082076369072, + "grad_norm": 0.35891062021255493, + "learning_rate": 5.6967658649273e-06, + "loss": 4.279, + "step": 50690 + }, + { + "epoch": 3.444421796439734, + "grad_norm": 0.2532978057861328, + "learning_rate": 5.696341214838973e-06, + "loss": 4.0142, + "step": 50695 + }, + { + "epoch": 3.4447615165103955, + "grad_norm": 0.20349664986133575, + "learning_rate": 5.695916564750647e-06, + "loss": 3.9179, + "step": 50700 + }, + { + "epoch": 3.445101236581057, + "grad_norm": 0.3324473798274994, + "learning_rate": 5.695491914662319e-06, + "loss": 4.1571, + "step": 50705 + }, + { + "epoch": 3.4454409566517192, + "grad_norm": 0.3695877492427826, + "learning_rate": 5.695067264573992e-06, + "loss": 4.0209, + "step": 50710 + }, + { + "epoch": 3.445780676722381, + "grad_norm": 0.39970123767852783, + "learning_rate": 5.694642614485664e-06, + "loss": 4.3656, + "step": 50715 + }, + { + "epoch": 3.4461203967930425, + "grad_norm": 0.3350466191768646, + "learning_rate": 5.694217964397337e-06, + "loss": 4.0441, + "step": 50720 + }, + { + "epoch": 3.446460116863704, + "grad_norm": 0.2577368915081024, + "learning_rate": 5.69379331430901e-06, + "loss": 4.1524, + "step": 50725 + }, + { + "epoch": 3.446799836934366, + "grad_norm": 0.4408584237098694, + "learning_rate": 5.693368664220683e-06, + "loss": 4.3398, + "step": 50730 + }, + { + "epoch": 3.447139557005028, + "grad_norm": 0.31478771567344666, + "learning_rate": 5.692944014132356e-06, + "loss": 3.7926, + "step": 50735 + }, + { + "epoch": 3.4474792770756895, + "grad_norm": 0.25602811574935913, + "learning_rate": 5.6925193640440276e-06, + "loss": 4.1105, + "step": 50740 + }, + { + "epoch": 3.4478189971463515, + "grad_norm": 0.30836519598960876, + "learning_rate": 5.692094713955701e-06, + "loss": 3.8396, + "step": 50745 + }, + { + "epoch": 3.448158717217013, + "grad_norm": 0.3534688353538513, + "learning_rate": 5.691670063867374e-06, + "loss": 4.009, + "step": 50750 + }, + { + "epoch": 3.448498437287675, + "grad_norm": 0.29623961448669434, + "learning_rate": 5.691245413779046e-06, + "loss": 4.0622, + "step": 50755 + }, + { + "epoch": 3.448838157358337, + "grad_norm": 0.41532662510871887, + "learning_rate": 5.69082076369072e-06, + "loss": 3.7594, + "step": 50760 + }, + { + "epoch": 3.4491778774289985, + "grad_norm": 0.2609698474407196, + "learning_rate": 5.690396113602392e-06, + "loss": 3.6832, + "step": 50765 + }, + { + "epoch": 3.44951759749966, + "grad_norm": 0.3491130471229553, + "learning_rate": 5.689971463514064e-06, + "loss": 3.973, + "step": 50770 + }, + { + "epoch": 3.449857317570322, + "grad_norm": 0.3006887435913086, + "learning_rate": 5.689546813425738e-06, + "loss": 3.8764, + "step": 50775 + }, + { + "epoch": 3.450197037640984, + "grad_norm": 0.26827356219291687, + "learning_rate": 5.689122163337411e-06, + "loss": 3.9605, + "step": 50780 + }, + { + "epoch": 3.4505367577116455, + "grad_norm": 0.2719358801841736, + "learning_rate": 5.688697513249083e-06, + "loss": 3.9854, + "step": 50785 + }, + { + "epoch": 3.4508764777823075, + "grad_norm": 0.2838961184024811, + "learning_rate": 5.6882728631607564e-06, + "loss": 4.1445, + "step": 50790 + }, + { + "epoch": 3.451216197852969, + "grad_norm": 0.5848087072372437, + "learning_rate": 5.687848213072429e-06, + "loss": 4.022, + "step": 50795 + }, + { + "epoch": 3.451555917923631, + "grad_norm": 0.2474643886089325, + "learning_rate": 5.687423562984101e-06, + "loss": 3.9839, + "step": 50800 + }, + { + "epoch": 3.451895637994293, + "grad_norm": 0.3017576038837433, + "learning_rate": 5.686998912895775e-06, + "loss": 4.2564, + "step": 50805 + }, + { + "epoch": 3.4522353580649545, + "grad_norm": 0.3366270065307617, + "learning_rate": 5.686574262807447e-06, + "loss": 4.013, + "step": 50810 + }, + { + "epoch": 3.452575078135616, + "grad_norm": 0.1992260217666626, + "learning_rate": 5.68614961271912e-06, + "loss": 4.0666, + "step": 50815 + }, + { + "epoch": 3.452914798206278, + "grad_norm": 0.3315991759300232, + "learning_rate": 5.685724962630793e-06, + "loss": 4.0244, + "step": 50820 + }, + { + "epoch": 3.45325451827694, + "grad_norm": 0.2614695131778717, + "learning_rate": 5.685300312542465e-06, + "loss": 3.961, + "step": 50825 + }, + { + "epoch": 3.4535942383476015, + "grad_norm": 0.3392232358455658, + "learning_rate": 5.684875662454138e-06, + "loss": 4.0675, + "step": 50830 + }, + { + "epoch": 3.453933958418263, + "grad_norm": 0.3359285295009613, + "learning_rate": 5.684451012365812e-06, + "loss": 3.8721, + "step": 50835 + }, + { + "epoch": 3.454273678488925, + "grad_norm": 0.2857203483581543, + "learning_rate": 5.684026362277484e-06, + "loss": 3.9636, + "step": 50840 + }, + { + "epoch": 3.454613398559587, + "grad_norm": 0.2911919057369232, + "learning_rate": 5.683601712189156e-06, + "loss": 4.0979, + "step": 50845 + }, + { + "epoch": 3.4549531186302485, + "grad_norm": 0.34484824538230896, + "learning_rate": 5.68317706210083e-06, + "loss": 4.1501, + "step": 50850 + }, + { + "epoch": 3.4552928387009105, + "grad_norm": 0.3517887592315674, + "learning_rate": 5.682752412012502e-06, + "loss": 4.0991, + "step": 50855 + }, + { + "epoch": 3.455632558771572, + "grad_norm": 0.3417205214500427, + "learning_rate": 5.682327761924175e-06, + "loss": 3.9426, + "step": 50860 + }, + { + "epoch": 3.455972278842234, + "grad_norm": 0.334113746881485, + "learning_rate": 5.6819031118358484e-06, + "loss": 4.0246, + "step": 50865 + }, + { + "epoch": 3.456311998912896, + "grad_norm": 0.3431953489780426, + "learning_rate": 5.68147846174752e-06, + "loss": 4.0257, + "step": 50870 + }, + { + "epoch": 3.4566517189835575, + "grad_norm": 0.376158744096756, + "learning_rate": 5.681053811659193e-06, + "loss": 4.0648, + "step": 50875 + }, + { + "epoch": 3.456991439054219, + "grad_norm": 0.26006263494491577, + "learning_rate": 5.680629161570866e-06, + "loss": 4.0224, + "step": 50880 + }, + { + "epoch": 3.457331159124881, + "grad_norm": 0.5220692157745361, + "learning_rate": 5.680204511482539e-06, + "loss": 3.8943, + "step": 50885 + }, + { + "epoch": 3.457670879195543, + "grad_norm": 0.37853333353996277, + "learning_rate": 5.679779861394212e-06, + "loss": 4.0211, + "step": 50890 + }, + { + "epoch": 3.4580105992662045, + "grad_norm": 0.3293214440345764, + "learning_rate": 5.679355211305884e-06, + "loss": 3.9011, + "step": 50895 + }, + { + "epoch": 3.4583503193368665, + "grad_norm": 0.3098258674144745, + "learning_rate": 5.678930561217557e-06, + "loss": 4.0575, + "step": 50900 + }, + { + "epoch": 3.458690039407528, + "grad_norm": 0.2700006365776062, + "learning_rate": 5.678505911129229e-06, + "loss": 4.0962, + "step": 50905 + }, + { + "epoch": 3.45902975947819, + "grad_norm": 0.2919718325138092, + "learning_rate": 5.678081261040903e-06, + "loss": 4.1455, + "step": 50910 + }, + { + "epoch": 3.459369479548852, + "grad_norm": 0.3804974853992462, + "learning_rate": 5.677656610952576e-06, + "loss": 4.3599, + "step": 50915 + }, + { + "epoch": 3.4597091996195135, + "grad_norm": 0.36537155508995056, + "learning_rate": 5.6772319608642476e-06, + "loss": 3.9426, + "step": 50920 + }, + { + "epoch": 3.460048919690175, + "grad_norm": 0.3505673408508301, + "learning_rate": 5.676807310775921e-06, + "loss": 3.8884, + "step": 50925 + }, + { + "epoch": 3.460388639760837, + "grad_norm": 0.26537930965423584, + "learning_rate": 5.676382660687594e-06, + "loss": 3.9061, + "step": 50930 + }, + { + "epoch": 3.460728359831499, + "grad_norm": 0.33159828186035156, + "learning_rate": 5.675958010599266e-06, + "loss": 4.121, + "step": 50935 + }, + { + "epoch": 3.4610680799021605, + "grad_norm": 0.3511878252029419, + "learning_rate": 5.67553336051094e-06, + "loss": 4.1025, + "step": 50940 + }, + { + "epoch": 3.4614077999728226, + "grad_norm": 0.32004401087760925, + "learning_rate": 5.675108710422612e-06, + "loss": 4.1438, + "step": 50945 + }, + { + "epoch": 3.461747520043484, + "grad_norm": 0.2677765488624573, + "learning_rate": 5.674684060334284e-06, + "loss": 4.0722, + "step": 50950 + }, + { + "epoch": 3.462087240114146, + "grad_norm": 0.33246222138404846, + "learning_rate": 5.674259410245958e-06, + "loss": 4.0321, + "step": 50955 + }, + { + "epoch": 3.462426960184808, + "grad_norm": 0.24804207682609558, + "learning_rate": 5.673834760157631e-06, + "loss": 4.2024, + "step": 50960 + }, + { + "epoch": 3.4627666802554695, + "grad_norm": 0.37924569845199585, + "learning_rate": 5.673410110069303e-06, + "loss": 3.9965, + "step": 50965 + }, + { + "epoch": 3.463106400326131, + "grad_norm": 0.3393055498600006, + "learning_rate": 5.672985459980976e-06, + "loss": 4.0083, + "step": 50970 + }, + { + "epoch": 3.4634461203967932, + "grad_norm": 0.340242862701416, + "learning_rate": 5.672560809892649e-06, + "loss": 4.0292, + "step": 50975 + }, + { + "epoch": 3.463785840467455, + "grad_norm": 0.27670350670814514, + "learning_rate": 5.672136159804321e-06, + "loss": 4.0315, + "step": 50980 + }, + { + "epoch": 3.4641255605381165, + "grad_norm": 0.40391743183135986, + "learning_rate": 5.671711509715995e-06, + "loss": 3.8363, + "step": 50985 + }, + { + "epoch": 3.4644652806087786, + "grad_norm": 0.3160367012023926, + "learning_rate": 5.671286859627667e-06, + "loss": 3.7563, + "step": 50990 + }, + { + "epoch": 3.46480500067944, + "grad_norm": 0.2607237696647644, + "learning_rate": 5.6708622095393396e-06, + "loss": 4.0, + "step": 50995 + }, + { + "epoch": 3.465144720750102, + "grad_norm": 0.19776609539985657, + "learning_rate": 5.670437559451013e-06, + "loss": 4.0997, + "step": 51000 + }, + { + "epoch": 3.465484440820764, + "grad_norm": 0.2658778429031372, + "learning_rate": 5.670012909362685e-06, + "loss": 4.2314, + "step": 51005 + }, + { + "epoch": 3.4658241608914255, + "grad_norm": 0.2690046727657318, + "learning_rate": 5.669588259274358e-06, + "loss": 3.9739, + "step": 51010 + }, + { + "epoch": 3.466163880962087, + "grad_norm": 0.2647019922733307, + "learning_rate": 5.669163609186032e-06, + "loss": 3.9245, + "step": 51015 + }, + { + "epoch": 3.4665036010327492, + "grad_norm": 0.283288836479187, + "learning_rate": 5.6687389590977036e-06, + "loss": 3.8512, + "step": 51020 + }, + { + "epoch": 3.466843321103411, + "grad_norm": 0.33586809039115906, + "learning_rate": 5.668314309009377e-06, + "loss": 4.0892, + "step": 51025 + }, + { + "epoch": 3.4671830411740725, + "grad_norm": 0.21296930313110352, + "learning_rate": 5.66788965892105e-06, + "loss": 4.0549, + "step": 51030 + }, + { + "epoch": 3.4675227612447346, + "grad_norm": 0.3716370165348053, + "learning_rate": 5.667465008832722e-06, + "loss": 4.3182, + "step": 51035 + }, + { + "epoch": 3.467862481315396, + "grad_norm": 0.3236735761165619, + "learning_rate": 5.667040358744396e-06, + "loss": 4.0648, + "step": 51040 + }, + { + "epoch": 3.468202201386058, + "grad_norm": 0.2984794080257416, + "learning_rate": 5.666615708656068e-06, + "loss": 4.0657, + "step": 51045 + }, + { + "epoch": 3.46854192145672, + "grad_norm": 0.30260148644447327, + "learning_rate": 5.66619105856774e-06, + "loss": 4.2324, + "step": 51050 + }, + { + "epoch": 3.4688816415273815, + "grad_norm": 0.35108932852745056, + "learning_rate": 5.665766408479414e-06, + "loss": 4.0633, + "step": 51055 + }, + { + "epoch": 3.469221361598043, + "grad_norm": 0.3395356833934784, + "learning_rate": 5.665341758391086e-06, + "loss": 3.97, + "step": 51060 + }, + { + "epoch": 3.469561081668705, + "grad_norm": 0.29494884610176086, + "learning_rate": 5.664917108302759e-06, + "loss": 3.9965, + "step": 51065 + }, + { + "epoch": 3.469900801739367, + "grad_norm": 0.3287017047405243, + "learning_rate": 5.6644924582144324e-06, + "loss": 3.8667, + "step": 51070 + }, + { + "epoch": 3.4702405218100285, + "grad_norm": 0.29594841599464417, + "learning_rate": 5.664067808126104e-06, + "loss": 4.1172, + "step": 51075 + }, + { + "epoch": 3.47058024188069, + "grad_norm": 0.36197131872177124, + "learning_rate": 5.663643158037777e-06, + "loss": 4.0338, + "step": 51080 + }, + { + "epoch": 3.470919961951352, + "grad_norm": 0.30048084259033203, + "learning_rate": 5.663218507949451e-06, + "loss": 3.9773, + "step": 51085 + }, + { + "epoch": 3.471259682022014, + "grad_norm": 0.3813410997390747, + "learning_rate": 5.662793857861123e-06, + "loss": 4.1533, + "step": 51090 + }, + { + "epoch": 3.4715994020926755, + "grad_norm": 0.22287774085998535, + "learning_rate": 5.662369207772796e-06, + "loss": 4.036, + "step": 51095 + }, + { + "epoch": 3.4719391221633376, + "grad_norm": 0.33607882261276245, + "learning_rate": 5.661944557684469e-06, + "loss": 4.0722, + "step": 51100 + }, + { + "epoch": 3.472278842233999, + "grad_norm": 0.25676974654197693, + "learning_rate": 5.661519907596141e-06, + "loss": 3.8202, + "step": 51105 + }, + { + "epoch": 3.472618562304661, + "grad_norm": 0.257261723279953, + "learning_rate": 5.661095257507814e-06, + "loss": 3.9976, + "step": 51110 + }, + { + "epoch": 3.472958282375323, + "grad_norm": 0.27432408928871155, + "learning_rate": 5.660670607419488e-06, + "loss": 4.2162, + "step": 51115 + }, + { + "epoch": 3.4732980024459845, + "grad_norm": 0.38179516792297363, + "learning_rate": 5.66024595733116e-06, + "loss": 4.0735, + "step": 51120 + }, + { + "epoch": 3.473637722516646, + "grad_norm": 0.2936605215072632, + "learning_rate": 5.659821307242832e-06, + "loss": 4.1386, + "step": 51125 + }, + { + "epoch": 3.4739774425873082, + "grad_norm": 0.344626784324646, + "learning_rate": 5.659396657154505e-06, + "loss": 3.9916, + "step": 51130 + }, + { + "epoch": 3.47431716265797, + "grad_norm": 0.2815966010093689, + "learning_rate": 5.658972007066178e-06, + "loss": 4.117, + "step": 51135 + }, + { + "epoch": 3.4746568827286315, + "grad_norm": 0.36827534437179565, + "learning_rate": 5.658547356977851e-06, + "loss": 3.789, + "step": 51140 + }, + { + "epoch": 3.4749966027992936, + "grad_norm": 0.2616114318370819, + "learning_rate": 5.658122706889524e-06, + "loss": 3.9596, + "step": 51145 + }, + { + "epoch": 3.475336322869955, + "grad_norm": 0.23158246278762817, + "learning_rate": 5.657698056801196e-06, + "loss": 4.1583, + "step": 51150 + }, + { + "epoch": 3.475676042940617, + "grad_norm": 0.43670111894607544, + "learning_rate": 5.657273406712868e-06, + "loss": 3.9655, + "step": 51155 + }, + { + "epoch": 3.4760157630112785, + "grad_norm": 0.25497475266456604, + "learning_rate": 5.656848756624542e-06, + "loss": 3.9769, + "step": 51160 + }, + { + "epoch": 3.4763554830819405, + "grad_norm": 0.28849247097969055, + "learning_rate": 5.656424106536215e-06, + "loss": 3.8221, + "step": 51165 + }, + { + "epoch": 3.476695203152602, + "grad_norm": 0.298325777053833, + "learning_rate": 5.655999456447887e-06, + "loss": 4.0329, + "step": 51170 + }, + { + "epoch": 3.477034923223264, + "grad_norm": 0.2934148609638214, + "learning_rate": 5.65557480635956e-06, + "loss": 3.8067, + "step": 51175 + }, + { + "epoch": 3.477374643293926, + "grad_norm": 0.3374827802181244, + "learning_rate": 5.655150156271233e-06, + "loss": 4.1852, + "step": 51180 + }, + { + "epoch": 3.4777143633645875, + "grad_norm": 0.38233765959739685, + "learning_rate": 5.654725506182905e-06, + "loss": 4.2992, + "step": 51185 + }, + { + "epoch": 3.478054083435249, + "grad_norm": 0.2305595427751541, + "learning_rate": 5.654300856094579e-06, + "loss": 4.0411, + "step": 51190 + }, + { + "epoch": 3.478393803505911, + "grad_norm": 0.2815157175064087, + "learning_rate": 5.653876206006252e-06, + "loss": 4.0928, + "step": 51195 + }, + { + "epoch": 3.478733523576573, + "grad_norm": 0.20542198419570923, + "learning_rate": 5.6534515559179235e-06, + "loss": 3.8366, + "step": 51200 + }, + { + "epoch": 3.4790732436472345, + "grad_norm": 0.2800775170326233, + "learning_rate": 5.653026905829597e-06, + "loss": 3.8274, + "step": 51205 + }, + { + "epoch": 3.4794129637178965, + "grad_norm": 0.3329930007457733, + "learning_rate": 5.65260225574127e-06, + "loss": 4.0249, + "step": 51210 + }, + { + "epoch": 3.479752683788558, + "grad_norm": 0.32893046736717224, + "learning_rate": 5.652177605652942e-06, + "loss": 3.9703, + "step": 51215 + }, + { + "epoch": 3.48009240385922, + "grad_norm": 0.5609014630317688, + "learning_rate": 5.651752955564616e-06, + "loss": 3.9822, + "step": 51220 + }, + { + "epoch": 3.480432123929882, + "grad_norm": 0.26732829213142395, + "learning_rate": 5.6513283054762876e-06, + "loss": 4.1252, + "step": 51225 + }, + { + "epoch": 3.4807718440005435, + "grad_norm": 0.5540297031402588, + "learning_rate": 5.65090365538796e-06, + "loss": 4.1513, + "step": 51230 + }, + { + "epoch": 3.481111564071205, + "grad_norm": 0.3092375099658966, + "learning_rate": 5.650479005299634e-06, + "loss": 4.1378, + "step": 51235 + }, + { + "epoch": 3.481451284141867, + "grad_norm": 0.2942757308483124, + "learning_rate": 5.650054355211306e-06, + "loss": 4.1114, + "step": 51240 + }, + { + "epoch": 3.481791004212529, + "grad_norm": 0.2837248742580414, + "learning_rate": 5.649629705122979e-06, + "loss": 3.9836, + "step": 51245 + }, + { + "epoch": 3.4821307242831905, + "grad_norm": 0.2913335859775543, + "learning_rate": 5.649205055034652e-06, + "loss": 3.9808, + "step": 51250 + }, + { + "epoch": 3.4824704443538526, + "grad_norm": 0.2589147090911865, + "learning_rate": 5.648780404946324e-06, + "loss": 4.1296, + "step": 51255 + }, + { + "epoch": 3.482810164424514, + "grad_norm": 0.47023093700408936, + "learning_rate": 5.648355754857997e-06, + "loss": 3.855, + "step": 51260 + }, + { + "epoch": 3.483149884495176, + "grad_norm": 0.26176154613494873, + "learning_rate": 5.647931104769671e-06, + "loss": 3.9832, + "step": 51265 + }, + { + "epoch": 3.483489604565838, + "grad_norm": 0.22299779951572418, + "learning_rate": 5.647506454681343e-06, + "loss": 4.166, + "step": 51270 + }, + { + "epoch": 3.4838293246364995, + "grad_norm": 0.6711809039115906, + "learning_rate": 5.6470818045930156e-06, + "loss": 4.0249, + "step": 51275 + }, + { + "epoch": 3.484169044707161, + "grad_norm": 0.4564420282840729, + "learning_rate": 5.646657154504689e-06, + "loss": 4.3556, + "step": 51280 + }, + { + "epoch": 3.4845087647778232, + "grad_norm": 0.22087186574935913, + "learning_rate": 5.646232504416361e-06, + "loss": 4.0579, + "step": 51285 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.34027934074401855, + "learning_rate": 5.645807854328034e-06, + "loss": 4.1217, + "step": 51290 + }, + { + "epoch": 3.4851882049191465, + "grad_norm": 0.2818072438240051, + "learning_rate": 5.645383204239708e-06, + "loss": 3.9677, + "step": 51295 + }, + { + "epoch": 3.4855279249898086, + "grad_norm": 0.26292699575424194, + "learning_rate": 5.6449585541513796e-06, + "loss": 3.8785, + "step": 51300 + }, + { + "epoch": 3.48586764506047, + "grad_norm": 0.2601062059402466, + "learning_rate": 5.644533904063052e-06, + "loss": 4.0911, + "step": 51305 + }, + { + "epoch": 3.486207365131132, + "grad_norm": 0.3001246452331543, + "learning_rate": 5.644109253974725e-06, + "loss": 4.0117, + "step": 51310 + }, + { + "epoch": 3.486547085201794, + "grad_norm": 0.2743498682975769, + "learning_rate": 5.643684603886398e-06, + "loss": 4.0193, + "step": 51315 + }, + { + "epoch": 3.4868868052724555, + "grad_norm": 0.34988683462142944, + "learning_rate": 5.643259953798071e-06, + "loss": 3.83, + "step": 51320 + }, + { + "epoch": 3.487226525343117, + "grad_norm": 0.38489246368408203, + "learning_rate": 5.6428353037097436e-06, + "loss": 4.0907, + "step": 51325 + }, + { + "epoch": 3.4875662454137792, + "grad_norm": 0.2715805768966675, + "learning_rate": 5.642410653621416e-06, + "loss": 4.2505, + "step": 51330 + }, + { + "epoch": 3.487905965484441, + "grad_norm": 0.2609928846359253, + "learning_rate": 5.641986003533088e-06, + "loss": 4.0645, + "step": 51335 + }, + { + "epoch": 3.4882456855551025, + "grad_norm": 0.29513490200042725, + "learning_rate": 5.641561353444762e-06, + "loss": 4.0329, + "step": 51340 + }, + { + "epoch": 3.4885854056257646, + "grad_norm": 0.5128707885742188, + "learning_rate": 5.641136703356435e-06, + "loss": 4.1685, + "step": 51345 + }, + { + "epoch": 3.488925125696426, + "grad_norm": 0.2607712745666504, + "learning_rate": 5.640712053268107e-06, + "loss": 4.0816, + "step": 51350 + }, + { + "epoch": 3.489264845767088, + "grad_norm": 0.20459634065628052, + "learning_rate": 5.64028740317978e-06, + "loss": 3.9568, + "step": 51355 + }, + { + "epoch": 3.48960456583775, + "grad_norm": 0.2630140781402588, + "learning_rate": 5.639862753091453e-06, + "loss": 3.6654, + "step": 51360 + }, + { + "epoch": 3.4899442859084115, + "grad_norm": 0.2871137857437134, + "learning_rate": 5.639438103003127e-06, + "loss": 4.0104, + "step": 51365 + }, + { + "epoch": 3.490284005979073, + "grad_norm": 0.3089918792247772, + "learning_rate": 5.639013452914799e-06, + "loss": 4.069, + "step": 51370 + }, + { + "epoch": 3.4906237260497353, + "grad_norm": 0.2769620716571808, + "learning_rate": 5.638588802826472e-06, + "loss": 4.0937, + "step": 51375 + }, + { + "epoch": 3.490963446120397, + "grad_norm": 0.23099912703037262, + "learning_rate": 5.638164152738144e-06, + "loss": 3.9401, + "step": 51380 + }, + { + "epoch": 3.4913031661910585, + "grad_norm": 0.2685595750808716, + "learning_rate": 5.637739502649817e-06, + "loss": 4.1633, + "step": 51385 + }, + { + "epoch": 3.4916428862617206, + "grad_norm": 0.3202241361141205, + "learning_rate": 5.63731485256149e-06, + "loss": 3.7833, + "step": 51390 + }, + { + "epoch": 3.491982606332382, + "grad_norm": 0.25052720308303833, + "learning_rate": 5.636890202473163e-06, + "loss": 4.0562, + "step": 51395 + }, + { + "epoch": 3.492322326403044, + "grad_norm": 0.30103015899658203, + "learning_rate": 5.636465552384836e-06, + "loss": 4.0247, + "step": 51400 + }, + { + "epoch": 3.4926620464737055, + "grad_norm": 0.2608059346675873, + "learning_rate": 5.6360409022965075e-06, + "loss": 3.9786, + "step": 51405 + }, + { + "epoch": 3.4930017665443676, + "grad_norm": 0.35382163524627686, + "learning_rate": 5.635616252208181e-06, + "loss": 4.0977, + "step": 51410 + }, + { + "epoch": 3.493341486615029, + "grad_norm": 0.2700732946395874, + "learning_rate": 5.635191602119854e-06, + "loss": 4.0036, + "step": 51415 + }, + { + "epoch": 3.493681206685691, + "grad_norm": 0.27825063467025757, + "learning_rate": 5.634766952031526e-06, + "loss": 4.3379, + "step": 51420 + }, + { + "epoch": 3.494020926756353, + "grad_norm": 0.28410547971725464, + "learning_rate": 5.6343423019432e-06, + "loss": 4.0434, + "step": 51425 + }, + { + "epoch": 3.4943606468270145, + "grad_norm": 0.37398409843444824, + "learning_rate": 5.633917651854872e-06, + "loss": 3.9718, + "step": 51430 + }, + { + "epoch": 3.494700366897676, + "grad_norm": 0.23913481831550598, + "learning_rate": 5.633493001766544e-06, + "loss": 4.0685, + "step": 51435 + }, + { + "epoch": 3.4950400869683382, + "grad_norm": 0.263546884059906, + "learning_rate": 5.633068351678218e-06, + "loss": 3.8618, + "step": 51440 + }, + { + "epoch": 3.495379807039, + "grad_norm": 0.32439249753952026, + "learning_rate": 5.632643701589891e-06, + "loss": 3.7996, + "step": 51445 + }, + { + "epoch": 3.4957195271096615, + "grad_norm": 0.40100786089897156, + "learning_rate": 5.632219051501563e-06, + "loss": 3.8919, + "step": 51450 + }, + { + "epoch": 3.4960592471803236, + "grad_norm": 1.0447489023208618, + "learning_rate": 5.631794401413236e-06, + "loss": 4.3634, + "step": 51455 + }, + { + "epoch": 3.496398967250985, + "grad_norm": 0.24136579036712646, + "learning_rate": 5.631369751324909e-06, + "loss": 4.0623, + "step": 51460 + }, + { + "epoch": 3.496738687321647, + "grad_norm": 0.32962489128112793, + "learning_rate": 5.630945101236581e-06, + "loss": 3.8298, + "step": 51465 + }, + { + "epoch": 3.497078407392309, + "grad_norm": 0.31133055686950684, + "learning_rate": 5.630520451148255e-06, + "loss": 3.8939, + "step": 51470 + }, + { + "epoch": 3.4974181274629705, + "grad_norm": 0.43173420429229736, + "learning_rate": 5.630095801059927e-06, + "loss": 3.7736, + "step": 51475 + }, + { + "epoch": 3.497757847533632, + "grad_norm": 0.3093147873878479, + "learning_rate": 5.6296711509715995e-06, + "loss": 4.1117, + "step": 51480 + }, + { + "epoch": 3.4980975676042942, + "grad_norm": 0.3498633801937103, + "learning_rate": 5.629246500883273e-06, + "loss": 3.7667, + "step": 51485 + }, + { + "epoch": 3.498437287674956, + "grad_norm": 0.3606415390968323, + "learning_rate": 5.628821850794945e-06, + "loss": 3.9967, + "step": 51490 + }, + { + "epoch": 3.4987770077456175, + "grad_norm": 0.6088607311248779, + "learning_rate": 5.628397200706618e-06, + "loss": 4.0156, + "step": 51495 + }, + { + "epoch": 3.499116727816279, + "grad_norm": 0.258937805891037, + "learning_rate": 5.627972550618292e-06, + "loss": 3.9015, + "step": 51500 + }, + { + "epoch": 3.499456447886941, + "grad_norm": 0.3778758943080902, + "learning_rate": 5.6275479005299635e-06, + "loss": 4.0316, + "step": 51505 + }, + { + "epoch": 3.499796167957603, + "grad_norm": 0.32012346386909485, + "learning_rate": 5.627123250441636e-06, + "loss": 4.0566, + "step": 51510 + }, + { + "epoch": 3.5001358880282645, + "grad_norm": 0.2793514132499695, + "learning_rate": 5.62669860035331e-06, + "loss": 3.9934, + "step": 51515 + }, + { + "epoch": 3.5004756080989265, + "grad_norm": 0.3971507251262665, + "learning_rate": 5.626273950264982e-06, + "loss": 3.7532, + "step": 51520 + }, + { + "epoch": 3.500815328169588, + "grad_norm": 0.2753978669643402, + "learning_rate": 5.625849300176655e-06, + "loss": 4.0428, + "step": 51525 + }, + { + "epoch": 3.50115504824025, + "grad_norm": 0.28149768710136414, + "learning_rate": 5.625424650088328e-06, + "loss": 3.9184, + "step": 51530 + }, + { + "epoch": 3.501494768310912, + "grad_norm": 0.23050899803638458, + "learning_rate": 5.625e-06, + "loss": 3.884, + "step": 51535 + }, + { + "epoch": 3.5018344883815735, + "grad_norm": 0.34767860174179077, + "learning_rate": 5.624575349911673e-06, + "loss": 3.9519, + "step": 51540 + }, + { + "epoch": 3.502174208452235, + "grad_norm": 0.4117642343044281, + "learning_rate": 5.624150699823347e-06, + "loss": 4.0196, + "step": 51545 + }, + { + "epoch": 3.5025139285228972, + "grad_norm": 0.36451828479766846, + "learning_rate": 5.623726049735019e-06, + "loss": 4.161, + "step": 51550 + }, + { + "epoch": 3.502853648593559, + "grad_norm": 0.29267027974128723, + "learning_rate": 5.6233013996466916e-06, + "loss": 3.8085, + "step": 51555 + }, + { + "epoch": 3.5031933686642205, + "grad_norm": 0.2957439124584198, + "learning_rate": 5.622876749558364e-06, + "loss": 3.9241, + "step": 51560 + }, + { + "epoch": 3.5035330887348826, + "grad_norm": 0.4378969669342041, + "learning_rate": 5.622452099470037e-06, + "loss": 4.0619, + "step": 51565 + }, + { + "epoch": 3.503872808805544, + "grad_norm": 0.23407255113124847, + "learning_rate": 5.622027449381709e-06, + "loss": 3.8396, + "step": 51570 + }, + { + "epoch": 3.504212528876206, + "grad_norm": 0.3205069899559021, + "learning_rate": 5.621602799293383e-06, + "loss": 4.2909, + "step": 51575 + }, + { + "epoch": 3.504552248946868, + "grad_norm": 0.3741506338119507, + "learning_rate": 5.6211781492050556e-06, + "loss": 3.983, + "step": 51580 + }, + { + "epoch": 3.5048919690175295, + "grad_norm": 0.25136205554008484, + "learning_rate": 5.6207534991167275e-06, + "loss": 3.8446, + "step": 51585 + }, + { + "epoch": 3.505231689088191, + "grad_norm": 0.2676699161529541, + "learning_rate": 5.620328849028401e-06, + "loss": 3.7414, + "step": 51590 + }, + { + "epoch": 3.5055714091588532, + "grad_norm": 0.38664510846138, + "learning_rate": 5.619904198940074e-06, + "loss": 4.3418, + "step": 51595 + }, + { + "epoch": 3.505911129229515, + "grad_norm": 0.27899569272994995, + "learning_rate": 5.619479548851746e-06, + "loss": 3.6824, + "step": 51600 + }, + { + "epoch": 3.5062508493001765, + "grad_norm": 0.25585436820983887, + "learning_rate": 5.6190548987634196e-06, + "loss": 3.726, + "step": 51605 + }, + { + "epoch": 3.5065905693708386, + "grad_norm": 0.26894277334213257, + "learning_rate": 5.618630248675092e-06, + "loss": 3.8388, + "step": 51610 + }, + { + "epoch": 3.5069302894415, + "grad_norm": 0.3219679296016693, + "learning_rate": 5.618205598586764e-06, + "loss": 3.6227, + "step": 51615 + }, + { + "epoch": 3.507270009512162, + "grad_norm": 0.347689151763916, + "learning_rate": 5.617780948498438e-06, + "loss": 4.0305, + "step": 51620 + }, + { + "epoch": 3.507609729582824, + "grad_norm": 0.32980385422706604, + "learning_rate": 5.617356298410111e-06, + "loss": 4.172, + "step": 51625 + }, + { + "epoch": 3.5079494496534855, + "grad_norm": 0.4547092020511627, + "learning_rate": 5.616931648321783e-06, + "loss": 4.0359, + "step": 51630 + }, + { + "epoch": 3.508289169724147, + "grad_norm": 0.33685827255249023, + "learning_rate": 5.616506998233456e-06, + "loss": 3.9064, + "step": 51635 + }, + { + "epoch": 3.5086288897948092, + "grad_norm": 0.37676313519477844, + "learning_rate": 5.616082348145129e-06, + "loss": 4.2619, + "step": 51640 + }, + { + "epoch": 3.508968609865471, + "grad_norm": 0.3089193105697632, + "learning_rate": 5.615657698056801e-06, + "loss": 4.0779, + "step": 51645 + }, + { + "epoch": 3.5093083299361325, + "grad_norm": 0.30597513914108276, + "learning_rate": 5.615233047968475e-06, + "loss": 4.136, + "step": 51650 + }, + { + "epoch": 3.5096480500067946, + "grad_norm": 0.3004966378211975, + "learning_rate": 5.614808397880147e-06, + "loss": 3.9205, + "step": 51655 + }, + { + "epoch": 3.509987770077456, + "grad_norm": 0.2734283208847046, + "learning_rate": 5.6143837477918195e-06, + "loss": 3.8742, + "step": 51660 + }, + { + "epoch": 3.510327490148118, + "grad_norm": 0.2290216088294983, + "learning_rate": 5.613959097703493e-06, + "loss": 3.7671, + "step": 51665 + }, + { + "epoch": 3.51066721021878, + "grad_norm": 0.5140334963798523, + "learning_rate": 5.613534447615165e-06, + "loss": 4.0336, + "step": 51670 + }, + { + "epoch": 3.5110069302894416, + "grad_norm": 0.28188028931617737, + "learning_rate": 5.613109797526838e-06, + "loss": 4.2, + "step": 51675 + }, + { + "epoch": 3.511346650360103, + "grad_norm": 0.4476923644542694, + "learning_rate": 5.612685147438512e-06, + "loss": 4.076, + "step": 51680 + }, + { + "epoch": 3.5116863704307653, + "grad_norm": 0.22467415034770966, + "learning_rate": 5.6122604973501835e-06, + "loss": 4.0085, + "step": 51685 + }, + { + "epoch": 3.512026090501427, + "grad_norm": 0.28817474842071533, + "learning_rate": 5.611835847261856e-06, + "loss": 3.8282, + "step": 51690 + }, + { + "epoch": 3.5123658105720885, + "grad_norm": 0.2339165210723877, + "learning_rate": 5.61141119717353e-06, + "loss": 4.1315, + "step": 51695 + }, + { + "epoch": 3.5127055306427506, + "grad_norm": 0.3112815320491791, + "learning_rate": 5.611071477102867e-06, + "loss": 4.1872, + "step": 51700 + }, + { + "epoch": 3.5130452507134122, + "grad_norm": 0.3598974347114563, + "learning_rate": 5.610646827014541e-06, + "loss": 3.7551, + "step": 51705 + }, + { + "epoch": 3.513384970784074, + "grad_norm": 0.2935684025287628, + "learning_rate": 5.610222176926214e-06, + "loss": 4.2307, + "step": 51710 + }, + { + "epoch": 3.513724690854736, + "grad_norm": 0.33893099427223206, + "learning_rate": 5.609797526837886e-06, + "loss": 3.882, + "step": 51715 + }, + { + "epoch": 3.5140644109253976, + "grad_norm": 0.31923988461494446, + "learning_rate": 5.609372876749559e-06, + "loss": 4.1511, + "step": 51720 + }, + { + "epoch": 3.514404130996059, + "grad_norm": 0.2449861764907837, + "learning_rate": 5.608948226661231e-06, + "loss": 3.7457, + "step": 51725 + }, + { + "epoch": 3.5147438510667213, + "grad_norm": 0.32594847679138184, + "learning_rate": 5.608523576572904e-06, + "loss": 4.1417, + "step": 51730 + }, + { + "epoch": 3.515083571137383, + "grad_norm": 0.2572097182273865, + "learning_rate": 5.608098926484578e-06, + "loss": 4.0461, + "step": 51735 + }, + { + "epoch": 3.5154232912080445, + "grad_norm": 0.3292061984539032, + "learning_rate": 5.60767427639625e-06, + "loss": 3.9351, + "step": 51740 + }, + { + "epoch": 3.5157630112787066, + "grad_norm": 0.3630558252334595, + "learning_rate": 5.607249626307922e-06, + "loss": 4.0527, + "step": 51745 + }, + { + "epoch": 3.5161027313493682, + "grad_norm": 0.2926666736602783, + "learning_rate": 5.606824976219596e-06, + "loss": 4.1816, + "step": 51750 + }, + { + "epoch": 3.51644245142003, + "grad_norm": 0.30548661947250366, + "learning_rate": 5.606400326131268e-06, + "loss": 4.0273, + "step": 51755 + }, + { + "epoch": 3.516782171490692, + "grad_norm": 0.3910655081272125, + "learning_rate": 5.605975676042941e-06, + "loss": 4.0928, + "step": 51760 + }, + { + "epoch": 3.5171218915613536, + "grad_norm": 0.267047256231308, + "learning_rate": 5.6055510259546145e-06, + "loss": 4.017, + "step": 51765 + }, + { + "epoch": 3.517461611632015, + "grad_norm": 0.34861454367637634, + "learning_rate": 5.605126375866286e-06, + "loss": 3.845, + "step": 51770 + }, + { + "epoch": 3.517801331702677, + "grad_norm": 0.33884763717651367, + "learning_rate": 5.604701725777959e-06, + "loss": 4.0873, + "step": 51775 + }, + { + "epoch": 3.518141051773339, + "grad_norm": 0.2994120121002197, + "learning_rate": 5.604277075689633e-06, + "loss": 4.1258, + "step": 51780 + }, + { + "epoch": 3.5184807718440005, + "grad_norm": 0.3504497706890106, + "learning_rate": 5.603852425601305e-06, + "loss": 3.9629, + "step": 51785 + }, + { + "epoch": 3.518820491914662, + "grad_norm": 0.312033474445343, + "learning_rate": 5.603427775512978e-06, + "loss": 4.142, + "step": 51790 + }, + { + "epoch": 3.5191602119853242, + "grad_norm": 0.35450538992881775, + "learning_rate": 5.603003125424651e-06, + "loss": 4.0421, + "step": 51795 + }, + { + "epoch": 3.519499932055986, + "grad_norm": 0.2813437283039093, + "learning_rate": 5.602578475336323e-06, + "loss": 3.8341, + "step": 51800 + }, + { + "epoch": 3.5198396521266475, + "grad_norm": 0.40372997522354126, + "learning_rate": 5.602153825247996e-06, + "loss": 4.1111, + "step": 51805 + }, + { + "epoch": 3.520179372197309, + "grad_norm": 0.26021578907966614, + "learning_rate": 5.601729175159669e-06, + "loss": 3.9945, + "step": 51810 + }, + { + "epoch": 3.520519092267971, + "grad_norm": 0.3100302517414093, + "learning_rate": 5.601304525071342e-06, + "loss": 4.0621, + "step": 51815 + }, + { + "epoch": 3.520858812338633, + "grad_norm": 0.2700953483581543, + "learning_rate": 5.6008798749830136e-06, + "loss": 3.7774, + "step": 51820 + }, + { + "epoch": 3.5211985324092945, + "grad_norm": 0.24745973944664001, + "learning_rate": 5.600455224894687e-06, + "loss": 3.8733, + "step": 51825 + }, + { + "epoch": 3.5215382524799566, + "grad_norm": 0.3392847180366516, + "learning_rate": 5.60003057480636e-06, + "loss": 4.0775, + "step": 51830 + }, + { + "epoch": 3.521877972550618, + "grad_norm": 0.5099708437919617, + "learning_rate": 5.599605924718032e-06, + "loss": 4.161, + "step": 51835 + }, + { + "epoch": 3.52221769262128, + "grad_norm": 0.49964091181755066, + "learning_rate": 5.599181274629706e-06, + "loss": 3.8733, + "step": 51840 + }, + { + "epoch": 3.522557412691942, + "grad_norm": 0.30754467844963074, + "learning_rate": 5.5987566245413784e-06, + "loss": 3.9245, + "step": 51845 + }, + { + "epoch": 3.5228971327626035, + "grad_norm": 0.30071622133255005, + "learning_rate": 5.59833197445305e-06, + "loss": 4.0831, + "step": 51850 + }, + { + "epoch": 3.523236852833265, + "grad_norm": 0.33726125955581665, + "learning_rate": 5.597907324364724e-06, + "loss": 3.9469, + "step": 51855 + }, + { + "epoch": 3.5235765729039272, + "grad_norm": 0.31346678733825684, + "learning_rate": 5.597482674276397e-06, + "loss": 4.1837, + "step": 51860 + }, + { + "epoch": 3.523916292974589, + "grad_norm": 0.23737733066082, + "learning_rate": 5.597058024188069e-06, + "loss": 3.9542, + "step": 51865 + }, + { + "epoch": 3.5242560130452505, + "grad_norm": 0.30066606402397156, + "learning_rate": 5.5966333740997424e-06, + "loss": 3.8667, + "step": 51870 + }, + { + "epoch": 3.5245957331159126, + "grad_norm": 0.2542743980884552, + "learning_rate": 5.596208724011415e-06, + "loss": 4.0667, + "step": 51875 + }, + { + "epoch": 3.524935453186574, + "grad_norm": 0.3095589876174927, + "learning_rate": 5.595784073923087e-06, + "loss": 3.9058, + "step": 51880 + }, + { + "epoch": 3.525275173257236, + "grad_norm": 0.26978328824043274, + "learning_rate": 5.595359423834761e-06, + "loss": 3.8219, + "step": 51885 + }, + { + "epoch": 3.525614893327898, + "grad_norm": 0.2246703803539276, + "learning_rate": 5.594934773746434e-06, + "loss": 4.0055, + "step": 51890 + }, + { + "epoch": 3.5259546133985595, + "grad_norm": 0.2809813320636749, + "learning_rate": 5.594510123658106e-06, + "loss": 4.1303, + "step": 51895 + }, + { + "epoch": 3.526294333469221, + "grad_norm": 0.26726019382476807, + "learning_rate": 5.594085473569779e-06, + "loss": 3.8593, + "step": 51900 + }, + { + "epoch": 3.5266340535398832, + "grad_norm": 0.5026929974555969, + "learning_rate": 5.593660823481451e-06, + "loss": 4.031, + "step": 51905 + }, + { + "epoch": 3.526973773610545, + "grad_norm": 0.2979482412338257, + "learning_rate": 5.593236173393124e-06, + "loss": 4.1052, + "step": 51910 + }, + { + "epoch": 3.5273134936812065, + "grad_norm": 0.37676769495010376, + "learning_rate": 5.592811523304798e-06, + "loss": 4.1777, + "step": 51915 + }, + { + "epoch": 3.5276532137518686, + "grad_norm": 0.26854756474494934, + "learning_rate": 5.59238687321647e-06, + "loss": 3.7332, + "step": 51920 + }, + { + "epoch": 3.52799293382253, + "grad_norm": 0.35776281356811523, + "learning_rate": 5.591962223128143e-06, + "loss": 4.0725, + "step": 51925 + }, + { + "epoch": 3.528332653893192, + "grad_norm": 0.2664646804332733, + "learning_rate": 5.591537573039816e-06, + "loss": 4.1226, + "step": 51930 + }, + { + "epoch": 3.528672373963854, + "grad_norm": 0.26314637064933777, + "learning_rate": 5.591112922951488e-06, + "loss": 3.7362, + "step": 51935 + }, + { + "epoch": 3.5290120940345155, + "grad_norm": 0.35955771803855896, + "learning_rate": 5.590688272863162e-06, + "loss": 4.1112, + "step": 51940 + }, + { + "epoch": 3.529351814105177, + "grad_norm": 0.36164990067481995, + "learning_rate": 5.5902636227748344e-06, + "loss": 4.0479, + "step": 51945 + }, + { + "epoch": 3.5296915341758393, + "grad_norm": 0.3486993908882141, + "learning_rate": 5.589838972686506e-06, + "loss": 3.873, + "step": 51950 + }, + { + "epoch": 3.530031254246501, + "grad_norm": 0.32214829325675964, + "learning_rate": 5.58941432259818e-06, + "loss": 3.9198, + "step": 51955 + }, + { + "epoch": 3.5303709743171625, + "grad_norm": 0.2912079989910126, + "learning_rate": 5.588989672509853e-06, + "loss": 3.9974, + "step": 51960 + }, + { + "epoch": 3.5307106943878246, + "grad_norm": 0.3030076026916504, + "learning_rate": 5.588565022421525e-06, + "loss": 3.9981, + "step": 51965 + }, + { + "epoch": 3.531050414458486, + "grad_norm": 0.3175545334815979, + "learning_rate": 5.5881403723331984e-06, + "loss": 4.1907, + "step": 51970 + }, + { + "epoch": 3.531390134529148, + "grad_norm": 0.2732813358306885, + "learning_rate": 5.58771572224487e-06, + "loss": 3.6799, + "step": 51975 + }, + { + "epoch": 3.53172985459981, + "grad_norm": 0.2782655656337738, + "learning_rate": 5.587291072156543e-06, + "loss": 3.7401, + "step": 51980 + }, + { + "epoch": 3.5320695746704716, + "grad_norm": 0.2794867157936096, + "learning_rate": 5.586866422068217e-06, + "loss": 4.1236, + "step": 51985 + }, + { + "epoch": 3.532409294741133, + "grad_norm": 0.26005616784095764, + "learning_rate": 5.586441771979889e-06, + "loss": 4.0052, + "step": 51990 + }, + { + "epoch": 3.5327490148117953, + "grad_norm": 0.3663305342197418, + "learning_rate": 5.586017121891562e-06, + "loss": 4.0943, + "step": 51995 + }, + { + "epoch": 3.533088734882457, + "grad_norm": 0.4017591178417206, + "learning_rate": 5.585592471803235e-06, + "loss": 4.1336, + "step": 52000 + }, + { + "epoch": 3.5334284549531185, + "grad_norm": 0.562318742275238, + "learning_rate": 5.585167821714907e-06, + "loss": 4.1495, + "step": 52005 + }, + { + "epoch": 3.5337681750237806, + "grad_norm": 0.2733638882637024, + "learning_rate": 5.58474317162658e-06, + "loss": 3.7613, + "step": 52010 + }, + { + "epoch": 3.5341078950944422, + "grad_norm": 0.3687717914581299, + "learning_rate": 5.584318521538254e-06, + "loss": 4.1681, + "step": 52015 + }, + { + "epoch": 3.534447615165104, + "grad_norm": 0.2710268795490265, + "learning_rate": 5.583893871449926e-06, + "loss": 3.9724, + "step": 52020 + }, + { + "epoch": 3.534787335235766, + "grad_norm": 0.19683268666267395, + "learning_rate": 5.583469221361598e-06, + "loss": 3.884, + "step": 52025 + }, + { + "epoch": 3.5351270553064276, + "grad_norm": 0.2483166605234146, + "learning_rate": 5.583044571273272e-06, + "loss": 4.1106, + "step": 52030 + }, + { + "epoch": 3.535466775377089, + "grad_norm": 0.31876182556152344, + "learning_rate": 5.582619921184944e-06, + "loss": 4.2291, + "step": 52035 + }, + { + "epoch": 3.5358064954477513, + "grad_norm": 0.3691520392894745, + "learning_rate": 5.582195271096617e-06, + "loss": 4.0853, + "step": 52040 + }, + { + "epoch": 3.536146215518413, + "grad_norm": 0.28410473465919495, + "learning_rate": 5.5817706210082905e-06, + "loss": 4.1493, + "step": 52045 + }, + { + "epoch": 3.5364859355890745, + "grad_norm": 0.30513349175453186, + "learning_rate": 5.581345970919962e-06, + "loss": 4.2366, + "step": 52050 + }, + { + "epoch": 3.5368256556597366, + "grad_norm": 0.2861103117465973, + "learning_rate": 5.580921320831635e-06, + "loss": 4.0451, + "step": 52055 + }, + { + "epoch": 3.5371653757303982, + "grad_norm": 0.3129676878452301, + "learning_rate": 5.580496670743308e-06, + "loss": 4.1584, + "step": 52060 + }, + { + "epoch": 3.53750509580106, + "grad_norm": 0.2800575792789459, + "learning_rate": 5.580072020654981e-06, + "loss": 4.0608, + "step": 52065 + }, + { + "epoch": 3.537844815871722, + "grad_norm": 0.3099808096885681, + "learning_rate": 5.579647370566653e-06, + "loss": 3.7992, + "step": 52070 + }, + { + "epoch": 3.5381845359423836, + "grad_norm": 0.2689172625541687, + "learning_rate": 5.579222720478326e-06, + "loss": 4.0074, + "step": 52075 + }, + { + "epoch": 3.538524256013045, + "grad_norm": 0.3055102229118347, + "learning_rate": 5.578798070389999e-06, + "loss": 4.1728, + "step": 52080 + }, + { + "epoch": 3.5388639760837073, + "grad_norm": 0.29722860455513, + "learning_rate": 5.578373420301671e-06, + "loss": 3.9552, + "step": 52085 + }, + { + "epoch": 3.539203696154369, + "grad_norm": 0.383652001619339, + "learning_rate": 5.577948770213345e-06, + "loss": 3.8848, + "step": 52090 + }, + { + "epoch": 3.5395434162250305, + "grad_norm": 0.30131396651268005, + "learning_rate": 5.577524120125018e-06, + "loss": 4.2525, + "step": 52095 + }, + { + "epoch": 3.5398831362956926, + "grad_norm": 0.2176067978143692, + "learning_rate": 5.5770994700366896e-06, + "loss": 3.9729, + "step": 52100 + }, + { + "epoch": 3.5402228563663543, + "grad_norm": 0.19793090224266052, + "learning_rate": 5.576674819948363e-06, + "loss": 4.0158, + "step": 52105 + }, + { + "epoch": 3.540562576437016, + "grad_norm": 0.23961593210697174, + "learning_rate": 5.576250169860036e-06, + "loss": 3.9094, + "step": 52110 + }, + { + "epoch": 3.5409022965076775, + "grad_norm": 0.3111574947834015, + "learning_rate": 5.575825519771708e-06, + "loss": 4.1984, + "step": 52115 + }, + { + "epoch": 3.5412420165783396, + "grad_norm": 0.3385190963745117, + "learning_rate": 5.575400869683382e-06, + "loss": 3.8838, + "step": 52120 + }, + { + "epoch": 3.541581736649001, + "grad_norm": 0.27400386333465576, + "learning_rate": 5.574976219595054e-06, + "loss": 4.0314, + "step": 52125 + }, + { + "epoch": 3.541921456719663, + "grad_norm": 0.30544111132621765, + "learning_rate": 5.574551569506726e-06, + "loss": 4.1378, + "step": 52130 + }, + { + "epoch": 3.542261176790325, + "grad_norm": 0.2876983880996704, + "learning_rate": 5.5741269194184e-06, + "loss": 4.0778, + "step": 52135 + }, + { + "epoch": 3.5426008968609866, + "grad_norm": 0.46036893129348755, + "learning_rate": 5.573702269330073e-06, + "loss": 4.0227, + "step": 52140 + }, + { + "epoch": 3.542940616931648, + "grad_norm": 0.27808070182800293, + "learning_rate": 5.573277619241745e-06, + "loss": 4.0263, + "step": 52145 + }, + { + "epoch": 3.54328033700231, + "grad_norm": 0.2753634452819824, + "learning_rate": 5.5728529691534184e-06, + "loss": 3.8541, + "step": 52150 + }, + { + "epoch": 3.543620057072972, + "grad_norm": 0.296649694442749, + "learning_rate": 5.57242831906509e-06, + "loss": 4.1063, + "step": 52155 + }, + { + "epoch": 3.5439597771436335, + "grad_norm": 0.30419084429740906, + "learning_rate": 5.572003668976763e-06, + "loss": 4.2051, + "step": 52160 + }, + { + "epoch": 3.544299497214295, + "grad_norm": 0.3319060504436493, + "learning_rate": 5.571579018888437e-06, + "loss": 3.9267, + "step": 52165 + }, + { + "epoch": 3.5446392172849572, + "grad_norm": 0.3346010446548462, + "learning_rate": 5.571154368800109e-06, + "loss": 3.821, + "step": 52170 + }, + { + "epoch": 3.544978937355619, + "grad_norm": 0.2676739990711212, + "learning_rate": 5.570729718711782e-06, + "loss": 3.8404, + "step": 52175 + }, + { + "epoch": 3.5453186574262805, + "grad_norm": 0.37103644013404846, + "learning_rate": 5.570305068623455e-06, + "loss": 3.8769, + "step": 52180 + }, + { + "epoch": 3.5456583774969426, + "grad_norm": 0.25558602809906006, + "learning_rate": 5.569880418535127e-06, + "loss": 3.8323, + "step": 52185 + }, + { + "epoch": 3.545998097567604, + "grad_norm": 0.21012765169143677, + "learning_rate": 5.5694557684468e-06, + "loss": 3.8925, + "step": 52190 + }, + { + "epoch": 3.546337817638266, + "grad_norm": 0.3383244276046753, + "learning_rate": 5.569031118358474e-06, + "loss": 3.9623, + "step": 52195 + }, + { + "epoch": 3.546677537708928, + "grad_norm": 0.40889593958854675, + "learning_rate": 5.568606468270146e-06, + "loss": 4.065, + "step": 52200 + }, + { + "epoch": 3.5470172577795895, + "grad_norm": 0.3647099435329437, + "learning_rate": 5.568181818181818e-06, + "loss": 3.8785, + "step": 52205 + }, + { + "epoch": 3.547356977850251, + "grad_norm": 0.3223525285720825, + "learning_rate": 5.567757168093492e-06, + "loss": 3.984, + "step": 52210 + }, + { + "epoch": 3.5476966979209132, + "grad_norm": 0.2632036507129669, + "learning_rate": 5.567332518005164e-06, + "loss": 4.0423, + "step": 52215 + }, + { + "epoch": 3.548036417991575, + "grad_norm": 0.3114495575428009, + "learning_rate": 5.566907867916837e-06, + "loss": 3.9726, + "step": 52220 + }, + { + "epoch": 3.5483761380622365, + "grad_norm": 0.2788843810558319, + "learning_rate": 5.56648321782851e-06, + "loss": 3.8493, + "step": 52225 + }, + { + "epoch": 3.5487158581328986, + "grad_norm": 0.2976054251194, + "learning_rate": 5.566058567740182e-06, + "loss": 4.0718, + "step": 52230 + }, + { + "epoch": 3.54905557820356, + "grad_norm": 0.2764502167701721, + "learning_rate": 5.565633917651855e-06, + "loss": 3.8698, + "step": 52235 + }, + { + "epoch": 3.549395298274222, + "grad_norm": 0.458842009305954, + "learning_rate": 5.565209267563528e-06, + "loss": 3.9913, + "step": 52240 + }, + { + "epoch": 3.549735018344884, + "grad_norm": 0.5941668748855591, + "learning_rate": 5.564784617475201e-06, + "loss": 4.2215, + "step": 52245 + }, + { + "epoch": 3.5500747384155455, + "grad_norm": 0.25510072708129883, + "learning_rate": 5.564359967386873e-06, + "loss": 3.7965, + "step": 52250 + }, + { + "epoch": 3.550414458486207, + "grad_norm": 0.2887587249279022, + "learning_rate": 5.563935317298546e-06, + "loss": 3.9501, + "step": 52255 + }, + { + "epoch": 3.5507541785568693, + "grad_norm": 0.29599979519844055, + "learning_rate": 5.563510667210219e-06, + "loss": 3.946, + "step": 52260 + }, + { + "epoch": 3.551093898627531, + "grad_norm": 0.2590574622154236, + "learning_rate": 5.563086017121893e-06, + "loss": 3.909, + "step": 52265 + }, + { + "epoch": 3.5514336186981925, + "grad_norm": 0.29090163111686707, + "learning_rate": 5.562661367033565e-06, + "loss": 4.0454, + "step": 52270 + }, + { + "epoch": 3.5517733387688546, + "grad_norm": 0.2341231405735016, + "learning_rate": 5.562236716945238e-06, + "loss": 4.0254, + "step": 52275 + }, + { + "epoch": 3.5521130588395162, + "grad_norm": 0.3728874623775482, + "learning_rate": 5.561812066856911e-06, + "loss": 4.0282, + "step": 52280 + }, + { + "epoch": 3.552452778910178, + "grad_norm": 0.3527260422706604, + "learning_rate": 5.561387416768583e-06, + "loss": 4.0186, + "step": 52285 + }, + { + "epoch": 3.55279249898084, + "grad_norm": 0.31084415316581726, + "learning_rate": 5.560962766680256e-06, + "loss": 4.139, + "step": 52290 + }, + { + "epoch": 3.5531322190515016, + "grad_norm": 0.30333876609802246, + "learning_rate": 5.560538116591929e-06, + "loss": 4.2486, + "step": 52295 + }, + { + "epoch": 3.553471939122163, + "grad_norm": 0.2622332274913788, + "learning_rate": 5.560113466503602e-06, + "loss": 3.7031, + "step": 52300 + }, + { + "epoch": 3.5538116591928253, + "grad_norm": 0.29371702671051025, + "learning_rate": 5.559688816415274e-06, + "loss": 4.0498, + "step": 52305 + }, + { + "epoch": 3.554151379263487, + "grad_norm": 0.31681352853775024, + "learning_rate": 5.559264166326947e-06, + "loss": 3.8916, + "step": 52310 + }, + { + "epoch": 3.5544910993341485, + "grad_norm": 0.23230314254760742, + "learning_rate": 5.55883951623862e-06, + "loss": 3.9355, + "step": 52315 + }, + { + "epoch": 3.5548308194048106, + "grad_norm": 0.27255553007125854, + "learning_rate": 5.558414866150292e-06, + "loss": 3.9719, + "step": 52320 + }, + { + "epoch": 3.5551705394754722, + "grad_norm": 0.21610155701637268, + "learning_rate": 5.557990216061966e-06, + "loss": 4.1221, + "step": 52325 + }, + { + "epoch": 3.555510259546134, + "grad_norm": 0.27921879291534424, + "learning_rate": 5.557565565973638e-06, + "loss": 4.0976, + "step": 52330 + }, + { + "epoch": 3.555849979616796, + "grad_norm": 0.39384666085243225, + "learning_rate": 5.55714091588531e-06, + "loss": 4.1382, + "step": 52335 + }, + { + "epoch": 3.5561896996874576, + "grad_norm": 0.24380050599575043, + "learning_rate": 5.556716265796984e-06, + "loss": 3.8129, + "step": 52340 + }, + { + "epoch": 3.556529419758119, + "grad_norm": 0.47374865412712097, + "learning_rate": 5.556291615708657e-06, + "loss": 3.9509, + "step": 52345 + }, + { + "epoch": 3.5568691398287813, + "grad_norm": 0.26523569226264954, + "learning_rate": 5.555866965620329e-06, + "loss": 3.9001, + "step": 52350 + }, + { + "epoch": 3.557208859899443, + "grad_norm": 0.35588181018829346, + "learning_rate": 5.555442315532002e-06, + "loss": 3.9467, + "step": 52355 + }, + { + "epoch": 3.5575485799701045, + "grad_norm": 0.2536468207836151, + "learning_rate": 5.555017665443675e-06, + "loss": 4.1601, + "step": 52360 + }, + { + "epoch": 3.5578883000407666, + "grad_norm": 0.2479034960269928, + "learning_rate": 5.554593015355347e-06, + "loss": 3.853, + "step": 52365 + }, + { + "epoch": 3.5582280201114282, + "grad_norm": 0.3426201343536377, + "learning_rate": 5.554168365267021e-06, + "loss": 4.0055, + "step": 52370 + }, + { + "epoch": 3.55856774018209, + "grad_norm": 0.38879895210266113, + "learning_rate": 5.553743715178694e-06, + "loss": 3.8779, + "step": 52375 + }, + { + "epoch": 3.558907460252752, + "grad_norm": 0.26272228360176086, + "learning_rate": 5.5533190650903656e-06, + "loss": 3.965, + "step": 52380 + }, + { + "epoch": 3.5592471803234136, + "grad_norm": 0.30743733048439026, + "learning_rate": 5.552894415002039e-06, + "loss": 4.1024, + "step": 52385 + }, + { + "epoch": 3.559586900394075, + "grad_norm": 0.9248998165130615, + "learning_rate": 5.552469764913711e-06, + "loss": 4.2662, + "step": 52390 + }, + { + "epoch": 3.5599266204647373, + "grad_norm": 0.4579523503780365, + "learning_rate": 5.552045114825384e-06, + "loss": 3.9273, + "step": 52395 + }, + { + "epoch": 3.560266340535399, + "grad_norm": 0.33460670709609985, + "learning_rate": 5.551620464737058e-06, + "loss": 3.8747, + "step": 52400 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.2660001516342163, + "learning_rate": 5.5511958146487296e-06, + "loss": 3.936, + "step": 52405 + }, + { + "epoch": 3.5609457806767226, + "grad_norm": 0.296764999628067, + "learning_rate": 5.550771164560402e-06, + "loss": 4.3271, + "step": 52410 + }, + { + "epoch": 3.5612855007473843, + "grad_norm": 0.3860391676425934, + "learning_rate": 5.550346514472076e-06, + "loss": 3.9713, + "step": 52415 + }, + { + "epoch": 3.561625220818046, + "grad_norm": 0.28292933106422424, + "learning_rate": 5.549921864383748e-06, + "loss": 3.9388, + "step": 52420 + }, + { + "epoch": 3.561964940888708, + "grad_norm": 0.28050997853279114, + "learning_rate": 5.549497214295421e-06, + "loss": 3.9845, + "step": 52425 + }, + { + "epoch": 3.5623046609593696, + "grad_norm": 0.23604248464107513, + "learning_rate": 5.549072564207094e-06, + "loss": 3.9008, + "step": 52430 + }, + { + "epoch": 3.5626443810300312, + "grad_norm": 0.2617349922657013, + "learning_rate": 5.548647914118766e-06, + "loss": 4.2106, + "step": 52435 + }, + { + "epoch": 3.5629841011006933, + "grad_norm": 0.34565410017967224, + "learning_rate": 5.548223264030439e-06, + "loss": 3.8444, + "step": 52440 + }, + { + "epoch": 3.563323821171355, + "grad_norm": 0.4019630551338196, + "learning_rate": 5.547798613942113e-06, + "loss": 4.0953, + "step": 52445 + }, + { + "epoch": 3.5636635412420166, + "grad_norm": 0.5050804018974304, + "learning_rate": 5.547373963853785e-06, + "loss": 4.0257, + "step": 52450 + }, + { + "epoch": 3.564003261312678, + "grad_norm": 0.38360920548439026, + "learning_rate": 5.546949313765458e-06, + "loss": 4.0714, + "step": 52455 + }, + { + "epoch": 3.5643429813833403, + "grad_norm": 0.30456510186195374, + "learning_rate": 5.546524663677131e-06, + "loss": 4.0339, + "step": 52460 + }, + { + "epoch": 3.564682701454002, + "grad_norm": 0.2849016487598419, + "learning_rate": 5.546100013588803e-06, + "loss": 4.0248, + "step": 52465 + }, + { + "epoch": 3.5650224215246635, + "grad_norm": 0.30945178866386414, + "learning_rate": 5.545675363500476e-06, + "loss": 3.9031, + "step": 52470 + }, + { + "epoch": 3.5653621415953256, + "grad_norm": 0.3378139138221741, + "learning_rate": 5.545250713412149e-06, + "loss": 4.064, + "step": 52475 + }, + { + "epoch": 3.5657018616659872, + "grad_norm": 0.3054908514022827, + "learning_rate": 5.544826063323822e-06, + "loss": 4.1387, + "step": 52480 + }, + { + "epoch": 3.566041581736649, + "grad_norm": 0.28352871537208557, + "learning_rate": 5.544401413235494e-06, + "loss": 4.1331, + "step": 52485 + }, + { + "epoch": 3.5663813018073105, + "grad_norm": 0.4252399802207947, + "learning_rate": 5.543976763147167e-06, + "loss": 4.0868, + "step": 52490 + }, + { + "epoch": 3.5667210218779726, + "grad_norm": 0.285638689994812, + "learning_rate": 5.54355211305884e-06, + "loss": 4.1517, + "step": 52495 + }, + { + "epoch": 3.567060741948634, + "grad_norm": 0.4027678966522217, + "learning_rate": 5.543127462970512e-06, + "loss": 3.7657, + "step": 52500 + }, + { + "epoch": 3.567400462019296, + "grad_norm": 0.30986374616622925, + "learning_rate": 5.542702812882186e-06, + "loss": 3.9385, + "step": 52505 + }, + { + "epoch": 3.567740182089958, + "grad_norm": 0.21709398925304413, + "learning_rate": 5.542278162793858e-06, + "loss": 3.928, + "step": 52510 + }, + { + "epoch": 3.5680799021606195, + "grad_norm": 0.3287184536457062, + "learning_rate": 5.54185351270553e-06, + "loss": 3.7062, + "step": 52515 + }, + { + "epoch": 3.568419622231281, + "grad_norm": 0.2540283203125, + "learning_rate": 5.541428862617204e-06, + "loss": 4.0478, + "step": 52520 + }, + { + "epoch": 3.5687593423019432, + "grad_norm": 0.3902058005332947, + "learning_rate": 5.541004212528877e-06, + "loss": 4.368, + "step": 52525 + }, + { + "epoch": 3.569099062372605, + "grad_norm": 0.2729368805885315, + "learning_rate": 5.540579562440549e-06, + "loss": 3.9935, + "step": 52530 + }, + { + "epoch": 3.5694387824432665, + "grad_norm": 0.3067176640033722, + "learning_rate": 5.540154912352222e-06, + "loss": 3.7479, + "step": 52535 + }, + { + "epoch": 3.5697785025139286, + "grad_norm": 0.2875843048095703, + "learning_rate": 5.539730262263895e-06, + "loss": 4.1587, + "step": 52540 + }, + { + "epoch": 3.57011822258459, + "grad_norm": 0.3321269750595093, + "learning_rate": 5.539305612175567e-06, + "loss": 4.0868, + "step": 52545 + }, + { + "epoch": 3.570457942655252, + "grad_norm": 0.5362895727157593, + "learning_rate": 5.538880962087241e-06, + "loss": 4.0047, + "step": 52550 + }, + { + "epoch": 3.570797662725914, + "grad_norm": 0.32530736923217773, + "learning_rate": 5.538456311998914e-06, + "loss": 4.1088, + "step": 52555 + }, + { + "epoch": 3.5711373827965756, + "grad_norm": 0.27117660641670227, + "learning_rate": 5.5380316619105855e-06, + "loss": 3.9409, + "step": 52560 + }, + { + "epoch": 3.571477102867237, + "grad_norm": 0.43399420380592346, + "learning_rate": 5.537607011822259e-06, + "loss": 3.8946, + "step": 52565 + }, + { + "epoch": 3.5718168229378993, + "grad_norm": 0.22727592289447784, + "learning_rate": 5.537182361733931e-06, + "loss": 3.9103, + "step": 52570 + }, + { + "epoch": 3.572156543008561, + "grad_norm": 0.2737791836261749, + "learning_rate": 5.536757711645604e-06, + "loss": 4.2448, + "step": 52575 + }, + { + "epoch": 3.5724962630792225, + "grad_norm": 0.29590877890586853, + "learning_rate": 5.536333061557278e-06, + "loss": 4.1019, + "step": 52580 + }, + { + "epoch": 3.5728359831498846, + "grad_norm": 0.3118896186351776, + "learning_rate": 5.5359084114689496e-06, + "loss": 4.2826, + "step": 52585 + }, + { + "epoch": 3.5731757032205462, + "grad_norm": 0.8297372460365295, + "learning_rate": 5.535483761380622e-06, + "loss": 4.218, + "step": 52590 + }, + { + "epoch": 3.573515423291208, + "grad_norm": 0.3567030727863312, + "learning_rate": 5.535059111292296e-06, + "loss": 3.9569, + "step": 52595 + }, + { + "epoch": 3.57385514336187, + "grad_norm": 0.2503286898136139, + "learning_rate": 5.534634461203968e-06, + "loss": 4.011, + "step": 52600 + }, + { + "epoch": 3.5741948634325316, + "grad_norm": 0.21605952084064484, + "learning_rate": 5.534209811115642e-06, + "loss": 3.9337, + "step": 52605 + }, + { + "epoch": 3.574534583503193, + "grad_norm": 0.40738824009895325, + "learning_rate": 5.533785161027314e-06, + "loss": 4.1897, + "step": 52610 + }, + { + "epoch": 3.5748743035738553, + "grad_norm": 0.3347904682159424, + "learning_rate": 5.533360510938986e-06, + "loss": 3.9786, + "step": 52615 + }, + { + "epoch": 3.575214023644517, + "grad_norm": 0.24076105654239655, + "learning_rate": 5.53293586085066e-06, + "loss": 3.9705, + "step": 52620 + }, + { + "epoch": 3.5755537437151785, + "grad_norm": 0.30616262555122375, + "learning_rate": 5.532511210762333e-06, + "loss": 3.9591, + "step": 52625 + }, + { + "epoch": 3.5758934637858406, + "grad_norm": 0.3264245092868805, + "learning_rate": 5.532086560674005e-06, + "loss": 4.0861, + "step": 52630 + }, + { + "epoch": 3.5762331838565022, + "grad_norm": 0.4581214189529419, + "learning_rate": 5.531661910585678e-06, + "loss": 4.0182, + "step": 52635 + }, + { + "epoch": 3.576572903927164, + "grad_norm": 0.35292381048202515, + "learning_rate": 5.53123726049735e-06, + "loss": 3.9264, + "step": 52640 + }, + { + "epoch": 3.576912623997826, + "grad_norm": 0.23817278444766998, + "learning_rate": 5.530812610409023e-06, + "loss": 3.8593, + "step": 52645 + }, + { + "epoch": 3.5772523440684876, + "grad_norm": 0.26209330558776855, + "learning_rate": 5.530387960320697e-06, + "loss": 4.4091, + "step": 52650 + }, + { + "epoch": 3.577592064139149, + "grad_norm": 0.3263472616672516, + "learning_rate": 5.529963310232369e-06, + "loss": 4.0423, + "step": 52655 + }, + { + "epoch": 3.5779317842098113, + "grad_norm": 0.30559396743774414, + "learning_rate": 5.5295386601440416e-06, + "loss": 4.0945, + "step": 52660 + }, + { + "epoch": 3.578271504280473, + "grad_norm": 0.25244179368019104, + "learning_rate": 5.529114010055715e-06, + "loss": 3.8048, + "step": 52665 + }, + { + "epoch": 3.5786112243511345, + "grad_norm": 0.37744709849357605, + "learning_rate": 5.528689359967387e-06, + "loss": 3.9988, + "step": 52670 + }, + { + "epoch": 3.5789509444217966, + "grad_norm": 0.2632368803024292, + "learning_rate": 5.52826470987906e-06, + "loss": 3.9794, + "step": 52675 + }, + { + "epoch": 3.5792906644924583, + "grad_norm": 0.23333927989006042, + "learning_rate": 5.527840059790734e-06, + "loss": 4.0741, + "step": 52680 + }, + { + "epoch": 3.57963038456312, + "grad_norm": 0.27712392807006836, + "learning_rate": 5.5274154097024056e-06, + "loss": 4.1451, + "step": 52685 + }, + { + "epoch": 3.579970104633782, + "grad_norm": 0.32057827711105347, + "learning_rate": 5.526990759614078e-06, + "loss": 3.9983, + "step": 52690 + }, + { + "epoch": 3.5803098247044436, + "grad_norm": 0.30932319164276123, + "learning_rate": 5.526566109525752e-06, + "loss": 4.0351, + "step": 52695 + }, + { + "epoch": 3.580649544775105, + "grad_norm": 0.3378918170928955, + "learning_rate": 5.526141459437424e-06, + "loss": 4.2332, + "step": 52700 + }, + { + "epoch": 3.5809892648457673, + "grad_norm": 0.2607005536556244, + "learning_rate": 5.525716809349097e-06, + "loss": 3.9213, + "step": 52705 + }, + { + "epoch": 3.581328984916429, + "grad_norm": 0.3612997531890869, + "learning_rate": 5.52529215926077e-06, + "loss": 4.0922, + "step": 52710 + }, + { + "epoch": 3.5816687049870906, + "grad_norm": 0.23875446617603302, + "learning_rate": 5.524867509172442e-06, + "loss": 4.1042, + "step": 52715 + }, + { + "epoch": 3.5820084250577526, + "grad_norm": 0.34844309091567993, + "learning_rate": 5.524442859084115e-06, + "loss": 4.0373, + "step": 52720 + }, + { + "epoch": 3.5823481451284143, + "grad_norm": 0.304025799036026, + "learning_rate": 5.524018208995788e-06, + "loss": 3.9037, + "step": 52725 + }, + { + "epoch": 3.582687865199076, + "grad_norm": 0.3605744540691376, + "learning_rate": 5.523593558907461e-06, + "loss": 3.914, + "step": 52730 + }, + { + "epoch": 3.583027585269738, + "grad_norm": 0.24614089727401733, + "learning_rate": 5.523168908819133e-06, + "loss": 4.0472, + "step": 52735 + }, + { + "epoch": 3.5833673053403996, + "grad_norm": 0.5582446455955505, + "learning_rate": 5.522744258730806e-06, + "loss": 3.9567, + "step": 52740 + }, + { + "epoch": 3.5837070254110612, + "grad_norm": 0.23668625950813293, + "learning_rate": 5.522319608642479e-06, + "loss": 4.026, + "step": 52745 + }, + { + "epoch": 3.5840467454817233, + "grad_norm": 0.3155452311038971, + "learning_rate": 5.521894958554151e-06, + "loss": 4.0, + "step": 52750 + }, + { + "epoch": 3.584386465552385, + "grad_norm": 0.26444342732429504, + "learning_rate": 5.521470308465825e-06, + "loss": 3.9983, + "step": 52755 + }, + { + "epoch": 3.5847261856230466, + "grad_norm": 0.3139782249927521, + "learning_rate": 5.521045658377498e-06, + "loss": 4.2342, + "step": 52760 + }, + { + "epoch": 3.5850659056937086, + "grad_norm": 0.3382418751716614, + "learning_rate": 5.5206210082891695e-06, + "loss": 3.9875, + "step": 52765 + }, + { + "epoch": 3.5854056257643703, + "grad_norm": 0.236204132437706, + "learning_rate": 5.520196358200843e-06, + "loss": 3.9136, + "step": 52770 + }, + { + "epoch": 3.585745345835032, + "grad_norm": 0.2786565124988556, + "learning_rate": 5.519771708112516e-06, + "loss": 3.9711, + "step": 52775 + }, + { + "epoch": 3.586085065905694, + "grad_norm": 0.2089240401983261, + "learning_rate": 5.519347058024188e-06, + "loss": 3.9296, + "step": 52780 + }, + { + "epoch": 3.5864247859763556, + "grad_norm": 0.2811935544013977, + "learning_rate": 5.518922407935862e-06, + "loss": 3.8299, + "step": 52785 + }, + { + "epoch": 3.5867645060470172, + "grad_norm": 0.23338516056537628, + "learning_rate": 5.518497757847534e-06, + "loss": 4.1147, + "step": 52790 + }, + { + "epoch": 3.587104226117679, + "grad_norm": 0.2954792082309723, + "learning_rate": 5.518073107759206e-06, + "loss": 4.1291, + "step": 52795 + }, + { + "epoch": 3.587443946188341, + "grad_norm": 0.24122470617294312, + "learning_rate": 5.51764845767088e-06, + "loss": 4.2382, + "step": 52800 + }, + { + "epoch": 3.5877836662590026, + "grad_norm": 0.26981550455093384, + "learning_rate": 5.517223807582553e-06, + "loss": 3.9478, + "step": 52805 + }, + { + "epoch": 3.588123386329664, + "grad_norm": 0.29882127046585083, + "learning_rate": 5.516799157494225e-06, + "loss": 4.2711, + "step": 52810 + }, + { + "epoch": 3.5884631064003263, + "grad_norm": 0.2619360685348511, + "learning_rate": 5.516374507405898e-06, + "loss": 3.9983, + "step": 52815 + }, + { + "epoch": 3.588802826470988, + "grad_norm": 0.31735897064208984, + "learning_rate": 5.51594985731757e-06, + "loss": 3.9584, + "step": 52820 + }, + { + "epoch": 3.5891425465416495, + "grad_norm": 0.33554643392562866, + "learning_rate": 5.515525207229243e-06, + "loss": 4.0412, + "step": 52825 + }, + { + "epoch": 3.589482266612311, + "grad_norm": 0.24550692737102509, + "learning_rate": 5.515100557140917e-06, + "loss": 4.0093, + "step": 52830 + }, + { + "epoch": 3.5898219866829733, + "grad_norm": 0.2740858793258667, + "learning_rate": 5.514675907052589e-06, + "loss": 4.0922, + "step": 52835 + }, + { + "epoch": 3.590161706753635, + "grad_norm": 0.30510786175727844, + "learning_rate": 5.5142512569642615e-06, + "loss": 4.1753, + "step": 52840 + }, + { + "epoch": 3.5905014268242965, + "grad_norm": 0.3121318221092224, + "learning_rate": 5.513826606875935e-06, + "loss": 3.894, + "step": 52845 + }, + { + "epoch": 3.5908411468949586, + "grad_norm": 0.34656214714050293, + "learning_rate": 5.513401956787607e-06, + "loss": 4.0214, + "step": 52850 + }, + { + "epoch": 3.59118086696562, + "grad_norm": 0.657360315322876, + "learning_rate": 5.51297730669928e-06, + "loss": 4.1611, + "step": 52855 + }, + { + "epoch": 3.591520587036282, + "grad_norm": 0.22201021015644073, + "learning_rate": 5.512552656610954e-06, + "loss": 3.878, + "step": 52860 + }, + { + "epoch": 3.591860307106944, + "grad_norm": 0.3201509118080139, + "learning_rate": 5.5121280065226255e-06, + "loss": 3.9768, + "step": 52865 + }, + { + "epoch": 3.5922000271776056, + "grad_norm": 0.37057751417160034, + "learning_rate": 5.511703356434298e-06, + "loss": 4.0135, + "step": 52870 + }, + { + "epoch": 3.592539747248267, + "grad_norm": 0.3168802559375763, + "learning_rate": 5.511278706345972e-06, + "loss": 3.9621, + "step": 52875 + }, + { + "epoch": 3.5928794673189293, + "grad_norm": 0.2910158932209015, + "learning_rate": 5.510854056257644e-06, + "loss": 3.9293, + "step": 52880 + }, + { + "epoch": 3.593219187389591, + "grad_norm": 0.24882356822490692, + "learning_rate": 5.510429406169317e-06, + "loss": 3.9785, + "step": 52885 + }, + { + "epoch": 3.5935589074602525, + "grad_norm": 0.24403227865695953, + "learning_rate": 5.5100047560809896e-06, + "loss": 3.7959, + "step": 52890 + }, + { + "epoch": 3.5938986275309146, + "grad_norm": 0.2582967281341553, + "learning_rate": 5.509580105992662e-06, + "loss": 3.8459, + "step": 52895 + }, + { + "epoch": 3.5942383476015762, + "grad_norm": 0.34516069293022156, + "learning_rate": 5.509155455904335e-06, + "loss": 3.8901, + "step": 52900 + }, + { + "epoch": 3.594578067672238, + "grad_norm": 0.28960108757019043, + "learning_rate": 5.508730805816008e-06, + "loss": 4.023, + "step": 52905 + }, + { + "epoch": 3.5949177877429, + "grad_norm": 0.2539430260658264, + "learning_rate": 5.508306155727681e-06, + "loss": 4.0896, + "step": 52910 + }, + { + "epoch": 3.5952575078135616, + "grad_norm": 0.26871052384376526, + "learning_rate": 5.507881505639353e-06, + "loss": 4.2152, + "step": 52915 + }, + { + "epoch": 3.595597227884223, + "grad_norm": 0.21524113416671753, + "learning_rate": 5.507456855551026e-06, + "loss": 3.9799, + "step": 52920 + }, + { + "epoch": 3.5959369479548853, + "grad_norm": 0.294577419757843, + "learning_rate": 5.507032205462699e-06, + "loss": 3.5652, + "step": 52925 + }, + { + "epoch": 3.596276668025547, + "grad_norm": 0.37645941972732544, + "learning_rate": 5.506607555374371e-06, + "loss": 3.938, + "step": 52930 + }, + { + "epoch": 3.5966163880962085, + "grad_norm": 0.24043014645576477, + "learning_rate": 5.506182905286045e-06, + "loss": 3.8357, + "step": 52935 + }, + { + "epoch": 3.5969561081668706, + "grad_norm": 0.33054807782173157, + "learning_rate": 5.5057582551977176e-06, + "loss": 3.948, + "step": 52940 + }, + { + "epoch": 3.5972958282375322, + "grad_norm": 0.3216985762119293, + "learning_rate": 5.505333605109391e-06, + "loss": 3.8062, + "step": 52945 + }, + { + "epoch": 3.597635548308194, + "grad_norm": 0.3394196629524231, + "learning_rate": 5.504908955021063e-06, + "loss": 4.1208, + "step": 52950 + }, + { + "epoch": 3.597975268378856, + "grad_norm": 0.33497780561447144, + "learning_rate": 5.504484304932736e-06, + "loss": 4.0155, + "step": 52955 + }, + { + "epoch": 3.5983149884495176, + "grad_norm": 0.24495388567447662, + "learning_rate": 5.504059654844409e-06, + "loss": 4.0545, + "step": 52960 + }, + { + "epoch": 3.598654708520179, + "grad_norm": 0.26348739862442017, + "learning_rate": 5.5036350047560816e-06, + "loss": 3.9205, + "step": 52965 + }, + { + "epoch": 3.5989944285908413, + "grad_norm": 0.3248676657676697, + "learning_rate": 5.503210354667754e-06, + "loss": 3.9586, + "step": 52970 + }, + { + "epoch": 3.599334148661503, + "grad_norm": 0.20808054506778717, + "learning_rate": 5.502785704579427e-06, + "loss": 4.1427, + "step": 52975 + }, + { + "epoch": 3.5996738687321646, + "grad_norm": 0.26677244901657104, + "learning_rate": 5.5023610544911e-06, + "loss": 4.1401, + "step": 52980 + }, + { + "epoch": 3.6000135888028266, + "grad_norm": 0.3858265280723572, + "learning_rate": 5.501936404402772e-06, + "loss": 3.9602, + "step": 52985 + }, + { + "epoch": 3.6003533088734883, + "grad_norm": 0.5456172823905945, + "learning_rate": 5.5015117543144456e-06, + "loss": 4.0897, + "step": 52990 + }, + { + "epoch": 3.60069302894415, + "grad_norm": 0.27465179562568665, + "learning_rate": 5.501087104226118e-06, + "loss": 4.3087, + "step": 52995 + }, + { + "epoch": 3.601032749014812, + "grad_norm": 0.2700183689594269, + "learning_rate": 5.50066245413779e-06, + "loss": 3.7111, + "step": 53000 + }, + { + "epoch": 3.6013724690854736, + "grad_norm": 0.21822984516620636, + "learning_rate": 5.500237804049464e-06, + "loss": 3.9645, + "step": 53005 + }, + { + "epoch": 3.6017121891561352, + "grad_norm": 0.2843429148197174, + "learning_rate": 5.499813153961137e-06, + "loss": 3.9357, + "step": 53010 + }, + { + "epoch": 3.6020519092267973, + "grad_norm": 0.2734817564487457, + "learning_rate": 5.499388503872809e-06, + "loss": 3.9248, + "step": 53015 + }, + { + "epoch": 3.602391629297459, + "grad_norm": 0.29663053154945374, + "learning_rate": 5.498963853784482e-06, + "loss": 3.8683, + "step": 53020 + }, + { + "epoch": 3.6027313493681206, + "grad_norm": 0.34608227014541626, + "learning_rate": 5.498539203696155e-06, + "loss": 3.8831, + "step": 53025 + }, + { + "epoch": 3.6030710694387826, + "grad_norm": 0.2996266782283783, + "learning_rate": 5.498114553607827e-06, + "loss": 3.9479, + "step": 53030 + }, + { + "epoch": 3.6034107895094443, + "grad_norm": 0.3097565770149231, + "learning_rate": 5.497689903519501e-06, + "loss": 3.8995, + "step": 53035 + }, + { + "epoch": 3.603750509580106, + "grad_norm": 0.3101276755332947, + "learning_rate": 5.4972652534311736e-06, + "loss": 3.9232, + "step": 53040 + }, + { + "epoch": 3.604090229650768, + "grad_norm": 0.31920918822288513, + "learning_rate": 5.4968406033428455e-06, + "loss": 4.0225, + "step": 53045 + }, + { + "epoch": 3.6044299497214296, + "grad_norm": 0.3157196342945099, + "learning_rate": 5.496415953254519e-06, + "loss": 3.938, + "step": 53050 + }, + { + "epoch": 3.6047696697920912, + "grad_norm": 0.2539858818054199, + "learning_rate": 5.495991303166192e-06, + "loss": 3.8743, + "step": 53055 + }, + { + "epoch": 3.6051093898627533, + "grad_norm": 0.32244113087654114, + "learning_rate": 5.495566653077864e-06, + "loss": 3.9929, + "step": 53060 + }, + { + "epoch": 3.605449109933415, + "grad_norm": 0.3075619339942932, + "learning_rate": 5.495142002989538e-06, + "loss": 4.0697, + "step": 53065 + }, + { + "epoch": 3.6057888300040766, + "grad_norm": 0.20904117822647095, + "learning_rate": 5.4947173529012095e-06, + "loss": 3.9118, + "step": 53070 + }, + { + "epoch": 3.6061285500747386, + "grad_norm": 0.34004727005958557, + "learning_rate": 5.494292702812882e-06, + "loss": 4.0278, + "step": 53075 + }, + { + "epoch": 3.6064682701454003, + "grad_norm": 0.2974352538585663, + "learning_rate": 5.493868052724556e-06, + "loss": 4.01, + "step": 53080 + }, + { + "epoch": 3.606807990216062, + "grad_norm": 0.3019084930419922, + "learning_rate": 5.493443402636228e-06, + "loss": 3.8084, + "step": 53085 + }, + { + "epoch": 3.607147710286724, + "grad_norm": 0.25891178846359253, + "learning_rate": 5.493018752547901e-06, + "loss": 3.9073, + "step": 53090 + }, + { + "epoch": 3.6074874303573856, + "grad_norm": 0.3712993264198303, + "learning_rate": 5.492594102459574e-06, + "loss": 3.9681, + "step": 53095 + }, + { + "epoch": 3.6078271504280472, + "grad_norm": 0.2504538893699646, + "learning_rate": 5.492169452371246e-06, + "loss": 4.0338, + "step": 53100 + }, + { + "epoch": 3.6081668704987093, + "grad_norm": 0.2300759106874466, + "learning_rate": 5.491744802282919e-06, + "loss": 4.0369, + "step": 53105 + }, + { + "epoch": 3.608506590569371, + "grad_norm": 0.29668989777565, + "learning_rate": 5.491320152194593e-06, + "loss": 3.9033, + "step": 53110 + }, + { + "epoch": 3.6088463106400326, + "grad_norm": 0.2874857783317566, + "learning_rate": 5.490895502106265e-06, + "loss": 3.9524, + "step": 53115 + }, + { + "epoch": 3.6091860307106947, + "grad_norm": 0.31294354796409607, + "learning_rate": 5.4904708520179375e-06, + "loss": 4.0614, + "step": 53120 + }, + { + "epoch": 3.6095257507813563, + "grad_norm": 0.32445600628852844, + "learning_rate": 5.490046201929611e-06, + "loss": 4.2645, + "step": 53125 + }, + { + "epoch": 3.609865470852018, + "grad_norm": 0.3535427749156952, + "learning_rate": 5.489621551841283e-06, + "loss": 4.1474, + "step": 53130 + }, + { + "epoch": 3.6102051909226796, + "grad_norm": 0.2634577751159668, + "learning_rate": 5.489196901752956e-06, + "loss": 4.1107, + "step": 53135 + }, + { + "epoch": 3.6105449109933416, + "grad_norm": 0.25658243894577026, + "learning_rate": 5.488772251664629e-06, + "loss": 3.8601, + "step": 53140 + }, + { + "epoch": 3.6108846310640033, + "grad_norm": 0.34517234563827515, + "learning_rate": 5.4883476015763015e-06, + "loss": 3.9887, + "step": 53145 + }, + { + "epoch": 3.611224351134665, + "grad_norm": 0.2935846149921417, + "learning_rate": 5.487922951487974e-06, + "loss": 4.1901, + "step": 53150 + }, + { + "epoch": 3.611564071205327, + "grad_norm": 0.2546500563621521, + "learning_rate": 5.487498301399647e-06, + "loss": 3.9703, + "step": 53155 + }, + { + "epoch": 3.6119037912759886, + "grad_norm": 0.2707505524158478, + "learning_rate": 5.48707365131132e-06, + "loss": 3.8194, + "step": 53160 + }, + { + "epoch": 3.6122435113466502, + "grad_norm": 0.2641737163066864, + "learning_rate": 5.486649001222992e-06, + "loss": 4.0982, + "step": 53165 + }, + { + "epoch": 3.612583231417312, + "grad_norm": 0.2482622265815735, + "learning_rate": 5.4862243511346655e-06, + "loss": 3.8522, + "step": 53170 + }, + { + "epoch": 3.612922951487974, + "grad_norm": 0.3929363489151001, + "learning_rate": 5.485799701046338e-06, + "loss": 3.9776, + "step": 53175 + }, + { + "epoch": 3.6132626715586356, + "grad_norm": 0.27094361186027527, + "learning_rate": 5.48537505095801e-06, + "loss": 3.9427, + "step": 53180 + }, + { + "epoch": 3.613602391629297, + "grad_norm": 0.4322945475578308, + "learning_rate": 5.484950400869684e-06, + "loss": 3.8604, + "step": 53185 + }, + { + "epoch": 3.6139421116999593, + "grad_norm": 0.4753023087978363, + "learning_rate": 5.484525750781357e-06, + "loss": 4.1175, + "step": 53190 + }, + { + "epoch": 3.614281831770621, + "grad_norm": 0.30484673380851746, + "learning_rate": 5.484101100693029e-06, + "loss": 4.1233, + "step": 53195 + }, + { + "epoch": 3.6146215518412825, + "grad_norm": 0.33213648200035095, + "learning_rate": 5.483676450604702e-06, + "loss": 3.9815, + "step": 53200 + }, + { + "epoch": 3.6149612719119446, + "grad_norm": 0.35763871669769287, + "learning_rate": 5.483251800516375e-06, + "loss": 3.919, + "step": 53205 + }, + { + "epoch": 3.6153009919826062, + "grad_norm": 0.33298856019973755, + "learning_rate": 5.482827150428047e-06, + "loss": 4.1873, + "step": 53210 + }, + { + "epoch": 3.615640712053268, + "grad_norm": 0.6505615711212158, + "learning_rate": 5.482402500339721e-06, + "loss": 4.2757, + "step": 53215 + }, + { + "epoch": 3.61598043212393, + "grad_norm": 0.24404919147491455, + "learning_rate": 5.4819778502513936e-06, + "loss": 3.9925, + "step": 53220 + }, + { + "epoch": 3.6163201521945916, + "grad_norm": 0.36005038022994995, + "learning_rate": 5.4815532001630655e-06, + "loss": 3.9652, + "step": 53225 + }, + { + "epoch": 3.616659872265253, + "grad_norm": 0.2663913071155548, + "learning_rate": 5.481128550074739e-06, + "loss": 4.0958, + "step": 53230 + }, + { + "epoch": 3.6169995923359153, + "grad_norm": 0.22736883163452148, + "learning_rate": 5.480703899986411e-06, + "loss": 3.9182, + "step": 53235 + }, + { + "epoch": 3.617339312406577, + "grad_norm": 0.33480337262153625, + "learning_rate": 5.480279249898084e-06, + "loss": 4.1012, + "step": 53240 + }, + { + "epoch": 3.6176790324772385, + "grad_norm": 0.3799399137496948, + "learning_rate": 5.4798545998097576e-06, + "loss": 4.2179, + "step": 53245 + }, + { + "epoch": 3.6180187525479006, + "grad_norm": 0.2908582091331482, + "learning_rate": 5.4794299497214295e-06, + "loss": 4.0055, + "step": 53250 + }, + { + "epoch": 3.6183584726185622, + "grad_norm": 0.3860257863998413, + "learning_rate": 5.479005299633102e-06, + "loss": 4.0054, + "step": 53255 + }, + { + "epoch": 3.618698192689224, + "grad_norm": 0.3189333379268646, + "learning_rate": 5.478580649544776e-06, + "loss": 3.9603, + "step": 53260 + }, + { + "epoch": 3.619037912759886, + "grad_norm": 0.31818392872810364, + "learning_rate": 5.478155999456448e-06, + "loss": 3.9095, + "step": 53265 + }, + { + "epoch": 3.6193776328305476, + "grad_norm": 0.3180738091468811, + "learning_rate": 5.477731349368121e-06, + "loss": 3.8079, + "step": 53270 + }, + { + "epoch": 3.619717352901209, + "grad_norm": 0.3523971736431122, + "learning_rate": 5.477306699279794e-06, + "loss": 4.1079, + "step": 53275 + }, + { + "epoch": 3.6200570729718713, + "grad_norm": 0.4290299117565155, + "learning_rate": 5.476882049191466e-06, + "loss": 4.2279, + "step": 53280 + }, + { + "epoch": 3.620396793042533, + "grad_norm": 0.3197324872016907, + "learning_rate": 5.47645739910314e-06, + "loss": 3.9006, + "step": 53285 + }, + { + "epoch": 3.6207365131131946, + "grad_norm": 0.2796623408794403, + "learning_rate": 5.476032749014813e-06, + "loss": 4.0945, + "step": 53290 + }, + { + "epoch": 3.6210762331838566, + "grad_norm": 0.3348744809627533, + "learning_rate": 5.475608098926485e-06, + "loss": 3.9601, + "step": 53295 + }, + { + "epoch": 3.6214159532545183, + "grad_norm": 0.33099138736724854, + "learning_rate": 5.475183448838158e-06, + "loss": 4.1705, + "step": 53300 + }, + { + "epoch": 3.62175567332518, + "grad_norm": 0.3437923192977905, + "learning_rate": 5.47475879874983e-06, + "loss": 3.9749, + "step": 53305 + }, + { + "epoch": 3.622095393395842, + "grad_norm": 0.2676003873348236, + "learning_rate": 5.474334148661503e-06, + "loss": 3.9455, + "step": 53310 + }, + { + "epoch": 3.6224351134665036, + "grad_norm": 0.253348708152771, + "learning_rate": 5.473909498573177e-06, + "loss": 4.0577, + "step": 53315 + }, + { + "epoch": 3.6227748335371652, + "grad_norm": 0.3803572654724121, + "learning_rate": 5.473484848484849e-06, + "loss": 4.0498, + "step": 53320 + }, + { + "epoch": 3.6231145536078273, + "grad_norm": 0.255016952753067, + "learning_rate": 5.4730601983965215e-06, + "loss": 4.0004, + "step": 53325 + }, + { + "epoch": 3.623454273678489, + "grad_norm": 0.26024502515792847, + "learning_rate": 5.472635548308195e-06, + "loss": 4.0273, + "step": 53330 + }, + { + "epoch": 3.6237939937491506, + "grad_norm": 0.3204329311847687, + "learning_rate": 5.472210898219867e-06, + "loss": 4.0322, + "step": 53335 + }, + { + "epoch": 3.6241337138198126, + "grad_norm": 0.29575473070144653, + "learning_rate": 5.47178624813154e-06, + "loss": 4.1978, + "step": 53340 + }, + { + "epoch": 3.6244734338904743, + "grad_norm": 0.2896324694156647, + "learning_rate": 5.4713615980432136e-06, + "loss": 3.8255, + "step": 53345 + }, + { + "epoch": 3.624813153961136, + "grad_norm": 0.3386075794696808, + "learning_rate": 5.4709369479548855e-06, + "loss": 3.9295, + "step": 53350 + }, + { + "epoch": 3.625152874031798, + "grad_norm": 0.30367541313171387, + "learning_rate": 5.470512297866558e-06, + "loss": 4.0579, + "step": 53355 + }, + { + "epoch": 3.6254925941024596, + "grad_norm": 0.30948251485824585, + "learning_rate": 5.470087647778232e-06, + "loss": 4.1968, + "step": 53360 + }, + { + "epoch": 3.6258323141731212, + "grad_norm": 0.21384912729263306, + "learning_rate": 5.469662997689904e-06, + "loss": 3.8669, + "step": 53365 + }, + { + "epoch": 3.6261720342437833, + "grad_norm": 0.2669837474822998, + "learning_rate": 5.469238347601577e-06, + "loss": 4.1263, + "step": 53370 + }, + { + "epoch": 3.626511754314445, + "grad_norm": 0.29736602306365967, + "learning_rate": 5.46881369751325e-06, + "loss": 3.9401, + "step": 53375 + }, + { + "epoch": 3.6268514743851066, + "grad_norm": 0.31950780749320984, + "learning_rate": 5.468389047424922e-06, + "loss": 4.246, + "step": 53380 + }, + { + "epoch": 3.6271911944557687, + "grad_norm": 0.22838173806667328, + "learning_rate": 5.467964397336595e-06, + "loss": 4.1324, + "step": 53385 + }, + { + "epoch": 3.6275309145264303, + "grad_norm": 0.34203147888183594, + "learning_rate": 5.467539747248268e-06, + "loss": 4.0313, + "step": 53390 + }, + { + "epoch": 3.627870634597092, + "grad_norm": 0.27809008955955505, + "learning_rate": 5.467115097159941e-06, + "loss": 3.9802, + "step": 53395 + }, + { + "epoch": 3.628210354667754, + "grad_norm": 0.3611108362674713, + "learning_rate": 5.4666904470716135e-06, + "loss": 4.0231, + "step": 53400 + }, + { + "epoch": 3.6285500747384156, + "grad_norm": 0.3838385343551636, + "learning_rate": 5.466265796983286e-06, + "loss": 4.1911, + "step": 53405 + }, + { + "epoch": 3.6288897948090773, + "grad_norm": 0.3895691931247711, + "learning_rate": 5.465841146894959e-06, + "loss": 4.17, + "step": 53410 + }, + { + "epoch": 3.6292295148797393, + "grad_norm": 0.3506527245044708, + "learning_rate": 5.465416496806631e-06, + "loss": 3.7367, + "step": 53415 + }, + { + "epoch": 3.629569234950401, + "grad_norm": 0.3017815351486206, + "learning_rate": 5.464991846718305e-06, + "loss": 3.8072, + "step": 53420 + }, + { + "epoch": 3.6299089550210626, + "grad_norm": 0.2702261507511139, + "learning_rate": 5.4645671966299775e-06, + "loss": 4.0504, + "step": 53425 + }, + { + "epoch": 3.6302486750917247, + "grad_norm": 0.3129432797431946, + "learning_rate": 5.4641425465416495e-06, + "loss": 4.2811, + "step": 53430 + }, + { + "epoch": 3.6305883951623863, + "grad_norm": 0.2596615254878998, + "learning_rate": 5.463717896453323e-06, + "loss": 4.0078, + "step": 53435 + }, + { + "epoch": 3.630928115233048, + "grad_norm": 0.2848029136657715, + "learning_rate": 5.463293246364996e-06, + "loss": 4.1354, + "step": 53440 + }, + { + "epoch": 3.63126783530371, + "grad_norm": 0.27714869379997253, + "learning_rate": 5.462868596276668e-06, + "loss": 3.8747, + "step": 53445 + }, + { + "epoch": 3.6316075553743716, + "grad_norm": 0.1998296082019806, + "learning_rate": 5.4624439461883415e-06, + "loss": 3.9428, + "step": 53450 + }, + { + "epoch": 3.6319472754450333, + "grad_norm": 0.25601664185523987, + "learning_rate": 5.462019296100014e-06, + "loss": 4.045, + "step": 53455 + }, + { + "epoch": 3.6322869955156953, + "grad_norm": 0.4841800630092621, + "learning_rate": 5.461594646011686e-06, + "loss": 3.7674, + "step": 53460 + }, + { + "epoch": 3.632626715586357, + "grad_norm": 0.3039047122001648, + "learning_rate": 5.46116999592336e-06, + "loss": 3.968, + "step": 53465 + }, + { + "epoch": 3.6329664356570186, + "grad_norm": 0.2564915716648102, + "learning_rate": 5.460745345835033e-06, + "loss": 4.0275, + "step": 53470 + }, + { + "epoch": 3.6333061557276802, + "grad_norm": 0.31475555896759033, + "learning_rate": 5.460320695746705e-06, + "loss": 3.9433, + "step": 53475 + }, + { + "epoch": 3.6336458757983423, + "grad_norm": 0.27436670660972595, + "learning_rate": 5.459896045658378e-06, + "loss": 4.163, + "step": 53480 + }, + { + "epoch": 3.633985595869004, + "grad_norm": 0.27634912729263306, + "learning_rate": 5.45947139557005e-06, + "loss": 4.065, + "step": 53485 + }, + { + "epoch": 3.6343253159396656, + "grad_norm": 0.3092660903930664, + "learning_rate": 5.459046745481723e-06, + "loss": 4.0065, + "step": 53490 + }, + { + "epoch": 3.6346650360103276, + "grad_norm": 0.3285192847251892, + "learning_rate": 5.458622095393397e-06, + "loss": 4.0187, + "step": 53495 + }, + { + "epoch": 3.6350047560809893, + "grad_norm": 0.2674078345298767, + "learning_rate": 5.458197445305069e-06, + "loss": 3.9992, + "step": 53500 + }, + { + "epoch": 3.635344476151651, + "grad_norm": 0.28802168369293213, + "learning_rate": 5.4577727952167415e-06, + "loss": 4.0974, + "step": 53505 + }, + { + "epoch": 3.635684196222313, + "grad_norm": 0.3344692885875702, + "learning_rate": 5.457348145128415e-06, + "loss": 4.0084, + "step": 53510 + }, + { + "epoch": 3.6360239162929746, + "grad_norm": 0.2510889768600464, + "learning_rate": 5.456923495040087e-06, + "loss": 4.0827, + "step": 53515 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.29009485244750977, + "learning_rate": 5.45649884495176e-06, + "loss": 4.2129, + "step": 53520 + }, + { + "epoch": 3.636703356434298, + "grad_norm": 0.36152321100234985, + "learning_rate": 5.4560741948634336e-06, + "loss": 4.1284, + "step": 53525 + }, + { + "epoch": 3.63704307650496, + "grad_norm": 0.3199556767940521, + "learning_rate": 5.4556495447751055e-06, + "loss": 4.1949, + "step": 53530 + }, + { + "epoch": 3.6373827965756216, + "grad_norm": 0.31329545378685, + "learning_rate": 5.455224894686778e-06, + "loss": 4.0317, + "step": 53535 + }, + { + "epoch": 3.637722516646283, + "grad_norm": 0.33884385228157043, + "learning_rate": 5.454800244598452e-06, + "loss": 4.0129, + "step": 53540 + }, + { + "epoch": 3.6380622367169453, + "grad_norm": 0.3335493803024292, + "learning_rate": 5.454375594510124e-06, + "loss": 3.9655, + "step": 53545 + }, + { + "epoch": 3.638401956787607, + "grad_norm": 0.27943000197410583, + "learning_rate": 5.453950944421797e-06, + "loss": 3.9618, + "step": 53550 + }, + { + "epoch": 3.6387416768582685, + "grad_norm": 0.2852763831615448, + "learning_rate": 5.4535262943334695e-06, + "loss": 4.2311, + "step": 53555 + }, + { + "epoch": 3.6390813969289306, + "grad_norm": 0.3443024754524231, + "learning_rate": 5.453101644245142e-06, + "loss": 3.8505, + "step": 53560 + }, + { + "epoch": 3.6394211169995923, + "grad_norm": 0.31900754570961, + "learning_rate": 5.452676994156815e-06, + "loss": 4.225, + "step": 53565 + }, + { + "epoch": 3.639760837070254, + "grad_norm": 0.314211905002594, + "learning_rate": 5.452252344068488e-06, + "loss": 3.9484, + "step": 53570 + }, + { + "epoch": 3.640100557140916, + "grad_norm": 0.34194350242614746, + "learning_rate": 5.451827693980161e-06, + "loss": 4.3006, + "step": 53575 + }, + { + "epoch": 3.6404402772115776, + "grad_norm": 0.25904521346092224, + "learning_rate": 5.451403043891833e-06, + "loss": 4.0265, + "step": 53580 + }, + { + "epoch": 3.6407799972822392, + "grad_norm": 0.2599686086177826, + "learning_rate": 5.450978393803506e-06, + "loss": 3.8353, + "step": 53585 + }, + { + "epoch": 3.6411197173529013, + "grad_norm": 0.26747527718544006, + "learning_rate": 5.450553743715179e-06, + "loss": 4.1678, + "step": 53590 + }, + { + "epoch": 3.641459437423563, + "grad_norm": 0.3373306691646576, + "learning_rate": 5.450129093626851e-06, + "loss": 4.0921, + "step": 53595 + }, + { + "epoch": 3.6417991574942246, + "grad_norm": 0.30961164832115173, + "learning_rate": 5.449704443538525e-06, + "loss": 4.0836, + "step": 53600 + }, + { + "epoch": 3.6421388775648866, + "grad_norm": 0.2594524323940277, + "learning_rate": 5.4492797934501975e-06, + "loss": 3.9346, + "step": 53605 + }, + { + "epoch": 3.6424785976355483, + "grad_norm": 0.30886536836624146, + "learning_rate": 5.4488551433618695e-06, + "loss": 3.8295, + "step": 53610 + }, + { + "epoch": 3.64281831770621, + "grad_norm": 0.320007860660553, + "learning_rate": 5.448430493273543e-06, + "loss": 4.0544, + "step": 53615 + }, + { + "epoch": 3.643158037776872, + "grad_norm": 0.43006882071495056, + "learning_rate": 5.448005843185216e-06, + "loss": 4.1137, + "step": 53620 + }, + { + "epoch": 3.6434977578475336, + "grad_norm": 0.2754196524620056, + "learning_rate": 5.4475811930968896e-06, + "loss": 4.2003, + "step": 53625 + }, + { + "epoch": 3.6438374779181952, + "grad_norm": 0.2572307884693146, + "learning_rate": 5.4471565430085615e-06, + "loss": 4.1866, + "step": 53630 + }, + { + "epoch": 3.6441771979888573, + "grad_norm": 0.3593023419380188, + "learning_rate": 5.446731892920234e-06, + "loss": 3.8232, + "step": 53635 + }, + { + "epoch": 3.644516918059519, + "grad_norm": 0.3079235851764679, + "learning_rate": 5.446307242831907e-06, + "loss": 4.1803, + "step": 53640 + }, + { + "epoch": 3.6448566381301806, + "grad_norm": 0.2782585918903351, + "learning_rate": 5.44588259274358e-06, + "loss": 3.9085, + "step": 53645 + }, + { + "epoch": 3.6451963582008426, + "grad_norm": 0.2836547791957855, + "learning_rate": 5.445457942655252e-06, + "loss": 3.9714, + "step": 53650 + }, + { + "epoch": 3.6455360782715043, + "grad_norm": 0.4855504035949707, + "learning_rate": 5.4450332925669255e-06, + "loss": 3.9099, + "step": 53655 + }, + { + "epoch": 3.645875798342166, + "grad_norm": 0.7073346972465515, + "learning_rate": 5.444608642478598e-06, + "loss": 4.1852, + "step": 53660 + }, + { + "epoch": 3.646215518412828, + "grad_norm": 0.31473076343536377, + "learning_rate": 5.44418399239027e-06, + "loss": 4.1031, + "step": 53665 + }, + { + "epoch": 3.6465552384834896, + "grad_norm": 0.2653859555721283, + "learning_rate": 5.443759342301944e-06, + "loss": 4.0061, + "step": 53670 + }, + { + "epoch": 3.6468949585541512, + "grad_norm": 0.3289342522621155, + "learning_rate": 5.443334692213617e-06, + "loss": 3.8505, + "step": 53675 + }, + { + "epoch": 3.6472346786248133, + "grad_norm": 0.31100499629974365, + "learning_rate": 5.442910042125289e-06, + "loss": 3.993, + "step": 53680 + }, + { + "epoch": 3.647574398695475, + "grad_norm": 0.23424090445041656, + "learning_rate": 5.442485392036962e-06, + "loss": 4.0512, + "step": 53685 + }, + { + "epoch": 3.6479141187661366, + "grad_norm": 0.2543686330318451, + "learning_rate": 5.442060741948635e-06, + "loss": 4.1212, + "step": 53690 + }, + { + "epoch": 3.6482538388367987, + "grad_norm": 0.2577255666255951, + "learning_rate": 5.441636091860307e-06, + "loss": 3.9648, + "step": 53695 + }, + { + "epoch": 3.6485935589074603, + "grad_norm": 0.25055980682373047, + "learning_rate": 5.441211441771981e-06, + "loss": 3.7615, + "step": 53700 + }, + { + "epoch": 3.648933278978122, + "grad_norm": 0.4850315749645233, + "learning_rate": 5.4407867916836535e-06, + "loss": 4.0445, + "step": 53705 + }, + { + "epoch": 3.649272999048784, + "grad_norm": 0.29487475752830505, + "learning_rate": 5.4403621415953255e-06, + "loss": 4.1579, + "step": 53710 + }, + { + "epoch": 3.6496127191194456, + "grad_norm": 0.3424092233181, + "learning_rate": 5.439937491506999e-06, + "loss": 4.2796, + "step": 53715 + }, + { + "epoch": 3.6499524391901073, + "grad_norm": 0.257255882024765, + "learning_rate": 5.439512841418672e-06, + "loss": 4.0379, + "step": 53720 + }, + { + "epoch": 3.6502921592607693, + "grad_norm": 0.36272379755973816, + "learning_rate": 5.439088191330344e-06, + "loss": 3.9967, + "step": 53725 + }, + { + "epoch": 3.650631879331431, + "grad_norm": 0.3728756308555603, + "learning_rate": 5.4386635412420175e-06, + "loss": 4.1344, + "step": 53730 + }, + { + "epoch": 3.6509715994020926, + "grad_norm": 0.286321759223938, + "learning_rate": 5.4382388911536895e-06, + "loss": 4.015, + "step": 53735 + }, + { + "epoch": 3.6513113194727547, + "grad_norm": 0.29097220301628113, + "learning_rate": 5.437814241065362e-06, + "loss": 3.8387, + "step": 53740 + }, + { + "epoch": 3.6516510395434163, + "grad_norm": 0.30461275577545166, + "learning_rate": 5.437389590977036e-06, + "loss": 3.8199, + "step": 53745 + }, + { + "epoch": 3.651990759614078, + "grad_norm": 0.32263630628585815, + "learning_rate": 5.436964940888708e-06, + "loss": 3.9568, + "step": 53750 + }, + { + "epoch": 3.65233047968474, + "grad_norm": 0.34361010789871216, + "learning_rate": 5.436540290800381e-06, + "loss": 4.1927, + "step": 53755 + }, + { + "epoch": 3.6526701997554016, + "grad_norm": 0.30702465772628784, + "learning_rate": 5.436115640712054e-06, + "loss": 3.9488, + "step": 53760 + }, + { + "epoch": 3.6530099198260633, + "grad_norm": 0.43408140540122986, + "learning_rate": 5.435690990623726e-06, + "loss": 4.0121, + "step": 53765 + }, + { + "epoch": 3.6533496398967253, + "grad_norm": 0.25881609320640564, + "learning_rate": 5.435266340535399e-06, + "loss": 3.8342, + "step": 53770 + }, + { + "epoch": 3.653689359967387, + "grad_norm": 0.2607782781124115, + "learning_rate": 5.434841690447073e-06, + "loss": 4.0971, + "step": 53775 + }, + { + "epoch": 3.6540290800380486, + "grad_norm": 0.3805084228515625, + "learning_rate": 5.434417040358745e-06, + "loss": 4.0677, + "step": 53780 + }, + { + "epoch": 3.6543688001087107, + "grad_norm": 0.24771080911159515, + "learning_rate": 5.4339923902704175e-06, + "loss": 3.9815, + "step": 53785 + }, + { + "epoch": 3.6547085201793723, + "grad_norm": 0.3612104058265686, + "learning_rate": 5.433567740182091e-06, + "loss": 4.0241, + "step": 53790 + }, + { + "epoch": 3.655048240250034, + "grad_norm": 0.37702104449272156, + "learning_rate": 5.433143090093763e-06, + "loss": 4.0159, + "step": 53795 + }, + { + "epoch": 3.655387960320696, + "grad_norm": 0.37234780192375183, + "learning_rate": 5.432718440005436e-06, + "loss": 4.0206, + "step": 53800 + }, + { + "epoch": 3.6557276803913576, + "grad_norm": 0.3703904449939728, + "learning_rate": 5.432293789917109e-06, + "loss": 4.1779, + "step": 53805 + }, + { + "epoch": 3.6560674004620193, + "grad_norm": 0.3691060245037079, + "learning_rate": 5.4318691398287815e-06, + "loss": 3.9574, + "step": 53810 + }, + { + "epoch": 3.656407120532681, + "grad_norm": 0.2830825448036194, + "learning_rate": 5.431444489740454e-06, + "loss": 4.0, + "step": 53815 + }, + { + "epoch": 3.656746840603343, + "grad_norm": 0.25979849696159363, + "learning_rate": 5.431019839652127e-06, + "loss": 4.1468, + "step": 53820 + }, + { + "epoch": 3.6570865606740046, + "grad_norm": 0.30851060152053833, + "learning_rate": 5.430680119581465e-06, + "loss": 4.1636, + "step": 53825 + }, + { + "epoch": 3.6574262807446662, + "grad_norm": 0.2875680923461914, + "learning_rate": 5.430255469493139e-06, + "loss": 3.8161, + "step": 53830 + }, + { + "epoch": 3.6577660008153283, + "grad_norm": 0.3173634111881256, + "learning_rate": 5.429830819404811e-06, + "loss": 3.7461, + "step": 53835 + }, + { + "epoch": 3.65810572088599, + "grad_norm": 0.47015753388404846, + "learning_rate": 5.429406169316484e-06, + "loss": 3.9332, + "step": 53840 + }, + { + "epoch": 3.6584454409566516, + "grad_norm": 0.4385720491409302, + "learning_rate": 5.428981519228157e-06, + "loss": 3.8452, + "step": 53845 + }, + { + "epoch": 3.6587851610273137, + "grad_norm": 0.260597825050354, + "learning_rate": 5.428556869139829e-06, + "loss": 4.0134, + "step": 53850 + }, + { + "epoch": 3.6591248810979753, + "grad_norm": 0.3301398456096649, + "learning_rate": 5.428217149069167e-06, + "loss": 4.1116, + "step": 53855 + }, + { + "epoch": 3.659464601168637, + "grad_norm": 0.31505823135375977, + "learning_rate": 5.427792498980841e-06, + "loss": 4.0427, + "step": 53860 + }, + { + "epoch": 3.6598043212392986, + "grad_norm": 0.27379080653190613, + "learning_rate": 5.427367848892513e-06, + "loss": 3.9334, + "step": 53865 + }, + { + "epoch": 3.6601440413099606, + "grad_norm": 0.2984023690223694, + "learning_rate": 5.426943198804186e-06, + "loss": 3.9024, + "step": 53870 + }, + { + "epoch": 3.6604837613806223, + "grad_norm": 0.21709318459033966, + "learning_rate": 5.426518548715859e-06, + "loss": 3.9719, + "step": 53875 + }, + { + "epoch": 3.660823481451284, + "grad_norm": 0.26137223839759827, + "learning_rate": 5.426093898627531e-06, + "loss": 3.8535, + "step": 53880 + }, + { + "epoch": 3.661163201521946, + "grad_norm": 0.2203391194343567, + "learning_rate": 5.425669248539204e-06, + "loss": 3.8158, + "step": 53885 + }, + { + "epoch": 3.6615029215926076, + "grad_norm": 0.23055186867713928, + "learning_rate": 5.425244598450877e-06, + "loss": 3.8899, + "step": 53890 + }, + { + "epoch": 3.6618426416632692, + "grad_norm": 0.27468088269233704, + "learning_rate": 5.42481994836255e-06, + "loss": 4.0617, + "step": 53895 + }, + { + "epoch": 3.6621823617339313, + "grad_norm": 0.28815728425979614, + "learning_rate": 5.4243952982742225e-06, + "loss": 3.9847, + "step": 53900 + }, + { + "epoch": 3.662522081804593, + "grad_norm": 0.3036688268184662, + "learning_rate": 5.423970648185895e-06, + "loss": 4.0975, + "step": 53905 + }, + { + "epoch": 3.6628618018752546, + "grad_norm": 0.24273206293582916, + "learning_rate": 5.423545998097568e-06, + "loss": 3.9451, + "step": 53910 + }, + { + "epoch": 3.6632015219459166, + "grad_norm": 0.31668347120285034, + "learning_rate": 5.42312134800924e-06, + "loss": 3.8349, + "step": 53915 + }, + { + "epoch": 3.6635412420165783, + "grad_norm": 0.26252979040145874, + "learning_rate": 5.422696697920914e-06, + "loss": 3.9442, + "step": 53920 + }, + { + "epoch": 3.66388096208724, + "grad_norm": 0.3043847680091858, + "learning_rate": 5.4222720478325865e-06, + "loss": 4.0472, + "step": 53925 + }, + { + "epoch": 3.664220682157902, + "grad_norm": 0.23164427280426025, + "learning_rate": 5.421847397744258e-06, + "loss": 4.0752, + "step": 53930 + }, + { + "epoch": 3.6645604022285636, + "grad_norm": 0.2601160705089569, + "learning_rate": 5.421422747655932e-06, + "loss": 3.9366, + "step": 53935 + }, + { + "epoch": 3.6649001222992252, + "grad_norm": 0.3417150378227234, + "learning_rate": 5.420998097567605e-06, + "loss": 3.9946, + "step": 53940 + }, + { + "epoch": 3.6652398423698873, + "grad_norm": 0.2944501042366028, + "learning_rate": 5.420573447479277e-06, + "loss": 3.8896, + "step": 53945 + }, + { + "epoch": 3.665579562440549, + "grad_norm": 0.3096897006034851, + "learning_rate": 5.4201487973909505e-06, + "loss": 3.9472, + "step": 53950 + }, + { + "epoch": 3.6659192825112106, + "grad_norm": 0.4857281446456909, + "learning_rate": 5.419724147302623e-06, + "loss": 4.2633, + "step": 53955 + }, + { + "epoch": 3.6662590025818727, + "grad_norm": 0.48470228910446167, + "learning_rate": 5.419299497214295e-06, + "loss": 3.977, + "step": 53960 + }, + { + "epoch": 3.6665987226525343, + "grad_norm": 0.29335927963256836, + "learning_rate": 5.418874847125969e-06, + "loss": 3.9993, + "step": 53965 + }, + { + "epoch": 3.666938442723196, + "grad_norm": 0.3927261233329773, + "learning_rate": 5.418450197037642e-06, + "loss": 4.1013, + "step": 53970 + }, + { + "epoch": 3.667278162793858, + "grad_norm": 0.31387078762054443, + "learning_rate": 5.418025546949314e-06, + "loss": 4.4664, + "step": 53975 + }, + { + "epoch": 3.6676178828645196, + "grad_norm": 0.35505130887031555, + "learning_rate": 5.417600896860987e-06, + "loss": 4.1383, + "step": 53980 + }, + { + "epoch": 3.6679576029351812, + "grad_norm": 0.3086337447166443, + "learning_rate": 5.417176246772659e-06, + "loss": 4.0678, + "step": 53985 + }, + { + "epoch": 3.6682973230058433, + "grad_norm": 0.29646772146224976, + "learning_rate": 5.416751596684332e-06, + "loss": 3.7668, + "step": 53990 + }, + { + "epoch": 3.668637043076505, + "grad_norm": 0.2777829170227051, + "learning_rate": 5.416326946596006e-06, + "loss": 4.0092, + "step": 53995 + }, + { + "epoch": 3.6689767631471666, + "grad_norm": 0.3505461513996124, + "learning_rate": 5.415902296507678e-06, + "loss": 4.0202, + "step": 54000 + }, + { + "epoch": 3.6693164832178287, + "grad_norm": 0.2769280672073364, + "learning_rate": 5.4154776464193504e-06, + "loss": 3.963, + "step": 54005 + }, + { + "epoch": 3.6696562032884903, + "grad_norm": 0.21325497329235077, + "learning_rate": 5.415052996331024e-06, + "loss": 3.937, + "step": 54010 + }, + { + "epoch": 3.669995923359152, + "grad_norm": 0.25381043553352356, + "learning_rate": 5.414628346242696e-06, + "loss": 4.0357, + "step": 54015 + }, + { + "epoch": 3.670335643429814, + "grad_norm": 0.26270654797554016, + "learning_rate": 5.414203696154369e-06, + "loss": 3.8783, + "step": 54020 + }, + { + "epoch": 3.6706753635004756, + "grad_norm": 0.28753331303596497, + "learning_rate": 5.4137790460660425e-06, + "loss": 3.8356, + "step": 54025 + }, + { + "epoch": 3.6710150835711373, + "grad_norm": 0.2888298034667969, + "learning_rate": 5.4133543959777144e-06, + "loss": 3.9871, + "step": 54030 + }, + { + "epoch": 3.6713548036417993, + "grad_norm": 0.2835046947002411, + "learning_rate": 5.412929745889388e-06, + "loss": 4.0189, + "step": 54035 + }, + { + "epoch": 3.671694523712461, + "grad_norm": 0.24742329120635986, + "learning_rate": 5.412505095801061e-06, + "loss": 4.251, + "step": 54040 + }, + { + "epoch": 3.6720342437831226, + "grad_norm": 0.27753645181655884, + "learning_rate": 5.412080445712733e-06, + "loss": 4.0749, + "step": 54045 + }, + { + "epoch": 3.6723739638537847, + "grad_norm": 0.21896933019161224, + "learning_rate": 5.4116557956244065e-06, + "loss": 3.9552, + "step": 54050 + }, + { + "epoch": 3.6727136839244463, + "grad_norm": 0.3862415552139282, + "learning_rate": 5.411231145536079e-06, + "loss": 4.0815, + "step": 54055 + }, + { + "epoch": 3.673053403995108, + "grad_norm": 0.2782331705093384, + "learning_rate": 5.410806495447751e-06, + "loss": 4.1115, + "step": 54060 + }, + { + "epoch": 3.67339312406577, + "grad_norm": 0.2667519748210907, + "learning_rate": 5.410381845359425e-06, + "loss": 4.0628, + "step": 54065 + }, + { + "epoch": 3.6737328441364316, + "grad_norm": 0.34110748767852783, + "learning_rate": 5.409957195271097e-06, + "loss": 4.189, + "step": 54070 + }, + { + "epoch": 3.6740725642070933, + "grad_norm": 0.3010605573654175, + "learning_rate": 5.40953254518277e-06, + "loss": 4.2228, + "step": 54075 + }, + { + "epoch": 3.6744122842777553, + "grad_norm": 0.2831747531890869, + "learning_rate": 5.409107895094443e-06, + "loss": 4.1105, + "step": 54080 + }, + { + "epoch": 3.674752004348417, + "grad_norm": 0.3543800413608551, + "learning_rate": 5.408683245006115e-06, + "loss": 4.0538, + "step": 54085 + }, + { + "epoch": 3.6750917244190786, + "grad_norm": 0.25091293454170227, + "learning_rate": 5.408258594917788e-06, + "loss": 3.9534, + "step": 54090 + }, + { + "epoch": 3.6754314444897407, + "grad_norm": 0.23942774534225464, + "learning_rate": 5.407833944829462e-06, + "loss": 3.8277, + "step": 54095 + }, + { + "epoch": 3.6757711645604023, + "grad_norm": 0.2814890742301941, + "learning_rate": 5.407409294741134e-06, + "loss": 4.1586, + "step": 54100 + }, + { + "epoch": 3.676110884631064, + "grad_norm": 0.3699527680873871, + "learning_rate": 5.4069846446528064e-06, + "loss": 3.8, + "step": 54105 + }, + { + "epoch": 3.676450604701726, + "grad_norm": 0.2883819341659546, + "learning_rate": 5.40655999456448e-06, + "loss": 3.9721, + "step": 54110 + }, + { + "epoch": 3.6767903247723877, + "grad_norm": 0.26390913128852844, + "learning_rate": 5.406135344476152e-06, + "loss": 4.0332, + "step": 54115 + }, + { + "epoch": 3.6771300448430493, + "grad_norm": 0.28773584961891174, + "learning_rate": 5.405710694387825e-06, + "loss": 4.0042, + "step": 54120 + }, + { + "epoch": 3.6774697649137114, + "grad_norm": 0.2706543803215027, + "learning_rate": 5.4052860442994985e-06, + "loss": 4.0849, + "step": 54125 + }, + { + "epoch": 3.677809484984373, + "grad_norm": 0.24636700749397278, + "learning_rate": 5.4048613942111704e-06, + "loss": 3.7643, + "step": 54130 + }, + { + "epoch": 3.6781492050550346, + "grad_norm": 0.318774938583374, + "learning_rate": 5.404436744122843e-06, + "loss": 4.24, + "step": 54135 + }, + { + "epoch": 3.6784889251256967, + "grad_norm": 0.34555643796920776, + "learning_rate": 5.404012094034516e-06, + "loss": 4.0607, + "step": 54140 + }, + { + "epoch": 3.6788286451963583, + "grad_norm": 0.2379884570837021, + "learning_rate": 5.403587443946189e-06, + "loss": 4.2168, + "step": 54145 + }, + { + "epoch": 3.67916836526702, + "grad_norm": 0.2949323058128357, + "learning_rate": 5.403162793857862e-06, + "loss": 4.0416, + "step": 54150 + }, + { + "epoch": 3.6795080853376816, + "grad_norm": 0.29020604491233826, + "learning_rate": 5.4027381437695345e-06, + "loss": 3.8881, + "step": 54155 + }, + { + "epoch": 3.6798478054083437, + "grad_norm": 0.345445454120636, + "learning_rate": 5.402313493681207e-06, + "loss": 4.2757, + "step": 54160 + }, + { + "epoch": 3.6801875254790053, + "grad_norm": 0.32722824811935425, + "learning_rate": 5.401888843592879e-06, + "loss": 4.0644, + "step": 54165 + }, + { + "epoch": 3.680527245549667, + "grad_norm": 0.31034553050994873, + "learning_rate": 5.401464193504553e-06, + "loss": 3.9281, + "step": 54170 + }, + { + "epoch": 3.680866965620329, + "grad_norm": 0.33521461486816406, + "learning_rate": 5.401039543416226e-06, + "loss": 4.2262, + "step": 54175 + }, + { + "epoch": 3.6812066856909906, + "grad_norm": 0.255731463432312, + "learning_rate": 5.400614893327898e-06, + "loss": 3.848, + "step": 54180 + }, + { + "epoch": 3.6815464057616523, + "grad_norm": 0.41911330819129944, + "learning_rate": 5.400190243239571e-06, + "loss": 4.0584, + "step": 54185 + }, + { + "epoch": 3.6818861258323143, + "grad_norm": 0.2600201964378357, + "learning_rate": 5.399765593151244e-06, + "loss": 3.8227, + "step": 54190 + }, + { + "epoch": 3.682225845902976, + "grad_norm": 0.2201353907585144, + "learning_rate": 5.399340943062916e-06, + "loss": 3.9841, + "step": 54195 + }, + { + "epoch": 3.6825655659736376, + "grad_norm": 0.2447071373462677, + "learning_rate": 5.39891629297459e-06, + "loss": 3.9753, + "step": 54200 + }, + { + "epoch": 3.6829052860442992, + "grad_norm": 0.28142714500427246, + "learning_rate": 5.3984916428862625e-06, + "loss": 3.8741, + "step": 54205 + }, + { + "epoch": 3.6832450061149613, + "grad_norm": 0.303962379693985, + "learning_rate": 5.398066992797934e-06, + "loss": 4.0565, + "step": 54210 + }, + { + "epoch": 3.683584726185623, + "grad_norm": 0.2846957743167877, + "learning_rate": 5.397642342709608e-06, + "loss": 3.8937, + "step": 54215 + }, + { + "epoch": 3.6839244462562846, + "grad_norm": 0.23436078429222107, + "learning_rate": 5.397217692621281e-06, + "loss": 3.8208, + "step": 54220 + }, + { + "epoch": 3.6842641663269466, + "grad_norm": 0.40262237191200256, + "learning_rate": 5.396793042532953e-06, + "loss": 3.837, + "step": 54225 + }, + { + "epoch": 3.6846038863976083, + "grad_norm": 0.31761378049850464, + "learning_rate": 5.3963683924446265e-06, + "loss": 4.1102, + "step": 54230 + }, + { + "epoch": 3.68494360646827, + "grad_norm": 0.23495113849639893, + "learning_rate": 5.395943742356298e-06, + "loss": 4.0032, + "step": 54235 + }, + { + "epoch": 3.685283326538932, + "grad_norm": 0.24986127018928528, + "learning_rate": 5.395519092267971e-06, + "loss": 3.6323, + "step": 54240 + }, + { + "epoch": 3.6856230466095936, + "grad_norm": 0.4008989930152893, + "learning_rate": 5.395094442179645e-06, + "loss": 4.2375, + "step": 54245 + }, + { + "epoch": 3.6859627666802552, + "grad_norm": 0.2900419235229492, + "learning_rate": 5.394669792091317e-06, + "loss": 4.1002, + "step": 54250 + }, + { + "epoch": 3.6863024867509173, + "grad_norm": 0.40733274817466736, + "learning_rate": 5.39424514200299e-06, + "loss": 4.3138, + "step": 54255 + }, + { + "epoch": 3.686642206821579, + "grad_norm": 0.20797215402126312, + "learning_rate": 5.393820491914663e-06, + "loss": 3.8528, + "step": 54260 + }, + { + "epoch": 3.6869819268922406, + "grad_norm": 0.21877945959568024, + "learning_rate": 5.393395841826335e-06, + "loss": 4.0266, + "step": 54265 + }, + { + "epoch": 3.6873216469629027, + "grad_norm": 0.25580957531929016, + "learning_rate": 5.392971191738008e-06, + "loss": 4.0939, + "step": 54270 + }, + { + "epoch": 3.6876613670335643, + "grad_norm": 0.28597334027290344, + "learning_rate": 5.392546541649682e-06, + "loss": 4.1773, + "step": 54275 + }, + { + "epoch": 3.688001087104226, + "grad_norm": 0.21666894853115082, + "learning_rate": 5.392121891561354e-06, + "loss": 4.0864, + "step": 54280 + }, + { + "epoch": 3.688340807174888, + "grad_norm": 0.20852510631084442, + "learning_rate": 5.3916972414730264e-06, + "loss": 3.9293, + "step": 54285 + }, + { + "epoch": 3.6886805272455496, + "grad_norm": 0.269683837890625, + "learning_rate": 5.3912725913847e-06, + "loss": 3.8427, + "step": 54290 + }, + { + "epoch": 3.6890202473162113, + "grad_norm": 0.22253529727458954, + "learning_rate": 5.390847941296372e-06, + "loss": 3.9387, + "step": 54295 + }, + { + "epoch": 3.6893599673868733, + "grad_norm": 0.3298462927341461, + "learning_rate": 5.390423291208045e-06, + "loss": 4.0469, + "step": 54300 + }, + { + "epoch": 3.689699687457535, + "grad_norm": 0.3953602612018585, + "learning_rate": 5.389998641119718e-06, + "loss": 4.2577, + "step": 54305 + }, + { + "epoch": 3.6900394075281966, + "grad_norm": 0.28131958842277527, + "learning_rate": 5.3895739910313904e-06, + "loss": 3.7146, + "step": 54310 + }, + { + "epoch": 3.6903791275988587, + "grad_norm": 0.30085933208465576, + "learning_rate": 5.389149340943063e-06, + "loss": 4.0233, + "step": 54315 + }, + { + "epoch": 3.6907188476695203, + "grad_norm": 0.24693503975868225, + "learning_rate": 5.388724690854736e-06, + "loss": 4.0529, + "step": 54320 + }, + { + "epoch": 3.691058567740182, + "grad_norm": 0.2980952262878418, + "learning_rate": 5.388300040766409e-06, + "loss": 3.9916, + "step": 54325 + }, + { + "epoch": 3.691398287810844, + "grad_norm": 0.2764996886253357, + "learning_rate": 5.387875390678081e-06, + "loss": 3.9576, + "step": 54330 + }, + { + "epoch": 3.6917380078815056, + "grad_norm": 0.3067329525947571, + "learning_rate": 5.3874507405897544e-06, + "loss": 4.0729, + "step": 54335 + }, + { + "epoch": 3.6920777279521673, + "grad_norm": 0.23777426779270172, + "learning_rate": 5.387026090501427e-06, + "loss": 3.8643, + "step": 54340 + }, + { + "epoch": 3.6924174480228293, + "grad_norm": 0.24441254138946533, + "learning_rate": 5.386601440413099e-06, + "loss": 3.931, + "step": 54345 + }, + { + "epoch": 3.692757168093491, + "grad_norm": 0.2541712522506714, + "learning_rate": 5.386176790324773e-06, + "loss": 4.1034, + "step": 54350 + }, + { + "epoch": 3.6930968881641526, + "grad_norm": 0.2861768305301666, + "learning_rate": 5.385752140236446e-06, + "loss": 4.2195, + "step": 54355 + }, + { + "epoch": 3.6934366082348147, + "grad_norm": 0.2594451904296875, + "learning_rate": 5.385327490148118e-06, + "loss": 4.0612, + "step": 54360 + }, + { + "epoch": 3.6937763283054763, + "grad_norm": 0.21940679848194122, + "learning_rate": 5.384902840059791e-06, + "loss": 3.9434, + "step": 54365 + }, + { + "epoch": 3.694116048376138, + "grad_norm": 0.2373967319726944, + "learning_rate": 5.384478189971464e-06, + "loss": 3.8391, + "step": 54370 + }, + { + "epoch": 3.6944557684468, + "grad_norm": 0.5194597840309143, + "learning_rate": 5.384053539883138e-06, + "loss": 4.081, + "step": 54375 + }, + { + "epoch": 3.6947954885174616, + "grad_norm": 0.2627664804458618, + "learning_rate": 5.38362888979481e-06, + "loss": 3.8404, + "step": 54380 + }, + { + "epoch": 3.6951352085881233, + "grad_norm": 0.47428959608078003, + "learning_rate": 5.3832042397064824e-06, + "loss": 3.9917, + "step": 54385 + }, + { + "epoch": 3.6954749286587854, + "grad_norm": 0.38447830080986023, + "learning_rate": 5.382779589618155e-06, + "loss": 4.0198, + "step": 54390 + }, + { + "epoch": 3.695814648729447, + "grad_norm": 0.2979658544063568, + "learning_rate": 5.382354939529828e-06, + "loss": 3.8848, + "step": 54395 + }, + { + "epoch": 3.6961543688001086, + "grad_norm": 0.4201115667819977, + "learning_rate": 5.381930289441501e-06, + "loss": 4.0209, + "step": 54400 + }, + { + "epoch": 3.6964940888707707, + "grad_norm": 0.2466408908367157, + "learning_rate": 5.381505639353174e-06, + "loss": 4.0312, + "step": 54405 + }, + { + "epoch": 3.6968338089414323, + "grad_norm": 0.31356555223464966, + "learning_rate": 5.3810809892648464e-06, + "loss": 3.9486, + "step": 54410 + }, + { + "epoch": 3.697173529012094, + "grad_norm": 0.3517455458641052, + "learning_rate": 5.380656339176518e-06, + "loss": 4.1632, + "step": 54415 + }, + { + "epoch": 3.697513249082756, + "grad_norm": 0.31863126158714294, + "learning_rate": 5.380231689088192e-06, + "loss": 4.0746, + "step": 54420 + }, + { + "epoch": 3.6978529691534177, + "grad_norm": 0.24605996906757355, + "learning_rate": 5.379807038999865e-06, + "loss": 3.9001, + "step": 54425 + }, + { + "epoch": 3.6981926892240793, + "grad_norm": 0.36269232630729675, + "learning_rate": 5.379382388911537e-06, + "loss": 4.0919, + "step": 54430 + }, + { + "epoch": 3.6985324092947414, + "grad_norm": 0.24584852159023285, + "learning_rate": 5.3789577388232104e-06, + "loss": 4.0963, + "step": 54435 + }, + { + "epoch": 3.698872129365403, + "grad_norm": 0.3177040219306946, + "learning_rate": 5.378533088734883e-06, + "loss": 3.8542, + "step": 54440 + }, + { + "epoch": 3.6992118494360646, + "grad_norm": 0.28202828764915466, + "learning_rate": 5.378108438646555e-06, + "loss": 4.1148, + "step": 54445 + }, + { + "epoch": 3.6995515695067267, + "grad_norm": 0.3428427577018738, + "learning_rate": 5.377683788558229e-06, + "loss": 3.9681, + "step": 54450 + }, + { + "epoch": 3.6998912895773883, + "grad_norm": 0.368356853723526, + "learning_rate": 5.377259138469902e-06, + "loss": 4.0265, + "step": 54455 + }, + { + "epoch": 3.70023100964805, + "grad_norm": 0.2593863904476166, + "learning_rate": 5.376834488381574e-06, + "loss": 3.9359, + "step": 54460 + }, + { + "epoch": 3.700570729718712, + "grad_norm": 0.2980496883392334, + "learning_rate": 5.376409838293247e-06, + "loss": 3.9082, + "step": 54465 + }, + { + "epoch": 3.7009104497893737, + "grad_norm": 0.2709016799926758, + "learning_rate": 5.37598518820492e-06, + "loss": 4.0023, + "step": 54470 + }, + { + "epoch": 3.7012501698600353, + "grad_norm": 0.3370714783668518, + "learning_rate": 5.375560538116592e-06, + "loss": 4.0178, + "step": 54475 + }, + { + "epoch": 3.7015898899306974, + "grad_norm": 0.27439430356025696, + "learning_rate": 5.375135888028266e-06, + "loss": 4.0143, + "step": 54480 + }, + { + "epoch": 3.701929610001359, + "grad_norm": 0.24127300083637238, + "learning_rate": 5.374711237939938e-06, + "loss": 4.1304, + "step": 54485 + }, + { + "epoch": 3.7022693300720206, + "grad_norm": 0.6075676679611206, + "learning_rate": 5.37428658785161e-06, + "loss": 4.1019, + "step": 54490 + }, + { + "epoch": 3.7026090501426823, + "grad_norm": 0.2531966269016266, + "learning_rate": 5.373861937763284e-06, + "loss": 3.9064, + "step": 54495 + }, + { + "epoch": 3.7029487702133443, + "grad_norm": 0.30309250950813293, + "learning_rate": 5.373437287674956e-06, + "loss": 3.9878, + "step": 54500 + }, + { + "epoch": 3.703288490284006, + "grad_norm": 0.25109952688217163, + "learning_rate": 5.373012637586629e-06, + "loss": 3.8201, + "step": 54505 + }, + { + "epoch": 3.7036282103546676, + "grad_norm": 0.31164979934692383, + "learning_rate": 5.3725879874983025e-06, + "loss": 4.1358, + "step": 54510 + }, + { + "epoch": 3.7039679304253297, + "grad_norm": 0.27720212936401367, + "learning_rate": 5.372163337409974e-06, + "loss": 3.6606, + "step": 54515 + }, + { + "epoch": 3.7043076504959913, + "grad_norm": 0.37089890241622925, + "learning_rate": 5.371738687321647e-06, + "loss": 3.8117, + "step": 54520 + }, + { + "epoch": 3.704647370566653, + "grad_norm": 0.2561415135860443, + "learning_rate": 5.371314037233321e-06, + "loss": 4.2183, + "step": 54525 + }, + { + "epoch": 3.704987090637315, + "grad_norm": 0.30401596426963806, + "learning_rate": 5.370889387144993e-06, + "loss": 3.948, + "step": 54530 + }, + { + "epoch": 3.7053268107079766, + "grad_norm": 0.23520061373710632, + "learning_rate": 5.370464737056666e-06, + "loss": 4.0901, + "step": 54535 + }, + { + "epoch": 3.7056665307786383, + "grad_norm": 0.5308772921562195, + "learning_rate": 5.370040086968339e-06, + "loss": 4.0604, + "step": 54540 + }, + { + "epoch": 3.7060062508493, + "grad_norm": 0.20826156437397003, + "learning_rate": 5.369615436880011e-06, + "loss": 3.9478, + "step": 54545 + }, + { + "epoch": 3.706345970919962, + "grad_norm": 0.33437275886535645, + "learning_rate": 5.369190786791684e-06, + "loss": 3.9982, + "step": 54550 + }, + { + "epoch": 3.7066856909906236, + "grad_norm": 0.4482254385948181, + "learning_rate": 5.368766136703357e-06, + "loss": 3.6929, + "step": 54555 + }, + { + "epoch": 3.7070254110612852, + "grad_norm": 0.364625483751297, + "learning_rate": 5.36834148661503e-06, + "loss": 3.892, + "step": 54560 + }, + { + "epoch": 3.7073651311319473, + "grad_norm": 0.2718562185764313, + "learning_rate": 5.367916836526702e-06, + "loss": 4.1794, + "step": 54565 + }, + { + "epoch": 3.707704851202609, + "grad_norm": 0.271418035030365, + "learning_rate": 5.367492186438375e-06, + "loss": 3.8542, + "step": 54570 + }, + { + "epoch": 3.7080445712732706, + "grad_norm": 0.2727338969707489, + "learning_rate": 5.367067536350048e-06, + "loss": 3.9856, + "step": 54575 + }, + { + "epoch": 3.7083842913439327, + "grad_norm": 0.2782030403614044, + "learning_rate": 5.36664288626172e-06, + "loss": 3.8124, + "step": 54580 + }, + { + "epoch": 3.7087240114145943, + "grad_norm": 0.38414204120635986, + "learning_rate": 5.366218236173394e-06, + "loss": 3.8641, + "step": 54585 + }, + { + "epoch": 3.709063731485256, + "grad_norm": 0.42884552478790283, + "learning_rate": 5.3657935860850664e-06, + "loss": 4.3067, + "step": 54590 + }, + { + "epoch": 3.709403451555918, + "grad_norm": 0.22553595900535583, + "learning_rate": 5.365368935996738e-06, + "loss": 4.0786, + "step": 54595 + }, + { + "epoch": 3.7097431716265796, + "grad_norm": 0.2707372307777405, + "learning_rate": 5.364944285908412e-06, + "loss": 3.922, + "step": 54600 + }, + { + "epoch": 3.7100828916972413, + "grad_norm": 0.3028053641319275, + "learning_rate": 5.364519635820085e-06, + "loss": 3.9914, + "step": 54605 + }, + { + "epoch": 3.7104226117679033, + "grad_norm": 0.35486653447151184, + "learning_rate": 5.364094985731757e-06, + "loss": 3.973, + "step": 54610 + }, + { + "epoch": 3.710762331838565, + "grad_norm": 0.28603169322013855, + "learning_rate": 5.3636703356434304e-06, + "loss": 3.8203, + "step": 54615 + }, + { + "epoch": 3.7111020519092266, + "grad_norm": 0.35026201605796814, + "learning_rate": 5.363245685555103e-06, + "loss": 3.8126, + "step": 54620 + }, + { + "epoch": 3.7114417719798887, + "grad_norm": 0.2588866055011749, + "learning_rate": 5.362821035466775e-06, + "loss": 3.9676, + "step": 54625 + }, + { + "epoch": 3.7117814920505503, + "grad_norm": 0.3575854003429413, + "learning_rate": 5.362396385378449e-06, + "loss": 4.2974, + "step": 54630 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.24469158053398132, + "learning_rate": 5.361971735290122e-06, + "loss": 3.9599, + "step": 54635 + }, + { + "epoch": 3.712460932191874, + "grad_norm": 0.31504151225090027, + "learning_rate": 5.361547085201794e-06, + "loss": 3.9515, + "step": 54640 + }, + { + "epoch": 3.7128006522625356, + "grad_norm": 0.2638574242591858, + "learning_rate": 5.361122435113467e-06, + "loss": 3.9207, + "step": 54645 + }, + { + "epoch": 3.7131403723331973, + "grad_norm": 0.30074426531791687, + "learning_rate": 5.360697785025139e-06, + "loss": 3.9813, + "step": 54650 + }, + { + "epoch": 3.7134800924038593, + "grad_norm": 0.47241368889808655, + "learning_rate": 5.360273134936812e-06, + "loss": 3.9461, + "step": 54655 + }, + { + "epoch": 3.713819812474521, + "grad_norm": 0.25408920645713806, + "learning_rate": 5.359848484848486e-06, + "loss": 4.0468, + "step": 54660 + }, + { + "epoch": 3.7141595325451826, + "grad_norm": 0.2927599251270294, + "learning_rate": 5.359423834760158e-06, + "loss": 4.0346, + "step": 54665 + }, + { + "epoch": 3.7144992526158447, + "grad_norm": 0.4481479227542877, + "learning_rate": 5.35899918467183e-06, + "loss": 4.0636, + "step": 54670 + }, + { + "epoch": 3.7148389726865063, + "grad_norm": 0.2575829327106476, + "learning_rate": 5.358574534583504e-06, + "loss": 4.0518, + "step": 54675 + }, + { + "epoch": 3.715178692757168, + "grad_norm": 0.357424259185791, + "learning_rate": 5.358149884495176e-06, + "loss": 3.8394, + "step": 54680 + }, + { + "epoch": 3.71551841282783, + "grad_norm": 0.3871801197528839, + "learning_rate": 5.357725234406849e-06, + "loss": 4.2256, + "step": 54685 + }, + { + "epoch": 3.7158581328984917, + "grad_norm": 0.39754483103752136, + "learning_rate": 5.3573005843185224e-06, + "loss": 4.2034, + "step": 54690 + }, + { + "epoch": 3.7161978529691533, + "grad_norm": 0.341328501701355, + "learning_rate": 5.356875934230194e-06, + "loss": 3.9894, + "step": 54695 + }, + { + "epoch": 3.7165375730398154, + "grad_norm": 0.2639218866825104, + "learning_rate": 5.356451284141867e-06, + "loss": 4.0856, + "step": 54700 + }, + { + "epoch": 3.716877293110477, + "grad_norm": 0.4147554337978363, + "learning_rate": 5.356026634053541e-06, + "loss": 4.1068, + "step": 54705 + }, + { + "epoch": 3.7172170131811386, + "grad_norm": 0.27012917399406433, + "learning_rate": 5.355601983965213e-06, + "loss": 4.0835, + "step": 54710 + }, + { + "epoch": 3.7175567332518007, + "grad_norm": 0.2574452757835388, + "learning_rate": 5.3551773338768864e-06, + "loss": 4.1818, + "step": 54715 + }, + { + "epoch": 3.7178964533224623, + "grad_norm": 0.31569868326187134, + "learning_rate": 5.354752683788559e-06, + "loss": 3.9432, + "step": 54720 + }, + { + "epoch": 3.718236173393124, + "grad_norm": 0.35625189542770386, + "learning_rate": 5.354328033700231e-06, + "loss": 3.7307, + "step": 54725 + }, + { + "epoch": 3.718575893463786, + "grad_norm": 0.3473469913005829, + "learning_rate": 5.353903383611905e-06, + "loss": 4.1891, + "step": 54730 + }, + { + "epoch": 3.7189156135344477, + "grad_norm": 0.24260959029197693, + "learning_rate": 5.353478733523577e-06, + "loss": 3.9631, + "step": 54735 + }, + { + "epoch": 3.7192553336051093, + "grad_norm": 0.30720922350883484, + "learning_rate": 5.35305408343525e-06, + "loss": 4.1049, + "step": 54740 + }, + { + "epoch": 3.7195950536757714, + "grad_norm": 0.2622010409832001, + "learning_rate": 5.352629433346923e-06, + "loss": 4.0167, + "step": 54745 + }, + { + "epoch": 3.719934773746433, + "grad_norm": 0.30926966667175293, + "learning_rate": 5.352204783258595e-06, + "loss": 4.0306, + "step": 54750 + }, + { + "epoch": 3.7202744938170946, + "grad_norm": 0.21121245622634888, + "learning_rate": 5.351780133170268e-06, + "loss": 4.1008, + "step": 54755 + }, + { + "epoch": 3.7206142138877567, + "grad_norm": 0.2469659000635147, + "learning_rate": 5.351355483081942e-06, + "loss": 3.914, + "step": 54760 + }, + { + "epoch": 3.7209539339584183, + "grad_norm": 0.2400885820388794, + "learning_rate": 5.350930832993614e-06, + "loss": 4.0267, + "step": 54765 + }, + { + "epoch": 3.72129365402908, + "grad_norm": 0.2507205307483673, + "learning_rate": 5.350506182905286e-06, + "loss": 4.0477, + "step": 54770 + }, + { + "epoch": 3.721633374099742, + "grad_norm": 0.3190290927886963, + "learning_rate": 5.35008153281696e-06, + "loss": 4.0799, + "step": 54775 + }, + { + "epoch": 3.7219730941704037, + "grad_norm": 0.2815440893173218, + "learning_rate": 5.349656882728632e-06, + "loss": 4.1541, + "step": 54780 + }, + { + "epoch": 3.7223128142410653, + "grad_norm": 0.24276617169380188, + "learning_rate": 5.349232232640305e-06, + "loss": 3.9195, + "step": 54785 + }, + { + "epoch": 3.7226525343117274, + "grad_norm": 0.29927361011505127, + "learning_rate": 5.3488075825519785e-06, + "loss": 4.188, + "step": 54790 + }, + { + "epoch": 3.722992254382389, + "grad_norm": 0.4158865511417389, + "learning_rate": 5.34838293246365e-06, + "loss": 3.9536, + "step": 54795 + }, + { + "epoch": 3.7233319744530506, + "grad_norm": 0.24588750302791595, + "learning_rate": 5.347958282375323e-06, + "loss": 3.779, + "step": 54800 + }, + { + "epoch": 3.7236716945237127, + "grad_norm": 0.32284560799598694, + "learning_rate": 5.347533632286996e-06, + "loss": 4.0534, + "step": 54805 + }, + { + "epoch": 3.7240114145943743, + "grad_norm": 0.2959638833999634, + "learning_rate": 5.347108982198669e-06, + "loss": 3.8354, + "step": 54810 + }, + { + "epoch": 3.724351134665036, + "grad_norm": 0.24165265262126923, + "learning_rate": 5.346684332110342e-06, + "loss": 3.7476, + "step": 54815 + }, + { + "epoch": 3.724690854735698, + "grad_norm": 0.25277265906333923, + "learning_rate": 5.346259682022014e-06, + "loss": 4.2084, + "step": 54820 + }, + { + "epoch": 3.7250305748063597, + "grad_norm": 0.33547794818878174, + "learning_rate": 5.345835031933687e-06, + "loss": 3.7667, + "step": 54825 + }, + { + "epoch": 3.7253702948770213, + "grad_norm": 0.37732988595962524, + "learning_rate": 5.345410381845359e-06, + "loss": 3.8921, + "step": 54830 + }, + { + "epoch": 3.725710014947683, + "grad_norm": 0.33351123332977295, + "learning_rate": 5.344985731757033e-06, + "loss": 3.9606, + "step": 54835 + }, + { + "epoch": 3.726049735018345, + "grad_norm": 0.3170146942138672, + "learning_rate": 5.344561081668706e-06, + "loss": 3.8304, + "step": 54840 + }, + { + "epoch": 3.7263894550890067, + "grad_norm": 0.3411177098751068, + "learning_rate": 5.3441364315803776e-06, + "loss": 3.9076, + "step": 54845 + }, + { + "epoch": 3.7267291751596683, + "grad_norm": 0.2512016296386719, + "learning_rate": 5.343711781492051e-06, + "loss": 3.795, + "step": 54850 + }, + { + "epoch": 3.7270688952303304, + "grad_norm": 0.2610146999359131, + "learning_rate": 5.343287131403724e-06, + "loss": 3.9706, + "step": 54855 + }, + { + "epoch": 3.727408615300992, + "grad_norm": 0.23144379258155823, + "learning_rate": 5.342862481315396e-06, + "loss": 3.8423, + "step": 54860 + }, + { + "epoch": 3.7277483353716536, + "grad_norm": 0.23513107001781464, + "learning_rate": 5.34243783122707e-06, + "loss": 4.1742, + "step": 54865 + }, + { + "epoch": 3.7280880554423157, + "grad_norm": 0.3442929685115814, + "learning_rate": 5.342013181138742e-06, + "loss": 3.9317, + "step": 54870 + }, + { + "epoch": 3.7284277755129773, + "grad_norm": 0.38002580404281616, + "learning_rate": 5.341588531050414e-06, + "loss": 3.9006, + "step": 54875 + }, + { + "epoch": 3.728767495583639, + "grad_norm": 0.228803351521492, + "learning_rate": 5.341163880962088e-06, + "loss": 3.8865, + "step": 54880 + }, + { + "epoch": 3.7291072156543006, + "grad_norm": 0.3206670582294464, + "learning_rate": 5.340739230873761e-06, + "loss": 3.9508, + "step": 54885 + }, + { + "epoch": 3.7294469357249627, + "grad_norm": 0.33971306681632996, + "learning_rate": 5.340314580785433e-06, + "loss": 3.9853, + "step": 54890 + }, + { + "epoch": 3.7297866557956243, + "grad_norm": 0.32860904932022095, + "learning_rate": 5.3398899306971064e-06, + "loss": 4.0967, + "step": 54895 + }, + { + "epoch": 3.730126375866286, + "grad_norm": 0.3162437379360199, + "learning_rate": 5.339465280608778e-06, + "loss": 4.0894, + "step": 54900 + }, + { + "epoch": 3.730466095936948, + "grad_norm": 0.2971450090408325, + "learning_rate": 5.339040630520451e-06, + "loss": 3.937, + "step": 54905 + }, + { + "epoch": 3.7308058160076096, + "grad_norm": 0.2605918347835541, + "learning_rate": 5.338615980432125e-06, + "loss": 3.9472, + "step": 54910 + }, + { + "epoch": 3.7311455360782713, + "grad_norm": 0.37585872411727905, + "learning_rate": 5.338191330343797e-06, + "loss": 3.9847, + "step": 54915 + }, + { + "epoch": 3.7314852561489333, + "grad_norm": 0.47982195019721985, + "learning_rate": 5.33776668025547e-06, + "loss": 3.8166, + "step": 54920 + }, + { + "epoch": 3.731824976219595, + "grad_norm": 0.24336165189743042, + "learning_rate": 5.337342030167143e-06, + "loss": 4.028, + "step": 54925 + }, + { + "epoch": 3.7321646962902566, + "grad_norm": 0.38576120138168335, + "learning_rate": 5.336917380078815e-06, + "loss": 4.0424, + "step": 54930 + }, + { + "epoch": 3.7325044163609187, + "grad_norm": 0.267784982919693, + "learning_rate": 5.336492729990488e-06, + "loss": 4.0951, + "step": 54935 + }, + { + "epoch": 3.7328441364315803, + "grad_norm": 0.3064603805541992, + "learning_rate": 5.336068079902162e-06, + "loss": 4.05, + "step": 54940 + }, + { + "epoch": 3.733183856502242, + "grad_norm": 0.3265310227870941, + "learning_rate": 5.335643429813834e-06, + "loss": 4.0586, + "step": 54945 + }, + { + "epoch": 3.733523576572904, + "grad_norm": 0.30201131105422974, + "learning_rate": 5.335218779725506e-06, + "loss": 3.851, + "step": 54950 + }, + { + "epoch": 3.7338632966435656, + "grad_norm": 0.29058969020843506, + "learning_rate": 5.33479412963718e-06, + "loss": 4.1962, + "step": 54955 + }, + { + "epoch": 3.7342030167142273, + "grad_norm": 0.24476978182792664, + "learning_rate": 5.334369479548852e-06, + "loss": 3.9142, + "step": 54960 + }, + { + "epoch": 3.7345427367848893, + "grad_norm": 0.46926724910736084, + "learning_rate": 5.333944829460525e-06, + "loss": 4.0655, + "step": 54965 + }, + { + "epoch": 3.734882456855551, + "grad_norm": 0.2540527880191803, + "learning_rate": 5.3335201793721984e-06, + "loss": 3.8454, + "step": 54970 + }, + { + "epoch": 3.7352221769262126, + "grad_norm": 0.3791857063770294, + "learning_rate": 5.33309552928387e-06, + "loss": 4.2629, + "step": 54975 + }, + { + "epoch": 3.7355618969968747, + "grad_norm": 0.2950800955295563, + "learning_rate": 5.332670879195543e-06, + "loss": 4.0232, + "step": 54980 + }, + { + "epoch": 3.7359016170675363, + "grad_norm": 0.24610936641693115, + "learning_rate": 5.332246229107216e-06, + "loss": 4.0239, + "step": 54985 + }, + { + "epoch": 3.736241337138198, + "grad_norm": 0.21964429318904877, + "learning_rate": 5.331821579018889e-06, + "loss": 3.8007, + "step": 54990 + }, + { + "epoch": 3.73658105720886, + "grad_norm": 0.30115512013435364, + "learning_rate": 5.331396928930561e-06, + "loss": 4.133, + "step": 54995 + }, + { + "epoch": 3.7369207772795217, + "grad_norm": 0.28873515129089355, + "learning_rate": 5.330972278842234e-06, + "loss": 4.1171, + "step": 55000 + }, + { + "epoch": 3.7372604973501833, + "grad_norm": 0.21969957649707794, + "learning_rate": 5.330547628753907e-06, + "loss": 3.8457, + "step": 55005 + }, + { + "epoch": 3.7376002174208454, + "grad_norm": 0.29196298122406006, + "learning_rate": 5.330122978665579e-06, + "loss": 4.1604, + "step": 55010 + }, + { + "epoch": 3.737939937491507, + "grad_norm": 0.2906082570552826, + "learning_rate": 5.329698328577253e-06, + "loss": 4.0619, + "step": 55015 + }, + { + "epoch": 3.7382796575621686, + "grad_norm": 0.3479754328727722, + "learning_rate": 5.329273678488926e-06, + "loss": 4.0557, + "step": 55020 + }, + { + "epoch": 3.7386193776328307, + "grad_norm": 0.45826876163482666, + "learning_rate": 5.3288490284005976e-06, + "loss": 3.9882, + "step": 55025 + }, + { + "epoch": 3.7389590977034923, + "grad_norm": 0.532410204410553, + "learning_rate": 5.328424378312271e-06, + "loss": 3.9518, + "step": 55030 + }, + { + "epoch": 3.739298817774154, + "grad_norm": 0.6067249178886414, + "learning_rate": 5.327999728223944e-06, + "loss": 3.9423, + "step": 55035 + }, + { + "epoch": 3.739638537844816, + "grad_norm": 0.3950307369232178, + "learning_rate": 5.327575078135616e-06, + "loss": 4.156, + "step": 55040 + }, + { + "epoch": 3.7399782579154777, + "grad_norm": 0.2754400074481964, + "learning_rate": 5.32715042804729e-06, + "loss": 3.8511, + "step": 55045 + }, + { + "epoch": 3.7403179779861393, + "grad_norm": 0.2962836027145386, + "learning_rate": 5.326725777958962e-06, + "loss": 3.8631, + "step": 55050 + }, + { + "epoch": 3.7406576980568014, + "grad_norm": 0.43764162063598633, + "learning_rate": 5.326301127870635e-06, + "loss": 4.0199, + "step": 55055 + }, + { + "epoch": 3.740997418127463, + "grad_norm": 0.34939128160476685, + "learning_rate": 5.325876477782308e-06, + "loss": 4.1097, + "step": 55060 + }, + { + "epoch": 3.7413371381981246, + "grad_norm": 0.2900404632091522, + "learning_rate": 5.325451827693981e-06, + "loss": 4.0333, + "step": 55065 + }, + { + "epoch": 3.7416768582687867, + "grad_norm": 0.23498617112636566, + "learning_rate": 5.325027177605654e-06, + "loss": 4.1837, + "step": 55070 + }, + { + "epoch": 3.7420165783394483, + "grad_norm": 0.3536282479763031, + "learning_rate": 5.324602527517326e-06, + "loss": 4.1938, + "step": 55075 + }, + { + "epoch": 3.74235629841011, + "grad_norm": 0.26216572523117065, + "learning_rate": 5.324177877428998e-06, + "loss": 3.8063, + "step": 55080 + }, + { + "epoch": 3.742696018480772, + "grad_norm": 0.2422715425491333, + "learning_rate": 5.323753227340672e-06, + "loss": 3.7974, + "step": 55085 + }, + { + "epoch": 3.7430357385514337, + "grad_norm": 0.6665075421333313, + "learning_rate": 5.323328577252345e-06, + "loss": 4.164, + "step": 55090 + }, + { + "epoch": 3.7433754586220953, + "grad_norm": 0.42642372846603394, + "learning_rate": 5.322903927164017e-06, + "loss": 3.8408, + "step": 55095 + }, + { + "epoch": 3.7437151786927574, + "grad_norm": 0.31481289863586426, + "learning_rate": 5.32247927707569e-06, + "loss": 3.8392, + "step": 55100 + }, + { + "epoch": 3.744054898763419, + "grad_norm": 0.3031526803970337, + "learning_rate": 5.322054626987363e-06, + "loss": 3.9244, + "step": 55105 + }, + { + "epoch": 3.7443946188340806, + "grad_norm": 0.5151661038398743, + "learning_rate": 5.321629976899035e-06, + "loss": 3.8902, + "step": 55110 + }, + { + "epoch": 3.7447343389047427, + "grad_norm": 0.29527747631073, + "learning_rate": 5.321205326810709e-06, + "loss": 4.1553, + "step": 55115 + }, + { + "epoch": 3.7450740589754044, + "grad_norm": 0.2580658197402954, + "learning_rate": 5.320780676722382e-06, + "loss": 3.6375, + "step": 55120 + }, + { + "epoch": 3.745413779046066, + "grad_norm": 0.34717944264411926, + "learning_rate": 5.3203560266340536e-06, + "loss": 3.8799, + "step": 55125 + }, + { + "epoch": 3.745753499116728, + "grad_norm": 0.3492497503757477, + "learning_rate": 5.319931376545727e-06, + "loss": 4.1955, + "step": 55130 + }, + { + "epoch": 3.7460932191873897, + "grad_norm": 0.28468215465545654, + "learning_rate": 5.3195067264574e-06, + "loss": 3.9922, + "step": 55135 + }, + { + "epoch": 3.7464329392580513, + "grad_norm": 0.21767845749855042, + "learning_rate": 5.319082076369072e-06, + "loss": 3.9494, + "step": 55140 + }, + { + "epoch": 3.7467726593287134, + "grad_norm": 0.296854168176651, + "learning_rate": 5.318657426280746e-06, + "loss": 3.7201, + "step": 55145 + }, + { + "epoch": 3.747112379399375, + "grad_norm": 0.24891114234924316, + "learning_rate": 5.3182327761924176e-06, + "loss": 3.966, + "step": 55150 + }, + { + "epoch": 3.7474520994700367, + "grad_norm": 0.3681429326534271, + "learning_rate": 5.31780812610409e-06, + "loss": 3.9734, + "step": 55155 + }, + { + "epoch": 3.7477918195406987, + "grad_norm": 0.2390143871307373, + "learning_rate": 5.317383476015764e-06, + "loss": 3.8757, + "step": 55160 + }, + { + "epoch": 3.7481315396113604, + "grad_norm": 0.34439095854759216, + "learning_rate": 5.316958825927436e-06, + "loss": 4.2051, + "step": 55165 + }, + { + "epoch": 3.748471259682022, + "grad_norm": 0.24019695818424225, + "learning_rate": 5.316534175839109e-06, + "loss": 3.8775, + "step": 55170 + }, + { + "epoch": 3.7488109797526836, + "grad_norm": 0.3400024175643921, + "learning_rate": 5.316109525750782e-06, + "loss": 3.9945, + "step": 55175 + }, + { + "epoch": 3.7491506998233457, + "grad_norm": 0.3447435796260834, + "learning_rate": 5.315684875662454e-06, + "loss": 4.0574, + "step": 55180 + }, + { + "epoch": 3.7494904198940073, + "grad_norm": 0.278821736574173, + "learning_rate": 5.315260225574127e-06, + "loss": 3.9794, + "step": 55185 + }, + { + "epoch": 3.749830139964669, + "grad_norm": 0.2276442050933838, + "learning_rate": 5.314835575485801e-06, + "loss": 3.9418, + "step": 55190 + }, + { + "epoch": 3.750169860035331, + "grad_norm": 0.29726341366767883, + "learning_rate": 5.314410925397473e-06, + "loss": 3.7043, + "step": 55195 + }, + { + "epoch": 3.7505095801059927, + "grad_norm": 0.28367888927459717, + "learning_rate": 5.313986275309146e-06, + "loss": 3.8366, + "step": 55200 + }, + { + "epoch": 3.7508493001766543, + "grad_norm": 0.2541836202144623, + "learning_rate": 5.313561625220819e-06, + "loss": 4.044, + "step": 55205 + }, + { + "epoch": 3.7511890202473164, + "grad_norm": 0.48484787344932556, + "learning_rate": 5.313136975132491e-06, + "loss": 3.9578, + "step": 55210 + }, + { + "epoch": 3.751528740317978, + "grad_norm": 0.24912433326244354, + "learning_rate": 5.312712325044164e-06, + "loss": 4.0314, + "step": 55215 + }, + { + "epoch": 3.7518684603886396, + "grad_norm": 0.519493043422699, + "learning_rate": 5.312287674955837e-06, + "loss": 3.7041, + "step": 55220 + }, + { + "epoch": 3.7522081804593013, + "grad_norm": 0.2655809223651886, + "learning_rate": 5.31186302486751e-06, + "loss": 3.9373, + "step": 55225 + }, + { + "epoch": 3.7525479005299633, + "grad_norm": 0.3389650285243988, + "learning_rate": 5.311438374779182e-06, + "loss": 4.2658, + "step": 55230 + }, + { + "epoch": 3.752887620600625, + "grad_norm": 0.258669376373291, + "learning_rate": 5.311013724690855e-06, + "loss": 3.964, + "step": 55235 + }, + { + "epoch": 3.7532273406712866, + "grad_norm": 0.25816795229911804, + "learning_rate": 5.310589074602528e-06, + "loss": 3.8151, + "step": 55240 + }, + { + "epoch": 3.7535670607419487, + "grad_norm": 0.43314310908317566, + "learning_rate": 5.3101644245142e-06, + "loss": 4.3604, + "step": 55245 + }, + { + "epoch": 3.7539067808126103, + "grad_norm": 0.35230395197868347, + "learning_rate": 5.309739774425874e-06, + "loss": 4.0305, + "step": 55250 + }, + { + "epoch": 3.754246500883272, + "grad_norm": 0.21540597081184387, + "learning_rate": 5.309315124337546e-06, + "loss": 3.8289, + "step": 55255 + }, + { + "epoch": 3.754586220953934, + "grad_norm": 0.28092288970947266, + "learning_rate": 5.308890474249218e-06, + "loss": 4.0515, + "step": 55260 + }, + { + "epoch": 3.7549259410245956, + "grad_norm": 0.3113855719566345, + "learning_rate": 5.308465824160892e-06, + "loss": 3.9896, + "step": 55265 + }, + { + "epoch": 3.7552656610952573, + "grad_norm": 0.30370500683784485, + "learning_rate": 5.308041174072565e-06, + "loss": 4.0962, + "step": 55270 + }, + { + "epoch": 3.7556053811659194, + "grad_norm": 0.24579289555549622, + "learning_rate": 5.307616523984237e-06, + "loss": 3.9901, + "step": 55275 + }, + { + "epoch": 3.755945101236581, + "grad_norm": 0.29075413942337036, + "learning_rate": 5.30719187389591e-06, + "loss": 4.0233, + "step": 55280 + }, + { + "epoch": 3.7562848213072426, + "grad_norm": 0.3648258149623871, + "learning_rate": 5.306767223807583e-06, + "loss": 4.0602, + "step": 55285 + }, + { + "epoch": 3.7566245413779047, + "grad_norm": 0.46704474091529846, + "learning_rate": 5.306342573719255e-06, + "loss": 3.9847, + "step": 55290 + }, + { + "epoch": 3.7569642614485663, + "grad_norm": 0.27580568194389343, + "learning_rate": 5.305917923630929e-06, + "loss": 4.0948, + "step": 55295 + }, + { + "epoch": 3.757303981519228, + "grad_norm": 0.2826860249042511, + "learning_rate": 5.305493273542602e-06, + "loss": 4.0151, + "step": 55300 + }, + { + "epoch": 3.75764370158989, + "grad_norm": 0.2745615541934967, + "learning_rate": 5.3050686234542735e-06, + "loss": 4.0963, + "step": 55305 + }, + { + "epoch": 3.7579834216605517, + "grad_norm": 1.3686782121658325, + "learning_rate": 5.304643973365947e-06, + "loss": 4.2339, + "step": 55310 + }, + { + "epoch": 3.7583231417312133, + "grad_norm": 0.3559277057647705, + "learning_rate": 5.304219323277619e-06, + "loss": 4.2379, + "step": 55315 + }, + { + "epoch": 3.7586628618018754, + "grad_norm": 0.34458041191101074, + "learning_rate": 5.303794673189292e-06, + "loss": 4.1155, + "step": 55320 + }, + { + "epoch": 3.759002581872537, + "grad_norm": 0.21892096102237701, + "learning_rate": 5.303370023100966e-06, + "loss": 3.9907, + "step": 55325 + }, + { + "epoch": 3.7593423019431986, + "grad_norm": 0.2972621023654938, + "learning_rate": 5.3029453730126376e-06, + "loss": 3.9603, + "step": 55330 + }, + { + "epoch": 3.7596820220138607, + "grad_norm": 0.28005069494247437, + "learning_rate": 5.30252072292431e-06, + "loss": 3.9671, + "step": 55335 + }, + { + "epoch": 3.7600217420845223, + "grad_norm": 0.19487038254737854, + "learning_rate": 5.302096072835984e-06, + "loss": 3.9981, + "step": 55340 + }, + { + "epoch": 3.760361462155184, + "grad_norm": 0.2892844080924988, + "learning_rate": 5.301671422747656e-06, + "loss": 4.2515, + "step": 55345 + }, + { + "epoch": 3.760701182225846, + "grad_norm": 0.2767874300479889, + "learning_rate": 5.301246772659329e-06, + "loss": 3.9605, + "step": 55350 + }, + { + "epoch": 3.7610409022965077, + "grad_norm": 0.4868246018886566, + "learning_rate": 5.300822122571002e-06, + "loss": 4.2475, + "step": 55355 + }, + { + "epoch": 3.7613806223671693, + "grad_norm": 0.22544272243976593, + "learning_rate": 5.300397472482674e-06, + "loss": 4.0447, + "step": 55360 + }, + { + "epoch": 3.7617203424378314, + "grad_norm": 0.3908582627773285, + "learning_rate": 5.299972822394347e-06, + "loss": 4.0805, + "step": 55365 + }, + { + "epoch": 3.762060062508493, + "grad_norm": 0.3230426609516144, + "learning_rate": 5.299548172306021e-06, + "loss": 3.8604, + "step": 55370 + }, + { + "epoch": 3.7623997825791546, + "grad_norm": 0.29449188709259033, + "learning_rate": 5.299123522217693e-06, + "loss": 3.9902, + "step": 55375 + }, + { + "epoch": 3.7627395026498167, + "grad_norm": 0.31928661465644836, + "learning_rate": 5.2986988721293656e-06, + "loss": 4.1568, + "step": 55380 + }, + { + "epoch": 3.7630792227204783, + "grad_norm": 0.22877763211727142, + "learning_rate": 5.298274222041039e-06, + "loss": 3.9079, + "step": 55385 + }, + { + "epoch": 3.76341894279114, + "grad_norm": 0.2329532951116562, + "learning_rate": 5.297849571952711e-06, + "loss": 4.124, + "step": 55390 + }, + { + "epoch": 3.763758662861802, + "grad_norm": 0.30450060963630676, + "learning_rate": 5.297424921864385e-06, + "loss": 4.5179, + "step": 55395 + }, + { + "epoch": 3.7640983829324637, + "grad_norm": 0.27281662821769714, + "learning_rate": 5.297000271776057e-06, + "loss": 4.1389, + "step": 55400 + }, + { + "epoch": 3.7644381030031253, + "grad_norm": 0.4274858832359314, + "learning_rate": 5.2965756216877296e-06, + "loss": 3.9627, + "step": 55405 + }, + { + "epoch": 3.7647778230737874, + "grad_norm": 0.617708683013916, + "learning_rate": 5.296150971599403e-06, + "loss": 4.0509, + "step": 55410 + }, + { + "epoch": 3.765117543144449, + "grad_norm": 0.22819118201732635, + "learning_rate": 5.295726321511075e-06, + "loss": 3.8391, + "step": 55415 + }, + { + "epoch": 3.7654572632151107, + "grad_norm": 0.4376634359359741, + "learning_rate": 5.295301671422748e-06, + "loss": 4.1021, + "step": 55420 + }, + { + "epoch": 3.7657969832857727, + "grad_norm": 0.291790634393692, + "learning_rate": 5.294877021334422e-06, + "loss": 4.0145, + "step": 55425 + }, + { + "epoch": 3.7661367033564344, + "grad_norm": 0.2772656977176666, + "learning_rate": 5.2944523712460936e-06, + "loss": 3.9839, + "step": 55430 + }, + { + "epoch": 3.766476423427096, + "grad_norm": 0.3311315178871155, + "learning_rate": 5.294027721157766e-06, + "loss": 4.259, + "step": 55435 + }, + { + "epoch": 3.766816143497758, + "grad_norm": 0.337422639131546, + "learning_rate": 5.29360307106944e-06, + "loss": 3.9895, + "step": 55440 + }, + { + "epoch": 3.7671558635684197, + "grad_norm": 0.40638038516044617, + "learning_rate": 5.293178420981112e-06, + "loss": 4.1444, + "step": 55445 + }, + { + "epoch": 3.7674955836390813, + "grad_norm": 0.33108195662498474, + "learning_rate": 5.292753770892785e-06, + "loss": 4.0644, + "step": 55450 + }, + { + "epoch": 3.7678353037097434, + "grad_norm": 0.25892072916030884, + "learning_rate": 5.292329120804458e-06, + "loss": 4.0811, + "step": 55455 + }, + { + "epoch": 3.768175023780405, + "grad_norm": 0.37933650612831116, + "learning_rate": 5.29190447071613e-06, + "loss": 3.9293, + "step": 55460 + }, + { + "epoch": 3.7685147438510667, + "grad_norm": 0.2721557021141052, + "learning_rate": 5.291479820627803e-06, + "loss": 3.8541, + "step": 55465 + }, + { + "epoch": 3.7688544639217287, + "grad_norm": 0.33638009428977966, + "learning_rate": 5.291055170539476e-06, + "loss": 3.8264, + "step": 55470 + }, + { + "epoch": 3.7691941839923904, + "grad_norm": 0.40286529064178467, + "learning_rate": 5.290630520451149e-06, + "loss": 3.8305, + "step": 55475 + }, + { + "epoch": 3.769533904063052, + "grad_norm": 0.27536800503730774, + "learning_rate": 5.2902058703628216e-06, + "loss": 3.7961, + "step": 55480 + }, + { + "epoch": 3.769873624133714, + "grad_norm": 0.33081212639808655, + "learning_rate": 5.289781220274494e-06, + "loss": 3.9648, + "step": 55485 + }, + { + "epoch": 3.7702133442043757, + "grad_norm": 0.2117566466331482, + "learning_rate": 5.289356570186167e-06, + "loss": 3.9915, + "step": 55490 + }, + { + "epoch": 3.7705530642750373, + "grad_norm": 0.23637539148330688, + "learning_rate": 5.288931920097839e-06, + "loss": 4.1745, + "step": 55495 + }, + { + "epoch": 3.7708927843456994, + "grad_norm": 0.27855631709098816, + "learning_rate": 5.288507270009513e-06, + "loss": 4.0251, + "step": 55500 + }, + { + "epoch": 3.771232504416361, + "grad_norm": 0.6639150977134705, + "learning_rate": 5.288082619921186e-06, + "loss": 4.2908, + "step": 55505 + }, + { + "epoch": 3.7715722244870227, + "grad_norm": 0.31003543734550476, + "learning_rate": 5.2876579698328575e-06, + "loss": 3.9969, + "step": 55510 + }, + { + "epoch": 3.7719119445576843, + "grad_norm": 0.4712147116661072, + "learning_rate": 5.287233319744531e-06, + "loss": 4.1063, + "step": 55515 + }, + { + "epoch": 3.7722516646283464, + "grad_norm": 0.3521216809749603, + "learning_rate": 5.286808669656204e-06, + "loss": 4.1491, + "step": 55520 + }, + { + "epoch": 3.772591384699008, + "grad_norm": 0.43952661752700806, + "learning_rate": 5.286384019567876e-06, + "loss": 4.1381, + "step": 55525 + }, + { + "epoch": 3.7729311047696696, + "grad_norm": 0.29191115498542786, + "learning_rate": 5.28595936947955e-06, + "loss": 3.7465, + "step": 55530 + }, + { + "epoch": 3.7732708248403317, + "grad_norm": 0.23257538676261902, + "learning_rate": 5.285534719391222e-06, + "loss": 3.8469, + "step": 55535 + }, + { + "epoch": 3.7736105449109933, + "grad_norm": 0.25895652174949646, + "learning_rate": 5.285110069302894e-06, + "loss": 4.0687, + "step": 55540 + }, + { + "epoch": 3.773950264981655, + "grad_norm": 0.2844499945640564, + "learning_rate": 5.284685419214568e-06, + "loss": 4.1324, + "step": 55545 + }, + { + "epoch": 3.774289985052317, + "grad_norm": 0.501493513584137, + "learning_rate": 5.284260769126241e-06, + "loss": 4.1301, + "step": 55550 + }, + { + "epoch": 3.7746297051229787, + "grad_norm": 0.2641747295856476, + "learning_rate": 5.283836119037913e-06, + "loss": 3.9851, + "step": 55555 + }, + { + "epoch": 3.7749694251936403, + "grad_norm": 0.3316574990749359, + "learning_rate": 5.283411468949586e-06, + "loss": 4.0457, + "step": 55560 + }, + { + "epoch": 3.775309145264302, + "grad_norm": 0.33717453479766846, + "learning_rate": 5.282986818861258e-06, + "loss": 3.6333, + "step": 55565 + }, + { + "epoch": 3.775648865334964, + "grad_norm": 0.3793080449104309, + "learning_rate": 5.282562168772931e-06, + "loss": 3.8196, + "step": 55570 + }, + { + "epoch": 3.7759885854056257, + "grad_norm": 0.21296416223049164, + "learning_rate": 5.282137518684605e-06, + "loss": 4.1548, + "step": 55575 + }, + { + "epoch": 3.7763283054762873, + "grad_norm": 0.24837863445281982, + "learning_rate": 5.281712868596277e-06, + "loss": 4.0664, + "step": 55580 + }, + { + "epoch": 3.7766680255469494, + "grad_norm": 0.35066646337509155, + "learning_rate": 5.2812882185079495e-06, + "loss": 4.0478, + "step": 55585 + }, + { + "epoch": 3.777007745617611, + "grad_norm": 0.37185731530189514, + "learning_rate": 5.280863568419623e-06, + "loss": 4.1494, + "step": 55590 + }, + { + "epoch": 3.7773474656882726, + "grad_norm": 0.42291778326034546, + "learning_rate": 5.280438918331295e-06, + "loss": 4.1511, + "step": 55595 + }, + { + "epoch": 3.7776871857589347, + "grad_norm": 0.3986557126045227, + "learning_rate": 5.280014268242968e-06, + "loss": 4.0416, + "step": 55600 + }, + { + "epoch": 3.7780269058295963, + "grad_norm": 0.30581802129745483, + "learning_rate": 5.279589618154642e-06, + "loss": 4.0323, + "step": 55605 + }, + { + "epoch": 3.778366625900258, + "grad_norm": 0.3084770441055298, + "learning_rate": 5.2791649680663135e-06, + "loss": 4.0011, + "step": 55610 + }, + { + "epoch": 3.77870634597092, + "grad_norm": 0.20227067172527313, + "learning_rate": 5.278740317977986e-06, + "loss": 4.2932, + "step": 55615 + }, + { + "epoch": 3.7790460660415817, + "grad_norm": 0.24946431815624237, + "learning_rate": 5.27831566788966e-06, + "loss": 4.1255, + "step": 55620 + }, + { + "epoch": 3.7793857861122433, + "grad_norm": 0.2058769166469574, + "learning_rate": 5.277891017801332e-06, + "loss": 4.1542, + "step": 55625 + }, + { + "epoch": 3.7797255061829054, + "grad_norm": 0.2963450253009796, + "learning_rate": 5.277466367713005e-06, + "loss": 4.2718, + "step": 55630 + }, + { + "epoch": 3.780065226253567, + "grad_norm": 0.24725262820720673, + "learning_rate": 5.277041717624678e-06, + "loss": 4.044, + "step": 55635 + }, + { + "epoch": 3.7804049463242286, + "grad_norm": 0.2639877200126648, + "learning_rate": 5.27661706753635e-06, + "loss": 4.2322, + "step": 55640 + }, + { + "epoch": 3.7807446663948907, + "grad_norm": 0.2735346257686615, + "learning_rate": 5.276192417448023e-06, + "loss": 3.9552, + "step": 55645 + }, + { + "epoch": 3.7810843864655523, + "grad_norm": 0.42780929803848267, + "learning_rate": 5.275767767359696e-06, + "loss": 3.9048, + "step": 55650 + }, + { + "epoch": 3.781424106536214, + "grad_norm": 0.26837581396102905, + "learning_rate": 5.275343117271369e-06, + "loss": 3.9053, + "step": 55655 + }, + { + "epoch": 3.781763826606876, + "grad_norm": 0.3275870382785797, + "learning_rate": 5.274918467183041e-06, + "loss": 4.1812, + "step": 55660 + }, + { + "epoch": 3.7821035466775377, + "grad_norm": 0.5633528232574463, + "learning_rate": 5.274493817094714e-06, + "loss": 3.864, + "step": 55665 + }, + { + "epoch": 3.7824432667481993, + "grad_norm": 0.2246256023645401, + "learning_rate": 5.274069167006387e-06, + "loss": 4.0366, + "step": 55670 + }, + { + "epoch": 3.7827829868188614, + "grad_norm": 0.3386375904083252, + "learning_rate": 5.273644516918059e-06, + "loss": 3.9391, + "step": 55675 + }, + { + "epoch": 3.783122706889523, + "grad_norm": 0.23555007576942444, + "learning_rate": 5.273219866829733e-06, + "loss": 3.9012, + "step": 55680 + }, + { + "epoch": 3.7834624269601846, + "grad_norm": 0.3705300986766815, + "learning_rate": 5.2727952167414056e-06, + "loss": 4.1097, + "step": 55685 + }, + { + "epoch": 3.7838021470308467, + "grad_norm": 0.33788540959358215, + "learning_rate": 5.2723705666530775e-06, + "loss": 3.8272, + "step": 55690 + }, + { + "epoch": 3.7841418671015083, + "grad_norm": 0.3729512691497803, + "learning_rate": 5.271945916564751e-06, + "loss": 4.2109, + "step": 55695 + }, + { + "epoch": 3.78448158717217, + "grad_norm": 0.2678385078907013, + "learning_rate": 5.271521266476424e-06, + "loss": 3.8182, + "step": 55700 + }, + { + "epoch": 3.784821307242832, + "grad_norm": 0.35957738757133484, + "learning_rate": 5.271096616388096e-06, + "loss": 4.0354, + "step": 55705 + }, + { + "epoch": 3.7851610273134937, + "grad_norm": 0.22233417630195618, + "learning_rate": 5.2706719662997696e-06, + "loss": 4.1302, + "step": 55710 + }, + { + "epoch": 3.7855007473841553, + "grad_norm": 0.31855618953704834, + "learning_rate": 5.270247316211442e-06, + "loss": 3.9874, + "step": 55715 + }, + { + "epoch": 3.7858404674548174, + "grad_norm": 0.25815537571907043, + "learning_rate": 5.269822666123114e-06, + "loss": 4.1996, + "step": 55720 + }, + { + "epoch": 3.786180187525479, + "grad_norm": 0.28675583004951477, + "learning_rate": 5.269398016034788e-06, + "loss": 4.0037, + "step": 55725 + }, + { + "epoch": 3.7865199075961407, + "grad_norm": 0.32884183526039124, + "learning_rate": 5.268973365946461e-06, + "loss": 4.1338, + "step": 55730 + }, + { + "epoch": 3.7868596276668027, + "grad_norm": 0.34469500184059143, + "learning_rate": 5.2685487158581336e-06, + "loss": 3.8197, + "step": 55735 + }, + { + "epoch": 3.7871993477374644, + "grad_norm": 0.32807543873786926, + "learning_rate": 5.268124065769806e-06, + "loss": 4.1378, + "step": 55740 + }, + { + "epoch": 3.787539067808126, + "grad_norm": 0.2524694502353668, + "learning_rate": 5.267699415681478e-06, + "loss": 3.9771, + "step": 55745 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.2663414180278778, + "learning_rate": 5.267274765593152e-06, + "loss": 3.9936, + "step": 55750 + }, + { + "epoch": 3.7882185079494497, + "grad_norm": 0.26024430990219116, + "learning_rate": 5.266850115504825e-06, + "loss": 4.151, + "step": 55755 + }, + { + "epoch": 3.7885582280201113, + "grad_norm": 0.24578288197517395, + "learning_rate": 5.266425465416497e-06, + "loss": 3.9152, + "step": 55760 + }, + { + "epoch": 3.7888979480907734, + "grad_norm": 0.20642177760601044, + "learning_rate": 5.26600081532817e-06, + "loss": 3.9136, + "step": 55765 + }, + { + "epoch": 3.789237668161435, + "grad_norm": 0.22793860733509064, + "learning_rate": 5.265576165239843e-06, + "loss": 3.9649, + "step": 55770 + }, + { + "epoch": 3.7895773882320967, + "grad_norm": 0.2632271945476532, + "learning_rate": 5.265151515151515e-06, + "loss": 4.0124, + "step": 55775 + }, + { + "epoch": 3.7899171083027587, + "grad_norm": 0.29259565472602844, + "learning_rate": 5.264726865063189e-06, + "loss": 3.7711, + "step": 55780 + }, + { + "epoch": 3.7902568283734204, + "grad_norm": 0.26601192355155945, + "learning_rate": 5.2643022149748616e-06, + "loss": 4.1753, + "step": 55785 + }, + { + "epoch": 3.790596548444082, + "grad_norm": 0.2951129376888275, + "learning_rate": 5.2638775648865335e-06, + "loss": 4.0363, + "step": 55790 + }, + { + "epoch": 3.790936268514744, + "grad_norm": 0.2709747552871704, + "learning_rate": 5.263452914798207e-06, + "loss": 4.128, + "step": 55795 + }, + { + "epoch": 3.7912759885854057, + "grad_norm": 0.2634097635746002, + "learning_rate": 5.26302826470988e-06, + "loss": 3.9637, + "step": 55800 + }, + { + "epoch": 3.7916157086560673, + "grad_norm": 0.3929113447666168, + "learning_rate": 5.262603614621552e-06, + "loss": 4.0005, + "step": 55805 + }, + { + "epoch": 3.7919554287267294, + "grad_norm": 0.23117665946483612, + "learning_rate": 5.262178964533226e-06, + "loss": 3.8563, + "step": 55810 + }, + { + "epoch": 3.792295148797391, + "grad_norm": 0.22427237033843994, + "learning_rate": 5.2617543144448975e-06, + "loss": 3.8156, + "step": 55815 + }, + { + "epoch": 3.7926348688680527, + "grad_norm": 0.23812256753444672, + "learning_rate": 5.26132966435657e-06, + "loss": 4.0828, + "step": 55820 + }, + { + "epoch": 3.7929745889387148, + "grad_norm": 0.302453875541687, + "learning_rate": 5.260905014268244e-06, + "loss": 4.0298, + "step": 55825 + }, + { + "epoch": 3.7933143090093764, + "grad_norm": 0.3006463646888733, + "learning_rate": 5.260480364179916e-06, + "loss": 4.0212, + "step": 55830 + }, + { + "epoch": 3.793654029080038, + "grad_norm": 0.4111560881137848, + "learning_rate": 5.260055714091589e-06, + "loss": 4.1383, + "step": 55835 + }, + { + "epoch": 3.7939937491507, + "grad_norm": 0.41490381956100464, + "learning_rate": 5.259631064003262e-06, + "loss": 4.0879, + "step": 55840 + }, + { + "epoch": 3.7943334692213617, + "grad_norm": 0.2560979425907135, + "learning_rate": 5.259206413914934e-06, + "loss": 3.8842, + "step": 55845 + }, + { + "epoch": 3.7946731892920234, + "grad_norm": 0.34805646538734436, + "learning_rate": 5.258781763826607e-06, + "loss": 3.9768, + "step": 55850 + }, + { + "epoch": 3.795012909362685, + "grad_norm": 0.31479495763778687, + "learning_rate": 5.258357113738281e-06, + "loss": 3.9765, + "step": 55855 + }, + { + "epoch": 3.795352629433347, + "grad_norm": 0.24308577179908752, + "learning_rate": 5.257932463649953e-06, + "loss": 3.9487, + "step": 55860 + }, + { + "epoch": 3.7956923495040087, + "grad_norm": 0.31922879815101624, + "learning_rate": 5.2575078135616255e-06, + "loss": 4.0997, + "step": 55865 + }, + { + "epoch": 3.7960320695746703, + "grad_norm": 0.27844443917274475, + "learning_rate": 5.257083163473299e-06, + "loss": 3.8812, + "step": 55870 + }, + { + "epoch": 3.7963717896453324, + "grad_norm": 0.3839071989059448, + "learning_rate": 5.256658513384971e-06, + "loss": 3.9872, + "step": 55875 + }, + { + "epoch": 3.796711509715994, + "grad_norm": 0.2545519173145294, + "learning_rate": 5.256233863296644e-06, + "loss": 4.1035, + "step": 55880 + }, + { + "epoch": 3.7970512297866557, + "grad_norm": 0.27546200156211853, + "learning_rate": 5.255809213208317e-06, + "loss": 4.0345, + "step": 55885 + }, + { + "epoch": 3.7973909498573177, + "grad_norm": 0.3490890860557556, + "learning_rate": 5.2553845631199895e-06, + "loss": 4.0258, + "step": 55890 + }, + { + "epoch": 3.7977306699279794, + "grad_norm": 1.2513787746429443, + "learning_rate": 5.254959913031662e-06, + "loss": 3.8991, + "step": 55895 + }, + { + "epoch": 3.798070389998641, + "grad_norm": 0.27272483706474304, + "learning_rate": 5.254535262943335e-06, + "loss": 4.0925, + "step": 55900 + }, + { + "epoch": 3.7984101100693026, + "grad_norm": 0.31679773330688477, + "learning_rate": 5.254110612855008e-06, + "loss": 4.1884, + "step": 55905 + }, + { + "epoch": 3.7987498301399647, + "grad_norm": 0.36458078026771545, + "learning_rate": 5.25368596276668e-06, + "loss": 3.7435, + "step": 55910 + }, + { + "epoch": 3.7990895502106263, + "grad_norm": 0.22002461552619934, + "learning_rate": 5.2532613126783535e-06, + "loss": 3.9731, + "step": 55915 + }, + { + "epoch": 3.799429270281288, + "grad_norm": 0.40914618968963623, + "learning_rate": 5.252836662590026e-06, + "loss": 4.1606, + "step": 55920 + }, + { + "epoch": 3.79976899035195, + "grad_norm": 0.4173300564289093, + "learning_rate": 5.252412012501698e-06, + "loss": 4.0447, + "step": 55925 + }, + { + "epoch": 3.8001087104226117, + "grad_norm": 0.2606920897960663, + "learning_rate": 5.251987362413372e-06, + "loss": 4.068, + "step": 55930 + }, + { + "epoch": 3.8004484304932733, + "grad_norm": 0.23694784939289093, + "learning_rate": 5.251562712325045e-06, + "loss": 4.1764, + "step": 55935 + }, + { + "epoch": 3.8007881505639354, + "grad_norm": 0.2539706826210022, + "learning_rate": 5.251138062236717e-06, + "loss": 3.8338, + "step": 55940 + }, + { + "epoch": 3.801127870634597, + "grad_norm": 0.43233516812324524, + "learning_rate": 5.25071341214839e-06, + "loss": 3.9379, + "step": 55945 + }, + { + "epoch": 3.8014675907052586, + "grad_norm": 0.30170679092407227, + "learning_rate": 5.250288762060063e-06, + "loss": 4.0373, + "step": 55950 + }, + { + "epoch": 3.8018073107759207, + "grad_norm": 0.2589850127696991, + "learning_rate": 5.249864111971735e-06, + "loss": 3.8999, + "step": 55955 + }, + { + "epoch": 3.8021470308465823, + "grad_norm": 0.2408841848373413, + "learning_rate": 5.249439461883409e-06, + "loss": 4.0735, + "step": 55960 + }, + { + "epoch": 3.802486750917244, + "grad_norm": 0.4303705394268036, + "learning_rate": 5.2490148117950816e-06, + "loss": 3.7873, + "step": 55965 + }, + { + "epoch": 3.802826470987906, + "grad_norm": 0.2577287554740906, + "learning_rate": 5.2485901617067535e-06, + "loss": 4.0793, + "step": 55970 + }, + { + "epoch": 3.8031661910585677, + "grad_norm": 0.3373957872390747, + "learning_rate": 5.248165511618427e-06, + "loss": 4.2836, + "step": 55975 + }, + { + "epoch": 3.8035059111292293, + "grad_norm": 0.26077574491500854, + "learning_rate": 5.2477408615301e-06, + "loss": 3.7973, + "step": 55980 + }, + { + "epoch": 3.8038456311998914, + "grad_norm": 0.2846406102180481, + "learning_rate": 5.247316211441772e-06, + "loss": 4.037, + "step": 55985 + }, + { + "epoch": 3.804185351270553, + "grad_norm": 0.7587524056434631, + "learning_rate": 5.2468915613534456e-06, + "loss": 3.8419, + "step": 55990 + }, + { + "epoch": 3.8045250713412146, + "grad_norm": 0.2604556083679199, + "learning_rate": 5.2464669112651175e-06, + "loss": 4.0872, + "step": 55995 + }, + { + "epoch": 3.8048647914118767, + "grad_norm": 0.2990024983882904, + "learning_rate": 5.24604226117679e-06, + "loss": 3.9433, + "step": 56000 + }, + { + "epoch": 3.8052045114825384, + "grad_norm": 0.31455889344215393, + "learning_rate": 5.245617611088464e-06, + "loss": 4.1857, + "step": 56005 + }, + { + "epoch": 3.8055442315532, + "grad_norm": 0.4220700263977051, + "learning_rate": 5.245192961000136e-06, + "loss": 3.8479, + "step": 56010 + }, + { + "epoch": 3.805883951623862, + "grad_norm": 0.20988669991493225, + "learning_rate": 5.244768310911809e-06, + "loss": 3.957, + "step": 56015 + }, + { + "epoch": 3.8062236716945237, + "grad_norm": 0.21603892743587494, + "learning_rate": 5.244343660823482e-06, + "loss": 3.9961, + "step": 56020 + }, + { + "epoch": 3.8065633917651853, + "grad_norm": 0.31987255811691284, + "learning_rate": 5.243919010735154e-06, + "loss": 3.993, + "step": 56025 + }, + { + "epoch": 3.8069031118358474, + "grad_norm": 0.22416149079799652, + "learning_rate": 5.243494360646827e-06, + "loss": 4.1141, + "step": 56030 + }, + { + "epoch": 3.807242831906509, + "grad_norm": 0.2961234748363495, + "learning_rate": 5.243069710558501e-06, + "loss": 4.0105, + "step": 56035 + }, + { + "epoch": 3.8075825519771707, + "grad_norm": 0.20643113553524017, + "learning_rate": 5.242645060470173e-06, + "loss": 3.8932, + "step": 56040 + }, + { + "epoch": 3.8079222720478327, + "grad_norm": 0.38472074270248413, + "learning_rate": 5.2422204103818455e-06, + "loss": 3.7354, + "step": 56045 + }, + { + "epoch": 3.8082619921184944, + "grad_norm": 0.37341421842575073, + "learning_rate": 5.241795760293519e-06, + "loss": 4.1084, + "step": 56050 + }, + { + "epoch": 3.808601712189156, + "grad_norm": 0.30780646204948425, + "learning_rate": 5.241371110205191e-06, + "loss": 4.1058, + "step": 56055 + }, + { + "epoch": 3.808941432259818, + "grad_norm": 0.32385653257369995, + "learning_rate": 5.240946460116864e-06, + "loss": 4.0298, + "step": 56060 + }, + { + "epoch": 3.8092811523304797, + "grad_norm": 0.25164303183555603, + "learning_rate": 5.240521810028537e-06, + "loss": 4.1034, + "step": 56065 + }, + { + "epoch": 3.8096208724011413, + "grad_norm": 0.5296924710273743, + "learning_rate": 5.2400971599402095e-06, + "loss": 4.0193, + "step": 56070 + }, + { + "epoch": 3.8099605924718034, + "grad_norm": 0.30492326617240906, + "learning_rate": 5.239672509851883e-06, + "loss": 4.2065, + "step": 56075 + }, + { + "epoch": 3.810300312542465, + "grad_norm": 0.2929230332374573, + "learning_rate": 5.239247859763555e-06, + "loss": 4.0248, + "step": 56080 + }, + { + "epoch": 3.8106400326131267, + "grad_norm": 0.24789249897003174, + "learning_rate": 5.238823209675228e-06, + "loss": 3.906, + "step": 56085 + }, + { + "epoch": 3.8109797526837887, + "grad_norm": 0.3532521426677704, + "learning_rate": 5.2383985595869016e-06, + "loss": 3.8279, + "step": 56090 + }, + { + "epoch": 3.8113194727544504, + "grad_norm": 0.3388102054595947, + "learning_rate": 5.2379739094985735e-06, + "loss": 4.1603, + "step": 56095 + }, + { + "epoch": 3.811659192825112, + "grad_norm": 0.25282105803489685, + "learning_rate": 5.237549259410246e-06, + "loss": 3.9412, + "step": 56100 + }, + { + "epoch": 3.811998912895774, + "grad_norm": 0.2858559787273407, + "learning_rate": 5.23712460932192e-06, + "loss": 4.0268, + "step": 56105 + }, + { + "epoch": 3.8123386329664357, + "grad_norm": 0.35147371888160706, + "learning_rate": 5.236699959233592e-06, + "loss": 4.379, + "step": 56110 + }, + { + "epoch": 3.8126783530370973, + "grad_norm": 0.27043449878692627, + "learning_rate": 5.236275309145265e-06, + "loss": 4.1476, + "step": 56115 + }, + { + "epoch": 3.8130180731077594, + "grad_norm": 0.24798274040222168, + "learning_rate": 5.235850659056938e-06, + "loss": 3.8276, + "step": 56120 + }, + { + "epoch": 3.813357793178421, + "grad_norm": 0.291950523853302, + "learning_rate": 5.23542600896861e-06, + "loss": 3.8959, + "step": 56125 + }, + { + "epoch": 3.8136975132490827, + "grad_norm": 0.37640121579170227, + "learning_rate": 5.235001358880283e-06, + "loss": 4.2014, + "step": 56130 + }, + { + "epoch": 3.8140372333197448, + "grad_norm": 0.265920490026474, + "learning_rate": 5.234576708791956e-06, + "loss": 3.762, + "step": 56135 + }, + { + "epoch": 3.8143769533904064, + "grad_norm": 0.2816900312900543, + "learning_rate": 5.234152058703629e-06, + "loss": 3.7754, + "step": 56140 + }, + { + "epoch": 3.814716673461068, + "grad_norm": 0.3454332947731018, + "learning_rate": 5.2337274086153015e-06, + "loss": 3.995, + "step": 56145 + }, + { + "epoch": 3.81505639353173, + "grad_norm": 0.2938835322856903, + "learning_rate": 5.233302758526974e-06, + "loss": 4.1541, + "step": 56150 + }, + { + "epoch": 3.8153961136023917, + "grad_norm": 0.3407337963581085, + "learning_rate": 5.232878108438647e-06, + "loss": 3.9086, + "step": 56155 + }, + { + "epoch": 3.8157358336730534, + "grad_norm": 0.6165311932563782, + "learning_rate": 5.232453458350319e-06, + "loss": 3.8926, + "step": 56160 + }, + { + "epoch": 3.8160755537437154, + "grad_norm": 0.42324596643447876, + "learning_rate": 5.232028808261993e-06, + "loss": 4.2461, + "step": 56165 + }, + { + "epoch": 3.816415273814377, + "grad_norm": 0.25698480010032654, + "learning_rate": 5.2316041581736655e-06, + "loss": 4.0462, + "step": 56170 + }, + { + "epoch": 3.8167549938850387, + "grad_norm": 0.25330427289009094, + "learning_rate": 5.2311795080853375e-06, + "loss": 3.9694, + "step": 56175 + }, + { + "epoch": 3.8170947139557008, + "grad_norm": 0.3636065423488617, + "learning_rate": 5.230754857997011e-06, + "loss": 3.9938, + "step": 56180 + }, + { + "epoch": 3.8174344340263624, + "grad_norm": 0.2555505037307739, + "learning_rate": 5.230330207908684e-06, + "loss": 3.9976, + "step": 56185 + }, + { + "epoch": 3.817774154097024, + "grad_norm": 0.30434975028038025, + "learning_rate": 5.229905557820356e-06, + "loss": 3.9748, + "step": 56190 + }, + { + "epoch": 3.8181138741676857, + "grad_norm": 0.39215171337127686, + "learning_rate": 5.2294809077320295e-06, + "loss": 4.2081, + "step": 56195 + }, + { + "epoch": 3.8184535942383477, + "grad_norm": 0.27584561705589294, + "learning_rate": 5.229056257643702e-06, + "loss": 4.1309, + "step": 56200 + }, + { + "epoch": 3.8187933143090094, + "grad_norm": 0.3278537690639496, + "learning_rate": 5.228631607555374e-06, + "loss": 3.9224, + "step": 56205 + }, + { + "epoch": 3.819133034379671, + "grad_norm": 0.5773976445198059, + "learning_rate": 5.228206957467048e-06, + "loss": 3.9125, + "step": 56210 + }, + { + "epoch": 3.819472754450333, + "grad_norm": 0.33191153407096863, + "learning_rate": 5.227782307378721e-06, + "loss": 3.9959, + "step": 56215 + }, + { + "epoch": 3.8198124745209947, + "grad_norm": 0.2865919768810272, + "learning_rate": 5.227357657290393e-06, + "loss": 4.1047, + "step": 56220 + }, + { + "epoch": 3.8201521945916563, + "grad_norm": 0.4699369966983795, + "learning_rate": 5.226933007202066e-06, + "loss": 3.7915, + "step": 56225 + }, + { + "epoch": 3.8204919146623184, + "grad_norm": 0.4014461040496826, + "learning_rate": 5.226508357113738e-06, + "loss": 3.9775, + "step": 56230 + }, + { + "epoch": 3.82083163473298, + "grad_norm": 0.26069357991218567, + "learning_rate": 5.226083707025411e-06, + "loss": 4.1497, + "step": 56235 + }, + { + "epoch": 3.8211713548036417, + "grad_norm": 0.5868792533874512, + "learning_rate": 5.225659056937085e-06, + "loss": 4.1202, + "step": 56240 + }, + { + "epoch": 3.8215110748743033, + "grad_norm": 0.31488847732543945, + "learning_rate": 5.225234406848757e-06, + "loss": 3.9229, + "step": 56245 + }, + { + "epoch": 3.8218507949449654, + "grad_norm": 0.33071011304855347, + "learning_rate": 5.2248097567604295e-06, + "loss": 3.8009, + "step": 56250 + }, + { + "epoch": 3.822190515015627, + "grad_norm": 0.2954646348953247, + "learning_rate": 5.224385106672103e-06, + "loss": 4.1724, + "step": 56255 + }, + { + "epoch": 3.8225302350862886, + "grad_norm": 0.2530055046081543, + "learning_rate": 5.223960456583775e-06, + "loss": 3.9389, + "step": 56260 + }, + { + "epoch": 3.8228699551569507, + "grad_norm": 0.2022290676832199, + "learning_rate": 5.223535806495448e-06, + "loss": 3.8726, + "step": 56265 + }, + { + "epoch": 3.8232096752276123, + "grad_norm": 0.31264010071754456, + "learning_rate": 5.2231111564071216e-06, + "loss": 3.8081, + "step": 56270 + }, + { + "epoch": 3.823549395298274, + "grad_norm": 0.2779092788696289, + "learning_rate": 5.2226865063187935e-06, + "loss": 3.9815, + "step": 56275 + }, + { + "epoch": 3.823889115368936, + "grad_norm": 0.6087946891784668, + "learning_rate": 5.222261856230466e-06, + "loss": 4.125, + "step": 56280 + }, + { + "epoch": 3.8242288354395977, + "grad_norm": 0.45157352089881897, + "learning_rate": 5.22183720614214e-06, + "loss": 4.3177, + "step": 56285 + }, + { + "epoch": 3.8245685555102593, + "grad_norm": 0.2703339755535126, + "learning_rate": 5.221412556053812e-06, + "loss": 4.0372, + "step": 56290 + }, + { + "epoch": 3.8249082755809214, + "grad_norm": 0.28345564007759094, + "learning_rate": 5.220987905965485e-06, + "loss": 4.0059, + "step": 56295 + }, + { + "epoch": 3.825247995651583, + "grad_norm": 0.30226677656173706, + "learning_rate": 5.220563255877158e-06, + "loss": 3.97, + "step": 56300 + }, + { + "epoch": 3.8255877157222447, + "grad_norm": 0.758529007434845, + "learning_rate": 5.22013860578883e-06, + "loss": 4.0428, + "step": 56305 + }, + { + "epoch": 3.8259274357929067, + "grad_norm": 0.35225558280944824, + "learning_rate": 5.219713955700503e-06, + "loss": 4.2003, + "step": 56310 + }, + { + "epoch": 3.8262671558635684, + "grad_norm": 0.2109755426645279, + "learning_rate": 5.219289305612176e-06, + "loss": 3.952, + "step": 56315 + }, + { + "epoch": 3.82660687593423, + "grad_norm": 0.2262825220823288, + "learning_rate": 5.218864655523849e-06, + "loss": 3.7799, + "step": 56320 + }, + { + "epoch": 3.826946596004892, + "grad_norm": 0.2635008990764618, + "learning_rate": 5.2184400054355215e-06, + "loss": 3.8971, + "step": 56325 + }, + { + "epoch": 3.8272863160755537, + "grad_norm": 0.23946866393089294, + "learning_rate": 5.218015355347194e-06, + "loss": 3.9258, + "step": 56330 + }, + { + "epoch": 3.8276260361462153, + "grad_norm": 0.2850153148174286, + "learning_rate": 5.217590705258867e-06, + "loss": 4.013, + "step": 56335 + }, + { + "epoch": 3.8279657562168774, + "grad_norm": 0.22043846547603607, + "learning_rate": 5.217166055170539e-06, + "loss": 4.0302, + "step": 56340 + }, + { + "epoch": 3.828305476287539, + "grad_norm": 0.30768975615501404, + "learning_rate": 5.216741405082213e-06, + "loss": 4.1529, + "step": 56345 + }, + { + "epoch": 3.8286451963582007, + "grad_norm": 0.3126823604106903, + "learning_rate": 5.2163167549938855e-06, + "loss": 4.2118, + "step": 56350 + }, + { + "epoch": 3.8289849164288627, + "grad_norm": 0.35233014822006226, + "learning_rate": 5.2158921049055575e-06, + "loss": 3.7566, + "step": 56355 + }, + { + "epoch": 3.8293246364995244, + "grad_norm": 0.26713618636131287, + "learning_rate": 5.215467454817231e-06, + "loss": 4.0175, + "step": 56360 + }, + { + "epoch": 3.829664356570186, + "grad_norm": 0.3363271653652191, + "learning_rate": 5.215042804728904e-06, + "loss": 4.1115, + "step": 56365 + }, + { + "epoch": 3.830004076640848, + "grad_norm": 0.2646629214286804, + "learning_rate": 5.214618154640576e-06, + "loss": 4.021, + "step": 56370 + }, + { + "epoch": 3.8303437967115097, + "grad_norm": 0.3363969326019287, + "learning_rate": 5.2141935045522495e-06, + "loss": 3.9253, + "step": 56375 + }, + { + "epoch": 3.8306835167821713, + "grad_norm": 0.3342377543449402, + "learning_rate": 5.213768854463922e-06, + "loss": 4.0898, + "step": 56380 + }, + { + "epoch": 3.8310232368528334, + "grad_norm": 0.38610586524009705, + "learning_rate": 5.213344204375594e-06, + "loss": 3.7303, + "step": 56385 + }, + { + "epoch": 3.831362956923495, + "grad_norm": 0.2683180868625641, + "learning_rate": 5.212919554287268e-06, + "loss": 3.8409, + "step": 56390 + }, + { + "epoch": 3.8317026769941567, + "grad_norm": 0.39062774181365967, + "learning_rate": 5.212494904198941e-06, + "loss": 4.2056, + "step": 56395 + }, + { + "epoch": 3.8320423970648188, + "grad_norm": 0.24636343121528625, + "learning_rate": 5.212070254110613e-06, + "loss": 3.8994, + "step": 56400 + }, + { + "epoch": 3.8323821171354804, + "grad_norm": 0.3122744560241699, + "learning_rate": 5.211645604022286e-06, + "loss": 4.1151, + "step": 56405 + }, + { + "epoch": 3.832721837206142, + "grad_norm": 0.21648386120796204, + "learning_rate": 5.211220953933958e-06, + "loss": 4.0544, + "step": 56410 + }, + { + "epoch": 3.833061557276804, + "grad_norm": 0.33851519227027893, + "learning_rate": 5.210796303845632e-06, + "loss": 3.7121, + "step": 56415 + }, + { + "epoch": 3.8334012773474657, + "grad_norm": 0.27145254611968994, + "learning_rate": 5.210371653757305e-06, + "loss": 3.7405, + "step": 56420 + }, + { + "epoch": 3.8337409974181273, + "grad_norm": 0.3872707486152649, + "learning_rate": 5.209947003668977e-06, + "loss": 3.7171, + "step": 56425 + }, + { + "epoch": 3.8340807174887894, + "grad_norm": 0.2838762402534485, + "learning_rate": 5.20952235358065e-06, + "loss": 3.8729, + "step": 56430 + }, + { + "epoch": 3.834420437559451, + "grad_norm": 0.30423128604888916, + "learning_rate": 5.209097703492323e-06, + "loss": 3.9595, + "step": 56435 + }, + { + "epoch": 3.8347601576301127, + "grad_norm": 0.27305135130882263, + "learning_rate": 5.208673053403995e-06, + "loss": 3.8641, + "step": 56440 + }, + { + "epoch": 3.8350998777007748, + "grad_norm": 0.2819689214229584, + "learning_rate": 5.208248403315669e-06, + "loss": 3.7916, + "step": 56445 + }, + { + "epoch": 3.8354395977714364, + "grad_norm": 0.33939042687416077, + "learning_rate": 5.2078237532273415e-06, + "loss": 3.9137, + "step": 56450 + }, + { + "epoch": 3.835779317842098, + "grad_norm": 0.249456524848938, + "learning_rate": 5.2073991031390135e-06, + "loss": 4.081, + "step": 56455 + }, + { + "epoch": 3.83611903791276, + "grad_norm": 0.22614245116710663, + "learning_rate": 5.206974453050687e-06, + "loss": 4.1195, + "step": 56460 + }, + { + "epoch": 3.8364587579834217, + "grad_norm": 0.3007263243198395, + "learning_rate": 5.20654980296236e-06, + "loss": 3.9431, + "step": 56465 + }, + { + "epoch": 3.8367984780540834, + "grad_norm": 0.24145852029323578, + "learning_rate": 5.206125152874032e-06, + "loss": 4.0161, + "step": 56470 + }, + { + "epoch": 3.8371381981247454, + "grad_norm": 0.2991988956928253, + "learning_rate": 5.2057005027857055e-06, + "loss": 4.0519, + "step": 56475 + }, + { + "epoch": 3.837477918195407, + "grad_norm": 0.31414541602134705, + "learning_rate": 5.2052758526973775e-06, + "loss": 3.9642, + "step": 56480 + }, + { + "epoch": 3.8378176382660687, + "grad_norm": 0.33629053831100464, + "learning_rate": 5.20485120260905e-06, + "loss": 4.1601, + "step": 56485 + }, + { + "epoch": 3.8381573583367308, + "grad_norm": 0.5627990365028381, + "learning_rate": 5.204426552520724e-06, + "loss": 4.0651, + "step": 56490 + }, + { + "epoch": 3.8384970784073924, + "grad_norm": 0.28463178873062134, + "learning_rate": 5.204001902432396e-06, + "loss": 4.1549, + "step": 56495 + }, + { + "epoch": 3.838836798478054, + "grad_norm": 0.6054543256759644, + "learning_rate": 5.203577252344069e-06, + "loss": 4.0733, + "step": 56500 + }, + { + "epoch": 3.839176518548716, + "grad_norm": 0.25341513752937317, + "learning_rate": 5.203152602255742e-06, + "loss": 3.8202, + "step": 56505 + }, + { + "epoch": 3.8395162386193777, + "grad_norm": 0.3092518448829651, + "learning_rate": 5.202727952167414e-06, + "loss": 4.0135, + "step": 56510 + }, + { + "epoch": 3.8398559586900394, + "grad_norm": 0.27742999792099, + "learning_rate": 5.202303302079087e-06, + "loss": 4.0122, + "step": 56515 + }, + { + "epoch": 3.8401956787607014, + "grad_norm": 0.3474878966808319, + "learning_rate": 5.201878651990761e-06, + "loss": 4.0428, + "step": 56520 + }, + { + "epoch": 3.840535398831363, + "grad_norm": 0.2263130098581314, + "learning_rate": 5.201454001902433e-06, + "loss": 3.7193, + "step": 56525 + }, + { + "epoch": 3.8408751189020247, + "grad_norm": 0.3265018165111542, + "learning_rate": 5.2010293518141055e-06, + "loss": 3.8657, + "step": 56530 + }, + { + "epoch": 3.8412148389726863, + "grad_norm": 0.726813018321991, + "learning_rate": 5.200604701725779e-06, + "loss": 3.7901, + "step": 56535 + }, + { + "epoch": 3.8415545590433484, + "grad_norm": 0.27611035108566284, + "learning_rate": 5.200180051637451e-06, + "loss": 4.271, + "step": 56540 + }, + { + "epoch": 3.84189427911401, + "grad_norm": 0.3425690233707428, + "learning_rate": 5.199755401549124e-06, + "loss": 4.3162, + "step": 56545 + }, + { + "epoch": 3.8422339991846717, + "grad_norm": 0.25927451252937317, + "learning_rate": 5.1993307514607976e-06, + "loss": 3.9816, + "step": 56550 + }, + { + "epoch": 3.8425737192553338, + "grad_norm": 0.561141312122345, + "learning_rate": 5.1989061013724695e-06, + "loss": 4.074, + "step": 56555 + }, + { + "epoch": 3.8429134393259954, + "grad_norm": 0.2828943133354187, + "learning_rate": 5.198481451284142e-06, + "loss": 4.155, + "step": 56560 + }, + { + "epoch": 3.843253159396657, + "grad_norm": 0.283611536026001, + "learning_rate": 5.198056801195815e-06, + "loss": 3.8975, + "step": 56565 + }, + { + "epoch": 3.843592879467319, + "grad_norm": 0.35810768604278564, + "learning_rate": 5.197632151107488e-06, + "loss": 4.2188, + "step": 56570 + }, + { + "epoch": 3.8439325995379807, + "grad_norm": 0.3078592121601105, + "learning_rate": 5.19720750101916e-06, + "loss": 3.7198, + "step": 56575 + }, + { + "epoch": 3.8442723196086424, + "grad_norm": 0.2835005223751068, + "learning_rate": 5.1967828509308335e-06, + "loss": 4.095, + "step": 56580 + }, + { + "epoch": 3.844612039679304, + "grad_norm": 0.3171232044696808, + "learning_rate": 5.196358200842506e-06, + "loss": 4.2438, + "step": 56585 + }, + { + "epoch": 3.844951759749966, + "grad_norm": 0.2520398795604706, + "learning_rate": 5.195933550754178e-06, + "loss": 3.9024, + "step": 56590 + }, + { + "epoch": 3.8452914798206277, + "grad_norm": 0.24808929860591888, + "learning_rate": 5.195508900665852e-06, + "loss": 3.9804, + "step": 56595 + }, + { + "epoch": 3.8456311998912893, + "grad_norm": 0.31310778856277466, + "learning_rate": 5.195084250577525e-06, + "loss": 4.0595, + "step": 56600 + }, + { + "epoch": 3.8459709199619514, + "grad_norm": 0.3325978219509125, + "learning_rate": 5.194659600489197e-06, + "loss": 3.8179, + "step": 56605 + }, + { + "epoch": 3.846310640032613, + "grad_norm": 0.1987306922674179, + "learning_rate": 5.19423495040087e-06, + "loss": 3.8885, + "step": 56610 + }, + { + "epoch": 3.8466503601032747, + "grad_norm": 0.5919678807258606, + "learning_rate": 5.193810300312543e-06, + "loss": 3.9764, + "step": 56615 + }, + { + "epoch": 3.8469900801739367, + "grad_norm": 0.278903990983963, + "learning_rate": 5.193385650224215e-06, + "loss": 3.9222, + "step": 56620 + }, + { + "epoch": 3.8473298002445984, + "grad_norm": 0.3128662109375, + "learning_rate": 5.192961000135889e-06, + "loss": 3.8445, + "step": 56625 + }, + { + "epoch": 3.84766952031526, + "grad_norm": 0.31179314851760864, + "learning_rate": 5.1925363500475615e-06, + "loss": 4.1432, + "step": 56630 + }, + { + "epoch": 3.848009240385922, + "grad_norm": 0.2864527702331543, + "learning_rate": 5.1921116999592335e-06, + "loss": 4.0236, + "step": 56635 + }, + { + "epoch": 3.8483489604565837, + "grad_norm": 0.2790963351726532, + "learning_rate": 5.191687049870907e-06, + "loss": 3.9951, + "step": 56640 + }, + { + "epoch": 3.8486886805272453, + "grad_norm": 0.21928410232067108, + "learning_rate": 5.19126239978258e-06, + "loss": 4.0248, + "step": 56645 + }, + { + "epoch": 3.8490284005979074, + "grad_norm": 0.2500622570514679, + "learning_rate": 5.190837749694252e-06, + "loss": 3.7214, + "step": 56650 + }, + { + "epoch": 3.849368120668569, + "grad_norm": 0.2585926651954651, + "learning_rate": 5.1904130996059255e-06, + "loss": 3.9391, + "step": 56655 + }, + { + "epoch": 3.8497078407392307, + "grad_norm": 0.26001405715942383, + "learning_rate": 5.1899884495175975e-06, + "loss": 4.069, + "step": 56660 + }, + { + "epoch": 3.8500475608098927, + "grad_norm": 0.422750860452652, + "learning_rate": 5.18956379942927e-06, + "loss": 3.9048, + "step": 56665 + }, + { + "epoch": 3.8503872808805544, + "grad_norm": 0.23138444125652313, + "learning_rate": 5.189139149340944e-06, + "loss": 3.8318, + "step": 56670 + }, + { + "epoch": 3.850727000951216, + "grad_norm": 0.2444857954978943, + "learning_rate": 5.188714499252616e-06, + "loss": 4.0802, + "step": 56675 + }, + { + "epoch": 3.851066721021878, + "grad_norm": 0.423103004693985, + "learning_rate": 5.188289849164289e-06, + "loss": 3.946, + "step": 56680 + }, + { + "epoch": 3.8514064410925397, + "grad_norm": 0.5521993637084961, + "learning_rate": 5.187865199075962e-06, + "loss": 3.975, + "step": 56685 + }, + { + "epoch": 3.8517461611632013, + "grad_norm": 0.32362881302833557, + "learning_rate": 5.187440548987634e-06, + "loss": 4.0534, + "step": 56690 + }, + { + "epoch": 3.8520858812338634, + "grad_norm": 0.2819020450115204, + "learning_rate": 5.187015898899307e-06, + "loss": 4.125, + "step": 56695 + }, + { + "epoch": 3.852425601304525, + "grad_norm": 0.27213847637176514, + "learning_rate": 5.186591248810981e-06, + "loss": 4.0867, + "step": 56700 + }, + { + "epoch": 3.8527653213751867, + "grad_norm": 0.2833256721496582, + "learning_rate": 5.186166598722653e-06, + "loss": 3.7926, + "step": 56705 + }, + { + "epoch": 3.8531050414458488, + "grad_norm": 0.2638276517391205, + "learning_rate": 5.1857419486343255e-06, + "loss": 3.9078, + "step": 56710 + }, + { + "epoch": 3.8534447615165104, + "grad_norm": 0.38936230540275574, + "learning_rate": 5.185317298545999e-06, + "loss": 3.9541, + "step": 56715 + }, + { + "epoch": 3.853784481587172, + "grad_norm": 0.19175563752651215, + "learning_rate": 5.184892648457671e-06, + "loss": 3.9525, + "step": 56720 + }, + { + "epoch": 3.854124201657834, + "grad_norm": 0.2590482831001282, + "learning_rate": 5.184467998369344e-06, + "loss": 4.115, + "step": 56725 + }, + { + "epoch": 3.8544639217284957, + "grad_norm": 0.30314427614212036, + "learning_rate": 5.184043348281017e-06, + "loss": 4.0307, + "step": 56730 + }, + { + "epoch": 3.8548036417991574, + "grad_norm": 0.2572665512561798, + "learning_rate": 5.1836186981926895e-06, + "loss": 4.1434, + "step": 56735 + }, + { + "epoch": 3.8551433618698194, + "grad_norm": 0.35301700234413147, + "learning_rate": 5.183194048104362e-06, + "loss": 4.0595, + "step": 56740 + }, + { + "epoch": 3.855483081940481, + "grad_norm": 0.3766343593597412, + "learning_rate": 5.182769398016035e-06, + "loss": 3.7425, + "step": 56745 + }, + { + "epoch": 3.8558228020111427, + "grad_norm": 0.22638064622879028, + "learning_rate": 5.182344747927708e-06, + "loss": 4.0656, + "step": 56750 + }, + { + "epoch": 3.8561625220818048, + "grad_norm": 0.2770026624202728, + "learning_rate": 5.1819200978393815e-06, + "loss": 4.2247, + "step": 56755 + }, + { + "epoch": 3.8565022421524664, + "grad_norm": 0.36857905983924866, + "learning_rate": 5.1814954477510535e-06, + "loss": 4.0583, + "step": 56760 + }, + { + "epoch": 3.856841962223128, + "grad_norm": 0.26649338006973267, + "learning_rate": 5.181070797662726e-06, + "loss": 3.9231, + "step": 56765 + }, + { + "epoch": 3.85718168229379, + "grad_norm": 0.3069913983345032, + "learning_rate": 5.1806461475744e-06, + "loss": 4.162, + "step": 56770 + }, + { + "epoch": 3.8575214023644517, + "grad_norm": 0.23037391901016235, + "learning_rate": 5.180221497486072e-06, + "loss": 4.0463, + "step": 56775 + }, + { + "epoch": 3.8578611224351134, + "grad_norm": 0.26735296845436096, + "learning_rate": 5.179796847397745e-06, + "loss": 3.7444, + "step": 56780 + }, + { + "epoch": 3.8582008425057754, + "grad_norm": 0.3300686776638031, + "learning_rate": 5.179372197309418e-06, + "loss": 3.9284, + "step": 56785 + }, + { + "epoch": 3.858540562576437, + "grad_norm": 0.275694340467453, + "learning_rate": 5.17894754722109e-06, + "loss": 4.0346, + "step": 56790 + }, + { + "epoch": 3.8588802826470987, + "grad_norm": 0.2023475170135498, + "learning_rate": 5.178522897132763e-06, + "loss": 4.0276, + "step": 56795 + }, + { + "epoch": 3.8592200027177608, + "grad_norm": 0.37216609716415405, + "learning_rate": 5.178098247044436e-06, + "loss": 3.9837, + "step": 56800 + }, + { + "epoch": 3.8595597227884224, + "grad_norm": 0.34046852588653564, + "learning_rate": 5.177673596956109e-06, + "loss": 3.7814, + "step": 56805 + }, + { + "epoch": 3.859899442859084, + "grad_norm": 0.42550018429756165, + "learning_rate": 5.1772489468677815e-06, + "loss": 3.9519, + "step": 56810 + }, + { + "epoch": 3.860239162929746, + "grad_norm": 0.3436690866947174, + "learning_rate": 5.176824296779454e-06, + "loss": 4.0188, + "step": 56815 + }, + { + "epoch": 3.8605788830004077, + "grad_norm": 0.24025142192840576, + "learning_rate": 5.176399646691127e-06, + "loss": 4.0868, + "step": 56820 + }, + { + "epoch": 3.8609186030710694, + "grad_norm": 0.26819419860839844, + "learning_rate": 5.175974996602799e-06, + "loss": 4.0636, + "step": 56825 + }, + { + "epoch": 3.8612583231417315, + "grad_norm": 0.24563708901405334, + "learning_rate": 5.175550346514473e-06, + "loss": 3.898, + "step": 56830 + }, + { + "epoch": 3.861598043212393, + "grad_norm": 0.2638280391693115, + "learning_rate": 5.1751256964261455e-06, + "loss": 4.0382, + "step": 56835 + }, + { + "epoch": 3.8619377632830547, + "grad_norm": 0.2886779308319092, + "learning_rate": 5.1747010463378175e-06, + "loss": 4.0645, + "step": 56840 + }, + { + "epoch": 3.862277483353717, + "grad_norm": 0.37837928533554077, + "learning_rate": 5.174276396249491e-06, + "loss": 4.1421, + "step": 56845 + }, + { + "epoch": 3.8626172034243784, + "grad_norm": 0.2654368281364441, + "learning_rate": 5.173851746161164e-06, + "loss": 3.7054, + "step": 56850 + }, + { + "epoch": 3.86295692349504, + "grad_norm": 0.27200034260749817, + "learning_rate": 5.173427096072836e-06, + "loss": 3.897, + "step": 56855 + }, + { + "epoch": 3.863296643565702, + "grad_norm": 0.21804285049438477, + "learning_rate": 5.1730024459845095e-06, + "loss": 3.7333, + "step": 56860 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.41363146901130676, + "learning_rate": 5.172577795896182e-06, + "loss": 3.9463, + "step": 56865 + }, + { + "epoch": 3.8639760837070254, + "grad_norm": 0.25052306056022644, + "learning_rate": 5.172153145807854e-06, + "loss": 3.8376, + "step": 56870 + }, + { + "epoch": 3.864315803777687, + "grad_norm": 0.2801183760166168, + "learning_rate": 5.171728495719528e-06, + "loss": 4.2078, + "step": 56875 + }, + { + "epoch": 3.864655523848349, + "grad_norm": 0.242330864071846, + "learning_rate": 5.171303845631201e-06, + "loss": 4.2448, + "step": 56880 + }, + { + "epoch": 3.8649952439190107, + "grad_norm": 0.30182740092277527, + "learning_rate": 5.170879195542873e-06, + "loss": 3.7956, + "step": 56885 + }, + { + "epoch": 3.8653349639896724, + "grad_norm": 0.26145139336586, + "learning_rate": 5.170454545454546e-06, + "loss": 4.0091, + "step": 56890 + }, + { + "epoch": 3.8656746840603344, + "grad_norm": 0.24926036596298218, + "learning_rate": 5.170029895366218e-06, + "loss": 3.8364, + "step": 56895 + }, + { + "epoch": 3.866014404130996, + "grad_norm": 0.361491322517395, + "learning_rate": 5.169605245277891e-06, + "loss": 3.9929, + "step": 56900 + }, + { + "epoch": 3.8663541242016577, + "grad_norm": 0.19540520012378693, + "learning_rate": 5.169180595189565e-06, + "loss": 3.9958, + "step": 56905 + }, + { + "epoch": 3.8666938442723198, + "grad_norm": 0.31078049540519714, + "learning_rate": 5.168755945101237e-06, + "loss": 4.1393, + "step": 56910 + }, + { + "epoch": 3.8670335643429814, + "grad_norm": 0.33876365423202515, + "learning_rate": 5.1683312950129095e-06, + "loss": 4.0719, + "step": 56915 + }, + { + "epoch": 3.867373284413643, + "grad_norm": 0.23746460676193237, + "learning_rate": 5.167906644924583e-06, + "loss": 4.0211, + "step": 56920 + }, + { + "epoch": 3.8677130044843047, + "grad_norm": 0.23708784580230713, + "learning_rate": 5.167481994836255e-06, + "loss": 3.7332, + "step": 56925 + }, + { + "epoch": 3.8680527245549667, + "grad_norm": 0.27058419585227966, + "learning_rate": 5.167057344747928e-06, + "loss": 3.8446, + "step": 56930 + }, + { + "epoch": 3.8683924446256284, + "grad_norm": 0.28976336121559143, + "learning_rate": 5.1666326946596015e-06, + "loss": 3.8973, + "step": 56935 + }, + { + "epoch": 3.86873216469629, + "grad_norm": 0.3971906900405884, + "learning_rate": 5.1662080445712735e-06, + "loss": 4.1793, + "step": 56940 + }, + { + "epoch": 3.869071884766952, + "grad_norm": 0.29852595925331116, + "learning_rate": 5.165783394482946e-06, + "loss": 4.1801, + "step": 56945 + }, + { + "epoch": 3.8694116048376137, + "grad_norm": 0.28891250491142273, + "learning_rate": 5.16535874439462e-06, + "loss": 3.9186, + "step": 56950 + }, + { + "epoch": 3.8697513249082753, + "grad_norm": 0.27055755257606506, + "learning_rate": 5.164934094306292e-06, + "loss": 4.0316, + "step": 56955 + }, + { + "epoch": 3.8700910449789374, + "grad_norm": 0.3276680111885071, + "learning_rate": 5.164509444217965e-06, + "loss": 4.2686, + "step": 56960 + }, + { + "epoch": 3.870430765049599, + "grad_norm": 0.2568356394767761, + "learning_rate": 5.164084794129638e-06, + "loss": 4.0115, + "step": 56965 + }, + { + "epoch": 3.8707704851202607, + "grad_norm": 0.2524409592151642, + "learning_rate": 5.16366014404131e-06, + "loss": 3.9394, + "step": 56970 + }, + { + "epoch": 3.8711102051909227, + "grad_norm": 0.31336233019828796, + "learning_rate": 5.163235493952983e-06, + "loss": 3.9822, + "step": 56975 + }, + { + "epoch": 3.8714499252615844, + "grad_norm": 0.28100305795669556, + "learning_rate": 5.162810843864656e-06, + "loss": 3.9792, + "step": 56980 + }, + { + "epoch": 3.871789645332246, + "grad_norm": 0.39915376901626587, + "learning_rate": 5.162386193776329e-06, + "loss": 4.0312, + "step": 56985 + }, + { + "epoch": 3.872129365402908, + "grad_norm": 0.348465234041214, + "learning_rate": 5.1619615436880015e-06, + "loss": 4.0817, + "step": 56990 + }, + { + "epoch": 3.8724690854735697, + "grad_norm": 0.24691928923130035, + "learning_rate": 5.161536893599674e-06, + "loss": 3.9075, + "step": 56995 + }, + { + "epoch": 3.8728088055442313, + "grad_norm": 0.18756966292858124, + "learning_rate": 5.161112243511347e-06, + "loss": 4.098, + "step": 57000 + }, + { + "epoch": 3.8731485256148934, + "grad_norm": 0.24244177341461182, + "learning_rate": 5.160687593423019e-06, + "loss": 4.0667, + "step": 57005 + }, + { + "epoch": 3.873488245685555, + "grad_norm": 0.33535152673721313, + "learning_rate": 5.160262943334693e-06, + "loss": 3.9048, + "step": 57010 + }, + { + "epoch": 3.8738279657562167, + "grad_norm": 0.30845046043395996, + "learning_rate": 5.1598382932463655e-06, + "loss": 3.9501, + "step": 57015 + }, + { + "epoch": 3.8741676858268788, + "grad_norm": 0.28913745284080505, + "learning_rate": 5.1594136431580374e-06, + "loss": 4.4531, + "step": 57020 + }, + { + "epoch": 3.8745074058975404, + "grad_norm": 0.3041384816169739, + "learning_rate": 5.158988993069711e-06, + "loss": 3.9913, + "step": 57025 + }, + { + "epoch": 3.874847125968202, + "grad_norm": 0.3910787105560303, + "learning_rate": 5.158564342981384e-06, + "loss": 3.8705, + "step": 57030 + }, + { + "epoch": 3.875186846038864, + "grad_norm": 0.24396894872188568, + "learning_rate": 5.158139692893056e-06, + "loss": 4.2697, + "step": 57035 + }, + { + "epoch": 3.8755265661095257, + "grad_norm": 0.23146632313728333, + "learning_rate": 5.1577150428047295e-06, + "loss": 3.869, + "step": 57040 + }, + { + "epoch": 3.8758662861801874, + "grad_norm": 0.2957373559474945, + "learning_rate": 5.157290392716402e-06, + "loss": 3.8944, + "step": 57045 + }, + { + "epoch": 3.8762060062508494, + "grad_norm": 0.4761999845504761, + "learning_rate": 5.156865742628074e-06, + "loss": 3.8025, + "step": 57050 + }, + { + "epoch": 3.876545726321511, + "grad_norm": 0.31418898701667786, + "learning_rate": 5.156441092539748e-06, + "loss": 3.8529, + "step": 57055 + }, + { + "epoch": 3.8768854463921727, + "grad_norm": 0.29597145318984985, + "learning_rate": 5.156016442451421e-06, + "loss": 3.9837, + "step": 57060 + }, + { + "epoch": 3.8772251664628348, + "grad_norm": 0.39296644926071167, + "learning_rate": 5.155591792363093e-06, + "loss": 3.9712, + "step": 57065 + }, + { + "epoch": 3.8775648865334964, + "grad_norm": 0.28441616892814636, + "learning_rate": 5.155167142274766e-06, + "loss": 3.8133, + "step": 57070 + }, + { + "epoch": 3.877904606604158, + "grad_norm": 0.22517693042755127, + "learning_rate": 5.154742492186438e-06, + "loss": 3.9993, + "step": 57075 + }, + { + "epoch": 3.87824432667482, + "grad_norm": 0.31143665313720703, + "learning_rate": 5.154317842098111e-06, + "loss": 3.9159, + "step": 57080 + }, + { + "epoch": 3.8785840467454817, + "grad_norm": 0.35678014159202576, + "learning_rate": 5.153893192009785e-06, + "loss": 4.1504, + "step": 57085 + }, + { + "epoch": 3.8789237668161434, + "grad_norm": 0.3346000611782074, + "learning_rate": 5.153468541921457e-06, + "loss": 3.9778, + "step": 57090 + }, + { + "epoch": 3.8792634868868054, + "grad_norm": 0.29757142066955566, + "learning_rate": 5.15304389183313e-06, + "loss": 4.0529, + "step": 57095 + }, + { + "epoch": 3.879603206957467, + "grad_norm": 0.2564397156238556, + "learning_rate": 5.152619241744803e-06, + "loss": 3.8331, + "step": 57100 + }, + { + "epoch": 3.8799429270281287, + "grad_norm": 0.329372376203537, + "learning_rate": 5.152194591656475e-06, + "loss": 4.0866, + "step": 57105 + }, + { + "epoch": 3.880282647098791, + "grad_norm": 0.3697187602519989, + "learning_rate": 5.151769941568149e-06, + "loss": 4.1134, + "step": 57110 + }, + { + "epoch": 3.8806223671694524, + "grad_norm": 0.2512985169887543, + "learning_rate": 5.1513452914798215e-06, + "loss": 4.0701, + "step": 57115 + }, + { + "epoch": 3.880962087240114, + "grad_norm": 0.38260069489479065, + "learning_rate": 5.1509206413914935e-06, + "loss": 3.8638, + "step": 57120 + }, + { + "epoch": 3.881301807310776, + "grad_norm": 0.45204877853393555, + "learning_rate": 5.150495991303167e-06, + "loss": 3.9542, + "step": 57125 + }, + { + "epoch": 3.8816415273814378, + "grad_norm": 0.23046566545963287, + "learning_rate": 5.15007134121484e-06, + "loss": 4.1865, + "step": 57130 + }, + { + "epoch": 3.8819812474520994, + "grad_norm": 0.29953286051750183, + "learning_rate": 5.149646691126512e-06, + "loss": 3.9567, + "step": 57135 + }, + { + "epoch": 3.8823209675227615, + "grad_norm": 0.27269676327705383, + "learning_rate": 5.1492220410381855e-06, + "loss": 3.8107, + "step": 57140 + }, + { + "epoch": 3.882660687593423, + "grad_norm": 0.2479570060968399, + "learning_rate": 5.1487973909498575e-06, + "loss": 3.9589, + "step": 57145 + }, + { + "epoch": 3.8830004076640847, + "grad_norm": 0.3544633984565735, + "learning_rate": 5.14837274086153e-06, + "loss": 4.0682, + "step": 57150 + }, + { + "epoch": 3.883340127734747, + "grad_norm": 0.2759782373905182, + "learning_rate": 5.147948090773204e-06, + "loss": 3.7313, + "step": 57155 + }, + { + "epoch": 3.8836798478054084, + "grad_norm": 0.2989575266838074, + "learning_rate": 5.147523440684876e-06, + "loss": 4.3508, + "step": 57160 + }, + { + "epoch": 3.88401956787607, + "grad_norm": 0.2827474772930145, + "learning_rate": 5.147098790596549e-06, + "loss": 3.9779, + "step": 57165 + }, + { + "epoch": 3.884359287946732, + "grad_norm": 0.250960111618042, + "learning_rate": 5.146674140508222e-06, + "loss": 3.9767, + "step": 57170 + }, + { + "epoch": 3.8846990080173938, + "grad_norm": 0.26340967416763306, + "learning_rate": 5.146249490419894e-06, + "loss": 3.7083, + "step": 57175 + }, + { + "epoch": 3.8850387280880554, + "grad_norm": 0.3013034462928772, + "learning_rate": 5.145824840331567e-06, + "loss": 3.7281, + "step": 57180 + }, + { + "epoch": 3.8853784481587175, + "grad_norm": 0.23237712681293488, + "learning_rate": 5.145400190243241e-06, + "loss": 4.0642, + "step": 57185 + }, + { + "epoch": 3.885718168229379, + "grad_norm": 0.27681854367256165, + "learning_rate": 5.144975540154913e-06, + "loss": 3.7784, + "step": 57190 + }, + { + "epoch": 3.8860578883000407, + "grad_norm": 0.24328187108039856, + "learning_rate": 5.1445508900665855e-06, + "loss": 3.8653, + "step": 57195 + }, + { + "epoch": 3.886397608370703, + "grad_norm": 0.2746292054653168, + "learning_rate": 5.144126239978259e-06, + "loss": 3.5934, + "step": 57200 + }, + { + "epoch": 3.8867373284413644, + "grad_norm": 0.26555904746055603, + "learning_rate": 5.143701589889931e-06, + "loss": 4.0306, + "step": 57205 + }, + { + "epoch": 3.887077048512026, + "grad_norm": 0.25817185640335083, + "learning_rate": 5.143276939801604e-06, + "loss": 3.9971, + "step": 57210 + }, + { + "epoch": 3.887416768582688, + "grad_norm": 0.4859372675418854, + "learning_rate": 5.1428522897132775e-06, + "loss": 4.1953, + "step": 57215 + }, + { + "epoch": 3.8877564886533498, + "grad_norm": 0.31015437841415405, + "learning_rate": 5.1424276396249495e-06, + "loss": 4.0406, + "step": 57220 + }, + { + "epoch": 3.8880962087240114, + "grad_norm": 0.2734571695327759, + "learning_rate": 5.142002989536622e-06, + "loss": 3.8749, + "step": 57225 + }, + { + "epoch": 3.888435928794673, + "grad_norm": 0.3479461967945099, + "learning_rate": 5.141578339448295e-06, + "loss": 3.8902, + "step": 57230 + }, + { + "epoch": 3.888775648865335, + "grad_norm": 0.3174036145210266, + "learning_rate": 5.141153689359968e-06, + "loss": 3.987, + "step": 57235 + }, + { + "epoch": 3.8891153689359967, + "grad_norm": 0.2068891078233719, + "learning_rate": 5.14072903927164e-06, + "loss": 3.8276, + "step": 57240 + }, + { + "epoch": 3.8894550890066584, + "grad_norm": 0.2798910439014435, + "learning_rate": 5.1403043891833135e-06, + "loss": 3.8953, + "step": 57245 + }, + { + "epoch": 3.8897948090773204, + "grad_norm": 0.24753804504871368, + "learning_rate": 5.139879739094986e-06, + "loss": 3.8893, + "step": 57250 + }, + { + "epoch": 3.890134529147982, + "grad_norm": 0.2622090280056, + "learning_rate": 5.139455089006658e-06, + "loss": 4.229, + "step": 57255 + }, + { + "epoch": 3.8904742492186437, + "grad_norm": 0.30632349848747253, + "learning_rate": 5.139030438918332e-06, + "loss": 4.1113, + "step": 57260 + }, + { + "epoch": 3.8908139692893053, + "grad_norm": 0.28306901454925537, + "learning_rate": 5.138605788830005e-06, + "loss": 4.08, + "step": 57265 + }, + { + "epoch": 3.8911536893599674, + "grad_norm": 0.38308948278427124, + "learning_rate": 5.138181138741677e-06, + "loss": 3.9967, + "step": 57270 + }, + { + "epoch": 3.891493409430629, + "grad_norm": 0.4016103744506836, + "learning_rate": 5.13775648865335e-06, + "loss": 4.0553, + "step": 57275 + }, + { + "epoch": 3.8918331295012907, + "grad_norm": 0.2808682322502136, + "learning_rate": 5.137331838565023e-06, + "loss": 3.8678, + "step": 57280 + }, + { + "epoch": 3.8921728495719528, + "grad_norm": 0.2658054530620575, + "learning_rate": 5.136907188476695e-06, + "loss": 3.8986, + "step": 57285 + }, + { + "epoch": 3.8925125696426144, + "grad_norm": 0.26166439056396484, + "learning_rate": 5.136482538388369e-06, + "loss": 4.0691, + "step": 57290 + }, + { + "epoch": 3.892852289713276, + "grad_norm": 0.3426058888435364, + "learning_rate": 5.1360578883000415e-06, + "loss": 4.1041, + "step": 57295 + }, + { + "epoch": 3.893192009783938, + "grad_norm": 0.3161298334598541, + "learning_rate": 5.1356332382117134e-06, + "loss": 4.0333, + "step": 57300 + }, + { + "epoch": 3.8935317298545997, + "grad_norm": 0.27045050263404846, + "learning_rate": 5.135208588123387e-06, + "loss": 4.1251, + "step": 57305 + }, + { + "epoch": 3.8938714499252614, + "grad_norm": 0.304773211479187, + "learning_rate": 5.13478393803506e-06, + "loss": 4.0454, + "step": 57310 + }, + { + "epoch": 3.8942111699959234, + "grad_norm": 0.33203133940696716, + "learning_rate": 5.134359287946732e-06, + "loss": 3.9888, + "step": 57315 + }, + { + "epoch": 3.894550890066585, + "grad_norm": 0.35167011618614197, + "learning_rate": 5.1339346378584055e-06, + "loss": 4.1775, + "step": 57320 + }, + { + "epoch": 3.8948906101372467, + "grad_norm": 0.2993781864643097, + "learning_rate": 5.1335099877700774e-06, + "loss": 4.0271, + "step": 57325 + }, + { + "epoch": 3.8952303302079088, + "grad_norm": 0.3439280390739441, + "learning_rate": 5.13308533768175e-06, + "loss": 4.0719, + "step": 57330 + }, + { + "epoch": 3.8955700502785704, + "grad_norm": 0.27476567029953003, + "learning_rate": 5.132660687593424e-06, + "loss": 3.952, + "step": 57335 + }, + { + "epoch": 3.895909770349232, + "grad_norm": 0.3822624683380127, + "learning_rate": 5.132236037505096e-06, + "loss": 3.8461, + "step": 57340 + }, + { + "epoch": 3.896249490419894, + "grad_norm": 0.313394159078598, + "learning_rate": 5.131811387416769e-06, + "loss": 4.1226, + "step": 57345 + }, + { + "epoch": 3.8965892104905557, + "grad_norm": 0.22921918332576752, + "learning_rate": 5.131386737328442e-06, + "loss": 4.1067, + "step": 57350 + }, + { + "epoch": 3.8969289305612174, + "grad_norm": 0.22943130135536194, + "learning_rate": 5.130962087240114e-06, + "loss": 3.991, + "step": 57355 + }, + { + "epoch": 3.8972686506318794, + "grad_norm": 0.2791573703289032, + "learning_rate": 5.130537437151787e-06, + "loss": 3.999, + "step": 57360 + }, + { + "epoch": 3.897608370702541, + "grad_norm": 0.2647800147533417, + "learning_rate": 5.130112787063461e-06, + "loss": 4.1333, + "step": 57365 + }, + { + "epoch": 3.8979480907732027, + "grad_norm": 0.2813297212123871, + "learning_rate": 5.129688136975133e-06, + "loss": 3.99, + "step": 57370 + }, + { + "epoch": 3.8982878108438648, + "grad_norm": 0.31040555238723755, + "learning_rate": 5.1292634868868054e-06, + "loss": 4.1014, + "step": 57375 + }, + { + "epoch": 3.8986275309145264, + "grad_norm": 0.3897123634815216, + "learning_rate": 5.128838836798479e-06, + "loss": 4.166, + "step": 57380 + }, + { + "epoch": 3.898967250985188, + "grad_norm": 0.24053731560707092, + "learning_rate": 5.128414186710151e-06, + "loss": 3.8597, + "step": 57385 + }, + { + "epoch": 3.89930697105585, + "grad_norm": 0.3094630837440491, + "learning_rate": 5.127989536621824e-06, + "loss": 4.1579, + "step": 57390 + }, + { + "epoch": 3.8996466911265117, + "grad_norm": 0.3269900381565094, + "learning_rate": 5.127564886533497e-06, + "loss": 3.812, + "step": 57395 + }, + { + "epoch": 3.8999864111971734, + "grad_norm": 0.34813758730888367, + "learning_rate": 5.1271402364451694e-06, + "loss": 3.8962, + "step": 57400 + }, + { + "epoch": 3.9003261312678354, + "grad_norm": 0.2982504367828369, + "learning_rate": 5.126715586356842e-06, + "loss": 3.8343, + "step": 57405 + }, + { + "epoch": 3.900665851338497, + "grad_norm": 0.22172176837921143, + "learning_rate": 5.126290936268515e-06, + "loss": 3.8673, + "step": 57410 + }, + { + "epoch": 3.9010055714091587, + "grad_norm": 0.302410364151001, + "learning_rate": 5.125866286180188e-06, + "loss": 3.8488, + "step": 57415 + }, + { + "epoch": 3.901345291479821, + "grad_norm": 0.2571580410003662, + "learning_rate": 5.12544163609186e-06, + "loss": 3.914, + "step": 57420 + }, + { + "epoch": 3.9016850115504824, + "grad_norm": 0.27051517367362976, + "learning_rate": 5.1250169860035335e-06, + "loss": 3.9619, + "step": 57425 + }, + { + "epoch": 3.902024731621144, + "grad_norm": 0.24914699792861938, + "learning_rate": 5.124592335915206e-06, + "loss": 3.9268, + "step": 57430 + }, + { + "epoch": 3.902364451691806, + "grad_norm": 0.4160674214363098, + "learning_rate": 5.12416768582688e-06, + "loss": 3.9322, + "step": 57435 + }, + { + "epoch": 3.9027041717624678, + "grad_norm": 0.2093566656112671, + "learning_rate": 5.123743035738552e-06, + "loss": 3.867, + "step": 57440 + }, + { + "epoch": 3.9030438918331294, + "grad_norm": 0.28635886311531067, + "learning_rate": 5.123318385650225e-06, + "loss": 3.8594, + "step": 57445 + }, + { + "epoch": 3.9033836119037915, + "grad_norm": 0.7102174162864685, + "learning_rate": 5.122893735561898e-06, + "loss": 3.9873, + "step": 57450 + }, + { + "epoch": 3.903723331974453, + "grad_norm": 0.3723064959049225, + "learning_rate": 5.12246908547357e-06, + "loss": 4.1717, + "step": 57455 + }, + { + "epoch": 3.9040630520451147, + "grad_norm": 0.23121678829193115, + "learning_rate": 5.122044435385243e-06, + "loss": 4.3434, + "step": 57460 + }, + { + "epoch": 3.904402772115777, + "grad_norm": 0.2406996190547943, + "learning_rate": 5.121619785296916e-06, + "loss": 3.8311, + "step": 57465 + }, + { + "epoch": 3.9047424921864384, + "grad_norm": 0.344614714384079, + "learning_rate": 5.121195135208589e-06, + "loss": 4.0084, + "step": 57470 + }, + { + "epoch": 3.9050822122571, + "grad_norm": 0.263776034116745, + "learning_rate": 5.1207704851202615e-06, + "loss": 3.927, + "step": 57475 + }, + { + "epoch": 3.905421932327762, + "grad_norm": 0.2508595287799835, + "learning_rate": 5.120345835031934e-06, + "loss": 4.0099, + "step": 57480 + }, + { + "epoch": 3.9057616523984238, + "grad_norm": 0.42762941122055054, + "learning_rate": 5.119921184943607e-06, + "loss": 3.7378, + "step": 57485 + }, + { + "epoch": 3.9061013724690854, + "grad_norm": 0.5429128408432007, + "learning_rate": 5.119496534855279e-06, + "loss": 3.8244, + "step": 57490 + }, + { + "epoch": 3.9064410925397475, + "grad_norm": 0.307182639837265, + "learning_rate": 5.119071884766953e-06, + "loss": 4.0452, + "step": 57495 + }, + { + "epoch": 3.906780812610409, + "grad_norm": 0.23782113194465637, + "learning_rate": 5.1186472346786255e-06, + "loss": 3.7997, + "step": 57500 + }, + { + "epoch": 3.9071205326810707, + "grad_norm": 0.21723158657550812, + "learning_rate": 5.118222584590297e-06, + "loss": 4.2234, + "step": 57505 + }, + { + "epoch": 3.907460252751733, + "grad_norm": 0.22744525969028473, + "learning_rate": 5.117797934501971e-06, + "loss": 3.9991, + "step": 57510 + }, + { + "epoch": 3.9077999728223944, + "grad_norm": 0.27450674772262573, + "learning_rate": 5.117373284413644e-06, + "loss": 4.0153, + "step": 57515 + }, + { + "epoch": 3.908139692893056, + "grad_norm": 0.3690650463104248, + "learning_rate": 5.116948634325316e-06, + "loss": 3.9866, + "step": 57520 + }, + { + "epoch": 3.908479412963718, + "grad_norm": 0.45183733105659485, + "learning_rate": 5.1165239842369895e-06, + "loss": 3.6592, + "step": 57525 + }, + { + "epoch": 3.90881913303438, + "grad_norm": 0.31143102049827576, + "learning_rate": 5.116099334148662e-06, + "loss": 3.8289, + "step": 57530 + }, + { + "epoch": 3.9091588531050414, + "grad_norm": 0.3227499723434448, + "learning_rate": 5.115674684060334e-06, + "loss": 3.9878, + "step": 57535 + }, + { + "epoch": 3.9094985731757035, + "grad_norm": 0.26769834756851196, + "learning_rate": 5.115250033972008e-06, + "loss": 4.0053, + "step": 57540 + }, + { + "epoch": 3.909838293246365, + "grad_norm": 0.2430030107498169, + "learning_rate": 5.114825383883681e-06, + "loss": 3.8868, + "step": 57545 + }, + { + "epoch": 3.9101780133170267, + "grad_norm": 0.5120116472244263, + "learning_rate": 5.114400733795353e-06, + "loss": 4.0808, + "step": 57550 + }, + { + "epoch": 3.910517733387689, + "grad_norm": 0.41394150257110596, + "learning_rate": 5.113976083707026e-06, + "loss": 3.8625, + "step": 57555 + }, + { + "epoch": 3.9108574534583505, + "grad_norm": 0.3501259684562683, + "learning_rate": 5.113551433618699e-06, + "loss": 4.007, + "step": 57560 + }, + { + "epoch": 3.911197173529012, + "grad_norm": 0.23419730365276337, + "learning_rate": 5.113126783530371e-06, + "loss": 3.9377, + "step": 57565 + }, + { + "epoch": 3.9115368935996737, + "grad_norm": 0.2174137830734253, + "learning_rate": 5.112702133442045e-06, + "loss": 3.8932, + "step": 57570 + }, + { + "epoch": 3.911876613670336, + "grad_norm": 0.5515187382698059, + "learning_rate": 5.112277483353717e-06, + "loss": 3.9567, + "step": 57575 + }, + { + "epoch": 3.9122163337409974, + "grad_norm": 0.21826709806919098, + "learning_rate": 5.1118528332653894e-06, + "loss": 3.8931, + "step": 57580 + }, + { + "epoch": 3.912556053811659, + "grad_norm": 0.2767323851585388, + "learning_rate": 5.111428183177063e-06, + "loss": 3.9087, + "step": 57585 + }, + { + "epoch": 3.912895773882321, + "grad_norm": 0.364846408367157, + "learning_rate": 5.111003533088735e-06, + "loss": 4.1079, + "step": 57590 + }, + { + "epoch": 3.9132354939529828, + "grad_norm": 0.3642374277114868, + "learning_rate": 5.110578883000408e-06, + "loss": 4.0371, + "step": 57595 + }, + { + "epoch": 3.9135752140236444, + "grad_norm": 0.2649478316307068, + "learning_rate": 5.1101542329120815e-06, + "loss": 3.9837, + "step": 57600 + }, + { + "epoch": 3.913914934094306, + "grad_norm": 0.26038506627082825, + "learning_rate": 5.1097295828237534e-06, + "loss": 3.9341, + "step": 57605 + }, + { + "epoch": 3.914254654164968, + "grad_norm": 0.284596711397171, + "learning_rate": 5.109304932735426e-06, + "loss": 3.9072, + "step": 57610 + }, + { + "epoch": 3.9145943742356297, + "grad_norm": 0.34939390420913696, + "learning_rate": 5.1088802826471e-06, + "loss": 4.0586, + "step": 57615 + }, + { + "epoch": 3.9149340943062914, + "grad_norm": 0.2923862636089325, + "learning_rate": 5.108455632558772e-06, + "loss": 4.1051, + "step": 57620 + }, + { + "epoch": 3.9152738143769534, + "grad_norm": 0.23234158754348755, + "learning_rate": 5.108030982470445e-06, + "loss": 3.9474, + "step": 57625 + }, + { + "epoch": 3.915613534447615, + "grad_norm": 0.5782677531242371, + "learning_rate": 5.107606332382118e-06, + "loss": 3.9495, + "step": 57630 + }, + { + "epoch": 3.9159532545182767, + "grad_norm": 0.26067623496055603, + "learning_rate": 5.10718168229379e-06, + "loss": 4.0166, + "step": 57635 + }, + { + "epoch": 3.9162929745889388, + "grad_norm": 0.2799428701400757, + "learning_rate": 5.106757032205463e-06, + "loss": 4.0435, + "step": 57640 + }, + { + "epoch": 3.9166326946596004, + "grad_norm": 0.24058476090431213, + "learning_rate": 5.106332382117136e-06, + "loss": 4.0832, + "step": 57645 + }, + { + "epoch": 3.916972414730262, + "grad_norm": 0.2589353621006012, + "learning_rate": 5.105907732028809e-06, + "loss": 3.8575, + "step": 57650 + }, + { + "epoch": 3.917312134800924, + "grad_norm": 0.274215430021286, + "learning_rate": 5.1054830819404814e-06, + "loss": 3.919, + "step": 57655 + }, + { + "epoch": 3.9176518548715857, + "grad_norm": 0.30878305435180664, + "learning_rate": 5.105058431852154e-06, + "loss": 3.793, + "step": 57660 + }, + { + "epoch": 3.9179915749422474, + "grad_norm": 0.2031242698431015, + "learning_rate": 5.104633781763827e-06, + "loss": 4.0044, + "step": 57665 + }, + { + "epoch": 3.9183312950129094, + "grad_norm": 0.34655624628067017, + "learning_rate": 5.104209131675499e-06, + "loss": 4.0331, + "step": 57670 + }, + { + "epoch": 3.918671015083571, + "grad_norm": 0.2265501469373703, + "learning_rate": 5.103784481587173e-06, + "loss": 4.1344, + "step": 57675 + }, + { + "epoch": 3.9190107351542327, + "grad_norm": 0.20555037260055542, + "learning_rate": 5.1033598314988454e-06, + "loss": 3.8783, + "step": 57680 + }, + { + "epoch": 3.919350455224895, + "grad_norm": 0.24627287685871124, + "learning_rate": 5.102935181410517e-06, + "loss": 4.0134, + "step": 57685 + }, + { + "epoch": 3.9196901752955564, + "grad_norm": 0.27590757608413696, + "learning_rate": 5.102510531322191e-06, + "loss": 3.9589, + "step": 57690 + }, + { + "epoch": 3.920029895366218, + "grad_norm": 0.20743513107299805, + "learning_rate": 5.102085881233864e-06, + "loss": 4.186, + "step": 57695 + }, + { + "epoch": 3.92036961543688, + "grad_norm": 0.28852763772010803, + "learning_rate": 5.101661231145536e-06, + "loss": 4.1566, + "step": 57700 + }, + { + "epoch": 3.9207093355075417, + "grad_norm": 0.30288001894950867, + "learning_rate": 5.1012365810572094e-06, + "loss": 4.2727, + "step": 57705 + }, + { + "epoch": 3.9210490555782034, + "grad_norm": 0.5716973543167114, + "learning_rate": 5.100811930968882e-06, + "loss": 4.0648, + "step": 57710 + }, + { + "epoch": 3.9213887756488655, + "grad_norm": 0.4522344172000885, + "learning_rate": 5.100387280880554e-06, + "loss": 3.7957, + "step": 57715 + }, + { + "epoch": 3.921728495719527, + "grad_norm": 0.3084264397621155, + "learning_rate": 5.099962630792228e-06, + "loss": 3.9325, + "step": 57720 + }, + { + "epoch": 3.9220682157901887, + "grad_norm": 0.3232453167438507, + "learning_rate": 5.099537980703901e-06, + "loss": 3.55, + "step": 57725 + }, + { + "epoch": 3.922407935860851, + "grad_norm": 0.24959033727645874, + "learning_rate": 5.099113330615573e-06, + "loss": 3.925, + "step": 57730 + }, + { + "epoch": 3.9227476559315124, + "grad_norm": 0.23004040122032166, + "learning_rate": 5.098688680527246e-06, + "loss": 4.1286, + "step": 57735 + }, + { + "epoch": 3.923087376002174, + "grad_norm": 0.29527243971824646, + "learning_rate": 5.098264030438918e-06, + "loss": 3.8659, + "step": 57740 + }, + { + "epoch": 3.923427096072836, + "grad_norm": 0.30012401938438416, + "learning_rate": 5.097839380350591e-06, + "loss": 3.9506, + "step": 57745 + }, + { + "epoch": 3.9237668161434978, + "grad_norm": 0.2867148220539093, + "learning_rate": 5.097414730262265e-06, + "loss": 4.119, + "step": 57750 + }, + { + "epoch": 3.9241065362141594, + "grad_norm": 0.4306808412075043, + "learning_rate": 5.096990080173937e-06, + "loss": 3.8908, + "step": 57755 + }, + { + "epoch": 3.9244462562848215, + "grad_norm": 0.24244210124015808, + "learning_rate": 5.096565430085609e-06, + "loss": 3.916, + "step": 57760 + }, + { + "epoch": 3.924785976355483, + "grad_norm": 0.21228522062301636, + "learning_rate": 5.096140779997283e-06, + "loss": 4.0168, + "step": 57765 + }, + { + "epoch": 3.9251256964261447, + "grad_norm": 0.2788419723510742, + "learning_rate": 5.095716129908955e-06, + "loss": 3.8781, + "step": 57770 + }, + { + "epoch": 3.925465416496807, + "grad_norm": 0.25811389088630676, + "learning_rate": 5.095291479820629e-06, + "loss": 4.108, + "step": 57775 + }, + { + "epoch": 3.9258051365674684, + "grad_norm": 0.3433159589767456, + "learning_rate": 5.0948668297323015e-06, + "loss": 3.6543, + "step": 57780 + }, + { + "epoch": 3.92614485663813, + "grad_norm": 0.294278085231781, + "learning_rate": 5.094442179643973e-06, + "loss": 4.0579, + "step": 57785 + }, + { + "epoch": 3.926484576708792, + "grad_norm": 0.21546858549118042, + "learning_rate": 5.094017529555647e-06, + "loss": 4.0855, + "step": 57790 + }, + { + "epoch": 3.9268242967794538, + "grad_norm": 0.32663607597351074, + "learning_rate": 5.09359287946732e-06, + "loss": 3.6094, + "step": 57795 + }, + { + "epoch": 3.9271640168501154, + "grad_norm": 0.35425856709480286, + "learning_rate": 5.093168229378992e-06, + "loss": 4.0991, + "step": 57800 + }, + { + "epoch": 3.9275037369207775, + "grad_norm": 0.6653276681900024, + "learning_rate": 5.0927435792906655e-06, + "loss": 4.0586, + "step": 57805 + }, + { + "epoch": 3.927843456991439, + "grad_norm": 0.3536148965358734, + "learning_rate": 5.092318929202337e-06, + "loss": 4.0443, + "step": 57810 + }, + { + "epoch": 3.9281831770621007, + "grad_norm": 0.2901611030101776, + "learning_rate": 5.09189427911401e-06, + "loss": 3.8258, + "step": 57815 + }, + { + "epoch": 3.928522897132763, + "grad_norm": 0.3867815434932709, + "learning_rate": 5.091469629025684e-06, + "loss": 3.7439, + "step": 57820 + }, + { + "epoch": 3.9288626172034244, + "grad_norm": 0.34712058305740356, + "learning_rate": 5.091044978937356e-06, + "loss": 4.1344, + "step": 57825 + }, + { + "epoch": 3.929202337274086, + "grad_norm": 0.24046820402145386, + "learning_rate": 5.090620328849029e-06, + "loss": 3.9606, + "step": 57830 + }, + { + "epoch": 3.929542057344748, + "grad_norm": 0.3633612394332886, + "learning_rate": 5.090195678760702e-06, + "loss": 3.8558, + "step": 57835 + }, + { + "epoch": 3.92988177741541, + "grad_norm": 0.25957340002059937, + "learning_rate": 5.089771028672374e-06, + "loss": 3.8196, + "step": 57840 + }, + { + "epoch": 3.9302214974860714, + "grad_norm": 0.29832082986831665, + "learning_rate": 5.089346378584047e-06, + "loss": 3.9776, + "step": 57845 + }, + { + "epoch": 3.9305612175567335, + "grad_norm": 0.2540123462677002, + "learning_rate": 5.088921728495721e-06, + "loss": 3.8882, + "step": 57850 + }, + { + "epoch": 3.930900937627395, + "grad_norm": 0.3077661097049713, + "learning_rate": 5.088497078407393e-06, + "loss": 4.1353, + "step": 57855 + }, + { + "epoch": 3.9312406576980568, + "grad_norm": 0.4154829680919647, + "learning_rate": 5.0880724283190654e-06, + "loss": 4.0336, + "step": 57860 + }, + { + "epoch": 3.931580377768719, + "grad_norm": 0.25680482387542725, + "learning_rate": 5.087647778230739e-06, + "loss": 3.877, + "step": 57865 + }, + { + "epoch": 3.9319200978393805, + "grad_norm": 0.29273322224617004, + "learning_rate": 5.087223128142411e-06, + "loss": 3.8938, + "step": 57870 + }, + { + "epoch": 3.932259817910042, + "grad_norm": 0.2822769284248352, + "learning_rate": 5.086798478054084e-06, + "loss": 4.1077, + "step": 57875 + }, + { + "epoch": 3.932599537980704, + "grad_norm": 0.25847116112709045, + "learning_rate": 5.0863738279657575e-06, + "loss": 3.9081, + "step": 57880 + }, + { + "epoch": 3.932939258051366, + "grad_norm": 0.28054338693618774, + "learning_rate": 5.0859491778774294e-06, + "loss": 4.1062, + "step": 57885 + }, + { + "epoch": 3.9332789781220274, + "grad_norm": 0.232683464884758, + "learning_rate": 5.085524527789102e-06, + "loss": 4.0553, + "step": 57890 + }, + { + "epoch": 3.9336186981926895, + "grad_norm": 0.23438303172588348, + "learning_rate": 5.085099877700775e-06, + "loss": 3.965, + "step": 57895 + }, + { + "epoch": 3.933958418263351, + "grad_norm": 0.22040893137454987, + "learning_rate": 5.084675227612448e-06, + "loss": 3.841, + "step": 57900 + }, + { + "epoch": 3.9342981383340128, + "grad_norm": 0.34526941180229187, + "learning_rate": 5.084250577524121e-06, + "loss": 3.955, + "step": 57905 + }, + { + "epoch": 3.9346378584046744, + "grad_norm": 0.3193441927433014, + "learning_rate": 5.0838259274357934e-06, + "loss": 4.1398, + "step": 57910 + }, + { + "epoch": 3.9349775784753365, + "grad_norm": 0.3558662235736847, + "learning_rate": 5.083401277347466e-06, + "loss": 4.0602, + "step": 57915 + }, + { + "epoch": 3.935317298545998, + "grad_norm": 0.4007507264614105, + "learning_rate": 5.082976627259138e-06, + "loss": 4.0497, + "step": 57920 + }, + { + "epoch": 3.9356570186166597, + "grad_norm": 0.3910283148288727, + "learning_rate": 5.082551977170812e-06, + "loss": 4.1874, + "step": 57925 + }, + { + "epoch": 3.935996738687322, + "grad_norm": 0.3665596842765808, + "learning_rate": 5.082127327082485e-06, + "loss": 3.7922, + "step": 57930 + }, + { + "epoch": 3.9363364587579834, + "grad_norm": 0.33308926224708557, + "learning_rate": 5.081702676994157e-06, + "loss": 4.131, + "step": 57935 + }, + { + "epoch": 3.936676178828645, + "grad_norm": 0.33435794711112976, + "learning_rate": 5.08127802690583e-06, + "loss": 4.0392, + "step": 57940 + }, + { + "epoch": 3.9370158988993067, + "grad_norm": 0.3299930989742279, + "learning_rate": 5.080853376817503e-06, + "loss": 3.9002, + "step": 57945 + }, + { + "epoch": 3.9373556189699688, + "grad_norm": 0.3301013708114624, + "learning_rate": 5.080428726729175e-06, + "loss": 4.0834, + "step": 57950 + }, + { + "epoch": 3.9376953390406304, + "grad_norm": 0.2687692940235138, + "learning_rate": 5.080004076640849e-06, + "loss": 4.1623, + "step": 57955 + }, + { + "epoch": 3.938035059111292, + "grad_norm": 0.2393343448638916, + "learning_rate": 5.0795794265525214e-06, + "loss": 4.0166, + "step": 57960 + }, + { + "epoch": 3.938374779181954, + "grad_norm": 0.26372256875038147, + "learning_rate": 5.079239706481859e-06, + "loss": 4.0279, + "step": 57965 + }, + { + "epoch": 3.9387144992526157, + "grad_norm": 0.5110945105552673, + "learning_rate": 5.078815056393532e-06, + "loss": 4.2098, + "step": 57970 + }, + { + "epoch": 3.9390542193232774, + "grad_norm": 0.3466745615005493, + "learning_rate": 5.078390406305205e-06, + "loss": 3.9727, + "step": 57975 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.3731937110424042, + "learning_rate": 5.077965756216878e-06, + "loss": 3.8657, + "step": 57980 + }, + { + "epoch": 3.939733659464601, + "grad_norm": 0.30311134457588196, + "learning_rate": 5.077541106128551e-06, + "loss": 3.9504, + "step": 57985 + }, + { + "epoch": 3.9400733795352627, + "grad_norm": 0.2911999523639679, + "learning_rate": 5.077116456040223e-06, + "loss": 3.8721, + "step": 57990 + }, + { + "epoch": 3.940413099605925, + "grad_norm": 0.27982446551322937, + "learning_rate": 5.076691805951896e-06, + "loss": 3.8424, + "step": 57995 + }, + { + "epoch": 3.9407528196765864, + "grad_norm": 0.4406832754611969, + "learning_rate": 5.076267155863569e-06, + "loss": 3.8474, + "step": 58000 + }, + { + "epoch": 3.941092539747248, + "grad_norm": 0.6799582839012146, + "learning_rate": 5.075842505775241e-06, + "loss": 3.6651, + "step": 58005 + }, + { + "epoch": 3.94143225981791, + "grad_norm": 0.2562040388584137, + "learning_rate": 5.075417855686915e-06, + "loss": 3.9047, + "step": 58010 + }, + { + "epoch": 3.9417719798885718, + "grad_norm": 0.44013330340385437, + "learning_rate": 5.0749932055985875e-06, + "loss": 3.9906, + "step": 58015 + }, + { + "epoch": 3.9421116999592334, + "grad_norm": 0.2444671392440796, + "learning_rate": 5.0745685555102595e-06, + "loss": 3.8713, + "step": 58020 + }, + { + "epoch": 3.9424514200298955, + "grad_norm": 0.35691866278648376, + "learning_rate": 5.074143905421933e-06, + "loss": 3.7905, + "step": 58025 + }, + { + "epoch": 3.942791140100557, + "grad_norm": 0.24391581118106842, + "learning_rate": 5.073719255333606e-06, + "loss": 3.991, + "step": 58030 + }, + { + "epoch": 3.9431308601712187, + "grad_norm": 0.2085772305727005, + "learning_rate": 5.073294605245278e-06, + "loss": 3.9277, + "step": 58035 + }, + { + "epoch": 3.943470580241881, + "grad_norm": 0.37280553579330444, + "learning_rate": 5.0728699551569515e-06, + "loss": 3.9154, + "step": 58040 + }, + { + "epoch": 3.9438103003125424, + "grad_norm": 0.21575163304805756, + "learning_rate": 5.072445305068624e-06, + "loss": 3.7964, + "step": 58045 + }, + { + "epoch": 3.944150020383204, + "grad_norm": 0.2701014280319214, + "learning_rate": 5.072020654980296e-06, + "loss": 3.9719, + "step": 58050 + }, + { + "epoch": 3.944489740453866, + "grad_norm": 0.22757704555988312, + "learning_rate": 5.07159600489197e-06, + "loss": 4.1065, + "step": 58055 + }, + { + "epoch": 3.9448294605245278, + "grad_norm": 0.2609398365020752, + "learning_rate": 5.071171354803643e-06, + "loss": 4.0652, + "step": 58060 + }, + { + "epoch": 3.9451691805951894, + "grad_norm": 0.3854295015335083, + "learning_rate": 5.070746704715315e-06, + "loss": 3.9261, + "step": 58065 + }, + { + "epoch": 3.9455089006658515, + "grad_norm": 0.3832912743091583, + "learning_rate": 5.070322054626988e-06, + "loss": 3.9453, + "step": 58070 + }, + { + "epoch": 3.945848620736513, + "grad_norm": 0.3176692724227905, + "learning_rate": 5.06989740453866e-06, + "loss": 3.9995, + "step": 58075 + }, + { + "epoch": 3.9461883408071747, + "grad_norm": 0.19898195564746857, + "learning_rate": 5.069472754450333e-06, + "loss": 4.0267, + "step": 58080 + }, + { + "epoch": 3.946528060877837, + "grad_norm": 0.23372545838356018, + "learning_rate": 5.069048104362007e-06, + "loss": 3.9321, + "step": 58085 + }, + { + "epoch": 3.9468677809484984, + "grad_norm": 0.3025113344192505, + "learning_rate": 5.068623454273679e-06, + "loss": 3.9581, + "step": 58090 + }, + { + "epoch": 3.94720750101916, + "grad_norm": 0.23244702816009521, + "learning_rate": 5.0681988041853515e-06, + "loss": 4.0048, + "step": 58095 + }, + { + "epoch": 3.947547221089822, + "grad_norm": 0.2800089120864868, + "learning_rate": 5.067774154097025e-06, + "loss": 4.1546, + "step": 58100 + }, + { + "epoch": 3.9478869411604838, + "grad_norm": 0.4009687006473541, + "learning_rate": 5.067349504008697e-06, + "loss": 4.2384, + "step": 58105 + }, + { + "epoch": 3.9482266612311454, + "grad_norm": 0.3440391421318054, + "learning_rate": 5.06692485392037e-06, + "loss": 3.7949, + "step": 58110 + }, + { + "epoch": 3.9485663813018075, + "grad_norm": 0.37678247690200806, + "learning_rate": 5.0665002038320435e-06, + "loss": 4.0287, + "step": 58115 + }, + { + "epoch": 3.948906101372469, + "grad_norm": 0.428680419921875, + "learning_rate": 5.0660755537437155e-06, + "loss": 3.9035, + "step": 58120 + }, + { + "epoch": 3.9492458214431307, + "grad_norm": 0.29226255416870117, + "learning_rate": 5.065650903655388e-06, + "loss": 3.7558, + "step": 58125 + }, + { + "epoch": 3.949585541513793, + "grad_norm": 0.2797650396823883, + "learning_rate": 5.065226253567062e-06, + "loss": 4.1952, + "step": 58130 + }, + { + "epoch": 3.9499252615844545, + "grad_norm": 0.2569884657859802, + "learning_rate": 5.064801603478734e-06, + "loss": 3.9294, + "step": 58135 + }, + { + "epoch": 3.950264981655116, + "grad_norm": 0.2569122314453125, + "learning_rate": 5.064376953390407e-06, + "loss": 3.9869, + "step": 58140 + }, + { + "epoch": 3.950604701725778, + "grad_norm": 0.4154261648654938, + "learning_rate": 5.0639523033020795e-06, + "loss": 3.9454, + "step": 58145 + }, + { + "epoch": 3.95094442179644, + "grad_norm": 0.28963232040405273, + "learning_rate": 5.063527653213752e-06, + "loss": 4.0861, + "step": 58150 + }, + { + "epoch": 3.9512841418671014, + "grad_norm": 0.25232940912246704, + "learning_rate": 5.063103003125425e-06, + "loss": 4.1827, + "step": 58155 + }, + { + "epoch": 3.9516238619377635, + "grad_norm": 0.2682448625564575, + "learning_rate": 5.062678353037098e-06, + "loss": 4.0979, + "step": 58160 + }, + { + "epoch": 3.951963582008425, + "grad_norm": 0.41656580567359924, + "learning_rate": 5.062253702948771e-06, + "loss": 4.2172, + "step": 58165 + }, + { + "epoch": 3.9523033020790868, + "grad_norm": 0.3218127489089966, + "learning_rate": 5.061829052860443e-06, + "loss": 3.9708, + "step": 58170 + }, + { + "epoch": 3.952643022149749, + "grad_norm": 0.2680538594722748, + "learning_rate": 5.061404402772116e-06, + "loss": 3.822, + "step": 58175 + }, + { + "epoch": 3.9529827422204105, + "grad_norm": 0.2946203351020813, + "learning_rate": 5.060979752683789e-06, + "loss": 4.0027, + "step": 58180 + }, + { + "epoch": 3.953322462291072, + "grad_norm": 0.3006676435470581, + "learning_rate": 5.060555102595461e-06, + "loss": 3.9427, + "step": 58185 + }, + { + "epoch": 3.953662182361734, + "grad_norm": 0.3171148896217346, + "learning_rate": 5.060130452507135e-06, + "loss": 3.9053, + "step": 58190 + }, + { + "epoch": 3.954001902432396, + "grad_norm": 0.2794609069824219, + "learning_rate": 5.0597058024188075e-06, + "loss": 3.8692, + "step": 58195 + }, + { + "epoch": 3.9543416225030574, + "grad_norm": 0.23809342086315155, + "learning_rate": 5.0592811523304795e-06, + "loss": 3.7043, + "step": 58200 + }, + { + "epoch": 3.9546813425737195, + "grad_norm": 0.4814680516719818, + "learning_rate": 5.058856502242153e-06, + "loss": 4.1112, + "step": 58205 + }, + { + "epoch": 3.955021062644381, + "grad_norm": 0.3076154589653015, + "learning_rate": 5.058431852153826e-06, + "loss": 3.7357, + "step": 58210 + }, + { + "epoch": 3.9553607827150428, + "grad_norm": 0.28479108214378357, + "learning_rate": 5.058007202065498e-06, + "loss": 3.9105, + "step": 58215 + }, + { + "epoch": 3.955700502785705, + "grad_norm": 0.38190653920173645, + "learning_rate": 5.0575825519771715e-06, + "loss": 4.2604, + "step": 58220 + }, + { + "epoch": 3.9560402228563665, + "grad_norm": 0.30273616313934326, + "learning_rate": 5.057157901888844e-06, + "loss": 3.8708, + "step": 58225 + }, + { + "epoch": 3.956379942927028, + "grad_norm": 0.32773736119270325, + "learning_rate": 5.056733251800516e-06, + "loss": 3.9601, + "step": 58230 + }, + { + "epoch": 3.95671966299769, + "grad_norm": 0.3179551362991333, + "learning_rate": 5.05630860171219e-06, + "loss": 4.1167, + "step": 58235 + }, + { + "epoch": 3.957059383068352, + "grad_norm": 0.2748391926288605, + "learning_rate": 5.055883951623862e-06, + "loss": 3.9924, + "step": 58240 + }, + { + "epoch": 3.9573991031390134, + "grad_norm": 0.3347257971763611, + "learning_rate": 5.055459301535535e-06, + "loss": 4.0437, + "step": 58245 + }, + { + "epoch": 3.957738823209675, + "grad_norm": 0.3461383283138275, + "learning_rate": 5.055034651447208e-06, + "loss": 3.8551, + "step": 58250 + }, + { + "epoch": 3.958078543280337, + "grad_norm": 0.35238951444625854, + "learning_rate": 5.05461000135888e-06, + "loss": 3.7131, + "step": 58255 + }, + { + "epoch": 3.958418263350999, + "grad_norm": 0.3192906677722931, + "learning_rate": 5.054185351270553e-06, + "loss": 3.9109, + "step": 58260 + }, + { + "epoch": 3.9587579834216604, + "grad_norm": 0.3585376739501953, + "learning_rate": 5.053760701182227e-06, + "loss": 4.1169, + "step": 58265 + }, + { + "epoch": 3.9590977034923225, + "grad_norm": 0.2893425524234772, + "learning_rate": 5.053336051093899e-06, + "loss": 4.0209, + "step": 58270 + }, + { + "epoch": 3.959437423562984, + "grad_norm": 0.310381680727005, + "learning_rate": 5.0529114010055715e-06, + "loss": 3.848, + "step": 58275 + }, + { + "epoch": 3.9597771436336457, + "grad_norm": 0.23836494982242584, + "learning_rate": 5.052486750917245e-06, + "loss": 3.977, + "step": 58280 + }, + { + "epoch": 3.9601168637043074, + "grad_norm": 0.2603084146976471, + "learning_rate": 5.052062100828917e-06, + "loss": 4.1101, + "step": 58285 + }, + { + "epoch": 3.9604565837749695, + "grad_norm": 0.24512654542922974, + "learning_rate": 5.05163745074059e-06, + "loss": 4.2028, + "step": 58290 + }, + { + "epoch": 3.960796303845631, + "grad_norm": 0.22990426421165466, + "learning_rate": 5.0512128006522635e-06, + "loss": 4.0237, + "step": 58295 + }, + { + "epoch": 3.9611360239162927, + "grad_norm": 0.37541162967681885, + "learning_rate": 5.0507881505639355e-06, + "loss": 3.9725, + "step": 58300 + }, + { + "epoch": 3.961475743986955, + "grad_norm": 0.2819334864616394, + "learning_rate": 5.050363500475608e-06, + "loss": 4.0602, + "step": 58305 + }, + { + "epoch": 3.9618154640576164, + "grad_norm": 0.2433515042066574, + "learning_rate": 5.049938850387281e-06, + "loss": 3.9124, + "step": 58310 + }, + { + "epoch": 3.962155184128278, + "grad_norm": 0.27897870540618896, + "learning_rate": 5.049514200298954e-06, + "loss": 4.3352, + "step": 58315 + }, + { + "epoch": 3.96249490419894, + "grad_norm": 0.8289919495582581, + "learning_rate": 5.0490895502106275e-06, + "loss": 4.0228, + "step": 58320 + }, + { + "epoch": 3.9628346242696018, + "grad_norm": 0.2913486957550049, + "learning_rate": 5.0486649001222995e-06, + "loss": 3.891, + "step": 58325 + }, + { + "epoch": 3.9631743443402634, + "grad_norm": 0.3963407874107361, + "learning_rate": 5.048240250033972e-06, + "loss": 3.7392, + "step": 58330 + }, + { + "epoch": 3.9635140644109255, + "grad_norm": 0.24396933615207672, + "learning_rate": 5.047815599945646e-06, + "loss": 3.9901, + "step": 58335 + }, + { + "epoch": 3.963853784481587, + "grad_norm": 0.2112450897693634, + "learning_rate": 5.047390949857318e-06, + "loss": 4.003, + "step": 58340 + }, + { + "epoch": 3.9641935045522487, + "grad_norm": 0.3081175982952118, + "learning_rate": 5.046966299768991e-06, + "loss": 3.8665, + "step": 58345 + }, + { + "epoch": 3.964533224622911, + "grad_norm": 0.26346543431282043, + "learning_rate": 5.046541649680664e-06, + "loss": 4.2156, + "step": 58350 + }, + { + "epoch": 3.9648729446935724, + "grad_norm": 0.4820170998573303, + "learning_rate": 5.046116999592336e-06, + "loss": 3.7116, + "step": 58355 + }, + { + "epoch": 3.965212664764234, + "grad_norm": 0.31690627336502075, + "learning_rate": 5.045692349504009e-06, + "loss": 3.8358, + "step": 58360 + }, + { + "epoch": 3.965552384834896, + "grad_norm": 0.26780515909194946, + "learning_rate": 5.045267699415683e-06, + "loss": 4.214, + "step": 58365 + }, + { + "epoch": 3.9658921049055578, + "grad_norm": 0.392058402299881, + "learning_rate": 5.044843049327355e-06, + "loss": 3.9259, + "step": 58370 + }, + { + "epoch": 3.9662318249762194, + "grad_norm": 0.2669997811317444, + "learning_rate": 5.0444183992390275e-06, + "loss": 3.9131, + "step": 58375 + }, + { + "epoch": 3.9665715450468815, + "grad_norm": 0.26724737882614136, + "learning_rate": 5.043993749150701e-06, + "loss": 3.9918, + "step": 58380 + }, + { + "epoch": 3.966911265117543, + "grad_norm": 0.2420327365398407, + "learning_rate": 5.043569099062373e-06, + "loss": 4.119, + "step": 58385 + }, + { + "epoch": 3.9672509851882047, + "grad_norm": 0.2743793725967407, + "learning_rate": 5.043144448974046e-06, + "loss": 4.1505, + "step": 58390 + }, + { + "epoch": 3.967590705258867, + "grad_norm": 0.2819851040840149, + "learning_rate": 5.042719798885719e-06, + "loss": 4.0198, + "step": 58395 + }, + { + "epoch": 3.9679304253295284, + "grad_norm": 0.4438953995704651, + "learning_rate": 5.0422951487973915e-06, + "loss": 3.8275, + "step": 58400 + }, + { + "epoch": 3.96827014540019, + "grad_norm": 0.20595765113830566, + "learning_rate": 5.041870498709064e-06, + "loss": 3.8795, + "step": 58405 + }, + { + "epoch": 3.968609865470852, + "grad_norm": 0.2273297756910324, + "learning_rate": 5.041445848620737e-06, + "loss": 3.9481, + "step": 58410 + }, + { + "epoch": 3.968949585541514, + "grad_norm": 0.28739240765571594, + "learning_rate": 5.04102119853241e-06, + "loss": 3.8901, + "step": 58415 + }, + { + "epoch": 3.9692893056121754, + "grad_norm": 0.36695149540901184, + "learning_rate": 5.040596548444082e-06, + "loss": 3.8045, + "step": 58420 + }, + { + "epoch": 3.9696290256828375, + "grad_norm": 0.7157492637634277, + "learning_rate": 5.0401718983557555e-06, + "loss": 4.1254, + "step": 58425 + }, + { + "epoch": 3.969968745753499, + "grad_norm": 0.37680432200431824, + "learning_rate": 5.039747248267428e-06, + "loss": 4.274, + "step": 58430 + }, + { + "epoch": 3.9703084658241607, + "grad_norm": 0.27810028195381165, + "learning_rate": 5.0393225981791e-06, + "loss": 3.7312, + "step": 58435 + }, + { + "epoch": 3.970648185894823, + "grad_norm": 0.38653188943862915, + "learning_rate": 5.038897948090774e-06, + "loss": 4.0085, + "step": 58440 + }, + { + "epoch": 3.9709879059654845, + "grad_norm": 0.8323909640312195, + "learning_rate": 5.038473298002447e-06, + "loss": 4.0383, + "step": 58445 + }, + { + "epoch": 3.971327626036146, + "grad_norm": 0.3299977481365204, + "learning_rate": 5.038048647914119e-06, + "loss": 3.8155, + "step": 58450 + }, + { + "epoch": 3.971667346106808, + "grad_norm": 0.2902021110057831, + "learning_rate": 5.037623997825792e-06, + "loss": 4.0525, + "step": 58455 + }, + { + "epoch": 3.97200706617747, + "grad_norm": 0.2774941921234131, + "learning_rate": 5.037199347737465e-06, + "loss": 4.0334, + "step": 58460 + }, + { + "epoch": 3.9723467862481314, + "grad_norm": 0.37623685598373413, + "learning_rate": 5.036774697649137e-06, + "loss": 4.0245, + "step": 58465 + }, + { + "epoch": 3.9726865063187935, + "grad_norm": 0.3542513847351074, + "learning_rate": 5.036350047560811e-06, + "loss": 3.9219, + "step": 58470 + }, + { + "epoch": 3.973026226389455, + "grad_norm": 0.24315783381462097, + "learning_rate": 5.0359253974724835e-06, + "loss": 3.8426, + "step": 58475 + }, + { + "epoch": 3.9733659464601168, + "grad_norm": 0.3553272783756256, + "learning_rate": 5.0355007473841555e-06, + "loss": 3.9992, + "step": 58480 + }, + { + "epoch": 3.973705666530779, + "grad_norm": 0.2942862808704376, + "learning_rate": 5.035076097295829e-06, + "loss": 4.0137, + "step": 58485 + }, + { + "epoch": 3.9740453866014405, + "grad_norm": 0.2112511247396469, + "learning_rate": 5.034651447207501e-06, + "loss": 4.0195, + "step": 58490 + }, + { + "epoch": 3.974385106672102, + "grad_norm": 0.27731478214263916, + "learning_rate": 5.034226797119174e-06, + "loss": 4.2664, + "step": 58495 + }, + { + "epoch": 3.974724826742764, + "grad_norm": 0.36039862036705017, + "learning_rate": 5.0338021470308475e-06, + "loss": 3.8091, + "step": 58500 + }, + { + "epoch": 3.975064546813426, + "grad_norm": 0.31130048632621765, + "learning_rate": 5.0333774969425195e-06, + "loss": 4.1772, + "step": 58505 + }, + { + "epoch": 3.9754042668840874, + "grad_norm": 0.24105072021484375, + "learning_rate": 5.032952846854192e-06, + "loss": 4.1207, + "step": 58510 + }, + { + "epoch": 3.9757439869547495, + "grad_norm": 0.37929612398147583, + "learning_rate": 5.032528196765866e-06, + "loss": 4.107, + "step": 58515 + }, + { + "epoch": 3.976083707025411, + "grad_norm": 0.33433541655540466, + "learning_rate": 5.032103546677538e-06, + "loss": 3.9334, + "step": 58520 + }, + { + "epoch": 3.9764234270960728, + "grad_norm": 0.25120124220848083, + "learning_rate": 5.031678896589211e-06, + "loss": 4.0513, + "step": 58525 + }, + { + "epoch": 3.976763147166735, + "grad_norm": 0.3174200654029846, + "learning_rate": 5.031254246500884e-06, + "loss": 3.9734, + "step": 58530 + }, + { + "epoch": 3.9771028672373965, + "grad_norm": 0.37749889492988586, + "learning_rate": 5.030829596412556e-06, + "loss": 4.1585, + "step": 58535 + }, + { + "epoch": 3.977442587308058, + "grad_norm": 0.2769172489643097, + "learning_rate": 5.030404946324229e-06, + "loss": 3.9451, + "step": 58540 + }, + { + "epoch": 3.97778230737872, + "grad_norm": 0.35892000794410706, + "learning_rate": 5.029980296235903e-06, + "loss": 3.9843, + "step": 58545 + }, + { + "epoch": 3.978122027449382, + "grad_norm": 0.2968170940876007, + "learning_rate": 5.029555646147575e-06, + "loss": 4.0413, + "step": 58550 + }, + { + "epoch": 3.9784617475200434, + "grad_norm": 0.3058816194534302, + "learning_rate": 5.0291309960592475e-06, + "loss": 4.1394, + "step": 58555 + }, + { + "epoch": 3.9788014675907055, + "grad_norm": 0.25949081778526306, + "learning_rate": 5.02870634597092e-06, + "loss": 3.8917, + "step": 58560 + }, + { + "epoch": 3.979141187661367, + "grad_norm": 0.2067592442035675, + "learning_rate": 5.028281695882593e-06, + "loss": 3.9893, + "step": 58565 + }, + { + "epoch": 3.979480907732029, + "grad_norm": 0.2890806496143341, + "learning_rate": 5.027857045794266e-06, + "loss": 4.1254, + "step": 58570 + }, + { + "epoch": 3.979820627802691, + "grad_norm": 0.38214829564094543, + "learning_rate": 5.027432395705939e-06, + "loss": 3.9801, + "step": 58575 + }, + { + "epoch": 3.9801603478733525, + "grad_norm": 0.23767316341400146, + "learning_rate": 5.0270077456176115e-06, + "loss": 3.8952, + "step": 58580 + }, + { + "epoch": 3.980500067944014, + "grad_norm": 0.2371036410331726, + "learning_rate": 5.026583095529283e-06, + "loss": 4.033, + "step": 58585 + }, + { + "epoch": 3.9808397880146758, + "grad_norm": 0.2571028470993042, + "learning_rate": 5.026158445440957e-06, + "loss": 3.8123, + "step": 58590 + }, + { + "epoch": 3.981179508085338, + "grad_norm": 0.5088363289833069, + "learning_rate": 5.02573379535263e-06, + "loss": 3.9594, + "step": 58595 + }, + { + "epoch": 3.9815192281559995, + "grad_norm": 0.24852986633777618, + "learning_rate": 5.025309145264302e-06, + "loss": 4.2705, + "step": 58600 + }, + { + "epoch": 3.981858948226661, + "grad_norm": 0.28165388107299805, + "learning_rate": 5.0248844951759755e-06, + "loss": 3.8287, + "step": 58605 + }, + { + "epoch": 3.982198668297323, + "grad_norm": 0.32244953513145447, + "learning_rate": 5.024459845087648e-06, + "loss": 4.1208, + "step": 58610 + }, + { + "epoch": 3.982538388367985, + "grad_norm": 0.25507989525794983, + "learning_rate": 5.02403519499932e-06, + "loss": 3.7836, + "step": 58615 + }, + { + "epoch": 3.9828781084386464, + "grad_norm": 0.2761240303516388, + "learning_rate": 5.023610544910994e-06, + "loss": 4.0942, + "step": 58620 + }, + { + "epoch": 3.983217828509308, + "grad_norm": 0.3392726480960846, + "learning_rate": 5.023185894822667e-06, + "loss": 3.7577, + "step": 58625 + }, + { + "epoch": 3.98355754857997, + "grad_norm": 0.20019598305225372, + "learning_rate": 5.022761244734339e-06, + "loss": 3.936, + "step": 58630 + }, + { + "epoch": 3.9838972686506318, + "grad_norm": 0.5065690279006958, + "learning_rate": 5.022336594646012e-06, + "loss": 3.8257, + "step": 58635 + }, + { + "epoch": 3.9842369887212934, + "grad_norm": 0.3495621979236603, + "learning_rate": 5.021911944557685e-06, + "loss": 3.9805, + "step": 58640 + }, + { + "epoch": 3.9845767087919555, + "grad_norm": 0.2865507900714874, + "learning_rate": 5.021487294469357e-06, + "loss": 3.878, + "step": 58645 + }, + { + "epoch": 3.984916428862617, + "grad_norm": 0.256864458322525, + "learning_rate": 5.021062644381031e-06, + "loss": 4.0125, + "step": 58650 + }, + { + "epoch": 3.9852561489332787, + "grad_norm": 0.2573469281196594, + "learning_rate": 5.020637994292703e-06, + "loss": 3.9634, + "step": 58655 + }, + { + "epoch": 3.985595869003941, + "grad_norm": 0.2325226068496704, + "learning_rate": 5.020213344204376e-06, + "loss": 3.9289, + "step": 58660 + }, + { + "epoch": 3.9859355890746024, + "grad_norm": 0.2763606309890747, + "learning_rate": 5.019788694116049e-06, + "loss": 4.0202, + "step": 58665 + }, + { + "epoch": 3.986275309145264, + "grad_norm": 0.450851172208786, + "learning_rate": 5.019364044027721e-06, + "loss": 3.8745, + "step": 58670 + }, + { + "epoch": 3.986615029215926, + "grad_norm": 0.26873111724853516, + "learning_rate": 5.018939393939395e-06, + "loss": 4.0022, + "step": 58675 + }, + { + "epoch": 3.9869547492865878, + "grad_norm": 0.3331586718559265, + "learning_rate": 5.0185147438510675e-06, + "loss": 3.9313, + "step": 58680 + }, + { + "epoch": 3.9872944693572494, + "grad_norm": 0.31806641817092896, + "learning_rate": 5.0180900937627394e-06, + "loss": 3.8233, + "step": 58685 + }, + { + "epoch": 3.9876341894279115, + "grad_norm": 0.2953803539276123, + "learning_rate": 5.017665443674413e-06, + "loss": 4.1397, + "step": 58690 + }, + { + "epoch": 3.987973909498573, + "grad_norm": 0.18836170434951782, + "learning_rate": 5.017240793586086e-06, + "loss": 3.9903, + "step": 58695 + }, + { + "epoch": 3.9883136295692347, + "grad_norm": 0.20389068126678467, + "learning_rate": 5.016816143497758e-06, + "loss": 3.7871, + "step": 58700 + }, + { + "epoch": 3.988653349639897, + "grad_norm": 0.2956773042678833, + "learning_rate": 5.0163914934094315e-06, + "loss": 4.0163, + "step": 58705 + }, + { + "epoch": 3.9889930697105584, + "grad_norm": 0.24587604403495789, + "learning_rate": 5.015966843321104e-06, + "loss": 3.9819, + "step": 58710 + }, + { + "epoch": 3.98933278978122, + "grad_norm": 0.2774713635444641, + "learning_rate": 5.015542193232776e-06, + "loss": 3.9936, + "step": 58715 + }, + { + "epoch": 3.989672509851882, + "grad_norm": 0.24898895621299744, + "learning_rate": 5.01511754314445e-06, + "loss": 4.2272, + "step": 58720 + }, + { + "epoch": 3.990012229922544, + "grad_norm": 0.6028050780296326, + "learning_rate": 5.014692893056123e-06, + "loss": 3.9018, + "step": 58725 + }, + { + "epoch": 3.9903519499932054, + "grad_norm": 0.26224014163017273, + "learning_rate": 5.014268242967795e-06, + "loss": 3.7751, + "step": 58730 + }, + { + "epoch": 3.9906916700638675, + "grad_norm": 0.3066282272338867, + "learning_rate": 5.013843592879468e-06, + "loss": 3.8096, + "step": 58735 + }, + { + "epoch": 3.991031390134529, + "grad_norm": 0.28064778447151184, + "learning_rate": 5.01341894279114e-06, + "loss": 3.944, + "step": 58740 + }, + { + "epoch": 3.9913711102051908, + "grad_norm": 0.2463877648115158, + "learning_rate": 5.012994292702813e-06, + "loss": 3.904, + "step": 58745 + }, + { + "epoch": 3.991710830275853, + "grad_norm": 0.2585707902908325, + "learning_rate": 5.012569642614487e-06, + "loss": 3.9021, + "step": 58750 + }, + { + "epoch": 3.9920505503465145, + "grad_norm": 0.39545097947120667, + "learning_rate": 5.012144992526159e-06, + "loss": 4.0086, + "step": 58755 + }, + { + "epoch": 3.992390270417176, + "grad_norm": 0.3393353223800659, + "learning_rate": 5.0117203424378314e-06, + "loss": 3.8733, + "step": 58760 + }, + { + "epoch": 3.992729990487838, + "grad_norm": 0.38501644134521484, + "learning_rate": 5.011295692349505e-06, + "loss": 4.155, + "step": 58765 + }, + { + "epoch": 3.9930697105585, + "grad_norm": 0.37465691566467285, + "learning_rate": 5.010871042261177e-06, + "loss": 3.8406, + "step": 58770 + }, + { + "epoch": 3.9934094306291614, + "grad_norm": 0.3763332962989807, + "learning_rate": 5.01044639217285e-06, + "loss": 3.8617, + "step": 58775 + }, + { + "epoch": 3.9937491506998235, + "grad_norm": 0.27510812878608704, + "learning_rate": 5.0100217420845235e-06, + "loss": 3.8401, + "step": 58780 + }, + { + "epoch": 3.994088870770485, + "grad_norm": 0.2347380518913269, + "learning_rate": 5.0095970919961955e-06, + "loss": 3.8441, + "step": 58785 + }, + { + "epoch": 3.9944285908411468, + "grad_norm": 0.3111143112182617, + "learning_rate": 5.009172441907868e-06, + "loss": 4.0051, + "step": 58790 + }, + { + "epoch": 3.994768310911809, + "grad_norm": 0.26545092463493347, + "learning_rate": 5.008747791819542e-06, + "loss": 4.0199, + "step": 58795 + }, + { + "epoch": 3.9951080309824705, + "grad_norm": 0.3668433427810669, + "learning_rate": 5.008323141731214e-06, + "loss": 4.0977, + "step": 58800 + }, + { + "epoch": 3.995447751053132, + "grad_norm": 0.3385045528411865, + "learning_rate": 5.007898491642887e-06, + "loss": 4.1808, + "step": 58805 + }, + { + "epoch": 3.995787471123794, + "grad_norm": 0.21666230261325836, + "learning_rate": 5.0074738415545595e-06, + "loss": 3.8583, + "step": 58810 + }, + { + "epoch": 3.996127191194456, + "grad_norm": 0.33993229269981384, + "learning_rate": 5.007049191466232e-06, + "loss": 3.6665, + "step": 58815 + }, + { + "epoch": 3.9964669112651174, + "grad_norm": 0.36080804467201233, + "learning_rate": 5.006624541377905e-06, + "loss": 4.3122, + "step": 58820 + }, + { + "epoch": 3.9968066313357795, + "grad_norm": 0.2601718008518219, + "learning_rate": 5.006199891289578e-06, + "loss": 3.9046, + "step": 58825 + }, + { + "epoch": 3.997146351406441, + "grad_norm": 0.2595691382884979, + "learning_rate": 5.005775241201251e-06, + "loss": 3.9304, + "step": 58830 + }, + { + "epoch": 3.9974860714771028, + "grad_norm": 0.28877970576286316, + "learning_rate": 5.005350591112923e-06, + "loss": 3.7826, + "step": 58835 + }, + { + "epoch": 3.997825791547765, + "grad_norm": 0.331991046667099, + "learning_rate": 5.004925941024596e-06, + "loss": 4.0451, + "step": 58840 + }, + { + "epoch": 3.9981655116184265, + "grad_norm": 0.2068878561258316, + "learning_rate": 5.004501290936269e-06, + "loss": 3.8547, + "step": 58845 + }, + { + "epoch": 3.998505231689088, + "grad_norm": 0.32348906993865967, + "learning_rate": 5.004076640847941e-06, + "loss": 4.049, + "step": 58850 + }, + { + "epoch": 3.99884495175975, + "grad_norm": 0.2174149453639984, + "learning_rate": 5.003651990759615e-06, + "loss": 3.7558, + "step": 58855 + }, + { + "epoch": 3.999184671830412, + "grad_norm": 0.2790943682193756, + "learning_rate": 5.0032273406712875e-06, + "loss": 4.1248, + "step": 58860 + }, + { + "epoch": 3.9995243919010735, + "grad_norm": 0.31227970123291016, + "learning_rate": 5.002802690582959e-06, + "loss": 4.1422, + "step": 58865 + }, + { + "epoch": 3.9998641119717355, + "grad_norm": 0.5371958613395691, + "learning_rate": 5.002378040494633e-06, + "loss": 3.9428, + "step": 58870 + }, + { + "epoch": 4.0, + "eval_bertscore": { + "f1": 0.8338087849324035, + "precision": 0.8380063698612704, + "recall": 0.8306080695616782 + }, + "eval_bleu_4": 0.004261903199276214, + "eval_exact_match": 0.0, + "eval_loss": 3.7941973209381104, + "eval_meteor": 0.07982358744093428, + "eval_rouge": { + "rouge1": 0.12698867307812112, + "rouge2": 0.01150250267527004, + "rougeL": 0.10349406742083417, + "rougeLsum": 0.10352854988077209 + }, + "eval_runtime": 274.8728, + "eval_samples_per_second": 37.541, + "eval_steps_per_second": 4.693, + "step": 58872 + }, + { + "epoch": 4.000203832042397, + "grad_norm": 0.3240545988082886, + "learning_rate": 5.001953390406306e-06, + "loss": 4.057, + "step": 58875 + }, + { + "epoch": 4.000543552113059, + "grad_norm": 0.3308306038379669, + "learning_rate": 5.001528740317978e-06, + "loss": 4.1077, + "step": 58880 + }, + { + "epoch": 4.000883272183721, + "grad_norm": 0.26539167761802673, + "learning_rate": 5.0011040902296515e-06, + "loss": 3.8572, + "step": 58885 + }, + { + "epoch": 4.001222992254382, + "grad_norm": 0.27897241711616516, + "learning_rate": 5.000679440141324e-06, + "loss": 3.8698, + "step": 58890 + }, + { + "epoch": 4.001562712325044, + "grad_norm": 0.25208717584609985, + "learning_rate": 5.000254790052996e-06, + "loss": 3.8732, + "step": 58895 + }, + { + "epoch": 4.001902432395706, + "grad_norm": 0.33296263217926025, + "learning_rate": 4.999830139964669e-06, + "loss": 4.1049, + "step": 58900 + }, + { + "epoch": 4.002242152466367, + "grad_norm": 0.2670480012893677, + "learning_rate": 4.999405489876342e-06, + "loss": 3.7825, + "step": 58905 + }, + { + "epoch": 4.0025818725370295, + "grad_norm": 0.21418137848377228, + "learning_rate": 4.9989808397880155e-06, + "loss": 3.7758, + "step": 58910 + }, + { + "epoch": 4.0029215926076915, + "grad_norm": 0.31231242418289185, + "learning_rate": 4.998556189699687e-06, + "loss": 3.9994, + "step": 58915 + }, + { + "epoch": 4.003261312678353, + "grad_norm": 0.3212297856807709, + "learning_rate": 4.99813153961136e-06, + "loss": 4.1229, + "step": 58920 + }, + { + "epoch": 4.003601032749015, + "grad_norm": 0.2320249229669571, + "learning_rate": 4.997706889523034e-06, + "loss": 4.0161, + "step": 58925 + }, + { + "epoch": 4.003940752819677, + "grad_norm": 0.39929741621017456, + "learning_rate": 4.997282239434707e-06, + "loss": 4.0287, + "step": 58930 + }, + { + "epoch": 4.004280472890338, + "grad_norm": 0.3099552392959595, + "learning_rate": 4.996857589346379e-06, + "loss": 3.9597, + "step": 58935 + }, + { + "epoch": 4.004620192961, + "grad_norm": 0.2393285483121872, + "learning_rate": 4.9964329392580514e-06, + "loss": 3.996, + "step": 58940 + }, + { + "epoch": 4.004959913031662, + "grad_norm": 0.2122090458869934, + "learning_rate": 4.996008289169725e-06, + "loss": 4.0871, + "step": 58945 + }, + { + "epoch": 4.005299633102323, + "grad_norm": 0.238905131816864, + "learning_rate": 4.995583639081397e-06, + "loss": 3.9127, + "step": 58950 + }, + { + "epoch": 4.0056393531729855, + "grad_norm": 0.3500707745552063, + "learning_rate": 4.99515898899307e-06, + "loss": 4.0532, + "step": 58955 + }, + { + "epoch": 4.0059790732436475, + "grad_norm": 0.32830071449279785, + "learning_rate": 4.9947343389047435e-06, + "loss": 4.0629, + "step": 58960 + }, + { + "epoch": 4.006318793314309, + "grad_norm": 0.2907581329345703, + "learning_rate": 4.9943096888164154e-06, + "loss": 4.0423, + "step": 58965 + }, + { + "epoch": 4.006658513384971, + "grad_norm": 0.21315445005893707, + "learning_rate": 4.993885038728088e-06, + "loss": 3.9418, + "step": 58970 + }, + { + "epoch": 4.006998233455633, + "grad_norm": 0.2683107554912567, + "learning_rate": 4.993460388639762e-06, + "loss": 4.0637, + "step": 58975 + }, + { + "epoch": 4.007337953526294, + "grad_norm": 0.29526618123054504, + "learning_rate": 4.993035738551434e-06, + "loss": 3.9216, + "step": 58980 + }, + { + "epoch": 4.007677673596956, + "grad_norm": 0.24815714359283447, + "learning_rate": 4.992611088463107e-06, + "loss": 4.1503, + "step": 58985 + }, + { + "epoch": 4.008017393667618, + "grad_norm": 0.2832392156124115, + "learning_rate": 4.9921864383747794e-06, + "loss": 3.9661, + "step": 58990 + }, + { + "epoch": 4.008357113738279, + "grad_norm": 0.3016093373298645, + "learning_rate": 4.991761788286452e-06, + "loss": 3.9153, + "step": 58995 + }, + { + "epoch": 4.0086968338089415, + "grad_norm": 0.27975699305534363, + "learning_rate": 4.991337138198125e-06, + "loss": 4.1086, + "step": 59000 + }, + { + "epoch": 4.009036553879604, + "grad_norm": 0.25469326972961426, + "learning_rate": 4.990912488109798e-06, + "loss": 3.8761, + "step": 59005 + }, + { + "epoch": 4.009376273950265, + "grad_norm": 0.2989959120750427, + "learning_rate": 4.990487838021471e-06, + "loss": 4.1063, + "step": 59010 + }, + { + "epoch": 4.009715994020927, + "grad_norm": 0.3594907522201538, + "learning_rate": 4.9900631879331434e-06, + "loss": 3.8423, + "step": 59015 + }, + { + "epoch": 4.010055714091589, + "grad_norm": 0.20187245309352875, + "learning_rate": 4.989638537844816e-06, + "loss": 4.217, + "step": 59020 + }, + { + "epoch": 4.01039543416225, + "grad_norm": 0.28950828313827515, + "learning_rate": 4.989213887756489e-06, + "loss": 4.1526, + "step": 59025 + }, + { + "epoch": 4.010735154232912, + "grad_norm": 0.35303056240081787, + "learning_rate": 4.988789237668162e-06, + "loss": 3.8547, + "step": 59030 + }, + { + "epoch": 4.011074874303574, + "grad_norm": 0.25960245728492737, + "learning_rate": 4.988364587579835e-06, + "loss": 3.8146, + "step": 59035 + }, + { + "epoch": 4.011414594374235, + "grad_norm": 0.28908786177635193, + "learning_rate": 4.9879399374915074e-06, + "loss": 3.9102, + "step": 59040 + }, + { + "epoch": 4.0117543144448975, + "grad_norm": 0.36831414699554443, + "learning_rate": 4.98751528740318e-06, + "loss": 4.0632, + "step": 59045 + }, + { + "epoch": 4.01209403451556, + "grad_norm": 0.30861666798591614, + "learning_rate": 4.987090637314853e-06, + "loss": 4.0065, + "step": 59050 + }, + { + "epoch": 4.012433754586221, + "grad_norm": 0.2803708016872406, + "learning_rate": 4.986665987226526e-06, + "loss": 3.7911, + "step": 59055 + }, + { + "epoch": 4.012773474656883, + "grad_norm": 0.24428315460681915, + "learning_rate": 4.986241337138199e-06, + "loss": 4.0026, + "step": 59060 + }, + { + "epoch": 4.013113194727545, + "grad_norm": 0.25673505663871765, + "learning_rate": 4.9858166870498714e-06, + "loss": 3.9746, + "step": 59065 + }, + { + "epoch": 4.013452914798206, + "grad_norm": 0.2881309986114502, + "learning_rate": 4.985392036961544e-06, + "loss": 3.8443, + "step": 59070 + }, + { + "epoch": 4.013792634868868, + "grad_norm": 0.24752743542194366, + "learning_rate": 4.984967386873217e-06, + "loss": 3.8542, + "step": 59075 + }, + { + "epoch": 4.01413235493953, + "grad_norm": 0.3383945822715759, + "learning_rate": 4.98454273678489e-06, + "loss": 3.9577, + "step": 59080 + }, + { + "epoch": 4.014472075010191, + "grad_norm": 0.29039466381073, + "learning_rate": 4.984118086696562e-06, + "loss": 4.1022, + "step": 59085 + }, + { + "epoch": 4.0148117950808535, + "grad_norm": 0.23765890300273895, + "learning_rate": 4.9836934366082355e-06, + "loss": 3.8619, + "step": 59090 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.24202607572078705, + "learning_rate": 4.983268786519908e-06, + "loss": 3.9433, + "step": 59095 + }, + { + "epoch": 4.015491235222177, + "grad_norm": 0.21072307229042053, + "learning_rate": 4.982844136431581e-06, + "loss": 4.1799, + "step": 59100 + }, + { + "epoch": 4.015830955292839, + "grad_norm": 0.27584347128868103, + "learning_rate": 4.982419486343254e-06, + "loss": 3.9041, + "step": 59105 + }, + { + "epoch": 4.016170675363501, + "grad_norm": 0.2778162360191345, + "learning_rate": 4.981994836254927e-06, + "loss": 4.1363, + "step": 59110 + }, + { + "epoch": 4.016510395434162, + "grad_norm": 0.3276975154876709, + "learning_rate": 4.9815701861665995e-06, + "loss": 3.926, + "step": 59115 + }, + { + "epoch": 4.016850115504824, + "grad_norm": 0.31340155005455017, + "learning_rate": 4.981145536078271e-06, + "loss": 4.0006, + "step": 59120 + }, + { + "epoch": 4.017189835575485, + "grad_norm": 0.3008361756801605, + "learning_rate": 4.980720885989945e-06, + "loss": 3.9538, + "step": 59125 + }, + { + "epoch": 4.017529555646147, + "grad_norm": 0.1993570178747177, + "learning_rate": 4.980296235901618e-06, + "loss": 4.0996, + "step": 59130 + }, + { + "epoch": 4.0178692757168095, + "grad_norm": 0.4189729392528534, + "learning_rate": 4.97987158581329e-06, + "loss": 4.1309, + "step": 59135 + }, + { + "epoch": 4.018208995787471, + "grad_norm": 0.3503390848636627, + "learning_rate": 4.9794469357249635e-06, + "loss": 4.1129, + "step": 59140 + }, + { + "epoch": 4.018548715858133, + "grad_norm": 0.4134794771671295, + "learning_rate": 4.979022285636636e-06, + "loss": 3.8629, + "step": 59145 + }, + { + "epoch": 4.018888435928795, + "grad_norm": 0.23379158973693848, + "learning_rate": 4.978597635548308e-06, + "loss": 3.794, + "step": 59150 + }, + { + "epoch": 4.019228155999456, + "grad_norm": 0.19967146217823029, + "learning_rate": 4.978172985459981e-06, + "loss": 4.0499, + "step": 59155 + }, + { + "epoch": 4.019567876070118, + "grad_norm": 0.265495628118515, + "learning_rate": 4.977748335371655e-06, + "loss": 3.7753, + "step": 59160 + }, + { + "epoch": 4.01990759614078, + "grad_norm": 0.3281169831752777, + "learning_rate": 4.977323685283327e-06, + "loss": 4.0413, + "step": 59165 + }, + { + "epoch": 4.020247316211441, + "grad_norm": 0.2310374528169632, + "learning_rate": 4.976899035194999e-06, + "loss": 3.8996, + "step": 59170 + }, + { + "epoch": 4.0205870362821035, + "grad_norm": 0.2582191228866577, + "learning_rate": 4.976474385106673e-06, + "loss": 4.2309, + "step": 59175 + }, + { + "epoch": 4.0209267563527655, + "grad_norm": 0.27779120206832886, + "learning_rate": 4.976049735018345e-06, + "loss": 3.7254, + "step": 59180 + }, + { + "epoch": 4.021266476423427, + "grad_norm": 0.42873725295066833, + "learning_rate": 4.975625084930018e-06, + "loss": 4.1582, + "step": 59185 + }, + { + "epoch": 4.021606196494089, + "grad_norm": 0.2186514288187027, + "learning_rate": 4.975200434841691e-06, + "loss": 4.1025, + "step": 59190 + }, + { + "epoch": 4.021945916564751, + "grad_norm": 0.254769504070282, + "learning_rate": 4.974775784753363e-06, + "loss": 4.0841, + "step": 59195 + }, + { + "epoch": 4.022285636635412, + "grad_norm": 0.2787610590457916, + "learning_rate": 4.974351134665036e-06, + "loss": 4.0291, + "step": 59200 + }, + { + "epoch": 4.022625356706074, + "grad_norm": 0.24894654750823975, + "learning_rate": 4.973926484576709e-06, + "loss": 4.2397, + "step": 59205 + }, + { + "epoch": 4.022965076776736, + "grad_norm": 0.5265717506408691, + "learning_rate": 4.973501834488382e-06, + "loss": 4.2196, + "step": 59210 + }, + { + "epoch": 4.023304796847397, + "grad_norm": 0.28574731945991516, + "learning_rate": 4.973077184400055e-06, + "loss": 3.8868, + "step": 59215 + }, + { + "epoch": 4.0236445169180595, + "grad_norm": 0.26203492283821106, + "learning_rate": 4.972652534311727e-06, + "loss": 3.9208, + "step": 59220 + }, + { + "epoch": 4.0239842369887215, + "grad_norm": 0.39725494384765625, + "learning_rate": 4.9722278842234e-06, + "loss": 4.0158, + "step": 59225 + }, + { + "epoch": 4.024323957059383, + "grad_norm": 0.19653010368347168, + "learning_rate": 4.971803234135073e-06, + "loss": 3.9689, + "step": 59230 + }, + { + "epoch": 4.024663677130045, + "grad_norm": 0.22597341239452362, + "learning_rate": 4.971378584046746e-06, + "loss": 3.914, + "step": 59235 + }, + { + "epoch": 4.025003397200707, + "grad_norm": 0.2588143050670624, + "learning_rate": 4.970953933958419e-06, + "loss": 3.9186, + "step": 59240 + }, + { + "epoch": 4.025343117271368, + "grad_norm": 0.25430893898010254, + "learning_rate": 4.9705292838700914e-06, + "loss": 4.1231, + "step": 59245 + }, + { + "epoch": 4.02568283734203, + "grad_norm": 0.5219159126281738, + "learning_rate": 4.970104633781764e-06, + "loss": 4.0533, + "step": 59250 + }, + { + "epoch": 4.026022557412692, + "grad_norm": 0.24123963713645935, + "learning_rate": 4.969679983693437e-06, + "loss": 3.8881, + "step": 59255 + }, + { + "epoch": 4.026362277483353, + "grad_norm": 0.2456669807434082, + "learning_rate": 4.96925533360511e-06, + "loss": 3.9072, + "step": 59260 + }, + { + "epoch": 4.0267019975540155, + "grad_norm": 0.2512587606906891, + "learning_rate": 4.968830683516783e-06, + "loss": 4.0993, + "step": 59265 + }, + { + "epoch": 4.0270417176246776, + "grad_norm": 0.2869113087654114, + "learning_rate": 4.9684060334284554e-06, + "loss": 4.0416, + "step": 59270 + }, + { + "epoch": 4.027381437695339, + "grad_norm": 0.413291335105896, + "learning_rate": 4.967981383340128e-06, + "loss": 4.1285, + "step": 59275 + }, + { + "epoch": 4.027721157766001, + "grad_norm": 0.519689679145813, + "learning_rate": 4.967556733251801e-06, + "loss": 3.7798, + "step": 59280 + }, + { + "epoch": 4.028060877836663, + "grad_norm": 0.8861402869224548, + "learning_rate": 4.967132083163474e-06, + "loss": 4.1011, + "step": 59285 + }, + { + "epoch": 4.028400597907324, + "grad_norm": 0.2884678542613983, + "learning_rate": 4.966707433075147e-06, + "loss": 3.8248, + "step": 59290 + }, + { + "epoch": 4.028740317977986, + "grad_norm": 0.43564891815185547, + "learning_rate": 4.9662827829868194e-06, + "loss": 4.0763, + "step": 59295 + }, + { + "epoch": 4.029080038048648, + "grad_norm": 0.20870991051197052, + "learning_rate": 4.965858132898492e-06, + "loss": 3.8858, + "step": 59300 + }, + { + "epoch": 4.029419758119309, + "grad_norm": 0.3009251058101654, + "learning_rate": 4.965433482810165e-06, + "loss": 4.0281, + "step": 59305 + }, + { + "epoch": 4.0297594781899715, + "grad_norm": 0.289917528629303, + "learning_rate": 4.965008832721838e-06, + "loss": 3.8506, + "step": 59310 + }, + { + "epoch": 4.030099198260634, + "grad_norm": 0.24047796428203583, + "learning_rate": 4.964584182633511e-06, + "loss": 3.8091, + "step": 59315 + }, + { + "epoch": 4.030438918331295, + "grad_norm": 0.24418312311172485, + "learning_rate": 4.964159532545183e-06, + "loss": 3.9894, + "step": 59320 + }, + { + "epoch": 4.030778638401957, + "grad_norm": 0.258684366941452, + "learning_rate": 4.963734882456856e-06, + "loss": 3.8837, + "step": 59325 + }, + { + "epoch": 4.031118358472619, + "grad_norm": 0.2507159411907196, + "learning_rate": 4.963310232368529e-06, + "loss": 4.111, + "step": 59330 + }, + { + "epoch": 4.03145807854328, + "grad_norm": 0.3403729796409607, + "learning_rate": 4.962885582280201e-06, + "loss": 4.0295, + "step": 59335 + }, + { + "epoch": 4.031797798613942, + "grad_norm": 0.28608500957489014, + "learning_rate": 4.962460932191875e-06, + "loss": 3.889, + "step": 59340 + }, + { + "epoch": 4.032137518684604, + "grad_norm": 0.3565313220024109, + "learning_rate": 4.9620362821035474e-06, + "loss": 4.0956, + "step": 59345 + }, + { + "epoch": 4.032477238755265, + "grad_norm": 0.25189241766929626, + "learning_rate": 4.961611632015219e-06, + "loss": 3.8933, + "step": 59350 + }, + { + "epoch": 4.0328169588259275, + "grad_norm": 0.2889832556247711, + "learning_rate": 4.961186981926893e-06, + "loss": 4.2573, + "step": 59355 + }, + { + "epoch": 4.03315667889659, + "grad_norm": 0.3300759792327881, + "learning_rate": 4.960762331838566e-06, + "loss": 3.8494, + "step": 59360 + }, + { + "epoch": 4.033496398967251, + "grad_norm": 0.32158249616622925, + "learning_rate": 4.960337681750238e-06, + "loss": 3.9484, + "step": 59365 + }, + { + "epoch": 4.033836119037913, + "grad_norm": 0.261984258890152, + "learning_rate": 4.959913031661911e-06, + "loss": 3.8272, + "step": 59370 + }, + { + "epoch": 4.034175839108575, + "grad_norm": 0.22394537925720215, + "learning_rate": 4.959488381573584e-06, + "loss": 3.8488, + "step": 59375 + }, + { + "epoch": 4.034515559179236, + "grad_norm": 0.23560678958892822, + "learning_rate": 4.959063731485256e-06, + "loss": 4.1253, + "step": 59380 + }, + { + "epoch": 4.034855279249898, + "grad_norm": 0.32170426845550537, + "learning_rate": 4.958639081396929e-06, + "loss": 3.9775, + "step": 59385 + }, + { + "epoch": 4.03519499932056, + "grad_norm": 0.35025522112846375, + "learning_rate": 4.958214431308603e-06, + "loss": 3.9789, + "step": 59390 + }, + { + "epoch": 4.035534719391221, + "grad_norm": 0.4568310081958771, + "learning_rate": 4.957789781220275e-06, + "loss": 4.2064, + "step": 59395 + }, + { + "epoch": 4.0358744394618835, + "grad_norm": 0.2863426208496094, + "learning_rate": 4.957365131131947e-06, + "loss": 4.0911, + "step": 59400 + }, + { + "epoch": 4.036214159532546, + "grad_norm": 0.3651306629180908, + "learning_rate": 4.95694048104362e-06, + "loss": 3.8917, + "step": 59405 + }, + { + "epoch": 4.036553879603207, + "grad_norm": 0.36581459641456604, + "learning_rate": 4.956515830955293e-06, + "loss": 3.8451, + "step": 59410 + }, + { + "epoch": 4.036893599673869, + "grad_norm": 0.2561807334423065, + "learning_rate": 4.956091180866966e-06, + "loss": 3.8006, + "step": 59415 + }, + { + "epoch": 4.037233319744531, + "grad_norm": 0.3911444842815399, + "learning_rate": 4.955666530778639e-06, + "loss": 3.9188, + "step": 59420 + }, + { + "epoch": 4.037573039815192, + "grad_norm": 0.2675302028656006, + "learning_rate": 4.955241880690311e-06, + "loss": 3.9969, + "step": 59425 + }, + { + "epoch": 4.037912759885854, + "grad_norm": 0.6047860383987427, + "learning_rate": 4.954817230601984e-06, + "loss": 3.8535, + "step": 59430 + }, + { + "epoch": 4.038252479956516, + "grad_norm": 0.3262932598590851, + "learning_rate": 4.954392580513657e-06, + "loss": 3.8886, + "step": 59435 + }, + { + "epoch": 4.0385922000271774, + "grad_norm": 0.5299931764602661, + "learning_rate": 4.95396793042533e-06, + "loss": 3.8539, + "step": 59440 + }, + { + "epoch": 4.0389319200978395, + "grad_norm": 0.37559497356414795, + "learning_rate": 4.953543280337003e-06, + "loss": 3.9531, + "step": 59445 + }, + { + "epoch": 4.039271640168501, + "grad_norm": 0.6851649284362793, + "learning_rate": 4.953118630248675e-06, + "loss": 3.9819, + "step": 59450 + }, + { + "epoch": 4.039611360239163, + "grad_norm": 0.2442142814397812, + "learning_rate": 4.952693980160348e-06, + "loss": 4.237, + "step": 59455 + }, + { + "epoch": 4.039951080309825, + "grad_norm": 0.3603786528110504, + "learning_rate": 4.952269330072021e-06, + "loss": 4.0153, + "step": 59460 + }, + { + "epoch": 4.040290800380486, + "grad_norm": 0.3770797848701477, + "learning_rate": 4.951844679983694e-06, + "loss": 4.2161, + "step": 59465 + }, + { + "epoch": 4.040630520451148, + "grad_norm": 0.2201598435640335, + "learning_rate": 4.951420029895367e-06, + "loss": 3.9908, + "step": 59470 + }, + { + "epoch": 4.04097024052181, + "grad_norm": 0.23838375508785248, + "learning_rate": 4.950995379807039e-06, + "loss": 3.9387, + "step": 59475 + }, + { + "epoch": 4.041309960592471, + "grad_norm": 0.27004116773605347, + "learning_rate": 4.950570729718712e-06, + "loss": 3.9005, + "step": 59480 + }, + { + "epoch": 4.0416496806631335, + "grad_norm": 0.26393434405326843, + "learning_rate": 4.950146079630385e-06, + "loss": 3.971, + "step": 59485 + }, + { + "epoch": 4.0419894007337955, + "grad_norm": 0.29067009687423706, + "learning_rate": 4.949721429542058e-06, + "loss": 4.0805, + "step": 59490 + }, + { + "epoch": 4.042329120804457, + "grad_norm": 0.325385183095932, + "learning_rate": 4.949296779453731e-06, + "loss": 4.1593, + "step": 59495 + }, + { + "epoch": 4.042668840875119, + "grad_norm": 0.29331105947494507, + "learning_rate": 4.948872129365403e-06, + "loss": 4.0737, + "step": 59500 + }, + { + "epoch": 4.043008560945781, + "grad_norm": 0.19806092977523804, + "learning_rate": 4.948447479277076e-06, + "loss": 4.0637, + "step": 59505 + }, + { + "epoch": 4.043348281016442, + "grad_norm": 0.27580544352531433, + "learning_rate": 4.948022829188749e-06, + "loss": 3.9215, + "step": 59510 + }, + { + "epoch": 4.043688001087104, + "grad_norm": 0.29618731141090393, + "learning_rate": 4.947598179100422e-06, + "loss": 4.0029, + "step": 59515 + }, + { + "epoch": 4.044027721157766, + "grad_norm": 0.2998179495334625, + "learning_rate": 4.947173529012095e-06, + "loss": 4.2006, + "step": 59520 + }, + { + "epoch": 4.044367441228427, + "grad_norm": 0.3257816731929779, + "learning_rate": 4.9467488789237674e-06, + "loss": 3.8019, + "step": 59525 + }, + { + "epoch": 4.0447071612990895, + "grad_norm": 0.2004157304763794, + "learning_rate": 4.94632422883544e-06, + "loss": 3.8646, + "step": 59530 + }, + { + "epoch": 4.0450468813697515, + "grad_norm": 0.3896368443965912, + "learning_rate": 4.945899578747112e-06, + "loss": 4.1769, + "step": 59535 + }, + { + "epoch": 4.045386601440413, + "grad_norm": 0.21750158071517944, + "learning_rate": 4.945474928658786e-06, + "loss": 4.071, + "step": 59540 + }, + { + "epoch": 4.045726321511075, + "grad_norm": 0.266295462846756, + "learning_rate": 4.945050278570459e-06, + "loss": 3.8601, + "step": 59545 + }, + { + "epoch": 4.046066041581737, + "grad_norm": 0.25846174359321594, + "learning_rate": 4.944625628482131e-06, + "loss": 4.0187, + "step": 59550 + }, + { + "epoch": 4.046405761652398, + "grad_norm": 0.34186992049217224, + "learning_rate": 4.944200978393804e-06, + "loss": 3.9203, + "step": 59555 + }, + { + "epoch": 4.04674548172306, + "grad_norm": 0.26935476064682007, + "learning_rate": 4.943776328305477e-06, + "loss": 4.1637, + "step": 59560 + }, + { + "epoch": 4.047085201793722, + "grad_norm": 0.3257148861885071, + "learning_rate": 4.943351678217149e-06, + "loss": 4.0654, + "step": 59565 + }, + { + "epoch": 4.047424921864383, + "grad_norm": 0.30682215094566345, + "learning_rate": 4.942927028128822e-06, + "loss": 4.2143, + "step": 59570 + }, + { + "epoch": 4.0477646419350455, + "grad_norm": 0.21863223612308502, + "learning_rate": 4.9425023780404954e-06, + "loss": 3.7435, + "step": 59575 + }, + { + "epoch": 4.048104362005708, + "grad_norm": 0.2561225891113281, + "learning_rate": 4.942077727952167e-06, + "loss": 4.0129, + "step": 59580 + }, + { + "epoch": 4.048444082076369, + "grad_norm": 0.2391505092382431, + "learning_rate": 4.94165307786384e-06, + "loss": 4.0349, + "step": 59585 + }, + { + "epoch": 4.048783802147031, + "grad_norm": 0.3940204381942749, + "learning_rate": 4.941228427775514e-06, + "loss": 3.8965, + "step": 59590 + }, + { + "epoch": 4.049123522217693, + "grad_norm": 0.3252713680267334, + "learning_rate": 4.940803777687186e-06, + "loss": 3.9683, + "step": 59595 + }, + { + "epoch": 4.049463242288354, + "grad_norm": 0.21060176193714142, + "learning_rate": 4.940379127598859e-06, + "loss": 3.9634, + "step": 59600 + }, + { + "epoch": 4.049802962359016, + "grad_norm": 0.2768910229206085, + "learning_rate": 4.939954477510531e-06, + "loss": 3.9149, + "step": 59605 + }, + { + "epoch": 4.050142682429678, + "grad_norm": 0.2905910015106201, + "learning_rate": 4.939529827422205e-06, + "loss": 4.0357, + "step": 59610 + }, + { + "epoch": 4.050482402500339, + "grad_norm": 0.2845216989517212, + "learning_rate": 4.939105177333877e-06, + "loss": 3.9058, + "step": 59615 + }, + { + "epoch": 4.0508221225710015, + "grad_norm": 0.2593054473400116, + "learning_rate": 4.93868052724555e-06, + "loss": 3.9721, + "step": 59620 + }, + { + "epoch": 4.051161842641664, + "grad_norm": 0.35181111097335815, + "learning_rate": 4.9382558771572234e-06, + "loss": 3.8699, + "step": 59625 + }, + { + "epoch": 4.051501562712325, + "grad_norm": 0.2752169668674469, + "learning_rate": 4.937831227068895e-06, + "loss": 3.9757, + "step": 59630 + }, + { + "epoch": 4.051841282782987, + "grad_norm": 0.3202730715274811, + "learning_rate": 4.937406576980568e-06, + "loss": 4.214, + "step": 59635 + }, + { + "epoch": 4.052181002853649, + "grad_norm": 0.18591712415218353, + "learning_rate": 4.936981926892242e-06, + "loss": 3.6627, + "step": 59640 + }, + { + "epoch": 4.05252072292431, + "grad_norm": 0.2604566514492035, + "learning_rate": 4.936557276803914e-06, + "loss": 4.0031, + "step": 59645 + }, + { + "epoch": 4.052860442994972, + "grad_norm": 0.34667113423347473, + "learning_rate": 4.936132626715587e-06, + "loss": 4.5071, + "step": 59650 + }, + { + "epoch": 4.053200163065634, + "grad_norm": 0.3184826970100403, + "learning_rate": 4.935707976627259e-06, + "loss": 4.0835, + "step": 59655 + }, + { + "epoch": 4.053539883136295, + "grad_norm": 0.2665340304374695, + "learning_rate": 4.935283326538932e-06, + "loss": 3.8624, + "step": 59660 + }, + { + "epoch": 4.0538796032069575, + "grad_norm": 0.3249729573726654, + "learning_rate": 4.934858676450605e-06, + "loss": 4.0704, + "step": 59665 + }, + { + "epoch": 4.05421932327762, + "grad_norm": 0.3317493498325348, + "learning_rate": 4.934434026362278e-06, + "loss": 4.3059, + "step": 59670 + }, + { + "epoch": 4.054559043348281, + "grad_norm": 0.412385493516922, + "learning_rate": 4.934009376273951e-06, + "loss": 3.8739, + "step": 59675 + }, + { + "epoch": 4.054898763418943, + "grad_norm": 0.2718810737133026, + "learning_rate": 4.933584726185623e-06, + "loss": 4.0555, + "step": 59680 + }, + { + "epoch": 4.055238483489605, + "grad_norm": 0.2154194861650467, + "learning_rate": 4.933160076097296e-06, + "loss": 4.0367, + "step": 59685 + }, + { + "epoch": 4.055578203560266, + "grad_norm": 0.3012862801551819, + "learning_rate": 4.932735426008969e-06, + "loss": 4.0437, + "step": 59690 + }, + { + "epoch": 4.055917923630928, + "grad_norm": 0.284700483083725, + "learning_rate": 4.932310775920642e-06, + "loss": 4.0995, + "step": 59695 + }, + { + "epoch": 4.05625764370159, + "grad_norm": 0.27776387333869934, + "learning_rate": 4.931886125832315e-06, + "loss": 3.9459, + "step": 59700 + }, + { + "epoch": 4.056597363772251, + "grad_norm": 0.3435327112674713, + "learning_rate": 4.931461475743987e-06, + "loss": 3.9789, + "step": 59705 + }, + { + "epoch": 4.0569370838429135, + "grad_norm": 0.3252156674861908, + "learning_rate": 4.93103682565566e-06, + "loss": 4.0537, + "step": 59710 + }, + { + "epoch": 4.057276803913576, + "grad_norm": 0.21786394715309143, + "learning_rate": 4.930612175567333e-06, + "loss": 4.2427, + "step": 59715 + }, + { + "epoch": 4.057616523984237, + "grad_norm": 0.3364546000957489, + "learning_rate": 4.930187525479006e-06, + "loss": 4.0485, + "step": 59720 + }, + { + "epoch": 4.057956244054899, + "grad_norm": 0.249937504529953, + "learning_rate": 4.929762875390679e-06, + "loss": 4.1229, + "step": 59725 + }, + { + "epoch": 4.058295964125561, + "grad_norm": 0.3192335069179535, + "learning_rate": 4.929338225302351e-06, + "loss": 4.1165, + "step": 59730 + }, + { + "epoch": 4.058635684196222, + "grad_norm": 0.21308356523513794, + "learning_rate": 4.928913575214024e-06, + "loss": 3.9438, + "step": 59735 + }, + { + "epoch": 4.058975404266884, + "grad_norm": 0.5273513793945312, + "learning_rate": 4.928488925125697e-06, + "loss": 4.0214, + "step": 59740 + }, + { + "epoch": 4.059315124337546, + "grad_norm": 0.2843375504016876, + "learning_rate": 4.92806427503737e-06, + "loss": 4.0622, + "step": 59745 + }, + { + "epoch": 4.0596548444082075, + "grad_norm": 0.2889918088912964, + "learning_rate": 4.927639624949042e-06, + "loss": 4.0497, + "step": 59750 + }, + { + "epoch": 4.0599945644788695, + "grad_norm": 0.20306679606437683, + "learning_rate": 4.927214974860715e-06, + "loss": 3.9788, + "step": 59755 + }, + { + "epoch": 4.060334284549532, + "grad_norm": 0.23893697559833527, + "learning_rate": 4.926790324772388e-06, + "loss": 3.9757, + "step": 59760 + }, + { + "epoch": 4.060674004620193, + "grad_norm": 0.33958521485328674, + "learning_rate": 4.92636567468406e-06, + "loss": 3.9204, + "step": 59765 + }, + { + "epoch": 4.061013724690855, + "grad_norm": 0.20333580672740936, + "learning_rate": 4.925941024595734e-06, + "loss": 3.8487, + "step": 59770 + }, + { + "epoch": 4.061353444761517, + "grad_norm": 0.24826794862747192, + "learning_rate": 4.925516374507407e-06, + "loss": 4.0645, + "step": 59775 + }, + { + "epoch": 4.061693164832178, + "grad_norm": 0.2827286124229431, + "learning_rate": 4.925091724419079e-06, + "loss": 4.0639, + "step": 59780 + }, + { + "epoch": 4.06203288490284, + "grad_norm": 0.5250346064567566, + "learning_rate": 4.924667074330751e-06, + "loss": 3.9512, + "step": 59785 + }, + { + "epoch": 4.062372604973502, + "grad_norm": 0.21225936710834503, + "learning_rate": 4.924242424242425e-06, + "loss": 4.0169, + "step": 59790 + }, + { + "epoch": 4.0627123250441635, + "grad_norm": 0.34494760632514954, + "learning_rate": 4.923817774154098e-06, + "loss": 4.1522, + "step": 59795 + }, + { + "epoch": 4.0630520451148255, + "grad_norm": 0.4381016194820404, + "learning_rate": 4.92339312406577e-06, + "loss": 4.0925, + "step": 59800 + }, + { + "epoch": 4.063391765185487, + "grad_norm": 0.17464524507522583, + "learning_rate": 4.922968473977443e-06, + "loss": 4.0612, + "step": 59805 + }, + { + "epoch": 4.063731485256149, + "grad_norm": 0.23373524844646454, + "learning_rate": 4.922543823889116e-06, + "loss": 4.0937, + "step": 59810 + }, + { + "epoch": 4.064071205326811, + "grad_norm": 0.3158719539642334, + "learning_rate": 4.922119173800788e-06, + "loss": 3.8304, + "step": 59815 + }, + { + "epoch": 4.064410925397472, + "grad_norm": 0.22750243544578552, + "learning_rate": 4.921694523712461e-06, + "loss": 3.9311, + "step": 59820 + }, + { + "epoch": 4.064750645468134, + "grad_norm": 0.32644855976104736, + "learning_rate": 4.921269873624135e-06, + "loss": 4.0869, + "step": 59825 + }, + { + "epoch": 4.065090365538796, + "grad_norm": 0.2732728123664856, + "learning_rate": 4.920845223535807e-06, + "loss": 3.9927, + "step": 59830 + }, + { + "epoch": 4.065430085609457, + "grad_norm": 0.2781323790550232, + "learning_rate": 4.920420573447479e-06, + "loss": 4.0123, + "step": 59835 + }, + { + "epoch": 4.0657698056801195, + "grad_norm": 0.24496208131313324, + "learning_rate": 4.919995923359153e-06, + "loss": 4.0411, + "step": 59840 + }, + { + "epoch": 4.0661095257507816, + "grad_norm": 0.27211102843284607, + "learning_rate": 4.919571273270825e-06, + "loss": 4.0471, + "step": 59845 + }, + { + "epoch": 4.066449245821443, + "grad_norm": 0.2920044958591461, + "learning_rate": 4.919146623182498e-06, + "loss": 3.9833, + "step": 59850 + }, + { + "epoch": 4.066788965892105, + "grad_norm": 0.39099881052970886, + "learning_rate": 4.918721973094171e-06, + "loss": 3.7837, + "step": 59855 + }, + { + "epoch": 4.067128685962767, + "grad_norm": 0.24609021842479706, + "learning_rate": 4.918297323005843e-06, + "loss": 3.9718, + "step": 59860 + }, + { + "epoch": 4.067468406033428, + "grad_norm": 0.2214209884405136, + "learning_rate": 4.917872672917516e-06, + "loss": 3.8955, + "step": 59865 + }, + { + "epoch": 4.06780812610409, + "grad_norm": 0.279147207736969, + "learning_rate": 4.917448022829189e-06, + "loss": 4.0176, + "step": 59870 + }, + { + "epoch": 4.068147846174752, + "grad_norm": 0.25888389348983765, + "learning_rate": 4.917023372740862e-06, + "loss": 3.9668, + "step": 59875 + }, + { + "epoch": 4.068487566245413, + "grad_norm": 0.2775430977344513, + "learning_rate": 4.916598722652535e-06, + "loss": 4.059, + "step": 59880 + }, + { + "epoch": 4.0688272863160755, + "grad_norm": 0.2840477526187897, + "learning_rate": 4.916174072564207e-06, + "loss": 3.9612, + "step": 59885 + }, + { + "epoch": 4.069167006386738, + "grad_norm": 0.5065069794654846, + "learning_rate": 4.91574942247588e-06, + "loss": 3.9896, + "step": 59890 + }, + { + "epoch": 4.069506726457399, + "grad_norm": 0.3153751790523529, + "learning_rate": 4.915324772387553e-06, + "loss": 4.027, + "step": 59895 + }, + { + "epoch": 4.069846446528061, + "grad_norm": 0.23139742016792297, + "learning_rate": 4.914900122299226e-06, + "loss": 3.9874, + "step": 59900 + }, + { + "epoch": 4.070186166598723, + "grad_norm": 0.21320554614067078, + "learning_rate": 4.914475472210899e-06, + "loss": 4.171, + "step": 59905 + }, + { + "epoch": 4.070525886669384, + "grad_norm": 0.31495267152786255, + "learning_rate": 4.914050822122571e-06, + "loss": 3.9323, + "step": 59910 + }, + { + "epoch": 4.070865606740046, + "grad_norm": 0.33124855160713196, + "learning_rate": 4.913626172034244e-06, + "loss": 4.1041, + "step": 59915 + }, + { + "epoch": 4.071205326810708, + "grad_norm": 0.3269854485988617, + "learning_rate": 4.913201521945917e-06, + "loss": 4.0263, + "step": 59920 + }, + { + "epoch": 4.071545046881369, + "grad_norm": 0.2717708349227905, + "learning_rate": 4.91277687185759e-06, + "loss": 3.9117, + "step": 59925 + }, + { + "epoch": 4.0718847669520315, + "grad_norm": 0.2901756167411804, + "learning_rate": 4.912437151786928e-06, + "loss": 4.041, + "step": 59930 + }, + { + "epoch": 4.072224487022694, + "grad_norm": 0.23259806632995605, + "learning_rate": 4.912012501698601e-06, + "loss": 4.0265, + "step": 59935 + }, + { + "epoch": 4.072564207093355, + "grad_norm": 0.24537916481494904, + "learning_rate": 4.9115878516102735e-06, + "loss": 4.0562, + "step": 59940 + }, + { + "epoch": 4.072903927164017, + "grad_norm": 0.2663123309612274, + "learning_rate": 4.911163201521946e-06, + "loss": 4.231, + "step": 59945 + }, + { + "epoch": 4.073243647234679, + "grad_norm": 0.24114486575126648, + "learning_rate": 4.910738551433619e-06, + "loss": 3.9163, + "step": 59950 + }, + { + "epoch": 4.07358336730534, + "grad_norm": 0.26552653312683105, + "learning_rate": 4.910313901345292e-06, + "loss": 3.8095, + "step": 59955 + }, + { + "epoch": 4.073923087376002, + "grad_norm": 0.3247566521167755, + "learning_rate": 4.909889251256965e-06, + "loss": 3.9347, + "step": 59960 + }, + { + "epoch": 4.074262807446664, + "grad_norm": 0.3494550585746765, + "learning_rate": 4.9094646011686375e-06, + "loss": 4.115, + "step": 59965 + }, + { + "epoch": 4.074602527517325, + "grad_norm": 0.25066322088241577, + "learning_rate": 4.90903995108031e-06, + "loss": 4.0242, + "step": 59970 + }, + { + "epoch": 4.0749422475879875, + "grad_norm": 0.25465235114097595, + "learning_rate": 4.908615300991983e-06, + "loss": 4.0265, + "step": 59975 + }, + { + "epoch": 4.07528196765865, + "grad_norm": 0.39051297307014465, + "learning_rate": 4.908190650903656e-06, + "loss": 3.8677, + "step": 59980 + }, + { + "epoch": 4.075621687729311, + "grad_norm": 0.25452324748039246, + "learning_rate": 4.907766000815329e-06, + "loss": 3.7485, + "step": 59985 + }, + { + "epoch": 4.075961407799973, + "grad_norm": 0.28347450494766235, + "learning_rate": 4.9073413507270015e-06, + "loss": 3.8855, + "step": 59990 + }, + { + "epoch": 4.076301127870635, + "grad_norm": 0.5997957587242126, + "learning_rate": 4.906916700638674e-06, + "loss": 4.0522, + "step": 59995 + }, + { + "epoch": 4.076640847941296, + "grad_norm": 0.3026666045188904, + "learning_rate": 4.906492050550347e-06, + "loss": 3.965, + "step": 60000 + }, + { + "epoch": 4.076980568011958, + "grad_norm": 0.3805088400840759, + "learning_rate": 4.90606740046202e-06, + "loss": 4.099, + "step": 60005 + }, + { + "epoch": 4.07732028808262, + "grad_norm": 0.28075554966926575, + "learning_rate": 4.905642750373693e-06, + "loss": 3.8234, + "step": 60010 + }, + { + "epoch": 4.0776600081532814, + "grad_norm": 0.44248029589653015, + "learning_rate": 4.9052181002853655e-06, + "loss": 3.6856, + "step": 60015 + }, + { + "epoch": 4.0779997282239435, + "grad_norm": 0.5248723030090332, + "learning_rate": 4.904793450197038e-06, + "loss": 4.1063, + "step": 60020 + }, + { + "epoch": 4.078339448294606, + "grad_norm": 0.2429022341966629, + "learning_rate": 4.904368800108711e-06, + "loss": 4.0353, + "step": 60025 + }, + { + "epoch": 4.078679168365267, + "grad_norm": 0.24052487313747406, + "learning_rate": 4.903944150020384e-06, + "loss": 3.9804, + "step": 60030 + }, + { + "epoch": 4.079018888435929, + "grad_norm": 0.2929691672325134, + "learning_rate": 4.903519499932056e-06, + "loss": 4.1498, + "step": 60035 + }, + { + "epoch": 4.079358608506591, + "grad_norm": 0.3647461235523224, + "learning_rate": 4.9030948498437295e-06, + "loss": 4.061, + "step": 60040 + }, + { + "epoch": 4.079698328577252, + "grad_norm": 0.2663695216178894, + "learning_rate": 4.902670199755402e-06, + "loss": 3.8597, + "step": 60045 + }, + { + "epoch": 4.080038048647914, + "grad_norm": 0.2331365942955017, + "learning_rate": 4.902245549667074e-06, + "loss": 4.0791, + "step": 60050 + }, + { + "epoch": 4.080377768718576, + "grad_norm": 0.27595868706703186, + "learning_rate": 4.901820899578748e-06, + "loss": 3.9337, + "step": 60055 + }, + { + "epoch": 4.0807174887892375, + "grad_norm": 0.27167585492134094, + "learning_rate": 4.901396249490421e-06, + "loss": 3.9333, + "step": 60060 + }, + { + "epoch": 4.0810572088598995, + "grad_norm": 0.2493153065443039, + "learning_rate": 4.900971599402093e-06, + "loss": 3.7589, + "step": 60065 + }, + { + "epoch": 4.081396928930562, + "grad_norm": 0.43068230152130127, + "learning_rate": 4.9005469493137654e-06, + "loss": 3.9891, + "step": 60070 + }, + { + "epoch": 4.081736649001223, + "grad_norm": 0.3261614143848419, + "learning_rate": 4.900122299225439e-06, + "loss": 4.0119, + "step": 60075 + }, + { + "epoch": 4.082076369071885, + "grad_norm": 0.30400073528289795, + "learning_rate": 4.899697649137111e-06, + "loss": 4.1737, + "step": 60080 + }, + { + "epoch": 4.082416089142547, + "grad_norm": 0.3216407597064972, + "learning_rate": 4.899272999048784e-06, + "loss": 3.9929, + "step": 60085 + }, + { + "epoch": 4.082755809213208, + "grad_norm": 0.28319212794303894, + "learning_rate": 4.8988483489604575e-06, + "loss": 3.9143, + "step": 60090 + }, + { + "epoch": 4.08309552928387, + "grad_norm": 0.2919843792915344, + "learning_rate": 4.8984236988721294e-06, + "loss": 4.2852, + "step": 60095 + }, + { + "epoch": 4.083435249354532, + "grad_norm": 0.27302464842796326, + "learning_rate": 4.897999048783802e-06, + "loss": 3.8547, + "step": 60100 + }, + { + "epoch": 4.0837749694251935, + "grad_norm": 0.24601434171199799, + "learning_rate": 4.897574398695475e-06, + "loss": 3.8578, + "step": 60105 + }, + { + "epoch": 4.0841146894958555, + "grad_norm": 0.22819572687149048, + "learning_rate": 4.897149748607148e-06, + "loss": 3.9805, + "step": 60110 + }, + { + "epoch": 4.084454409566518, + "grad_norm": 0.2693714499473572, + "learning_rate": 4.896725098518821e-06, + "loss": 4.1427, + "step": 60115 + }, + { + "epoch": 4.084794129637179, + "grad_norm": 0.2232850193977356, + "learning_rate": 4.8963004484304934e-06, + "loss": 3.8733, + "step": 60120 + }, + { + "epoch": 4.085133849707841, + "grad_norm": 0.6478984355926514, + "learning_rate": 4.895875798342166e-06, + "loss": 4.1386, + "step": 60125 + }, + { + "epoch": 4.085473569778502, + "grad_norm": 0.30835941433906555, + "learning_rate": 4.895451148253839e-06, + "loss": 3.9596, + "step": 60130 + }, + { + "epoch": 4.085813289849164, + "grad_norm": 0.2800692319869995, + "learning_rate": 4.895026498165512e-06, + "loss": 4.2219, + "step": 60135 + }, + { + "epoch": 4.086153009919826, + "grad_norm": 0.3541378676891327, + "learning_rate": 4.894601848077185e-06, + "loss": 3.7302, + "step": 60140 + }, + { + "epoch": 4.086492729990487, + "grad_norm": 0.25007402896881104, + "learning_rate": 4.8941771979888574e-06, + "loss": 3.9799, + "step": 60145 + }, + { + "epoch": 4.0868324500611495, + "grad_norm": 0.21943266689777374, + "learning_rate": 4.89375254790053e-06, + "loss": 4.0173, + "step": 60150 + }, + { + "epoch": 4.0871721701318116, + "grad_norm": 0.24861907958984375, + "learning_rate": 4.893327897812203e-06, + "loss": 4.0738, + "step": 60155 + }, + { + "epoch": 4.087511890202473, + "grad_norm": 0.27780506014823914, + "learning_rate": 4.892903247723876e-06, + "loss": 3.9997, + "step": 60160 + }, + { + "epoch": 4.087851610273135, + "grad_norm": 0.3651048243045807, + "learning_rate": 4.892478597635549e-06, + "loss": 4.1334, + "step": 60165 + }, + { + "epoch": 4.088191330343797, + "grad_norm": 0.3152467608451843, + "learning_rate": 4.8920539475472215e-06, + "loss": 3.824, + "step": 60170 + }, + { + "epoch": 4.088531050414458, + "grad_norm": 0.2924858033657074, + "learning_rate": 4.891629297458894e-06, + "loss": 4.1002, + "step": 60175 + }, + { + "epoch": 4.08887077048512, + "grad_norm": 0.3346068859100342, + "learning_rate": 4.891204647370567e-06, + "loss": 4.0864, + "step": 60180 + }, + { + "epoch": 4.089210490555782, + "grad_norm": 0.34638679027557373, + "learning_rate": 4.89077999728224e-06, + "loss": 3.818, + "step": 60185 + }, + { + "epoch": 4.089550210626443, + "grad_norm": 0.33230850100517273, + "learning_rate": 4.890355347193913e-06, + "loss": 3.9756, + "step": 60190 + }, + { + "epoch": 4.0898899306971055, + "grad_norm": 0.3372719883918762, + "learning_rate": 4.8899306971055855e-06, + "loss": 4.2837, + "step": 60195 + }, + { + "epoch": 4.090229650767768, + "grad_norm": 0.41999202966690063, + "learning_rate": 4.889506047017258e-06, + "loss": 4.1739, + "step": 60200 + }, + { + "epoch": 4.090569370838429, + "grad_norm": 0.6186972856521606, + "learning_rate": 4.889081396928931e-06, + "loss": 4.1939, + "step": 60205 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.2622690200805664, + "learning_rate": 4.888656746840604e-06, + "loss": 3.9389, + "step": 60210 + }, + { + "epoch": 4.091248810979753, + "grad_norm": 0.4330848157405853, + "learning_rate": 4.888232096752277e-06, + "loss": 3.9727, + "step": 60215 + }, + { + "epoch": 4.091588531050414, + "grad_norm": 0.28371235728263855, + "learning_rate": 4.8878074466639495e-06, + "loss": 4.17, + "step": 60220 + }, + { + "epoch": 4.091928251121076, + "grad_norm": 0.24109598994255066, + "learning_rate": 4.887382796575622e-06, + "loss": 4.0629, + "step": 60225 + }, + { + "epoch": 4.092267971191738, + "grad_norm": 0.2754732072353363, + "learning_rate": 4.886958146487295e-06, + "loss": 4.215, + "step": 60230 + }, + { + "epoch": 4.092607691262399, + "grad_norm": 0.2985735833644867, + "learning_rate": 4.886533496398968e-06, + "loss": 3.9171, + "step": 60235 + }, + { + "epoch": 4.0929474113330615, + "grad_norm": 0.34435656666755676, + "learning_rate": 4.886108846310641e-06, + "loss": 3.7876, + "step": 60240 + }, + { + "epoch": 4.093287131403724, + "grad_norm": 0.31353023648262024, + "learning_rate": 4.8856841962223135e-06, + "loss": 3.9895, + "step": 60245 + }, + { + "epoch": 4.093626851474385, + "grad_norm": 0.28257277607917786, + "learning_rate": 4.885259546133985e-06, + "loss": 4.0483, + "step": 60250 + }, + { + "epoch": 4.093966571545047, + "grad_norm": 0.27278414368629456, + "learning_rate": 4.884834896045659e-06, + "loss": 3.903, + "step": 60255 + }, + { + "epoch": 4.094306291615709, + "grad_norm": 0.29447489976882935, + "learning_rate": 4.884410245957332e-06, + "loss": 4.0725, + "step": 60260 + }, + { + "epoch": 4.09464601168637, + "grad_norm": 0.24533896148204803, + "learning_rate": 4.883985595869004e-06, + "loss": 4.0322, + "step": 60265 + }, + { + "epoch": 4.094985731757032, + "grad_norm": 0.36633625626564026, + "learning_rate": 4.8835609457806775e-06, + "loss": 4.0428, + "step": 60270 + }, + { + "epoch": 4.095325451827694, + "grad_norm": 0.24908892810344696, + "learning_rate": 4.88313629569235e-06, + "loss": 3.7804, + "step": 60275 + }, + { + "epoch": 4.095665171898355, + "grad_norm": 0.2889932096004486, + "learning_rate": 4.882711645604022e-06, + "loss": 3.9131, + "step": 60280 + }, + { + "epoch": 4.0960048919690175, + "grad_norm": 0.38859498500823975, + "learning_rate": 4.882286995515695e-06, + "loss": 3.9524, + "step": 60285 + }, + { + "epoch": 4.09634461203968, + "grad_norm": 0.27106812596321106, + "learning_rate": 4.881862345427369e-06, + "loss": 3.9043, + "step": 60290 + }, + { + "epoch": 4.096684332110341, + "grad_norm": 0.25677451491355896, + "learning_rate": 4.881437695339041e-06, + "loss": 3.9325, + "step": 60295 + }, + { + "epoch": 4.097024052181003, + "grad_norm": 0.35740771889686584, + "learning_rate": 4.8810130452507134e-06, + "loss": 4.0714, + "step": 60300 + }, + { + "epoch": 4.097363772251665, + "grad_norm": 0.2575574219226837, + "learning_rate": 4.880588395162387e-06, + "loss": 3.8536, + "step": 60305 + }, + { + "epoch": 4.097703492322326, + "grad_norm": 0.22886285185813904, + "learning_rate": 4.880163745074059e-06, + "loss": 3.651, + "step": 60310 + }, + { + "epoch": 4.098043212392988, + "grad_norm": 0.2974107265472412, + "learning_rate": 4.879739094985732e-06, + "loss": 4.1866, + "step": 60315 + }, + { + "epoch": 4.09838293246365, + "grad_norm": 0.26049479842185974, + "learning_rate": 4.879314444897405e-06, + "loss": 3.7076, + "step": 60320 + }, + { + "epoch": 4.0987226525343115, + "grad_norm": 0.37056964635849, + "learning_rate": 4.878889794809078e-06, + "loss": 4.1655, + "step": 60325 + }, + { + "epoch": 4.0990623726049735, + "grad_norm": 0.22817780077457428, + "learning_rate": 4.87846514472075e-06, + "loss": 3.8346, + "step": 60330 + }, + { + "epoch": 4.099402092675636, + "grad_norm": 0.30520251393318176, + "learning_rate": 4.878040494632423e-06, + "loss": 4.0134, + "step": 60335 + }, + { + "epoch": 4.099741812746297, + "grad_norm": 0.3113340437412262, + "learning_rate": 4.877615844544097e-06, + "loss": 3.9677, + "step": 60340 + }, + { + "epoch": 4.100081532816959, + "grad_norm": 0.24212712049484253, + "learning_rate": 4.877191194455769e-06, + "loss": 4.0595, + "step": 60345 + }, + { + "epoch": 4.100421252887621, + "grad_norm": 0.24159583449363708, + "learning_rate": 4.8767665443674414e-06, + "loss": 4.0604, + "step": 60350 + }, + { + "epoch": 4.100760972958282, + "grad_norm": 0.21726514399051666, + "learning_rate": 4.876341894279114e-06, + "loss": 4.0114, + "step": 60355 + }, + { + "epoch": 4.101100693028944, + "grad_norm": 0.2638032138347626, + "learning_rate": 4.875917244190787e-06, + "loss": 3.9693, + "step": 60360 + }, + { + "epoch": 4.101440413099606, + "grad_norm": 0.259399950504303, + "learning_rate": 4.87549259410246e-06, + "loss": 3.8804, + "step": 60365 + }, + { + "epoch": 4.1017801331702675, + "grad_norm": 0.4551856815814972, + "learning_rate": 4.875067944014133e-06, + "loss": 4.0441, + "step": 60370 + }, + { + "epoch": 4.1021198532409295, + "grad_norm": 0.3117848336696625, + "learning_rate": 4.8746432939258054e-06, + "loss": 3.9007, + "step": 60375 + }, + { + "epoch": 4.102459573311592, + "grad_norm": 0.2055354118347168, + "learning_rate": 4.874218643837478e-06, + "loss": 4.1188, + "step": 60380 + }, + { + "epoch": 4.102799293382253, + "grad_norm": 0.24936893582344055, + "learning_rate": 4.873793993749151e-06, + "loss": 3.9599, + "step": 60385 + }, + { + "epoch": 4.103139013452915, + "grad_norm": 0.2573182284832001, + "learning_rate": 4.873369343660824e-06, + "loss": 3.7683, + "step": 60390 + }, + { + "epoch": 4.103478733523577, + "grad_norm": 0.2835087776184082, + "learning_rate": 4.872944693572497e-06, + "loss": 3.9732, + "step": 60395 + }, + { + "epoch": 4.103818453594238, + "grad_norm": 0.22435739636421204, + "learning_rate": 4.8725200434841694e-06, + "loss": 3.9404, + "step": 60400 + }, + { + "epoch": 4.1041581736649, + "grad_norm": 0.3507901132106781, + "learning_rate": 4.872095393395842e-06, + "loss": 3.8827, + "step": 60405 + }, + { + "epoch": 4.104497893735562, + "grad_norm": 0.22476975619792938, + "learning_rate": 4.871670743307515e-06, + "loss": 4.0733, + "step": 60410 + }, + { + "epoch": 4.1048376138062235, + "grad_norm": 0.30242231488227844, + "learning_rate": 4.871246093219188e-06, + "loss": 3.9617, + "step": 60415 + }, + { + "epoch": 4.1051773338768855, + "grad_norm": 0.3671678304672241, + "learning_rate": 4.870821443130861e-06, + "loss": 4.1084, + "step": 60420 + }, + { + "epoch": 4.105517053947548, + "grad_norm": 0.3166793882846832, + "learning_rate": 4.8703967930425334e-06, + "loss": 4.0842, + "step": 60425 + }, + { + "epoch": 4.105856774018209, + "grad_norm": 0.2583977282047272, + "learning_rate": 4.869972142954206e-06, + "loss": 3.8836, + "step": 60430 + }, + { + "epoch": 4.106196494088871, + "grad_norm": 0.3733287453651428, + "learning_rate": 4.869547492865879e-06, + "loss": 3.786, + "step": 60435 + }, + { + "epoch": 4.106536214159533, + "grad_norm": 0.27258604764938354, + "learning_rate": 4.869122842777552e-06, + "loss": 4.0567, + "step": 60440 + }, + { + "epoch": 4.106875934230194, + "grad_norm": 0.21509340405464172, + "learning_rate": 4.868698192689225e-06, + "loss": 3.9144, + "step": 60445 + }, + { + "epoch": 4.107215654300856, + "grad_norm": 0.3637397587299347, + "learning_rate": 4.868273542600897e-06, + "loss": 3.9964, + "step": 60450 + }, + { + "epoch": 4.107555374371518, + "grad_norm": 0.26615989208221436, + "learning_rate": 4.86784889251257e-06, + "loss": 3.9648, + "step": 60455 + }, + { + "epoch": 4.1078950944421795, + "grad_norm": 0.34308892488479614, + "learning_rate": 4.867424242424243e-06, + "loss": 3.9765, + "step": 60460 + }, + { + "epoch": 4.108234814512842, + "grad_norm": 0.3339083194732666, + "learning_rate": 4.866999592335915e-06, + "loss": 4.0169, + "step": 60465 + }, + { + "epoch": 4.108574534583504, + "grad_norm": 0.25098568201065063, + "learning_rate": 4.866574942247589e-06, + "loss": 3.9984, + "step": 60470 + }, + { + "epoch": 4.108914254654165, + "grad_norm": 0.342677503824234, + "learning_rate": 4.8661502921592615e-06, + "loss": 4.1082, + "step": 60475 + }, + { + "epoch": 4.109253974724827, + "grad_norm": 0.22840718924999237, + "learning_rate": 4.865725642070933e-06, + "loss": 3.7791, + "step": 60480 + }, + { + "epoch": 4.109593694795488, + "grad_norm": 0.2381182461977005, + "learning_rate": 4.865300991982607e-06, + "loss": 3.9319, + "step": 60485 + }, + { + "epoch": 4.10993341486615, + "grad_norm": 0.23810768127441406, + "learning_rate": 4.86487634189428e-06, + "loss": 4.1446, + "step": 60490 + }, + { + "epoch": 4.110273134936812, + "grad_norm": 0.32339558005332947, + "learning_rate": 4.864451691805953e-06, + "loss": 3.9935, + "step": 60495 + }, + { + "epoch": 4.110612855007473, + "grad_norm": 0.5458389520645142, + "learning_rate": 4.864027041717625e-06, + "loss": 3.6753, + "step": 60500 + }, + { + "epoch": 4.1109525750781355, + "grad_norm": 0.3294123411178589, + "learning_rate": 4.863602391629298e-06, + "loss": 3.9557, + "step": 60505 + }, + { + "epoch": 4.111292295148798, + "grad_norm": 0.3615903854370117, + "learning_rate": 4.863177741540971e-06, + "loss": 4.0309, + "step": 60510 + }, + { + "epoch": 4.111632015219459, + "grad_norm": 0.24764560163021088, + "learning_rate": 4.862753091452643e-06, + "loss": 3.8218, + "step": 60515 + }, + { + "epoch": 4.111971735290121, + "grad_norm": 0.30411234498023987, + "learning_rate": 4.862328441364317e-06, + "loss": 3.9401, + "step": 60520 + }, + { + "epoch": 4.112311455360783, + "grad_norm": 0.341500848531723, + "learning_rate": 4.8619037912759895e-06, + "loss": 3.922, + "step": 60525 + }, + { + "epoch": 4.112651175431444, + "grad_norm": 0.29568618535995483, + "learning_rate": 4.861479141187661e-06, + "loss": 3.876, + "step": 60530 + }, + { + "epoch": 4.112990895502106, + "grad_norm": 0.4892462491989136, + "learning_rate": 4.861054491099334e-06, + "loss": 3.6751, + "step": 60535 + }, + { + "epoch": 4.113330615572768, + "grad_norm": 0.24996648728847504, + "learning_rate": 4.860629841011008e-06, + "loss": 3.9736, + "step": 60540 + }, + { + "epoch": 4.113670335643429, + "grad_norm": 0.22227294743061066, + "learning_rate": 4.86020519092268e-06, + "loss": 3.8735, + "step": 60545 + }, + { + "epoch": 4.1140100557140915, + "grad_norm": 0.2422158122062683, + "learning_rate": 4.859780540834353e-06, + "loss": 3.9015, + "step": 60550 + }, + { + "epoch": 4.114349775784754, + "grad_norm": 0.29685693979263306, + "learning_rate": 4.859355890746026e-06, + "loss": 3.9406, + "step": 60555 + }, + { + "epoch": 4.114689495855415, + "grad_norm": 0.38599035143852234, + "learning_rate": 4.858931240657698e-06, + "loss": 4.1206, + "step": 60560 + }, + { + "epoch": 4.115029215926077, + "grad_norm": 0.3372539281845093, + "learning_rate": 4.858506590569371e-06, + "loss": 3.9899, + "step": 60565 + }, + { + "epoch": 4.115368935996739, + "grad_norm": 0.3365822434425354, + "learning_rate": 4.858081940481044e-06, + "loss": 3.9247, + "step": 60570 + }, + { + "epoch": 4.1157086560674, + "grad_norm": 0.24620400369167328, + "learning_rate": 4.857657290392717e-06, + "loss": 3.7589, + "step": 60575 + }, + { + "epoch": 4.116048376138062, + "grad_norm": 0.3737573027610779, + "learning_rate": 4.857232640304389e-06, + "loss": 4.1336, + "step": 60580 + }, + { + "epoch": 4.116388096208724, + "grad_norm": 0.26420971751213074, + "learning_rate": 4.856807990216062e-06, + "loss": 3.8613, + "step": 60585 + }, + { + "epoch": 4.116727816279385, + "grad_norm": 0.2570493519306183, + "learning_rate": 4.856383340127735e-06, + "loss": 3.9129, + "step": 60590 + }, + { + "epoch": 4.1170675363500475, + "grad_norm": 0.35401859879493713, + "learning_rate": 4.855958690039408e-06, + "loss": 3.9723, + "step": 60595 + }, + { + "epoch": 4.11740725642071, + "grad_norm": 0.3285669684410095, + "learning_rate": 4.855534039951081e-06, + "loss": 4.0807, + "step": 60600 + }, + { + "epoch": 4.117746976491371, + "grad_norm": 0.32612067461013794, + "learning_rate": 4.8551093898627534e-06, + "loss": 3.942, + "step": 60605 + }, + { + "epoch": 4.118086696562033, + "grad_norm": 0.22106702625751495, + "learning_rate": 4.854684739774426e-06, + "loss": 3.7513, + "step": 60610 + }, + { + "epoch": 4.118426416632695, + "grad_norm": 0.45993712544441223, + "learning_rate": 4.854260089686099e-06, + "loss": 3.88, + "step": 60615 + }, + { + "epoch": 4.118766136703356, + "grad_norm": 0.29585450887680054, + "learning_rate": 4.853835439597772e-06, + "loss": 3.9753, + "step": 60620 + }, + { + "epoch": 4.119105856774018, + "grad_norm": 0.2572808563709259, + "learning_rate": 4.853410789509445e-06, + "loss": 4.0295, + "step": 60625 + }, + { + "epoch": 4.11944557684468, + "grad_norm": 0.2567860186100006, + "learning_rate": 4.8529861394211174e-06, + "loss": 3.7695, + "step": 60630 + }, + { + "epoch": 4.1197852969153415, + "grad_norm": 0.24397000670433044, + "learning_rate": 4.85256148933279e-06, + "loss": 4.1735, + "step": 60635 + }, + { + "epoch": 4.1201250169860035, + "grad_norm": 0.2328692227602005, + "learning_rate": 4.852136839244463e-06, + "loss": 4.0136, + "step": 60640 + }, + { + "epoch": 4.120464737056666, + "grad_norm": 0.37182408571243286, + "learning_rate": 4.851712189156136e-06, + "loss": 4.0434, + "step": 60645 + }, + { + "epoch": 4.120804457127327, + "grad_norm": 0.26977595686912537, + "learning_rate": 4.851287539067809e-06, + "loss": 3.9689, + "step": 60650 + }, + { + "epoch": 4.121144177197989, + "grad_norm": 0.23778395354747772, + "learning_rate": 4.8508628889794814e-06, + "loss": 3.9141, + "step": 60655 + }, + { + "epoch": 4.121483897268651, + "grad_norm": 0.24725531041622162, + "learning_rate": 4.850438238891154e-06, + "loss": 3.8085, + "step": 60660 + }, + { + "epoch": 4.121823617339312, + "grad_norm": 0.32120755314826965, + "learning_rate": 4.850013588802827e-06, + "loss": 3.9107, + "step": 60665 + }, + { + "epoch": 4.122163337409974, + "grad_norm": 0.5436185598373413, + "learning_rate": 4.8495889387145e-06, + "loss": 3.7831, + "step": 60670 + }, + { + "epoch": 4.122503057480636, + "grad_norm": 0.2847610116004944, + "learning_rate": 4.849164288626173e-06, + "loss": 3.8512, + "step": 60675 + }, + { + "epoch": 4.1228427775512975, + "grad_norm": 0.4023166000843048, + "learning_rate": 4.8487396385378454e-06, + "loss": 4.0367, + "step": 60680 + }, + { + "epoch": 4.1231824976219595, + "grad_norm": 0.43985527753829956, + "learning_rate": 4.848314988449518e-06, + "loss": 4.0765, + "step": 60685 + }, + { + "epoch": 4.123522217692622, + "grad_norm": 0.21518826484680176, + "learning_rate": 4.847890338361191e-06, + "loss": 4.2151, + "step": 60690 + }, + { + "epoch": 4.123861937763283, + "grad_norm": 0.252143532037735, + "learning_rate": 4.847465688272864e-06, + "loss": 3.7533, + "step": 60695 + }, + { + "epoch": 4.124201657833945, + "grad_norm": 0.2402086853981018, + "learning_rate": 4.847041038184536e-06, + "loss": 3.6372, + "step": 60700 + }, + { + "epoch": 4.124541377904607, + "grad_norm": 0.3410644829273224, + "learning_rate": 4.8466163880962094e-06, + "loss": 4.0314, + "step": 60705 + }, + { + "epoch": 4.124881097975268, + "grad_norm": 0.2797107398509979, + "learning_rate": 4.846191738007882e-06, + "loss": 3.8349, + "step": 60710 + }, + { + "epoch": 4.12522081804593, + "grad_norm": 0.24602708220481873, + "learning_rate": 4.845767087919554e-06, + "loss": 3.9669, + "step": 60715 + }, + { + "epoch": 4.125560538116592, + "grad_norm": 0.28387999534606934, + "learning_rate": 4.845342437831228e-06, + "loss": 3.9703, + "step": 60720 + }, + { + "epoch": 4.1259002581872535, + "grad_norm": 0.2643500566482544, + "learning_rate": 4.844917787742901e-06, + "loss": 3.8789, + "step": 60725 + }, + { + "epoch": 4.1262399782579156, + "grad_norm": 0.3453374207019806, + "learning_rate": 4.844493137654573e-06, + "loss": 4.0058, + "step": 60730 + }, + { + "epoch": 4.126579698328578, + "grad_norm": 0.3181197941303253, + "learning_rate": 4.844068487566245e-06, + "loss": 4.0939, + "step": 60735 + }, + { + "epoch": 4.126919418399239, + "grad_norm": 0.2815268933773041, + "learning_rate": 4.843643837477919e-06, + "loss": 4.0844, + "step": 60740 + }, + { + "epoch": 4.127259138469901, + "grad_norm": 0.6436794400215149, + "learning_rate": 4.843219187389591e-06, + "loss": 3.696, + "step": 60745 + }, + { + "epoch": 4.127598858540563, + "grad_norm": 0.31660404801368713, + "learning_rate": 4.842794537301264e-06, + "loss": 4.1189, + "step": 60750 + }, + { + "epoch": 4.127938578611224, + "grad_norm": 0.23211732506752014, + "learning_rate": 4.8423698872129374e-06, + "loss": 3.9274, + "step": 60755 + }, + { + "epoch": 4.128278298681886, + "grad_norm": 0.25031378865242004, + "learning_rate": 4.841945237124609e-06, + "loss": 4.1062, + "step": 60760 + }, + { + "epoch": 4.128618018752548, + "grad_norm": 0.46008726954460144, + "learning_rate": 4.841520587036282e-06, + "loss": 3.9289, + "step": 60765 + }, + { + "epoch": 4.1289577388232095, + "grad_norm": 0.35560375452041626, + "learning_rate": 4.841095936947956e-06, + "loss": 3.9288, + "step": 60770 + }, + { + "epoch": 4.129297458893872, + "grad_norm": 0.25136223435401917, + "learning_rate": 4.840671286859628e-06, + "loss": 3.841, + "step": 60775 + }, + { + "epoch": 4.129637178964534, + "grad_norm": 0.2889865040779114, + "learning_rate": 4.840246636771301e-06, + "loss": 4.0156, + "step": 60780 + }, + { + "epoch": 4.129976899035195, + "grad_norm": 0.22147689759731293, + "learning_rate": 4.839821986682973e-06, + "loss": 4.0788, + "step": 60785 + }, + { + "epoch": 4.130316619105857, + "grad_norm": 0.3497490882873535, + "learning_rate": 4.839397336594646e-06, + "loss": 3.8977, + "step": 60790 + }, + { + "epoch": 4.130656339176519, + "grad_norm": 0.4047553837299347, + "learning_rate": 4.838972686506319e-06, + "loss": 3.8824, + "step": 60795 + }, + { + "epoch": 4.13099605924718, + "grad_norm": 0.24540531635284424, + "learning_rate": 4.838548036417992e-06, + "loss": 3.9464, + "step": 60800 + }, + { + "epoch": 4.131335779317842, + "grad_norm": 0.2215431183576584, + "learning_rate": 4.838123386329665e-06, + "loss": 3.757, + "step": 60805 + }, + { + "epoch": 4.131675499388503, + "grad_norm": 0.6059263944625854, + "learning_rate": 4.837698736241337e-06, + "loss": 4.0825, + "step": 60810 + }, + { + "epoch": 4.1320152194591655, + "grad_norm": 0.28383928537368774, + "learning_rate": 4.83727408615301e-06, + "loss": 3.805, + "step": 60815 + }, + { + "epoch": 4.132354939529828, + "grad_norm": 0.2311825007200241, + "learning_rate": 4.836849436064683e-06, + "loss": 3.9056, + "step": 60820 + }, + { + "epoch": 4.132694659600489, + "grad_norm": 0.3223397135734558, + "learning_rate": 4.836424785976356e-06, + "loss": 3.8834, + "step": 60825 + }, + { + "epoch": 4.133034379671151, + "grad_norm": 0.3351813852787018, + "learning_rate": 4.836000135888029e-06, + "loss": 4.0922, + "step": 60830 + }, + { + "epoch": 4.133374099741813, + "grad_norm": 0.2509722411632538, + "learning_rate": 4.835575485799701e-06, + "loss": 3.9484, + "step": 60835 + }, + { + "epoch": 4.133713819812474, + "grad_norm": 0.3161064684391022, + "learning_rate": 4.835150835711374e-06, + "loss": 3.9614, + "step": 60840 + }, + { + "epoch": 4.134053539883136, + "grad_norm": 0.40319427847862244, + "learning_rate": 4.834726185623047e-06, + "loss": 3.7715, + "step": 60845 + }, + { + "epoch": 4.134393259953798, + "grad_norm": 0.2448004186153412, + "learning_rate": 4.83430153553472e-06, + "loss": 4.0136, + "step": 60850 + }, + { + "epoch": 4.134732980024459, + "grad_norm": 0.2592354714870453, + "learning_rate": 4.833876885446393e-06, + "loss": 3.946, + "step": 60855 + }, + { + "epoch": 4.1350727000951215, + "grad_norm": 0.2863207757472992, + "learning_rate": 4.833452235358065e-06, + "loss": 4.0361, + "step": 60860 + }, + { + "epoch": 4.135412420165784, + "grad_norm": 0.36307650804519653, + "learning_rate": 4.833027585269738e-06, + "loss": 4.1055, + "step": 60865 + }, + { + "epoch": 4.135752140236445, + "grad_norm": 0.24542146921157837, + "learning_rate": 4.832602935181411e-06, + "loss": 3.8973, + "step": 60870 + }, + { + "epoch": 4.136091860307107, + "grad_norm": 0.24722765386104584, + "learning_rate": 4.832178285093084e-06, + "loss": 3.9531, + "step": 60875 + }, + { + "epoch": 4.136431580377769, + "grad_norm": 0.282992422580719, + "learning_rate": 4.831753635004757e-06, + "loss": 3.9137, + "step": 60880 + }, + { + "epoch": 4.13677130044843, + "grad_norm": 0.2896347641944885, + "learning_rate": 4.831328984916429e-06, + "loss": 4.0283, + "step": 60885 + }, + { + "epoch": 4.137111020519092, + "grad_norm": 0.31375589966773987, + "learning_rate": 4.830904334828102e-06, + "loss": 4.0314, + "step": 60890 + }, + { + "epoch": 4.137450740589754, + "grad_norm": 0.2568930387496948, + "learning_rate": 4.830479684739775e-06, + "loss": 3.912, + "step": 60895 + }, + { + "epoch": 4.1377904606604154, + "grad_norm": 0.2674695551395416, + "learning_rate": 4.830055034651448e-06, + "loss": 4.0096, + "step": 60900 + }, + { + "epoch": 4.1381301807310775, + "grad_norm": 0.297130286693573, + "learning_rate": 4.829630384563121e-06, + "loss": 3.6849, + "step": 60905 + }, + { + "epoch": 4.13846990080174, + "grad_norm": 0.7734834551811218, + "learning_rate": 4.8292057344747934e-06, + "loss": 3.9905, + "step": 60910 + }, + { + "epoch": 4.138809620872401, + "grad_norm": 0.27973702549934387, + "learning_rate": 4.828781084386465e-06, + "loss": 4.1713, + "step": 60915 + }, + { + "epoch": 4.139149340943063, + "grad_norm": 0.3223915994167328, + "learning_rate": 4.828356434298139e-06, + "loss": 3.9038, + "step": 60920 + }, + { + "epoch": 4.139489061013725, + "grad_norm": 0.28466349840164185, + "learning_rate": 4.827931784209812e-06, + "loss": 3.9702, + "step": 60925 + }, + { + "epoch": 4.139828781084386, + "grad_norm": 0.31797096133232117, + "learning_rate": 4.827507134121484e-06, + "loss": 4.152, + "step": 60930 + }, + { + "epoch": 4.140168501155048, + "grad_norm": 0.34328192472457886, + "learning_rate": 4.8270824840331574e-06, + "loss": 4.0715, + "step": 60935 + }, + { + "epoch": 4.14050822122571, + "grad_norm": 0.1994015872478485, + "learning_rate": 4.82665783394483e-06, + "loss": 3.8301, + "step": 60940 + }, + { + "epoch": 4.1408479412963715, + "grad_norm": 0.2579984664916992, + "learning_rate": 4.826233183856502e-06, + "loss": 4.2021, + "step": 60945 + }, + { + "epoch": 4.1411876613670335, + "grad_norm": 0.24178777635097504, + "learning_rate": 4.825808533768175e-06, + "loss": 4.1331, + "step": 60950 + }, + { + "epoch": 4.141527381437696, + "grad_norm": 0.2546229362487793, + "learning_rate": 4.825383883679849e-06, + "loss": 3.6148, + "step": 60955 + }, + { + "epoch": 4.141867101508357, + "grad_norm": 0.24660849571228027, + "learning_rate": 4.824959233591521e-06, + "loss": 3.9272, + "step": 60960 + }, + { + "epoch": 4.142206821579019, + "grad_norm": 0.253140926361084, + "learning_rate": 4.824534583503193e-06, + "loss": 3.99, + "step": 60965 + }, + { + "epoch": 4.142546541649681, + "grad_norm": 0.40470367670059204, + "learning_rate": 4.824109933414867e-06, + "loss": 3.9278, + "step": 60970 + }, + { + "epoch": 4.142886261720342, + "grad_norm": 0.312686026096344, + "learning_rate": 4.823685283326539e-06, + "loss": 3.8269, + "step": 60975 + }, + { + "epoch": 4.143225981791004, + "grad_norm": 0.41640061140060425, + "learning_rate": 4.823260633238212e-06, + "loss": 3.8254, + "step": 60980 + }, + { + "epoch": 4.143565701861666, + "grad_norm": 0.28642380237579346, + "learning_rate": 4.822835983149885e-06, + "loss": 4.1185, + "step": 60985 + }, + { + "epoch": 4.1439054219323275, + "grad_norm": 0.40340378880500793, + "learning_rate": 4.822411333061557e-06, + "loss": 3.9504, + "step": 60990 + }, + { + "epoch": 4.1442451420029895, + "grad_norm": 0.2835831344127655, + "learning_rate": 4.82198668297323e-06, + "loss": 3.904, + "step": 60995 + }, + { + "epoch": 4.144584862073652, + "grad_norm": 0.24030257761478424, + "learning_rate": 4.821562032884903e-06, + "loss": 3.796, + "step": 61000 + }, + { + "epoch": 4.144924582144313, + "grad_norm": 0.22467148303985596, + "learning_rate": 4.821137382796577e-06, + "loss": 3.6611, + "step": 61005 + }, + { + "epoch": 4.145264302214975, + "grad_norm": 0.22520960867404938, + "learning_rate": 4.820712732708249e-06, + "loss": 4.0993, + "step": 61010 + }, + { + "epoch": 4.145604022285637, + "grad_norm": 0.25103092193603516, + "learning_rate": 4.820288082619921e-06, + "loss": 3.9939, + "step": 61015 + }, + { + "epoch": 4.145943742356298, + "grad_norm": 0.2691243588924408, + "learning_rate": 4.819863432531594e-06, + "loss": 3.8455, + "step": 61020 + }, + { + "epoch": 4.14628346242696, + "grad_norm": 0.3549182713031769, + "learning_rate": 4.819438782443267e-06, + "loss": 3.959, + "step": 61025 + }, + { + "epoch": 4.146623182497622, + "grad_norm": 0.26969024538993835, + "learning_rate": 4.81901413235494e-06, + "loss": 3.918, + "step": 61030 + }, + { + "epoch": 4.1469629025682835, + "grad_norm": 0.26314619183540344, + "learning_rate": 4.818589482266613e-06, + "loss": 3.9633, + "step": 61035 + }, + { + "epoch": 4.147302622638946, + "grad_norm": 0.27801117300987244, + "learning_rate": 4.818164832178285e-06, + "loss": 3.9831, + "step": 61040 + }, + { + "epoch": 4.147642342709608, + "grad_norm": 0.26764610409736633, + "learning_rate": 4.817740182089958e-06, + "loss": 4.1391, + "step": 61045 + }, + { + "epoch": 4.147982062780269, + "grad_norm": 0.35062751173973083, + "learning_rate": 4.817315532001631e-06, + "loss": 3.7865, + "step": 61050 + }, + { + "epoch": 4.148321782850931, + "grad_norm": 0.2589752674102783, + "learning_rate": 4.816890881913304e-06, + "loss": 3.9965, + "step": 61055 + }, + { + "epoch": 4.148661502921593, + "grad_norm": 0.32460442185401917, + "learning_rate": 4.816466231824977e-06, + "loss": 3.8953, + "step": 61060 + }, + { + "epoch": 4.149001222992254, + "grad_norm": 0.24110400676727295, + "learning_rate": 4.816041581736649e-06, + "loss": 3.7148, + "step": 61065 + }, + { + "epoch": 4.149340943062916, + "grad_norm": 0.2468714565038681, + "learning_rate": 4.815616931648322e-06, + "loss": 3.9964, + "step": 61070 + }, + { + "epoch": 4.149680663133578, + "grad_norm": 0.290619432926178, + "learning_rate": 4.815192281559995e-06, + "loss": 3.9676, + "step": 61075 + }, + { + "epoch": 4.1500203832042395, + "grad_norm": 0.3207867443561554, + "learning_rate": 4.814767631471668e-06, + "loss": 3.9149, + "step": 61080 + }, + { + "epoch": 4.150360103274902, + "grad_norm": 0.3195244073867798, + "learning_rate": 4.814342981383341e-06, + "loss": 3.7829, + "step": 61085 + }, + { + "epoch": 4.150699823345564, + "grad_norm": 0.27846065163612366, + "learning_rate": 4.813918331295013e-06, + "loss": 4.0092, + "step": 61090 + }, + { + "epoch": 4.151039543416225, + "grad_norm": 0.262807697057724, + "learning_rate": 4.813493681206686e-06, + "loss": 4.097, + "step": 61095 + }, + { + "epoch": 4.151379263486887, + "grad_norm": 0.3459864556789398, + "learning_rate": 4.813069031118359e-06, + "loss": 3.9253, + "step": 61100 + }, + { + "epoch": 4.151718983557549, + "grad_norm": 0.26330482959747314, + "learning_rate": 4.812644381030032e-06, + "loss": 3.8372, + "step": 61105 + }, + { + "epoch": 4.15205870362821, + "grad_norm": 0.32069098949432373, + "learning_rate": 4.812219730941705e-06, + "loss": 4.065, + "step": 61110 + }, + { + "epoch": 4.152398423698872, + "grad_norm": 0.4689369797706604, + "learning_rate": 4.811795080853377e-06, + "loss": 4.1586, + "step": 61115 + }, + { + "epoch": 4.152738143769534, + "grad_norm": 0.2651943266391754, + "learning_rate": 4.81137043076505e-06, + "loss": 3.9427, + "step": 61120 + }, + { + "epoch": 4.1530778638401955, + "grad_norm": 0.23767749965190887, + "learning_rate": 4.810945780676723e-06, + "loss": 3.9291, + "step": 61125 + }, + { + "epoch": 4.153417583910858, + "grad_norm": 0.27837249636650085, + "learning_rate": 4.810521130588395e-06, + "loss": 3.7037, + "step": 61130 + }, + { + "epoch": 4.15375730398152, + "grad_norm": 0.28350573778152466, + "learning_rate": 4.810096480500069e-06, + "loss": 3.9568, + "step": 61135 + }, + { + "epoch": 4.154097024052181, + "grad_norm": 0.362583726644516, + "learning_rate": 4.809671830411741e-06, + "loss": 4.2074, + "step": 61140 + }, + { + "epoch": 4.154436744122843, + "grad_norm": 0.4832777976989746, + "learning_rate": 4.809247180323413e-06, + "loss": 4.0591, + "step": 61145 + }, + { + "epoch": 4.154776464193505, + "grad_norm": 0.2897462248802185, + "learning_rate": 4.808822530235087e-06, + "loss": 3.9647, + "step": 61150 + }, + { + "epoch": 4.155116184264166, + "grad_norm": 0.23913851380348206, + "learning_rate": 4.80839788014676e-06, + "loss": 4.0656, + "step": 61155 + }, + { + "epoch": 4.155455904334828, + "grad_norm": 0.3899362087249756, + "learning_rate": 4.807973230058432e-06, + "loss": 3.9906, + "step": 61160 + }, + { + "epoch": 4.15579562440549, + "grad_norm": 0.33962884545326233, + "learning_rate": 4.8075485799701046e-06, + "loss": 4.2361, + "step": 61165 + }, + { + "epoch": 4.1561353444761515, + "grad_norm": 0.2407248318195343, + "learning_rate": 4.807123929881778e-06, + "loss": 4.1401, + "step": 61170 + }, + { + "epoch": 4.156475064546814, + "grad_norm": 0.38959404826164246, + "learning_rate": 4.806699279793451e-06, + "loss": 3.9745, + "step": 61175 + }, + { + "epoch": 4.156814784617475, + "grad_norm": 0.31642401218414307, + "learning_rate": 4.806274629705123e-06, + "loss": 4.0071, + "step": 61180 + }, + { + "epoch": 4.157154504688137, + "grad_norm": 0.47790467739105225, + "learning_rate": 4.805849979616797e-06, + "loss": 3.9211, + "step": 61185 + }, + { + "epoch": 4.157494224758799, + "grad_norm": 0.2753470838069916, + "learning_rate": 4.805425329528469e-06, + "loss": 3.9265, + "step": 61190 + }, + { + "epoch": 4.15783394482946, + "grad_norm": 0.2179914116859436, + "learning_rate": 4.805000679440141e-06, + "loss": 3.9445, + "step": 61195 + }, + { + "epoch": 4.158173664900122, + "grad_norm": 0.26063501834869385, + "learning_rate": 4.804576029351814e-06, + "loss": 3.864, + "step": 61200 + }, + { + "epoch": 4.158513384970784, + "grad_norm": 0.29275253415107727, + "learning_rate": 4.804151379263488e-06, + "loss": 3.7435, + "step": 61205 + }, + { + "epoch": 4.1588531050414455, + "grad_norm": 0.31084316968917847, + "learning_rate": 4.80372672917516e-06, + "loss": 4.0979, + "step": 61210 + }, + { + "epoch": 4.1591928251121075, + "grad_norm": 0.23187170922756195, + "learning_rate": 4.803302079086833e-06, + "loss": 3.9283, + "step": 61215 + }, + { + "epoch": 4.15953254518277, + "grad_norm": 0.4023597538471222, + "learning_rate": 4.802877428998506e-06, + "loss": 4.1281, + "step": 61220 + }, + { + "epoch": 4.159872265253431, + "grad_norm": 0.3254075050354004, + "learning_rate": 4.802452778910178e-06, + "loss": 4.1777, + "step": 61225 + }, + { + "epoch": 4.160211985324093, + "grad_norm": 0.23825274407863617, + "learning_rate": 4.802028128821851e-06, + "loss": 4.0928, + "step": 61230 + }, + { + "epoch": 4.160551705394755, + "grad_norm": 0.35408440232276917, + "learning_rate": 4.801603478733524e-06, + "loss": 4.1432, + "step": 61235 + }, + { + "epoch": 4.160891425465416, + "grad_norm": 0.32605308294296265, + "learning_rate": 4.801178828645197e-06, + "loss": 3.9735, + "step": 61240 + }, + { + "epoch": 4.161231145536078, + "grad_norm": 0.22368121147155762, + "learning_rate": 4.800754178556869e-06, + "loss": 3.8468, + "step": 61245 + }, + { + "epoch": 4.16157086560674, + "grad_norm": 0.5292831063270569, + "learning_rate": 4.800329528468542e-06, + "loss": 3.9136, + "step": 61250 + }, + { + "epoch": 4.1619105856774015, + "grad_norm": 0.2630666494369507, + "learning_rate": 4.799904878380215e-06, + "loss": 3.9937, + "step": 61255 + }, + { + "epoch": 4.1622503057480635, + "grad_norm": 0.22185634076595306, + "learning_rate": 4.799480228291888e-06, + "loss": 3.9692, + "step": 61260 + }, + { + "epoch": 4.162590025818726, + "grad_norm": 0.26498860120773315, + "learning_rate": 4.799055578203561e-06, + "loss": 3.9279, + "step": 61265 + }, + { + "epoch": 4.162929745889387, + "grad_norm": 0.2804400324821472, + "learning_rate": 4.798630928115233e-06, + "loss": 3.6424, + "step": 61270 + }, + { + "epoch": 4.163269465960049, + "grad_norm": 0.2761997580528259, + "learning_rate": 4.798206278026906e-06, + "loss": 4.0252, + "step": 61275 + }, + { + "epoch": 4.163609186030711, + "grad_norm": 0.8119648694992065, + "learning_rate": 4.797781627938579e-06, + "loss": 4.1268, + "step": 61280 + }, + { + "epoch": 4.163948906101372, + "grad_norm": 0.2180987298488617, + "learning_rate": 4.797356977850252e-06, + "loss": 4.0142, + "step": 61285 + }, + { + "epoch": 4.164288626172034, + "grad_norm": 0.2727504372596741, + "learning_rate": 4.796932327761925e-06, + "loss": 4.0715, + "step": 61290 + }, + { + "epoch": 4.164628346242696, + "grad_norm": 0.2674993872642517, + "learning_rate": 4.796507677673597e-06, + "loss": 3.9382, + "step": 61295 + }, + { + "epoch": 4.1649680663133575, + "grad_norm": 0.3043467700481415, + "learning_rate": 4.79608302758527e-06, + "loss": 3.9509, + "step": 61300 + }, + { + "epoch": 4.1653077863840196, + "grad_norm": 0.24701109528541565, + "learning_rate": 4.795658377496943e-06, + "loss": 4.0053, + "step": 61305 + }, + { + "epoch": 4.165647506454682, + "grad_norm": 0.32477039098739624, + "learning_rate": 4.795233727408616e-06, + "loss": 3.826, + "step": 61310 + }, + { + "epoch": 4.165987226525343, + "grad_norm": 0.2829606831073761, + "learning_rate": 4.794809077320289e-06, + "loss": 4.0741, + "step": 61315 + }, + { + "epoch": 4.166326946596005, + "grad_norm": 0.32616397738456726, + "learning_rate": 4.794384427231961e-06, + "loss": 4.0455, + "step": 61320 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.3009553849697113, + "learning_rate": 4.793959777143634e-06, + "loss": 3.9238, + "step": 61325 + }, + { + "epoch": 4.167006386737328, + "grad_norm": 0.2734869122505188, + "learning_rate": 4.793535127055306e-06, + "loss": 4.1244, + "step": 61330 + }, + { + "epoch": 4.16734610680799, + "grad_norm": 0.23155565559864044, + "learning_rate": 4.79311047696698e-06, + "loss": 4.1023, + "step": 61335 + }, + { + "epoch": 4.167685826878652, + "grad_norm": 0.28926634788513184, + "learning_rate": 4.792685826878653e-06, + "loss": 4.4137, + "step": 61340 + }, + { + "epoch": 4.1680255469493135, + "grad_norm": 0.27206671237945557, + "learning_rate": 4.792261176790325e-06, + "loss": 3.6476, + "step": 61345 + }, + { + "epoch": 4.168365267019976, + "grad_norm": 0.4633280038833618, + "learning_rate": 4.791836526701998e-06, + "loss": 3.9221, + "step": 61350 + }, + { + "epoch": 4.168704987090638, + "grad_norm": 0.2725301682949066, + "learning_rate": 4.791411876613671e-06, + "loss": 4.19, + "step": 61355 + }, + { + "epoch": 4.169044707161299, + "grad_norm": 0.24462251365184784, + "learning_rate": 4.790987226525344e-06, + "loss": 4.015, + "step": 61360 + }, + { + "epoch": 4.169384427231961, + "grad_norm": 0.28077104687690735, + "learning_rate": 4.790562576437016e-06, + "loss": 3.8294, + "step": 61365 + }, + { + "epoch": 4.169724147302623, + "grad_norm": 0.2981398105621338, + "learning_rate": 4.790137926348689e-06, + "loss": 3.8747, + "step": 61370 + }, + { + "epoch": 4.170063867373284, + "grad_norm": 0.34013888239860535, + "learning_rate": 4.789713276260362e-06, + "loss": 3.8185, + "step": 61375 + }, + { + "epoch": 4.170403587443946, + "grad_norm": 0.279318243265152, + "learning_rate": 4.789288626172034e-06, + "loss": 4.0651, + "step": 61380 + }, + { + "epoch": 4.170743307514608, + "grad_norm": 0.2753576636314392, + "learning_rate": 4.788863976083708e-06, + "loss": 3.9341, + "step": 61385 + }, + { + "epoch": 4.1710830275852695, + "grad_norm": 0.38382968306541443, + "learning_rate": 4.788439325995381e-06, + "loss": 3.8536, + "step": 61390 + }, + { + "epoch": 4.171422747655932, + "grad_norm": 0.4675155580043793, + "learning_rate": 4.7880146759070526e-06, + "loss": 4.4025, + "step": 61395 + }, + { + "epoch": 4.171762467726594, + "grad_norm": 0.25740233063697815, + "learning_rate": 4.787590025818726e-06, + "loss": 3.8714, + "step": 61400 + }, + { + "epoch": 4.172102187797255, + "grad_norm": 0.26308128237724304, + "learning_rate": 4.787165375730399e-06, + "loss": 3.7885, + "step": 61405 + }, + { + "epoch": 4.172441907867917, + "grad_norm": 0.29872244596481323, + "learning_rate": 4.786740725642071e-06, + "loss": 3.9566, + "step": 61410 + }, + { + "epoch": 4.172781627938579, + "grad_norm": 0.29686394333839417, + "learning_rate": 4.786316075553744e-06, + "loss": 3.8881, + "step": 61415 + }, + { + "epoch": 4.17312134800924, + "grad_norm": 0.37854811549186707, + "learning_rate": 4.785891425465417e-06, + "loss": 3.8684, + "step": 61420 + }, + { + "epoch": 4.173461068079902, + "grad_norm": 0.2780749499797821, + "learning_rate": 4.785466775377089e-06, + "loss": 3.8763, + "step": 61425 + }, + { + "epoch": 4.173800788150564, + "grad_norm": 0.19741030037403107, + "learning_rate": 4.785042125288762e-06, + "loss": 3.9677, + "step": 61430 + }, + { + "epoch": 4.1741405082212255, + "grad_norm": 0.27439334988594055, + "learning_rate": 4.784617475200436e-06, + "loss": 4.042, + "step": 61435 + }, + { + "epoch": 4.174480228291888, + "grad_norm": 0.2619803845882416, + "learning_rate": 4.784192825112108e-06, + "loss": 4.2282, + "step": 61440 + }, + { + "epoch": 4.17481994836255, + "grad_norm": 0.41796207427978516, + "learning_rate": 4.7837681750237806e-06, + "loss": 4.056, + "step": 61445 + }, + { + "epoch": 4.175159668433211, + "grad_norm": 0.2963656485080719, + "learning_rate": 4.783343524935453e-06, + "loss": 3.9358, + "step": 61450 + }, + { + "epoch": 4.175499388503873, + "grad_norm": 0.31494998931884766, + "learning_rate": 4.782918874847126e-06, + "loss": 4.0177, + "step": 61455 + }, + { + "epoch": 4.175839108574535, + "grad_norm": 0.23152628540992737, + "learning_rate": 4.782494224758799e-06, + "loss": 3.8987, + "step": 61460 + }, + { + "epoch": 4.176178828645196, + "grad_norm": 0.3272766172885895, + "learning_rate": 4.782069574670472e-06, + "loss": 3.9957, + "step": 61465 + }, + { + "epoch": 4.176518548715858, + "grad_norm": 0.363979697227478, + "learning_rate": 4.7816449245821446e-06, + "loss": 4.029, + "step": 61470 + }, + { + "epoch": 4.17685826878652, + "grad_norm": 0.24523551762104034, + "learning_rate": 4.781220274493817e-06, + "loss": 3.9926, + "step": 61475 + }, + { + "epoch": 4.1771979888571815, + "grad_norm": 0.2119438350200653, + "learning_rate": 4.78079562440549e-06, + "loss": 3.9633, + "step": 61480 + }, + { + "epoch": 4.177537708927844, + "grad_norm": 0.3021678626537323, + "learning_rate": 4.780370974317163e-06, + "loss": 3.8262, + "step": 61485 + }, + { + "epoch": 4.177877428998505, + "grad_norm": 0.24849840998649597, + "learning_rate": 4.779946324228836e-06, + "loss": 3.905, + "step": 61490 + }, + { + "epoch": 4.178217149069167, + "grad_norm": 0.3186309039592743, + "learning_rate": 4.7795216741405086e-06, + "loss": 3.9924, + "step": 61495 + }, + { + "epoch": 4.178556869139829, + "grad_norm": 0.2518383860588074, + "learning_rate": 4.779097024052181e-06, + "loss": 3.813, + "step": 61500 + }, + { + "epoch": 4.17889658921049, + "grad_norm": 0.36030322313308716, + "learning_rate": 4.778672373963854e-06, + "loss": 3.5955, + "step": 61505 + }, + { + "epoch": 4.179236309281152, + "grad_norm": 0.30741068720817566, + "learning_rate": 4.778247723875527e-06, + "loss": 3.9225, + "step": 61510 + }, + { + "epoch": 4.179576029351814, + "grad_norm": 0.294008731842041, + "learning_rate": 4.7778230737872e-06, + "loss": 3.814, + "step": 61515 + }, + { + "epoch": 4.1799157494224755, + "grad_norm": 0.2820918560028076, + "learning_rate": 4.777398423698873e-06, + "loss": 4.0496, + "step": 61520 + }, + { + "epoch": 4.1802554694931375, + "grad_norm": 0.30431780219078064, + "learning_rate": 4.776973773610545e-06, + "loss": 3.7903, + "step": 61525 + }, + { + "epoch": 4.1805951895638, + "grad_norm": 0.18490451574325562, + "learning_rate": 4.776549123522218e-06, + "loss": 4.0905, + "step": 61530 + }, + { + "epoch": 4.180934909634461, + "grad_norm": 0.3139650821685791, + "learning_rate": 4.776124473433891e-06, + "loss": 4.233, + "step": 61535 + }, + { + "epoch": 4.181274629705123, + "grad_norm": 0.28253334760665894, + "learning_rate": 4.775699823345564e-06, + "loss": 4.008, + "step": 61540 + }, + { + "epoch": 4.181614349775785, + "grad_norm": 0.28229987621307373, + "learning_rate": 4.775275173257237e-06, + "loss": 4.1774, + "step": 61545 + }, + { + "epoch": 4.181954069846446, + "grad_norm": 0.3306998610496521, + "learning_rate": 4.774850523168909e-06, + "loss": 4.0314, + "step": 61550 + }, + { + "epoch": 4.182293789917108, + "grad_norm": 0.23225603997707367, + "learning_rate": 4.774425873080582e-06, + "loss": 3.8454, + "step": 61555 + }, + { + "epoch": 4.18263350998777, + "grad_norm": 0.6160279512405396, + "learning_rate": 4.774001222992255e-06, + "loss": 3.7399, + "step": 61560 + }, + { + "epoch": 4.1829732300584315, + "grad_norm": 0.23976323008537292, + "learning_rate": 4.773576572903928e-06, + "loss": 4.0704, + "step": 61565 + }, + { + "epoch": 4.1833129501290935, + "grad_norm": 0.37282052636146545, + "learning_rate": 4.773151922815601e-06, + "loss": 3.9203, + "step": 61570 + }, + { + "epoch": 4.183652670199756, + "grad_norm": 0.26384904980659485, + "learning_rate": 4.772727272727273e-06, + "loss": 3.981, + "step": 61575 + }, + { + "epoch": 4.183992390270417, + "grad_norm": 0.20166735351085663, + "learning_rate": 4.772302622638945e-06, + "loss": 4.0347, + "step": 61580 + }, + { + "epoch": 4.184332110341079, + "grad_norm": 0.18135790526866913, + "learning_rate": 4.771877972550619e-06, + "loss": 3.9494, + "step": 61585 + }, + { + "epoch": 4.184671830411741, + "grad_norm": 0.3293837904930115, + "learning_rate": 4.771453322462292e-06, + "loss": 3.8987, + "step": 61590 + }, + { + "epoch": 4.185011550482402, + "grad_norm": 0.40707364678382874, + "learning_rate": 4.771028672373964e-06, + "loss": 4.0208, + "step": 61595 + }, + { + "epoch": 4.185351270553064, + "grad_norm": 0.25825655460357666, + "learning_rate": 4.770604022285637e-06, + "loss": 4.1062, + "step": 61600 + }, + { + "epoch": 4.185690990623726, + "grad_norm": 0.26428458094596863, + "learning_rate": 4.77017937219731e-06, + "loss": 3.9041, + "step": 61605 + }, + { + "epoch": 4.1860307106943875, + "grad_norm": 0.30053403973579407, + "learning_rate": 4.769754722108982e-06, + "loss": 3.9365, + "step": 61610 + }, + { + "epoch": 4.1863704307650496, + "grad_norm": 0.34271377325057983, + "learning_rate": 4.769330072020655e-06, + "loss": 3.8947, + "step": 61615 + }, + { + "epoch": 4.186710150835712, + "grad_norm": 0.3537752032279968, + "learning_rate": 4.768905421932329e-06, + "loss": 4.0064, + "step": 61620 + }, + { + "epoch": 4.187049870906373, + "grad_norm": 0.27045321464538574, + "learning_rate": 4.7684807718440005e-06, + "loss": 4.0336, + "step": 61625 + }, + { + "epoch": 4.187389590977035, + "grad_norm": 0.30273303389549255, + "learning_rate": 4.768056121755673e-06, + "loss": 4.0885, + "step": 61630 + }, + { + "epoch": 4.187729311047697, + "grad_norm": 0.2859528064727783, + "learning_rate": 4.767631471667347e-06, + "loss": 3.7416, + "step": 61635 + }, + { + "epoch": 4.188069031118358, + "grad_norm": 0.258270263671875, + "learning_rate": 4.767206821579019e-06, + "loss": 4.0371, + "step": 61640 + }, + { + "epoch": 4.18840875118902, + "grad_norm": 0.2570776343345642, + "learning_rate": 4.766782171490692e-06, + "loss": 3.9679, + "step": 61645 + }, + { + "epoch": 4.188748471259682, + "grad_norm": 0.3822353482246399, + "learning_rate": 4.7663575214023646e-06, + "loss": 3.8693, + "step": 61650 + }, + { + "epoch": 4.1890881913303435, + "grad_norm": 0.2709871828556061, + "learning_rate": 4.765932871314037e-06, + "loss": 3.9779, + "step": 61655 + }, + { + "epoch": 4.189427911401006, + "grad_norm": 0.3058035373687744, + "learning_rate": 4.76550822122571e-06, + "loss": 3.8849, + "step": 61660 + }, + { + "epoch": 4.189767631471668, + "grad_norm": 0.2834019362926483, + "learning_rate": 4.765083571137383e-06, + "loss": 4.0372, + "step": 61665 + }, + { + "epoch": 4.190107351542329, + "grad_norm": 0.29736092686653137, + "learning_rate": 4.764658921049056e-06, + "loss": 3.89, + "step": 61670 + }, + { + "epoch": 4.190447071612991, + "grad_norm": 0.27016592025756836, + "learning_rate": 4.7642342709607286e-06, + "loss": 3.9654, + "step": 61675 + }, + { + "epoch": 4.190786791683653, + "grad_norm": 0.3172113597393036, + "learning_rate": 4.763809620872401e-06, + "loss": 3.7997, + "step": 61680 + }, + { + "epoch": 4.191126511754314, + "grad_norm": 0.35189205408096313, + "learning_rate": 4.763384970784074e-06, + "loss": 3.9114, + "step": 61685 + }, + { + "epoch": 4.191466231824976, + "grad_norm": 0.2766048312187195, + "learning_rate": 4.762960320695747e-06, + "loss": 4.0506, + "step": 61690 + }, + { + "epoch": 4.191805951895638, + "grad_norm": 0.2292732149362564, + "learning_rate": 4.76253567060742e-06, + "loss": 4.1151, + "step": 61695 + }, + { + "epoch": 4.1921456719662995, + "grad_norm": 0.2773444354534149, + "learning_rate": 4.7621110205190926e-06, + "loss": 4.0186, + "step": 61700 + }, + { + "epoch": 4.192485392036962, + "grad_norm": 0.23008139431476593, + "learning_rate": 4.761686370430765e-06, + "loss": 3.8579, + "step": 61705 + }, + { + "epoch": 4.192825112107624, + "grad_norm": 0.35259953141212463, + "learning_rate": 4.761261720342438e-06, + "loss": 3.8364, + "step": 61710 + }, + { + "epoch": 4.193164832178285, + "grad_norm": 0.2657912075519562, + "learning_rate": 4.760837070254111e-06, + "loss": 3.7045, + "step": 61715 + }, + { + "epoch": 4.193504552248947, + "grad_norm": 0.2245931476354599, + "learning_rate": 4.760412420165784e-06, + "loss": 3.8231, + "step": 61720 + }, + { + "epoch": 4.193844272319609, + "grad_norm": 0.24916046857833862, + "learning_rate": 4.7599877700774566e-06, + "loss": 4.0717, + "step": 61725 + }, + { + "epoch": 4.19418399239027, + "grad_norm": 0.22046999633312225, + "learning_rate": 4.759563119989129e-06, + "loss": 4.0079, + "step": 61730 + }, + { + "epoch": 4.194523712460932, + "grad_norm": 0.26874732971191406, + "learning_rate": 4.759138469900802e-06, + "loss": 4.0648, + "step": 61735 + }, + { + "epoch": 4.194863432531594, + "grad_norm": 0.308004230260849, + "learning_rate": 4.758713819812475e-06, + "loss": 3.9171, + "step": 61740 + }, + { + "epoch": 4.1952031526022555, + "grad_norm": 0.4370112419128418, + "learning_rate": 4.758289169724148e-06, + "loss": 3.9094, + "step": 61745 + }, + { + "epoch": 4.195542872672918, + "grad_norm": 0.22966526448726654, + "learning_rate": 4.7578645196358206e-06, + "loss": 4.0695, + "step": 61750 + }, + { + "epoch": 4.19588259274358, + "grad_norm": 0.26485374569892883, + "learning_rate": 4.757439869547493e-06, + "loss": 4.0239, + "step": 61755 + }, + { + "epoch": 4.196222312814241, + "grad_norm": 0.37822362780570984, + "learning_rate": 4.757015219459166e-06, + "loss": 4.0042, + "step": 61760 + }, + { + "epoch": 4.196562032884903, + "grad_norm": 0.2277255654335022, + "learning_rate": 4.756590569370839e-06, + "loss": 4.0088, + "step": 61765 + }, + { + "epoch": 4.196901752955565, + "grad_norm": 0.38899242877960205, + "learning_rate": 4.756165919282512e-06, + "loss": 4.0962, + "step": 61770 + }, + { + "epoch": 4.197241473026226, + "grad_norm": 0.2686842978000641, + "learning_rate": 4.7557412691941846e-06, + "loss": 3.9118, + "step": 61775 + }, + { + "epoch": 4.197581193096888, + "grad_norm": 0.287179559469223, + "learning_rate": 4.755316619105857e-06, + "loss": 3.9236, + "step": 61780 + }, + { + "epoch": 4.19792091316755, + "grad_norm": 0.35689353942871094, + "learning_rate": 4.75489196901753e-06, + "loss": 3.9539, + "step": 61785 + }, + { + "epoch": 4.1982606332382115, + "grad_norm": 0.2117982655763626, + "learning_rate": 4.754467318929203e-06, + "loss": 4.0086, + "step": 61790 + }, + { + "epoch": 4.198600353308874, + "grad_norm": 0.2443109005689621, + "learning_rate": 4.754042668840875e-06, + "loss": 3.7861, + "step": 61795 + }, + { + "epoch": 4.198940073379536, + "grad_norm": 0.3792378306388855, + "learning_rate": 4.7536180187525486e-06, + "loss": 4.0797, + "step": 61800 + }, + { + "epoch": 4.199279793450197, + "grad_norm": 0.6805964708328247, + "learning_rate": 4.753193368664221e-06, + "loss": 4.2509, + "step": 61805 + }, + { + "epoch": 4.199619513520859, + "grad_norm": 0.2114548534154892, + "learning_rate": 4.752768718575893e-06, + "loss": 4.0764, + "step": 61810 + }, + { + "epoch": 4.199959233591521, + "grad_norm": 0.23994873464107513, + "learning_rate": 4.752344068487567e-06, + "loss": 4.0621, + "step": 61815 + }, + { + "epoch": 4.200298953662182, + "grad_norm": 0.3434309661388397, + "learning_rate": 4.75191941839924e-06, + "loss": 4.0364, + "step": 61820 + }, + { + "epoch": 4.200638673732844, + "grad_norm": 0.39991897344589233, + "learning_rate": 4.751494768310912e-06, + "loss": 3.9254, + "step": 61825 + }, + { + "epoch": 4.200978393803506, + "grad_norm": 0.22867533564567566, + "learning_rate": 4.7510701182225845e-06, + "loss": 4.1461, + "step": 61830 + }, + { + "epoch": 4.2013181138741675, + "grad_norm": 0.427590012550354, + "learning_rate": 4.750645468134258e-06, + "loss": 4.0162, + "step": 61835 + }, + { + "epoch": 4.20165783394483, + "grad_norm": 0.2638128995895386, + "learning_rate": 4.75022081804593e-06, + "loss": 3.9086, + "step": 61840 + }, + { + "epoch": 4.201997554015492, + "grad_norm": 0.6457057595252991, + "learning_rate": 4.749796167957603e-06, + "loss": 3.9285, + "step": 61845 + }, + { + "epoch": 4.202337274086153, + "grad_norm": 0.26894411444664, + "learning_rate": 4.749371517869277e-06, + "loss": 4.0918, + "step": 61850 + }, + { + "epoch": 4.202676994156815, + "grad_norm": 0.3668384552001953, + "learning_rate": 4.748946867780949e-06, + "loss": 3.9944, + "step": 61855 + }, + { + "epoch": 4.203016714227476, + "grad_norm": 0.2917743921279907, + "learning_rate": 4.748522217692621e-06, + "loss": 3.7965, + "step": 61860 + }, + { + "epoch": 4.203356434298138, + "grad_norm": 0.5452884435653687, + "learning_rate": 4.748097567604294e-06, + "loss": 3.8321, + "step": 61865 + }, + { + "epoch": 4.2036961543688, + "grad_norm": 0.3022116422653198, + "learning_rate": 4.747672917515968e-06, + "loss": 3.9763, + "step": 61870 + }, + { + "epoch": 4.2040358744394615, + "grad_norm": 0.44855380058288574, + "learning_rate": 4.74724826742764e-06, + "loss": 3.7799, + "step": 61875 + }, + { + "epoch": 4.2043755945101235, + "grad_norm": 0.2275865375995636, + "learning_rate": 4.7468236173393125e-06, + "loss": 4.2055, + "step": 61880 + }, + { + "epoch": 4.204715314580786, + "grad_norm": 0.27329379320144653, + "learning_rate": 4.746398967250986e-06, + "loss": 3.8728, + "step": 61885 + }, + { + "epoch": 4.205055034651447, + "grad_norm": 0.2786937355995178, + "learning_rate": 4.745974317162658e-06, + "loss": 4.0093, + "step": 61890 + }, + { + "epoch": 4.205394754722109, + "grad_norm": 0.26116180419921875, + "learning_rate": 4.745549667074331e-06, + "loss": 4.0579, + "step": 61895 + }, + { + "epoch": 4.205734474792771, + "grad_norm": 0.3717663586139679, + "learning_rate": 4.745125016986004e-06, + "loss": 4.1624, + "step": 61900 + }, + { + "epoch": 4.206074194863432, + "grad_norm": 0.3115982711315155, + "learning_rate": 4.7447003668976765e-06, + "loss": 3.8656, + "step": 61905 + }, + { + "epoch": 4.206413914934094, + "grad_norm": 0.29033178091049194, + "learning_rate": 4.744275716809349e-06, + "loss": 3.907, + "step": 61910 + }, + { + "epoch": 4.206753635004756, + "grad_norm": 0.2151937484741211, + "learning_rate": 4.743851066721022e-06, + "loss": 3.9532, + "step": 61915 + }, + { + "epoch": 4.2070933550754175, + "grad_norm": 0.3847287893295288, + "learning_rate": 4.743426416632695e-06, + "loss": 4.197, + "step": 61920 + }, + { + "epoch": 4.20743307514608, + "grad_norm": 0.21844647824764252, + "learning_rate": 4.743001766544368e-06, + "loss": 3.8418, + "step": 61925 + }, + { + "epoch": 4.207772795216742, + "grad_norm": 0.313582181930542, + "learning_rate": 4.7425771164560405e-06, + "loss": 3.939, + "step": 61930 + }, + { + "epoch": 4.208112515287403, + "grad_norm": 0.24625509977340698, + "learning_rate": 4.742152466367713e-06, + "loss": 3.9326, + "step": 61935 + }, + { + "epoch": 4.208452235358065, + "grad_norm": 0.300951212644577, + "learning_rate": 4.741727816279386e-06, + "loss": 4.0134, + "step": 61940 + }, + { + "epoch": 4.208791955428727, + "grad_norm": 0.2154758870601654, + "learning_rate": 4.741303166191059e-06, + "loss": 3.8476, + "step": 61945 + }, + { + "epoch": 4.209131675499388, + "grad_norm": 0.30585959553718567, + "learning_rate": 4.740878516102732e-06, + "loss": 4.1356, + "step": 61950 + }, + { + "epoch": 4.20947139557005, + "grad_norm": 0.28806769847869873, + "learning_rate": 4.7404538660144046e-06, + "loss": 4.0632, + "step": 61955 + }, + { + "epoch": 4.209811115640712, + "grad_norm": 0.3668673038482666, + "learning_rate": 4.740029215926077e-06, + "loss": 4.0072, + "step": 61960 + }, + { + "epoch": 4.2101508357113735, + "grad_norm": 0.587092399597168, + "learning_rate": 4.73960456583775e-06, + "loss": 4.1295, + "step": 61965 + }, + { + "epoch": 4.210490555782036, + "grad_norm": 0.2449694126844406, + "learning_rate": 4.739179915749423e-06, + "loss": 3.824, + "step": 61970 + }, + { + "epoch": 4.210830275852698, + "grad_norm": 0.23761706054210663, + "learning_rate": 4.738755265661096e-06, + "loss": 3.9292, + "step": 61975 + }, + { + "epoch": 4.211169995923359, + "grad_norm": 0.3283363878726959, + "learning_rate": 4.7383306155727686e-06, + "loss": 3.9321, + "step": 61980 + }, + { + "epoch": 4.211509715994021, + "grad_norm": 0.2631269693374634, + "learning_rate": 4.737905965484441e-06, + "loss": 3.9154, + "step": 61985 + }, + { + "epoch": 4.211849436064683, + "grad_norm": 0.2708934545516968, + "learning_rate": 4.737481315396114e-06, + "loss": 3.9682, + "step": 61990 + }, + { + "epoch": 4.212189156135344, + "grad_norm": 0.32069215178489685, + "learning_rate": 4.737056665307786e-06, + "loss": 3.897, + "step": 61995 + }, + { + "epoch": 4.212528876206006, + "grad_norm": 0.22667308151721954, + "learning_rate": 4.73663201521946e-06, + "loss": 3.945, + "step": 62000 + }, + { + "epoch": 4.212868596276668, + "grad_norm": 0.32895252108573914, + "learning_rate": 4.7362073651311326e-06, + "loss": 4.0396, + "step": 62005 + }, + { + "epoch": 4.2132083163473295, + "grad_norm": 0.36378517746925354, + "learning_rate": 4.7357827150428045e-06, + "loss": 3.8681, + "step": 62010 + }, + { + "epoch": 4.213548036417992, + "grad_norm": 0.3344714343547821, + "learning_rate": 4.735358064954478e-06, + "loss": 3.9837, + "step": 62015 + }, + { + "epoch": 4.213887756488654, + "grad_norm": 0.2954826354980469, + "learning_rate": 4.734933414866151e-06, + "loss": 4.0922, + "step": 62020 + }, + { + "epoch": 4.214227476559315, + "grad_norm": 0.36688923835754395, + "learning_rate": 4.734508764777824e-06, + "loss": 3.9387, + "step": 62025 + }, + { + "epoch": 4.214567196629977, + "grad_norm": 0.2364802360534668, + "learning_rate": 4.734084114689496e-06, + "loss": 3.9374, + "step": 62030 + }, + { + "epoch": 4.214906916700639, + "grad_norm": 0.4526437819004059, + "learning_rate": 4.733659464601169e-06, + "loss": 4.1187, + "step": 62035 + }, + { + "epoch": 4.2152466367713, + "grad_norm": 0.4148790240287781, + "learning_rate": 4.733234814512842e-06, + "loss": 3.7027, + "step": 62040 + }, + { + "epoch": 4.215586356841962, + "grad_norm": 0.2519848644733429, + "learning_rate": 4.732810164424514e-06, + "loss": 4.1097, + "step": 62045 + }, + { + "epoch": 4.215926076912624, + "grad_norm": 0.22533322870731354, + "learning_rate": 4.732385514336188e-06, + "loss": 3.8373, + "step": 62050 + }, + { + "epoch": 4.2162657969832855, + "grad_norm": 0.30786067247390747, + "learning_rate": 4.7319608642478606e-06, + "loss": 3.7687, + "step": 62055 + }, + { + "epoch": 4.216605517053948, + "grad_norm": 0.46626850962638855, + "learning_rate": 4.7315362141595325e-06, + "loss": 4.0603, + "step": 62060 + }, + { + "epoch": 4.21694523712461, + "grad_norm": 0.21773815155029297, + "learning_rate": 4.731111564071206e-06, + "loss": 3.8644, + "step": 62065 + }, + { + "epoch": 4.217284957195271, + "grad_norm": 0.29689332842826843, + "learning_rate": 4.730686913982879e-06, + "loss": 4.0378, + "step": 62070 + }, + { + "epoch": 4.217624677265933, + "grad_norm": 0.24213480949401855, + "learning_rate": 4.730262263894551e-06, + "loss": 4.0189, + "step": 62075 + }, + { + "epoch": 4.217964397336595, + "grad_norm": 0.2503039538860321, + "learning_rate": 4.729837613806224e-06, + "loss": 3.8925, + "step": 62080 + }, + { + "epoch": 4.218304117407256, + "grad_norm": 0.30794891715049744, + "learning_rate": 4.729412963717897e-06, + "loss": 4.1138, + "step": 62085 + }, + { + "epoch": 4.218643837477918, + "grad_norm": 0.31214892864227295, + "learning_rate": 4.728988313629569e-06, + "loss": 4.0181, + "step": 62090 + }, + { + "epoch": 4.21898355754858, + "grad_norm": 0.2870534062385559, + "learning_rate": 4.728563663541242e-06, + "loss": 4.0129, + "step": 62095 + }, + { + "epoch": 4.2193232776192415, + "grad_norm": 0.3330370783805847, + "learning_rate": 4.728139013452916e-06, + "loss": 4.0042, + "step": 62100 + }, + { + "epoch": 4.219662997689904, + "grad_norm": 0.26372846961021423, + "learning_rate": 4.727714363364588e-06, + "loss": 4.0935, + "step": 62105 + }, + { + "epoch": 4.220002717760566, + "grad_norm": 0.3120616674423218, + "learning_rate": 4.7272897132762605e-06, + "loss": 3.9422, + "step": 62110 + }, + { + "epoch": 4.220342437831227, + "grad_norm": 0.35629498958587646, + "learning_rate": 4.726865063187933e-06, + "loss": 3.8487, + "step": 62115 + }, + { + "epoch": 4.220682157901889, + "grad_norm": 0.4380857050418854, + "learning_rate": 4.726440413099606e-06, + "loss": 3.9759, + "step": 62120 + }, + { + "epoch": 4.221021877972551, + "grad_norm": 0.39315855503082275, + "learning_rate": 4.726015763011279e-06, + "loss": 4.0453, + "step": 62125 + }, + { + "epoch": 4.221361598043212, + "grad_norm": 0.30096665024757385, + "learning_rate": 4.725591112922952e-06, + "loss": 3.9424, + "step": 62130 + }, + { + "epoch": 4.221701318113874, + "grad_norm": 0.2714759111404419, + "learning_rate": 4.7251664628346245e-06, + "loss": 3.7757, + "step": 62135 + }, + { + "epoch": 4.222041038184536, + "grad_norm": 0.20509923994541168, + "learning_rate": 4.724741812746297e-06, + "loss": 3.8464, + "step": 62140 + }, + { + "epoch": 4.2223807582551975, + "grad_norm": 0.24592168629169464, + "learning_rate": 4.72431716265797e-06, + "loss": 4.0305, + "step": 62145 + }, + { + "epoch": 4.22272047832586, + "grad_norm": 0.24817851185798645, + "learning_rate": 4.723892512569643e-06, + "loss": 4.1176, + "step": 62150 + }, + { + "epoch": 4.223060198396522, + "grad_norm": 0.4028327465057373, + "learning_rate": 4.723467862481316e-06, + "loss": 4.0514, + "step": 62155 + }, + { + "epoch": 4.223399918467183, + "grad_norm": 0.334012508392334, + "learning_rate": 4.7230432123929885e-06, + "loss": 3.908, + "step": 62160 + }, + { + "epoch": 4.223739638537845, + "grad_norm": 0.2635384202003479, + "learning_rate": 4.722618562304661e-06, + "loss": 3.8972, + "step": 62165 + }, + { + "epoch": 4.224079358608506, + "grad_norm": 0.29548269510269165, + "learning_rate": 4.722193912216334e-06, + "loss": 3.8376, + "step": 62170 + }, + { + "epoch": 4.224419078679168, + "grad_norm": 0.22581987082958221, + "learning_rate": 4.721769262128007e-06, + "loss": 3.82, + "step": 62175 + }, + { + "epoch": 4.22475879874983, + "grad_norm": 0.28981661796569824, + "learning_rate": 4.72134461203968e-06, + "loss": 4.1969, + "step": 62180 + }, + { + "epoch": 4.2250985188204915, + "grad_norm": 0.3015347123146057, + "learning_rate": 4.7209199619513525e-06, + "loss": 3.816, + "step": 62185 + }, + { + "epoch": 4.2254382388911536, + "grad_norm": 0.3394865095615387, + "learning_rate": 4.720495311863025e-06, + "loss": 4.0969, + "step": 62190 + }, + { + "epoch": 4.225777958961816, + "grad_norm": 0.25478121638298035, + "learning_rate": 4.720070661774698e-06, + "loss": 3.8352, + "step": 62195 + }, + { + "epoch": 4.226117679032477, + "grad_norm": 0.29039254784584045, + "learning_rate": 4.719646011686371e-06, + "loss": 4.1208, + "step": 62200 + }, + { + "epoch": 4.226457399103139, + "grad_norm": 0.30992835760116577, + "learning_rate": 4.719221361598044e-06, + "loss": 3.9327, + "step": 62205 + }, + { + "epoch": 4.226797119173801, + "grad_norm": 0.25246328115463257, + "learning_rate": 4.7187967115097165e-06, + "loss": 3.8728, + "step": 62210 + }, + { + "epoch": 4.227136839244462, + "grad_norm": 0.28148651123046875, + "learning_rate": 4.718372061421389e-06, + "loss": 3.7562, + "step": 62215 + }, + { + "epoch": 4.227476559315124, + "grad_norm": 0.25969359278678894, + "learning_rate": 4.717947411333062e-06, + "loss": 3.8819, + "step": 62220 + }, + { + "epoch": 4.227816279385786, + "grad_norm": 0.27189895510673523, + "learning_rate": 4.717522761244735e-06, + "loss": 3.8611, + "step": 62225 + }, + { + "epoch": 4.2281559994564475, + "grad_norm": 0.3546859920024872, + "learning_rate": 4.717098111156408e-06, + "loss": 3.9547, + "step": 62230 + }, + { + "epoch": 4.22849571952711, + "grad_norm": 0.3086266815662384, + "learning_rate": 4.7166734610680805e-06, + "loss": 4.0909, + "step": 62235 + }, + { + "epoch": 4.228835439597772, + "grad_norm": 0.285331666469574, + "learning_rate": 4.716248810979753e-06, + "loss": 3.9444, + "step": 62240 + }, + { + "epoch": 4.229175159668433, + "grad_norm": 0.3458479642868042, + "learning_rate": 4.715824160891425e-06, + "loss": 4.3804, + "step": 62245 + }, + { + "epoch": 4.229514879739095, + "grad_norm": 0.2442927360534668, + "learning_rate": 4.715399510803099e-06, + "loss": 3.9082, + "step": 62250 + }, + { + "epoch": 4.229854599809757, + "grad_norm": 0.27794626355171204, + "learning_rate": 4.714974860714772e-06, + "loss": 3.8101, + "step": 62255 + }, + { + "epoch": 4.230194319880418, + "grad_norm": 0.19355061650276184, + "learning_rate": 4.714550210626444e-06, + "loss": 4.0978, + "step": 62260 + }, + { + "epoch": 4.23053403995108, + "grad_norm": 0.28758642077445984, + "learning_rate": 4.714125560538117e-06, + "loss": 3.9196, + "step": 62265 + }, + { + "epoch": 4.230873760021742, + "grad_norm": 0.4705720543861389, + "learning_rate": 4.71370091044979e-06, + "loss": 3.8866, + "step": 62270 + }, + { + "epoch": 4.2312134800924035, + "grad_norm": 0.258676677942276, + "learning_rate": 4.713276260361462e-06, + "loss": 4.0532, + "step": 62275 + }, + { + "epoch": 4.231553200163066, + "grad_norm": 0.2584574520587921, + "learning_rate": 4.712851610273135e-06, + "loss": 4.1723, + "step": 62280 + }, + { + "epoch": 4.231892920233728, + "grad_norm": 0.21824003756046295, + "learning_rate": 4.7124269601848086e-06, + "loss": 4.1809, + "step": 62285 + }, + { + "epoch": 4.232232640304389, + "grad_norm": 0.308216392993927, + "learning_rate": 4.7120023100964805e-06, + "loss": 4.0071, + "step": 62290 + }, + { + "epoch": 4.232572360375051, + "grad_norm": 0.2602505087852478, + "learning_rate": 4.711577660008153e-06, + "loss": 4.18, + "step": 62295 + }, + { + "epoch": 4.232912080445713, + "grad_norm": 0.25529971718788147, + "learning_rate": 4.711153009919827e-06, + "loss": 3.902, + "step": 62300 + }, + { + "epoch": 4.233251800516374, + "grad_norm": 0.3194541037082672, + "learning_rate": 4.710728359831499e-06, + "loss": 3.7595, + "step": 62305 + }, + { + "epoch": 4.233591520587036, + "grad_norm": 0.30062204599380493, + "learning_rate": 4.710303709743172e-06, + "loss": 3.6894, + "step": 62310 + }, + { + "epoch": 4.233931240657698, + "grad_norm": 0.22748476266860962, + "learning_rate": 4.7098790596548445e-06, + "loss": 4.1143, + "step": 62315 + }, + { + "epoch": 4.2342709607283595, + "grad_norm": 0.27099764347076416, + "learning_rate": 4.709454409566517e-06, + "loss": 3.7817, + "step": 62320 + }, + { + "epoch": 4.234610680799022, + "grad_norm": 0.59798663854599, + "learning_rate": 4.70902975947819e-06, + "loss": 3.8952, + "step": 62325 + }, + { + "epoch": 4.234950400869684, + "grad_norm": 0.2760470509529114, + "learning_rate": 4.708605109389863e-06, + "loss": 3.8741, + "step": 62330 + }, + { + "epoch": 4.235290120940345, + "grad_norm": 0.2257012277841568, + "learning_rate": 4.708180459301536e-06, + "loss": 3.9853, + "step": 62335 + }, + { + "epoch": 4.235629841011007, + "grad_norm": 0.28400811553001404, + "learning_rate": 4.7077558092132085e-06, + "loss": 3.9323, + "step": 62340 + }, + { + "epoch": 4.235969561081669, + "grad_norm": 0.3457925021648407, + "learning_rate": 4.707331159124881e-06, + "loss": 3.7167, + "step": 62345 + }, + { + "epoch": 4.23630928115233, + "grad_norm": 0.37262842059135437, + "learning_rate": 4.706906509036554e-06, + "loss": 3.8801, + "step": 62350 + }, + { + "epoch": 4.236649001222992, + "grad_norm": 0.23488013446331024, + "learning_rate": 4.706481858948227e-06, + "loss": 3.8366, + "step": 62355 + }, + { + "epoch": 4.236988721293654, + "grad_norm": 0.292077898979187, + "learning_rate": 4.7060572088599e-06, + "loss": 3.9516, + "step": 62360 + }, + { + "epoch": 4.2373284413643155, + "grad_norm": 0.20641033351421356, + "learning_rate": 4.7056325587715725e-06, + "loss": 3.9385, + "step": 62365 + }, + { + "epoch": 4.237668161434978, + "grad_norm": 0.31059786677360535, + "learning_rate": 4.705207908683245e-06, + "loss": 3.8337, + "step": 62370 + }, + { + "epoch": 4.23800788150564, + "grad_norm": 0.2635635435581207, + "learning_rate": 4.704783258594918e-06, + "loss": 3.8687, + "step": 62375 + }, + { + "epoch": 4.238347601576301, + "grad_norm": 0.24110375344753265, + "learning_rate": 4.704358608506591e-06, + "loss": 3.9516, + "step": 62380 + }, + { + "epoch": 4.238687321646963, + "grad_norm": 0.27785784006118774, + "learning_rate": 4.703933958418264e-06, + "loss": 3.9243, + "step": 62385 + }, + { + "epoch": 4.239027041717625, + "grad_norm": 0.24170297384262085, + "learning_rate": 4.7035093083299365e-06, + "loss": 4.0217, + "step": 62390 + }, + { + "epoch": 4.239366761788286, + "grad_norm": 0.2923421263694763, + "learning_rate": 4.703084658241609e-06, + "loss": 4.1065, + "step": 62395 + }, + { + "epoch": 4.239706481858948, + "grad_norm": 0.28189030289649963, + "learning_rate": 4.702660008153282e-06, + "loss": 3.8188, + "step": 62400 + }, + { + "epoch": 4.24004620192961, + "grad_norm": 0.31266576051712036, + "learning_rate": 4.702235358064955e-06, + "loss": 4.0461, + "step": 62405 + }, + { + "epoch": 4.2403859220002715, + "grad_norm": 0.29111313819885254, + "learning_rate": 4.701810707976628e-06, + "loss": 4.0306, + "step": 62410 + }, + { + "epoch": 4.240725642070934, + "grad_norm": 0.3071993887424469, + "learning_rate": 4.7013860578883005e-06, + "loss": 4.0735, + "step": 62415 + }, + { + "epoch": 4.241065362141596, + "grad_norm": 0.22765718400478363, + "learning_rate": 4.700961407799973e-06, + "loss": 4.0227, + "step": 62420 + }, + { + "epoch": 4.241405082212257, + "grad_norm": 0.32536599040031433, + "learning_rate": 4.700536757711646e-06, + "loss": 3.9288, + "step": 62425 + }, + { + "epoch": 4.241744802282919, + "grad_norm": 0.24402070045471191, + "learning_rate": 4.700112107623319e-06, + "loss": 3.7732, + "step": 62430 + }, + { + "epoch": 4.242084522353581, + "grad_norm": 0.2753327190876007, + "learning_rate": 4.699687457534992e-06, + "loss": 3.7918, + "step": 62435 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.26400089263916016, + "learning_rate": 4.6992628074466645e-06, + "loss": 4.0668, + "step": 62440 + }, + { + "epoch": 4.242763962494904, + "grad_norm": 0.2681955099105835, + "learning_rate": 4.698838157358337e-06, + "loss": 4.1253, + "step": 62445 + }, + { + "epoch": 4.243103682565566, + "grad_norm": 0.25390172004699707, + "learning_rate": 4.69841350727001e-06, + "loss": 3.9097, + "step": 62450 + }, + { + "epoch": 4.2434434026362275, + "grad_norm": 0.2573413550853729, + "learning_rate": 4.697988857181683e-06, + "loss": 3.9256, + "step": 62455 + }, + { + "epoch": 4.24378312270689, + "grad_norm": 0.19899789988994598, + "learning_rate": 4.697564207093355e-06, + "loss": 3.9797, + "step": 62460 + }, + { + "epoch": 4.244122842777552, + "grad_norm": 0.24142064154148102, + "learning_rate": 4.6971395570050285e-06, + "loss": 3.9709, + "step": 62465 + }, + { + "epoch": 4.244462562848213, + "grad_norm": 0.29907235503196716, + "learning_rate": 4.696714906916701e-06, + "loss": 3.7467, + "step": 62470 + }, + { + "epoch": 4.244802282918875, + "grad_norm": 0.2352980524301529, + "learning_rate": 4.696290256828373e-06, + "loss": 3.8914, + "step": 62475 + }, + { + "epoch": 4.245142002989537, + "grad_norm": 0.23224914073944092, + "learning_rate": 4.695865606740047e-06, + "loss": 3.8986, + "step": 62480 + }, + { + "epoch": 4.245481723060198, + "grad_norm": 0.2672562003135681, + "learning_rate": 4.69544095665172e-06, + "loss": 3.9093, + "step": 62485 + }, + { + "epoch": 4.24582144313086, + "grad_norm": 0.37752217054367065, + "learning_rate": 4.695016306563392e-06, + "loss": 3.7179, + "step": 62490 + }, + { + "epoch": 4.246161163201522, + "grad_norm": 0.27638471126556396, + "learning_rate": 4.6945916564750645e-06, + "loss": 4.0945, + "step": 62495 + }, + { + "epoch": 4.246500883272184, + "grad_norm": 0.33243823051452637, + "learning_rate": 4.694167006386738e-06, + "loss": 3.8923, + "step": 62500 + }, + { + "epoch": 4.246840603342846, + "grad_norm": 0.2788658142089844, + "learning_rate": 4.69374235629841e-06, + "loss": 3.8657, + "step": 62505 + }, + { + "epoch": 4.247180323413508, + "grad_norm": 0.2874317765235901, + "learning_rate": 4.693317706210083e-06, + "loss": 3.9947, + "step": 62510 + }, + { + "epoch": 4.247520043484169, + "grad_norm": 0.35969603061676025, + "learning_rate": 4.6928930561217565e-06, + "loss": 4.1362, + "step": 62515 + }, + { + "epoch": 4.247859763554831, + "grad_norm": 0.41946104168891907, + "learning_rate": 4.6924684060334285e-06, + "loss": 3.6856, + "step": 62520 + }, + { + "epoch": 4.248199483625493, + "grad_norm": 0.2401193082332611, + "learning_rate": 4.692043755945101e-06, + "loss": 3.8164, + "step": 62525 + }, + { + "epoch": 4.248539203696154, + "grad_norm": 0.23658475279808044, + "learning_rate": 4.691619105856774e-06, + "loss": 3.7434, + "step": 62530 + }, + { + "epoch": 4.248878923766816, + "grad_norm": 0.2444046586751938, + "learning_rate": 4.691194455768448e-06, + "loss": 3.8198, + "step": 62535 + }, + { + "epoch": 4.2492186438374775, + "grad_norm": 0.3322620093822479, + "learning_rate": 4.69076980568012e-06, + "loss": 4.0856, + "step": 62540 + }, + { + "epoch": 4.24955836390814, + "grad_norm": 0.24767161905765533, + "learning_rate": 4.6903451555917925e-06, + "loss": 3.9469, + "step": 62545 + }, + { + "epoch": 4.249898083978802, + "grad_norm": 0.2666094899177551, + "learning_rate": 4.689920505503466e-06, + "loss": 3.9666, + "step": 62550 + }, + { + "epoch": 4.250237804049463, + "grad_norm": 0.25454530119895935, + "learning_rate": 4.689495855415138e-06, + "loss": 3.7363, + "step": 62555 + }, + { + "epoch": 4.250577524120125, + "grad_norm": 0.22677429020404816, + "learning_rate": 4.689071205326811e-06, + "loss": 3.6037, + "step": 62560 + }, + { + "epoch": 4.250917244190787, + "grad_norm": 1.0874582529067993, + "learning_rate": 4.688646555238484e-06, + "loss": 4.0017, + "step": 62565 + }, + { + "epoch": 4.251256964261448, + "grad_norm": 0.28233104944229126, + "learning_rate": 4.6882219051501565e-06, + "loss": 4.0091, + "step": 62570 + }, + { + "epoch": 4.25159668433211, + "grad_norm": 0.3064937889575958, + "learning_rate": 4.687797255061829e-06, + "loss": 3.9595, + "step": 62575 + }, + { + "epoch": 4.251936404402772, + "grad_norm": 0.26705774664878845, + "learning_rate": 4.687372604973502e-06, + "loss": 3.8106, + "step": 62580 + }, + { + "epoch": 4.2522761244734335, + "grad_norm": 0.2868309020996094, + "learning_rate": 4.686947954885175e-06, + "loss": 4.1321, + "step": 62585 + }, + { + "epoch": 4.252615844544096, + "grad_norm": 0.30194491147994995, + "learning_rate": 4.686523304796848e-06, + "loss": 4.0166, + "step": 62590 + }, + { + "epoch": 4.252955564614758, + "grad_norm": 0.23087137937545776, + "learning_rate": 4.6860986547085205e-06, + "loss": 3.8179, + "step": 62595 + }, + { + "epoch": 4.253295284685419, + "grad_norm": 0.30199751257896423, + "learning_rate": 4.685674004620193e-06, + "loss": 3.7737, + "step": 62600 + }, + { + "epoch": 4.253635004756081, + "grad_norm": 0.28365933895111084, + "learning_rate": 4.685249354531866e-06, + "loss": 3.8225, + "step": 62605 + }, + { + "epoch": 4.253974724826743, + "grad_norm": 0.3838574290275574, + "learning_rate": 4.684824704443539e-06, + "loss": 3.9055, + "step": 62610 + }, + { + "epoch": 4.254314444897404, + "grad_norm": 0.2689755856990814, + "learning_rate": 4.684400054355212e-06, + "loss": 3.8949, + "step": 62615 + }, + { + "epoch": 4.254654164968066, + "grad_norm": 0.2879083752632141, + "learning_rate": 4.6839754042668845e-06, + "loss": 4.0131, + "step": 62620 + }, + { + "epoch": 4.254993885038728, + "grad_norm": 0.2575042247772217, + "learning_rate": 4.683550754178557e-06, + "loss": 3.9413, + "step": 62625 + }, + { + "epoch": 4.2553336051093895, + "grad_norm": 0.24114938080310822, + "learning_rate": 4.68312610409023e-06, + "loss": 4.0243, + "step": 62630 + }, + { + "epoch": 4.255673325180052, + "grad_norm": 0.36759334802627563, + "learning_rate": 4.682701454001903e-06, + "loss": 4.0112, + "step": 62635 + }, + { + "epoch": 4.256013045250714, + "grad_norm": 0.4304599463939667, + "learning_rate": 4.682276803913576e-06, + "loss": 3.8691, + "step": 62640 + }, + { + "epoch": 4.256352765321375, + "grad_norm": 0.2402145117521286, + "learning_rate": 4.6818521538252485e-06, + "loss": 4.0154, + "step": 62645 + }, + { + "epoch": 4.256692485392037, + "grad_norm": 0.3580593466758728, + "learning_rate": 4.681427503736921e-06, + "loss": 3.9082, + "step": 62650 + }, + { + "epoch": 4.257032205462699, + "grad_norm": 0.26154834032058716, + "learning_rate": 4.681002853648594e-06, + "loss": 3.9095, + "step": 62655 + }, + { + "epoch": 4.25737192553336, + "grad_norm": 0.2250284105539322, + "learning_rate": 4.680578203560266e-06, + "loss": 3.6855, + "step": 62660 + }, + { + "epoch": 4.257711645604022, + "grad_norm": 0.24295179545879364, + "learning_rate": 4.68015355347194e-06, + "loss": 3.8943, + "step": 62665 + }, + { + "epoch": 4.258051365674684, + "grad_norm": 0.2793780565261841, + "learning_rate": 4.6797289033836125e-06, + "loss": 4.116, + "step": 62670 + }, + { + "epoch": 4.2583910857453455, + "grad_norm": 0.29992952942848206, + "learning_rate": 4.6793042532952845e-06, + "loss": 3.8909, + "step": 62675 + }, + { + "epoch": 4.258730805816008, + "grad_norm": 0.43982750177383423, + "learning_rate": 4.678879603206958e-06, + "loss": 4.0862, + "step": 62680 + }, + { + "epoch": 4.25907052588667, + "grad_norm": 0.3261341154575348, + "learning_rate": 4.678454953118631e-06, + "loss": 3.9119, + "step": 62685 + }, + { + "epoch": 4.259410245957331, + "grad_norm": 0.33805981278419495, + "learning_rate": 4.678030303030303e-06, + "loss": 3.8811, + "step": 62690 + }, + { + "epoch": 4.259749966027993, + "grad_norm": 0.23797844350337982, + "learning_rate": 4.6776056529419765e-06, + "loss": 3.8628, + "step": 62695 + }, + { + "epoch": 4.260089686098655, + "grad_norm": 0.257056325674057, + "learning_rate": 4.677181002853649e-06, + "loss": 4.1052, + "step": 62700 + }, + { + "epoch": 4.260429406169316, + "grad_norm": 0.4837534725666046, + "learning_rate": 4.676756352765322e-06, + "loss": 4.0725, + "step": 62705 + }, + { + "epoch": 4.260769126239978, + "grad_norm": 0.3102591335773468, + "learning_rate": 4.676331702676994e-06, + "loss": 4.0492, + "step": 62710 + }, + { + "epoch": 4.26110884631064, + "grad_norm": 0.24818503856658936, + "learning_rate": 4.675907052588668e-06, + "loss": 4.0275, + "step": 62715 + }, + { + "epoch": 4.2614485663813015, + "grad_norm": 0.27386632561683655, + "learning_rate": 4.6754824025003405e-06, + "loss": 3.7032, + "step": 62720 + }, + { + "epoch": 4.261788286451964, + "grad_norm": 0.34561827778816223, + "learning_rate": 4.6750577524120125e-06, + "loss": 4.0677, + "step": 62725 + }, + { + "epoch": 4.262128006522626, + "grad_norm": 0.299724817276001, + "learning_rate": 4.674633102323686e-06, + "loss": 4.0959, + "step": 62730 + }, + { + "epoch": 4.262467726593287, + "grad_norm": 0.4687436521053314, + "learning_rate": 4.674208452235359e-06, + "loss": 3.9982, + "step": 62735 + }, + { + "epoch": 4.262807446663949, + "grad_norm": 0.4362991154193878, + "learning_rate": 4.673783802147031e-06, + "loss": 3.8562, + "step": 62740 + }, + { + "epoch": 4.263147166734611, + "grad_norm": 0.26105114817619324, + "learning_rate": 4.673359152058704e-06, + "loss": 3.9139, + "step": 62745 + }, + { + "epoch": 4.263486886805272, + "grad_norm": 0.36103323101997375, + "learning_rate": 4.672934501970377e-06, + "loss": 4.0086, + "step": 62750 + }, + { + "epoch": 4.263826606875934, + "grad_norm": 0.3127668499946594, + "learning_rate": 4.672509851882049e-06, + "loss": 3.9325, + "step": 62755 + }, + { + "epoch": 4.264166326946596, + "grad_norm": 0.26798251271247864, + "learning_rate": 4.672085201793722e-06, + "loss": 4.0184, + "step": 62760 + }, + { + "epoch": 4.2645060470172576, + "grad_norm": 0.38358327746391296, + "learning_rate": 4.671660551705396e-06, + "loss": 3.9795, + "step": 62765 + }, + { + "epoch": 4.26484576708792, + "grad_norm": 0.24900370836257935, + "learning_rate": 4.671235901617068e-06, + "loss": 4.1218, + "step": 62770 + }, + { + "epoch": 4.265185487158582, + "grad_norm": 0.24770087003707886, + "learning_rate": 4.6708112515287405e-06, + "loss": 3.9276, + "step": 62775 + }, + { + "epoch": 4.265525207229243, + "grad_norm": 0.27970385551452637, + "learning_rate": 4.670386601440413e-06, + "loss": 3.8979, + "step": 62780 + }, + { + "epoch": 4.265864927299905, + "grad_norm": 0.32733049988746643, + "learning_rate": 4.669961951352086e-06, + "loss": 3.7504, + "step": 62785 + }, + { + "epoch": 4.266204647370567, + "grad_norm": 0.32450079917907715, + "learning_rate": 4.669537301263759e-06, + "loss": 4.0177, + "step": 62790 + }, + { + "epoch": 4.266544367441228, + "grad_norm": 0.26363566517829895, + "learning_rate": 4.669112651175432e-06, + "loss": 3.9471, + "step": 62795 + }, + { + "epoch": 4.26688408751189, + "grad_norm": 0.23464563488960266, + "learning_rate": 4.6686880010871045e-06, + "loss": 3.8522, + "step": 62800 + }, + { + "epoch": 4.267223807582552, + "grad_norm": 0.2798555791378021, + "learning_rate": 4.668263350998777e-06, + "loss": 3.9047, + "step": 62805 + }, + { + "epoch": 4.267563527653214, + "grad_norm": 0.3939981758594513, + "learning_rate": 4.66783870091045e-06, + "loss": 3.8574, + "step": 62810 + }, + { + "epoch": 4.267903247723876, + "grad_norm": 0.24596504867076874, + "learning_rate": 4.667414050822123e-06, + "loss": 4.1017, + "step": 62815 + }, + { + "epoch": 4.268242967794538, + "grad_norm": 0.30666106939315796, + "learning_rate": 4.666989400733796e-06, + "loss": 4.0035, + "step": 62820 + }, + { + "epoch": 4.268582687865199, + "grad_norm": 0.2527904510498047, + "learning_rate": 4.6665647506454685e-06, + "loss": 4.0259, + "step": 62825 + }, + { + "epoch": 4.268922407935861, + "grad_norm": 0.2705098092556, + "learning_rate": 4.666140100557141e-06, + "loss": 4.1512, + "step": 62830 + }, + { + "epoch": 4.269262128006522, + "grad_norm": 0.26466888189315796, + "learning_rate": 4.665715450468814e-06, + "loss": 4.0471, + "step": 62835 + }, + { + "epoch": 4.269601848077184, + "grad_norm": 0.24041742086410522, + "learning_rate": 4.665290800380487e-06, + "loss": 4.1539, + "step": 62840 + }, + { + "epoch": 4.269941568147846, + "grad_norm": 0.2898831069469452, + "learning_rate": 4.66486615029216e-06, + "loss": 4.3023, + "step": 62845 + }, + { + "epoch": 4.2702812882185075, + "grad_norm": 0.3225024938583374, + "learning_rate": 4.6644415002038325e-06, + "loss": 3.8779, + "step": 62850 + }, + { + "epoch": 4.27062100828917, + "grad_norm": 0.25764527916908264, + "learning_rate": 4.664016850115505e-06, + "loss": 3.9255, + "step": 62855 + }, + { + "epoch": 4.270960728359832, + "grad_norm": 0.30647972226142883, + "learning_rate": 4.663592200027178e-06, + "loss": 4.0698, + "step": 62860 + }, + { + "epoch": 4.271300448430493, + "grad_norm": 0.2232275903224945, + "learning_rate": 4.663167549938851e-06, + "loss": 4.009, + "step": 62865 + }, + { + "epoch": 4.271640168501155, + "grad_norm": 0.20773503184318542, + "learning_rate": 4.662742899850524e-06, + "loss": 3.8398, + "step": 62870 + }, + { + "epoch": 4.271979888571817, + "grad_norm": 0.2501775622367859, + "learning_rate": 4.6623182497621965e-06, + "loss": 3.9964, + "step": 62875 + }, + { + "epoch": 4.272319608642478, + "grad_norm": 0.31099560856819153, + "learning_rate": 4.661893599673869e-06, + "loss": 3.755, + "step": 62880 + }, + { + "epoch": 4.27265932871314, + "grad_norm": 0.47029805183410645, + "learning_rate": 4.661468949585542e-06, + "loss": 3.9739, + "step": 62885 + }, + { + "epoch": 4.272999048783802, + "grad_norm": 0.26412495970726013, + "learning_rate": 4.661044299497215e-06, + "loss": 4.0766, + "step": 62890 + }, + { + "epoch": 4.2733387688544635, + "grad_norm": 0.35809123516082764, + "learning_rate": 4.660619649408888e-06, + "loss": 3.99, + "step": 62895 + }, + { + "epoch": 4.273678488925126, + "grad_norm": 0.2717888355255127, + "learning_rate": 4.6601949993205605e-06, + "loss": 3.8894, + "step": 62900 + }, + { + "epoch": 4.274018208995788, + "grad_norm": 0.23786267638206482, + "learning_rate": 4.659770349232233e-06, + "loss": 3.9939, + "step": 62905 + }, + { + "epoch": 4.274357929066449, + "grad_norm": 0.2622360289096832, + "learning_rate": 4.659345699143905e-06, + "loss": 3.9011, + "step": 62910 + }, + { + "epoch": 4.274697649137111, + "grad_norm": 0.26813575625419617, + "learning_rate": 4.658921049055579e-06, + "loss": 4.0295, + "step": 62915 + }, + { + "epoch": 4.275037369207773, + "grad_norm": 0.24213182926177979, + "learning_rate": 4.658496398967252e-06, + "loss": 3.9591, + "step": 62920 + }, + { + "epoch": 4.275377089278434, + "grad_norm": 0.3498525619506836, + "learning_rate": 4.658071748878924e-06, + "loss": 4.0511, + "step": 62925 + }, + { + "epoch": 4.275716809349096, + "grad_norm": 0.32081490755081177, + "learning_rate": 4.657647098790597e-06, + "loss": 4.0041, + "step": 62930 + }, + { + "epoch": 4.276056529419758, + "grad_norm": 0.25441762804985046, + "learning_rate": 4.65722244870227e-06, + "loss": 3.8074, + "step": 62935 + }, + { + "epoch": 4.2763962494904195, + "grad_norm": 0.23361513018608093, + "learning_rate": 4.656797798613942e-06, + "loss": 3.9097, + "step": 62940 + }, + { + "epoch": 4.276735969561082, + "grad_norm": 0.28143665194511414, + "learning_rate": 4.656373148525615e-06, + "loss": 3.7428, + "step": 62945 + }, + { + "epoch": 4.277075689631744, + "grad_norm": 0.33434441685676575, + "learning_rate": 4.6559484984372885e-06, + "loss": 4.1059, + "step": 62950 + }, + { + "epoch": 4.277415409702405, + "grad_norm": 0.23302339017391205, + "learning_rate": 4.6555238483489605e-06, + "loss": 3.9781, + "step": 62955 + }, + { + "epoch": 4.277755129773067, + "grad_norm": 0.28324228525161743, + "learning_rate": 4.655099198260633e-06, + "loss": 3.8882, + "step": 62960 + }, + { + "epoch": 4.278094849843729, + "grad_norm": 0.2511173486709595, + "learning_rate": 4.654674548172307e-06, + "loss": 3.9178, + "step": 62965 + }, + { + "epoch": 4.27843456991439, + "grad_norm": 0.28337883949279785, + "learning_rate": 4.654249898083979e-06, + "loss": 3.9527, + "step": 62970 + }, + { + "epoch": 4.278774289985052, + "grad_norm": 0.2250768393278122, + "learning_rate": 4.653825247995652e-06, + "loss": 3.8965, + "step": 62975 + }, + { + "epoch": 4.279114010055714, + "grad_norm": 0.26725175976753235, + "learning_rate": 4.653400597907325e-06, + "loss": 4.2119, + "step": 62980 + }, + { + "epoch": 4.2794537301263755, + "grad_norm": 0.20080502331256866, + "learning_rate": 4.652975947818997e-06, + "loss": 3.8712, + "step": 62985 + }, + { + "epoch": 4.279793450197038, + "grad_norm": 0.23579458892345428, + "learning_rate": 4.65255129773067e-06, + "loss": 3.8089, + "step": 62990 + }, + { + "epoch": 4.2801331702677, + "grad_norm": 0.23966364562511444, + "learning_rate": 4.652126647642343e-06, + "loss": 3.8522, + "step": 62995 + }, + { + "epoch": 4.280472890338361, + "grad_norm": 0.2817283272743225, + "learning_rate": 4.651701997554016e-06, + "loss": 4.0132, + "step": 63000 + }, + { + "epoch": 4.280812610409023, + "grad_norm": 0.23098085820674896, + "learning_rate": 4.6512773474656885e-06, + "loss": 4.0975, + "step": 63005 + }, + { + "epoch": 4.281152330479685, + "grad_norm": 0.24570822715759277, + "learning_rate": 4.650852697377361e-06, + "loss": 4.0033, + "step": 63010 + }, + { + "epoch": 4.281492050550346, + "grad_norm": 0.24402055144309998, + "learning_rate": 4.650428047289034e-06, + "loss": 3.9544, + "step": 63015 + }, + { + "epoch": 4.281831770621008, + "grad_norm": 0.28726351261138916, + "learning_rate": 4.650003397200707e-06, + "loss": 4.379, + "step": 63020 + }, + { + "epoch": 4.28217149069167, + "grad_norm": 0.264229416847229, + "learning_rate": 4.64957874711238e-06, + "loss": 4.0712, + "step": 63025 + }, + { + "epoch": 4.2825112107623315, + "grad_norm": 0.2926389276981354, + "learning_rate": 4.6491540970240525e-06, + "loss": 3.9802, + "step": 63030 + }, + { + "epoch": 4.282850930832994, + "grad_norm": 0.22191251814365387, + "learning_rate": 4.648729446935725e-06, + "loss": 4.0552, + "step": 63035 + }, + { + "epoch": 4.283190650903656, + "grad_norm": 0.2632538080215454, + "learning_rate": 4.648304796847398e-06, + "loss": 4.0296, + "step": 63040 + }, + { + "epoch": 4.283530370974317, + "grad_norm": 0.29936978220939636, + "learning_rate": 4.647880146759071e-06, + "loss": 4.1367, + "step": 63045 + }, + { + "epoch": 4.283870091044979, + "grad_norm": 0.28195902705192566, + "learning_rate": 4.647455496670744e-06, + "loss": 3.9246, + "step": 63050 + }, + { + "epoch": 4.284209811115641, + "grad_norm": 0.2349260151386261, + "learning_rate": 4.6470308465824165e-06, + "loss": 4.101, + "step": 63055 + }, + { + "epoch": 4.284549531186302, + "grad_norm": 0.28720059990882874, + "learning_rate": 4.646606196494089e-06, + "loss": 3.9518, + "step": 63060 + }, + { + "epoch": 4.284889251256964, + "grad_norm": 0.3593010902404785, + "learning_rate": 4.646181546405762e-06, + "loss": 4.0708, + "step": 63065 + }, + { + "epoch": 4.285228971327626, + "grad_norm": 0.2641526758670807, + "learning_rate": 4.645756896317435e-06, + "loss": 3.975, + "step": 63070 + }, + { + "epoch": 4.2855686913982876, + "grad_norm": 0.3486418128013611, + "learning_rate": 4.645332246229108e-06, + "loss": 4.3611, + "step": 63075 + }, + { + "epoch": 4.28590841146895, + "grad_norm": 0.306252658367157, + "learning_rate": 4.6449075961407805e-06, + "loss": 4.0977, + "step": 63080 + }, + { + "epoch": 4.286248131539612, + "grad_norm": 0.2922380566596985, + "learning_rate": 4.644482946052453e-06, + "loss": 3.9498, + "step": 63085 + }, + { + "epoch": 4.286587851610273, + "grad_norm": 0.2324615865945816, + "learning_rate": 4.644058295964126e-06, + "loss": 3.8978, + "step": 63090 + }, + { + "epoch": 4.286927571680935, + "grad_norm": 0.3224358558654785, + "learning_rate": 4.643633645875799e-06, + "loss": 3.4687, + "step": 63095 + }, + { + "epoch": 4.287267291751597, + "grad_norm": 0.26901355385780334, + "learning_rate": 4.643208995787472e-06, + "loss": 3.9575, + "step": 63100 + }, + { + "epoch": 4.287607011822258, + "grad_norm": 0.28053444623947144, + "learning_rate": 4.6427843456991445e-06, + "loss": 3.6931, + "step": 63105 + }, + { + "epoch": 4.28794673189292, + "grad_norm": 0.5543760061264038, + "learning_rate": 4.642359695610817e-06, + "loss": 4.0293, + "step": 63110 + }, + { + "epoch": 4.288286451963582, + "grad_norm": 0.4649248719215393, + "learning_rate": 4.64193504552249e-06, + "loss": 3.9552, + "step": 63115 + }, + { + "epoch": 4.288626172034244, + "grad_norm": 0.29595208168029785, + "learning_rate": 4.641510395434163e-06, + "loss": 4.0132, + "step": 63120 + }, + { + "epoch": 4.288965892104906, + "grad_norm": 0.33856916427612305, + "learning_rate": 4.641085745345835e-06, + "loss": 4.2237, + "step": 63125 + }, + { + "epoch": 4.289305612175568, + "grad_norm": 0.3164583146572113, + "learning_rate": 4.6406610952575085e-06, + "loss": 4.0048, + "step": 63130 + }, + { + "epoch": 4.289645332246229, + "grad_norm": 0.2463207095861435, + "learning_rate": 4.640236445169181e-06, + "loss": 4.0147, + "step": 63135 + }, + { + "epoch": 4.289985052316891, + "grad_norm": 0.36851736903190613, + "learning_rate": 4.639811795080853e-06, + "loss": 3.8722, + "step": 63140 + }, + { + "epoch": 4.290324772387553, + "grad_norm": 0.3283403217792511, + "learning_rate": 4.639387144992527e-06, + "loss": 3.9223, + "step": 63145 + }, + { + "epoch": 4.290664492458214, + "grad_norm": 0.2527051270008087, + "learning_rate": 4.6389624949042e-06, + "loss": 4.155, + "step": 63150 + }, + { + "epoch": 4.291004212528876, + "grad_norm": 0.3241982161998749, + "learning_rate": 4.638537844815872e-06, + "loss": 4.0886, + "step": 63155 + }, + { + "epoch": 4.291343932599538, + "grad_norm": 0.2615754008293152, + "learning_rate": 4.6381131947275445e-06, + "loss": 4.0262, + "step": 63160 + }, + { + "epoch": 4.2916836526702, + "grad_norm": 0.40336260199546814, + "learning_rate": 4.637688544639218e-06, + "loss": 3.81, + "step": 63165 + }, + { + "epoch": 4.292023372740862, + "grad_norm": 0.30730289220809937, + "learning_rate": 4.63726389455089e-06, + "loss": 3.798, + "step": 63170 + }, + { + "epoch": 4.292363092811524, + "grad_norm": 0.24375417828559875, + "learning_rate": 4.636839244462563e-06, + "loss": 3.959, + "step": 63175 + }, + { + "epoch": 4.292702812882185, + "grad_norm": 0.25387993454933167, + "learning_rate": 4.6364145943742365e-06, + "loss": 3.7738, + "step": 63180 + }, + { + "epoch": 4.293042532952847, + "grad_norm": 0.41368991136550903, + "learning_rate": 4.6359899442859085e-06, + "loss": 3.9199, + "step": 63185 + }, + { + "epoch": 4.293382253023509, + "grad_norm": 0.3087637424468994, + "learning_rate": 4.635565294197581e-06, + "loss": 3.9163, + "step": 63190 + }, + { + "epoch": 4.29372197309417, + "grad_norm": 0.19091540575027466, + "learning_rate": 4.635140644109254e-06, + "loss": 4.0186, + "step": 63195 + }, + { + "epoch": 4.294061693164832, + "grad_norm": 0.31674349308013916, + "learning_rate": 4.634715994020927e-06, + "loss": 3.8378, + "step": 63200 + }, + { + "epoch": 4.294401413235494, + "grad_norm": 0.30809202790260315, + "learning_rate": 4.6342913439326e-06, + "loss": 3.9975, + "step": 63205 + }, + { + "epoch": 4.294741133306156, + "grad_norm": 0.21760763227939606, + "learning_rate": 4.6338666938442725e-06, + "loss": 3.8715, + "step": 63210 + }, + { + "epoch": 4.295080853376818, + "grad_norm": 0.28041911125183105, + "learning_rate": 4.633442043755946e-06, + "loss": 4.0298, + "step": 63215 + }, + { + "epoch": 4.29542057344748, + "grad_norm": 0.2741549015045166, + "learning_rate": 4.633017393667618e-06, + "loss": 4.1283, + "step": 63220 + }, + { + "epoch": 4.295760293518141, + "grad_norm": 0.2761954665184021, + "learning_rate": 4.632592743579291e-06, + "loss": 3.7878, + "step": 63225 + }, + { + "epoch": 4.296100013588803, + "grad_norm": 0.2894574999809265, + "learning_rate": 4.632168093490964e-06, + "loss": 3.8705, + "step": 63230 + }, + { + "epoch": 4.296439733659464, + "grad_norm": 0.3247862160205841, + "learning_rate": 4.6317434434026365e-06, + "loss": 4.0121, + "step": 63235 + }, + { + "epoch": 4.296779453730126, + "grad_norm": 0.37705570459365845, + "learning_rate": 4.631318793314309e-06, + "loss": 3.8678, + "step": 63240 + }, + { + "epoch": 4.297119173800788, + "grad_norm": 0.20150640606880188, + "learning_rate": 4.630894143225982e-06, + "loss": 3.8202, + "step": 63245 + }, + { + "epoch": 4.2974588938714495, + "grad_norm": 0.23984619975090027, + "learning_rate": 4.630469493137655e-06, + "loss": 4.0205, + "step": 63250 + }, + { + "epoch": 4.297798613942112, + "grad_norm": 0.294127494096756, + "learning_rate": 4.630044843049328e-06, + "loss": 3.9409, + "step": 63255 + }, + { + "epoch": 4.298138334012774, + "grad_norm": 0.3026697337627411, + "learning_rate": 4.6296201929610005e-06, + "loss": 3.8027, + "step": 63260 + }, + { + "epoch": 4.298478054083435, + "grad_norm": 0.26322340965270996, + "learning_rate": 4.629195542872673e-06, + "loss": 3.9364, + "step": 63265 + }, + { + "epoch": 4.298817774154097, + "grad_norm": 0.2755466103553772, + "learning_rate": 4.628770892784346e-06, + "loss": 3.7784, + "step": 63270 + }, + { + "epoch": 4.299157494224759, + "grad_norm": 0.284096360206604, + "learning_rate": 4.628346242696019e-06, + "loss": 4.0925, + "step": 63275 + }, + { + "epoch": 4.29949721429542, + "grad_norm": 0.22408100962638855, + "learning_rate": 4.627921592607692e-06, + "loss": 4.0418, + "step": 63280 + }, + { + "epoch": 4.299836934366082, + "grad_norm": 0.2779224216938019, + "learning_rate": 4.6274969425193645e-06, + "loss": 3.9165, + "step": 63285 + }, + { + "epoch": 4.300176654436744, + "grad_norm": 0.2984972298145294, + "learning_rate": 4.627072292431037e-06, + "loss": 4.0789, + "step": 63290 + }, + { + "epoch": 4.3005163745074055, + "grad_norm": 0.2792453169822693, + "learning_rate": 4.62664764234271e-06, + "loss": 4.0801, + "step": 63295 + }, + { + "epoch": 4.300856094578068, + "grad_norm": 0.3512900769710541, + "learning_rate": 4.626222992254383e-06, + "loss": 4.1134, + "step": 63300 + }, + { + "epoch": 4.30119581464873, + "grad_norm": 0.25825804471969604, + "learning_rate": 4.625798342166056e-06, + "loss": 4.0228, + "step": 63305 + }, + { + "epoch": 4.301535534719391, + "grad_norm": 0.45040515065193176, + "learning_rate": 4.6253736920777285e-06, + "loss": 3.6478, + "step": 63310 + }, + { + "epoch": 4.301875254790053, + "grad_norm": 0.33278918266296387, + "learning_rate": 4.624949041989401e-06, + "loss": 3.8978, + "step": 63315 + }, + { + "epoch": 4.302214974860715, + "grad_norm": 0.5593621730804443, + "learning_rate": 4.624524391901074e-06, + "loss": 4.1058, + "step": 63320 + }, + { + "epoch": 4.302554694931376, + "grad_norm": 0.22780776023864746, + "learning_rate": 4.624099741812746e-06, + "loss": 3.889, + "step": 63325 + }, + { + "epoch": 4.302894415002038, + "grad_norm": 0.31848081946372986, + "learning_rate": 4.62367509172442e-06, + "loss": 4.0382, + "step": 63330 + }, + { + "epoch": 4.3032341350727, + "grad_norm": 0.2902997136116028, + "learning_rate": 4.6232504416360925e-06, + "loss": 3.8898, + "step": 63335 + }, + { + "epoch": 4.3035738551433615, + "grad_norm": 0.29955020546913147, + "learning_rate": 4.6228257915477644e-06, + "loss": 3.932, + "step": 63340 + }, + { + "epoch": 4.303913575214024, + "grad_norm": 0.21999728679656982, + "learning_rate": 4.622401141459438e-06, + "loss": 4.1625, + "step": 63345 + }, + { + "epoch": 4.304253295284686, + "grad_norm": 0.25998419523239136, + "learning_rate": 4.621976491371111e-06, + "loss": 3.8062, + "step": 63350 + }, + { + "epoch": 4.304593015355347, + "grad_norm": 0.25402870774269104, + "learning_rate": 4.621551841282783e-06, + "loss": 3.7184, + "step": 63355 + }, + { + "epoch": 4.304932735426009, + "grad_norm": 0.21968933939933777, + "learning_rate": 4.6211271911944565e-06, + "loss": 4.1678, + "step": 63360 + }, + { + "epoch": 4.305272455496671, + "grad_norm": 0.3095821142196655, + "learning_rate": 4.620702541106129e-06, + "loss": 4.0134, + "step": 63365 + }, + { + "epoch": 4.305612175567332, + "grad_norm": 0.2670547664165497, + "learning_rate": 4.620277891017801e-06, + "loss": 3.869, + "step": 63370 + }, + { + "epoch": 4.305951895637994, + "grad_norm": 0.2596755623817444, + "learning_rate": 4.619853240929474e-06, + "loss": 4.13, + "step": 63375 + }, + { + "epoch": 4.306291615708656, + "grad_norm": 0.36382293701171875, + "learning_rate": 4.619428590841148e-06, + "loss": 3.6782, + "step": 63380 + }, + { + "epoch": 4.306631335779318, + "grad_norm": 0.2562296986579895, + "learning_rate": 4.6190039407528205e-06, + "loss": 3.8153, + "step": 63385 + }, + { + "epoch": 4.30697105584998, + "grad_norm": 0.2616384029388428, + "learning_rate": 4.6185792906644924e-06, + "loss": 3.9858, + "step": 63390 + }, + { + "epoch": 4.307310775920642, + "grad_norm": 0.22839143872261047, + "learning_rate": 4.618154640576166e-06, + "loss": 3.8186, + "step": 63395 + }, + { + "epoch": 4.307650495991303, + "grad_norm": 0.23085585236549377, + "learning_rate": 4.617729990487839e-06, + "loss": 3.9753, + "step": 63400 + }, + { + "epoch": 4.307990216061965, + "grad_norm": 0.2575468420982361, + "learning_rate": 4.617305340399511e-06, + "loss": 3.8029, + "step": 63405 + }, + { + "epoch": 4.308329936132627, + "grad_norm": 0.25673356652259827, + "learning_rate": 4.616880690311184e-06, + "loss": 3.848, + "step": 63410 + }, + { + "epoch": 4.308669656203288, + "grad_norm": 0.2084154188632965, + "learning_rate": 4.616456040222857e-06, + "loss": 4.1029, + "step": 63415 + }, + { + "epoch": 4.30900937627395, + "grad_norm": 0.2932332456111908, + "learning_rate": 4.616031390134529e-06, + "loss": 4.006, + "step": 63420 + }, + { + "epoch": 4.309349096344612, + "grad_norm": 0.2009209394454956, + "learning_rate": 4.615606740046202e-06, + "loss": 3.9695, + "step": 63425 + }, + { + "epoch": 4.309688816415274, + "grad_norm": 0.2678954303264618, + "learning_rate": 4.615182089957876e-06, + "loss": 4.1306, + "step": 63430 + }, + { + "epoch": 4.310028536485936, + "grad_norm": 0.23143555223941803, + "learning_rate": 4.614757439869548e-06, + "loss": 3.9924, + "step": 63435 + }, + { + "epoch": 4.310368256556598, + "grad_norm": 0.27265235781669617, + "learning_rate": 4.6143327897812205e-06, + "loss": 3.8588, + "step": 63440 + }, + { + "epoch": 4.310707976627259, + "grad_norm": 0.2355857491493225, + "learning_rate": 4.613908139692893e-06, + "loss": 4.0322, + "step": 63445 + }, + { + "epoch": 4.311047696697921, + "grad_norm": 0.28899094462394714, + "learning_rate": 4.613483489604566e-06, + "loss": 3.9594, + "step": 63450 + }, + { + "epoch": 4.311387416768583, + "grad_norm": 0.37474149465560913, + "learning_rate": 4.613058839516239e-06, + "loss": 4.0997, + "step": 63455 + }, + { + "epoch": 4.311727136839244, + "grad_norm": 0.3258851170539856, + "learning_rate": 4.612634189427912e-06, + "loss": 4.1544, + "step": 63460 + }, + { + "epoch": 4.312066856909906, + "grad_norm": 0.2681846022605896, + "learning_rate": 4.6122095393395845e-06, + "loss": 4.133, + "step": 63465 + }, + { + "epoch": 4.312406576980568, + "grad_norm": 0.2885022461414337, + "learning_rate": 4.611784889251257e-06, + "loss": 3.9882, + "step": 63470 + }, + { + "epoch": 4.31274629705123, + "grad_norm": 0.3160472512245178, + "learning_rate": 4.61136023916293e-06, + "loss": 3.9706, + "step": 63475 + }, + { + "epoch": 4.313086017121892, + "grad_norm": 0.24898551404476166, + "learning_rate": 4.610935589074603e-06, + "loss": 3.984, + "step": 63480 + }, + { + "epoch": 4.313425737192554, + "grad_norm": 0.33124494552612305, + "learning_rate": 4.610510938986276e-06, + "loss": 3.9176, + "step": 63485 + }, + { + "epoch": 4.313765457263215, + "grad_norm": 0.27094101905822754, + "learning_rate": 4.6100862888979485e-06, + "loss": 3.8576, + "step": 63490 + }, + { + "epoch": 4.314105177333877, + "grad_norm": 0.28031572699546814, + "learning_rate": 4.609661638809621e-06, + "loss": 3.9911, + "step": 63495 + }, + { + "epoch": 4.314444897404539, + "grad_norm": 0.23759829998016357, + "learning_rate": 4.609236988721294e-06, + "loss": 4.0254, + "step": 63500 + }, + { + "epoch": 4.3147846174752, + "grad_norm": 0.3890068233013153, + "learning_rate": 4.608812338632967e-06, + "loss": 4.0663, + "step": 63505 + }, + { + "epoch": 4.315124337545862, + "grad_norm": 0.3565245568752289, + "learning_rate": 4.60838768854464e-06, + "loss": 3.9943, + "step": 63510 + }, + { + "epoch": 4.3154640576165235, + "grad_norm": 0.29460594058036804, + "learning_rate": 4.6079630384563125e-06, + "loss": 4.1182, + "step": 63515 + }, + { + "epoch": 4.315803777687186, + "grad_norm": 0.33419546484947205, + "learning_rate": 4.607538388367985e-06, + "loss": 3.9128, + "step": 63520 + }, + { + "epoch": 4.316143497757848, + "grad_norm": 0.24092084169387817, + "learning_rate": 4.607113738279658e-06, + "loss": 3.9465, + "step": 63525 + }, + { + "epoch": 4.316483217828509, + "grad_norm": 0.5104038715362549, + "learning_rate": 4.606689088191331e-06, + "loss": 3.8281, + "step": 63530 + }, + { + "epoch": 4.316822937899171, + "grad_norm": 0.30474406480789185, + "learning_rate": 4.606264438103004e-06, + "loss": 4.072, + "step": 63535 + }, + { + "epoch": 4.317162657969833, + "grad_norm": 0.2300012707710266, + "learning_rate": 4.605839788014676e-06, + "loss": 3.9944, + "step": 63540 + }, + { + "epoch": 4.317502378040494, + "grad_norm": 0.31834253668785095, + "learning_rate": 4.605415137926349e-06, + "loss": 4.0978, + "step": 63545 + }, + { + "epoch": 4.317842098111156, + "grad_norm": 0.23526136577129364, + "learning_rate": 4.604990487838022e-06, + "loss": 3.8962, + "step": 63550 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.2715938687324524, + "learning_rate": 4.604565837749695e-06, + "loss": 4.1261, + "step": 63555 + }, + { + "epoch": 4.3185215382524795, + "grad_norm": 0.28659772872924805, + "learning_rate": 4.604141187661368e-06, + "loss": 4.2627, + "step": 63560 + }, + { + "epoch": 4.318861258323142, + "grad_norm": 0.22656142711639404, + "learning_rate": 4.6037165375730405e-06, + "loss": 3.9089, + "step": 63565 + }, + { + "epoch": 4.319200978393804, + "grad_norm": 0.3326760530471802, + "learning_rate": 4.603291887484713e-06, + "loss": 3.8626, + "step": 63570 + }, + { + "epoch": 4.319540698464465, + "grad_norm": 0.25979703664779663, + "learning_rate": 4.602867237396385e-06, + "loss": 4.0594, + "step": 63575 + }, + { + "epoch": 4.319880418535127, + "grad_norm": 0.263639360666275, + "learning_rate": 4.602442587308059e-06, + "loss": 4.126, + "step": 63580 + }, + { + "epoch": 4.320220138605789, + "grad_norm": 0.2334248423576355, + "learning_rate": 4.602017937219732e-06, + "loss": 4.1088, + "step": 63585 + }, + { + "epoch": 4.32055985867645, + "grad_norm": 0.3267221450805664, + "learning_rate": 4.601593287131404e-06, + "loss": 3.7711, + "step": 63590 + }, + { + "epoch": 4.320899578747112, + "grad_norm": 0.2541041970252991, + "learning_rate": 4.601168637043077e-06, + "loss": 3.937, + "step": 63595 + }, + { + "epoch": 4.321239298817774, + "grad_norm": 0.23123426735401154, + "learning_rate": 4.60074398695475e-06, + "loss": 3.9655, + "step": 63600 + }, + { + "epoch": 4.3215790188884355, + "grad_norm": 0.244696706533432, + "learning_rate": 4.600319336866422e-06, + "loss": 4.2259, + "step": 63605 + }, + { + "epoch": 4.321918738959098, + "grad_norm": 0.30190566182136536, + "learning_rate": 4.599894686778095e-06, + "loss": 3.9585, + "step": 63610 + }, + { + "epoch": 4.32225845902976, + "grad_norm": 0.2358248382806778, + "learning_rate": 4.5994700366897685e-06, + "loss": 3.8781, + "step": 63615 + }, + { + "epoch": 4.322598179100421, + "grad_norm": 0.21428437530994415, + "learning_rate": 4.5990453866014404e-06, + "loss": 3.8593, + "step": 63620 + }, + { + "epoch": 4.322937899171083, + "grad_norm": 0.2317655235528946, + "learning_rate": 4.598620736513113e-06, + "loss": 4.1168, + "step": 63625 + }, + { + "epoch": 4.323277619241745, + "grad_norm": 0.35725799202919006, + "learning_rate": 4.598196086424787e-06, + "loss": 4.0318, + "step": 63630 + }, + { + "epoch": 4.323617339312406, + "grad_norm": 0.2455156445503235, + "learning_rate": 4.597771436336459e-06, + "loss": 3.6265, + "step": 63635 + }, + { + "epoch": 4.323957059383068, + "grad_norm": 0.22793129086494446, + "learning_rate": 4.597346786248132e-06, + "loss": 4.1971, + "step": 63640 + }, + { + "epoch": 4.32429677945373, + "grad_norm": 0.29897618293762207, + "learning_rate": 4.596922136159805e-06, + "loss": 4.0649, + "step": 63645 + }, + { + "epoch": 4.3246364995243916, + "grad_norm": 0.2544267177581787, + "learning_rate": 4.596497486071477e-06, + "loss": 4.0175, + "step": 63650 + }, + { + "epoch": 4.324976219595054, + "grad_norm": 0.29474279284477234, + "learning_rate": 4.59607283598315e-06, + "loss": 3.8166, + "step": 63655 + }, + { + "epoch": 4.325315939665716, + "grad_norm": 0.29206109046936035, + "learning_rate": 4.595648185894823e-06, + "loss": 3.8268, + "step": 63660 + }, + { + "epoch": 4.325655659736377, + "grad_norm": 0.34250184893608093, + "learning_rate": 4.595223535806496e-06, + "loss": 4.1465, + "step": 63665 + }, + { + "epoch": 4.325995379807039, + "grad_norm": 0.23489302396774292, + "learning_rate": 4.5947988857181684e-06, + "loss": 4.0309, + "step": 63670 + }, + { + "epoch": 4.326335099877701, + "grad_norm": 0.2058682143688202, + "learning_rate": 4.594374235629841e-06, + "loss": 3.9898, + "step": 63675 + }, + { + "epoch": 4.326674819948362, + "grad_norm": 0.30227628350257874, + "learning_rate": 4.593949585541514e-06, + "loss": 4.252, + "step": 63680 + }, + { + "epoch": 4.327014540019024, + "grad_norm": 0.28997448086738586, + "learning_rate": 4.593524935453187e-06, + "loss": 4.1582, + "step": 63685 + }, + { + "epoch": 4.327354260089686, + "grad_norm": 0.23324351012706757, + "learning_rate": 4.59310028536486e-06, + "loss": 4.0489, + "step": 63690 + }, + { + "epoch": 4.327693980160348, + "grad_norm": 0.24816681444644928, + "learning_rate": 4.5926756352765324e-06, + "loss": 3.9653, + "step": 63695 + }, + { + "epoch": 4.32803370023101, + "grad_norm": 0.26817166805267334, + "learning_rate": 4.592250985188205e-06, + "loss": 3.7032, + "step": 63700 + }, + { + "epoch": 4.328373420301672, + "grad_norm": 0.32287225127220154, + "learning_rate": 4.591826335099878e-06, + "loss": 3.8911, + "step": 63705 + }, + { + "epoch": 4.328713140372333, + "grad_norm": 0.23337939381599426, + "learning_rate": 4.591401685011551e-06, + "loss": 3.9776, + "step": 63710 + }, + { + "epoch": 4.329052860442995, + "grad_norm": 0.24686166644096375, + "learning_rate": 4.590977034923224e-06, + "loss": 3.8972, + "step": 63715 + }, + { + "epoch": 4.329392580513657, + "grad_norm": 0.21447131037712097, + "learning_rate": 4.5905523848348964e-06, + "loss": 3.7716, + "step": 63720 + }, + { + "epoch": 4.329732300584318, + "grad_norm": 0.25330325961112976, + "learning_rate": 4.590127734746569e-06, + "loss": 3.6463, + "step": 63725 + }, + { + "epoch": 4.33007202065498, + "grad_norm": 0.21799629926681519, + "learning_rate": 4.589703084658242e-06, + "loss": 4.1897, + "step": 63730 + }, + { + "epoch": 4.330411740725642, + "grad_norm": 0.28008389472961426, + "learning_rate": 4.589278434569915e-06, + "loss": 3.526, + "step": 63735 + }, + { + "epoch": 4.330751460796304, + "grad_norm": 0.24649643898010254, + "learning_rate": 4.588853784481588e-06, + "loss": 3.9514, + "step": 63740 + }, + { + "epoch": 4.331091180866966, + "grad_norm": 0.3199281096458435, + "learning_rate": 4.5884291343932605e-06, + "loss": 3.7824, + "step": 63745 + }, + { + "epoch": 4.331430900937628, + "grad_norm": 0.2226899415254593, + "learning_rate": 4.588004484304933e-06, + "loss": 4.0778, + "step": 63750 + }, + { + "epoch": 4.331770621008289, + "grad_norm": 0.25753679871559143, + "learning_rate": 4.587579834216606e-06, + "loss": 4.1343, + "step": 63755 + }, + { + "epoch": 4.332110341078951, + "grad_norm": 0.3780914545059204, + "learning_rate": 4.587155184128279e-06, + "loss": 4.1367, + "step": 63760 + }, + { + "epoch": 4.332450061149613, + "grad_norm": 0.2711632251739502, + "learning_rate": 4.586730534039952e-06, + "loss": 3.563, + "step": 63765 + }, + { + "epoch": 4.332789781220274, + "grad_norm": 0.2100655436515808, + "learning_rate": 4.5863058839516245e-06, + "loss": 3.8701, + "step": 63770 + }, + { + "epoch": 4.333129501290936, + "grad_norm": 0.349601686000824, + "learning_rate": 4.585881233863297e-06, + "loss": 3.841, + "step": 63775 + }, + { + "epoch": 4.333469221361598, + "grad_norm": 0.2795417010784149, + "learning_rate": 4.58545658377497e-06, + "loss": 3.6501, + "step": 63780 + }, + { + "epoch": 4.33380894143226, + "grad_norm": 0.339352548122406, + "learning_rate": 4.585031933686643e-06, + "loss": 4.2001, + "step": 63785 + }, + { + "epoch": 4.334148661502922, + "grad_norm": 0.21175989508628845, + "learning_rate": 4.584607283598315e-06, + "loss": 3.827, + "step": 63790 + }, + { + "epoch": 4.334488381573584, + "grad_norm": 0.28917139768600464, + "learning_rate": 4.5841826335099885e-06, + "loss": 3.9722, + "step": 63795 + }, + { + "epoch": 4.334828101644245, + "grad_norm": 0.33268535137176514, + "learning_rate": 4.583757983421661e-06, + "loss": 3.6703, + "step": 63800 + }, + { + "epoch": 4.335167821714907, + "grad_norm": 0.19390451908111572, + "learning_rate": 4.583333333333333e-06, + "loss": 3.8727, + "step": 63805 + }, + { + "epoch": 4.335507541785569, + "grad_norm": 0.37065553665161133, + "learning_rate": 4.582908683245007e-06, + "loss": 3.8226, + "step": 63810 + }, + { + "epoch": 4.33584726185623, + "grad_norm": 0.2832764685153961, + "learning_rate": 4.58248403315668e-06, + "loss": 4.1413, + "step": 63815 + }, + { + "epoch": 4.336186981926892, + "grad_norm": 0.2708664536476135, + "learning_rate": 4.582059383068352e-06, + "loss": 3.8692, + "step": 63820 + }, + { + "epoch": 4.336526701997554, + "grad_norm": 0.23989972472190857, + "learning_rate": 4.581634732980024e-06, + "loss": 3.956, + "step": 63825 + }, + { + "epoch": 4.336866422068216, + "grad_norm": 0.3124205470085144, + "learning_rate": 4.581210082891698e-06, + "loss": 3.9193, + "step": 63830 + }, + { + "epoch": 4.337206142138878, + "grad_norm": 0.30176955461502075, + "learning_rate": 4.58078543280337e-06, + "loss": 3.9101, + "step": 63835 + }, + { + "epoch": 4.33754586220954, + "grad_norm": 0.2551391124725342, + "learning_rate": 4.580360782715043e-06, + "loss": 3.9644, + "step": 63840 + }, + { + "epoch": 4.337885582280201, + "grad_norm": 0.2739262282848358, + "learning_rate": 4.5799361326267165e-06, + "loss": 3.9791, + "step": 63845 + }, + { + "epoch": 4.338225302350863, + "grad_norm": 0.285783588886261, + "learning_rate": 4.579511482538388e-06, + "loss": 4.0865, + "step": 63850 + }, + { + "epoch": 4.338565022421525, + "grad_norm": 0.3362296223640442, + "learning_rate": 4.579086832450061e-06, + "loss": 3.7485, + "step": 63855 + }, + { + "epoch": 4.338904742492186, + "grad_norm": 0.26649919152259827, + "learning_rate": 4.578662182361734e-06, + "loss": 4.0417, + "step": 63860 + }, + { + "epoch": 4.339244462562848, + "grad_norm": 0.4606997072696686, + "learning_rate": 4.578237532273407e-06, + "loss": 4.0999, + "step": 63865 + }, + { + "epoch": 4.33958418263351, + "grad_norm": 0.225183367729187, + "learning_rate": 4.57781288218508e-06, + "loss": 3.9925, + "step": 63870 + }, + { + "epoch": 4.339923902704172, + "grad_norm": 0.22698763012886047, + "learning_rate": 4.5773882320967524e-06, + "loss": 4.0666, + "step": 63875 + }, + { + "epoch": 4.340263622774834, + "grad_norm": 0.2636469304561615, + "learning_rate": 4.576963582008425e-06, + "loss": 4.1512, + "step": 63880 + }, + { + "epoch": 4.340603342845496, + "grad_norm": 0.3012545108795166, + "learning_rate": 4.576538931920098e-06, + "loss": 4.1875, + "step": 63885 + }, + { + "epoch": 4.340943062916157, + "grad_norm": 0.29192331433296204, + "learning_rate": 4.576114281831771e-06, + "loss": 3.9673, + "step": 63890 + }, + { + "epoch": 4.341282782986819, + "grad_norm": 0.21330595016479492, + "learning_rate": 4.575689631743444e-06, + "loss": 3.9004, + "step": 63895 + }, + { + "epoch": 4.341622503057481, + "grad_norm": 0.2842880189418793, + "learning_rate": 4.5752649816551164e-06, + "loss": 3.959, + "step": 63900 + }, + { + "epoch": 4.341962223128142, + "grad_norm": 0.2777903079986572, + "learning_rate": 4.574840331566789e-06, + "loss": 3.8449, + "step": 63905 + }, + { + "epoch": 4.342301943198804, + "grad_norm": 0.2624149024486542, + "learning_rate": 4.574415681478462e-06, + "loss": 3.8048, + "step": 63910 + }, + { + "epoch": 4.3426416632694655, + "grad_norm": 0.2601654529571533, + "learning_rate": 4.573991031390135e-06, + "loss": 3.9945, + "step": 63915 + }, + { + "epoch": 4.342981383340128, + "grad_norm": 0.30267706513404846, + "learning_rate": 4.573566381301808e-06, + "loss": 3.9579, + "step": 63920 + }, + { + "epoch": 4.34332110341079, + "grad_norm": 0.2599450647830963, + "learning_rate": 4.5731417312134804e-06, + "loss": 4.178, + "step": 63925 + }, + { + "epoch": 4.343660823481451, + "grad_norm": 0.23299826681613922, + "learning_rate": 4.572717081125153e-06, + "loss": 4.0199, + "step": 63930 + }, + { + "epoch": 4.344000543552113, + "grad_norm": 0.25353923439979553, + "learning_rate": 4.572292431036826e-06, + "loss": 4.0611, + "step": 63935 + }, + { + "epoch": 4.344340263622775, + "grad_norm": 0.2856368124485016, + "learning_rate": 4.571867780948499e-06, + "loss": 3.9309, + "step": 63940 + }, + { + "epoch": 4.344679983693436, + "grad_norm": 0.3265022039413452, + "learning_rate": 4.571443130860172e-06, + "loss": 3.813, + "step": 63945 + }, + { + "epoch": 4.345019703764098, + "grad_norm": 0.2500917613506317, + "learning_rate": 4.5710184807718444e-06, + "loss": 3.9706, + "step": 63950 + }, + { + "epoch": 4.34535942383476, + "grad_norm": 0.32347723841667175, + "learning_rate": 4.570593830683517e-06, + "loss": 3.803, + "step": 63955 + }, + { + "epoch": 4.345699143905422, + "grad_norm": 0.2735965847969055, + "learning_rate": 4.57016918059519e-06, + "loss": 3.8808, + "step": 63960 + }, + { + "epoch": 4.346038863976084, + "grad_norm": 0.42241621017456055, + "learning_rate": 4.569744530506863e-06, + "loss": 3.8754, + "step": 63965 + }, + { + "epoch": 4.346378584046746, + "grad_norm": 0.315925270318985, + "learning_rate": 4.569319880418536e-06, + "loss": 3.7467, + "step": 63970 + }, + { + "epoch": 4.346718304117407, + "grad_norm": 0.27913162112236023, + "learning_rate": 4.5688952303302084e-06, + "loss": 4.084, + "step": 63975 + }, + { + "epoch": 4.347058024188069, + "grad_norm": 0.29445359110832214, + "learning_rate": 4.568470580241881e-06, + "loss": 3.8707, + "step": 63980 + }, + { + "epoch": 4.347397744258731, + "grad_norm": 0.22287851572036743, + "learning_rate": 4.568045930153554e-06, + "loss": 4.1813, + "step": 63985 + }, + { + "epoch": 4.347737464329392, + "grad_norm": 0.260460764169693, + "learning_rate": 4.567621280065227e-06, + "loss": 3.9134, + "step": 63990 + }, + { + "epoch": 4.348077184400054, + "grad_norm": 0.31299084424972534, + "learning_rate": 4.5671966299769e-06, + "loss": 4.0019, + "step": 63995 + }, + { + "epoch": 4.348416904470716, + "grad_norm": 0.23889467120170593, + "learning_rate": 4.5667719798885724e-06, + "loss": 3.7504, + "step": 64000 + }, + { + "epoch": 4.348756624541378, + "grad_norm": 0.37054526805877686, + "learning_rate": 4.566347329800244e-06, + "loss": 4.1638, + "step": 64005 + }, + { + "epoch": 4.34909634461204, + "grad_norm": 0.3432253897190094, + "learning_rate": 4.565922679711918e-06, + "loss": 4.0126, + "step": 64010 + }, + { + "epoch": 4.349436064682702, + "grad_norm": 0.442227303981781, + "learning_rate": 4.565498029623591e-06, + "loss": 3.5778, + "step": 64015 + }, + { + "epoch": 4.349775784753363, + "grad_norm": 0.2446168065071106, + "learning_rate": 4.565073379535263e-06, + "loss": 3.9186, + "step": 64020 + }, + { + "epoch": 4.350115504824025, + "grad_norm": 0.2794222831726074, + "learning_rate": 4.5646487294469364e-06, + "loss": 3.6562, + "step": 64025 + }, + { + "epoch": 4.350455224894687, + "grad_norm": 0.2785649299621582, + "learning_rate": 4.564224079358609e-06, + "loss": 4.0716, + "step": 64030 + }, + { + "epoch": 4.350794944965348, + "grad_norm": 0.23818111419677734, + "learning_rate": 4.563799429270281e-06, + "loss": 3.9654, + "step": 64035 + }, + { + "epoch": 4.35113466503601, + "grad_norm": 0.31241580843925476, + "learning_rate": 4.563374779181954e-06, + "loss": 4.1754, + "step": 64040 + }, + { + "epoch": 4.351474385106672, + "grad_norm": 0.22338981926441193, + "learning_rate": 4.562950129093628e-06, + "loss": 3.9686, + "step": 64045 + }, + { + "epoch": 4.351814105177334, + "grad_norm": 0.25416651368141174, + "learning_rate": 4.5625254790053e-06, + "loss": 3.5773, + "step": 64050 + }, + { + "epoch": 4.352153825247996, + "grad_norm": 0.26365867257118225, + "learning_rate": 4.562100828916972e-06, + "loss": 3.7174, + "step": 64055 + }, + { + "epoch": 4.352493545318658, + "grad_norm": 0.45743924379348755, + "learning_rate": 4.561676178828646e-06, + "loss": 3.9827, + "step": 64060 + }, + { + "epoch": 4.352833265389319, + "grad_norm": 0.2525196373462677, + "learning_rate": 4.561251528740319e-06, + "loss": 4.1221, + "step": 64065 + }, + { + "epoch": 4.353172985459981, + "grad_norm": 0.4040074348449707, + "learning_rate": 4.560826878651991e-06, + "loss": 4.0154, + "step": 64070 + }, + { + "epoch": 4.353512705530643, + "grad_norm": 0.26095011830329895, + "learning_rate": 4.560402228563664e-06, + "loss": 4.0652, + "step": 64075 + }, + { + "epoch": 4.353852425601304, + "grad_norm": 0.23006337881088257, + "learning_rate": 4.559977578475337e-06, + "loss": 3.9859, + "step": 64080 + }, + { + "epoch": 4.354192145671966, + "grad_norm": 0.40575671195983887, + "learning_rate": 4.559552928387009e-06, + "loss": 3.885, + "step": 64085 + }, + { + "epoch": 4.354531865742628, + "grad_norm": 0.22986342012882233, + "learning_rate": 4.559128278298682e-06, + "loss": 3.964, + "step": 64090 + }, + { + "epoch": 4.35487158581329, + "grad_norm": 0.39966699481010437, + "learning_rate": 4.558703628210356e-06, + "loss": 4.0255, + "step": 64095 + }, + { + "epoch": 4.355211305883952, + "grad_norm": 0.23039676249027252, + "learning_rate": 4.558278978122028e-06, + "loss": 3.8621, + "step": 64100 + }, + { + "epoch": 4.355551025954614, + "grad_norm": 0.23818334937095642, + "learning_rate": 4.5578543280337e-06, + "loss": 3.9433, + "step": 64105 + }, + { + "epoch": 4.355890746025275, + "grad_norm": 0.23846624791622162, + "learning_rate": 4.557429677945373e-06, + "loss": 3.8481, + "step": 64110 + }, + { + "epoch": 4.356230466095937, + "grad_norm": 0.24073532223701477, + "learning_rate": 4.557005027857046e-06, + "loss": 3.7387, + "step": 64115 + }, + { + "epoch": 4.356570186166599, + "grad_norm": 0.28622573614120483, + "learning_rate": 4.556580377768719e-06, + "loss": 4.2232, + "step": 64120 + }, + { + "epoch": 4.35690990623726, + "grad_norm": 0.758140504360199, + "learning_rate": 4.556155727680392e-06, + "loss": 4.0279, + "step": 64125 + }, + { + "epoch": 4.357249626307922, + "grad_norm": 0.2405848652124405, + "learning_rate": 4.555731077592064e-06, + "loss": 3.8179, + "step": 64130 + }, + { + "epoch": 4.357589346378584, + "grad_norm": 0.27762100100517273, + "learning_rate": 4.555306427503737e-06, + "loss": 3.952, + "step": 64135 + }, + { + "epoch": 4.357929066449246, + "grad_norm": 0.20825748145580292, + "learning_rate": 4.55488177741541e-06, + "loss": 3.8061, + "step": 64140 + }, + { + "epoch": 4.358268786519908, + "grad_norm": 0.25586897134780884, + "learning_rate": 4.554457127327083e-06, + "loss": 4.1478, + "step": 64145 + }, + { + "epoch": 4.35860850659057, + "grad_norm": 0.27902400493621826, + "learning_rate": 4.554032477238756e-06, + "loss": 3.8988, + "step": 64150 + }, + { + "epoch": 4.358948226661231, + "grad_norm": 0.32900556921958923, + "learning_rate": 4.553607827150428e-06, + "loss": 4.0581, + "step": 64155 + }, + { + "epoch": 4.359287946731893, + "grad_norm": 0.36293643712997437, + "learning_rate": 4.553183177062101e-06, + "loss": 4.0189, + "step": 64160 + }, + { + "epoch": 4.359627666802555, + "grad_norm": 0.3145366609096527, + "learning_rate": 4.552758526973774e-06, + "loss": 3.9258, + "step": 64165 + }, + { + "epoch": 4.359967386873216, + "grad_norm": 0.35728761553764343, + "learning_rate": 4.552333876885447e-06, + "loss": 3.9034, + "step": 64170 + }, + { + "epoch": 4.360307106943878, + "grad_norm": 0.28386348485946655, + "learning_rate": 4.55190922679712e-06, + "loss": 4.0586, + "step": 64175 + }, + { + "epoch": 4.36064682701454, + "grad_norm": 0.25784510374069214, + "learning_rate": 4.5514845767087924e-06, + "loss": 3.9611, + "step": 64180 + }, + { + "epoch": 4.360986547085202, + "grad_norm": 0.2838253676891327, + "learning_rate": 4.551059926620465e-06, + "loss": 4.1544, + "step": 64185 + }, + { + "epoch": 4.361326267155864, + "grad_norm": 0.25028958916664124, + "learning_rate": 4.550635276532138e-06, + "loss": 4.0342, + "step": 64190 + }, + { + "epoch": 4.361665987226525, + "grad_norm": 0.2598087787628174, + "learning_rate": 4.550210626443811e-06, + "loss": 3.9637, + "step": 64195 + }, + { + "epoch": 4.362005707297187, + "grad_norm": 0.3142329752445221, + "learning_rate": 4.549785976355484e-06, + "loss": 3.8576, + "step": 64200 + }, + { + "epoch": 4.362345427367849, + "grad_norm": 0.3854285478591919, + "learning_rate": 4.549361326267156e-06, + "loss": 4.0854, + "step": 64205 + }, + { + "epoch": 4.36268514743851, + "grad_norm": 0.3394300043582916, + "learning_rate": 4.548936676178829e-06, + "loss": 3.7564, + "step": 64210 + }, + { + "epoch": 4.363024867509172, + "grad_norm": 0.2569027245044708, + "learning_rate": 4.548512026090502e-06, + "loss": 3.918, + "step": 64215 + }, + { + "epoch": 4.363364587579834, + "grad_norm": 0.24445629119873047, + "learning_rate": 4.548087376002174e-06, + "loss": 3.9438, + "step": 64220 + }, + { + "epoch": 4.3637043076504956, + "grad_norm": 0.22148092091083527, + "learning_rate": 4.547662725913848e-06, + "loss": 3.7811, + "step": 64225 + }, + { + "epoch": 4.364044027721158, + "grad_norm": 0.20377427339553833, + "learning_rate": 4.5472380758255204e-06, + "loss": 3.9423, + "step": 64230 + }, + { + "epoch": 4.36438374779182, + "grad_norm": 0.30925890803337097, + "learning_rate": 4.546813425737193e-06, + "loss": 3.9159, + "step": 64235 + }, + { + "epoch": 4.364723467862481, + "grad_norm": 0.2785552740097046, + "learning_rate": 4.546388775648865e-06, + "loss": 3.978, + "step": 64240 + }, + { + "epoch": 4.365063187933143, + "grad_norm": 0.269872784614563, + "learning_rate": 4.545964125560539e-06, + "loss": 4.0725, + "step": 64245 + }, + { + "epoch": 4.365402908003805, + "grad_norm": 0.37978610396385193, + "learning_rate": 4.545539475472212e-06, + "loss": 4.007, + "step": 64250 + }, + { + "epoch": 4.365742628074466, + "grad_norm": 0.2713046967983246, + "learning_rate": 4.545114825383884e-06, + "loss": 3.7419, + "step": 64255 + }, + { + "epoch": 4.366082348145128, + "grad_norm": 0.32269594073295593, + "learning_rate": 4.544690175295557e-06, + "loss": 3.7819, + "step": 64260 + }, + { + "epoch": 4.36642206821579, + "grad_norm": 0.25757062435150146, + "learning_rate": 4.54426552520723e-06, + "loss": 3.7951, + "step": 64265 + }, + { + "epoch": 4.366761788286452, + "grad_norm": 0.403184711933136, + "learning_rate": 4.543840875118902e-06, + "loss": 3.8828, + "step": 64270 + }, + { + "epoch": 4.367101508357114, + "grad_norm": 0.33346912264823914, + "learning_rate": 4.543416225030576e-06, + "loss": 4.2271, + "step": 64275 + }, + { + "epoch": 4.367441228427776, + "grad_norm": 0.34898635745048523, + "learning_rate": 4.5429915749422484e-06, + "loss": 3.9433, + "step": 64280 + }, + { + "epoch": 4.367780948498437, + "grad_norm": 0.2552452087402344, + "learning_rate": 4.54256692485392e-06, + "loss": 3.8642, + "step": 64285 + }, + { + "epoch": 4.368120668569099, + "grad_norm": 0.34822678565979004, + "learning_rate": 4.542142274765593e-06, + "loss": 3.9767, + "step": 64290 + }, + { + "epoch": 4.368460388639761, + "grad_norm": 0.3065694570541382, + "learning_rate": 4.541717624677267e-06, + "loss": 3.8808, + "step": 64295 + }, + { + "epoch": 4.368800108710422, + "grad_norm": 0.39100223779678345, + "learning_rate": 4.541292974588939e-06, + "loss": 4.0279, + "step": 64300 + }, + { + "epoch": 4.369139828781084, + "grad_norm": 0.29880788922309875, + "learning_rate": 4.540868324500612e-06, + "loss": 4.1463, + "step": 64305 + }, + { + "epoch": 4.369479548851746, + "grad_norm": 0.5714275240898132, + "learning_rate": 4.540443674412285e-06, + "loss": 3.8683, + "step": 64310 + }, + { + "epoch": 4.369819268922408, + "grad_norm": 0.24185124039649963, + "learning_rate": 4.540019024323957e-06, + "loss": 3.7848, + "step": 64315 + }, + { + "epoch": 4.37015898899307, + "grad_norm": 0.2664303779602051, + "learning_rate": 4.53959437423563e-06, + "loss": 4.1031, + "step": 64320 + }, + { + "epoch": 4.370498709063732, + "grad_norm": 0.24193228781223297, + "learning_rate": 4.539169724147303e-06, + "loss": 3.9039, + "step": 64325 + }, + { + "epoch": 4.370838429134393, + "grad_norm": 0.18820346891880035, + "learning_rate": 4.538745074058976e-06, + "loss": 3.98, + "step": 64330 + }, + { + "epoch": 4.371178149205055, + "grad_norm": 0.3847561776638031, + "learning_rate": 4.538320423970648e-06, + "loss": 4.0556, + "step": 64335 + }, + { + "epoch": 4.371517869275717, + "grad_norm": 0.2812846899032593, + "learning_rate": 4.537895773882321e-06, + "loss": 3.8392, + "step": 64340 + }, + { + "epoch": 4.371857589346378, + "grad_norm": 0.27820736169815063, + "learning_rate": 4.537471123793994e-06, + "loss": 4.2625, + "step": 64345 + }, + { + "epoch": 4.37219730941704, + "grad_norm": 0.29751861095428467, + "learning_rate": 4.537046473705667e-06, + "loss": 3.9554, + "step": 64350 + }, + { + "epoch": 4.372537029487702, + "grad_norm": 0.29595011472702026, + "learning_rate": 4.53662182361734e-06, + "loss": 3.9716, + "step": 64355 + }, + { + "epoch": 4.372876749558364, + "grad_norm": 0.2864948511123657, + "learning_rate": 4.536197173529012e-06, + "loss": 3.8879, + "step": 64360 + }, + { + "epoch": 4.373216469629026, + "grad_norm": 0.3192346692085266, + "learning_rate": 4.535772523440685e-06, + "loss": 4.0981, + "step": 64365 + }, + { + "epoch": 4.373556189699688, + "grad_norm": 0.26873332262039185, + "learning_rate": 4.535347873352358e-06, + "loss": 3.5115, + "step": 64370 + }, + { + "epoch": 4.373895909770349, + "grad_norm": 0.2779117226600647, + "learning_rate": 4.534923223264031e-06, + "loss": 4.2041, + "step": 64375 + }, + { + "epoch": 4.374235629841011, + "grad_norm": 0.27117469906806946, + "learning_rate": 4.534498573175704e-06, + "loss": 4.0096, + "step": 64380 + }, + { + "epoch": 4.374575349911673, + "grad_norm": 0.30328384041786194, + "learning_rate": 4.534073923087376e-06, + "loss": 3.8077, + "step": 64385 + }, + { + "epoch": 4.374915069982334, + "grad_norm": 0.26886409521102905, + "learning_rate": 4.533649272999049e-06, + "loss": 4.0809, + "step": 64390 + }, + { + "epoch": 4.375254790052996, + "grad_norm": 0.2955224812030792, + "learning_rate": 4.533224622910722e-06, + "loss": 4.1438, + "step": 64395 + }, + { + "epoch": 4.375594510123658, + "grad_norm": 0.309663861989975, + "learning_rate": 4.532799972822395e-06, + "loss": 4.1647, + "step": 64400 + }, + { + "epoch": 4.37593423019432, + "grad_norm": 0.2624545991420746, + "learning_rate": 4.532375322734068e-06, + "loss": 4.0562, + "step": 64405 + }, + { + "epoch": 4.376273950264982, + "grad_norm": 0.2510583996772766, + "learning_rate": 4.53195067264574e-06, + "loss": 3.7696, + "step": 64410 + }, + { + "epoch": 4.376613670335644, + "grad_norm": 0.3857061564922333, + "learning_rate": 4.531526022557413e-06, + "loss": 4.0015, + "step": 64415 + }, + { + "epoch": 4.376953390406305, + "grad_norm": 0.19177919626235962, + "learning_rate": 4.531101372469086e-06, + "loss": 4.0703, + "step": 64420 + }, + { + "epoch": 4.377293110476967, + "grad_norm": 0.2942197620868683, + "learning_rate": 4.530676722380759e-06, + "loss": 4.1598, + "step": 64425 + }, + { + "epoch": 4.377632830547629, + "grad_norm": 0.25958120822906494, + "learning_rate": 4.530252072292432e-06, + "loss": 4.1841, + "step": 64430 + }, + { + "epoch": 4.37797255061829, + "grad_norm": 0.2453373223543167, + "learning_rate": 4.529827422204104e-06, + "loss": 4.061, + "step": 64435 + }, + { + "epoch": 4.378312270688952, + "grad_norm": 0.31084302067756653, + "learning_rate": 4.529402772115777e-06, + "loss": 3.8474, + "step": 64440 + }, + { + "epoch": 4.378651990759614, + "grad_norm": 0.2291397899389267, + "learning_rate": 4.52897812202745e-06, + "loss": 3.7867, + "step": 64445 + }, + { + "epoch": 4.378991710830276, + "grad_norm": 0.30967187881469727, + "learning_rate": 4.528553471939123e-06, + "loss": 4.2236, + "step": 64450 + }, + { + "epoch": 4.379331430900938, + "grad_norm": 0.29312634468078613, + "learning_rate": 4.528128821850795e-06, + "loss": 3.8668, + "step": 64455 + }, + { + "epoch": 4.3796711509716, + "grad_norm": 0.20044997334480286, + "learning_rate": 4.527704171762468e-06, + "loss": 3.9925, + "step": 64460 + }, + { + "epoch": 4.380010871042261, + "grad_norm": 0.25066447257995605, + "learning_rate": 4.527279521674141e-06, + "loss": 4.1041, + "step": 64465 + }, + { + "epoch": 4.380350591112923, + "grad_norm": 0.2733761668205261, + "learning_rate": 4.526854871585813e-06, + "loss": 3.8459, + "step": 64470 + }, + { + "epoch": 4.380690311183585, + "grad_norm": 0.2861167788505554, + "learning_rate": 4.526430221497487e-06, + "loss": 3.8841, + "step": 64475 + }, + { + "epoch": 4.381030031254246, + "grad_norm": 0.34816524386405945, + "learning_rate": 4.52600557140916e-06, + "loss": 3.878, + "step": 64480 + }, + { + "epoch": 4.381369751324908, + "grad_norm": 0.27597689628601074, + "learning_rate": 4.525580921320832e-06, + "loss": 3.9788, + "step": 64485 + }, + { + "epoch": 4.38170947139557, + "grad_norm": 0.2629246413707733, + "learning_rate": 4.525156271232504e-06, + "loss": 3.9941, + "step": 64490 + }, + { + "epoch": 4.382049191466232, + "grad_norm": 0.3238661587238312, + "learning_rate": 4.524731621144178e-06, + "loss": 4.0392, + "step": 64495 + }, + { + "epoch": 4.382388911536894, + "grad_norm": 0.38129308819770813, + "learning_rate": 4.52430697105585e-06, + "loss": 3.8078, + "step": 64500 + }, + { + "epoch": 4.382728631607556, + "grad_norm": 0.3137515187263489, + "learning_rate": 4.523882320967523e-06, + "loss": 4.0663, + "step": 64505 + }, + { + "epoch": 4.383068351678217, + "grad_norm": 0.315973162651062, + "learning_rate": 4.5234576708791964e-06, + "loss": 3.8646, + "step": 64510 + }, + { + "epoch": 4.383408071748879, + "grad_norm": 0.21462111175060272, + "learning_rate": 4.523033020790868e-06, + "loss": 3.9867, + "step": 64515 + }, + { + "epoch": 4.383747791819541, + "grad_norm": 0.3140121400356293, + "learning_rate": 4.522608370702541e-06, + "loss": 4.0445, + "step": 64520 + }, + { + "epoch": 4.384087511890202, + "grad_norm": 0.30984577536582947, + "learning_rate": 4.522183720614214e-06, + "loss": 4.2512, + "step": 64525 + }, + { + "epoch": 4.384427231960864, + "grad_norm": 0.2609419822692871, + "learning_rate": 4.521759070525887e-06, + "loss": 4.1146, + "step": 64530 + }, + { + "epoch": 4.384766952031526, + "grad_norm": 0.26400700211524963, + "learning_rate": 4.52133442043756e-06, + "loss": 3.8492, + "step": 64535 + }, + { + "epoch": 4.385106672102188, + "grad_norm": 0.32106053829193115, + "learning_rate": 4.520909770349232e-06, + "loss": 4.1019, + "step": 64540 + }, + { + "epoch": 4.38544639217285, + "grad_norm": 0.2959202229976654, + "learning_rate": 4.520485120260905e-06, + "loss": 3.9526, + "step": 64545 + }, + { + "epoch": 4.385786112243512, + "grad_norm": 0.3408406972885132, + "learning_rate": 4.520060470172578e-06, + "loss": 4.1158, + "step": 64550 + }, + { + "epoch": 4.386125832314173, + "grad_norm": 0.22472362220287323, + "learning_rate": 4.519635820084251e-06, + "loss": 3.8429, + "step": 64555 + }, + { + "epoch": 4.386465552384835, + "grad_norm": 0.2355060577392578, + "learning_rate": 4.519211169995924e-06, + "loss": 3.8356, + "step": 64560 + }, + { + "epoch": 4.386805272455497, + "grad_norm": 0.28859198093414307, + "learning_rate": 4.518786519907596e-06, + "loss": 4.1552, + "step": 64565 + }, + { + "epoch": 4.387144992526158, + "grad_norm": 0.2608895003795624, + "learning_rate": 4.518361869819269e-06, + "loss": 3.8942, + "step": 64570 + }, + { + "epoch": 4.38748471259682, + "grad_norm": 0.3096848726272583, + "learning_rate": 4.517937219730942e-06, + "loss": 3.9735, + "step": 64575 + }, + { + "epoch": 4.3878244326674825, + "grad_norm": 0.27842625975608826, + "learning_rate": 4.517512569642615e-06, + "loss": 4.138, + "step": 64580 + }, + { + "epoch": 4.388164152738144, + "grad_norm": 0.2213279753923416, + "learning_rate": 4.517087919554288e-06, + "loss": 4.0128, + "step": 64585 + }, + { + "epoch": 4.388503872808806, + "grad_norm": 0.2441047877073288, + "learning_rate": 4.51666326946596e-06, + "loss": 4.1552, + "step": 64590 + }, + { + "epoch": 4.388843592879467, + "grad_norm": 0.27166077494621277, + "learning_rate": 4.516238619377633e-06, + "loss": 4.1995, + "step": 64595 + }, + { + "epoch": 4.389183312950129, + "grad_norm": 0.29382702708244324, + "learning_rate": 4.515813969289306e-06, + "loss": 3.9302, + "step": 64600 + }, + { + "epoch": 4.389523033020791, + "grad_norm": 0.3129461109638214, + "learning_rate": 4.515389319200979e-06, + "loss": 4.2599, + "step": 64605 + }, + { + "epoch": 4.389862753091452, + "grad_norm": 0.36520400643348694, + "learning_rate": 4.514964669112652e-06, + "loss": 4.0889, + "step": 64610 + }, + { + "epoch": 4.390202473162114, + "grad_norm": 0.24224752187728882, + "learning_rate": 4.514540019024324e-06, + "loss": 4.339, + "step": 64615 + }, + { + "epoch": 4.390542193232776, + "grad_norm": 0.23421648144721985, + "learning_rate": 4.514115368935997e-06, + "loss": 3.9214, + "step": 64620 + }, + { + "epoch": 4.390881913303438, + "grad_norm": 0.28729334473609924, + "learning_rate": 4.51369071884767e-06, + "loss": 4.1147, + "step": 64625 + }, + { + "epoch": 4.3912216333741, + "grad_norm": 0.34036925435066223, + "learning_rate": 4.513266068759343e-06, + "loss": 3.7844, + "step": 64630 + }, + { + "epoch": 4.391561353444762, + "grad_norm": 0.24541208148002625, + "learning_rate": 4.512841418671016e-06, + "loss": 3.8207, + "step": 64635 + }, + { + "epoch": 4.391901073515423, + "grad_norm": 0.2318154275417328, + "learning_rate": 4.512416768582688e-06, + "loss": 3.9435, + "step": 64640 + }, + { + "epoch": 4.392240793586085, + "grad_norm": 0.24716085195541382, + "learning_rate": 4.511992118494361e-06, + "loss": 3.8946, + "step": 64645 + }, + { + "epoch": 4.392580513656747, + "grad_norm": 0.356723815202713, + "learning_rate": 4.511567468406034e-06, + "loss": 3.8889, + "step": 64650 + }, + { + "epoch": 4.392920233727408, + "grad_norm": 0.27103492617607117, + "learning_rate": 4.511142818317707e-06, + "loss": 3.8967, + "step": 64655 + }, + { + "epoch": 4.39325995379807, + "grad_norm": 0.22467011213302612, + "learning_rate": 4.51071816822938e-06, + "loss": 4.0027, + "step": 64660 + }, + { + "epoch": 4.393599673868732, + "grad_norm": 0.33727556467056274, + "learning_rate": 4.510293518141052e-06, + "loss": 3.8625, + "step": 64665 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.2713577151298523, + "learning_rate": 4.509868868052724e-06, + "loss": 3.8895, + "step": 64670 + }, + { + "epoch": 4.394279114010056, + "grad_norm": 0.256331205368042, + "learning_rate": 4.509444217964398e-06, + "loss": 3.7482, + "step": 64675 + }, + { + "epoch": 4.394618834080718, + "grad_norm": 0.40101492404937744, + "learning_rate": 4.509019567876071e-06, + "loss": 4.0684, + "step": 64680 + }, + { + "epoch": 4.394958554151379, + "grad_norm": 0.24732157588005066, + "learning_rate": 4.508594917787743e-06, + "loss": 4.1531, + "step": 64685 + }, + { + "epoch": 4.395298274222041, + "grad_norm": 0.27754828333854675, + "learning_rate": 4.508170267699416e-06, + "loss": 3.9856, + "step": 64690 + }, + { + "epoch": 4.395637994292703, + "grad_norm": 0.3964206278324127, + "learning_rate": 4.507745617611089e-06, + "loss": 3.7123, + "step": 64695 + }, + { + "epoch": 4.395977714363364, + "grad_norm": 0.2913822531700134, + "learning_rate": 4.507320967522761e-06, + "loss": 3.7124, + "step": 64700 + }, + { + "epoch": 4.396317434434026, + "grad_norm": 0.42568761110305786, + "learning_rate": 4.506896317434434e-06, + "loss": 3.7994, + "step": 64705 + }, + { + "epoch": 4.396657154504688, + "grad_norm": 0.3286326229572296, + "learning_rate": 4.506471667346108e-06, + "loss": 3.7424, + "step": 64710 + }, + { + "epoch": 4.39699687457535, + "grad_norm": 0.2569708228111267, + "learning_rate": 4.5060470172577796e-06, + "loss": 3.8, + "step": 64715 + }, + { + "epoch": 4.397336594646012, + "grad_norm": 0.41647869348526, + "learning_rate": 4.505622367169452e-06, + "loss": 3.8706, + "step": 64720 + }, + { + "epoch": 4.397676314716674, + "grad_norm": 0.2533653974533081, + "learning_rate": 4.505197717081126e-06, + "loss": 3.8802, + "step": 64725 + }, + { + "epoch": 4.398016034787335, + "grad_norm": 0.23723770678043365, + "learning_rate": 4.504773066992798e-06, + "loss": 3.739, + "step": 64730 + }, + { + "epoch": 4.398355754857997, + "grad_norm": 0.23057927191257477, + "learning_rate": 4.504348416904471e-06, + "loss": 4.0562, + "step": 64735 + }, + { + "epoch": 4.398695474928659, + "grad_norm": 0.26392287015914917, + "learning_rate": 4.5039237668161436e-06, + "loss": 3.6994, + "step": 64740 + }, + { + "epoch": 4.39903519499932, + "grad_norm": 0.3026772737503052, + "learning_rate": 4.503499116727817e-06, + "loss": 4.0359, + "step": 64745 + }, + { + "epoch": 4.399374915069982, + "grad_norm": 0.22993510961532593, + "learning_rate": 4.503074466639489e-06, + "loss": 4.0816, + "step": 64750 + }, + { + "epoch": 4.399714635140644, + "grad_norm": 0.2507398724555969, + "learning_rate": 4.502649816551162e-06, + "loss": 3.9348, + "step": 64755 + }, + { + "epoch": 4.400054355211306, + "grad_norm": 0.3573918044567108, + "learning_rate": 4.502225166462836e-06, + "loss": 3.8907, + "step": 64760 + }, + { + "epoch": 4.400394075281968, + "grad_norm": 0.2341560274362564, + "learning_rate": 4.5018005163745076e-06, + "loss": 3.8279, + "step": 64765 + }, + { + "epoch": 4.40073379535263, + "grad_norm": 0.28241419792175293, + "learning_rate": 4.50137586628618e-06, + "loss": 3.7779, + "step": 64770 + }, + { + "epoch": 4.401073515423291, + "grad_norm": 0.2159312665462494, + "learning_rate": 4.500951216197853e-06, + "loss": 3.8837, + "step": 64775 + }, + { + "epoch": 4.401413235493953, + "grad_norm": 0.24386830627918243, + "learning_rate": 4.500526566109526e-06, + "loss": 3.7942, + "step": 64780 + }, + { + "epoch": 4.401752955564615, + "grad_norm": 0.24405469000339508, + "learning_rate": 4.500101916021199e-06, + "loss": 3.7661, + "step": 64785 + }, + { + "epoch": 4.402092675635276, + "grad_norm": 0.25006821751594543, + "learning_rate": 4.499677265932872e-06, + "loss": 3.9199, + "step": 64790 + }, + { + "epoch": 4.402432395705938, + "grad_norm": 0.2636537551879883, + "learning_rate": 4.499252615844544e-06, + "loss": 3.9965, + "step": 64795 + }, + { + "epoch": 4.4027721157766, + "grad_norm": 0.28980323672294617, + "learning_rate": 4.498827965756217e-06, + "loss": 3.585, + "step": 64800 + }, + { + "epoch": 4.403111835847262, + "grad_norm": 0.317505806684494, + "learning_rate": 4.49840331566789e-06, + "loss": 4.0017, + "step": 64805 + }, + { + "epoch": 4.403451555917924, + "grad_norm": 0.33324378728866577, + "learning_rate": 4.497978665579563e-06, + "loss": 3.7853, + "step": 64810 + }, + { + "epoch": 4.403791275988586, + "grad_norm": 0.28636765480041504, + "learning_rate": 4.497554015491236e-06, + "loss": 4.0003, + "step": 64815 + }, + { + "epoch": 4.404130996059247, + "grad_norm": 0.25393950939178467, + "learning_rate": 4.497129365402908e-06, + "loss": 3.9303, + "step": 64820 + }, + { + "epoch": 4.404470716129909, + "grad_norm": 0.4096316695213318, + "learning_rate": 4.496704715314581e-06, + "loss": 3.6155, + "step": 64825 + }, + { + "epoch": 4.404810436200571, + "grad_norm": 0.3497980833053589, + "learning_rate": 4.496280065226254e-06, + "loss": 4.2395, + "step": 64830 + }, + { + "epoch": 4.405150156271232, + "grad_norm": 0.3176078796386719, + "learning_rate": 4.495855415137927e-06, + "loss": 4.0737, + "step": 64835 + }, + { + "epoch": 4.405489876341894, + "grad_norm": 0.26194247603416443, + "learning_rate": 4.4954307650496e-06, + "loss": 3.8718, + "step": 64840 + }, + { + "epoch": 4.4058295964125564, + "grad_norm": 0.3040235936641693, + "learning_rate": 4.495006114961272e-06, + "loss": 3.8166, + "step": 64845 + }, + { + "epoch": 4.406169316483218, + "grad_norm": 0.2102835476398468, + "learning_rate": 4.494581464872945e-06, + "loss": 3.9203, + "step": 64850 + }, + { + "epoch": 4.40650903655388, + "grad_norm": 0.2673487365245819, + "learning_rate": 4.494156814784618e-06, + "loss": 3.9665, + "step": 64855 + }, + { + "epoch": 4.406848756624542, + "grad_norm": 0.2861614227294922, + "learning_rate": 4.493732164696291e-06, + "loss": 3.7436, + "step": 64860 + }, + { + "epoch": 4.407188476695203, + "grad_norm": 0.23653019964694977, + "learning_rate": 4.493307514607964e-06, + "loss": 3.8988, + "step": 64865 + }, + { + "epoch": 4.407528196765865, + "grad_norm": 0.2406398355960846, + "learning_rate": 4.4928828645196355e-06, + "loss": 3.7656, + "step": 64870 + }, + { + "epoch": 4.407867916836526, + "grad_norm": 0.31146594882011414, + "learning_rate": 4.492458214431309e-06, + "loss": 3.9625, + "step": 64875 + }, + { + "epoch": 4.408207636907188, + "grad_norm": 0.42873454093933105, + "learning_rate": 4.492033564342982e-06, + "loss": 4.1749, + "step": 64880 + }, + { + "epoch": 4.40854735697785, + "grad_norm": 0.2102338820695877, + "learning_rate": 4.491608914254654e-06, + "loss": 4.0232, + "step": 64885 + }, + { + "epoch": 4.408887077048512, + "grad_norm": 0.26058050990104675, + "learning_rate": 4.491184264166328e-06, + "loss": 4.1943, + "step": 64890 + }, + { + "epoch": 4.409226797119174, + "grad_norm": 0.2658223509788513, + "learning_rate": 4.490759614078e-06, + "loss": 3.9731, + "step": 64895 + }, + { + "epoch": 4.409566517189836, + "grad_norm": 0.23374533653259277, + "learning_rate": 4.490334963989672e-06, + "loss": 3.8317, + "step": 64900 + }, + { + "epoch": 4.409906237260497, + "grad_norm": 0.27458515763282776, + "learning_rate": 4.489910313901346e-06, + "loss": 3.9568, + "step": 64905 + }, + { + "epoch": 4.410245957331159, + "grad_norm": 0.2761603891849518, + "learning_rate": 4.489485663813019e-06, + "loss": 3.9105, + "step": 64910 + }, + { + "epoch": 4.410585677401821, + "grad_norm": 0.3112422525882721, + "learning_rate": 4.489061013724692e-06, + "loss": 4.096, + "step": 64915 + }, + { + "epoch": 4.410925397472482, + "grad_norm": 0.32369595766067505, + "learning_rate": 4.4886363636363636e-06, + "loss": 3.9363, + "step": 64920 + }, + { + "epoch": 4.411265117543144, + "grad_norm": 0.2837754487991333, + "learning_rate": 4.488211713548037e-06, + "loss": 3.9225, + "step": 64925 + }, + { + "epoch": 4.411604837613806, + "grad_norm": 0.23666517436504364, + "learning_rate": 4.48778706345971e-06, + "loss": 3.8015, + "step": 64930 + }, + { + "epoch": 4.411944557684468, + "grad_norm": 0.2871319353580475, + "learning_rate": 4.487362413371382e-06, + "loss": 4.1842, + "step": 64935 + }, + { + "epoch": 4.41228427775513, + "grad_norm": 0.2972487211227417, + "learning_rate": 4.486937763283056e-06, + "loss": 3.9257, + "step": 64940 + }, + { + "epoch": 4.412623997825792, + "grad_norm": 0.23415596783161163, + "learning_rate": 4.486513113194728e-06, + "loss": 3.8497, + "step": 64945 + }, + { + "epoch": 4.412963717896453, + "grad_norm": 0.2895003855228424, + "learning_rate": 4.4860884631064e-06, + "loss": 3.9649, + "step": 64950 + }, + { + "epoch": 4.413303437967115, + "grad_norm": 0.3883235454559326, + "learning_rate": 4.485663813018073e-06, + "loss": 4.031, + "step": 64955 + }, + { + "epoch": 4.413643158037777, + "grad_norm": 0.23617486655712128, + "learning_rate": 4.485239162929747e-06, + "loss": 4.2148, + "step": 64960 + }, + { + "epoch": 4.413982878108438, + "grad_norm": 0.1773141771554947, + "learning_rate": 4.484814512841419e-06, + "loss": 3.9147, + "step": 64965 + }, + { + "epoch": 4.4143225981791, + "grad_norm": 0.3321852385997772, + "learning_rate": 4.4843898627530916e-06, + "loss": 4.1315, + "step": 64970 + }, + { + "epoch": 4.414662318249762, + "grad_norm": 0.29141002893447876, + "learning_rate": 4.483965212664765e-06, + "loss": 3.9496, + "step": 64975 + }, + { + "epoch": 4.415002038320424, + "grad_norm": 0.2356981784105301, + "learning_rate": 4.483540562576437e-06, + "loss": 4.1011, + "step": 64980 + }, + { + "epoch": 4.415341758391086, + "grad_norm": 0.3570925295352936, + "learning_rate": 4.48311591248811e-06, + "loss": 4.1237, + "step": 64985 + }, + { + "epoch": 4.415681478461748, + "grad_norm": 0.25936004519462585, + "learning_rate": 4.482691262399783e-06, + "loss": 4.1885, + "step": 64990 + }, + { + "epoch": 4.416021198532409, + "grad_norm": 0.2808702290058136, + "learning_rate": 4.4822666123114556e-06, + "loss": 3.9858, + "step": 64995 + }, + { + "epoch": 4.416360918603071, + "grad_norm": 0.2960493564605713, + "learning_rate": 4.481841962223128e-06, + "loss": 3.7613, + "step": 65000 + }, + { + "epoch": 4.416700638673733, + "grad_norm": 0.3998723328113556, + "learning_rate": 4.481417312134801e-06, + "loss": 3.8598, + "step": 65005 + }, + { + "epoch": 4.417040358744394, + "grad_norm": 0.24176892638206482, + "learning_rate": 4.480992662046474e-06, + "loss": 3.942, + "step": 65010 + }, + { + "epoch": 4.417380078815056, + "grad_norm": 0.2972923219203949, + "learning_rate": 4.480568011958147e-06, + "loss": 4.0968, + "step": 65015 + }, + { + "epoch": 4.417719798885718, + "grad_norm": 0.2910829484462738, + "learning_rate": 4.4801433618698196e-06, + "loss": 4.0574, + "step": 65020 + }, + { + "epoch": 4.41805951895638, + "grad_norm": 0.27981019020080566, + "learning_rate": 4.479718711781492e-06, + "loss": 4.0614, + "step": 65025 + }, + { + "epoch": 4.418399239027042, + "grad_norm": 0.3166872560977936, + "learning_rate": 4.479294061693165e-06, + "loss": 4.0832, + "step": 65030 + }, + { + "epoch": 4.418738959097704, + "grad_norm": 0.2539702355861664, + "learning_rate": 4.478869411604838e-06, + "loss": 3.9326, + "step": 65035 + }, + { + "epoch": 4.419078679168365, + "grad_norm": 0.2425973117351532, + "learning_rate": 4.478444761516511e-06, + "loss": 3.7172, + "step": 65040 + }, + { + "epoch": 4.419418399239027, + "grad_norm": 0.37584593892097473, + "learning_rate": 4.4780201114281836e-06, + "loss": 3.7694, + "step": 65045 + }, + { + "epoch": 4.419758119309689, + "grad_norm": 0.30355557799339294, + "learning_rate": 4.477595461339856e-06, + "loss": 4.0815, + "step": 65050 + }, + { + "epoch": 4.42009783938035, + "grad_norm": 0.22016359865665436, + "learning_rate": 4.477170811251529e-06, + "loss": 4.2319, + "step": 65055 + }, + { + "epoch": 4.420437559451012, + "grad_norm": 0.2824268341064453, + "learning_rate": 4.476746161163202e-06, + "loss": 4.0364, + "step": 65060 + }, + { + "epoch": 4.420777279521674, + "grad_norm": 0.27023401856422424, + "learning_rate": 4.476321511074875e-06, + "loss": 3.9641, + "step": 65065 + }, + { + "epoch": 4.421116999592336, + "grad_norm": 0.26498860120773315, + "learning_rate": 4.4758968609865476e-06, + "loss": 3.8241, + "step": 65070 + }, + { + "epoch": 4.421456719662998, + "grad_norm": 0.30205437541007996, + "learning_rate": 4.47547221089822e-06, + "loss": 3.8871, + "step": 65075 + }, + { + "epoch": 4.42179643973366, + "grad_norm": 0.2532247304916382, + "learning_rate": 4.475047560809893e-06, + "loss": 3.7243, + "step": 65080 + }, + { + "epoch": 4.422136159804321, + "grad_norm": 0.25602927803993225, + "learning_rate": 4.474622910721566e-06, + "loss": 3.9502, + "step": 65085 + }, + { + "epoch": 4.422475879874983, + "grad_norm": 0.2644886374473572, + "learning_rate": 4.474198260633239e-06, + "loss": 3.4947, + "step": 65090 + }, + { + "epoch": 4.422815599945645, + "grad_norm": 0.30426549911499023, + "learning_rate": 4.473773610544912e-06, + "loss": 3.9314, + "step": 65095 + }, + { + "epoch": 4.423155320016306, + "grad_norm": 0.31259533762931824, + "learning_rate": 4.473348960456584e-06, + "loss": 4.0068, + "step": 65100 + }, + { + "epoch": 4.423495040086968, + "grad_norm": 0.29200854897499084, + "learning_rate": 4.472924310368257e-06, + "loss": 3.9236, + "step": 65105 + }, + { + "epoch": 4.42383476015763, + "grad_norm": 0.21716414391994476, + "learning_rate": 4.47249966027993e-06, + "loss": 3.7788, + "step": 65110 + }, + { + "epoch": 4.424174480228292, + "grad_norm": 0.27634191513061523, + "learning_rate": 4.472075010191603e-06, + "loss": 4.05, + "step": 65115 + }, + { + "epoch": 4.424514200298954, + "grad_norm": 0.3117299973964691, + "learning_rate": 4.471650360103275e-06, + "loss": 3.752, + "step": 65120 + }, + { + "epoch": 4.424853920369616, + "grad_norm": 0.3025243580341339, + "learning_rate": 4.471225710014948e-06, + "loss": 3.849, + "step": 65125 + }, + { + "epoch": 4.425193640440277, + "grad_norm": 0.4532644748687744, + "learning_rate": 4.470801059926621e-06, + "loss": 3.8507, + "step": 65130 + }, + { + "epoch": 4.425533360510939, + "grad_norm": 0.21665042638778687, + "learning_rate": 4.470376409838293e-06, + "loss": 3.8006, + "step": 65135 + }, + { + "epoch": 4.425873080581601, + "grad_norm": 0.39507147669792175, + "learning_rate": 4.469951759749967e-06, + "loss": 3.9596, + "step": 65140 + }, + { + "epoch": 4.426212800652262, + "grad_norm": 0.32555413246154785, + "learning_rate": 4.46952710966164e-06, + "loss": 4.1518, + "step": 65145 + }, + { + "epoch": 4.426552520722924, + "grad_norm": 0.5695114135742188, + "learning_rate": 4.4691024595733115e-06, + "loss": 3.9922, + "step": 65150 + }, + { + "epoch": 4.4268922407935865, + "grad_norm": 0.26941102743148804, + "learning_rate": 4.468677809484984e-06, + "loss": 3.8479, + "step": 65155 + }, + { + "epoch": 4.427231960864248, + "grad_norm": 0.3001646399497986, + "learning_rate": 4.468253159396658e-06, + "loss": 4.2478, + "step": 65160 + }, + { + "epoch": 4.42757168093491, + "grad_norm": 0.33858537673950195, + "learning_rate": 4.46782850930833e-06, + "loss": 4.1287, + "step": 65165 + }, + { + "epoch": 4.427911401005572, + "grad_norm": 0.28132563829421997, + "learning_rate": 4.467403859220003e-06, + "loss": 4.0303, + "step": 65170 + }, + { + "epoch": 4.428251121076233, + "grad_norm": 0.3055199086666107, + "learning_rate": 4.466979209131676e-06, + "loss": 3.7619, + "step": 65175 + }, + { + "epoch": 4.428590841146895, + "grad_norm": 0.2253206968307495, + "learning_rate": 4.466554559043348e-06, + "loss": 3.9401, + "step": 65180 + }, + { + "epoch": 4.428930561217557, + "grad_norm": 0.42242297530174255, + "learning_rate": 4.466129908955021e-06, + "loss": 4.1617, + "step": 65185 + }, + { + "epoch": 4.429270281288218, + "grad_norm": 0.33496296405792236, + "learning_rate": 4.465705258866695e-06, + "loss": 4.0256, + "step": 65190 + }, + { + "epoch": 4.42961000135888, + "grad_norm": 0.4177621006965637, + "learning_rate": 4.465280608778367e-06, + "loss": 3.9335, + "step": 65195 + }, + { + "epoch": 4.4299497214295425, + "grad_norm": 0.2664821743965149, + "learning_rate": 4.4648559586900395e-06, + "loss": 4.1917, + "step": 65200 + }, + { + "epoch": 4.430289441500204, + "grad_norm": 0.30498814582824707, + "learning_rate": 4.464431308601712e-06, + "loss": 3.9352, + "step": 65205 + }, + { + "epoch": 4.430629161570866, + "grad_norm": 0.19307832419872284, + "learning_rate": 4.464006658513385e-06, + "loss": 3.7837, + "step": 65210 + }, + { + "epoch": 4.430968881641528, + "grad_norm": 0.3643074631690979, + "learning_rate": 4.463582008425058e-06, + "loss": 4.41, + "step": 65215 + }, + { + "epoch": 4.431308601712189, + "grad_norm": 0.26777854561805725, + "learning_rate": 4.463157358336731e-06, + "loss": 4.0388, + "step": 65220 + }, + { + "epoch": 4.431648321782851, + "grad_norm": 0.2563091218471527, + "learning_rate": 4.4627327082484036e-06, + "loss": 3.8943, + "step": 65225 + }, + { + "epoch": 4.431988041853513, + "grad_norm": 0.3550940752029419, + "learning_rate": 4.462308058160076e-06, + "loss": 3.8922, + "step": 65230 + }, + { + "epoch": 4.432327761924174, + "grad_norm": 0.38080915808677673, + "learning_rate": 4.461883408071749e-06, + "loss": 4.1475, + "step": 65235 + }, + { + "epoch": 4.432667481994836, + "grad_norm": 0.3347424864768982, + "learning_rate": 4.461458757983422e-06, + "loss": 4.1885, + "step": 65240 + }, + { + "epoch": 4.4330072020654985, + "grad_norm": 0.2866474986076355, + "learning_rate": 4.461034107895095e-06, + "loss": 3.9008, + "step": 65245 + }, + { + "epoch": 4.43334692213616, + "grad_norm": 0.22432266175746918, + "learning_rate": 4.4606094578067676e-06, + "loss": 4.3011, + "step": 65250 + }, + { + "epoch": 4.433686642206822, + "grad_norm": 0.3291207253932953, + "learning_rate": 4.46018480771844e-06, + "loss": 3.8486, + "step": 65255 + }, + { + "epoch": 4.434026362277484, + "grad_norm": 0.30687612295150757, + "learning_rate": 4.459760157630113e-06, + "loss": 4.0738, + "step": 65260 + }, + { + "epoch": 4.434366082348145, + "grad_norm": 0.33138543367385864, + "learning_rate": 4.459335507541786e-06, + "loss": 4.142, + "step": 65265 + }, + { + "epoch": 4.434705802418807, + "grad_norm": 0.31343698501586914, + "learning_rate": 4.458910857453459e-06, + "loss": 4.1313, + "step": 65270 + }, + { + "epoch": 4.435045522489468, + "grad_norm": 0.2671862542629242, + "learning_rate": 4.4584862073651316e-06, + "loss": 3.9192, + "step": 65275 + }, + { + "epoch": 4.43538524256013, + "grad_norm": 0.3924698829650879, + "learning_rate": 4.458061557276804e-06, + "loss": 3.896, + "step": 65280 + }, + { + "epoch": 4.435724962630792, + "grad_norm": 0.27790114283561707, + "learning_rate": 4.457636907188477e-06, + "loss": 4.3399, + "step": 65285 + }, + { + "epoch": 4.436064682701454, + "grad_norm": 0.27714571356773376, + "learning_rate": 4.45721225710015e-06, + "loss": 3.8513, + "step": 65290 + }, + { + "epoch": 4.436404402772116, + "grad_norm": 0.40689462423324585, + "learning_rate": 4.456787607011823e-06, + "loss": 4.0816, + "step": 65295 + }, + { + "epoch": 4.436744122842778, + "grad_norm": 0.3239899277687073, + "learning_rate": 4.4563629569234956e-06, + "loss": 3.872, + "step": 65300 + }, + { + "epoch": 4.437083842913439, + "grad_norm": 0.3287176489830017, + "learning_rate": 4.455938306835168e-06, + "loss": 3.9847, + "step": 65305 + }, + { + "epoch": 4.437423562984101, + "grad_norm": 0.29212018847465515, + "learning_rate": 4.4555985867645064e-06, + "loss": 4.1595, + "step": 65310 + }, + { + "epoch": 4.437763283054763, + "grad_norm": 0.2791972756385803, + "learning_rate": 4.455173936676179e-06, + "loss": 3.9457, + "step": 65315 + }, + { + "epoch": 4.438103003125424, + "grad_norm": 0.2654697299003601, + "learning_rate": 4.454749286587852e-06, + "loss": 4.1112, + "step": 65320 + }, + { + "epoch": 4.438442723196086, + "grad_norm": 0.2357431948184967, + "learning_rate": 4.454324636499525e-06, + "loss": 3.6837, + "step": 65325 + }, + { + "epoch": 4.438782443266748, + "grad_norm": 0.2213243991136551, + "learning_rate": 4.453899986411198e-06, + "loss": 3.7754, + "step": 65330 + }, + { + "epoch": 4.43912216333741, + "grad_norm": 0.30930399894714355, + "learning_rate": 4.4534753363228704e-06, + "loss": 4.0707, + "step": 65335 + }, + { + "epoch": 4.439461883408072, + "grad_norm": 0.2611488699913025, + "learning_rate": 4.453050686234543e-06, + "loss": 3.8829, + "step": 65340 + }, + { + "epoch": 4.439801603478734, + "grad_norm": 0.2796533405780792, + "learning_rate": 4.452626036146216e-06, + "loss": 3.8018, + "step": 65345 + }, + { + "epoch": 4.440141323549395, + "grad_norm": 0.2773681879043579, + "learning_rate": 4.452201386057889e-06, + "loss": 3.9575, + "step": 65350 + }, + { + "epoch": 4.440481043620057, + "grad_norm": 0.21872709691524506, + "learning_rate": 4.451776735969562e-06, + "loss": 3.9124, + "step": 65355 + }, + { + "epoch": 4.440820763690719, + "grad_norm": 0.35362669825553894, + "learning_rate": 4.4513520858812344e-06, + "loss": 3.7905, + "step": 65360 + }, + { + "epoch": 4.44116048376138, + "grad_norm": 0.341456800699234, + "learning_rate": 4.450927435792907e-06, + "loss": 4.1847, + "step": 65365 + }, + { + "epoch": 4.441500203832042, + "grad_norm": 0.2547778785228729, + "learning_rate": 4.450502785704579e-06, + "loss": 3.7718, + "step": 65370 + }, + { + "epoch": 4.441839923902704, + "grad_norm": 0.29386937618255615, + "learning_rate": 4.450078135616253e-06, + "loss": 3.7396, + "step": 65375 + }, + { + "epoch": 4.442179643973366, + "grad_norm": 0.3153959810733795, + "learning_rate": 4.449653485527926e-06, + "loss": 4.0511, + "step": 65380 + }, + { + "epoch": 4.442519364044028, + "grad_norm": 0.27438393235206604, + "learning_rate": 4.449228835439598e-06, + "loss": 3.7032, + "step": 65385 + }, + { + "epoch": 4.44285908411469, + "grad_norm": 0.2795828878879547, + "learning_rate": 4.448804185351271e-06, + "loss": 4.0581, + "step": 65390 + }, + { + "epoch": 4.443198804185351, + "grad_norm": 0.31509870290756226, + "learning_rate": 4.448379535262944e-06, + "loss": 4.0489, + "step": 65395 + }, + { + "epoch": 4.443538524256013, + "grad_norm": 0.30783092975616455, + "learning_rate": 4.447954885174616e-06, + "loss": 3.947, + "step": 65400 + }, + { + "epoch": 4.443878244326675, + "grad_norm": 0.29476872086524963, + "learning_rate": 4.44753023508629e-06, + "loss": 3.9388, + "step": 65405 + }, + { + "epoch": 4.444217964397336, + "grad_norm": 0.2626904547214508, + "learning_rate": 4.4471055849979625e-06, + "loss": 3.8351, + "step": 65410 + }, + { + "epoch": 4.444557684467998, + "grad_norm": 0.2926519215106964, + "learning_rate": 4.446680934909634e-06, + "loss": 3.8559, + "step": 65415 + }, + { + "epoch": 4.4448974045386604, + "grad_norm": 0.24017100036144257, + "learning_rate": 4.446256284821307e-06, + "loss": 3.8341, + "step": 65420 + }, + { + "epoch": 4.445237124609322, + "grad_norm": 0.2652307450771332, + "learning_rate": 4.445831634732981e-06, + "loss": 4.1998, + "step": 65425 + }, + { + "epoch": 4.445576844679984, + "grad_norm": 0.3136250078678131, + "learning_rate": 4.445406984644653e-06, + "loss": 4.0836, + "step": 65430 + }, + { + "epoch": 4.445916564750646, + "grad_norm": 0.29435229301452637, + "learning_rate": 4.444982334556326e-06, + "loss": 4.0661, + "step": 65435 + }, + { + "epoch": 4.446256284821307, + "grad_norm": 0.19816286861896515, + "learning_rate": 4.444557684467999e-06, + "loss": 4.004, + "step": 65440 + }, + { + "epoch": 4.446596004891969, + "grad_norm": 0.8735499382019043, + "learning_rate": 4.444133034379671e-06, + "loss": 4.2395, + "step": 65445 + }, + { + "epoch": 4.446935724962631, + "grad_norm": 0.24035578966140747, + "learning_rate": 4.443708384291344e-06, + "loss": 3.8592, + "step": 65450 + }, + { + "epoch": 4.447275445033292, + "grad_norm": 0.2346624881029129, + "learning_rate": 4.443283734203017e-06, + "loss": 3.9828, + "step": 65455 + }, + { + "epoch": 4.447615165103954, + "grad_norm": 0.2703493535518646, + "learning_rate": 4.4428590841146905e-06, + "loss": 3.7571, + "step": 65460 + }, + { + "epoch": 4.4479548851746165, + "grad_norm": 0.338941752910614, + "learning_rate": 4.442434434026362e-06, + "loss": 4.1502, + "step": 65465 + }, + { + "epoch": 4.448294605245278, + "grad_norm": 0.2942905128002167, + "learning_rate": 4.442009783938035e-06, + "loss": 3.8621, + "step": 65470 + }, + { + "epoch": 4.44863432531594, + "grad_norm": 0.27471795678138733, + "learning_rate": 4.441585133849709e-06, + "loss": 3.8751, + "step": 65475 + }, + { + "epoch": 4.448974045386602, + "grad_norm": 0.3508402407169342, + "learning_rate": 4.441160483761381e-06, + "loss": 4.1153, + "step": 65480 + }, + { + "epoch": 4.449313765457263, + "grad_norm": 0.5401585102081299, + "learning_rate": 4.440735833673054e-06, + "loss": 3.6992, + "step": 65485 + }, + { + "epoch": 4.449653485527925, + "grad_norm": 0.29161956906318665, + "learning_rate": 4.440311183584726e-06, + "loss": 3.863, + "step": 65490 + }, + { + "epoch": 4.449993205598587, + "grad_norm": 0.2660660743713379, + "learning_rate": 4.439886533496399e-06, + "loss": 4.1056, + "step": 65495 + }, + { + "epoch": 4.450332925669248, + "grad_norm": 0.22576870024204254, + "learning_rate": 4.439461883408072e-06, + "loss": 3.9007, + "step": 65500 + }, + { + "epoch": 4.45067264573991, + "grad_norm": 0.22681286931037903, + "learning_rate": 4.439037233319745e-06, + "loss": 4.0626, + "step": 65505 + }, + { + "epoch": 4.4510123658105725, + "grad_norm": 0.28579017519950867, + "learning_rate": 4.438612583231418e-06, + "loss": 3.8473, + "step": 65510 + }, + { + "epoch": 4.451352085881234, + "grad_norm": 0.25769537687301636, + "learning_rate": 4.43818793314309e-06, + "loss": 4.2317, + "step": 65515 + }, + { + "epoch": 4.451691805951896, + "grad_norm": 0.31450411677360535, + "learning_rate": 4.437763283054763e-06, + "loss": 3.9795, + "step": 65520 + }, + { + "epoch": 4.452031526022558, + "grad_norm": 0.2559819221496582, + "learning_rate": 4.437338632966436e-06, + "loss": 3.9548, + "step": 65525 + }, + { + "epoch": 4.452371246093219, + "grad_norm": 0.24369852244853973, + "learning_rate": 4.436913982878109e-06, + "loss": 3.5574, + "step": 65530 + }, + { + "epoch": 4.452710966163881, + "grad_norm": 0.2523629367351532, + "learning_rate": 4.436489332789782e-06, + "loss": 3.9023, + "step": 65535 + }, + { + "epoch": 4.453050686234543, + "grad_norm": 0.22525854408740997, + "learning_rate": 4.436064682701454e-06, + "loss": 4.0551, + "step": 65540 + }, + { + "epoch": 4.453390406305204, + "grad_norm": 0.28217554092407227, + "learning_rate": 4.435640032613127e-06, + "loss": 3.9622, + "step": 65545 + }, + { + "epoch": 4.453730126375866, + "grad_norm": 0.20456227660179138, + "learning_rate": 4.4352153825248e-06, + "loss": 3.8378, + "step": 65550 + }, + { + "epoch": 4.454069846446528, + "grad_norm": 0.2102375030517578, + "learning_rate": 4.434790732436473e-06, + "loss": 3.9639, + "step": 65555 + }, + { + "epoch": 4.45440956651719, + "grad_norm": 0.232166588306427, + "learning_rate": 4.434366082348146e-06, + "loss": 3.7555, + "step": 65560 + }, + { + "epoch": 4.454749286587852, + "grad_norm": 0.2989981472492218, + "learning_rate": 4.4339414322598184e-06, + "loss": 3.9228, + "step": 65565 + }, + { + "epoch": 4.455089006658513, + "grad_norm": 0.3458876311779022, + "learning_rate": 4.433516782171491e-06, + "loss": 4.0983, + "step": 65570 + }, + { + "epoch": 4.455428726729175, + "grad_norm": 0.2315695583820343, + "learning_rate": 4.433092132083164e-06, + "loss": 4.1029, + "step": 65575 + }, + { + "epoch": 4.455768446799837, + "grad_norm": 0.2826087176799774, + "learning_rate": 4.432667481994837e-06, + "loss": 3.7672, + "step": 65580 + }, + { + "epoch": 4.456108166870498, + "grad_norm": 0.2413892298936844, + "learning_rate": 4.432242831906509e-06, + "loss": 3.8362, + "step": 65585 + }, + { + "epoch": 4.45644788694116, + "grad_norm": 0.28512412309646606, + "learning_rate": 4.4318181818181824e-06, + "loss": 4.0371, + "step": 65590 + }, + { + "epoch": 4.456787607011822, + "grad_norm": 0.24534253776073456, + "learning_rate": 4.431393531729855e-06, + "loss": 3.9908, + "step": 65595 + }, + { + "epoch": 4.457127327082484, + "grad_norm": 0.2697662115097046, + "learning_rate": 4.430968881641527e-06, + "loss": 3.9814, + "step": 65600 + }, + { + "epoch": 4.457467047153146, + "grad_norm": 0.47937095165252686, + "learning_rate": 4.430544231553201e-06, + "loss": 3.8479, + "step": 65605 + }, + { + "epoch": 4.457806767223808, + "grad_norm": 0.32301539182662964, + "learning_rate": 4.430119581464874e-06, + "loss": 4.1585, + "step": 65610 + }, + { + "epoch": 4.458146487294469, + "grad_norm": 0.3437241315841675, + "learning_rate": 4.429694931376546e-06, + "loss": 4.0446, + "step": 65615 + }, + { + "epoch": 4.458486207365131, + "grad_norm": 0.228526771068573, + "learning_rate": 4.429270281288218e-06, + "loss": 3.9902, + "step": 65620 + }, + { + "epoch": 4.458825927435793, + "grad_norm": 0.28384649753570557, + "learning_rate": 4.428845631199892e-06, + "loss": 3.5638, + "step": 65625 + }, + { + "epoch": 4.459165647506454, + "grad_norm": 0.5058556199073792, + "learning_rate": 4.428420981111565e-06, + "loss": 4.0059, + "step": 65630 + }, + { + "epoch": 4.459505367577116, + "grad_norm": 0.308971643447876, + "learning_rate": 4.427996331023237e-06, + "loss": 4.1013, + "step": 65635 + }, + { + "epoch": 4.459845087647778, + "grad_norm": 0.2652873694896698, + "learning_rate": 4.4275716809349104e-06, + "loss": 4.1093, + "step": 65640 + }, + { + "epoch": 4.46018480771844, + "grad_norm": 0.32281285524368286, + "learning_rate": 4.427147030846583e-06, + "loss": 3.9159, + "step": 65645 + }, + { + "epoch": 4.460524527789102, + "grad_norm": 0.25746405124664307, + "learning_rate": 4.426722380758255e-06, + "loss": 3.8817, + "step": 65650 + }, + { + "epoch": 4.460864247859764, + "grad_norm": 0.3816303312778473, + "learning_rate": 4.426297730669928e-06, + "loss": 3.827, + "step": 65655 + }, + { + "epoch": 4.461203967930425, + "grad_norm": 0.3877706825733185, + "learning_rate": 4.425873080581602e-06, + "loss": 3.7831, + "step": 65660 + }, + { + "epoch": 4.461543688001087, + "grad_norm": 0.2594763934612274, + "learning_rate": 4.425448430493274e-06, + "loss": 3.8496, + "step": 65665 + }, + { + "epoch": 4.461883408071749, + "grad_norm": 0.33369502425193787, + "learning_rate": 4.425023780404946e-06, + "loss": 3.9246, + "step": 65670 + }, + { + "epoch": 4.46222312814241, + "grad_norm": 0.2391049712896347, + "learning_rate": 4.42459913031662e-06, + "loss": 4.1122, + "step": 65675 + }, + { + "epoch": 4.462562848213072, + "grad_norm": 0.3430517315864563, + "learning_rate": 4.424174480228292e-06, + "loss": 3.9809, + "step": 65680 + }, + { + "epoch": 4.462902568283734, + "grad_norm": 0.30058854818344116, + "learning_rate": 4.423749830139965e-06, + "loss": 4.0104, + "step": 65685 + }, + { + "epoch": 4.463242288354396, + "grad_norm": 0.2542206346988678, + "learning_rate": 4.4233251800516384e-06, + "loss": 4.0481, + "step": 65690 + }, + { + "epoch": 4.463582008425058, + "grad_norm": 0.361184298992157, + "learning_rate": 4.42290052996331e-06, + "loss": 3.8112, + "step": 65695 + }, + { + "epoch": 4.46392172849572, + "grad_norm": 0.30797094106674194, + "learning_rate": 4.422475879874983e-06, + "loss": 4.116, + "step": 65700 + }, + { + "epoch": 4.464261448566381, + "grad_norm": 0.26382046937942505, + "learning_rate": 4.422051229786656e-06, + "loss": 3.9314, + "step": 65705 + }, + { + "epoch": 4.464601168637043, + "grad_norm": 0.28428035974502563, + "learning_rate": 4.421626579698329e-06, + "loss": 3.7885, + "step": 65710 + }, + { + "epoch": 4.464940888707705, + "grad_norm": 0.2748422622680664, + "learning_rate": 4.421201929610002e-06, + "loss": 3.9245, + "step": 65715 + }, + { + "epoch": 4.465280608778366, + "grad_norm": 0.2846805453300476, + "learning_rate": 4.420777279521674e-06, + "loss": 4.011, + "step": 65720 + }, + { + "epoch": 4.465620328849028, + "grad_norm": 0.320799320936203, + "learning_rate": 4.420352629433347e-06, + "loss": 3.8571, + "step": 65725 + }, + { + "epoch": 4.4659600489196905, + "grad_norm": 0.23203080892562866, + "learning_rate": 4.41992797934502e-06, + "loss": 4.0151, + "step": 65730 + }, + { + "epoch": 4.466299768990352, + "grad_norm": 0.4093700349330902, + "learning_rate": 4.419503329256693e-06, + "loss": 3.8221, + "step": 65735 + }, + { + "epoch": 4.466639489061014, + "grad_norm": 0.35916373133659363, + "learning_rate": 4.419078679168366e-06, + "loss": 3.7023, + "step": 65740 + }, + { + "epoch": 4.466979209131676, + "grad_norm": 0.30376508831977844, + "learning_rate": 4.418654029080038e-06, + "loss": 3.814, + "step": 65745 + }, + { + "epoch": 4.467318929202337, + "grad_norm": 0.21918579936027527, + "learning_rate": 4.418229378991711e-06, + "loss": 3.7532, + "step": 65750 + }, + { + "epoch": 4.467658649272999, + "grad_norm": 0.24723540246486664, + "learning_rate": 4.417804728903384e-06, + "loss": 3.8918, + "step": 65755 + }, + { + "epoch": 4.467998369343661, + "grad_norm": 0.2349080890417099, + "learning_rate": 4.417380078815057e-06, + "loss": 4.082, + "step": 65760 + }, + { + "epoch": 4.468338089414322, + "grad_norm": 0.2727521061897278, + "learning_rate": 4.41695542872673e-06, + "loss": 4.0776, + "step": 65765 + }, + { + "epoch": 4.468677809484984, + "grad_norm": 0.2564133107662201, + "learning_rate": 4.416530778638402e-06, + "loss": 4.0677, + "step": 65770 + }, + { + "epoch": 4.4690175295556465, + "grad_norm": 0.23602253198623657, + "learning_rate": 4.416106128550075e-06, + "loss": 3.9796, + "step": 65775 + }, + { + "epoch": 4.469357249626308, + "grad_norm": 0.2509639263153076, + "learning_rate": 4.415681478461748e-06, + "loss": 3.7201, + "step": 65780 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.21461379528045654, + "learning_rate": 4.415256828373421e-06, + "loss": 3.8074, + "step": 65785 + }, + { + "epoch": 4.470036689767632, + "grad_norm": 0.2524453401565552, + "learning_rate": 4.414832178285094e-06, + "loss": 3.8831, + "step": 65790 + }, + { + "epoch": 4.470376409838293, + "grad_norm": 0.2933724522590637, + "learning_rate": 4.414407528196766e-06, + "loss": 4.2665, + "step": 65795 + }, + { + "epoch": 4.470716129908955, + "grad_norm": 0.27520957589149475, + "learning_rate": 4.413982878108439e-06, + "loss": 3.8779, + "step": 65800 + }, + { + "epoch": 4.471055849979617, + "grad_norm": 0.3261263072490692, + "learning_rate": 4.413558228020112e-06, + "loss": 3.9789, + "step": 65805 + }, + { + "epoch": 4.471395570050278, + "grad_norm": 0.2448045015335083, + "learning_rate": 4.413133577931785e-06, + "loss": 3.8927, + "step": 65810 + }, + { + "epoch": 4.47173529012094, + "grad_norm": 0.2847655415534973, + "learning_rate": 4.412708927843458e-06, + "loss": 3.7923, + "step": 65815 + }, + { + "epoch": 4.4720750101916025, + "grad_norm": 0.3130068778991699, + "learning_rate": 4.41228427775513e-06, + "loss": 3.8409, + "step": 65820 + }, + { + "epoch": 4.472414730262264, + "grad_norm": 0.33084022998809814, + "learning_rate": 4.411859627666803e-06, + "loss": 3.9059, + "step": 65825 + }, + { + "epoch": 4.472754450332926, + "grad_norm": 0.3072475492954254, + "learning_rate": 4.411434977578476e-06, + "loss": 4.0557, + "step": 65830 + }, + { + "epoch": 4.473094170403588, + "grad_norm": 0.28055787086486816, + "learning_rate": 4.411010327490148e-06, + "loss": 3.8465, + "step": 65835 + }, + { + "epoch": 4.473433890474249, + "grad_norm": 0.24483461678028107, + "learning_rate": 4.410585677401822e-06, + "loss": 4.1032, + "step": 65840 + }, + { + "epoch": 4.473773610544911, + "grad_norm": 0.3440193831920624, + "learning_rate": 4.410161027313494e-06, + "loss": 3.8746, + "step": 65845 + }, + { + "epoch": 4.474113330615573, + "grad_norm": 0.2688996493816376, + "learning_rate": 4.409736377225166e-06, + "loss": 3.753, + "step": 65850 + }, + { + "epoch": 4.474453050686234, + "grad_norm": 0.3997231423854828, + "learning_rate": 4.40931172713684e-06, + "loss": 3.7088, + "step": 65855 + }, + { + "epoch": 4.474792770756896, + "grad_norm": 0.2536531090736389, + "learning_rate": 4.408887077048513e-06, + "loss": 3.7921, + "step": 65860 + }, + { + "epoch": 4.4751324908275585, + "grad_norm": 0.2657826542854309, + "learning_rate": 4.408462426960185e-06, + "loss": 4.0207, + "step": 65865 + }, + { + "epoch": 4.47547221089822, + "grad_norm": 0.2375260591506958, + "learning_rate": 4.408037776871858e-06, + "loss": 4.059, + "step": 65870 + }, + { + "epoch": 4.475811930968882, + "grad_norm": 0.6158850193023682, + "learning_rate": 4.407613126783531e-06, + "loss": 3.9661, + "step": 65875 + }, + { + "epoch": 4.476151651039544, + "grad_norm": 0.23400047421455383, + "learning_rate": 4.407188476695203e-06, + "loss": 4.0841, + "step": 65880 + }, + { + "epoch": 4.476491371110205, + "grad_norm": 0.2937452793121338, + "learning_rate": 4.406763826606876e-06, + "loss": 3.8345, + "step": 65885 + }, + { + "epoch": 4.476831091180867, + "grad_norm": 0.30217641592025757, + "learning_rate": 4.40633917651855e-06, + "loss": 4.0885, + "step": 65890 + }, + { + "epoch": 4.477170811251529, + "grad_norm": 0.27968892455101013, + "learning_rate": 4.405914526430222e-06, + "loss": 3.9164, + "step": 65895 + }, + { + "epoch": 4.47751053132219, + "grad_norm": 0.33322077989578247, + "learning_rate": 4.405489876341894e-06, + "loss": 3.8744, + "step": 65900 + }, + { + "epoch": 4.477850251392852, + "grad_norm": 0.31911981105804443, + "learning_rate": 4.405065226253567e-06, + "loss": 3.9155, + "step": 65905 + }, + { + "epoch": 4.4781899714635145, + "grad_norm": 0.2227923572063446, + "learning_rate": 4.40464057616524e-06, + "loss": 3.7942, + "step": 65910 + }, + { + "epoch": 4.478529691534176, + "grad_norm": 0.23060691356658936, + "learning_rate": 4.404215926076913e-06, + "loss": 4.1624, + "step": 65915 + }, + { + "epoch": 4.478869411604838, + "grad_norm": 0.2948561906814575, + "learning_rate": 4.403791275988586e-06, + "loss": 3.6819, + "step": 65920 + }, + { + "epoch": 4.4792091316755, + "grad_norm": 0.3354584574699402, + "learning_rate": 4.403366625900258e-06, + "loss": 4.0582, + "step": 65925 + }, + { + "epoch": 4.479548851746161, + "grad_norm": 0.29425331950187683, + "learning_rate": 4.402941975811931e-06, + "loss": 3.9782, + "step": 65930 + }, + { + "epoch": 4.479888571816823, + "grad_norm": 0.4065363109111786, + "learning_rate": 4.402517325723604e-06, + "loss": 3.9941, + "step": 65935 + }, + { + "epoch": 4.480228291887485, + "grad_norm": 0.2069021612405777, + "learning_rate": 4.402092675635277e-06, + "loss": 3.7128, + "step": 65940 + }, + { + "epoch": 4.480568011958146, + "grad_norm": 0.42339232563972473, + "learning_rate": 4.40166802554695e-06, + "loss": 4.1177, + "step": 65945 + }, + { + "epoch": 4.480907732028808, + "grad_norm": 0.3115142285823822, + "learning_rate": 4.401243375458622e-06, + "loss": 4.0021, + "step": 65950 + }, + { + "epoch": 4.4812474520994705, + "grad_norm": 0.3572777807712555, + "learning_rate": 4.400818725370295e-06, + "loss": 3.9467, + "step": 65955 + }, + { + "epoch": 4.481587172170132, + "grad_norm": 0.3681219816207886, + "learning_rate": 4.400394075281968e-06, + "loss": 3.9629, + "step": 65960 + }, + { + "epoch": 4.481926892240794, + "grad_norm": 0.27517396211624146, + "learning_rate": 4.399969425193641e-06, + "loss": 3.8422, + "step": 65965 + }, + { + "epoch": 4.482266612311455, + "grad_norm": 0.24587103724479675, + "learning_rate": 4.399544775105314e-06, + "loss": 3.8972, + "step": 65970 + }, + { + "epoch": 4.482606332382117, + "grad_norm": 0.2311912328004837, + "learning_rate": 4.399120125016986e-06, + "loss": 3.9311, + "step": 65975 + }, + { + "epoch": 4.482946052452779, + "grad_norm": 0.29948747158050537, + "learning_rate": 4.398695474928659e-06, + "loss": 3.9454, + "step": 65980 + }, + { + "epoch": 4.48328577252344, + "grad_norm": 0.32405713200569153, + "learning_rate": 4.398270824840332e-06, + "loss": 4.0389, + "step": 65985 + }, + { + "epoch": 4.483625492594102, + "grad_norm": 0.20752961933612823, + "learning_rate": 4.397846174752005e-06, + "loss": 3.8367, + "step": 65990 + }, + { + "epoch": 4.483965212664764, + "grad_norm": 0.23090074956417084, + "learning_rate": 4.397421524663678e-06, + "loss": 3.7597, + "step": 65995 + }, + { + "epoch": 4.484304932735426, + "grad_norm": 0.3679594397544861, + "learning_rate": 4.39699687457535e-06, + "loss": 4.0603, + "step": 66000 + }, + { + "epoch": 4.484644652806088, + "grad_norm": 0.34398138523101807, + "learning_rate": 4.396572224487023e-06, + "loss": 4.1355, + "step": 66005 + }, + { + "epoch": 4.48498437287675, + "grad_norm": 0.36612918972969055, + "learning_rate": 4.396147574398696e-06, + "loss": 3.8454, + "step": 66010 + }, + { + "epoch": 4.485324092947411, + "grad_norm": 0.2542549669742584, + "learning_rate": 4.395722924310369e-06, + "loss": 3.9355, + "step": 66015 + }, + { + "epoch": 4.485663813018073, + "grad_norm": 0.2627837359905243, + "learning_rate": 4.395298274222042e-06, + "loss": 4.0665, + "step": 66020 + }, + { + "epoch": 4.486003533088735, + "grad_norm": 0.28147777915000916, + "learning_rate": 4.394873624133714e-06, + "loss": 3.7741, + "step": 66025 + }, + { + "epoch": 4.486343253159396, + "grad_norm": 0.34509849548339844, + "learning_rate": 4.394448974045387e-06, + "loss": 3.9753, + "step": 66030 + }, + { + "epoch": 4.486682973230058, + "grad_norm": 0.25409525632858276, + "learning_rate": 4.394024323957059e-06, + "loss": 3.8249, + "step": 66035 + }, + { + "epoch": 4.4870226933007205, + "grad_norm": 0.24572181701660156, + "learning_rate": 4.393599673868733e-06, + "loss": 4.0609, + "step": 66040 + }, + { + "epoch": 4.487362413371382, + "grad_norm": 0.34126678109169006, + "learning_rate": 4.393175023780406e-06, + "loss": 4.2823, + "step": 66045 + }, + { + "epoch": 4.487702133442044, + "grad_norm": 0.25064393877983093, + "learning_rate": 4.3927503736920776e-06, + "loss": 4.0077, + "step": 66050 + }, + { + "epoch": 4.488041853512706, + "grad_norm": 0.22488395869731903, + "learning_rate": 4.392325723603751e-06, + "loss": 3.6332, + "step": 66055 + }, + { + "epoch": 4.488381573583367, + "grad_norm": 0.24610716104507446, + "learning_rate": 4.391901073515424e-06, + "loss": 3.8906, + "step": 66060 + }, + { + "epoch": 4.488721293654029, + "grad_norm": 0.25518035888671875, + "learning_rate": 4.391476423427096e-06, + "loss": 4.0306, + "step": 66065 + }, + { + "epoch": 4.489061013724691, + "grad_norm": 0.3039434850215912, + "learning_rate": 4.39105177333877e-06, + "loss": 4.056, + "step": 66070 + }, + { + "epoch": 4.489400733795352, + "grad_norm": 0.2862192988395691, + "learning_rate": 4.390627123250442e-06, + "loss": 4.2011, + "step": 66075 + }, + { + "epoch": 4.489740453866014, + "grad_norm": 0.2615927755832672, + "learning_rate": 4.390202473162114e-06, + "loss": 3.7839, + "step": 66080 + }, + { + "epoch": 4.4900801739366765, + "grad_norm": 0.3843514025211334, + "learning_rate": 4.389777823073787e-06, + "loss": 4.059, + "step": 66085 + }, + { + "epoch": 4.490419894007338, + "grad_norm": 0.37392309308052063, + "learning_rate": 4.389353172985461e-06, + "loss": 3.9316, + "step": 66090 + }, + { + "epoch": 4.490759614078, + "grad_norm": 0.36150529980659485, + "learning_rate": 4.388928522897133e-06, + "loss": 3.769, + "step": 66095 + }, + { + "epoch": 4.491099334148662, + "grad_norm": 0.31602784991264343, + "learning_rate": 4.3885038728088056e-06, + "loss": 3.8393, + "step": 66100 + }, + { + "epoch": 4.491439054219323, + "grad_norm": 0.2613423466682434, + "learning_rate": 4.388079222720479e-06, + "loss": 3.8857, + "step": 66105 + }, + { + "epoch": 4.491778774289985, + "grad_norm": 0.2936013340950012, + "learning_rate": 4.387654572632151e-06, + "loss": 3.9044, + "step": 66110 + }, + { + "epoch": 4.492118494360647, + "grad_norm": 0.29930320382118225, + "learning_rate": 4.387229922543824e-06, + "loss": 3.8575, + "step": 66115 + }, + { + "epoch": 4.492458214431308, + "grad_norm": 0.21910162270069122, + "learning_rate": 4.386805272455497e-06, + "loss": 4.3833, + "step": 66120 + }, + { + "epoch": 4.49279793450197, + "grad_norm": 0.1893531233072281, + "learning_rate": 4.3863806223671696e-06, + "loss": 3.821, + "step": 66125 + }, + { + "epoch": 4.4931376545726325, + "grad_norm": 0.33173277974128723, + "learning_rate": 4.385955972278842e-06, + "loss": 3.9461, + "step": 66130 + }, + { + "epoch": 4.493477374643294, + "grad_norm": 0.3294607400894165, + "learning_rate": 4.385531322190515e-06, + "loss": 4.0906, + "step": 66135 + }, + { + "epoch": 4.493817094713956, + "grad_norm": 0.24135582149028778, + "learning_rate": 4.385106672102189e-06, + "loss": 4.0731, + "step": 66140 + }, + { + "epoch": 4.494156814784618, + "grad_norm": 0.23986411094665527, + "learning_rate": 4.384682022013861e-06, + "loss": 3.831, + "step": 66145 + }, + { + "epoch": 4.494496534855279, + "grad_norm": 0.2749132215976715, + "learning_rate": 4.3842573719255336e-06, + "loss": 3.966, + "step": 66150 + }, + { + "epoch": 4.494836254925941, + "grad_norm": 0.24941815435886383, + "learning_rate": 4.383832721837206e-06, + "loss": 3.843, + "step": 66155 + }, + { + "epoch": 4.495175974996603, + "grad_norm": 0.25512054562568665, + "learning_rate": 4.383408071748879e-06, + "loss": 4.2607, + "step": 66160 + }, + { + "epoch": 4.495515695067264, + "grad_norm": 0.3639163672924042, + "learning_rate": 4.382983421660552e-06, + "loss": 4.0284, + "step": 66165 + }, + { + "epoch": 4.495855415137926, + "grad_norm": 0.3430449962615967, + "learning_rate": 4.382558771572225e-06, + "loss": 3.9997, + "step": 66170 + }, + { + "epoch": 4.4961951352085885, + "grad_norm": 0.5605301260948181, + "learning_rate": 4.382134121483898e-06, + "loss": 4.0044, + "step": 66175 + }, + { + "epoch": 4.49653485527925, + "grad_norm": 0.3692574203014374, + "learning_rate": 4.38170947139557e-06, + "loss": 4.0084, + "step": 66180 + }, + { + "epoch": 4.496874575349912, + "grad_norm": 0.5399828553199768, + "learning_rate": 4.381284821307243e-06, + "loss": 3.8992, + "step": 66185 + }, + { + "epoch": 4.497214295420574, + "grad_norm": 0.2671029567718506, + "learning_rate": 4.380860171218916e-06, + "loss": 4.1304, + "step": 66190 + }, + { + "epoch": 4.497554015491235, + "grad_norm": 0.28023481369018555, + "learning_rate": 4.380435521130589e-06, + "loss": 3.9383, + "step": 66195 + }, + { + "epoch": 4.497893735561897, + "grad_norm": 0.3812633752822876, + "learning_rate": 4.380010871042262e-06, + "loss": 4.048, + "step": 66200 + }, + { + "epoch": 4.498233455632559, + "grad_norm": 0.3093312680721283, + "learning_rate": 4.379586220953934e-06, + "loss": 3.7884, + "step": 66205 + }, + { + "epoch": 4.49857317570322, + "grad_norm": 0.2098570019006729, + "learning_rate": 4.379161570865607e-06, + "loss": 4.0308, + "step": 66210 + }, + { + "epoch": 4.498912895773882, + "grad_norm": 0.34720271825790405, + "learning_rate": 4.37873692077728e-06, + "loss": 3.8319, + "step": 66215 + }, + { + "epoch": 4.4992526158445445, + "grad_norm": 0.243211567401886, + "learning_rate": 4.378312270688953e-06, + "loss": 3.882, + "step": 66220 + }, + { + "epoch": 4.499592335915206, + "grad_norm": 0.3177439868450165, + "learning_rate": 4.377887620600626e-06, + "loss": 4.1342, + "step": 66225 + }, + { + "epoch": 4.499932055985868, + "grad_norm": 0.25323933362960815, + "learning_rate": 4.377462970512298e-06, + "loss": 3.9779, + "step": 66230 + }, + { + "epoch": 4.500271776056529, + "grad_norm": 0.30836090445518494, + "learning_rate": 4.377038320423971e-06, + "loss": 3.8267, + "step": 66235 + }, + { + "epoch": 4.500611496127191, + "grad_norm": 0.3299509584903717, + "learning_rate": 4.376613670335644e-06, + "loss": 3.7279, + "step": 66240 + }, + { + "epoch": 4.500951216197853, + "grad_norm": 0.3335301876068115, + "learning_rate": 4.376189020247317e-06, + "loss": 4.0259, + "step": 66245 + }, + { + "epoch": 4.501290936268514, + "grad_norm": 0.40168672800064087, + "learning_rate": 4.375764370158989e-06, + "loss": 3.9632, + "step": 66250 + }, + { + "epoch": 4.501630656339176, + "grad_norm": 0.2839922308921814, + "learning_rate": 4.375339720070662e-06, + "loss": 3.9712, + "step": 66255 + }, + { + "epoch": 4.501970376409838, + "grad_norm": 0.22459904849529266, + "learning_rate": 4.374915069982335e-06, + "loss": 3.964, + "step": 66260 + }, + { + "epoch": 4.5023100964805, + "grad_norm": 0.30191487073898315, + "learning_rate": 4.374490419894007e-06, + "loss": 3.7858, + "step": 66265 + }, + { + "epoch": 4.502649816551162, + "grad_norm": 0.2623608708381653, + "learning_rate": 4.374065769805681e-06, + "loss": 3.9435, + "step": 66270 + }, + { + "epoch": 4.502989536621824, + "grad_norm": 0.2937352657318115, + "learning_rate": 4.373641119717354e-06, + "loss": 3.9019, + "step": 66275 + }, + { + "epoch": 4.503329256692485, + "grad_norm": 0.28875818848609924, + "learning_rate": 4.3732164696290255e-06, + "loss": 4.0268, + "step": 66280 + }, + { + "epoch": 4.503668976763147, + "grad_norm": 0.2917979061603546, + "learning_rate": 4.372791819540698e-06, + "loss": 3.9497, + "step": 66285 + }, + { + "epoch": 4.504008696833809, + "grad_norm": 0.3244050145149231, + "learning_rate": 4.372367169452372e-06, + "loss": 4.0911, + "step": 66290 + }, + { + "epoch": 4.50434841690447, + "grad_norm": 0.2937333285808563, + "learning_rate": 4.371942519364044e-06, + "loss": 3.9882, + "step": 66295 + }, + { + "epoch": 4.504688136975132, + "grad_norm": 0.24389828741550446, + "learning_rate": 4.371517869275717e-06, + "loss": 3.91, + "step": 66300 + }, + { + "epoch": 4.5050278570457944, + "grad_norm": 0.20951144397258759, + "learning_rate": 4.37109321918739e-06, + "loss": 3.9354, + "step": 66305 + }, + { + "epoch": 4.505367577116456, + "grad_norm": 0.27160394191741943, + "learning_rate": 4.370668569099062e-06, + "loss": 3.6487, + "step": 66310 + }, + { + "epoch": 4.505707297187118, + "grad_norm": 0.29032203555107117, + "learning_rate": 4.370243919010735e-06, + "loss": 3.8775, + "step": 66315 + }, + { + "epoch": 4.50604701725778, + "grad_norm": 0.25592249631881714, + "learning_rate": 4.369819268922408e-06, + "loss": 3.8132, + "step": 66320 + }, + { + "epoch": 4.506386737328441, + "grad_norm": 0.2689080238342285, + "learning_rate": 4.369394618834082e-06, + "loss": 4.065, + "step": 66325 + }, + { + "epoch": 4.506726457399103, + "grad_norm": 0.24753518402576447, + "learning_rate": 4.3689699687457536e-06, + "loss": 3.9679, + "step": 66330 + }, + { + "epoch": 4.507066177469765, + "grad_norm": 0.23064841330051422, + "learning_rate": 4.368545318657426e-06, + "loss": 3.9775, + "step": 66335 + }, + { + "epoch": 4.507405897540426, + "grad_norm": 0.44386419653892517, + "learning_rate": 4.3681206685691e-06, + "loss": 3.724, + "step": 66340 + }, + { + "epoch": 4.507745617611088, + "grad_norm": 0.246175616979599, + "learning_rate": 4.367696018480772e-06, + "loss": 4.2302, + "step": 66345 + }, + { + "epoch": 4.5080853376817505, + "grad_norm": 0.2711704969406128, + "learning_rate": 4.367271368392445e-06, + "loss": 3.8872, + "step": 66350 + }, + { + "epoch": 4.508425057752412, + "grad_norm": 0.5033755302429199, + "learning_rate": 4.366846718304118e-06, + "loss": 3.9342, + "step": 66355 + }, + { + "epoch": 4.508764777823074, + "grad_norm": 0.22511738538742065, + "learning_rate": 4.36642206821579e-06, + "loss": 3.9344, + "step": 66360 + }, + { + "epoch": 4.509104497893736, + "grad_norm": 0.27368807792663574, + "learning_rate": 4.365997418127463e-06, + "loss": 4.1448, + "step": 66365 + }, + { + "epoch": 4.509444217964397, + "grad_norm": 0.3349357545375824, + "learning_rate": 4.365572768039136e-06, + "loss": 3.9575, + "step": 66370 + }, + { + "epoch": 4.509783938035059, + "grad_norm": 0.21944747865200043, + "learning_rate": 4.365148117950809e-06, + "loss": 3.7005, + "step": 66375 + }, + { + "epoch": 4.510123658105721, + "grad_norm": 0.4041455090045929, + "learning_rate": 4.3647234678624816e-06, + "loss": 4.1913, + "step": 66380 + }, + { + "epoch": 4.510463378176382, + "grad_norm": 0.4783574342727661, + "learning_rate": 4.364298817774154e-06, + "loss": 4.162, + "step": 66385 + }, + { + "epoch": 4.510803098247044, + "grad_norm": 0.3275356590747833, + "learning_rate": 4.363874167685827e-06, + "loss": 3.8559, + "step": 66390 + }, + { + "epoch": 4.5111428183177065, + "grad_norm": 0.2092715948820114, + "learning_rate": 4.3634495175975e-06, + "loss": 3.8863, + "step": 66395 + }, + { + "epoch": 4.511482538388368, + "grad_norm": 0.3491857051849365, + "learning_rate": 4.363024867509173e-06, + "loss": 3.7703, + "step": 66400 + }, + { + "epoch": 4.51182225845903, + "grad_norm": 0.3311598300933838, + "learning_rate": 4.3626002174208456e-06, + "loss": 4.055, + "step": 66405 + }, + { + "epoch": 4.512161978529692, + "grad_norm": 0.4520234167575836, + "learning_rate": 4.362175567332518e-06, + "loss": 3.9078, + "step": 66410 + }, + { + "epoch": 4.512501698600353, + "grad_norm": 0.2953861653804779, + "learning_rate": 4.361750917244191e-06, + "loss": 3.8452, + "step": 66415 + }, + { + "epoch": 4.512841418671015, + "grad_norm": 0.30080950260162354, + "learning_rate": 4.361326267155864e-06, + "loss": 3.946, + "step": 66420 + }, + { + "epoch": 4.513181138741677, + "grad_norm": 0.2612993121147156, + "learning_rate": 4.360901617067537e-06, + "loss": 3.8585, + "step": 66425 + }, + { + "epoch": 4.513520858812338, + "grad_norm": 0.29326131939888, + "learning_rate": 4.3604769669792096e-06, + "loss": 3.7941, + "step": 66430 + }, + { + "epoch": 4.513860578883, + "grad_norm": 0.25344687700271606, + "learning_rate": 4.360052316890882e-06, + "loss": 3.9759, + "step": 66435 + }, + { + "epoch": 4.5142002989536625, + "grad_norm": 0.338575154542923, + "learning_rate": 4.359627666802555e-06, + "loss": 4.0201, + "step": 66440 + }, + { + "epoch": 4.514540019024324, + "grad_norm": 0.2420404702425003, + "learning_rate": 4.359203016714228e-06, + "loss": 3.8593, + "step": 66445 + }, + { + "epoch": 4.514879739094986, + "grad_norm": 0.25433000922203064, + "learning_rate": 4.358778366625901e-06, + "loss": 3.9037, + "step": 66450 + }, + { + "epoch": 4.515219459165648, + "grad_norm": 0.2902677357196808, + "learning_rate": 4.3583537165375736e-06, + "loss": 3.7394, + "step": 66455 + }, + { + "epoch": 4.515559179236309, + "grad_norm": 0.2078062891960144, + "learning_rate": 4.357929066449246e-06, + "loss": 3.7553, + "step": 66460 + }, + { + "epoch": 4.515898899306971, + "grad_norm": 0.2907601594924927, + "learning_rate": 4.357504416360918e-06, + "loss": 3.9546, + "step": 66465 + }, + { + "epoch": 4.516238619377633, + "grad_norm": 0.2739337980747223, + "learning_rate": 4.357079766272592e-06, + "loss": 3.8882, + "step": 66470 + }, + { + "epoch": 4.516578339448294, + "grad_norm": 0.3064781427383423, + "learning_rate": 4.356655116184265e-06, + "loss": 3.8941, + "step": 66475 + }, + { + "epoch": 4.516918059518956, + "grad_norm": 0.51678067445755, + "learning_rate": 4.356230466095937e-06, + "loss": 3.6705, + "step": 66480 + }, + { + "epoch": 4.5172577795896185, + "grad_norm": 0.28242701292037964, + "learning_rate": 4.35580581600761e-06, + "loss": 4.025, + "step": 66485 + }, + { + "epoch": 4.51759749966028, + "grad_norm": 0.2414208948612213, + "learning_rate": 4.355381165919283e-06, + "loss": 3.865, + "step": 66490 + }, + { + "epoch": 4.517937219730942, + "grad_norm": 0.25046947598457336, + "learning_rate": 4.354956515830956e-06, + "loss": 3.9957, + "step": 66495 + }, + { + "epoch": 4.518276939801604, + "grad_norm": 0.6538980007171631, + "learning_rate": 4.354531865742628e-06, + "loss": 4.0183, + "step": 66500 + }, + { + "epoch": 4.518616659872265, + "grad_norm": 0.3111751079559326, + "learning_rate": 4.354107215654302e-06, + "loss": 3.8314, + "step": 66505 + }, + { + "epoch": 4.518956379942927, + "grad_norm": 0.3055705726146698, + "learning_rate": 4.353682565565974e-06, + "loss": 3.8183, + "step": 66510 + }, + { + "epoch": 4.519296100013589, + "grad_norm": 0.39896681904792786, + "learning_rate": 4.353257915477646e-06, + "loss": 3.9211, + "step": 66515 + }, + { + "epoch": 4.51963582008425, + "grad_norm": 0.2626473307609558, + "learning_rate": 4.35283326538932e-06, + "loss": 4.006, + "step": 66520 + }, + { + "epoch": 4.519975540154912, + "grad_norm": 0.2535940110683441, + "learning_rate": 4.352408615300993e-06, + "loss": 4.1451, + "step": 66525 + }, + { + "epoch": 4.5203152602255745, + "grad_norm": 0.23895573616027832, + "learning_rate": 4.351983965212665e-06, + "loss": 3.9501, + "step": 66530 + }, + { + "epoch": 4.520654980296236, + "grad_norm": 0.278279185295105, + "learning_rate": 4.3515593151243375e-06, + "loss": 3.8175, + "step": 66535 + }, + { + "epoch": 4.520994700366898, + "grad_norm": 0.28852158784866333, + "learning_rate": 4.351134665036011e-06, + "loss": 4.0729, + "step": 66540 + }, + { + "epoch": 4.52133442043756, + "grad_norm": 0.30368101596832275, + "learning_rate": 4.350710014947683e-06, + "loss": 3.7756, + "step": 66545 + }, + { + "epoch": 4.521674140508221, + "grad_norm": 0.2549142837524414, + "learning_rate": 4.350285364859356e-06, + "loss": 4.0116, + "step": 66550 + }, + { + "epoch": 4.522013860578883, + "grad_norm": 0.23078376054763794, + "learning_rate": 4.34986071477103e-06, + "loss": 3.941, + "step": 66555 + }, + { + "epoch": 4.522353580649545, + "grad_norm": 0.31937697529792786, + "learning_rate": 4.3494360646827015e-06, + "loss": 3.8942, + "step": 66560 + }, + { + "epoch": 4.522693300720206, + "grad_norm": 0.24274809658527374, + "learning_rate": 4.349011414594374e-06, + "loss": 3.7152, + "step": 66565 + }, + { + "epoch": 4.523033020790868, + "grad_norm": 0.23309430480003357, + "learning_rate": 4.348586764506047e-06, + "loss": 4.0481, + "step": 66570 + }, + { + "epoch": 4.5233727408615305, + "grad_norm": 0.6262263059616089, + "learning_rate": 4.34816211441772e-06, + "loss": 3.983, + "step": 66575 + }, + { + "epoch": 4.523712460932192, + "grad_norm": 0.25154033303260803, + "learning_rate": 4.347737464329393e-06, + "loss": 4.0393, + "step": 66580 + }, + { + "epoch": 4.524052181002854, + "grad_norm": 0.2958013713359833, + "learning_rate": 4.3473128142410655e-06, + "loss": 4.0642, + "step": 66585 + }, + { + "epoch": 4.524391901073516, + "grad_norm": 0.2082284837961197, + "learning_rate": 4.346888164152738e-06, + "loss": 4.0266, + "step": 66590 + }, + { + "epoch": 4.524731621144177, + "grad_norm": 0.20243023335933685, + "learning_rate": 4.346463514064411e-06, + "loss": 3.8608, + "step": 66595 + }, + { + "epoch": 4.525071341214839, + "grad_norm": 0.3320605456829071, + "learning_rate": 4.346038863976084e-06, + "loss": 4.0059, + "step": 66600 + }, + { + "epoch": 4.525411061285501, + "grad_norm": 0.3413389325141907, + "learning_rate": 4.345614213887757e-06, + "loss": 4.0471, + "step": 66605 + }, + { + "epoch": 4.525750781356162, + "grad_norm": 0.34722602367401123, + "learning_rate": 4.3451895637994296e-06, + "loss": 3.7932, + "step": 66610 + }, + { + "epoch": 4.5260905014268245, + "grad_norm": 0.26297709345817566, + "learning_rate": 4.344764913711102e-06, + "loss": 3.8401, + "step": 66615 + }, + { + "epoch": 4.5264302214974865, + "grad_norm": 0.2872260510921478, + "learning_rate": 4.344340263622775e-06, + "loss": 3.7587, + "step": 66620 + }, + { + "epoch": 4.526769941568148, + "grad_norm": 0.210164412856102, + "learning_rate": 4.343915613534448e-06, + "loss": 3.7736, + "step": 66625 + }, + { + "epoch": 4.52710966163881, + "grad_norm": 0.3620959520339966, + "learning_rate": 4.343490963446121e-06, + "loss": 3.896, + "step": 66630 + }, + { + "epoch": 4.527449381709472, + "grad_norm": 0.30073070526123047, + "learning_rate": 4.3430663133577936e-06, + "loss": 3.9494, + "step": 66635 + }, + { + "epoch": 4.527789101780133, + "grad_norm": 0.22271399199962616, + "learning_rate": 4.342641663269466e-06, + "loss": 3.7032, + "step": 66640 + }, + { + "epoch": 4.528128821850795, + "grad_norm": 0.23983444273471832, + "learning_rate": 4.342217013181139e-06, + "loss": 3.9539, + "step": 66645 + }, + { + "epoch": 4.528468541921457, + "grad_norm": 0.24480241537094116, + "learning_rate": 4.341792363092812e-06, + "loss": 3.9933, + "step": 66650 + }, + { + "epoch": 4.528808261992118, + "grad_norm": 0.1955876648426056, + "learning_rate": 4.341367713004485e-06, + "loss": 4.0259, + "step": 66655 + }, + { + "epoch": 4.5291479820627805, + "grad_norm": 0.29075998067855835, + "learning_rate": 4.3409430629161576e-06, + "loss": 3.9011, + "step": 66660 + }, + { + "epoch": 4.5294877021334425, + "grad_norm": 0.3051586449146271, + "learning_rate": 4.34051841282783e-06, + "loss": 3.95, + "step": 66665 + }, + { + "epoch": 4.529827422204104, + "grad_norm": 0.3319071829319, + "learning_rate": 4.340093762739503e-06, + "loss": 4.0916, + "step": 66670 + }, + { + "epoch": 4.530167142274766, + "grad_norm": 0.32046541571617126, + "learning_rate": 4.339669112651176e-06, + "loss": 4.099, + "step": 66675 + }, + { + "epoch": 4.530506862345427, + "grad_norm": 0.3940086364746094, + "learning_rate": 4.339244462562849e-06, + "loss": 4.2498, + "step": 66680 + }, + { + "epoch": 4.530846582416089, + "grad_norm": 0.3892868757247925, + "learning_rate": 4.3388198124745216e-06, + "loss": 4.064, + "step": 66685 + }, + { + "epoch": 4.531186302486751, + "grad_norm": 0.3386285603046417, + "learning_rate": 4.338395162386194e-06, + "loss": 3.8385, + "step": 66690 + }, + { + "epoch": 4.531526022557412, + "grad_norm": 0.27877217531204224, + "learning_rate": 4.337970512297867e-06, + "loss": 3.9914, + "step": 66695 + }, + { + "epoch": 4.531865742628074, + "grad_norm": 0.22714605927467346, + "learning_rate": 4.33754586220954e-06, + "loss": 3.8134, + "step": 66700 + }, + { + "epoch": 4.5322054626987365, + "grad_norm": 0.26813000440597534, + "learning_rate": 4.337121212121213e-06, + "loss": 3.9212, + "step": 66705 + }, + { + "epoch": 4.532545182769398, + "grad_norm": 0.2913253605365753, + "learning_rate": 4.3366965620328856e-06, + "loss": 4.1513, + "step": 66710 + }, + { + "epoch": 4.53288490284006, + "grad_norm": 0.4980110824108124, + "learning_rate": 4.3362719119445575e-06, + "loss": 3.82, + "step": 66715 + }, + { + "epoch": 4.533224622910722, + "grad_norm": 0.346062034368515, + "learning_rate": 4.335847261856231e-06, + "loss": 4.0813, + "step": 66720 + }, + { + "epoch": 4.533564342981383, + "grad_norm": 0.47167813777923584, + "learning_rate": 4.335422611767904e-06, + "loss": 3.9231, + "step": 66725 + }, + { + "epoch": 4.533904063052045, + "grad_norm": 0.2464008629322052, + "learning_rate": 4.334997961679576e-06, + "loss": 3.8217, + "step": 66730 + }, + { + "epoch": 4.534243783122707, + "grad_norm": 0.20529665052890778, + "learning_rate": 4.3345733115912496e-06, + "loss": 3.8758, + "step": 66735 + }, + { + "epoch": 4.534583503193368, + "grad_norm": 0.21555016934871674, + "learning_rate": 4.334148661502922e-06, + "loss": 4.0792, + "step": 66740 + }, + { + "epoch": 4.53492322326403, + "grad_norm": 0.31390976905822754, + "learning_rate": 4.333724011414594e-06, + "loss": 4.1021, + "step": 66745 + }, + { + "epoch": 4.5352629433346925, + "grad_norm": 0.32914498448371887, + "learning_rate": 4.333299361326267e-06, + "loss": 4.1218, + "step": 66750 + }, + { + "epoch": 4.535602663405354, + "grad_norm": 0.27075615525245667, + "learning_rate": 4.332874711237941e-06, + "loss": 3.7613, + "step": 66755 + }, + { + "epoch": 4.535942383476016, + "grad_norm": 0.21249176561832428, + "learning_rate": 4.332450061149613e-06, + "loss": 4.0739, + "step": 66760 + }, + { + "epoch": 4.536282103546678, + "grad_norm": 0.24318206310272217, + "learning_rate": 4.3320254110612855e-06, + "loss": 4.1395, + "step": 66765 + }, + { + "epoch": 4.536621823617339, + "grad_norm": 0.23147153854370117, + "learning_rate": 4.331600760972959e-06, + "loss": 3.721, + "step": 66770 + }, + { + "epoch": 4.536961543688001, + "grad_norm": 0.27090880274772644, + "learning_rate": 4.331176110884631e-06, + "loss": 4.049, + "step": 66775 + }, + { + "epoch": 4.537301263758663, + "grad_norm": 0.2456812709569931, + "learning_rate": 4.330751460796304e-06, + "loss": 3.7414, + "step": 66780 + }, + { + "epoch": 4.537640983829324, + "grad_norm": 0.3794187307357788, + "learning_rate": 4.330326810707977e-06, + "loss": 3.9564, + "step": 66785 + }, + { + "epoch": 4.537980703899986, + "grad_norm": 0.3272947072982788, + "learning_rate": 4.3299021606196495e-06, + "loss": 4.0305, + "step": 66790 + }, + { + "epoch": 4.5383204239706485, + "grad_norm": 0.2905542254447937, + "learning_rate": 4.329477510531322e-06, + "loss": 4.0918, + "step": 66795 + }, + { + "epoch": 4.53866014404131, + "grad_norm": 0.37539783120155334, + "learning_rate": 4.329052860442995e-06, + "loss": 4.0302, + "step": 66800 + }, + { + "epoch": 4.538999864111972, + "grad_norm": 0.3350543975830078, + "learning_rate": 4.328628210354668e-06, + "loss": 3.9967, + "step": 66805 + }, + { + "epoch": 4.539339584182634, + "grad_norm": 0.32220152020454407, + "learning_rate": 4.328203560266341e-06, + "loss": 4.209, + "step": 66810 + }, + { + "epoch": 4.539679304253295, + "grad_norm": 0.25859135389328003, + "learning_rate": 4.3277789101780135e-06, + "loss": 4.0977, + "step": 66815 + }, + { + "epoch": 4.540019024323957, + "grad_norm": 0.26696452498435974, + "learning_rate": 4.327354260089686e-06, + "loss": 3.9487, + "step": 66820 + }, + { + "epoch": 4.540358744394619, + "grad_norm": 0.27204543352127075, + "learning_rate": 4.326929610001359e-06, + "loss": 4.1567, + "step": 66825 + }, + { + "epoch": 4.54069846446528, + "grad_norm": 0.20796670019626617, + "learning_rate": 4.326504959913032e-06, + "loss": 3.9139, + "step": 66830 + }, + { + "epoch": 4.541038184535942, + "grad_norm": 0.36032986640930176, + "learning_rate": 4.326080309824705e-06, + "loss": 3.9933, + "step": 66835 + }, + { + "epoch": 4.5413779046066045, + "grad_norm": 0.2693155109882355, + "learning_rate": 4.3256556597363775e-06, + "loss": 3.842, + "step": 66840 + }, + { + "epoch": 4.541717624677266, + "grad_norm": 0.32952460646629333, + "learning_rate": 4.32523100964805e-06, + "loss": 3.9407, + "step": 66845 + }, + { + "epoch": 4.542057344747928, + "grad_norm": 0.3287781774997711, + "learning_rate": 4.324806359559723e-06, + "loss": 3.9506, + "step": 66850 + }, + { + "epoch": 4.54239706481859, + "grad_norm": 0.3339976370334625, + "learning_rate": 4.324381709471396e-06, + "loss": 3.9257, + "step": 66855 + }, + { + "epoch": 4.542736784889251, + "grad_norm": 0.21377727389335632, + "learning_rate": 4.323957059383069e-06, + "loss": 3.8941, + "step": 66860 + }, + { + "epoch": 4.543076504959913, + "grad_norm": 0.2809488773345947, + "learning_rate": 4.3235324092947415e-06, + "loss": 4.0077, + "step": 66865 + }, + { + "epoch": 4.543416225030575, + "grad_norm": 0.3917926549911499, + "learning_rate": 4.323107759206414e-06, + "loss": 3.8935, + "step": 66870 + }, + { + "epoch": 4.543755945101236, + "grad_norm": 0.23847374320030212, + "learning_rate": 4.322683109118087e-06, + "loss": 4.0379, + "step": 66875 + }, + { + "epoch": 4.5440956651718984, + "grad_norm": 0.2439882457256317, + "learning_rate": 4.32225845902976e-06, + "loss": 4.0111, + "step": 66880 + }, + { + "epoch": 4.54443538524256, + "grad_norm": 0.2977631092071533, + "learning_rate": 4.321833808941433e-06, + "loss": 3.9175, + "step": 66885 + }, + { + "epoch": 4.544775105313222, + "grad_norm": 0.2488490790128708, + "learning_rate": 4.3214091588531055e-06, + "loss": 3.7145, + "step": 66890 + }, + { + "epoch": 4.545114825383884, + "grad_norm": 0.2335614562034607, + "learning_rate": 4.320984508764778e-06, + "loss": 3.9375, + "step": 66895 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.3040982782840729, + "learning_rate": 4.320559858676451e-06, + "loss": 3.8669, + "step": 66900 + }, + { + "epoch": 4.545794265525207, + "grad_norm": 0.3225767910480499, + "learning_rate": 4.320135208588124e-06, + "loss": 3.7934, + "step": 66905 + }, + { + "epoch": 4.546133985595869, + "grad_norm": 0.3456704616546631, + "learning_rate": 4.319710558499797e-06, + "loss": 4.2529, + "step": 66910 + }, + { + "epoch": 4.54647370566653, + "grad_norm": 0.2269793450832367, + "learning_rate": 4.319285908411469e-06, + "loss": 3.9903, + "step": 66915 + }, + { + "epoch": 4.546813425737192, + "grad_norm": 0.2545710504055023, + "learning_rate": 4.318861258323142e-06, + "loss": 3.8886, + "step": 66920 + }, + { + "epoch": 4.5471531458078545, + "grad_norm": 0.2689346671104431, + "learning_rate": 4.318436608234815e-06, + "loss": 3.8971, + "step": 66925 + }, + { + "epoch": 4.547492865878516, + "grad_norm": 0.292563259601593, + "learning_rate": 4.318011958146487e-06, + "loss": 4.0384, + "step": 66930 + }, + { + "epoch": 4.547832585949178, + "grad_norm": 0.5092889070510864, + "learning_rate": 4.317587308058161e-06, + "loss": 3.8187, + "step": 66935 + }, + { + "epoch": 4.54817230601984, + "grad_norm": 0.3302598297595978, + "learning_rate": 4.3171626579698336e-06, + "loss": 3.8807, + "step": 66940 + }, + { + "epoch": 4.548512026090501, + "grad_norm": 0.2631494402885437, + "learning_rate": 4.3167380078815055e-06, + "loss": 3.9114, + "step": 66945 + }, + { + "epoch": 4.548851746161163, + "grad_norm": 0.35656094551086426, + "learning_rate": 4.316313357793178e-06, + "loss": 4.2724, + "step": 66950 + }, + { + "epoch": 4.549191466231825, + "grad_norm": 0.33647775650024414, + "learning_rate": 4.315888707704852e-06, + "loss": 4.1808, + "step": 66955 + }, + { + "epoch": 4.549531186302486, + "grad_norm": 0.3921380043029785, + "learning_rate": 4.315464057616524e-06, + "loss": 3.9627, + "step": 66960 + }, + { + "epoch": 4.549870906373148, + "grad_norm": 0.2793055474758148, + "learning_rate": 4.315039407528197e-06, + "loss": 3.8056, + "step": 66965 + }, + { + "epoch": 4.5502106264438105, + "grad_norm": 0.2224263697862625, + "learning_rate": 4.31461475743987e-06, + "loss": 3.8165, + "step": 66970 + }, + { + "epoch": 4.550550346514472, + "grad_norm": 0.3828469514846802, + "learning_rate": 4.314190107351542e-06, + "loss": 3.9002, + "step": 66975 + }, + { + "epoch": 4.550890066585134, + "grad_norm": 0.2852330505847931, + "learning_rate": 4.313765457263215e-06, + "loss": 3.774, + "step": 66980 + }, + { + "epoch": 4.551229786655796, + "grad_norm": 0.2708970308303833, + "learning_rate": 4.313340807174889e-06, + "loss": 4.0048, + "step": 66985 + }, + { + "epoch": 4.551569506726457, + "grad_norm": 0.3494373559951782, + "learning_rate": 4.312916157086561e-06, + "loss": 3.964, + "step": 66990 + }, + { + "epoch": 4.551909226797119, + "grad_norm": 0.28025034070014954, + "learning_rate": 4.3124915069982335e-06, + "loss": 4.1887, + "step": 66995 + }, + { + "epoch": 4.552248946867781, + "grad_norm": 0.3537450134754181, + "learning_rate": 4.312066856909906e-06, + "loss": 3.8431, + "step": 67000 + }, + { + "epoch": 4.552588666938442, + "grad_norm": 0.24447527527809143, + "learning_rate": 4.31164220682158e-06, + "loss": 3.8186, + "step": 67005 + }, + { + "epoch": 4.552928387009104, + "grad_norm": 0.23198318481445312, + "learning_rate": 4.311217556733252e-06, + "loss": 3.8265, + "step": 67010 + }, + { + "epoch": 4.5532681070797665, + "grad_norm": 0.2743651866912842, + "learning_rate": 4.310792906644925e-06, + "loss": 3.919, + "step": 67015 + }, + { + "epoch": 4.553607827150428, + "grad_norm": 0.3030359148979187, + "learning_rate": 4.310368256556598e-06, + "loss": 3.818, + "step": 67020 + }, + { + "epoch": 4.55394754722109, + "grad_norm": 0.25877371430397034, + "learning_rate": 4.30994360646827e-06, + "loss": 3.9307, + "step": 67025 + }, + { + "epoch": 4.554287267291752, + "grad_norm": 0.3312717378139496, + "learning_rate": 4.309518956379943e-06, + "loss": 3.9655, + "step": 67030 + }, + { + "epoch": 4.554626987362413, + "grad_norm": 0.321706622838974, + "learning_rate": 4.309094306291616e-06, + "loss": 3.9607, + "step": 67035 + }, + { + "epoch": 4.554966707433075, + "grad_norm": 0.2345273345708847, + "learning_rate": 4.308669656203289e-06, + "loss": 4.0625, + "step": 67040 + }, + { + "epoch": 4.555306427503737, + "grad_norm": 0.252048134803772, + "learning_rate": 4.3082450061149615e-06, + "loss": 3.9003, + "step": 67045 + }, + { + "epoch": 4.555646147574398, + "grad_norm": 0.3016073703765869, + "learning_rate": 4.307820356026634e-06, + "loss": 3.9348, + "step": 67050 + }, + { + "epoch": 4.55598586764506, + "grad_norm": 0.29700344800949097, + "learning_rate": 4.307395705938307e-06, + "loss": 3.9355, + "step": 67055 + }, + { + "epoch": 4.5563255877157225, + "grad_norm": 0.3166508972644806, + "learning_rate": 4.30697105584998e-06, + "loss": 4.2448, + "step": 67060 + }, + { + "epoch": 4.556665307786384, + "grad_norm": 0.2874014377593994, + "learning_rate": 4.306546405761653e-06, + "loss": 4.1364, + "step": 67065 + }, + { + "epoch": 4.557005027857046, + "grad_norm": 0.2332601249217987, + "learning_rate": 4.3061217556733255e-06, + "loss": 3.9514, + "step": 67070 + }, + { + "epoch": 4.557344747927708, + "grad_norm": 0.25075072050094604, + "learning_rate": 4.305697105584998e-06, + "loss": 4.2297, + "step": 67075 + }, + { + "epoch": 4.557684467998369, + "grad_norm": 0.22528867423534393, + "learning_rate": 4.305272455496671e-06, + "loss": 4.0267, + "step": 67080 + }, + { + "epoch": 4.558024188069031, + "grad_norm": 0.2606975734233856, + "learning_rate": 4.304847805408344e-06, + "loss": 4.0484, + "step": 67085 + }, + { + "epoch": 4.558363908139693, + "grad_norm": 0.22526878118515015, + "learning_rate": 4.304423155320017e-06, + "loss": 3.9921, + "step": 67090 + }, + { + "epoch": 4.558703628210354, + "grad_norm": 0.19882962107658386, + "learning_rate": 4.3039985052316895e-06, + "loss": 3.915, + "step": 67095 + }, + { + "epoch": 4.559043348281016, + "grad_norm": 0.38913393020629883, + "learning_rate": 4.303573855143362e-06, + "loss": 3.8499, + "step": 67100 + }, + { + "epoch": 4.5593830683516785, + "grad_norm": 0.29712164402008057, + "learning_rate": 4.303149205055035e-06, + "loss": 4.017, + "step": 67105 + }, + { + "epoch": 4.55972278842234, + "grad_norm": 0.27361831068992615, + "learning_rate": 4.302724554966708e-06, + "loss": 3.7992, + "step": 67110 + }, + { + "epoch": 4.560062508493002, + "grad_norm": 0.2404998540878296, + "learning_rate": 4.302299904878381e-06, + "loss": 3.7773, + "step": 67115 + }, + { + "epoch": 4.560402228563664, + "grad_norm": 0.29378700256347656, + "learning_rate": 4.3018752547900535e-06, + "loss": 3.8393, + "step": 67120 + }, + { + "epoch": 4.560741948634325, + "grad_norm": 0.3603328466415405, + "learning_rate": 4.301450604701726e-06, + "loss": 3.7776, + "step": 67125 + }, + { + "epoch": 4.561081668704987, + "grad_norm": 0.25979894399642944, + "learning_rate": 4.301025954613398e-06, + "loss": 3.9114, + "step": 67130 + }, + { + "epoch": 4.561421388775649, + "grad_norm": 0.49457764625549316, + "learning_rate": 4.300601304525072e-06, + "loss": 3.7065, + "step": 67135 + }, + { + "epoch": 4.56176110884631, + "grad_norm": 0.2527962327003479, + "learning_rate": 4.300176654436745e-06, + "loss": 4.0341, + "step": 67140 + }, + { + "epoch": 4.562100828916972, + "grad_norm": 0.2719496488571167, + "learning_rate": 4.299752004348417e-06, + "loss": 4.0156, + "step": 67145 + }, + { + "epoch": 4.5624405489876345, + "grad_norm": 0.2984282076358795, + "learning_rate": 4.29932735426009e-06, + "loss": 3.9511, + "step": 67150 + }, + { + "epoch": 4.562780269058296, + "grad_norm": 0.26403191685676575, + "learning_rate": 4.298902704171763e-06, + "loss": 3.9252, + "step": 67155 + }, + { + "epoch": 4.563119989128958, + "grad_norm": 0.2667091190814972, + "learning_rate": 4.298478054083435e-06, + "loss": 3.9506, + "step": 67160 + }, + { + "epoch": 4.56345970919962, + "grad_norm": 0.2851885259151459, + "learning_rate": 4.298053403995108e-06, + "loss": 4.0391, + "step": 67165 + }, + { + "epoch": 4.563799429270281, + "grad_norm": 0.2379828691482544, + "learning_rate": 4.2976287539067815e-06, + "loss": 4.045, + "step": 67170 + }, + { + "epoch": 4.564139149340943, + "grad_norm": 0.2655397653579712, + "learning_rate": 4.297204103818454e-06, + "loss": 3.853, + "step": 67175 + }, + { + "epoch": 4.564478869411605, + "grad_norm": 0.3587041199207306, + "learning_rate": 4.296779453730126e-06, + "loss": 3.9855, + "step": 67180 + }, + { + "epoch": 4.564818589482266, + "grad_norm": 0.42295774817466736, + "learning_rate": 4.2963548036418e-06, + "loss": 3.9071, + "step": 67185 + }, + { + "epoch": 4.5651583095529285, + "grad_norm": 0.3008818030357361, + "learning_rate": 4.295930153553473e-06, + "loss": 4.0922, + "step": 67190 + }, + { + "epoch": 4.5654980296235905, + "grad_norm": 0.23368512094020844, + "learning_rate": 4.295505503465145e-06, + "loss": 3.8322, + "step": 67195 + }, + { + "epoch": 4.565837749694252, + "grad_norm": 0.33227768540382385, + "learning_rate": 4.2950808533768175e-06, + "loss": 3.8088, + "step": 67200 + }, + { + "epoch": 4.566177469764914, + "grad_norm": 0.2682523727416992, + "learning_rate": 4.294656203288491e-06, + "loss": 4.0869, + "step": 67205 + }, + { + "epoch": 4.566517189835576, + "grad_norm": 0.28783470392227173, + "learning_rate": 4.294231553200163e-06, + "loss": 3.8583, + "step": 67210 + }, + { + "epoch": 4.566856909906237, + "grad_norm": 0.2810584008693695, + "learning_rate": 4.293806903111836e-06, + "loss": 3.808, + "step": 67215 + }, + { + "epoch": 4.567196629976899, + "grad_norm": 0.2799970805644989, + "learning_rate": 4.2933822530235096e-06, + "loss": 3.6455, + "step": 67220 + }, + { + "epoch": 4.567536350047561, + "grad_norm": 0.33016759157180786, + "learning_rate": 4.2929576029351815e-06, + "loss": 3.7857, + "step": 67225 + }, + { + "epoch": 4.567876070118222, + "grad_norm": 0.22877246141433716, + "learning_rate": 4.292532952846854e-06, + "loss": 3.6859, + "step": 67230 + }, + { + "epoch": 4.5682157901888845, + "grad_norm": 0.21597474813461304, + "learning_rate": 4.292108302758527e-06, + "loss": 4.1681, + "step": 67235 + }, + { + "epoch": 4.5685555102595465, + "grad_norm": 0.22910641133785248, + "learning_rate": 4.2916836526702e-06, + "loss": 3.8419, + "step": 67240 + }, + { + "epoch": 4.568895230330208, + "grad_norm": 0.18981397151947021, + "learning_rate": 4.291259002581873e-06, + "loss": 3.9649, + "step": 67245 + }, + { + "epoch": 4.56923495040087, + "grad_norm": 0.4331468939781189, + "learning_rate": 4.2908343524935455e-06, + "loss": 4.0386, + "step": 67250 + }, + { + "epoch": 4.569574670471532, + "grad_norm": 0.2585020363330841, + "learning_rate": 4.290409702405218e-06, + "loss": 3.8061, + "step": 67255 + }, + { + "epoch": 4.569914390542193, + "grad_norm": 0.2863423824310303, + "learning_rate": 4.289985052316891e-06, + "loss": 3.9028, + "step": 67260 + }, + { + "epoch": 4.570254110612855, + "grad_norm": 0.36296793818473816, + "learning_rate": 4.289560402228564e-06, + "loss": 3.7645, + "step": 67265 + }, + { + "epoch": 4.570593830683517, + "grad_norm": 0.21055181324481964, + "learning_rate": 4.289135752140237e-06, + "loss": 3.8044, + "step": 67270 + }, + { + "epoch": 4.570933550754178, + "grad_norm": 0.26365357637405396, + "learning_rate": 4.2887111020519095e-06, + "loss": 3.8389, + "step": 67275 + }, + { + "epoch": 4.5712732708248405, + "grad_norm": 0.21698567271232605, + "learning_rate": 4.288286451963582e-06, + "loss": 3.6181, + "step": 67280 + }, + { + "epoch": 4.5716129908955025, + "grad_norm": 0.293583482503891, + "learning_rate": 4.287861801875255e-06, + "loss": 3.8211, + "step": 67285 + }, + { + "epoch": 4.571952710966164, + "grad_norm": 0.26564347743988037, + "learning_rate": 4.287437151786928e-06, + "loss": 4.0393, + "step": 67290 + }, + { + "epoch": 4.572292431036826, + "grad_norm": 0.3653591573238373, + "learning_rate": 4.287012501698601e-06, + "loss": 3.744, + "step": 67295 + }, + { + "epoch": 4.572632151107488, + "grad_norm": 0.42057889699935913, + "learning_rate": 4.2865878516102735e-06, + "loss": 4.0364, + "step": 67300 + }, + { + "epoch": 4.572971871178149, + "grad_norm": 0.29418033361434937, + "learning_rate": 4.286163201521946e-06, + "loss": 3.7779, + "step": 67305 + }, + { + "epoch": 4.573311591248811, + "grad_norm": 0.20969215035438538, + "learning_rate": 4.285738551433619e-06, + "loss": 4.1787, + "step": 67310 + }, + { + "epoch": 4.573651311319473, + "grad_norm": 0.3177599012851715, + "learning_rate": 4.285313901345292e-06, + "loss": 3.9864, + "step": 67315 + }, + { + "epoch": 4.573991031390134, + "grad_norm": 0.2569708526134491, + "learning_rate": 4.284889251256965e-06, + "loss": 3.9418, + "step": 67320 + }, + { + "epoch": 4.5743307514607965, + "grad_norm": 0.3514983355998993, + "learning_rate": 4.2844646011686375e-06, + "loss": 3.9508, + "step": 67325 + }, + { + "epoch": 4.574670471531459, + "grad_norm": 0.21085749566555023, + "learning_rate": 4.28403995108031e-06, + "loss": 3.8443, + "step": 67330 + }, + { + "epoch": 4.57501019160212, + "grad_norm": 0.27080875635147095, + "learning_rate": 4.283615300991983e-06, + "loss": 4.0525, + "step": 67335 + }, + { + "epoch": 4.575349911672782, + "grad_norm": 0.18748581409454346, + "learning_rate": 4.283190650903656e-06, + "loss": 3.4613, + "step": 67340 + }, + { + "epoch": 4.575689631743444, + "grad_norm": 0.24332855641841888, + "learning_rate": 4.282766000815329e-06, + "loss": 4.0725, + "step": 67345 + }, + { + "epoch": 4.576029351814105, + "grad_norm": 0.30595919489860535, + "learning_rate": 4.2823413507270015e-06, + "loss": 3.8279, + "step": 67350 + }, + { + "epoch": 4.576369071884767, + "grad_norm": 0.2944265604019165, + "learning_rate": 4.281916700638674e-06, + "loss": 4.2375, + "step": 67355 + }, + { + "epoch": 4.576708791955428, + "grad_norm": 0.2581447958946228, + "learning_rate": 4.281492050550347e-06, + "loss": 3.9135, + "step": 67360 + }, + { + "epoch": 4.57704851202609, + "grad_norm": 0.23097622394561768, + "learning_rate": 4.28106740046202e-06, + "loss": 4.0051, + "step": 67365 + }, + { + "epoch": 4.5773882320967525, + "grad_norm": 0.3144332766532898, + "learning_rate": 4.280642750373693e-06, + "loss": 3.895, + "step": 67370 + }, + { + "epoch": 4.577727952167414, + "grad_norm": 0.28023001551628113, + "learning_rate": 4.2802181002853655e-06, + "loss": 4.0329, + "step": 67375 + }, + { + "epoch": 4.578067672238076, + "grad_norm": 0.2478003203868866, + "learning_rate": 4.2797934501970375e-06, + "loss": 3.9434, + "step": 67380 + }, + { + "epoch": 4.578407392308738, + "grad_norm": 0.38575080037117004, + "learning_rate": 4.279368800108711e-06, + "loss": 4.0503, + "step": 67385 + }, + { + "epoch": 4.578747112379399, + "grad_norm": 0.28385329246520996, + "learning_rate": 4.278944150020384e-06, + "loss": 3.9379, + "step": 67390 + }, + { + "epoch": 4.579086832450061, + "grad_norm": 0.2018290013074875, + "learning_rate": 4.278519499932056e-06, + "loss": 3.9616, + "step": 67395 + }, + { + "epoch": 4.579426552520723, + "grad_norm": 0.4250185489654541, + "learning_rate": 4.2780948498437295e-06, + "loss": 4.2129, + "step": 67400 + }, + { + "epoch": 4.579766272591384, + "grad_norm": 0.2571447491645813, + "learning_rate": 4.277670199755402e-06, + "loss": 4.0625, + "step": 67405 + }, + { + "epoch": 4.580105992662046, + "grad_norm": 0.24500402808189392, + "learning_rate": 4.277245549667074e-06, + "loss": 3.9956, + "step": 67410 + }, + { + "epoch": 4.5804457127327085, + "grad_norm": 0.23353083431720734, + "learning_rate": 4.276820899578747e-06, + "loss": 3.8188, + "step": 67415 + }, + { + "epoch": 4.58078543280337, + "grad_norm": 0.2674793004989624, + "learning_rate": 4.276396249490421e-06, + "loss": 3.9225, + "step": 67420 + }, + { + "epoch": 4.581125152874032, + "grad_norm": 0.21308283507823944, + "learning_rate": 4.275971599402093e-06, + "loss": 3.8558, + "step": 67425 + }, + { + "epoch": 4.581464872944694, + "grad_norm": 0.271158367395401, + "learning_rate": 4.2755469493137655e-06, + "loss": 3.8664, + "step": 67430 + }, + { + "epoch": 4.581804593015355, + "grad_norm": 0.2452126145362854, + "learning_rate": 4.275122299225439e-06, + "loss": 4.1671, + "step": 67435 + }, + { + "epoch": 4.582144313086017, + "grad_norm": 0.31016573309898376, + "learning_rate": 4.274697649137111e-06, + "loss": 3.7043, + "step": 67440 + }, + { + "epoch": 4.582484033156679, + "grad_norm": 0.22776691615581512, + "learning_rate": 4.274272999048784e-06, + "loss": 3.8675, + "step": 67445 + }, + { + "epoch": 4.58282375322734, + "grad_norm": 0.41142815351486206, + "learning_rate": 4.273848348960457e-06, + "loss": 3.8136, + "step": 67450 + }, + { + "epoch": 4.583163473298002, + "grad_norm": 0.22693519294261932, + "learning_rate": 4.2734236988721295e-06, + "loss": 3.7065, + "step": 67455 + }, + { + "epoch": 4.5835031933686645, + "grad_norm": 0.32255443930625916, + "learning_rate": 4.272999048783802e-06, + "loss": 4.2326, + "step": 67460 + }, + { + "epoch": 4.583842913439326, + "grad_norm": 0.5006658434867859, + "learning_rate": 4.272574398695475e-06, + "loss": 3.9092, + "step": 67465 + }, + { + "epoch": 4.584182633509988, + "grad_norm": 0.2630363404750824, + "learning_rate": 4.272149748607148e-06, + "loss": 3.7667, + "step": 67470 + }, + { + "epoch": 4.58452235358065, + "grad_norm": 0.44860953092575073, + "learning_rate": 4.271725098518821e-06, + "loss": 4.1057, + "step": 67475 + }, + { + "epoch": 4.584862073651311, + "grad_norm": 0.32381540536880493, + "learning_rate": 4.2713004484304935e-06, + "loss": 3.9577, + "step": 67480 + }, + { + "epoch": 4.585201793721973, + "grad_norm": 0.1901359260082245, + "learning_rate": 4.270875798342166e-06, + "loss": 3.7786, + "step": 67485 + }, + { + "epoch": 4.585541513792635, + "grad_norm": 0.47017571330070496, + "learning_rate": 4.270451148253839e-06, + "loss": 4.0709, + "step": 67490 + }, + { + "epoch": 4.585881233863296, + "grad_norm": 0.29682308435440063, + "learning_rate": 4.270026498165512e-06, + "loss": 3.818, + "step": 67495 + }, + { + "epoch": 4.5862209539339585, + "grad_norm": 0.26732149720191956, + "learning_rate": 4.269601848077185e-06, + "loss": 4.1954, + "step": 67500 + }, + { + "epoch": 4.5865606740046205, + "grad_norm": 0.24373750388622284, + "learning_rate": 4.2691771979888575e-06, + "loss": 4.1311, + "step": 67505 + }, + { + "epoch": 4.586900394075282, + "grad_norm": 0.27825555205345154, + "learning_rate": 4.26875254790053e-06, + "loss": 3.9198, + "step": 67510 + }, + { + "epoch": 4.587240114145944, + "grad_norm": 0.2288166731595993, + "learning_rate": 4.268327897812203e-06, + "loss": 3.9525, + "step": 67515 + }, + { + "epoch": 4.587579834216606, + "grad_norm": 0.3007759749889374, + "learning_rate": 4.267903247723876e-06, + "loss": 4.0212, + "step": 67520 + }, + { + "epoch": 4.587919554287267, + "grad_norm": 0.2629387080669403, + "learning_rate": 4.267478597635549e-06, + "loss": 4.0968, + "step": 67525 + }, + { + "epoch": 4.588259274357929, + "grad_norm": 0.3037768602371216, + "learning_rate": 4.2670539475472215e-06, + "loss": 4.0398, + "step": 67530 + }, + { + "epoch": 4.588598994428591, + "grad_norm": 0.23621849715709686, + "learning_rate": 4.266629297458894e-06, + "loss": 3.9091, + "step": 67535 + }, + { + "epoch": 4.588938714499252, + "grad_norm": 0.41871532797813416, + "learning_rate": 4.266204647370567e-06, + "loss": 4.0683, + "step": 67540 + }, + { + "epoch": 4.5892784345699145, + "grad_norm": 0.3282800614833832, + "learning_rate": 4.26577999728224e-06, + "loss": 4.1101, + "step": 67545 + }, + { + "epoch": 4.5896181546405765, + "grad_norm": 0.3949519991874695, + "learning_rate": 4.265355347193913e-06, + "loss": 3.9139, + "step": 67550 + }, + { + "epoch": 4.589957874711238, + "grad_norm": 0.4657493531703949, + "learning_rate": 4.2649306971055855e-06, + "loss": 3.8703, + "step": 67555 + }, + { + "epoch": 4.5902975947819, + "grad_norm": 0.23042535781860352, + "learning_rate": 4.264506047017258e-06, + "loss": 4.1936, + "step": 67560 + }, + { + "epoch": 4.590637314852561, + "grad_norm": 0.3346948027610779, + "learning_rate": 4.264081396928931e-06, + "loss": 3.7898, + "step": 67565 + }, + { + "epoch": 4.590977034923223, + "grad_norm": 0.25199857354164124, + "learning_rate": 4.263656746840604e-06, + "loss": 3.8441, + "step": 67570 + }, + { + "epoch": 4.591316754993885, + "grad_norm": 0.2235269993543625, + "learning_rate": 4.263232096752277e-06, + "loss": 3.8679, + "step": 67575 + }, + { + "epoch": 4.591656475064546, + "grad_norm": 0.29251736402511597, + "learning_rate": 4.262807446663949e-06, + "loss": 3.9202, + "step": 67580 + }, + { + "epoch": 4.591996195135208, + "grad_norm": 0.23119717836380005, + "learning_rate": 4.262382796575622e-06, + "loss": 4.0303, + "step": 67585 + }, + { + "epoch": 4.5923359152058705, + "grad_norm": 0.25165942311286926, + "learning_rate": 4.261958146487295e-06, + "loss": 3.8989, + "step": 67590 + }, + { + "epoch": 4.592675635276532, + "grad_norm": 0.27570462226867676, + "learning_rate": 4.261533496398967e-06, + "loss": 3.6529, + "step": 67595 + }, + { + "epoch": 4.593015355347194, + "grad_norm": 0.3126969039440155, + "learning_rate": 4.261108846310641e-06, + "loss": 3.9355, + "step": 67600 + }, + { + "epoch": 4.593355075417856, + "grad_norm": 0.27694374322891235, + "learning_rate": 4.2606841962223135e-06, + "loss": 3.8674, + "step": 67605 + }, + { + "epoch": 4.593694795488517, + "grad_norm": 0.24463343620300293, + "learning_rate": 4.2602595461339855e-06, + "loss": 3.8255, + "step": 67610 + }, + { + "epoch": 4.594034515559179, + "grad_norm": 0.32101091742515564, + "learning_rate": 4.259834896045658e-06, + "loss": 3.8007, + "step": 67615 + }, + { + "epoch": 4.594374235629841, + "grad_norm": 0.3482811450958252, + "learning_rate": 4.259410245957332e-06, + "loss": 3.8482, + "step": 67620 + }, + { + "epoch": 4.594713955700502, + "grad_norm": 0.4727182388305664, + "learning_rate": 4.258985595869004e-06, + "loss": 3.8921, + "step": 67625 + }, + { + "epoch": 4.595053675771164, + "grad_norm": 0.2532497048377991, + "learning_rate": 4.258560945780677e-06, + "loss": 4.0426, + "step": 67630 + }, + { + "epoch": 4.5953933958418265, + "grad_norm": 0.27655696868896484, + "learning_rate": 4.25813629569235e-06, + "loss": 3.8085, + "step": 67635 + }, + { + "epoch": 4.595733115912488, + "grad_norm": 0.27897366881370544, + "learning_rate": 4.257711645604022e-06, + "loss": 3.7098, + "step": 67640 + }, + { + "epoch": 4.59607283598315, + "grad_norm": 0.27818983793258667, + "learning_rate": 4.257286995515695e-06, + "loss": 4.0629, + "step": 67645 + }, + { + "epoch": 4.596412556053812, + "grad_norm": 0.2924536168575287, + "learning_rate": 4.256862345427369e-06, + "loss": 3.9738, + "step": 67650 + }, + { + "epoch": 4.596752276124473, + "grad_norm": 0.26676806807518005, + "learning_rate": 4.256437695339041e-06, + "loss": 4.0516, + "step": 67655 + }, + { + "epoch": 4.597091996195135, + "grad_norm": 0.32237592339515686, + "learning_rate": 4.2560130452507135e-06, + "loss": 3.7912, + "step": 67660 + }, + { + "epoch": 4.597431716265797, + "grad_norm": 0.26823073625564575, + "learning_rate": 4.255588395162386e-06, + "loss": 3.9998, + "step": 67665 + }, + { + "epoch": 4.597771436336458, + "grad_norm": 0.19500647485256195, + "learning_rate": 4.255163745074059e-06, + "loss": 3.9914, + "step": 67670 + }, + { + "epoch": 4.59811115640712, + "grad_norm": 0.31635451316833496, + "learning_rate": 4.254739094985732e-06, + "loss": 3.8528, + "step": 67675 + }, + { + "epoch": 4.5984508764777825, + "grad_norm": 0.20897001028060913, + "learning_rate": 4.254314444897405e-06, + "loss": 3.8077, + "step": 67680 + }, + { + "epoch": 4.598790596548444, + "grad_norm": 0.3189132809638977, + "learning_rate": 4.253889794809078e-06, + "loss": 4.0019, + "step": 67685 + }, + { + "epoch": 4.599130316619106, + "grad_norm": 0.2859196960926056, + "learning_rate": 4.25346514472075e-06, + "loss": 3.9265, + "step": 67690 + }, + { + "epoch": 4.599470036689768, + "grad_norm": 0.2726289629936218, + "learning_rate": 4.253040494632423e-06, + "loss": 4.1583, + "step": 67695 + }, + { + "epoch": 4.599809756760429, + "grad_norm": 0.28213638067245483, + "learning_rate": 4.252615844544096e-06, + "loss": 3.9651, + "step": 67700 + }, + { + "epoch": 4.600149476831091, + "grad_norm": 0.3255764842033386, + "learning_rate": 4.252191194455769e-06, + "loss": 4.2591, + "step": 67705 + }, + { + "epoch": 4.600489196901753, + "grad_norm": 0.29019659757614136, + "learning_rate": 4.2517665443674415e-06, + "loss": 3.5985, + "step": 67710 + }, + { + "epoch": 4.600828916972414, + "grad_norm": 0.3346020579338074, + "learning_rate": 4.251341894279114e-06, + "loss": 3.9733, + "step": 67715 + }, + { + "epoch": 4.601168637043076, + "grad_norm": 0.31120675802230835, + "learning_rate": 4.250917244190787e-06, + "loss": 3.9035, + "step": 67720 + }, + { + "epoch": 4.6015083571137385, + "grad_norm": 0.2863832116127014, + "learning_rate": 4.25049259410246e-06, + "loss": 3.9642, + "step": 67725 + }, + { + "epoch": 4.6018480771844, + "grad_norm": 0.29609236121177673, + "learning_rate": 4.250067944014133e-06, + "loss": 3.7909, + "step": 67730 + }, + { + "epoch": 4.602187797255062, + "grad_norm": 0.2594417631626129, + "learning_rate": 4.2496432939258055e-06, + "loss": 3.9043, + "step": 67735 + }, + { + "epoch": 4.602527517325724, + "grad_norm": 0.263486385345459, + "learning_rate": 4.249218643837478e-06, + "loss": 4.1054, + "step": 67740 + }, + { + "epoch": 4.602867237396385, + "grad_norm": 0.4104650020599365, + "learning_rate": 4.248793993749151e-06, + "loss": 3.9455, + "step": 67745 + }, + { + "epoch": 4.603206957467047, + "grad_norm": 0.28747794032096863, + "learning_rate": 4.248369343660824e-06, + "loss": 3.9406, + "step": 67750 + }, + { + "epoch": 4.603546677537709, + "grad_norm": 0.33446335792541504, + "learning_rate": 4.247944693572497e-06, + "loss": 3.8631, + "step": 67755 + }, + { + "epoch": 4.60388639760837, + "grad_norm": 0.2533634305000305, + "learning_rate": 4.2475200434841695e-06, + "loss": 3.9264, + "step": 67760 + }, + { + "epoch": 4.6042261176790324, + "grad_norm": 0.3065551221370697, + "learning_rate": 4.247095393395842e-06, + "loss": 3.9308, + "step": 67765 + }, + { + "epoch": 4.6045658377496945, + "grad_norm": 0.3115524351596832, + "learning_rate": 4.246670743307515e-06, + "loss": 4.0441, + "step": 67770 + }, + { + "epoch": 4.604905557820356, + "grad_norm": 0.28837183117866516, + "learning_rate": 4.246246093219188e-06, + "loss": 4.0827, + "step": 67775 + }, + { + "epoch": 4.605245277891018, + "grad_norm": 0.25893595814704895, + "learning_rate": 4.245821443130861e-06, + "loss": 4.0303, + "step": 67780 + }, + { + "epoch": 4.60558499796168, + "grad_norm": 0.27934515476226807, + "learning_rate": 4.2453967930425335e-06, + "loss": 3.9916, + "step": 67785 + }, + { + "epoch": 4.605924718032341, + "grad_norm": 0.23131445050239563, + "learning_rate": 4.244972142954206e-06, + "loss": 4.1404, + "step": 67790 + }, + { + "epoch": 4.606264438103003, + "grad_norm": 0.21917612850666046, + "learning_rate": 4.244547492865878e-06, + "loss": 3.9109, + "step": 67795 + }, + { + "epoch": 4.606604158173665, + "grad_norm": 0.3046896457672119, + "learning_rate": 4.244122842777552e-06, + "loss": 3.9631, + "step": 67800 + }, + { + "epoch": 4.606943878244326, + "grad_norm": 0.2506062090396881, + "learning_rate": 4.243698192689225e-06, + "loss": 4.0269, + "step": 67805 + }, + { + "epoch": 4.6072835983149885, + "grad_norm": 0.2559102177619934, + "learning_rate": 4.243273542600897e-06, + "loss": 3.8871, + "step": 67810 + }, + { + "epoch": 4.6076233183856505, + "grad_norm": 0.2665832042694092, + "learning_rate": 4.24284889251257e-06, + "loss": 3.9892, + "step": 67815 + }, + { + "epoch": 4.607963038456312, + "grad_norm": 0.2176802009344101, + "learning_rate": 4.242424242424243e-06, + "loss": 3.7686, + "step": 67820 + }, + { + "epoch": 4.608302758526974, + "grad_norm": 0.2682713568210602, + "learning_rate": 4.241999592335915e-06, + "loss": 4.0812, + "step": 67825 + }, + { + "epoch": 4.608642478597636, + "grad_norm": 0.25159400701522827, + "learning_rate": 4.241574942247588e-06, + "loss": 4.1011, + "step": 67830 + }, + { + "epoch": 4.608982198668297, + "grad_norm": 0.24604137241840363, + "learning_rate": 4.2411502921592615e-06, + "loss": 3.8569, + "step": 67835 + }, + { + "epoch": 4.609321918738959, + "grad_norm": 0.25874027609825134, + "learning_rate": 4.2407256420709335e-06, + "loss": 3.5983, + "step": 67840 + }, + { + "epoch": 4.609661638809621, + "grad_norm": 0.31212031841278076, + "learning_rate": 4.240300991982606e-06, + "loss": 4.1888, + "step": 67845 + }, + { + "epoch": 4.610001358880282, + "grad_norm": 0.27478137612342834, + "learning_rate": 4.23987634189428e-06, + "loss": 3.9149, + "step": 67850 + }, + { + "epoch": 4.6103410789509445, + "grad_norm": 0.32070863246917725, + "learning_rate": 4.239451691805953e-06, + "loss": 4.0691, + "step": 67855 + }, + { + "epoch": 4.6106807990216065, + "grad_norm": 0.5613885521888733, + "learning_rate": 4.239027041717625e-06, + "loss": 3.8556, + "step": 67860 + }, + { + "epoch": 4.611020519092268, + "grad_norm": 0.23618662357330322, + "learning_rate": 4.2386023916292975e-06, + "loss": 3.9416, + "step": 67865 + }, + { + "epoch": 4.61136023916293, + "grad_norm": 0.2910110056400299, + "learning_rate": 4.238177741540971e-06, + "loss": 3.7176, + "step": 67870 + }, + { + "epoch": 4.611699959233592, + "grad_norm": 0.3404518961906433, + "learning_rate": 4.237753091452643e-06, + "loss": 3.9741, + "step": 67875 + }, + { + "epoch": 4.612039679304253, + "grad_norm": 0.22318121790885925, + "learning_rate": 4.237328441364316e-06, + "loss": 4.0381, + "step": 67880 + }, + { + "epoch": 4.612379399374915, + "grad_norm": 0.1921626329421997, + "learning_rate": 4.2369037912759895e-06, + "loss": 4.0063, + "step": 67885 + }, + { + "epoch": 4.612719119445577, + "grad_norm": 0.2272704392671585, + "learning_rate": 4.2364791411876615e-06, + "loss": 4.1163, + "step": 67890 + }, + { + "epoch": 4.613058839516238, + "grad_norm": 0.3838313817977905, + "learning_rate": 4.2361394211169995e-06, + "loss": 3.9024, + "step": 67895 + }, + { + "epoch": 4.6133985595869005, + "grad_norm": 0.2766847312450409, + "learning_rate": 4.235714771028673e-06, + "loss": 3.954, + "step": 67900 + }, + { + "epoch": 4.613738279657563, + "grad_norm": 0.19477619230747223, + "learning_rate": 4.235290120940346e-06, + "loss": 3.9263, + "step": 67905 + }, + { + "epoch": 4.614077999728224, + "grad_norm": 0.2658841907978058, + "learning_rate": 4.234865470852018e-06, + "loss": 3.8751, + "step": 67910 + }, + { + "epoch": 4.614417719798886, + "grad_norm": 0.3295416235923767, + "learning_rate": 4.234440820763691e-06, + "loss": 3.9135, + "step": 67915 + }, + { + "epoch": 4.614757439869548, + "grad_norm": 0.24866580963134766, + "learning_rate": 4.234016170675364e-06, + "loss": 3.8909, + "step": 67920 + }, + { + "epoch": 4.615097159940209, + "grad_norm": 0.31003791093826294, + "learning_rate": 4.233591520587036e-06, + "loss": 4.1216, + "step": 67925 + }, + { + "epoch": 4.615436880010871, + "grad_norm": 0.2665789723396301, + "learning_rate": 4.233166870498709e-06, + "loss": 3.6559, + "step": 67930 + }, + { + "epoch": 4.615776600081533, + "grad_norm": 0.2964927852153778, + "learning_rate": 4.232742220410383e-06, + "loss": 3.9592, + "step": 67935 + }, + { + "epoch": 4.616116320152194, + "grad_norm": 0.23102015256881714, + "learning_rate": 4.232317570322055e-06, + "loss": 3.8681, + "step": 67940 + }, + { + "epoch": 4.6164560402228565, + "grad_norm": 0.25179141759872437, + "learning_rate": 4.2318929202337275e-06, + "loss": 4.0078, + "step": 67945 + }, + { + "epoch": 4.616795760293519, + "grad_norm": 0.20943890511989594, + "learning_rate": 4.2314682701454e-06, + "loss": 3.9248, + "step": 67950 + }, + { + "epoch": 4.61713548036418, + "grad_norm": 0.23993897438049316, + "learning_rate": 4.231043620057073e-06, + "loss": 4.1408, + "step": 67955 + }, + { + "epoch": 4.617475200434842, + "grad_norm": 0.3297443389892578, + "learning_rate": 4.230618969968746e-06, + "loss": 3.9161, + "step": 67960 + }, + { + "epoch": 4.617814920505504, + "grad_norm": 0.21671411395072937, + "learning_rate": 4.230194319880419e-06, + "loss": 4.1261, + "step": 67965 + }, + { + "epoch": 4.618154640576165, + "grad_norm": 0.3801707625389099, + "learning_rate": 4.2297696697920916e-06, + "loss": 3.8214, + "step": 67970 + }, + { + "epoch": 4.618494360646827, + "grad_norm": 0.3660886287689209, + "learning_rate": 4.229345019703764e-06, + "loss": 3.9469, + "step": 67975 + }, + { + "epoch": 4.618834080717489, + "grad_norm": 0.25845712423324585, + "learning_rate": 4.228920369615437e-06, + "loss": 3.8295, + "step": 67980 + }, + { + "epoch": 4.61917380078815, + "grad_norm": 0.2386159747838974, + "learning_rate": 4.22849571952711e-06, + "loss": 3.8377, + "step": 67985 + }, + { + "epoch": 4.6195135208588125, + "grad_norm": 0.289316326379776, + "learning_rate": 4.228071069438783e-06, + "loss": 4.0243, + "step": 67990 + }, + { + "epoch": 4.619853240929475, + "grad_norm": 0.38673681020736694, + "learning_rate": 4.2276464193504556e-06, + "loss": 4.1195, + "step": 67995 + }, + { + "epoch": 4.620192961000136, + "grad_norm": 0.28897708654403687, + "learning_rate": 4.227221769262128e-06, + "loss": 3.8933, + "step": 68000 + }, + { + "epoch": 4.620532681070798, + "grad_norm": 0.3846321702003479, + "learning_rate": 4.226797119173801e-06, + "loss": 3.8684, + "step": 68005 + }, + { + "epoch": 4.62087240114146, + "grad_norm": 0.25544339418411255, + "learning_rate": 4.226372469085474e-06, + "loss": 3.9842, + "step": 68010 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.24828781187534332, + "learning_rate": 4.225947818997147e-06, + "loss": 3.7566, + "step": 68015 + }, + { + "epoch": 4.621551841282783, + "grad_norm": 0.2607695758342743, + "learning_rate": 4.2255231689088196e-06, + "loss": 3.9981, + "step": 68020 + }, + { + "epoch": 4.621891561353445, + "grad_norm": 0.31388112902641296, + "learning_rate": 4.225098518820492e-06, + "loss": 3.6755, + "step": 68025 + }, + { + "epoch": 4.622231281424106, + "grad_norm": 0.2838929295539856, + "learning_rate": 4.224673868732165e-06, + "loss": 3.8808, + "step": 68030 + }, + { + "epoch": 4.6225710014947685, + "grad_norm": 0.3072142004966736, + "learning_rate": 4.224249218643838e-06, + "loss": 4.065, + "step": 68035 + }, + { + "epoch": 4.62291072156543, + "grad_norm": 0.35113099217414856, + "learning_rate": 4.223824568555511e-06, + "loss": 3.8435, + "step": 68040 + }, + { + "epoch": 4.623250441636092, + "grad_norm": 0.3238869905471802, + "learning_rate": 4.223399918467183e-06, + "loss": 3.6234, + "step": 68045 + }, + { + "epoch": 4.623590161706754, + "grad_norm": 0.2541976571083069, + "learning_rate": 4.222975268378856e-06, + "loss": 3.9178, + "step": 68050 + }, + { + "epoch": 4.623929881777415, + "grad_norm": 0.24610535800457, + "learning_rate": 4.222550618290529e-06, + "loss": 3.8166, + "step": 68055 + }, + { + "epoch": 4.624269601848077, + "grad_norm": 0.2616893947124481, + "learning_rate": 4.222125968202202e-06, + "loss": 3.9276, + "step": 68060 + }, + { + "epoch": 4.624609321918739, + "grad_norm": 0.2186931073665619, + "learning_rate": 4.221701318113875e-06, + "loss": 4.1429, + "step": 68065 + }, + { + "epoch": 4.6249490419894, + "grad_norm": 0.4217185378074646, + "learning_rate": 4.2212766680255476e-06, + "loss": 3.8236, + "step": 68070 + }, + { + "epoch": 4.6252887620600625, + "grad_norm": 0.21519289910793304, + "learning_rate": 4.22085201793722e-06, + "loss": 3.7545, + "step": 68075 + }, + { + "epoch": 4.6256284821307245, + "grad_norm": 0.4808065593242645, + "learning_rate": 4.220427367848892e-06, + "loss": 4.055, + "step": 68080 + }, + { + "epoch": 4.625968202201386, + "grad_norm": 0.22588013112545013, + "learning_rate": 4.220002717760566e-06, + "loss": 4.0851, + "step": 68085 + }, + { + "epoch": 4.626307922272048, + "grad_norm": 0.2858732044696808, + "learning_rate": 4.219578067672239e-06, + "loss": 3.9701, + "step": 68090 + }, + { + "epoch": 4.62664764234271, + "grad_norm": 0.24613875150680542, + "learning_rate": 4.219153417583911e-06, + "loss": 3.9351, + "step": 68095 + }, + { + "epoch": 4.626987362413371, + "grad_norm": 0.28840482234954834, + "learning_rate": 4.218728767495584e-06, + "loss": 4.1067, + "step": 68100 + }, + { + "epoch": 4.627327082484033, + "grad_norm": 0.32664212584495544, + "learning_rate": 4.218304117407257e-06, + "loss": 3.9566, + "step": 68105 + }, + { + "epoch": 4.627666802554695, + "grad_norm": 0.22408828139305115, + "learning_rate": 4.217879467318929e-06, + "loss": 3.822, + "step": 68110 + }, + { + "epoch": 4.628006522625356, + "grad_norm": 0.3882662355899811, + "learning_rate": 4.217454817230602e-06, + "loss": 4.0279, + "step": 68115 + }, + { + "epoch": 4.6283462426960185, + "grad_norm": 0.2563667297363281, + "learning_rate": 4.2170301671422756e-06, + "loss": 3.9604, + "step": 68120 + }, + { + "epoch": 4.6286859627666805, + "grad_norm": 0.29855018854141235, + "learning_rate": 4.2166055170539475e-06, + "loss": 3.962, + "step": 68125 + }, + { + "epoch": 4.629025682837342, + "grad_norm": 0.23634998500347137, + "learning_rate": 4.21618086696562e-06, + "loss": 3.943, + "step": 68130 + }, + { + "epoch": 4.629365402908004, + "grad_norm": 0.24627536535263062, + "learning_rate": 4.215756216877294e-06, + "loss": 3.7489, + "step": 68135 + }, + { + "epoch": 4.629705122978666, + "grad_norm": 0.23143507540225983, + "learning_rate": 4.215331566788966e-06, + "loss": 3.9409, + "step": 68140 + }, + { + "epoch": 4.630044843049327, + "grad_norm": 0.304536372423172, + "learning_rate": 4.214906916700639e-06, + "loss": 3.9874, + "step": 68145 + }, + { + "epoch": 4.630384563119989, + "grad_norm": 0.3204542100429535, + "learning_rate": 4.214482266612312e-06, + "loss": 3.8896, + "step": 68150 + }, + { + "epoch": 4.630724283190651, + "grad_norm": 0.2837243974208832, + "learning_rate": 4.214057616523984e-06, + "loss": 4.0004, + "step": 68155 + }, + { + "epoch": 4.631064003261312, + "grad_norm": 0.2691572904586792, + "learning_rate": 4.213632966435657e-06, + "loss": 3.9263, + "step": 68160 + }, + { + "epoch": 4.6314037233319745, + "grad_norm": 0.2881506383419037, + "learning_rate": 4.21320831634733e-06, + "loss": 4.1646, + "step": 68165 + }, + { + "epoch": 4.6317434434026366, + "grad_norm": 0.293109655380249, + "learning_rate": 4.212783666259003e-06, + "loss": 3.8703, + "step": 68170 + }, + { + "epoch": 4.632083163473298, + "grad_norm": 0.25166767835617065, + "learning_rate": 4.2123590161706755e-06, + "loss": 4.0261, + "step": 68175 + }, + { + "epoch": 4.63242288354396, + "grad_norm": 0.21375787258148193, + "learning_rate": 4.211934366082348e-06, + "loss": 3.8856, + "step": 68180 + }, + { + "epoch": 4.632762603614622, + "grad_norm": 0.2781151235103607, + "learning_rate": 4.211509715994021e-06, + "loss": 3.8902, + "step": 68185 + }, + { + "epoch": 4.633102323685283, + "grad_norm": 0.24095192551612854, + "learning_rate": 4.211085065905694e-06, + "loss": 3.8216, + "step": 68190 + }, + { + "epoch": 4.633442043755945, + "grad_norm": 0.25739428400993347, + "learning_rate": 4.210660415817367e-06, + "loss": 4.2147, + "step": 68195 + }, + { + "epoch": 4.633781763826607, + "grad_norm": 0.2384806126356125, + "learning_rate": 4.2102357657290395e-06, + "loss": 3.7573, + "step": 68200 + }, + { + "epoch": 4.634121483897268, + "grad_norm": 0.2491605132818222, + "learning_rate": 4.209811115640712e-06, + "loss": 4.1445, + "step": 68205 + }, + { + "epoch": 4.6344612039679305, + "grad_norm": 0.30120930075645447, + "learning_rate": 4.209386465552385e-06, + "loss": 3.643, + "step": 68210 + }, + { + "epoch": 4.634800924038593, + "grad_norm": 0.22050325572490692, + "learning_rate": 4.208961815464058e-06, + "loss": 4.1083, + "step": 68215 + }, + { + "epoch": 4.635140644109254, + "grad_norm": 0.2833799719810486, + "learning_rate": 4.208537165375731e-06, + "loss": 4.1651, + "step": 68220 + }, + { + "epoch": 4.635480364179916, + "grad_norm": 0.2809854745864868, + "learning_rate": 4.2081125152874035e-06, + "loss": 4.149, + "step": 68225 + }, + { + "epoch": 4.635820084250578, + "grad_norm": 0.2466854304075241, + "learning_rate": 4.207687865199076e-06, + "loss": 3.841, + "step": 68230 + }, + { + "epoch": 4.636159804321239, + "grad_norm": 0.29536333680152893, + "learning_rate": 4.207263215110749e-06, + "loss": 3.9358, + "step": 68235 + }, + { + "epoch": 4.636499524391901, + "grad_norm": 0.23700407147407532, + "learning_rate": 4.206838565022422e-06, + "loss": 3.9941, + "step": 68240 + }, + { + "epoch": 4.636839244462563, + "grad_norm": 0.3006623089313507, + "learning_rate": 4.206413914934095e-06, + "loss": 3.8949, + "step": 68245 + }, + { + "epoch": 4.637178964533224, + "grad_norm": 0.39300966262817383, + "learning_rate": 4.2059892648457675e-06, + "loss": 3.9306, + "step": 68250 + }, + { + "epoch": 4.6375186846038865, + "grad_norm": 0.21618925034999847, + "learning_rate": 4.20556461475744e-06, + "loss": 3.7139, + "step": 68255 + }, + { + "epoch": 4.637858404674548, + "grad_norm": 0.24827785789966583, + "learning_rate": 4.205139964669113e-06, + "loss": 4.0615, + "step": 68260 + }, + { + "epoch": 4.63819812474521, + "grad_norm": 0.2638033926486969, + "learning_rate": 4.204715314580786e-06, + "loss": 3.8488, + "step": 68265 + }, + { + "epoch": 4.638537844815872, + "grad_norm": 0.2449670433998108, + "learning_rate": 4.204290664492459e-06, + "loss": 3.721, + "step": 68270 + }, + { + "epoch": 4.638877564886533, + "grad_norm": 0.21594533324241638, + "learning_rate": 4.2038660144041316e-06, + "loss": 4.022, + "step": 68275 + }, + { + "epoch": 4.639217284957195, + "grad_norm": 0.293523371219635, + "learning_rate": 4.203441364315804e-06, + "loss": 4.0042, + "step": 68280 + }, + { + "epoch": 4.639557005027857, + "grad_norm": 0.37898731231689453, + "learning_rate": 4.203016714227477e-06, + "loss": 3.7886, + "step": 68285 + }, + { + "epoch": 4.639896725098518, + "grad_norm": 0.25113263726234436, + "learning_rate": 4.20259206413915e-06, + "loss": 3.7106, + "step": 68290 + }, + { + "epoch": 4.64023644516918, + "grad_norm": 0.2953605055809021, + "learning_rate": 4.202167414050822e-06, + "loss": 4.1183, + "step": 68295 + }, + { + "epoch": 4.6405761652398425, + "grad_norm": 0.2608306407928467, + "learning_rate": 4.2017427639624956e-06, + "loss": 3.6942, + "step": 68300 + }, + { + "epoch": 4.640915885310504, + "grad_norm": 0.28780871629714966, + "learning_rate": 4.201318113874168e-06, + "loss": 3.6902, + "step": 68305 + }, + { + "epoch": 4.641255605381166, + "grad_norm": 0.24399258196353912, + "learning_rate": 4.20089346378584e-06, + "loss": 3.7738, + "step": 68310 + }, + { + "epoch": 4.641595325451828, + "grad_norm": 0.29186034202575684, + "learning_rate": 4.200468813697514e-06, + "loss": 4.0269, + "step": 68315 + }, + { + "epoch": 4.641935045522489, + "grad_norm": 0.2364969402551651, + "learning_rate": 4.200044163609187e-06, + "loss": 3.859, + "step": 68320 + }, + { + "epoch": 4.642274765593151, + "grad_norm": 0.22710303962230682, + "learning_rate": 4.199619513520859e-06, + "loss": 3.992, + "step": 68325 + }, + { + "epoch": 4.642614485663813, + "grad_norm": 0.5421158075332642, + "learning_rate": 4.1991948634325315e-06, + "loss": 4.1057, + "step": 68330 + }, + { + "epoch": 4.642954205734474, + "grad_norm": 0.20378005504608154, + "learning_rate": 4.198770213344205e-06, + "loss": 3.9065, + "step": 68335 + }, + { + "epoch": 4.6432939258051364, + "grad_norm": 0.32468992471694946, + "learning_rate": 4.198345563255877e-06, + "loss": 4.0433, + "step": 68340 + }, + { + "epoch": 4.6436336458757985, + "grad_norm": 0.269675076007843, + "learning_rate": 4.19792091316755e-06, + "loss": 3.8341, + "step": 68345 + }, + { + "epoch": 4.64397336594646, + "grad_norm": 0.37333500385284424, + "learning_rate": 4.1974962630792236e-06, + "loss": 3.9188, + "step": 68350 + }, + { + "epoch": 4.644313086017122, + "grad_norm": 0.28331223130226135, + "learning_rate": 4.1970716129908955e-06, + "loss": 3.8837, + "step": 68355 + }, + { + "epoch": 4.644652806087784, + "grad_norm": 0.2238120138645172, + "learning_rate": 4.196646962902568e-06, + "loss": 3.945, + "step": 68360 + }, + { + "epoch": 4.644992526158445, + "grad_norm": 0.3654334545135498, + "learning_rate": 4.196222312814241e-06, + "loss": 3.9001, + "step": 68365 + }, + { + "epoch": 4.645332246229107, + "grad_norm": 0.28778278827667236, + "learning_rate": 4.195797662725914e-06, + "loss": 4.1093, + "step": 68370 + }, + { + "epoch": 4.645671966299769, + "grad_norm": 0.33347049355506897, + "learning_rate": 4.195373012637587e-06, + "loss": 4.2325, + "step": 68375 + }, + { + "epoch": 4.64601168637043, + "grad_norm": 0.2866809368133545, + "learning_rate": 4.1949483625492595e-06, + "loss": 3.8398, + "step": 68380 + }, + { + "epoch": 4.6463514064410925, + "grad_norm": 0.21655680239200592, + "learning_rate": 4.194523712460932e-06, + "loss": 3.821, + "step": 68385 + }, + { + "epoch": 4.6466911265117545, + "grad_norm": 0.3744578957557678, + "learning_rate": 4.194099062372605e-06, + "loss": 3.8017, + "step": 68390 + }, + { + "epoch": 4.647030846582416, + "grad_norm": 0.301311731338501, + "learning_rate": 4.193674412284278e-06, + "loss": 3.9862, + "step": 68395 + }, + { + "epoch": 4.647370566653078, + "grad_norm": 0.29260170459747314, + "learning_rate": 4.193249762195951e-06, + "loss": 4.1441, + "step": 68400 + }, + { + "epoch": 4.64771028672374, + "grad_norm": 0.30030888319015503, + "learning_rate": 4.1928251121076235e-06, + "loss": 3.9005, + "step": 68405 + }, + { + "epoch": 4.648050006794401, + "grad_norm": 0.33152148127555847, + "learning_rate": 4.192400462019296e-06, + "loss": 3.6794, + "step": 68410 + }, + { + "epoch": 4.648389726865063, + "grad_norm": 0.37849918007850647, + "learning_rate": 4.191975811930969e-06, + "loss": 3.8369, + "step": 68415 + }, + { + "epoch": 4.648729446935725, + "grad_norm": 0.24547599256038666, + "learning_rate": 4.191551161842642e-06, + "loss": 3.7357, + "step": 68420 + }, + { + "epoch": 4.649069167006386, + "grad_norm": 0.25965654850006104, + "learning_rate": 4.191126511754315e-06, + "loss": 3.6817, + "step": 68425 + }, + { + "epoch": 4.6494088870770485, + "grad_norm": 0.2856700122356415, + "learning_rate": 4.1907018616659875e-06, + "loss": 3.9248, + "step": 68430 + }, + { + "epoch": 4.6497486071477105, + "grad_norm": 0.26208123564720154, + "learning_rate": 4.19027721157766e-06, + "loss": 3.8882, + "step": 68435 + }, + { + "epoch": 4.650088327218372, + "grad_norm": 0.25907403230667114, + "learning_rate": 4.189852561489333e-06, + "loss": 4.0268, + "step": 68440 + }, + { + "epoch": 4.650428047289034, + "grad_norm": 0.23532675206661224, + "learning_rate": 4.189427911401006e-06, + "loss": 3.9676, + "step": 68445 + }, + { + "epoch": 4.650767767359696, + "grad_norm": 0.4767886996269226, + "learning_rate": 4.189003261312679e-06, + "loss": 3.9659, + "step": 68450 + }, + { + "epoch": 4.651107487430357, + "grad_norm": 0.28128841519355774, + "learning_rate": 4.1885786112243515e-06, + "loss": 3.914, + "step": 68455 + }, + { + "epoch": 4.651447207501019, + "grad_norm": 0.21597430109977722, + "learning_rate": 4.188153961136024e-06, + "loss": 4.1519, + "step": 68460 + }, + { + "epoch": 4.651786927571681, + "grad_norm": 0.2208782583475113, + "learning_rate": 4.187729311047697e-06, + "loss": 3.8572, + "step": 68465 + }, + { + "epoch": 4.652126647642342, + "grad_norm": 0.35459208488464355, + "learning_rate": 4.18730466095937e-06, + "loss": 4.1819, + "step": 68470 + }, + { + "epoch": 4.6524663677130045, + "grad_norm": 0.3841288089752197, + "learning_rate": 4.186880010871043e-06, + "loss": 4.0152, + "step": 68475 + }, + { + "epoch": 4.6528060877836666, + "grad_norm": 0.2302207052707672, + "learning_rate": 4.1864553607827155e-06, + "loss": 3.6606, + "step": 68480 + }, + { + "epoch": 4.653145807854328, + "grad_norm": 0.3793589174747467, + "learning_rate": 4.186030710694388e-06, + "loss": 3.9526, + "step": 68485 + }, + { + "epoch": 4.65348552792499, + "grad_norm": 0.2650853097438812, + "learning_rate": 4.185606060606061e-06, + "loss": 4.0461, + "step": 68490 + }, + { + "epoch": 4.653825247995652, + "grad_norm": 0.24167215824127197, + "learning_rate": 4.185181410517734e-06, + "loss": 3.8616, + "step": 68495 + }, + { + "epoch": 4.654164968066313, + "grad_norm": 0.23142869770526886, + "learning_rate": 4.184756760429407e-06, + "loss": 3.9447, + "step": 68500 + }, + { + "epoch": 4.654504688136975, + "grad_norm": 0.2445816844701767, + "learning_rate": 4.1843321103410795e-06, + "loss": 3.9361, + "step": 68505 + }, + { + "epoch": 4.654844408207637, + "grad_norm": 0.28046905994415283, + "learning_rate": 4.1839074602527515e-06, + "loss": 3.9445, + "step": 68510 + }, + { + "epoch": 4.655184128278298, + "grad_norm": 0.23529526591300964, + "learning_rate": 4.183482810164425e-06, + "loss": 3.8045, + "step": 68515 + }, + { + "epoch": 4.6555238483489605, + "grad_norm": 0.259869247674942, + "learning_rate": 4.183058160076098e-06, + "loss": 3.9147, + "step": 68520 + }, + { + "epoch": 4.655863568419623, + "grad_norm": 0.23890432715415955, + "learning_rate": 4.18263350998777e-06, + "loss": 3.8313, + "step": 68525 + }, + { + "epoch": 4.656203288490284, + "grad_norm": 0.3247189521789551, + "learning_rate": 4.1822088598994435e-06, + "loss": 4.0821, + "step": 68530 + }, + { + "epoch": 4.656543008560946, + "grad_norm": 0.2779451012611389, + "learning_rate": 4.181784209811116e-06, + "loss": 4.0189, + "step": 68535 + }, + { + "epoch": 4.656882728631608, + "grad_norm": 0.3190973103046417, + "learning_rate": 4.181359559722788e-06, + "loss": 3.9571, + "step": 68540 + }, + { + "epoch": 4.657222448702269, + "grad_norm": 0.25086331367492676, + "learning_rate": 4.180934909634461e-06, + "loss": 3.9987, + "step": 68545 + }, + { + "epoch": 4.657562168772931, + "grad_norm": 0.2463563233613968, + "learning_rate": 4.180510259546135e-06, + "loss": 4.0776, + "step": 68550 + }, + { + "epoch": 4.657901888843593, + "grad_norm": 0.43478211760520935, + "learning_rate": 4.180085609457807e-06, + "loss": 3.728, + "step": 68555 + }, + { + "epoch": 4.658241608914254, + "grad_norm": 0.24687784910202026, + "learning_rate": 4.1796609593694795e-06, + "loss": 3.8323, + "step": 68560 + }, + { + "epoch": 4.6585813289849165, + "grad_norm": 0.23508620262145996, + "learning_rate": 4.179236309281153e-06, + "loss": 3.9965, + "step": 68565 + }, + { + "epoch": 4.658921049055579, + "grad_norm": 0.39576077461242676, + "learning_rate": 4.178811659192826e-06, + "loss": 4.1272, + "step": 68570 + }, + { + "epoch": 4.65926076912624, + "grad_norm": 0.2772352695465088, + "learning_rate": 4.178387009104498e-06, + "loss": 3.9091, + "step": 68575 + }, + { + "epoch": 4.659600489196902, + "grad_norm": 0.2584102153778076, + "learning_rate": 4.177962359016171e-06, + "loss": 4.0981, + "step": 68580 + }, + { + "epoch": 4.659940209267564, + "grad_norm": 0.38106903433799744, + "learning_rate": 4.177537708927844e-06, + "loss": 4.0445, + "step": 68585 + }, + { + "epoch": 4.660279929338225, + "grad_norm": 0.267955482006073, + "learning_rate": 4.177113058839516e-06, + "loss": 3.9337, + "step": 68590 + }, + { + "epoch": 4.660619649408887, + "grad_norm": 0.2643100619316101, + "learning_rate": 4.176688408751189e-06, + "loss": 4.1017, + "step": 68595 + }, + { + "epoch": 4.660959369479549, + "grad_norm": 0.2251884937286377, + "learning_rate": 4.176263758662863e-06, + "loss": 3.8021, + "step": 68600 + }, + { + "epoch": 4.66129908955021, + "grad_norm": 0.3228814899921417, + "learning_rate": 4.175839108574535e-06, + "loss": 3.9871, + "step": 68605 + }, + { + "epoch": 4.6616388096208725, + "grad_norm": 0.2949763834476471, + "learning_rate": 4.1754144584862075e-06, + "loss": 3.9486, + "step": 68610 + }, + { + "epoch": 4.661978529691535, + "grad_norm": 0.30344849824905396, + "learning_rate": 4.17498980839788e-06, + "loss": 3.9627, + "step": 68615 + }, + { + "epoch": 4.662318249762196, + "grad_norm": 0.32451486587524414, + "learning_rate": 4.174565158309553e-06, + "loss": 3.7778, + "step": 68620 + }, + { + "epoch": 4.662657969832858, + "grad_norm": 0.2462908774614334, + "learning_rate": 4.174140508221226e-06, + "loss": 3.7567, + "step": 68625 + }, + { + "epoch": 4.66299768990352, + "grad_norm": 0.33365416526794434, + "learning_rate": 4.173715858132899e-06, + "loss": 4.0984, + "step": 68630 + }, + { + "epoch": 4.663337409974181, + "grad_norm": 0.33219480514526367, + "learning_rate": 4.1732912080445715e-06, + "loss": 4.1298, + "step": 68635 + }, + { + "epoch": 4.663677130044843, + "grad_norm": 0.32147520780563354, + "learning_rate": 4.172866557956244e-06, + "loss": 3.9844, + "step": 68640 + }, + { + "epoch": 4.664016850115505, + "grad_norm": 0.2802076041698456, + "learning_rate": 4.172441907867917e-06, + "loss": 3.8409, + "step": 68645 + }, + { + "epoch": 4.6643565701861665, + "grad_norm": 0.3532484471797943, + "learning_rate": 4.17201725777959e-06, + "loss": 3.9059, + "step": 68650 + }, + { + "epoch": 4.6646962902568285, + "grad_norm": 0.24292081594467163, + "learning_rate": 4.171592607691263e-06, + "loss": 3.9633, + "step": 68655 + }, + { + "epoch": 4.665036010327491, + "grad_norm": 0.20776012539863586, + "learning_rate": 4.1711679576029355e-06, + "loss": 3.9699, + "step": 68660 + }, + { + "epoch": 4.665375730398152, + "grad_norm": 0.24494704604148865, + "learning_rate": 4.170743307514608e-06, + "loss": 3.8318, + "step": 68665 + }, + { + "epoch": 4.665715450468814, + "grad_norm": 0.26438838243484497, + "learning_rate": 4.170318657426281e-06, + "loss": 4.0958, + "step": 68670 + }, + { + "epoch": 4.666055170539476, + "grad_norm": 0.24772392213344574, + "learning_rate": 4.169894007337954e-06, + "loss": 4.0151, + "step": 68675 + }, + { + "epoch": 4.666394890610137, + "grad_norm": 0.313602089881897, + "learning_rate": 4.169469357249627e-06, + "loss": 3.6741, + "step": 68680 + }, + { + "epoch": 4.666734610680799, + "grad_norm": 0.35266435146331787, + "learning_rate": 4.1690447071612995e-06, + "loss": 3.9567, + "step": 68685 + }, + { + "epoch": 4.667074330751461, + "grad_norm": 0.3753041923046112, + "learning_rate": 4.168620057072972e-06, + "loss": 4.1497, + "step": 68690 + }, + { + "epoch": 4.6674140508221225, + "grad_norm": 0.23368331789970398, + "learning_rate": 4.168195406984645e-06, + "loss": 3.8703, + "step": 68695 + }, + { + "epoch": 4.6677537708927845, + "grad_norm": 0.26927322149276733, + "learning_rate": 4.167770756896318e-06, + "loss": 4.0637, + "step": 68700 + }, + { + "epoch": 4.668093490963447, + "grad_norm": 0.27580884099006653, + "learning_rate": 4.167346106807991e-06, + "loss": 4.0608, + "step": 68705 + }, + { + "epoch": 4.668433211034108, + "grad_norm": 0.2529955506324768, + "learning_rate": 4.166921456719663e-06, + "loss": 3.9375, + "step": 68710 + }, + { + "epoch": 4.66877293110477, + "grad_norm": 0.24514859914779663, + "learning_rate": 4.166496806631336e-06, + "loss": 3.8336, + "step": 68715 + }, + { + "epoch": 4.669112651175431, + "grad_norm": 0.3711336553096771, + "learning_rate": 4.166072156543009e-06, + "loss": 4.0191, + "step": 68720 + }, + { + "epoch": 4.669452371246093, + "grad_norm": 0.2812899947166443, + "learning_rate": 4.165647506454681e-06, + "loss": 4.0266, + "step": 68725 + }, + { + "epoch": 4.669792091316755, + "grad_norm": 0.3168684244155884, + "learning_rate": 4.165222856366355e-06, + "loss": 4.0327, + "step": 68730 + }, + { + "epoch": 4.670131811387416, + "grad_norm": 0.2881072759628296, + "learning_rate": 4.1647982062780275e-06, + "loss": 3.9117, + "step": 68735 + }, + { + "epoch": 4.6704715314580785, + "grad_norm": 0.22141285240650177, + "learning_rate": 4.1643735561897e-06, + "loss": 4.0409, + "step": 68740 + }, + { + "epoch": 4.6708112515287405, + "grad_norm": 0.2783028185367584, + "learning_rate": 4.163948906101372e-06, + "loss": 3.9561, + "step": 68745 + }, + { + "epoch": 4.671150971599402, + "grad_norm": 0.3697625696659088, + "learning_rate": 4.163524256013046e-06, + "loss": 4.0704, + "step": 68750 + }, + { + "epoch": 4.671490691670064, + "grad_norm": 0.34944280982017517, + "learning_rate": 4.163099605924719e-06, + "loss": 3.8977, + "step": 68755 + }, + { + "epoch": 4.671830411740726, + "grad_norm": 0.319146066904068, + "learning_rate": 4.162674955836391e-06, + "loss": 4.0313, + "step": 68760 + }, + { + "epoch": 4.672170131811387, + "grad_norm": 0.23845461010932922, + "learning_rate": 4.162250305748064e-06, + "loss": 4.0797, + "step": 68765 + }, + { + "epoch": 4.672509851882049, + "grad_norm": 0.3053780794143677, + "learning_rate": 4.161825655659737e-06, + "loss": 3.8393, + "step": 68770 + }, + { + "epoch": 4.672849571952711, + "grad_norm": 0.2044774889945984, + "learning_rate": 4.161401005571409e-06, + "loss": 4.0496, + "step": 68775 + }, + { + "epoch": 4.673189292023372, + "grad_norm": 0.4316912293434143, + "learning_rate": 4.160976355483083e-06, + "loss": 4.029, + "step": 68780 + }, + { + "epoch": 4.6735290120940345, + "grad_norm": 0.28190678358078003, + "learning_rate": 4.1605517053947555e-06, + "loss": 3.9657, + "step": 68785 + }, + { + "epoch": 4.673868732164697, + "grad_norm": 0.266234427690506, + "learning_rate": 4.1601270553064275e-06, + "loss": 3.7529, + "step": 68790 + }, + { + "epoch": 4.674208452235358, + "grad_norm": 0.2597237527370453, + "learning_rate": 4.1597024052181e-06, + "loss": 3.8918, + "step": 68795 + }, + { + "epoch": 4.67454817230602, + "grad_norm": 0.2693169116973877, + "learning_rate": 4.159277755129774e-06, + "loss": 3.9911, + "step": 68800 + }, + { + "epoch": 4.674887892376682, + "grad_norm": 0.25815272331237793, + "learning_rate": 4.158853105041446e-06, + "loss": 3.7556, + "step": 68805 + }, + { + "epoch": 4.675227612447343, + "grad_norm": 0.23364238440990448, + "learning_rate": 4.158428454953119e-06, + "loss": 4.123, + "step": 68810 + }, + { + "epoch": 4.675567332518005, + "grad_norm": 0.24950458109378815, + "learning_rate": 4.158003804864792e-06, + "loss": 4.0195, + "step": 68815 + }, + { + "epoch": 4.675907052588667, + "grad_norm": 0.26419398188591003, + "learning_rate": 4.157579154776464e-06, + "loss": 3.9906, + "step": 68820 + }, + { + "epoch": 4.676246772659328, + "grad_norm": 0.2879455089569092, + "learning_rate": 4.157154504688137e-06, + "loss": 3.9106, + "step": 68825 + }, + { + "epoch": 4.6765864927299905, + "grad_norm": 0.24852176010608673, + "learning_rate": 4.15672985459981e-06, + "loss": 3.6857, + "step": 68830 + }, + { + "epoch": 4.676926212800653, + "grad_norm": 0.2743549048900604, + "learning_rate": 4.156305204511483e-06, + "loss": 3.9461, + "step": 68835 + }, + { + "epoch": 4.677265932871314, + "grad_norm": 0.4540281891822815, + "learning_rate": 4.1558805544231555e-06, + "loss": 3.8237, + "step": 68840 + }, + { + "epoch": 4.677605652941976, + "grad_norm": 0.3191125988960266, + "learning_rate": 4.155455904334828e-06, + "loss": 3.9621, + "step": 68845 + }, + { + "epoch": 4.677945373012638, + "grad_norm": 0.24930034577846527, + "learning_rate": 4.155031254246501e-06, + "loss": 3.7688, + "step": 68850 + }, + { + "epoch": 4.678285093083299, + "grad_norm": 0.23472949862480164, + "learning_rate": 4.154606604158174e-06, + "loss": 3.8696, + "step": 68855 + }, + { + "epoch": 4.678624813153961, + "grad_norm": 0.3527679741382599, + "learning_rate": 4.154181954069847e-06, + "loss": 3.8851, + "step": 68860 + }, + { + "epoch": 4.678964533224623, + "grad_norm": 0.30998802185058594, + "learning_rate": 4.1537573039815195e-06, + "loss": 3.9578, + "step": 68865 + }, + { + "epoch": 4.679304253295284, + "grad_norm": 0.2902700901031494, + "learning_rate": 4.153332653893192e-06, + "loss": 3.9576, + "step": 68870 + }, + { + "epoch": 4.6796439733659465, + "grad_norm": 0.5123451352119446, + "learning_rate": 4.152908003804865e-06, + "loss": 3.9191, + "step": 68875 + }, + { + "epoch": 4.679983693436609, + "grad_norm": 0.3721505105495453, + "learning_rate": 4.152483353716538e-06, + "loss": 3.715, + "step": 68880 + }, + { + "epoch": 4.68032341350727, + "grad_norm": 0.3361140787601471, + "learning_rate": 4.152058703628211e-06, + "loss": 3.8267, + "step": 68885 + }, + { + "epoch": 4.680663133577932, + "grad_norm": 0.38175952434539795, + "learning_rate": 4.1516340535398835e-06, + "loss": 3.9319, + "step": 68890 + }, + { + "epoch": 4.681002853648594, + "grad_norm": 0.28953036665916443, + "learning_rate": 4.151209403451556e-06, + "loss": 3.9785, + "step": 68895 + }, + { + "epoch": 4.681342573719255, + "grad_norm": 0.3273965120315552, + "learning_rate": 4.150784753363229e-06, + "loss": 3.8501, + "step": 68900 + }, + { + "epoch": 4.681682293789917, + "grad_norm": 0.2929091453552246, + "learning_rate": 4.150360103274902e-06, + "loss": 3.884, + "step": 68905 + }, + { + "epoch": 4.682022013860579, + "grad_norm": 0.28556960821151733, + "learning_rate": 4.149935453186575e-06, + "loss": 3.8997, + "step": 68910 + }, + { + "epoch": 4.68236173393124, + "grad_norm": 0.5032363533973694, + "learning_rate": 4.1495108030982475e-06, + "loss": 3.88, + "step": 68915 + }, + { + "epoch": 4.6827014540019025, + "grad_norm": 0.31028300523757935, + "learning_rate": 4.14908615300992e-06, + "loss": 4.2084, + "step": 68920 + }, + { + "epoch": 4.683041174072565, + "grad_norm": 0.28465449810028076, + "learning_rate": 4.148661502921593e-06, + "loss": 4.0367, + "step": 68925 + }, + { + "epoch": 4.683380894143226, + "grad_norm": 0.25139886140823364, + "learning_rate": 4.148236852833266e-06, + "loss": 3.8279, + "step": 68930 + }, + { + "epoch": 4.683720614213888, + "grad_norm": 0.25696617364883423, + "learning_rate": 4.147812202744939e-06, + "loss": 3.5593, + "step": 68935 + }, + { + "epoch": 4.684060334284549, + "grad_norm": 0.2370615005493164, + "learning_rate": 4.1473875526566115e-06, + "loss": 3.9937, + "step": 68940 + }, + { + "epoch": 4.684400054355211, + "grad_norm": 0.24542482197284698, + "learning_rate": 4.146962902568284e-06, + "loss": 3.7493, + "step": 68945 + }, + { + "epoch": 4.684739774425873, + "grad_norm": 0.47304612398147583, + "learning_rate": 4.146538252479957e-06, + "loss": 3.8264, + "step": 68950 + }, + { + "epoch": 4.685079494496534, + "grad_norm": 0.29402852058410645, + "learning_rate": 4.14611360239163e-06, + "loss": 4.0809, + "step": 68955 + }, + { + "epoch": 4.6854192145671965, + "grad_norm": 0.2597920298576355, + "learning_rate": 4.145688952303302e-06, + "loss": 3.9317, + "step": 68960 + }, + { + "epoch": 4.6857589346378585, + "grad_norm": 0.26036104559898376, + "learning_rate": 4.1452643022149755e-06, + "loss": 3.8015, + "step": 68965 + }, + { + "epoch": 4.68609865470852, + "grad_norm": 0.2583705484867096, + "learning_rate": 4.144839652126648e-06, + "loss": 3.746, + "step": 68970 + }, + { + "epoch": 4.686438374779182, + "grad_norm": 0.276621550321579, + "learning_rate": 4.14441500203832e-06, + "loss": 3.9544, + "step": 68975 + }, + { + "epoch": 4.686778094849844, + "grad_norm": 0.21310463547706604, + "learning_rate": 4.143990351949994e-06, + "loss": 4.0858, + "step": 68980 + }, + { + "epoch": 4.687117814920505, + "grad_norm": 0.25109654664993286, + "learning_rate": 4.143565701861667e-06, + "loss": 3.613, + "step": 68985 + }, + { + "epoch": 4.687457534991167, + "grad_norm": 0.2787776589393616, + "learning_rate": 4.143141051773339e-06, + "loss": 3.9706, + "step": 68990 + }, + { + "epoch": 4.687797255061829, + "grad_norm": 0.28857749700546265, + "learning_rate": 4.1427164016850115e-06, + "loss": 4.2629, + "step": 68995 + }, + { + "epoch": 4.68813697513249, + "grad_norm": 0.31820738315582275, + "learning_rate": 4.142291751596685e-06, + "loss": 3.7844, + "step": 69000 + }, + { + "epoch": 4.6884766952031525, + "grad_norm": 0.2660152316093445, + "learning_rate": 4.141867101508357e-06, + "loss": 4.0776, + "step": 69005 + }, + { + "epoch": 4.6888164152738145, + "grad_norm": 0.2875518202781677, + "learning_rate": 4.14144245142003e-06, + "loss": 4.016, + "step": 69010 + }, + { + "epoch": 4.689156135344476, + "grad_norm": 0.22561688721179962, + "learning_rate": 4.1410178013317035e-06, + "loss": 4.1144, + "step": 69015 + }, + { + "epoch": 4.689495855415138, + "grad_norm": 0.37737584114074707, + "learning_rate": 4.1405931512433755e-06, + "loss": 4.0354, + "step": 69020 + }, + { + "epoch": 4.6898355754858, + "grad_norm": 0.7217426896095276, + "learning_rate": 4.140168501155048e-06, + "loss": 4.0556, + "step": 69025 + }, + { + "epoch": 4.690175295556461, + "grad_norm": 0.22923336923122406, + "learning_rate": 4.139743851066721e-06, + "loss": 4.3288, + "step": 69030 + }, + { + "epoch": 4.690515015627123, + "grad_norm": 0.31714892387390137, + "learning_rate": 4.139319200978394e-06, + "loss": 3.7806, + "step": 69035 + }, + { + "epoch": 4.690854735697785, + "grad_norm": 0.2392427772283554, + "learning_rate": 4.138894550890067e-06, + "loss": 3.6784, + "step": 69040 + }, + { + "epoch": 4.691194455768446, + "grad_norm": 0.2777140140533447, + "learning_rate": 4.1384699008017395e-06, + "loss": 3.9148, + "step": 69045 + }, + { + "epoch": 4.6915341758391085, + "grad_norm": 0.40870076417922974, + "learning_rate": 4.138045250713412e-06, + "loss": 3.9656, + "step": 69050 + }, + { + "epoch": 4.6918738959097706, + "grad_norm": 0.4268193244934082, + "learning_rate": 4.137620600625085e-06, + "loss": 3.9031, + "step": 69055 + }, + { + "epoch": 4.692213615980432, + "grad_norm": 0.40742751955986023, + "learning_rate": 4.137195950536758e-06, + "loss": 4.1324, + "step": 69060 + }, + { + "epoch": 4.692553336051094, + "grad_norm": 0.23884089291095734, + "learning_rate": 4.136771300448431e-06, + "loss": 3.9031, + "step": 69065 + }, + { + "epoch": 4.692893056121756, + "grad_norm": 0.6319859623908997, + "learning_rate": 4.1363466503601035e-06, + "loss": 3.9795, + "step": 69070 + }, + { + "epoch": 4.693232776192417, + "grad_norm": 0.318403959274292, + "learning_rate": 4.135922000271776e-06, + "loss": 4.2173, + "step": 69075 + }, + { + "epoch": 4.693572496263079, + "grad_norm": 0.22012777626514435, + "learning_rate": 4.135497350183449e-06, + "loss": 3.9628, + "step": 69080 + }, + { + "epoch": 4.693912216333741, + "grad_norm": 0.2551034688949585, + "learning_rate": 4.135072700095122e-06, + "loss": 4.2888, + "step": 69085 + }, + { + "epoch": 4.694251936404402, + "grad_norm": 0.3549894094467163, + "learning_rate": 4.134648050006795e-06, + "loss": 3.5859, + "step": 69090 + }, + { + "epoch": 4.6945916564750645, + "grad_norm": 0.24533511698246002, + "learning_rate": 4.1342233999184675e-06, + "loss": 4.2107, + "step": 69095 + }, + { + "epoch": 4.694931376545727, + "grad_norm": 0.27376890182495117, + "learning_rate": 4.13379874983014e-06, + "loss": 4.0522, + "step": 69100 + }, + { + "epoch": 4.695271096616388, + "grad_norm": 0.2558518052101135, + "learning_rate": 4.133374099741813e-06, + "loss": 3.9955, + "step": 69105 + }, + { + "epoch": 4.69561081668705, + "grad_norm": 0.29751425981521606, + "learning_rate": 4.132949449653486e-06, + "loss": 3.9222, + "step": 69110 + }, + { + "epoch": 4.695950536757712, + "grad_norm": 0.36307045817375183, + "learning_rate": 4.132524799565159e-06, + "loss": 4.0016, + "step": 69115 + }, + { + "epoch": 4.696290256828373, + "grad_norm": 0.3400798738002777, + "learning_rate": 4.1321001494768315e-06, + "loss": 4.2027, + "step": 69120 + }, + { + "epoch": 4.696629976899035, + "grad_norm": 0.4063388407230377, + "learning_rate": 4.131675499388504e-06, + "loss": 4.0733, + "step": 69125 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.35090672969818115, + "learning_rate": 4.131250849300177e-06, + "loss": 3.7832, + "step": 69130 + }, + { + "epoch": 4.697309417040358, + "grad_norm": 0.20315729081630707, + "learning_rate": 4.13082619921185e-06, + "loss": 3.8591, + "step": 69135 + }, + { + "epoch": 4.6976491371110205, + "grad_norm": 0.24926349520683289, + "learning_rate": 4.130401549123523e-06, + "loss": 3.8812, + "step": 69140 + }, + { + "epoch": 4.697988857181683, + "grad_norm": 0.27757951617240906, + "learning_rate": 4.1299768990351955e-06, + "loss": 3.8671, + "step": 69145 + }, + { + "epoch": 4.698328577252344, + "grad_norm": 0.339923620223999, + "learning_rate": 4.129552248946868e-06, + "loss": 4.1101, + "step": 69150 + }, + { + "epoch": 4.698668297323006, + "grad_norm": 0.37694868445396423, + "learning_rate": 4.129127598858541e-06, + "loss": 4.0188, + "step": 69155 + }, + { + "epoch": 4.699008017393668, + "grad_norm": 0.22621186077594757, + "learning_rate": 4.128702948770214e-06, + "loss": 4.0001, + "step": 69160 + }, + { + "epoch": 4.699347737464329, + "grad_norm": 0.25992637872695923, + "learning_rate": 4.128278298681887e-06, + "loss": 3.8863, + "step": 69165 + }, + { + "epoch": 4.699687457534991, + "grad_norm": 0.2560329735279083, + "learning_rate": 4.1278536485935595e-06, + "loss": 3.8322, + "step": 69170 + }, + { + "epoch": 4.700027177605653, + "grad_norm": 0.28852570056915283, + "learning_rate": 4.1274289985052315e-06, + "loss": 3.8935, + "step": 69175 + }, + { + "epoch": 4.700366897676314, + "grad_norm": 0.2521514892578125, + "learning_rate": 4.127004348416905e-06, + "loss": 3.9058, + "step": 69180 + }, + { + "epoch": 4.7007066177469765, + "grad_norm": 0.2513149380683899, + "learning_rate": 4.126579698328578e-06, + "loss": 3.8288, + "step": 69185 + }, + { + "epoch": 4.701046337817639, + "grad_norm": 0.34928208589553833, + "learning_rate": 4.12615504824025e-06, + "loss": 3.7522, + "step": 69190 + }, + { + "epoch": 4.7013860578883, + "grad_norm": 0.26604336500167847, + "learning_rate": 4.1257303981519235e-06, + "loss": 4.0828, + "step": 69195 + }, + { + "epoch": 4.701725777958962, + "grad_norm": 0.2753436863422394, + "learning_rate": 4.125305748063596e-06, + "loss": 4.0388, + "step": 69200 + }, + { + "epoch": 4.702065498029624, + "grad_norm": 0.2626219689846039, + "learning_rate": 4.124881097975268e-06, + "loss": 4.1525, + "step": 69205 + }, + { + "epoch": 4.702405218100285, + "grad_norm": 0.23048512637615204, + "learning_rate": 4.124456447886941e-06, + "loss": 3.9473, + "step": 69210 + }, + { + "epoch": 4.702744938170947, + "grad_norm": 0.3290770351886749, + "learning_rate": 4.124031797798615e-06, + "loss": 3.9502, + "step": 69215 + }, + { + "epoch": 4.703084658241609, + "grad_norm": 0.5099068880081177, + "learning_rate": 4.123607147710287e-06, + "loss": 4.151, + "step": 69220 + }, + { + "epoch": 4.7034243783122704, + "grad_norm": 0.35530251264572144, + "learning_rate": 4.1231824976219595e-06, + "loss": 3.9024, + "step": 69225 + }, + { + "epoch": 4.7037640983829325, + "grad_norm": 0.21389110386371613, + "learning_rate": 4.122757847533633e-06, + "loss": 3.9628, + "step": 69230 + }, + { + "epoch": 4.704103818453595, + "grad_norm": 0.2861437201499939, + "learning_rate": 4.122333197445305e-06, + "loss": 3.9005, + "step": 69235 + }, + { + "epoch": 4.704443538524256, + "grad_norm": 0.23323971033096313, + "learning_rate": 4.121908547356978e-06, + "loss": 3.8636, + "step": 69240 + }, + { + "epoch": 4.704783258594918, + "grad_norm": 0.3150014281272888, + "learning_rate": 4.121483897268651e-06, + "loss": 4.1413, + "step": 69245 + }, + { + "epoch": 4.70512297866558, + "grad_norm": 0.3548981547355652, + "learning_rate": 4.121059247180324e-06, + "loss": 3.8715, + "step": 69250 + }, + { + "epoch": 4.705462698736241, + "grad_norm": 0.3023783266544342, + "learning_rate": 4.120634597091996e-06, + "loss": 4.1613, + "step": 69255 + }, + { + "epoch": 4.705802418806903, + "grad_norm": 0.43874624371528625, + "learning_rate": 4.120209947003669e-06, + "loss": 3.924, + "step": 69260 + }, + { + "epoch": 4.706142138877565, + "grad_norm": 0.25863397121429443, + "learning_rate": 4.119785296915343e-06, + "loss": 3.7047, + "step": 69265 + }, + { + "epoch": 4.7064818589482265, + "grad_norm": 0.23446431756019592, + "learning_rate": 4.119360646827015e-06, + "loss": 4.0315, + "step": 69270 + }, + { + "epoch": 4.7068215790188885, + "grad_norm": 0.31912946701049805, + "learning_rate": 4.1189359967386875e-06, + "loss": 3.7636, + "step": 69275 + }, + { + "epoch": 4.707161299089551, + "grad_norm": 0.29415202140808105, + "learning_rate": 4.11851134665036e-06, + "loss": 3.787, + "step": 69280 + }, + { + "epoch": 4.707501019160212, + "grad_norm": 0.28367435932159424, + "learning_rate": 4.118086696562033e-06, + "loss": 4.051, + "step": 69285 + }, + { + "epoch": 4.707840739230874, + "grad_norm": 0.3812422454357147, + "learning_rate": 4.117662046473706e-06, + "loss": 3.7685, + "step": 69290 + }, + { + "epoch": 4.708180459301536, + "grad_norm": 0.3948708176612854, + "learning_rate": 4.117237396385379e-06, + "loss": 3.902, + "step": 69295 + }, + { + "epoch": 4.708520179372197, + "grad_norm": 0.27473604679107666, + "learning_rate": 4.1168127462970515e-06, + "loss": 4.0783, + "step": 69300 + }, + { + "epoch": 4.708859899442859, + "grad_norm": 0.33832311630249023, + "learning_rate": 4.116388096208724e-06, + "loss": 4.0558, + "step": 69305 + }, + { + "epoch": 4.709199619513521, + "grad_norm": 0.26028165221214294, + "learning_rate": 4.115963446120397e-06, + "loss": 4.1617, + "step": 69310 + }, + { + "epoch": 4.7095393395841825, + "grad_norm": 0.20481957495212555, + "learning_rate": 4.11553879603207e-06, + "loss": 3.9585, + "step": 69315 + }, + { + "epoch": 4.7098790596548445, + "grad_norm": 0.30116480588912964, + "learning_rate": 4.115114145943743e-06, + "loss": 3.9273, + "step": 69320 + }, + { + "epoch": 4.710218779725507, + "grad_norm": 0.22698715329170227, + "learning_rate": 4.1146894958554155e-06, + "loss": 3.9918, + "step": 69325 + }, + { + "epoch": 4.710558499796168, + "grad_norm": 0.29998454451560974, + "learning_rate": 4.114264845767088e-06, + "loss": 4.0045, + "step": 69330 + }, + { + "epoch": 4.71089821986683, + "grad_norm": 0.2492099404335022, + "learning_rate": 4.113840195678761e-06, + "loss": 3.8208, + "step": 69335 + }, + { + "epoch": 4.711237939937492, + "grad_norm": 0.28736168146133423, + "learning_rate": 4.113415545590434e-06, + "loss": 4.1883, + "step": 69340 + }, + { + "epoch": 4.711577660008153, + "grad_norm": 0.6306652426719666, + "learning_rate": 4.112990895502107e-06, + "loss": 4.1316, + "step": 69345 + }, + { + "epoch": 4.711917380078815, + "grad_norm": 0.2569451332092285, + "learning_rate": 4.1125662454137795e-06, + "loss": 3.6893, + "step": 69350 + }, + { + "epoch": 4.712257100149477, + "grad_norm": 0.3202376067638397, + "learning_rate": 4.112141595325452e-06, + "loss": 3.7687, + "step": 69355 + }, + { + "epoch": 4.7125968202201385, + "grad_norm": 0.2108306586742401, + "learning_rate": 4.111716945237125e-06, + "loss": 3.9843, + "step": 69360 + }, + { + "epoch": 4.712936540290801, + "grad_norm": 0.2263680398464203, + "learning_rate": 4.111292295148798e-06, + "loss": 3.9628, + "step": 69365 + }, + { + "epoch": 4.713276260361463, + "grad_norm": 0.39049017429351807, + "learning_rate": 4.110867645060471e-06, + "loss": 3.8679, + "step": 69370 + }, + { + "epoch": 4.713615980432124, + "grad_norm": 0.35549047589302063, + "learning_rate": 4.110442994972143e-06, + "loss": 3.9921, + "step": 69375 + }, + { + "epoch": 4.713955700502786, + "grad_norm": 0.2297433465719223, + "learning_rate": 4.110018344883816e-06, + "loss": 3.9585, + "step": 69380 + }, + { + "epoch": 4.714295420573448, + "grad_norm": 0.21252702176570892, + "learning_rate": 4.109593694795489e-06, + "loss": 3.7165, + "step": 69385 + }, + { + "epoch": 4.714635140644109, + "grad_norm": 0.27008122205734253, + "learning_rate": 4.109169044707161e-06, + "loss": 3.7685, + "step": 69390 + }, + { + "epoch": 4.714974860714771, + "grad_norm": 0.3021203875541687, + "learning_rate": 4.108744394618835e-06, + "loss": 4.0284, + "step": 69395 + }, + { + "epoch": 4.715314580785432, + "grad_norm": 0.25168225169181824, + "learning_rate": 4.1083197445305075e-06, + "loss": 3.8432, + "step": 69400 + }, + { + "epoch": 4.7156543008560945, + "grad_norm": 0.3792319595813751, + "learning_rate": 4.1078950944421794e-06, + "loss": 4.0, + "step": 69405 + }, + { + "epoch": 4.715994020926757, + "grad_norm": 0.29155850410461426, + "learning_rate": 4.107470444353853e-06, + "loss": 4.0032, + "step": 69410 + }, + { + "epoch": 4.716333740997418, + "grad_norm": 0.31140583753585815, + "learning_rate": 4.107045794265526e-06, + "loss": 3.9426, + "step": 69415 + }, + { + "epoch": 4.71667346106808, + "grad_norm": 0.33567652106285095, + "learning_rate": 4.106621144177199e-06, + "loss": 3.9264, + "step": 69420 + }, + { + "epoch": 4.717013181138742, + "grad_norm": 0.43817460536956787, + "learning_rate": 4.106196494088871e-06, + "loss": 4.062, + "step": 69425 + }, + { + "epoch": 4.717352901209403, + "grad_norm": 0.21264372766017914, + "learning_rate": 4.105771844000544e-06, + "loss": 3.8881, + "step": 69430 + }, + { + "epoch": 4.717692621280065, + "grad_norm": 0.26022931933403015, + "learning_rate": 4.105347193912217e-06, + "loss": 3.8875, + "step": 69435 + }, + { + "epoch": 4.718032341350727, + "grad_norm": 0.2271299809217453, + "learning_rate": 4.104922543823889e-06, + "loss": 3.7595, + "step": 69440 + }, + { + "epoch": 4.718372061421388, + "grad_norm": 0.3127329349517822, + "learning_rate": 4.104497893735563e-06, + "loss": 3.9681, + "step": 69445 + }, + { + "epoch": 4.7187117814920505, + "grad_norm": 0.2074800729751587, + "learning_rate": 4.1040732436472355e-06, + "loss": 3.8984, + "step": 69450 + }, + { + "epoch": 4.719051501562713, + "grad_norm": 0.33778324723243713, + "learning_rate": 4.1036485935589075e-06, + "loss": 3.6164, + "step": 69455 + }, + { + "epoch": 4.719391221633374, + "grad_norm": 0.3309818506240845, + "learning_rate": 4.10322394347058e-06, + "loss": 3.8921, + "step": 69460 + }, + { + "epoch": 4.719730941704036, + "grad_norm": 0.308347225189209, + "learning_rate": 4.102799293382254e-06, + "loss": 3.997, + "step": 69465 + }, + { + "epoch": 4.720070661774698, + "grad_norm": 0.2695946991443634, + "learning_rate": 4.102374643293926e-06, + "loss": 4.0184, + "step": 69470 + }, + { + "epoch": 4.720410381845359, + "grad_norm": 0.4683176577091217, + "learning_rate": 4.101949993205599e-06, + "loss": 3.9107, + "step": 69475 + }, + { + "epoch": 4.720750101916021, + "grad_norm": 0.21727685630321503, + "learning_rate": 4.101525343117272e-06, + "loss": 3.7776, + "step": 69480 + }, + { + "epoch": 4.721089821986683, + "grad_norm": 0.33663907647132874, + "learning_rate": 4.101100693028944e-06, + "loss": 3.678, + "step": 69485 + }, + { + "epoch": 4.721429542057344, + "grad_norm": 0.217405304312706, + "learning_rate": 4.100676042940617e-06, + "loss": 3.8107, + "step": 69490 + }, + { + "epoch": 4.7217692621280065, + "grad_norm": 0.2770959138870239, + "learning_rate": 4.10025139285229e-06, + "loss": 4.0003, + "step": 69495 + }, + { + "epoch": 4.722108982198669, + "grad_norm": 0.267207533121109, + "learning_rate": 4.099826742763963e-06, + "loss": 4.0402, + "step": 69500 + }, + { + "epoch": 4.72244870226933, + "grad_norm": 0.23940257728099823, + "learning_rate": 4.0994020926756355e-06, + "loss": 4.0648, + "step": 69505 + }, + { + "epoch": 4.722788422339992, + "grad_norm": 0.26514357328414917, + "learning_rate": 4.098977442587308e-06, + "loss": 4.0686, + "step": 69510 + }, + { + "epoch": 4.723128142410654, + "grad_norm": 0.34200698137283325, + "learning_rate": 4.098552792498981e-06, + "loss": 4.0776, + "step": 69515 + }, + { + "epoch": 4.723467862481315, + "grad_norm": 0.2820311486721039, + "learning_rate": 4.098128142410654e-06, + "loss": 3.9398, + "step": 69520 + }, + { + "epoch": 4.723807582551977, + "grad_norm": 0.23795245587825775, + "learning_rate": 4.097703492322327e-06, + "loss": 3.8197, + "step": 69525 + }, + { + "epoch": 4.724147302622639, + "grad_norm": 0.2902182936668396, + "learning_rate": 4.0972788422339995e-06, + "loss": 3.8341, + "step": 69530 + }, + { + "epoch": 4.7244870226933005, + "grad_norm": 0.3362990617752075, + "learning_rate": 4.096854192145672e-06, + "loss": 3.8393, + "step": 69535 + }, + { + "epoch": 4.7248267427639625, + "grad_norm": 0.226045161485672, + "learning_rate": 4.096429542057345e-06, + "loss": 3.7654, + "step": 69540 + }, + { + "epoch": 4.725166462834625, + "grad_norm": 0.28829094767570496, + "learning_rate": 4.096004891969018e-06, + "loss": 3.9473, + "step": 69545 + }, + { + "epoch": 4.725506182905286, + "grad_norm": 0.230289489030838, + "learning_rate": 4.095580241880691e-06, + "loss": 4.0742, + "step": 69550 + }, + { + "epoch": 4.725845902975948, + "grad_norm": 0.19902309775352478, + "learning_rate": 4.0951555917923635e-06, + "loss": 3.705, + "step": 69555 + }, + { + "epoch": 4.72618562304661, + "grad_norm": 0.24870720505714417, + "learning_rate": 4.094730941704036e-06, + "loss": 3.8488, + "step": 69560 + }, + { + "epoch": 4.726525343117271, + "grad_norm": 0.35571226477622986, + "learning_rate": 4.094306291615709e-06, + "loss": 3.9568, + "step": 69565 + }, + { + "epoch": 4.726865063187933, + "grad_norm": 0.274068683385849, + "learning_rate": 4.093881641527382e-06, + "loss": 4.03, + "step": 69570 + }, + { + "epoch": 4.727204783258595, + "grad_norm": 0.26948195695877075, + "learning_rate": 4.093456991439055e-06, + "loss": 3.8581, + "step": 69575 + }, + { + "epoch": 4.7275445033292565, + "grad_norm": 0.2839091122150421, + "learning_rate": 4.0930323413507275e-06, + "loss": 4.0162, + "step": 69580 + }, + { + "epoch": 4.7278842233999185, + "grad_norm": 0.289050817489624, + "learning_rate": 4.0926076912624e-06, + "loss": 4.0524, + "step": 69585 + }, + { + "epoch": 4.728223943470581, + "grad_norm": 0.2547605037689209, + "learning_rate": 4.092183041174073e-06, + "loss": 3.9571, + "step": 69590 + }, + { + "epoch": 4.728563663541242, + "grad_norm": 0.27419736981391907, + "learning_rate": 4.091758391085746e-06, + "loss": 3.8827, + "step": 69595 + }, + { + "epoch": 4.728903383611904, + "grad_norm": 0.23654581606388092, + "learning_rate": 4.091333740997419e-06, + "loss": 3.7964, + "step": 69600 + }, + { + "epoch": 4.729243103682566, + "grad_norm": 0.23829735815525055, + "learning_rate": 4.0909090909090915e-06, + "loss": 3.7773, + "step": 69605 + }, + { + "epoch": 4.729582823753227, + "grad_norm": 0.244020015001297, + "learning_rate": 4.090484440820764e-06, + "loss": 3.9861, + "step": 69610 + }, + { + "epoch": 4.729922543823889, + "grad_norm": 0.3573097586631775, + "learning_rate": 4.090059790732437e-06, + "loss": 3.8415, + "step": 69615 + }, + { + "epoch": 4.73026226389455, + "grad_norm": 0.24132080376148224, + "learning_rate": 4.08963514064411e-06, + "loss": 3.9467, + "step": 69620 + }, + { + "epoch": 4.7306019839652125, + "grad_norm": 0.26825112104415894, + "learning_rate": 4.089210490555782e-06, + "loss": 3.9436, + "step": 69625 + }, + { + "epoch": 4.7309417040358746, + "grad_norm": 0.3233736753463745, + "learning_rate": 4.0887858404674555e-06, + "loss": 4.0998, + "step": 69630 + }, + { + "epoch": 4.731281424106536, + "grad_norm": 0.2508729100227356, + "learning_rate": 4.088361190379128e-06, + "loss": 3.8773, + "step": 69635 + }, + { + "epoch": 4.731621144177198, + "grad_norm": 0.27217745780944824, + "learning_rate": 4.0879365402908e-06, + "loss": 3.9687, + "step": 69640 + }, + { + "epoch": 4.73196086424786, + "grad_norm": 0.2375875860452652, + "learning_rate": 4.087511890202474e-06, + "loss": 3.9762, + "step": 69645 + }, + { + "epoch": 4.732300584318521, + "grad_norm": 0.2333940714597702, + "learning_rate": 4.087087240114147e-06, + "loss": 4.0143, + "step": 69650 + }, + { + "epoch": 4.732640304389183, + "grad_norm": 0.33029067516326904, + "learning_rate": 4.086662590025819e-06, + "loss": 3.9239, + "step": 69655 + }, + { + "epoch": 4.732980024459845, + "grad_norm": 0.31309160590171814, + "learning_rate": 4.0862379399374914e-06, + "loss": 3.9796, + "step": 69660 + }, + { + "epoch": 4.733319744530506, + "grad_norm": 0.22619031369686127, + "learning_rate": 4.085813289849165e-06, + "loss": 3.7442, + "step": 69665 + }, + { + "epoch": 4.7336594646011685, + "grad_norm": 0.28935202956199646, + "learning_rate": 4.085388639760837e-06, + "loss": 3.9339, + "step": 69670 + }, + { + "epoch": 4.733999184671831, + "grad_norm": 0.27134817838668823, + "learning_rate": 4.08496398967251e-06, + "loss": 3.9957, + "step": 69675 + }, + { + "epoch": 4.734338904742492, + "grad_norm": 0.2333812564611435, + "learning_rate": 4.0845393395841835e-06, + "loss": 3.9718, + "step": 69680 + }, + { + "epoch": 4.734678624813154, + "grad_norm": 0.2188495397567749, + "learning_rate": 4.0841146894958554e-06, + "loss": 3.877, + "step": 69685 + }, + { + "epoch": 4.735018344883816, + "grad_norm": 0.317150354385376, + "learning_rate": 4.083690039407528e-06, + "loss": 4.0837, + "step": 69690 + }, + { + "epoch": 4.735358064954477, + "grad_norm": 0.27309319376945496, + "learning_rate": 4.083265389319202e-06, + "loss": 3.8946, + "step": 69695 + }, + { + "epoch": 4.735697785025139, + "grad_norm": 0.2708241045475006, + "learning_rate": 4.082840739230874e-06, + "loss": 3.7799, + "step": 69700 + }, + { + "epoch": 4.736037505095801, + "grad_norm": 0.32467785477638245, + "learning_rate": 4.082416089142547e-06, + "loss": 3.8827, + "step": 69705 + }, + { + "epoch": 4.736377225166462, + "grad_norm": 0.29012197256088257, + "learning_rate": 4.0819914390542194e-06, + "loss": 4.0753, + "step": 69710 + }, + { + "epoch": 4.7367169452371245, + "grad_norm": 0.25069063901901245, + "learning_rate": 4.081566788965892e-06, + "loss": 4.2186, + "step": 69715 + }, + { + "epoch": 4.737056665307787, + "grad_norm": 0.40803349018096924, + "learning_rate": 4.081142138877565e-06, + "loss": 3.7347, + "step": 69720 + }, + { + "epoch": 4.737396385378448, + "grad_norm": 0.27548930048942566, + "learning_rate": 4.080717488789238e-06, + "loss": 4.2377, + "step": 69725 + }, + { + "epoch": 4.73773610544911, + "grad_norm": 0.3089364469051361, + "learning_rate": 4.080292838700911e-06, + "loss": 3.9675, + "step": 69730 + }, + { + "epoch": 4.738075825519772, + "grad_norm": 0.24532827734947205, + "learning_rate": 4.0798681886125834e-06, + "loss": 3.8407, + "step": 69735 + }, + { + "epoch": 4.738415545590433, + "grad_norm": 0.2578384280204773, + "learning_rate": 4.079443538524256e-06, + "loss": 4.0149, + "step": 69740 + }, + { + "epoch": 4.738755265661095, + "grad_norm": 0.4457300901412964, + "learning_rate": 4.079018888435929e-06, + "loss": 3.9766, + "step": 69745 + }, + { + "epoch": 4.739094985731757, + "grad_norm": 0.2624038755893707, + "learning_rate": 4.078594238347602e-06, + "loss": 4.0285, + "step": 69750 + }, + { + "epoch": 4.739434705802418, + "grad_norm": 0.22369644045829773, + "learning_rate": 4.078169588259275e-06, + "loss": 4.1425, + "step": 69755 + }, + { + "epoch": 4.7397744258730805, + "grad_norm": 0.38977596163749695, + "learning_rate": 4.0777449381709475e-06, + "loss": 3.996, + "step": 69760 + }, + { + "epoch": 4.740114145943743, + "grad_norm": 0.23872877657413483, + "learning_rate": 4.07732028808262e-06, + "loss": 3.8978, + "step": 69765 + }, + { + "epoch": 4.740453866014404, + "grad_norm": 0.43245983123779297, + "learning_rate": 4.076895637994293e-06, + "loss": 3.719, + "step": 69770 + }, + { + "epoch": 4.740793586085066, + "grad_norm": 0.29623904824256897, + "learning_rate": 4.076470987905966e-06, + "loss": 3.9096, + "step": 69775 + }, + { + "epoch": 4.741133306155728, + "grad_norm": 0.3847310245037079, + "learning_rate": 4.076046337817639e-06, + "loss": 4.0754, + "step": 69780 + }, + { + "epoch": 4.741473026226389, + "grad_norm": 0.279632031917572, + "learning_rate": 4.0756216877293115e-06, + "loss": 3.9001, + "step": 69785 + }, + { + "epoch": 4.741812746297051, + "grad_norm": 0.3875434398651123, + "learning_rate": 4.075197037640984e-06, + "loss": 3.9987, + "step": 69790 + }, + { + "epoch": 4.742152466367713, + "grad_norm": 0.22694562375545502, + "learning_rate": 4.074772387552657e-06, + "loss": 4.0789, + "step": 69795 + }, + { + "epoch": 4.7424921864383744, + "grad_norm": 0.32248997688293457, + "learning_rate": 4.07434773746433e-06, + "loss": 3.8946, + "step": 69800 + }, + { + "epoch": 4.7428319065090365, + "grad_norm": 0.2785702645778656, + "learning_rate": 4.073923087376003e-06, + "loss": 3.8235, + "step": 69805 + }, + { + "epoch": 4.743171626579699, + "grad_norm": 0.3021383285522461, + "learning_rate": 4.0734984372876755e-06, + "loss": 4.0887, + "step": 69810 + }, + { + "epoch": 4.74351134665036, + "grad_norm": 0.46751806139945984, + "learning_rate": 4.073073787199348e-06, + "loss": 3.7177, + "step": 69815 + }, + { + "epoch": 4.743851066721022, + "grad_norm": 0.23286499083042145, + "learning_rate": 4.072649137111021e-06, + "loss": 3.8164, + "step": 69820 + }, + { + "epoch": 4.744190786791684, + "grad_norm": 0.2269468456506729, + "learning_rate": 4.072224487022694e-06, + "loss": 3.7581, + "step": 69825 + }, + { + "epoch": 4.744530506862345, + "grad_norm": 0.26907163858413696, + "learning_rate": 4.071799836934367e-06, + "loss": 3.8636, + "step": 69830 + }, + { + "epoch": 4.744870226933007, + "grad_norm": 0.2867685854434967, + "learning_rate": 4.0713751868460395e-06, + "loss": 4.1196, + "step": 69835 + }, + { + "epoch": 4.745209947003669, + "grad_norm": 0.28745636343955994, + "learning_rate": 4.070950536757711e-06, + "loss": 3.8333, + "step": 69840 + }, + { + "epoch": 4.7455496670743305, + "grad_norm": 0.28876641392707825, + "learning_rate": 4.070525886669385e-06, + "loss": 3.8097, + "step": 69845 + }, + { + "epoch": 4.7458893871449925, + "grad_norm": 0.23102129995822906, + "learning_rate": 4.070101236581058e-06, + "loss": 4.0797, + "step": 69850 + }, + { + "epoch": 4.746229107215655, + "grad_norm": 0.3552204668521881, + "learning_rate": 4.06967658649273e-06, + "loss": 4.1129, + "step": 69855 + }, + { + "epoch": 4.746568827286316, + "grad_norm": 0.26606419682502747, + "learning_rate": 4.0692519364044035e-06, + "loss": 3.6983, + "step": 69860 + }, + { + "epoch": 4.746908547356978, + "grad_norm": 0.22300323843955994, + "learning_rate": 4.068827286316076e-06, + "loss": 4.0597, + "step": 69865 + }, + { + "epoch": 4.74724826742764, + "grad_norm": 0.41014716029167175, + "learning_rate": 4.068402636227748e-06, + "loss": 3.7776, + "step": 69870 + }, + { + "epoch": 4.747587987498301, + "grad_norm": 0.466350793838501, + "learning_rate": 4.067977986139421e-06, + "loss": 3.9805, + "step": 69875 + }, + { + "epoch": 4.747927707568963, + "grad_norm": 0.28359273076057434, + "learning_rate": 4.067553336051095e-06, + "loss": 3.961, + "step": 69880 + }, + { + "epoch": 4.748267427639625, + "grad_norm": 0.5546168684959412, + "learning_rate": 4.067128685962767e-06, + "loss": 4.0233, + "step": 69885 + }, + { + "epoch": 4.7486071477102865, + "grad_norm": 0.3489242494106293, + "learning_rate": 4.0667040358744394e-06, + "loss": 4.0607, + "step": 69890 + }, + { + "epoch": 4.7489468677809485, + "grad_norm": 0.39619964361190796, + "learning_rate": 4.066279385786113e-06, + "loss": 4.0997, + "step": 69895 + }, + { + "epoch": 4.749286587851611, + "grad_norm": 0.24934236705303192, + "learning_rate": 4.065854735697785e-06, + "loss": 3.9062, + "step": 69900 + }, + { + "epoch": 4.749626307922272, + "grad_norm": 0.3676300346851349, + "learning_rate": 4.065430085609458e-06, + "loss": 4.007, + "step": 69905 + }, + { + "epoch": 4.749966027992934, + "grad_norm": 0.3325871527194977, + "learning_rate": 4.065005435521131e-06, + "loss": 3.9672, + "step": 69910 + }, + { + "epoch": 4.750305748063596, + "grad_norm": 0.2331000119447708, + "learning_rate": 4.0645807854328034e-06, + "loss": 4.0238, + "step": 69915 + }, + { + "epoch": 4.750645468134257, + "grad_norm": 0.29174432158470154, + "learning_rate": 4.064156135344476e-06, + "loss": 3.9677, + "step": 69920 + }, + { + "epoch": 4.750985188204919, + "grad_norm": 0.4348433017730713, + "learning_rate": 4.063731485256149e-06, + "loss": 3.7808, + "step": 69925 + }, + { + "epoch": 4.751324908275581, + "grad_norm": 0.27299779653549194, + "learning_rate": 4.063306835167823e-06, + "loss": 3.921, + "step": 69930 + }, + { + "epoch": 4.7516646283462425, + "grad_norm": 0.38808414340019226, + "learning_rate": 4.062882185079495e-06, + "loss": 3.766, + "step": 69935 + }, + { + "epoch": 4.7520043484169046, + "grad_norm": 0.235537588596344, + "learning_rate": 4.0624575349911674e-06, + "loss": 3.9049, + "step": 69940 + }, + { + "epoch": 4.752344068487567, + "grad_norm": 0.2702772915363312, + "learning_rate": 4.06203288490284e-06, + "loss": 4.1245, + "step": 69945 + }, + { + "epoch": 4.752683788558228, + "grad_norm": 0.2797992527484894, + "learning_rate": 4.061608234814513e-06, + "loss": 3.9, + "step": 69950 + }, + { + "epoch": 4.75302350862889, + "grad_norm": 0.23077234625816345, + "learning_rate": 4.061183584726186e-06, + "loss": 4.1577, + "step": 69955 + }, + { + "epoch": 4.753363228699552, + "grad_norm": 0.33920153975486755, + "learning_rate": 4.060758934637859e-06, + "loss": 4.0127, + "step": 69960 + }, + { + "epoch": 4.753702948770213, + "grad_norm": 0.23437565565109253, + "learning_rate": 4.0603342845495314e-06, + "loss": 4.0266, + "step": 69965 + }, + { + "epoch": 4.754042668840875, + "grad_norm": 0.25116148591041565, + "learning_rate": 4.059909634461204e-06, + "loss": 3.9568, + "step": 69970 + }, + { + "epoch": 4.754382388911537, + "grad_norm": 0.3025786578655243, + "learning_rate": 4.059484984372877e-06, + "loss": 3.8102, + "step": 69975 + }, + { + "epoch": 4.7547221089821985, + "grad_norm": 0.34861040115356445, + "learning_rate": 4.05906033428455e-06, + "loss": 3.8599, + "step": 69980 + }, + { + "epoch": 4.755061829052861, + "grad_norm": 0.38687485456466675, + "learning_rate": 4.058635684196223e-06, + "loss": 4.0373, + "step": 69985 + }, + { + "epoch": 4.755401549123523, + "grad_norm": 0.25283724069595337, + "learning_rate": 4.0582110341078954e-06, + "loss": 3.854, + "step": 69990 + }, + { + "epoch": 4.755741269194184, + "grad_norm": 0.8992840051651001, + "learning_rate": 4.057786384019568e-06, + "loss": 3.8644, + "step": 69995 + }, + { + "epoch": 4.756080989264846, + "grad_norm": 0.39510712027549744, + "learning_rate": 4.057361733931241e-06, + "loss": 4.0482, + "step": 70000 + }, + { + "epoch": 4.756420709335508, + "grad_norm": 0.28661036491394043, + "learning_rate": 4.056937083842914e-06, + "loss": 4.2327, + "step": 70005 + }, + { + "epoch": 4.756760429406169, + "grad_norm": 0.30771321058273315, + "learning_rate": 4.056512433754587e-06, + "loss": 3.8794, + "step": 70010 + }, + { + "epoch": 4.757100149476831, + "grad_norm": 0.2611895501613617, + "learning_rate": 4.0560877836662594e-06, + "loss": 3.9021, + "step": 70015 + }, + { + "epoch": 4.757439869547493, + "grad_norm": 0.2519482374191284, + "learning_rate": 4.055663133577932e-06, + "loss": 3.8455, + "step": 70020 + }, + { + "epoch": 4.7577795896181545, + "grad_norm": 0.31666579842567444, + "learning_rate": 4.055238483489605e-06, + "loss": 3.936, + "step": 70025 + }, + { + "epoch": 4.758119309688817, + "grad_norm": 0.26412147283554077, + "learning_rate": 4.054813833401278e-06, + "loss": 3.6799, + "step": 70030 + }, + { + "epoch": 4.758459029759479, + "grad_norm": 0.2909509837627411, + "learning_rate": 4.054389183312951e-06, + "loss": 4.1284, + "step": 70035 + }, + { + "epoch": 4.75879874983014, + "grad_norm": 0.2908852696418762, + "learning_rate": 4.053964533224623e-06, + "loss": 4.0019, + "step": 70040 + }, + { + "epoch": 4.759138469900802, + "grad_norm": 0.23273815214633942, + "learning_rate": 4.053539883136296e-06, + "loss": 4.0223, + "step": 70045 + }, + { + "epoch": 4.759478189971464, + "grad_norm": 0.21551057696342468, + "learning_rate": 4.053115233047969e-06, + "loss": 3.4504, + "step": 70050 + }, + { + "epoch": 4.759817910042125, + "grad_norm": 0.23276357352733612, + "learning_rate": 4.052690582959641e-06, + "loss": 4.0349, + "step": 70055 + }, + { + "epoch": 4.760157630112787, + "grad_norm": 0.20522312819957733, + "learning_rate": 4.052265932871315e-06, + "loss": 3.815, + "step": 70060 + }, + { + "epoch": 4.760497350183449, + "grad_norm": 0.33245036005973816, + "learning_rate": 4.0518412827829875e-06, + "loss": 3.7694, + "step": 70065 + }, + { + "epoch": 4.7608370702541105, + "grad_norm": 0.2945246696472168, + "learning_rate": 4.051416632694659e-06, + "loss": 4.1091, + "step": 70070 + }, + { + "epoch": 4.761176790324773, + "grad_norm": 0.31853145360946655, + "learning_rate": 4.050991982606333e-06, + "loss": 3.8556, + "step": 70075 + }, + { + "epoch": 4.761516510395434, + "grad_norm": 0.4813271164894104, + "learning_rate": 4.050567332518006e-06, + "loss": 4.0358, + "step": 70080 + }, + { + "epoch": 4.761856230466096, + "grad_norm": 0.2626662254333496, + "learning_rate": 4.050142682429678e-06, + "loss": 4.0639, + "step": 70085 + }, + { + "epoch": 4.762195950536758, + "grad_norm": 0.24721068143844604, + "learning_rate": 4.049718032341351e-06, + "loss": 3.7763, + "step": 70090 + }, + { + "epoch": 4.762535670607419, + "grad_norm": 0.3117769658565521, + "learning_rate": 4.049293382253024e-06, + "loss": 4.0178, + "step": 70095 + }, + { + "epoch": 4.762875390678081, + "grad_norm": 0.23137958347797394, + "learning_rate": 4.048868732164697e-06, + "loss": 4.0155, + "step": 70100 + }, + { + "epoch": 4.763215110748743, + "grad_norm": 0.19681313633918762, + "learning_rate": 4.048444082076369e-06, + "loss": 3.8917, + "step": 70105 + }, + { + "epoch": 4.7635548308194045, + "grad_norm": 0.25542816519737244, + "learning_rate": 4.048019431988043e-06, + "loss": 3.804, + "step": 70110 + }, + { + "epoch": 4.7638945508900665, + "grad_norm": 0.2715509533882141, + "learning_rate": 4.0475947818997155e-06, + "loss": 3.7884, + "step": 70115 + }, + { + "epoch": 4.764234270960729, + "grad_norm": NaN, + "learning_rate": 4.047255061829053e-06, + "loss": 4.051, + "step": 70120 + }, + { + "epoch": 4.76457399103139, + "grad_norm": 0.24022383987903595, + "learning_rate": 4.0468304117407255e-06, + "loss": 3.9647, + "step": 70125 + }, + { + "epoch": 4.764913711102052, + "grad_norm": 0.37133854627609253, + "learning_rate": 4.046405761652399e-06, + "loss": 3.8689, + "step": 70130 + }, + { + "epoch": 4.765253431172714, + "grad_norm": 0.25900140404701233, + "learning_rate": 4.045981111564072e-06, + "loss": 3.9964, + "step": 70135 + }, + { + "epoch": 4.765593151243375, + "grad_norm": 0.3158138692378998, + "learning_rate": 4.045556461475744e-06, + "loss": 3.9533, + "step": 70140 + }, + { + "epoch": 4.765932871314037, + "grad_norm": 0.260364830493927, + "learning_rate": 4.0451318113874175e-06, + "loss": 3.9049, + "step": 70145 + }, + { + "epoch": 4.766272591384699, + "grad_norm": 0.34011712670326233, + "learning_rate": 4.04470716129909e-06, + "loss": 3.751, + "step": 70150 + }, + { + "epoch": 4.7666123114553605, + "grad_norm": 0.25202372670173645, + "learning_rate": 4.044282511210762e-06, + "loss": 3.8356, + "step": 70155 + }, + { + "epoch": 4.7669520315260225, + "grad_norm": 0.3034617006778717, + "learning_rate": 4.043857861122435e-06, + "loss": 4.0309, + "step": 70160 + }, + { + "epoch": 4.767291751596685, + "grad_norm": 0.23636774718761444, + "learning_rate": 4.043433211034109e-06, + "loss": 3.9575, + "step": 70165 + }, + { + "epoch": 4.767631471667346, + "grad_norm": 0.4882575571537018, + "learning_rate": 4.043008560945781e-06, + "loss": 4.1147, + "step": 70170 + }, + { + "epoch": 4.767971191738008, + "grad_norm": 0.21141038835048676, + "learning_rate": 4.0425839108574535e-06, + "loss": 4.0164, + "step": 70175 + }, + { + "epoch": 4.76831091180867, + "grad_norm": 0.2622799575328827, + "learning_rate": 4.042159260769127e-06, + "loss": 4.1448, + "step": 70180 + }, + { + "epoch": 4.768650631879331, + "grad_norm": 0.2458282858133316, + "learning_rate": 4.041734610680799e-06, + "loss": 3.9525, + "step": 70185 + }, + { + "epoch": 4.768990351949993, + "grad_norm": 0.3943856358528137, + "learning_rate": 4.041309960592472e-06, + "loss": 4.0738, + "step": 70190 + }, + { + "epoch": 4.769330072020655, + "grad_norm": 0.28482457995414734, + "learning_rate": 4.0408853105041455e-06, + "loss": 3.9174, + "step": 70195 + }, + { + "epoch": 4.7696697920913165, + "grad_norm": 0.24241894483566284, + "learning_rate": 4.0404606604158175e-06, + "loss": 3.8083, + "step": 70200 + }, + { + "epoch": 4.7700095121619785, + "grad_norm": 0.3307875990867615, + "learning_rate": 4.04003601032749e-06, + "loss": 3.9317, + "step": 70205 + }, + { + "epoch": 4.770349232232641, + "grad_norm": 0.32953980565071106, + "learning_rate": 4.039611360239163e-06, + "loss": 3.9749, + "step": 70210 + }, + { + "epoch": 4.770688952303302, + "grad_norm": 0.3649264872074127, + "learning_rate": 4.039186710150836e-06, + "loss": 3.8741, + "step": 70215 + }, + { + "epoch": 4.771028672373964, + "grad_norm": 0.25634825229644775, + "learning_rate": 4.038762060062509e-06, + "loss": 3.952, + "step": 70220 + }, + { + "epoch": 4.771368392444626, + "grad_norm": 0.2456485778093338, + "learning_rate": 4.0383374099741815e-06, + "loss": 3.9627, + "step": 70225 + }, + { + "epoch": 4.771708112515287, + "grad_norm": 0.24021261930465698, + "learning_rate": 4.037912759885854e-06, + "loss": 3.8978, + "step": 70230 + }, + { + "epoch": 4.772047832585949, + "grad_norm": 0.3349233567714691, + "learning_rate": 4.037488109797527e-06, + "loss": 3.7225, + "step": 70235 + }, + { + "epoch": 4.772387552656611, + "grad_norm": 0.25099819898605347, + "learning_rate": 4.0370634597092e-06, + "loss": 3.7647, + "step": 70240 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.2296927571296692, + "learning_rate": 4.036638809620873e-06, + "loss": 3.8701, + "step": 70245 + }, + { + "epoch": 4.773066992797935, + "grad_norm": 0.211215540766716, + "learning_rate": 4.0362141595325455e-06, + "loss": 3.8833, + "step": 70250 + }, + { + "epoch": 4.773406712868597, + "grad_norm": 0.28873389959335327, + "learning_rate": 4.035789509444218e-06, + "loss": 3.926, + "step": 70255 + }, + { + "epoch": 4.773746432939258, + "grad_norm": 0.239119753241539, + "learning_rate": 4.035364859355891e-06, + "loss": 3.8696, + "step": 70260 + }, + { + "epoch": 4.77408615300992, + "grad_norm": 0.3525744676589966, + "learning_rate": 4.034940209267564e-06, + "loss": 3.8939, + "step": 70265 + }, + { + "epoch": 4.774425873080582, + "grad_norm": 0.27122893929481506, + "learning_rate": 4.034515559179237e-06, + "loss": 4.0808, + "step": 70270 + }, + { + "epoch": 4.774765593151243, + "grad_norm": 0.21894721686840057, + "learning_rate": 4.0340909090909095e-06, + "loss": 3.944, + "step": 70275 + }, + { + "epoch": 4.775105313221905, + "grad_norm": 0.21858420968055725, + "learning_rate": 4.033666259002582e-06, + "loss": 3.7415, + "step": 70280 + }, + { + "epoch": 4.775445033292567, + "grad_norm": 0.2111034393310547, + "learning_rate": 4.033241608914255e-06, + "loss": 4.1377, + "step": 70285 + }, + { + "epoch": 4.7757847533632285, + "grad_norm": 0.23544588685035706, + "learning_rate": 4.032816958825928e-06, + "loss": 4.019, + "step": 70290 + }, + { + "epoch": 4.776124473433891, + "grad_norm": 0.32596537470817566, + "learning_rate": 4.032392308737601e-06, + "loss": 4.0971, + "step": 70295 + }, + { + "epoch": 4.776464193504552, + "grad_norm": 0.3472294807434082, + "learning_rate": 4.0319676586492735e-06, + "loss": 3.9469, + "step": 70300 + }, + { + "epoch": 4.776803913575214, + "grad_norm": 0.39081382751464844, + "learning_rate": 4.031543008560946e-06, + "loss": 4.1451, + "step": 70305 + }, + { + "epoch": 4.777143633645876, + "grad_norm": 0.4202113449573517, + "learning_rate": 4.031118358472619e-06, + "loss": 4.1272, + "step": 70310 + }, + { + "epoch": 4.777483353716537, + "grad_norm": 0.2714894115924835, + "learning_rate": 4.030693708384292e-06, + "loss": 3.9518, + "step": 70315 + }, + { + "epoch": 4.777823073787199, + "grad_norm": 0.2880323529243469, + "learning_rate": 4.030269058295965e-06, + "loss": 4.0204, + "step": 70320 + }, + { + "epoch": 4.778162793857861, + "grad_norm": 0.2612344026565552, + "learning_rate": 4.0298444082076375e-06, + "loss": 3.8483, + "step": 70325 + }, + { + "epoch": 4.778502513928522, + "grad_norm": 0.29690778255462646, + "learning_rate": 4.02941975811931e-06, + "loss": 3.8844, + "step": 70330 + }, + { + "epoch": 4.7788422339991845, + "grad_norm": 0.2676931917667389, + "learning_rate": 4.028995108030983e-06, + "loss": 4.0366, + "step": 70335 + }, + { + "epoch": 4.779181954069847, + "grad_norm": 0.3400728106498718, + "learning_rate": 4.028570457942655e-06, + "loss": 3.9762, + "step": 70340 + }, + { + "epoch": 4.779521674140508, + "grad_norm": 0.29487380385398865, + "learning_rate": 4.028145807854329e-06, + "loss": 3.8749, + "step": 70345 + }, + { + "epoch": 4.77986139421117, + "grad_norm": 0.2921297252178192, + "learning_rate": 4.0277211577660015e-06, + "loss": 3.791, + "step": 70350 + }, + { + "epoch": 4.780201114281832, + "grad_norm": 0.2522585391998291, + "learning_rate": 4.0272965076776735e-06, + "loss": 4.0539, + "step": 70355 + }, + { + "epoch": 4.780540834352493, + "grad_norm": 0.22286732494831085, + "learning_rate": 4.026871857589347e-06, + "loss": 3.8427, + "step": 70360 + }, + { + "epoch": 4.780880554423155, + "grad_norm": 0.3245927095413208, + "learning_rate": 4.02644720750102e-06, + "loss": 3.723, + "step": 70365 + }, + { + "epoch": 4.781220274493817, + "grad_norm": 0.22913506627082825, + "learning_rate": 4.026022557412692e-06, + "loss": 3.9863, + "step": 70370 + }, + { + "epoch": 4.7815599945644784, + "grad_norm": 0.2722780406475067, + "learning_rate": 4.025597907324365e-06, + "loss": 4.0869, + "step": 70375 + }, + { + "epoch": 4.7818997146351405, + "grad_norm": 0.3098515272140503, + "learning_rate": 4.025173257236038e-06, + "loss": 4.0577, + "step": 70380 + }, + { + "epoch": 4.782239434705803, + "grad_norm": 0.2472788691520691, + "learning_rate": 4.02474860714771e-06, + "loss": 4.1231, + "step": 70385 + }, + { + "epoch": 4.782579154776464, + "grad_norm": 0.2963663637638092, + "learning_rate": 4.024323957059383e-06, + "loss": 4.0136, + "step": 70390 + }, + { + "epoch": 4.782918874847126, + "grad_norm": 0.6768192052841187, + "learning_rate": 4.023899306971057e-06, + "loss": 4.0004, + "step": 70395 + }, + { + "epoch": 4.783258594917788, + "grad_norm": 0.3455759286880493, + "learning_rate": 4.023474656882729e-06, + "loss": 3.7499, + "step": 70400 + }, + { + "epoch": 4.783598314988449, + "grad_norm": 0.2737925052642822, + "learning_rate": 4.0230500067944015e-06, + "loss": 4.1834, + "step": 70405 + }, + { + "epoch": 4.783938035059111, + "grad_norm": 0.2483319491147995, + "learning_rate": 4.022625356706074e-06, + "loss": 4.0171, + "step": 70410 + }, + { + "epoch": 4.784277755129773, + "grad_norm": 0.23906946182250977, + "learning_rate": 4.022200706617747e-06, + "loss": 4.0875, + "step": 70415 + }, + { + "epoch": 4.7846174752004345, + "grad_norm": 0.2577069103717804, + "learning_rate": 4.02177605652942e-06, + "loss": 4.0183, + "step": 70420 + }, + { + "epoch": 4.7849571952710965, + "grad_norm": 0.5609533190727234, + "learning_rate": 4.021351406441093e-06, + "loss": 3.798, + "step": 70425 + }, + { + "epoch": 4.785296915341759, + "grad_norm": 0.23215356469154358, + "learning_rate": 4.0209267563527655e-06, + "loss": 4.0598, + "step": 70430 + }, + { + "epoch": 4.78563663541242, + "grad_norm": 0.25830981135368347, + "learning_rate": 4.020502106264438e-06, + "loss": 3.9281, + "step": 70435 + }, + { + "epoch": 4.785976355483082, + "grad_norm": 0.4004976451396942, + "learning_rate": 4.020077456176111e-06, + "loss": 3.6263, + "step": 70440 + }, + { + "epoch": 4.786316075553744, + "grad_norm": 0.3010730743408203, + "learning_rate": 4.019652806087784e-06, + "loss": 4.1395, + "step": 70445 + }, + { + "epoch": 4.786655795624405, + "grad_norm": 0.7220010757446289, + "learning_rate": 4.019228155999457e-06, + "loss": 3.9599, + "step": 70450 + }, + { + "epoch": 4.786995515695067, + "grad_norm": 0.2408309429883957, + "learning_rate": 4.0188035059111295e-06, + "loss": 3.7091, + "step": 70455 + }, + { + "epoch": 4.787335235765729, + "grad_norm": 0.2530069351196289, + "learning_rate": 4.018378855822802e-06, + "loss": 3.8772, + "step": 70460 + }, + { + "epoch": 4.7876749558363905, + "grad_norm": 0.30280348658561707, + "learning_rate": 4.017954205734475e-06, + "loss": 3.9944, + "step": 70465 + }, + { + "epoch": 4.7880146759070525, + "grad_norm": 0.26749473810195923, + "learning_rate": 4.017529555646148e-06, + "loss": 3.923, + "step": 70470 + }, + { + "epoch": 4.788354395977715, + "grad_norm": 0.28383100032806396, + "learning_rate": 4.017104905557821e-06, + "loss": 3.8634, + "step": 70475 + }, + { + "epoch": 4.788694116048376, + "grad_norm": 0.24135643243789673, + "learning_rate": 4.0166802554694935e-06, + "loss": 3.7075, + "step": 70480 + }, + { + "epoch": 4.789033836119038, + "grad_norm": 0.2645214796066284, + "learning_rate": 4.016255605381166e-06, + "loss": 4.178, + "step": 70485 + }, + { + "epoch": 4.7893735561897, + "grad_norm": 0.21251797676086426, + "learning_rate": 4.015830955292839e-06, + "loss": 3.7078, + "step": 70490 + }, + { + "epoch": 4.789713276260361, + "grad_norm": 0.30938881635665894, + "learning_rate": 4.015406305204512e-06, + "loss": 3.8033, + "step": 70495 + }, + { + "epoch": 4.790052996331023, + "grad_norm": 0.2705608904361725, + "learning_rate": 4.014981655116185e-06, + "loss": 4.1241, + "step": 70500 + }, + { + "epoch": 4.790392716401685, + "grad_norm": 0.20298396050930023, + "learning_rate": 4.0145570050278575e-06, + "loss": 3.6967, + "step": 70505 + }, + { + "epoch": 4.7907324364723465, + "grad_norm": 0.22329196333885193, + "learning_rate": 4.01413235493953e-06, + "loss": 3.9172, + "step": 70510 + }, + { + "epoch": 4.7910721565430086, + "grad_norm": 0.2545485198497772, + "learning_rate": 4.013707704851203e-06, + "loss": 3.8107, + "step": 70515 + }, + { + "epoch": 4.791411876613671, + "grad_norm": 0.2657466232776642, + "learning_rate": 4.013283054762876e-06, + "loss": 4.1517, + "step": 70520 + }, + { + "epoch": 4.791751596684332, + "grad_norm": 0.26083970069885254, + "learning_rate": 4.012858404674549e-06, + "loss": 3.8712, + "step": 70525 + }, + { + "epoch": 4.792091316754994, + "grad_norm": 0.29614123702049255, + "learning_rate": 4.0124337545862215e-06, + "loss": 3.8133, + "step": 70530 + }, + { + "epoch": 4.792431036825656, + "grad_norm": 0.2800218462944031, + "learning_rate": 4.012009104497894e-06, + "loss": 3.9694, + "step": 70535 + }, + { + "epoch": 4.792770756896317, + "grad_norm": 0.25084608793258667, + "learning_rate": 4.011584454409566e-06, + "loss": 3.8444, + "step": 70540 + }, + { + "epoch": 4.793110476966979, + "grad_norm": 0.5597391128540039, + "learning_rate": 4.01115980432124e-06, + "loss": 3.8316, + "step": 70545 + }, + { + "epoch": 4.793450197037641, + "grad_norm": 0.4534643292427063, + "learning_rate": 4.010735154232913e-06, + "loss": 3.9944, + "step": 70550 + }, + { + "epoch": 4.7937899171083025, + "grad_norm": 0.2650592625141144, + "learning_rate": 4.010310504144585e-06, + "loss": 4.0498, + "step": 70555 + }, + { + "epoch": 4.794129637178965, + "grad_norm": 0.2697436213493347, + "learning_rate": 4.009885854056258e-06, + "loss": 4.0989, + "step": 70560 + }, + { + "epoch": 4.794469357249627, + "grad_norm": 0.38649776577949524, + "learning_rate": 4.009461203967931e-06, + "loss": 4.009, + "step": 70565 + }, + { + "epoch": 4.794809077320288, + "grad_norm": 0.3052136301994324, + "learning_rate": 4.009036553879603e-06, + "loss": 4.0588, + "step": 70570 + }, + { + "epoch": 4.79514879739095, + "grad_norm": 0.27435585856437683, + "learning_rate": 4.008611903791277e-06, + "loss": 3.9356, + "step": 70575 + }, + { + "epoch": 4.795488517461612, + "grad_norm": 0.34690359234809875, + "learning_rate": 4.0081872537029495e-06, + "loss": 3.8561, + "step": 70580 + }, + { + "epoch": 4.795828237532273, + "grad_norm": 0.22804206609725952, + "learning_rate": 4.0077626036146215e-06, + "loss": 3.8563, + "step": 70585 + }, + { + "epoch": 4.796167957602935, + "grad_norm": 0.24772056937217712, + "learning_rate": 4.007337953526294e-06, + "loss": 4.1345, + "step": 70590 + }, + { + "epoch": 4.796507677673597, + "grad_norm": 0.26481688022613525, + "learning_rate": 4.006913303437968e-06, + "loss": 3.8501, + "step": 70595 + }, + { + "epoch": 4.7968473977442585, + "grad_norm": 0.24607187509536743, + "learning_rate": 4.00648865334964e-06, + "loss": 3.9485, + "step": 70600 + }, + { + "epoch": 4.797187117814921, + "grad_norm": 0.20004862546920776, + "learning_rate": 4.006064003261313e-06, + "loss": 3.8166, + "step": 70605 + }, + { + "epoch": 4.797526837885583, + "grad_norm": 0.3468223810195923, + "learning_rate": 4.005639353172986e-06, + "loss": 3.9848, + "step": 70610 + }, + { + "epoch": 4.797866557956244, + "grad_norm": 0.25943344831466675, + "learning_rate": 4.005214703084658e-06, + "loss": 3.7696, + "step": 70615 + }, + { + "epoch": 4.798206278026906, + "grad_norm": 0.35691359639167786, + "learning_rate": 4.004790052996331e-06, + "loss": 4.0584, + "step": 70620 + }, + { + "epoch": 4.798545998097568, + "grad_norm": 0.303670197725296, + "learning_rate": 4.004365402908004e-06, + "loss": 3.9671, + "step": 70625 + }, + { + "epoch": 4.798885718168229, + "grad_norm": 0.29346445202827454, + "learning_rate": 4.003940752819677e-06, + "loss": 3.734, + "step": 70630 + }, + { + "epoch": 4.799225438238891, + "grad_norm": 0.2621760666370392, + "learning_rate": 4.0035161027313495e-06, + "loss": 3.9013, + "step": 70635 + }, + { + "epoch": 4.799565158309553, + "grad_norm": 0.47371000051498413, + "learning_rate": 4.003091452643022e-06, + "loss": 3.8519, + "step": 70640 + }, + { + "epoch": 4.7999048783802145, + "grad_norm": 0.46555837988853455, + "learning_rate": 4.002666802554696e-06, + "loss": 3.9221, + "step": 70645 + }, + { + "epoch": 4.800244598450877, + "grad_norm": 0.23174646496772766, + "learning_rate": 4.002242152466368e-06, + "loss": 3.8282, + "step": 70650 + }, + { + "epoch": 4.800584318521539, + "grad_norm": 0.31646087765693665, + "learning_rate": 4.001817502378041e-06, + "loss": 3.937, + "step": 70655 + }, + { + "epoch": 4.8009240385922, + "grad_norm": 0.2364882379770279, + "learning_rate": 4.0013928522897135e-06, + "loss": 4.0388, + "step": 70660 + }, + { + "epoch": 4.801263758662862, + "grad_norm": 0.21606752276420593, + "learning_rate": 4.000968202201386e-06, + "loss": 4.0195, + "step": 70665 + }, + { + "epoch": 4.801603478733524, + "grad_norm": 0.38232746720314026, + "learning_rate": 4.000543552113059e-06, + "loss": 3.8529, + "step": 70670 + }, + { + "epoch": 4.801943198804185, + "grad_norm": 0.24610033631324768, + "learning_rate": 4.000118902024732e-06, + "loss": 4.1431, + "step": 70675 + }, + { + "epoch": 4.802282918874847, + "grad_norm": 0.22598163783550262, + "learning_rate": 3.999694251936405e-06, + "loss": 3.9392, + "step": 70680 + }, + { + "epoch": 4.802622638945509, + "grad_norm": 0.2114134281873703, + "learning_rate": 3.9992696018480775e-06, + "loss": 3.7255, + "step": 70685 + }, + { + "epoch": 4.8029623590161705, + "grad_norm": 0.2936353087425232, + "learning_rate": 3.99884495175975e-06, + "loss": 3.9126, + "step": 70690 + }, + { + "epoch": 4.803302079086833, + "grad_norm": 0.2974449396133423, + "learning_rate": 3.998420301671423e-06, + "loss": 4.0556, + "step": 70695 + }, + { + "epoch": 4.803641799157495, + "grad_norm": 0.3115690052509308, + "learning_rate": 3.997995651583096e-06, + "loss": 3.8846, + "step": 70700 + }, + { + "epoch": 4.803981519228156, + "grad_norm": 0.460497111082077, + "learning_rate": 3.997571001494769e-06, + "loss": 3.822, + "step": 70705 + }, + { + "epoch": 4.804321239298818, + "grad_norm": 0.25271886587142944, + "learning_rate": 3.9971463514064415e-06, + "loss": 3.9486, + "step": 70710 + }, + { + "epoch": 4.80466095936948, + "grad_norm": 0.3034849762916565, + "learning_rate": 3.996721701318114e-06, + "loss": 3.934, + "step": 70715 + }, + { + "epoch": 4.805000679440141, + "grad_norm": 0.2369827926158905, + "learning_rate": 3.996297051229787e-06, + "loss": 3.9914, + "step": 70720 + }, + { + "epoch": 4.805340399510803, + "grad_norm": 0.20041631162166595, + "learning_rate": 3.99587240114146e-06, + "loss": 3.7761, + "step": 70725 + }, + { + "epoch": 4.805680119581465, + "grad_norm": 0.28186124563217163, + "learning_rate": 3.995447751053133e-06, + "loss": 4.0417, + "step": 70730 + }, + { + "epoch": 4.8060198396521265, + "grad_norm": 0.2507608234882355, + "learning_rate": 3.9950231009648055e-06, + "loss": 3.8901, + "step": 70735 + }, + { + "epoch": 4.806359559722789, + "grad_norm": 0.3381914794445038, + "learning_rate": 3.994598450876478e-06, + "loss": 3.996, + "step": 70740 + }, + { + "epoch": 4.806699279793451, + "grad_norm": 0.387928307056427, + "learning_rate": 3.994173800788151e-06, + "loss": 4.1306, + "step": 70745 + }, + { + "epoch": 4.807038999864112, + "grad_norm": 0.27404525876045227, + "learning_rate": 3.993749150699824e-06, + "loss": 3.9864, + "step": 70750 + }, + { + "epoch": 4.807378719934774, + "grad_norm": 0.24434107542037964, + "learning_rate": 3.993324500611496e-06, + "loss": 3.8835, + "step": 70755 + }, + { + "epoch": 4.807718440005435, + "grad_norm": 0.3084767460823059, + "learning_rate": 3.9928998505231695e-06, + "loss": 3.8583, + "step": 70760 + }, + { + "epoch": 4.808058160076097, + "grad_norm": 0.23105604946613312, + "learning_rate": 3.992475200434842e-06, + "loss": 4.0957, + "step": 70765 + }, + { + "epoch": 4.808397880146759, + "grad_norm": 0.31689372658729553, + "learning_rate": 3.992050550346514e-06, + "loss": 4.0437, + "step": 70770 + }, + { + "epoch": 4.8087376002174205, + "grad_norm": 0.2529516816139221, + "learning_rate": 3.991625900258188e-06, + "loss": 4.0186, + "step": 70775 + }, + { + "epoch": 4.8090773202880825, + "grad_norm": 0.22197134792804718, + "learning_rate": 3.991201250169861e-06, + "loss": 3.8732, + "step": 70780 + }, + { + "epoch": 4.809417040358745, + "grad_norm": 0.20107465982437134, + "learning_rate": 3.990776600081533e-06, + "loss": 4.0846, + "step": 70785 + }, + { + "epoch": 4.809756760429406, + "grad_norm": 0.2696690559387207, + "learning_rate": 3.9903519499932054e-06, + "loss": 3.947, + "step": 70790 + }, + { + "epoch": 4.810096480500068, + "grad_norm": 0.34109941124916077, + "learning_rate": 3.989927299904879e-06, + "loss": 3.8633, + "step": 70795 + }, + { + "epoch": 4.81043620057073, + "grad_norm": 0.29334282875061035, + "learning_rate": 3.989502649816551e-06, + "loss": 3.8124, + "step": 70800 + }, + { + "epoch": 4.810775920641391, + "grad_norm": 0.3186081051826477, + "learning_rate": 3.989077999728224e-06, + "loss": 3.9237, + "step": 70805 + }, + { + "epoch": 4.811115640712053, + "grad_norm": 0.2796672284603119, + "learning_rate": 3.9886533496398975e-06, + "loss": 4.2371, + "step": 70810 + }, + { + "epoch": 4.811455360782715, + "grad_norm": 0.2521061897277832, + "learning_rate": 3.98822869955157e-06, + "loss": 4.1116, + "step": 70815 + }, + { + "epoch": 4.8117950808533765, + "grad_norm": 0.23809748888015747, + "learning_rate": 3.987804049463242e-06, + "loss": 3.9007, + "step": 70820 + }, + { + "epoch": 4.812134800924039, + "grad_norm": 0.27243173122406006, + "learning_rate": 3.987379399374915e-06, + "loss": 3.9366, + "step": 70825 + }, + { + "epoch": 4.812474520994701, + "grad_norm": 0.31833964586257935, + "learning_rate": 3.986954749286589e-06, + "loss": 3.9201, + "step": 70830 + }, + { + "epoch": 4.812814241065362, + "grad_norm": 0.2912781238555908, + "learning_rate": 3.986530099198261e-06, + "loss": 4.0596, + "step": 70835 + }, + { + "epoch": 4.813153961136024, + "grad_norm": 0.29814082384109497, + "learning_rate": 3.9861054491099335e-06, + "loss": 3.9242, + "step": 70840 + }, + { + "epoch": 4.813493681206686, + "grad_norm": 0.3265042006969452, + "learning_rate": 3.985680799021607e-06, + "loss": 4.2481, + "step": 70845 + }, + { + "epoch": 4.813833401277347, + "grad_norm": 0.22511295974254608, + "learning_rate": 3.985256148933279e-06, + "loss": 3.834, + "step": 70850 + }, + { + "epoch": 4.814173121348009, + "grad_norm": 0.27035412192344666, + "learning_rate": 3.984831498844952e-06, + "loss": 4.1907, + "step": 70855 + }, + { + "epoch": 4.814512841418671, + "grad_norm": 0.18496514856815338, + "learning_rate": 3.9844068487566255e-06, + "loss": 3.8638, + "step": 70860 + }, + { + "epoch": 4.8148525614893325, + "grad_norm": 0.29099100828170776, + "learning_rate": 3.9839821986682975e-06, + "loss": 4.0466, + "step": 70865 + }, + { + "epoch": 4.815192281559995, + "grad_norm": 0.27577823400497437, + "learning_rate": 3.98355754857997e-06, + "loss": 4.0019, + "step": 70870 + }, + { + "epoch": 4.815532001630657, + "grad_norm": 0.2093251347541809, + "learning_rate": 3.983132898491643e-06, + "loss": 3.6772, + "step": 70875 + }, + { + "epoch": 4.815871721701318, + "grad_norm": 0.27498358488082886, + "learning_rate": 3.982708248403316e-06, + "loss": 3.9983, + "step": 70880 + }, + { + "epoch": 4.81621144177198, + "grad_norm": 0.2094179391860962, + "learning_rate": 3.982283598314989e-06, + "loss": 3.8764, + "step": 70885 + }, + { + "epoch": 4.816551161842642, + "grad_norm": 0.3272535800933838, + "learning_rate": 3.9818589482266615e-06, + "loss": 3.9819, + "step": 70890 + }, + { + "epoch": 4.816890881913303, + "grad_norm": 0.24996136128902435, + "learning_rate": 3.981434298138334e-06, + "loss": 4.1245, + "step": 70895 + }, + { + "epoch": 4.817230601983965, + "grad_norm": 0.27132585644721985, + "learning_rate": 3.981009648050007e-06, + "loss": 4.1113, + "step": 70900 + }, + { + "epoch": 4.817570322054627, + "grad_norm": 0.29847660660743713, + "learning_rate": 3.98058499796168e-06, + "loss": 4.1907, + "step": 70905 + }, + { + "epoch": 4.8179100421252885, + "grad_norm": 0.3405512869358063, + "learning_rate": 3.980160347873353e-06, + "loss": 4.2674, + "step": 70910 + }, + { + "epoch": 4.818249762195951, + "grad_norm": 0.42902907729148865, + "learning_rate": 3.9797356977850255e-06, + "loss": 4.0815, + "step": 70915 + }, + { + "epoch": 4.818589482266613, + "grad_norm": 0.26296117901802063, + "learning_rate": 3.979311047696698e-06, + "loss": 3.6634, + "step": 70920 + }, + { + "epoch": 4.818929202337274, + "grad_norm": 0.29671815037727356, + "learning_rate": 3.978886397608371e-06, + "loss": 3.9764, + "step": 70925 + }, + { + "epoch": 4.819268922407936, + "grad_norm": 0.3104236423969269, + "learning_rate": 3.978461747520044e-06, + "loss": 3.9562, + "step": 70930 + }, + { + "epoch": 4.819608642478598, + "grad_norm": 0.3124268054962158, + "learning_rate": 3.978037097431717e-06, + "loss": 3.807, + "step": 70935 + }, + { + "epoch": 4.819948362549259, + "grad_norm": 0.2796107232570648, + "learning_rate": 3.9776124473433895e-06, + "loss": 3.9786, + "step": 70940 + }, + { + "epoch": 4.820288082619921, + "grad_norm": 0.2735986113548279, + "learning_rate": 3.977187797255062e-06, + "loss": 4.1076, + "step": 70945 + }, + { + "epoch": 4.820627802690583, + "grad_norm": 0.24404574930667877, + "learning_rate": 3.976763147166735e-06, + "loss": 4.0335, + "step": 70950 + }, + { + "epoch": 4.8209675227612445, + "grad_norm": 0.2784600853919983, + "learning_rate": 3.976338497078408e-06, + "loss": 3.8815, + "step": 70955 + }, + { + "epoch": 4.821307242831907, + "grad_norm": 0.20766131579875946, + "learning_rate": 3.975913846990081e-06, + "loss": 3.8237, + "step": 70960 + }, + { + "epoch": 4.821646962902569, + "grad_norm": 0.24091991782188416, + "learning_rate": 3.9754891969017535e-06, + "loss": 4.0148, + "step": 70965 + }, + { + "epoch": 4.82198668297323, + "grad_norm": 0.2484077364206314, + "learning_rate": 3.9750645468134254e-06, + "loss": 3.9974, + "step": 70970 + }, + { + "epoch": 4.822326403043892, + "grad_norm": 0.2508905529975891, + "learning_rate": 3.974639896725099e-06, + "loss": 3.7329, + "step": 70975 + }, + { + "epoch": 4.822666123114553, + "grad_norm": 0.2161853015422821, + "learning_rate": 3.974215246636772e-06, + "loss": 3.6505, + "step": 70980 + }, + { + "epoch": 4.823005843185215, + "grad_norm": 0.2755151093006134, + "learning_rate": 3.973790596548445e-06, + "loss": 3.9091, + "step": 70985 + }, + { + "epoch": 4.823345563255877, + "grad_norm": 0.22358037531375885, + "learning_rate": 3.9733659464601175e-06, + "loss": 3.8445, + "step": 70990 + }, + { + "epoch": 4.8236852833265385, + "grad_norm": 0.36120516061782837, + "learning_rate": 3.97294129637179e-06, + "loss": 4.0182, + "step": 70995 + }, + { + "epoch": 4.8240250033972005, + "grad_norm": 0.32993125915527344, + "learning_rate": 3.972516646283463e-06, + "loss": 3.9853, + "step": 71000 + }, + { + "epoch": 4.824364723467863, + "grad_norm": 0.22196272015571594, + "learning_rate": 3.972091996195135e-06, + "loss": 3.7526, + "step": 71005 + }, + { + "epoch": 4.824704443538524, + "grad_norm": 0.3541905879974365, + "learning_rate": 3.971667346106809e-06, + "loss": 4.1674, + "step": 71010 + }, + { + "epoch": 4.825044163609186, + "grad_norm": 0.27191445231437683, + "learning_rate": 3.9712426960184815e-06, + "loss": 3.742, + "step": 71015 + }, + { + "epoch": 4.825383883679848, + "grad_norm": 0.2935918867588043, + "learning_rate": 3.9708180459301534e-06, + "loss": 3.9231, + "step": 71020 + }, + { + "epoch": 4.825723603750509, + "grad_norm": 0.2625461518764496, + "learning_rate": 3.970393395841827e-06, + "loss": 4.0107, + "step": 71025 + }, + { + "epoch": 4.826063323821171, + "grad_norm": 0.270824670791626, + "learning_rate": 3.9699687457535e-06, + "loss": 3.9873, + "step": 71030 + }, + { + "epoch": 4.826403043891833, + "grad_norm": 0.33347615599632263, + "learning_rate": 3.969544095665172e-06, + "loss": 4.0366, + "step": 71035 + }, + { + "epoch": 4.8267427639624945, + "grad_norm": 0.21781045198440552, + "learning_rate": 3.969119445576845e-06, + "loss": 3.7838, + "step": 71040 + }, + { + "epoch": 4.8270824840331565, + "grad_norm": 0.25739213824272156, + "learning_rate": 3.968694795488518e-06, + "loss": 3.7984, + "step": 71045 + }, + { + "epoch": 4.827422204103819, + "grad_norm": 0.24744145572185516, + "learning_rate": 3.96827014540019e-06, + "loss": 4.0034, + "step": 71050 + }, + { + "epoch": 4.82776192417448, + "grad_norm": 0.29543057084083557, + "learning_rate": 3.967845495311863e-06, + "loss": 3.8137, + "step": 71055 + }, + { + "epoch": 4.828101644245142, + "grad_norm": 0.23388905823230743, + "learning_rate": 3.967420845223537e-06, + "loss": 4.0608, + "step": 71060 + }, + { + "epoch": 4.828441364315804, + "grad_norm": 0.2894444763660431, + "learning_rate": 3.966996195135209e-06, + "loss": 4.1129, + "step": 71065 + }, + { + "epoch": 4.828781084386465, + "grad_norm": 0.2697037160396576, + "learning_rate": 3.9665715450468814e-06, + "loss": 4.055, + "step": 71070 + }, + { + "epoch": 4.829120804457127, + "grad_norm": 0.32797950506210327, + "learning_rate": 3.966146894958554e-06, + "loss": 3.9087, + "step": 71075 + }, + { + "epoch": 4.829460524527789, + "grad_norm": 0.2798866033554077, + "learning_rate": 3.965722244870227e-06, + "loss": 3.8648, + "step": 71080 + }, + { + "epoch": 4.8298002445984505, + "grad_norm": 0.24076057970523834, + "learning_rate": 3.9652975947819e-06, + "loss": 4.0677, + "step": 71085 + }, + { + "epoch": 4.8301399646691126, + "grad_norm": 0.3119674623012543, + "learning_rate": 3.964872944693573e-06, + "loss": 4.0778, + "step": 71090 + }, + { + "epoch": 4.830479684739775, + "grad_norm": 0.3248090445995331, + "learning_rate": 3.9644482946052454e-06, + "loss": 3.91, + "step": 71095 + }, + { + "epoch": 4.830819404810436, + "grad_norm": 0.23231779038906097, + "learning_rate": 3.964023644516918e-06, + "loss": 3.9834, + "step": 71100 + }, + { + "epoch": 4.831159124881098, + "grad_norm": 0.28041690587997437, + "learning_rate": 3.963598994428591e-06, + "loss": 4.0695, + "step": 71105 + }, + { + "epoch": 4.83149884495176, + "grad_norm": 0.27267247438430786, + "learning_rate": 3.963174344340264e-06, + "loss": 4.1049, + "step": 71110 + }, + { + "epoch": 4.831838565022421, + "grad_norm": 0.2759474217891693, + "learning_rate": 3.962749694251937e-06, + "loss": 3.9325, + "step": 71115 + }, + { + "epoch": 4.832178285093083, + "grad_norm": 0.24838867783546448, + "learning_rate": 3.9623250441636094e-06, + "loss": 3.7957, + "step": 71120 + }, + { + "epoch": 4.832518005163745, + "grad_norm": 0.2508354187011719, + "learning_rate": 3.961900394075282e-06, + "loss": 4.1006, + "step": 71125 + }, + { + "epoch": 4.8328577252344065, + "grad_norm": 0.2536979615688324, + "learning_rate": 3.961475743986955e-06, + "loss": 3.8655, + "step": 71130 + }, + { + "epoch": 4.833197445305069, + "grad_norm": 0.2033654749393463, + "learning_rate": 3.961051093898628e-06, + "loss": 4.032, + "step": 71135 + }, + { + "epoch": 4.833537165375731, + "grad_norm": 0.2769116163253784, + "learning_rate": 3.960626443810301e-06, + "loss": 3.6337, + "step": 71140 + }, + { + "epoch": 4.833876885446392, + "grad_norm": 0.2541811466217041, + "learning_rate": 3.9602017937219735e-06, + "loss": 3.9585, + "step": 71145 + }, + { + "epoch": 4.834216605517054, + "grad_norm": 0.219735786318779, + "learning_rate": 3.959777143633646e-06, + "loss": 3.8695, + "step": 71150 + }, + { + "epoch": 4.834556325587716, + "grad_norm": 0.40764880180358887, + "learning_rate": 3.959352493545319e-06, + "loss": 3.7414, + "step": 71155 + }, + { + "epoch": 4.834896045658377, + "grad_norm": 0.2461242973804474, + "learning_rate": 3.958927843456992e-06, + "loss": 3.7149, + "step": 71160 + }, + { + "epoch": 4.835235765729039, + "grad_norm": 0.3328387141227722, + "learning_rate": 3.958503193368665e-06, + "loss": 4.122, + "step": 71165 + }, + { + "epoch": 4.835575485799701, + "grad_norm": 0.2293035089969635, + "learning_rate": 3.9580785432803375e-06, + "loss": 3.9448, + "step": 71170 + }, + { + "epoch": 4.8359152058703625, + "grad_norm": 0.4164268672466278, + "learning_rate": 3.95765389319201e-06, + "loss": 3.9919, + "step": 71175 + }, + { + "epoch": 4.836254925941025, + "grad_norm": 0.18334677815437317, + "learning_rate": 3.957229243103683e-06, + "loss": 3.7642, + "step": 71180 + }, + { + "epoch": 4.836594646011687, + "grad_norm": 0.33171209692955017, + "learning_rate": 3.956804593015356e-06, + "loss": 4.0816, + "step": 71185 + }, + { + "epoch": 4.836934366082348, + "grad_norm": 0.2404613047838211, + "learning_rate": 3.956379942927029e-06, + "loss": 4.0722, + "step": 71190 + }, + { + "epoch": 4.83727408615301, + "grad_norm": 0.2946581542491913, + "learning_rate": 3.9559552928387015e-06, + "loss": 3.7994, + "step": 71195 + }, + { + "epoch": 4.837613806223672, + "grad_norm": 0.26723936200141907, + "learning_rate": 3.955530642750374e-06, + "loss": 4.0049, + "step": 71200 + }, + { + "epoch": 4.837953526294333, + "grad_norm": 0.3253863751888275, + "learning_rate": 3.955105992662047e-06, + "loss": 3.9208, + "step": 71205 + }, + { + "epoch": 4.838293246364995, + "grad_norm": 0.4154955744743347, + "learning_rate": 3.95468134257372e-06, + "loss": 3.9053, + "step": 71210 + }, + { + "epoch": 4.838632966435657, + "grad_norm": 0.26385629177093506, + "learning_rate": 3.954256692485393e-06, + "loss": 3.8765, + "step": 71215 + }, + { + "epoch": 4.8389726865063185, + "grad_norm": 0.21962720155715942, + "learning_rate": 3.953832042397065e-06, + "loss": 3.8504, + "step": 71220 + }, + { + "epoch": 4.839312406576981, + "grad_norm": 0.2767835259437561, + "learning_rate": 3.953407392308738e-06, + "loss": 3.7472, + "step": 71225 + }, + { + "epoch": 4.839652126647643, + "grad_norm": 0.20690380036830902, + "learning_rate": 3.952982742220411e-06, + "loss": 3.7491, + "step": 71230 + }, + { + "epoch": 4.839991846718304, + "grad_norm": 0.2733159363269806, + "learning_rate": 3.952558092132083e-06, + "loss": 4.0254, + "step": 71235 + }, + { + "epoch": 4.840331566788966, + "grad_norm": 0.27105382084846497, + "learning_rate": 3.952133442043757e-06, + "loss": 3.9326, + "step": 71240 + }, + { + "epoch": 4.840671286859628, + "grad_norm": 0.36180931329727173, + "learning_rate": 3.9517087919554295e-06, + "loss": 3.7859, + "step": 71245 + }, + { + "epoch": 4.841011006930289, + "grad_norm": 0.21975359320640564, + "learning_rate": 3.951284141867101e-06, + "loss": 3.9946, + "step": 71250 + }, + { + "epoch": 4.841350727000951, + "grad_norm": 0.354133665561676, + "learning_rate": 3.950859491778774e-06, + "loss": 4.0858, + "step": 71255 + }, + { + "epoch": 4.841690447071613, + "grad_norm": 0.23910602927207947, + "learning_rate": 3.950434841690448e-06, + "loss": 3.8058, + "step": 71260 + }, + { + "epoch": 4.8420301671422745, + "grad_norm": 0.29319995641708374, + "learning_rate": 3.95001019160212e-06, + "loss": 4.0995, + "step": 71265 + }, + { + "epoch": 4.842369887212937, + "grad_norm": 0.24537880718708038, + "learning_rate": 3.949585541513793e-06, + "loss": 3.7582, + "step": 71270 + }, + { + "epoch": 4.842709607283599, + "grad_norm": 0.25063103437423706, + "learning_rate": 3.949160891425466e-06, + "loss": 4.0026, + "step": 71275 + }, + { + "epoch": 4.84304932735426, + "grad_norm": 0.22302448749542236, + "learning_rate": 3.948736241337138e-06, + "loss": 3.948, + "step": 71280 + }, + { + "epoch": 4.843389047424922, + "grad_norm": 0.2937881052494049, + "learning_rate": 3.948311591248811e-06, + "loss": 3.925, + "step": 71285 + }, + { + "epoch": 4.843728767495584, + "grad_norm": 0.5902546048164368, + "learning_rate": 3.947886941160484e-06, + "loss": 3.9271, + "step": 71290 + }, + { + "epoch": 4.844068487566245, + "grad_norm": 0.23073634505271912, + "learning_rate": 3.947462291072157e-06, + "loss": 3.9116, + "step": 71295 + }, + { + "epoch": 4.844408207636907, + "grad_norm": 0.25512275099754333, + "learning_rate": 3.9470376409838294e-06, + "loss": 3.926, + "step": 71300 + }, + { + "epoch": 4.844747927707569, + "grad_norm": 0.19912153482437134, + "learning_rate": 3.946612990895502e-06, + "loss": 4.1612, + "step": 71305 + }, + { + "epoch": 4.8450876477782305, + "grad_norm": 0.26652535796165466, + "learning_rate": 3.946188340807175e-06, + "loss": 3.9432, + "step": 71310 + }, + { + "epoch": 4.845427367848893, + "grad_norm": 0.33028244972229004, + "learning_rate": 3.945763690718848e-06, + "loss": 4.0095, + "step": 71315 + }, + { + "epoch": 4.845767087919555, + "grad_norm": 0.25776049494743347, + "learning_rate": 3.945339040630521e-06, + "loss": 4.1064, + "step": 71320 + }, + { + "epoch": 4.846106807990216, + "grad_norm": 0.2539961636066437, + "learning_rate": 3.9449143905421934e-06, + "loss": 4.1343, + "step": 71325 + }, + { + "epoch": 4.846446528060878, + "grad_norm": 1.0985581874847412, + "learning_rate": 3.944489740453866e-06, + "loss": 4.1636, + "step": 71330 + }, + { + "epoch": 4.84678624813154, + "grad_norm": 0.2863749563694, + "learning_rate": 3.944065090365539e-06, + "loss": 4.0241, + "step": 71335 + }, + { + "epoch": 4.847125968202201, + "grad_norm": 0.2882736623287201, + "learning_rate": 3.943640440277212e-06, + "loss": 3.8096, + "step": 71340 + }, + { + "epoch": 4.847465688272863, + "grad_norm": 0.2359284609556198, + "learning_rate": 3.943215790188885e-06, + "loss": 4.0116, + "step": 71345 + }, + { + "epoch": 4.847805408343525, + "grad_norm": 0.23088929057121277, + "learning_rate": 3.9427911401005574e-06, + "loss": 3.9406, + "step": 71350 + }, + { + "epoch": 4.8481451284141865, + "grad_norm": 0.29069212079048157, + "learning_rate": 3.94236649001223e-06, + "loss": 4.0944, + "step": 71355 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.4346142113208771, + "learning_rate": 3.941941839923903e-06, + "loss": 3.7657, + "step": 71360 + }, + { + "epoch": 4.848824568555511, + "grad_norm": 0.23012147843837738, + "learning_rate": 3.941517189835576e-06, + "loss": 3.9428, + "step": 71365 + }, + { + "epoch": 4.849164288626172, + "grad_norm": 0.22713467478752136, + "learning_rate": 3.941092539747249e-06, + "loss": 3.8231, + "step": 71370 + }, + { + "epoch": 4.849504008696834, + "grad_norm": 0.24707196652889252, + "learning_rate": 3.9406678896589214e-06, + "loss": 3.8958, + "step": 71375 + }, + { + "epoch": 4.849843728767496, + "grad_norm": 0.24032452702522278, + "learning_rate": 3.940243239570594e-06, + "loss": 3.9495, + "step": 71380 + }, + { + "epoch": 4.850183448838157, + "grad_norm": 0.30279529094696045, + "learning_rate": 3.939818589482267e-06, + "loss": 3.74, + "step": 71385 + }, + { + "epoch": 4.850523168908819, + "grad_norm": 0.22876758873462677, + "learning_rate": 3.93939393939394e-06, + "loss": 3.8561, + "step": 71390 + }, + { + "epoch": 4.850862888979481, + "grad_norm": 0.4071882963180542, + "learning_rate": 3.938969289305613e-06, + "loss": 3.7483, + "step": 71395 + }, + { + "epoch": 4.8512026090501426, + "grad_norm": 0.2597973942756653, + "learning_rate": 3.9385446392172854e-06, + "loss": 3.8333, + "step": 71400 + }, + { + "epoch": 4.851542329120805, + "grad_norm": 0.3537850081920624, + "learning_rate": 3.938119989128958e-06, + "loss": 4.0262, + "step": 71405 + }, + { + "epoch": 4.851882049191467, + "grad_norm": 0.5920988321304321, + "learning_rate": 3.937695339040631e-06, + "loss": 3.8486, + "step": 71410 + }, + { + "epoch": 4.852221769262128, + "grad_norm": 0.2550630569458008, + "learning_rate": 3.937270688952304e-06, + "loss": 4.1298, + "step": 71415 + }, + { + "epoch": 4.85256148933279, + "grad_norm": 0.2395174205303192, + "learning_rate": 3.936846038863976e-06, + "loss": 3.9198, + "step": 71420 + }, + { + "epoch": 4.852901209403452, + "grad_norm": 0.22731927037239075, + "learning_rate": 3.9364213887756494e-06, + "loss": 3.7932, + "step": 71425 + }, + { + "epoch": 4.853240929474113, + "grad_norm": 0.26068204641342163, + "learning_rate": 3.935996738687322e-06, + "loss": 4.0555, + "step": 71430 + }, + { + "epoch": 4.853580649544775, + "grad_norm": 0.27182936668395996, + "learning_rate": 3.935572088598994e-06, + "loss": 4.147, + "step": 71435 + }, + { + "epoch": 4.8539203696154365, + "grad_norm": 0.2326119989156723, + "learning_rate": 3.935147438510668e-06, + "loss": 3.9553, + "step": 71440 + }, + { + "epoch": 4.854260089686099, + "grad_norm": 0.33735108375549316, + "learning_rate": 3.934722788422341e-06, + "loss": 3.703, + "step": 71445 + }, + { + "epoch": 4.854599809756761, + "grad_norm": 0.2319013923406601, + "learning_rate": 3.934298138334013e-06, + "loss": 4.0212, + "step": 71450 + }, + { + "epoch": 4.854939529827422, + "grad_norm": 0.3557378351688385, + "learning_rate": 3.933873488245685e-06, + "loss": 3.7311, + "step": 71455 + }, + { + "epoch": 4.855279249898084, + "grad_norm": 0.31177470088005066, + "learning_rate": 3.933448838157359e-06, + "loss": 3.9118, + "step": 71460 + }, + { + "epoch": 4.855618969968746, + "grad_norm": 0.2708454430103302, + "learning_rate": 3.933024188069031e-06, + "loss": 3.9217, + "step": 71465 + }, + { + "epoch": 4.855958690039407, + "grad_norm": 0.29946455359458923, + "learning_rate": 3.932599537980704e-06, + "loss": 3.8931, + "step": 71470 + }, + { + "epoch": 4.856298410110069, + "grad_norm": 0.3305034041404724, + "learning_rate": 3.9321748878923775e-06, + "loss": 4.122, + "step": 71475 + }, + { + "epoch": 4.856638130180731, + "grad_norm": 0.19419781863689423, + "learning_rate": 3.931750237804049e-06, + "loss": 3.7517, + "step": 71480 + }, + { + "epoch": 4.8569778502513925, + "grad_norm": 0.2687338590621948, + "learning_rate": 3.931325587715722e-06, + "loss": 4.1782, + "step": 71485 + }, + { + "epoch": 4.857317570322055, + "grad_norm": 0.36826783418655396, + "learning_rate": 3.930900937627396e-06, + "loss": 4.0315, + "step": 71490 + }, + { + "epoch": 4.857657290392717, + "grad_norm": 0.2557993531227112, + "learning_rate": 3.930476287539069e-06, + "loss": 3.8588, + "step": 71495 + }, + { + "epoch": 4.857997010463378, + "grad_norm": 0.24103255569934845, + "learning_rate": 3.930051637450741e-06, + "loss": 4.0185, + "step": 71500 + }, + { + "epoch": 4.85833673053404, + "grad_norm": 0.4437389075756073, + "learning_rate": 3.929626987362413e-06, + "loss": 3.8026, + "step": 71505 + }, + { + "epoch": 4.858676450604702, + "grad_norm": 0.27605730295181274, + "learning_rate": 3.929202337274087e-06, + "loss": 3.6649, + "step": 71510 + }, + { + "epoch": 4.859016170675363, + "grad_norm": 0.3301515579223633, + "learning_rate": 3.928777687185759e-06, + "loss": 3.7512, + "step": 71515 + }, + { + "epoch": 4.859355890746025, + "grad_norm": 0.22934144735336304, + "learning_rate": 3.928353037097432e-06, + "loss": 3.7584, + "step": 71520 + }, + { + "epoch": 4.859695610816687, + "grad_norm": 0.334562212228775, + "learning_rate": 3.9279283870091055e-06, + "loss": 3.9793, + "step": 71525 + }, + { + "epoch": 4.8600353308873485, + "grad_norm": 0.27747228741645813, + "learning_rate": 3.927503736920777e-06, + "loss": 3.713, + "step": 71530 + }, + { + "epoch": 4.860375050958011, + "grad_norm": 0.3027026355266571, + "learning_rate": 3.92707908683245e-06, + "loss": 4.0655, + "step": 71535 + }, + { + "epoch": 4.860714771028673, + "grad_norm": 0.383624792098999, + "learning_rate": 3.926654436744123e-06, + "loss": 4.0862, + "step": 71540 + }, + { + "epoch": 4.861054491099334, + "grad_norm": 0.2601046562194824, + "learning_rate": 3.926229786655796e-06, + "loss": 4.0834, + "step": 71545 + }, + { + "epoch": 4.861394211169996, + "grad_norm": 0.23961979150772095, + "learning_rate": 3.925805136567469e-06, + "loss": 3.9764, + "step": 71550 + }, + { + "epoch": 4.861733931240658, + "grad_norm": 0.2882446348667145, + "learning_rate": 3.925380486479141e-06, + "loss": 4.0152, + "step": 71555 + }, + { + "epoch": 4.862073651311319, + "grad_norm": 0.4331667721271515, + "learning_rate": 3.924955836390814e-06, + "loss": 3.7125, + "step": 71560 + }, + { + "epoch": 4.862413371381981, + "grad_norm": 0.3422127366065979, + "learning_rate": 3.924531186302487e-06, + "loss": 4.0841, + "step": 71565 + }, + { + "epoch": 4.862753091452643, + "grad_norm": 0.2782338559627533, + "learning_rate": 3.92410653621416e-06, + "loss": 3.8711, + "step": 71570 + }, + { + "epoch": 4.8630928115233045, + "grad_norm": 0.2582123577594757, + "learning_rate": 3.923681886125833e-06, + "loss": 4.0121, + "step": 71575 + }, + { + "epoch": 4.863432531593967, + "grad_norm": 0.2781153917312622, + "learning_rate": 3.9232572360375054e-06, + "loss": 4.0009, + "step": 71580 + }, + { + "epoch": 4.863772251664629, + "grad_norm": 0.2969920337200165, + "learning_rate": 3.922832585949178e-06, + "loss": 4.1179, + "step": 71585 + }, + { + "epoch": 4.86411197173529, + "grad_norm": 0.23126475512981415, + "learning_rate": 3.922407935860851e-06, + "loss": 3.6804, + "step": 71590 + }, + { + "epoch": 4.864451691805952, + "grad_norm": 0.25600993633270264, + "learning_rate": 3.921983285772524e-06, + "loss": 3.9654, + "step": 71595 + }, + { + "epoch": 4.864791411876614, + "grad_norm": 0.2170867770910263, + "learning_rate": 3.921558635684197e-06, + "loss": 3.6979, + "step": 71600 + }, + { + "epoch": 4.865131131947275, + "grad_norm": 0.21691983938217163, + "learning_rate": 3.9211339855958694e-06, + "loss": 3.8684, + "step": 71605 + }, + { + "epoch": 4.865470852017937, + "grad_norm": 0.334147572517395, + "learning_rate": 3.920709335507542e-06, + "loss": 3.8455, + "step": 71610 + }, + { + "epoch": 4.865810572088599, + "grad_norm": 0.2610946595668793, + "learning_rate": 3.920284685419215e-06, + "loss": 3.7953, + "step": 71615 + }, + { + "epoch": 4.8661502921592605, + "grad_norm": 0.38028448820114136, + "learning_rate": 3.919860035330888e-06, + "loss": 3.9767, + "step": 71620 + }, + { + "epoch": 4.866490012229923, + "grad_norm": 0.22996607422828674, + "learning_rate": 3.919435385242561e-06, + "loss": 4.0826, + "step": 71625 + }, + { + "epoch": 4.866829732300585, + "grad_norm": 0.2742995619773865, + "learning_rate": 3.9190107351542334e-06, + "loss": 4.0173, + "step": 71630 + }, + { + "epoch": 4.867169452371246, + "grad_norm": 0.24393072724342346, + "learning_rate": 3.918586085065905e-06, + "loss": 3.9617, + "step": 71635 + }, + { + "epoch": 4.867509172441908, + "grad_norm": 0.3478057086467743, + "learning_rate": 3.918161434977579e-06, + "loss": 3.9695, + "step": 71640 + }, + { + "epoch": 4.86784889251257, + "grad_norm": 0.3027363121509552, + "learning_rate": 3.917736784889252e-06, + "loss": 4.0202, + "step": 71645 + }, + { + "epoch": 4.868188612583231, + "grad_norm": 0.21819475293159485, + "learning_rate": 3.917312134800924e-06, + "loss": 3.8149, + "step": 71650 + }, + { + "epoch": 4.868528332653893, + "grad_norm": 0.2817140817642212, + "learning_rate": 3.9168874847125974e-06, + "loss": 4.0376, + "step": 71655 + }, + { + "epoch": 4.8688680527245545, + "grad_norm": 0.27482253313064575, + "learning_rate": 3.91646283462427e-06, + "loss": 3.9631, + "step": 71660 + }, + { + "epoch": 4.8692077727952165, + "grad_norm": 0.26373592019081116, + "learning_rate": 3.916038184535943e-06, + "loss": 4.0889, + "step": 71665 + }, + { + "epoch": 4.869547492865879, + "grad_norm": 0.21219293773174286, + "learning_rate": 3.915613534447615e-06, + "loss": 4.2152, + "step": 71670 + }, + { + "epoch": 4.86988721293654, + "grad_norm": 0.2751930058002472, + "learning_rate": 3.915188884359289e-06, + "loss": 4.0831, + "step": 71675 + }, + { + "epoch": 4.870226933007202, + "grad_norm": 0.2708759903907776, + "learning_rate": 3.9147642342709614e-06, + "loss": 3.9414, + "step": 71680 + }, + { + "epoch": 4.870566653077864, + "grad_norm": 0.27636319398880005, + "learning_rate": 3.914339584182633e-06, + "loss": 3.7818, + "step": 71685 + }, + { + "epoch": 4.870906373148525, + "grad_norm": 0.2569180130958557, + "learning_rate": 3.913914934094307e-06, + "loss": 3.9073, + "step": 71690 + }, + { + "epoch": 4.871246093219187, + "grad_norm": 0.24111460149288177, + "learning_rate": 3.91349028400598e-06, + "loss": 3.9674, + "step": 71695 + }, + { + "epoch": 4.871585813289849, + "grad_norm": 0.2891850173473358, + "learning_rate": 3.913065633917652e-06, + "loss": 3.8504, + "step": 71700 + }, + { + "epoch": 4.8719255333605105, + "grad_norm": 0.21091879904270172, + "learning_rate": 3.912640983829325e-06, + "loss": 4.0155, + "step": 71705 + }, + { + "epoch": 4.872265253431173, + "grad_norm": 0.35891810059547424, + "learning_rate": 3.912216333740998e-06, + "loss": 3.8093, + "step": 71710 + }, + { + "epoch": 4.872604973501835, + "grad_norm": 0.3484037220478058, + "learning_rate": 3.91179168365267e-06, + "loss": 3.8001, + "step": 71715 + }, + { + "epoch": 4.872944693572496, + "grad_norm": 0.2290262132883072, + "learning_rate": 3.911367033564343e-06, + "loss": 4.175, + "step": 71720 + }, + { + "epoch": 4.873284413643158, + "grad_norm": 0.26231545209884644, + "learning_rate": 3.910942383476017e-06, + "loss": 3.985, + "step": 71725 + }, + { + "epoch": 4.87362413371382, + "grad_norm": 0.2261446863412857, + "learning_rate": 3.910517733387689e-06, + "loss": 4.0698, + "step": 71730 + }, + { + "epoch": 4.873963853784481, + "grad_norm": 0.28105998039245605, + "learning_rate": 3.910093083299361e-06, + "loss": 4.0526, + "step": 71735 + }, + { + "epoch": 4.874303573855143, + "grad_norm": 0.21482911705970764, + "learning_rate": 3.909668433211034e-06, + "loss": 3.9948, + "step": 71740 + }, + { + "epoch": 4.874643293925805, + "grad_norm": 0.2958498001098633, + "learning_rate": 3.909243783122707e-06, + "loss": 4.0471, + "step": 71745 + }, + { + "epoch": 4.8749830139964665, + "grad_norm": 0.22283993661403656, + "learning_rate": 3.90881913303438e-06, + "loss": 3.9215, + "step": 71750 + }, + { + "epoch": 4.875322734067129, + "grad_norm": 0.27114009857177734, + "learning_rate": 3.908394482946053e-06, + "loss": 3.6946, + "step": 71755 + }, + { + "epoch": 4.875662454137791, + "grad_norm": 0.22637824714183807, + "learning_rate": 3.907969832857725e-06, + "loss": 3.7666, + "step": 71760 + }, + { + "epoch": 4.876002174208452, + "grad_norm": 0.18242628872394562, + "learning_rate": 3.907545182769398e-06, + "loss": 3.9991, + "step": 71765 + }, + { + "epoch": 4.876341894279114, + "grad_norm": 0.35456809401512146, + "learning_rate": 3.907120532681071e-06, + "loss": 3.7843, + "step": 71770 + }, + { + "epoch": 4.876681614349776, + "grad_norm": 0.26801759004592896, + "learning_rate": 3.906695882592744e-06, + "loss": 4.0372, + "step": 71775 + }, + { + "epoch": 4.877021334420437, + "grad_norm": 0.22000491619110107, + "learning_rate": 3.906271232504417e-06, + "loss": 4.1235, + "step": 71780 + }, + { + "epoch": 4.877361054491099, + "grad_norm": 0.22842377424240112, + "learning_rate": 3.905846582416089e-06, + "loss": 3.988, + "step": 71785 + }, + { + "epoch": 4.877700774561761, + "grad_norm": 0.2825161814689636, + "learning_rate": 3.905421932327762e-06, + "loss": 3.9691, + "step": 71790 + }, + { + "epoch": 4.8780404946324225, + "grad_norm": 0.2493940144777298, + "learning_rate": 3.904997282239435e-06, + "loss": 3.7626, + "step": 71795 + }, + { + "epoch": 4.878380214703085, + "grad_norm": 0.24779924750328064, + "learning_rate": 3.904572632151108e-06, + "loss": 3.9964, + "step": 71800 + }, + { + "epoch": 4.878719934773747, + "grad_norm": 0.322628378868103, + "learning_rate": 3.904147982062781e-06, + "loss": 4.0292, + "step": 71805 + }, + { + "epoch": 4.879059654844408, + "grad_norm": 0.3343512713909149, + "learning_rate": 3.903723331974453e-06, + "loss": 4.0591, + "step": 71810 + }, + { + "epoch": 4.87939937491507, + "grad_norm": 0.29830506443977356, + "learning_rate": 3.903298681886126e-06, + "loss": 4.1124, + "step": 71815 + }, + { + "epoch": 4.879739094985732, + "grad_norm": 0.3010801374912262, + "learning_rate": 3.902874031797799e-06, + "loss": 3.9304, + "step": 71820 + }, + { + "epoch": 4.880078815056393, + "grad_norm": 0.1943184733390808, + "learning_rate": 3.902449381709472e-06, + "loss": 3.8651, + "step": 71825 + }, + { + "epoch": 4.880418535127055, + "grad_norm": 0.2785617411136627, + "learning_rate": 3.902024731621145e-06, + "loss": 4.0653, + "step": 71830 + }, + { + "epoch": 4.880758255197717, + "grad_norm": 0.27218863368034363, + "learning_rate": 3.901600081532817e-06, + "loss": 4.0755, + "step": 71835 + }, + { + "epoch": 4.8810979752683785, + "grad_norm": 0.31196001172065735, + "learning_rate": 3.90117543144449e-06, + "loss": 3.9656, + "step": 71840 + }, + { + "epoch": 4.881437695339041, + "grad_norm": 0.31691235303878784, + "learning_rate": 3.900750781356163e-06, + "loss": 3.9722, + "step": 71845 + }, + { + "epoch": 4.881777415409703, + "grad_norm": 0.5071742534637451, + "learning_rate": 3.900326131267836e-06, + "loss": 3.9227, + "step": 71850 + }, + { + "epoch": 4.882117135480364, + "grad_norm": 0.22241492569446564, + "learning_rate": 3.899901481179509e-06, + "loss": 3.8525, + "step": 71855 + }, + { + "epoch": 4.882456855551026, + "grad_norm": 0.28385692834854126, + "learning_rate": 3.899476831091181e-06, + "loss": 3.961, + "step": 71860 + }, + { + "epoch": 4.882796575621688, + "grad_norm": 0.22027379274368286, + "learning_rate": 3.899052181002854e-06, + "loss": 4.1659, + "step": 71865 + }, + { + "epoch": 4.883136295692349, + "grad_norm": 0.26734575629234314, + "learning_rate": 3.898627530914527e-06, + "loss": 4.1421, + "step": 71870 + }, + { + "epoch": 4.883476015763011, + "grad_norm": 0.3193116784095764, + "learning_rate": 3.8982028808262e-06, + "loss": 4.0042, + "step": 71875 + }, + { + "epoch": 4.883815735833673, + "grad_norm": 0.23374277353286743, + "learning_rate": 3.897778230737873e-06, + "loss": 4.1467, + "step": 71880 + }, + { + "epoch": 4.8841554559043345, + "grad_norm": 0.27834004163742065, + "learning_rate": 3.897353580649545e-06, + "loss": 3.991, + "step": 71885 + }, + { + "epoch": 4.884495175974997, + "grad_norm": 0.3373976945877075, + "learning_rate": 3.896928930561218e-06, + "loss": 3.8943, + "step": 71890 + }, + { + "epoch": 4.884834896045659, + "grad_norm": 0.321821004152298, + "learning_rate": 3.896504280472891e-06, + "loss": 3.9607, + "step": 71895 + }, + { + "epoch": 4.88517461611632, + "grad_norm": 0.3299277722835541, + "learning_rate": 3.896079630384563e-06, + "loss": 3.9649, + "step": 71900 + }, + { + "epoch": 4.885514336186982, + "grad_norm": 0.21050602197647095, + "learning_rate": 3.895654980296237e-06, + "loss": 3.8661, + "step": 71905 + }, + { + "epoch": 4.885854056257644, + "grad_norm": 0.2956796884536743, + "learning_rate": 3.8952303302079094e-06, + "loss": 3.8644, + "step": 71910 + }, + { + "epoch": 4.886193776328305, + "grad_norm": 0.22677545249462128, + "learning_rate": 3.894805680119581e-06, + "loss": 3.9367, + "step": 71915 + }, + { + "epoch": 4.886533496398967, + "grad_norm": 0.2939862012863159, + "learning_rate": 3.894381030031254e-06, + "loss": 4.0559, + "step": 71920 + }, + { + "epoch": 4.886873216469629, + "grad_norm": 0.3511861562728882, + "learning_rate": 3.893956379942928e-06, + "loss": 4.0307, + "step": 71925 + }, + { + "epoch": 4.8872129365402905, + "grad_norm": 0.27005353569984436, + "learning_rate": 3.8935317298546e-06, + "loss": 4.0584, + "step": 71930 + }, + { + "epoch": 4.887552656610953, + "grad_norm": 0.2922818958759308, + "learning_rate": 3.893107079766273e-06, + "loss": 3.9284, + "step": 71935 + }, + { + "epoch": 4.887892376681615, + "grad_norm": 0.20411480963230133, + "learning_rate": 3.892682429677946e-06, + "loss": 4.1305, + "step": 71940 + }, + { + "epoch": 4.888232096752276, + "grad_norm": 0.34159722924232483, + "learning_rate": 3.892257779589618e-06, + "loss": 3.9362, + "step": 71945 + }, + { + "epoch": 4.888571816822938, + "grad_norm": 0.22740772366523743, + "learning_rate": 3.891833129501291e-06, + "loss": 3.8788, + "step": 71950 + }, + { + "epoch": 4.8889115368936, + "grad_norm": 0.28261712193489075, + "learning_rate": 3.891408479412964e-06, + "loss": 4.0474, + "step": 71955 + }, + { + "epoch": 4.889251256964261, + "grad_norm": 0.2507091164588928, + "learning_rate": 3.890983829324637e-06, + "loss": 3.7227, + "step": 71960 + }, + { + "epoch": 4.889590977034923, + "grad_norm": 0.23173899948596954, + "learning_rate": 3.890559179236309e-06, + "loss": 3.9508, + "step": 71965 + }, + { + "epoch": 4.889930697105585, + "grad_norm": 0.23296625912189484, + "learning_rate": 3.890134529147982e-06, + "loss": 3.6942, + "step": 71970 + }, + { + "epoch": 4.8902704171762466, + "grad_norm": 0.38399428129196167, + "learning_rate": 3.889709879059655e-06, + "loss": 4.1812, + "step": 71975 + }, + { + "epoch": 4.890610137246909, + "grad_norm": 0.28514114022254944, + "learning_rate": 3.889285228971328e-06, + "loss": 4.1482, + "step": 71980 + }, + { + "epoch": 4.890949857317571, + "grad_norm": 0.2912008762359619, + "learning_rate": 3.888860578883001e-06, + "loss": 4.105, + "step": 71985 + }, + { + "epoch": 4.891289577388232, + "grad_norm": 0.3657597601413727, + "learning_rate": 3.888435928794673e-06, + "loss": 4.0069, + "step": 71990 + }, + { + "epoch": 4.891629297458894, + "grad_norm": 0.4396205246448517, + "learning_rate": 3.888011278706346e-06, + "loss": 4.0531, + "step": 71995 + }, + { + "epoch": 4.891969017529556, + "grad_norm": 0.2632693350315094, + "learning_rate": 3.887586628618019e-06, + "loss": 3.9535, + "step": 72000 + }, + { + "epoch": 4.892308737600217, + "grad_norm": 0.25796201825141907, + "learning_rate": 3.887161978529692e-06, + "loss": 4.0396, + "step": 72005 + }, + { + "epoch": 4.892648457670879, + "grad_norm": 0.4030873477458954, + "learning_rate": 3.886737328441365e-06, + "loss": 3.9376, + "step": 72010 + }, + { + "epoch": 4.892988177741541, + "grad_norm": 0.26613107323646545, + "learning_rate": 3.886312678353037e-06, + "loss": 4.0316, + "step": 72015 + }, + { + "epoch": 4.893327897812203, + "grad_norm": 0.3662324845790863, + "learning_rate": 3.88588802826471e-06, + "loss": 4.0633, + "step": 72020 + }, + { + "epoch": 4.893667617882865, + "grad_norm": 0.23684802651405334, + "learning_rate": 3.885463378176383e-06, + "loss": 3.9004, + "step": 72025 + }, + { + "epoch": 4.894007337953527, + "grad_norm": 0.48891595005989075, + "learning_rate": 3.885038728088056e-06, + "loss": 3.9233, + "step": 72030 + }, + { + "epoch": 4.894347058024188, + "grad_norm": 0.22866861522197723, + "learning_rate": 3.884614077999729e-06, + "loss": 3.4704, + "step": 72035 + }, + { + "epoch": 4.89468677809485, + "grad_norm": 0.26701441407203674, + "learning_rate": 3.884189427911401e-06, + "loss": 4.0994, + "step": 72040 + }, + { + "epoch": 4.895026498165512, + "grad_norm": 0.3477756083011627, + "learning_rate": 3.883764777823074e-06, + "loss": 4.1263, + "step": 72045 + }, + { + "epoch": 4.895366218236173, + "grad_norm": 0.30484071373939514, + "learning_rate": 3.883340127734747e-06, + "loss": 3.9196, + "step": 72050 + }, + { + "epoch": 4.895705938306835, + "grad_norm": 0.2941303253173828, + "learning_rate": 3.88291547764642e-06, + "loss": 3.9362, + "step": 72055 + }, + { + "epoch": 4.896045658377497, + "grad_norm": 0.35157787799835205, + "learning_rate": 3.882490827558093e-06, + "loss": 3.8427, + "step": 72060 + }, + { + "epoch": 4.896385378448159, + "grad_norm": 0.3046676516532898, + "learning_rate": 3.882066177469765e-06, + "loss": 4.0203, + "step": 72065 + }, + { + "epoch": 4.896725098518821, + "grad_norm": 0.4403064250946045, + "learning_rate": 3.881641527381438e-06, + "loss": 4.0827, + "step": 72070 + }, + { + "epoch": 4.897064818589483, + "grad_norm": 0.2406865507364273, + "learning_rate": 3.881216877293111e-06, + "loss": 3.8603, + "step": 72075 + }, + { + "epoch": 4.897404538660144, + "grad_norm": 0.3123708963394165, + "learning_rate": 3.880792227204784e-06, + "loss": 4.0288, + "step": 72080 + }, + { + "epoch": 4.897744258730806, + "grad_norm": 0.4391278028488159, + "learning_rate": 3.880367577116456e-06, + "loss": 4.0944, + "step": 72085 + }, + { + "epoch": 4.898083978801468, + "grad_norm": 0.4817081689834595, + "learning_rate": 3.879942927028129e-06, + "loss": 4.1659, + "step": 72090 + }, + { + "epoch": 4.898423698872129, + "grad_norm": 0.41031211614608765, + "learning_rate": 3.879518276939802e-06, + "loss": 4.103, + "step": 72095 + }, + { + "epoch": 4.898763418942791, + "grad_norm": 0.4234890341758728, + "learning_rate": 3.879093626851474e-06, + "loss": 3.8644, + "step": 72100 + }, + { + "epoch": 4.899103139013453, + "grad_norm": 0.27525362372398376, + "learning_rate": 3.878668976763148e-06, + "loss": 3.9989, + "step": 72105 + }, + { + "epoch": 4.899442859084115, + "grad_norm": 0.27235883474349976, + "learning_rate": 3.878244326674821e-06, + "loss": 3.92, + "step": 72110 + }, + { + "epoch": 4.899782579154777, + "grad_norm": 0.2780931890010834, + "learning_rate": 3.8778196765864926e-06, + "loss": 3.8918, + "step": 72115 + }, + { + "epoch": 4.900122299225439, + "grad_norm": 0.23537780344486237, + "learning_rate": 3.877395026498166e-06, + "loss": 3.7894, + "step": 72120 + }, + { + "epoch": 4.9004620192961, + "grad_norm": 0.21320384740829468, + "learning_rate": 3.876970376409839e-06, + "loss": 3.8424, + "step": 72125 + }, + { + "epoch": 4.900801739366762, + "grad_norm": 0.25412195920944214, + "learning_rate": 3.876545726321511e-06, + "loss": 4.0639, + "step": 72130 + }, + { + "epoch": 4.901141459437423, + "grad_norm": 0.2230674922466278, + "learning_rate": 3.876121076233184e-06, + "loss": 3.9675, + "step": 72135 + }, + { + "epoch": 4.901481179508085, + "grad_norm": 0.3531084954738617, + "learning_rate": 3.875696426144857e-06, + "loss": 3.8695, + "step": 72140 + }, + { + "epoch": 4.901820899578747, + "grad_norm": 0.2595963776111603, + "learning_rate": 3.875271776056529e-06, + "loss": 3.9587, + "step": 72145 + }, + { + "epoch": 4.9021606196494085, + "grad_norm": 0.23500962555408478, + "learning_rate": 3.874847125968202e-06, + "loss": 4.0468, + "step": 72150 + }, + { + "epoch": 4.902500339720071, + "grad_norm": 0.5307453870773315, + "learning_rate": 3.874422475879876e-06, + "loss": 3.8853, + "step": 72155 + }, + { + "epoch": 4.902840059790733, + "grad_norm": 0.257587194442749, + "learning_rate": 3.874082755809214e-06, + "loss": 4.0547, + "step": 72160 + }, + { + "epoch": 4.903179779861394, + "grad_norm": 0.21341292560100555, + "learning_rate": 3.873658105720886e-06, + "loss": 3.8634, + "step": 72165 + }, + { + "epoch": 4.903519499932056, + "grad_norm": 0.35285520553588867, + "learning_rate": 3.873233455632559e-06, + "loss": 3.9309, + "step": 72170 + }, + { + "epoch": 4.903859220002718, + "grad_norm": 0.23878388106822968, + "learning_rate": 3.872808805544232e-06, + "loss": 3.8907, + "step": 72175 + }, + { + "epoch": 4.904198940073379, + "grad_norm": 0.22030426561832428, + "learning_rate": 3.872384155455904e-06, + "loss": 3.9508, + "step": 72180 + }, + { + "epoch": 4.904538660144041, + "grad_norm": 0.3108856976032257, + "learning_rate": 3.871959505367577e-06, + "loss": 3.9937, + "step": 72185 + }, + { + "epoch": 4.904878380214703, + "grad_norm": 0.2811121344566345, + "learning_rate": 3.871534855279251e-06, + "loss": 4.0489, + "step": 72190 + }, + { + "epoch": 4.9052181002853645, + "grad_norm": 0.3158907890319824, + "learning_rate": 3.871110205190923e-06, + "loss": 4.1348, + "step": 72195 + }, + { + "epoch": 4.905557820356027, + "grad_norm": 0.2857950031757355, + "learning_rate": 3.8706855551025955e-06, + "loss": 3.9269, + "step": 72200 + }, + { + "epoch": 4.905897540426689, + "grad_norm": 0.2515884339809418, + "learning_rate": 3.870260905014268e-06, + "loss": 3.9148, + "step": 72205 + }, + { + "epoch": 4.90623726049735, + "grad_norm": 0.2828814685344696, + "learning_rate": 3.869836254925942e-06, + "loss": 4.0075, + "step": 72210 + }, + { + "epoch": 4.906576980568012, + "grad_norm": 0.22873787581920624, + "learning_rate": 3.869411604837614e-06, + "loss": 4.005, + "step": 72215 + }, + { + "epoch": 4.906916700638674, + "grad_norm": 0.3013228178024292, + "learning_rate": 3.868986954749287e-06, + "loss": 3.8739, + "step": 72220 + }, + { + "epoch": 4.907256420709335, + "grad_norm": 0.26890653371810913, + "learning_rate": 3.86856230466096e-06, + "loss": 4.0537, + "step": 72225 + }, + { + "epoch": 4.907596140779997, + "grad_norm": 0.29805418848991394, + "learning_rate": 3.868137654572632e-06, + "loss": 3.8829, + "step": 72230 + }, + { + "epoch": 4.907935860850659, + "grad_norm": 0.2656497657299042, + "learning_rate": 3.867713004484305e-06, + "loss": 3.8605, + "step": 72235 + }, + { + "epoch": 4.9082755809213205, + "grad_norm": 0.27655965089797974, + "learning_rate": 3.867288354395978e-06, + "loss": 4.0014, + "step": 72240 + }, + { + "epoch": 4.908615300991983, + "grad_norm": 0.31498417258262634, + "learning_rate": 3.866863704307651e-06, + "loss": 3.9593, + "step": 72245 + }, + { + "epoch": 4.908955021062645, + "grad_norm": 0.357563316822052, + "learning_rate": 3.8664390542193235e-06, + "loss": 3.8099, + "step": 72250 + }, + { + "epoch": 4.909294741133306, + "grad_norm": 0.35473573207855225, + "learning_rate": 3.866014404130996e-06, + "loss": 3.9401, + "step": 72255 + }, + { + "epoch": 4.909634461203968, + "grad_norm": 0.29687780141830444, + "learning_rate": 3.865589754042669e-06, + "loss": 4.0051, + "step": 72260 + }, + { + "epoch": 4.90997418127463, + "grad_norm": 0.27506396174430847, + "learning_rate": 3.865165103954342e-06, + "loss": 3.9475, + "step": 72265 + }, + { + "epoch": 4.910313901345291, + "grad_norm": 0.3099190294742584, + "learning_rate": 3.864740453866015e-06, + "loss": 3.7557, + "step": 72270 + }, + { + "epoch": 4.910653621415953, + "grad_norm": 0.36189883947372437, + "learning_rate": 3.8643158037776875e-06, + "loss": 3.8273, + "step": 72275 + }, + { + "epoch": 4.910993341486615, + "grad_norm": 0.2800990343093872, + "learning_rate": 3.86389115368936e-06, + "loss": 3.7855, + "step": 72280 + }, + { + "epoch": 4.911333061557277, + "grad_norm": 0.28230705857276917, + "learning_rate": 3.863466503601033e-06, + "loss": 3.8373, + "step": 72285 + }, + { + "epoch": 4.911672781627939, + "grad_norm": 0.21237167716026306, + "learning_rate": 3.863041853512706e-06, + "loss": 3.7608, + "step": 72290 + }, + { + "epoch": 4.912012501698601, + "grad_norm": 0.24132144451141357, + "learning_rate": 3.862617203424379e-06, + "loss": 3.8414, + "step": 72295 + }, + { + "epoch": 4.912352221769262, + "grad_norm": 0.21350833773612976, + "learning_rate": 3.8621925533360515e-06, + "loss": 3.8376, + "step": 72300 + }, + { + "epoch": 4.912691941839924, + "grad_norm": 0.2026694416999817, + "learning_rate": 3.861767903247724e-06, + "loss": 3.9741, + "step": 72305 + }, + { + "epoch": 4.913031661910586, + "grad_norm": 0.28463953733444214, + "learning_rate": 3.861343253159397e-06, + "loss": 3.9787, + "step": 72310 + }, + { + "epoch": 4.913371381981247, + "grad_norm": 0.287397176027298, + "learning_rate": 3.86091860307107e-06, + "loss": 3.7723, + "step": 72315 + }, + { + "epoch": 4.913711102051909, + "grad_norm": 0.2392665147781372, + "learning_rate": 3.860493952982743e-06, + "loss": 4.0586, + "step": 72320 + }, + { + "epoch": 4.914050822122571, + "grad_norm": 0.33024221658706665, + "learning_rate": 3.8600693028944155e-06, + "loss": 3.9359, + "step": 72325 + }, + { + "epoch": 4.914390542193233, + "grad_norm": 0.2770000100135803, + "learning_rate": 3.859644652806088e-06, + "loss": 3.8164, + "step": 72330 + }, + { + "epoch": 4.914730262263895, + "grad_norm": 0.3356037735939026, + "learning_rate": 3.859220002717761e-06, + "loss": 4.0219, + "step": 72335 + }, + { + "epoch": 4.915069982334556, + "grad_norm": 0.2521895170211792, + "learning_rate": 3.858795352629434e-06, + "loss": 4.0818, + "step": 72340 + }, + { + "epoch": 4.915409702405218, + "grad_norm": 0.26013657450675964, + "learning_rate": 3.858370702541107e-06, + "loss": 3.8541, + "step": 72345 + }, + { + "epoch": 4.91574942247588, + "grad_norm": 0.38727888464927673, + "learning_rate": 3.857946052452779e-06, + "loss": 3.9418, + "step": 72350 + }, + { + "epoch": 4.916089142546541, + "grad_norm": 0.22472909092903137, + "learning_rate": 3.857521402364452e-06, + "loss": 3.9025, + "step": 72355 + }, + { + "epoch": 4.916428862617203, + "grad_norm": 0.25126975774765015, + "learning_rate": 3.857096752276125e-06, + "loss": 3.9732, + "step": 72360 + }, + { + "epoch": 4.916768582687865, + "grad_norm": 0.8298335671424866, + "learning_rate": 3.856672102187797e-06, + "loss": 4.0834, + "step": 72365 + }, + { + "epoch": 4.9171083027585265, + "grad_norm": 0.2535621225833893, + "learning_rate": 3.856247452099471e-06, + "loss": 3.7277, + "step": 72370 + }, + { + "epoch": 4.917448022829189, + "grad_norm": 0.2648312747478485, + "learning_rate": 3.8558228020111435e-06, + "loss": 3.9804, + "step": 72375 + }, + { + "epoch": 4.917787742899851, + "grad_norm": 0.30284783244132996, + "learning_rate": 3.855398151922816e-06, + "loss": 3.9332, + "step": 72380 + }, + { + "epoch": 4.918127462970512, + "grad_norm": 0.3551018536090851, + "learning_rate": 3.854973501834488e-06, + "loss": 3.9824, + "step": 72385 + }, + { + "epoch": 4.918467183041174, + "grad_norm": 0.3203759491443634, + "learning_rate": 3.854548851746162e-06, + "loss": 3.9168, + "step": 72390 + }, + { + "epoch": 4.918806903111836, + "grad_norm": 0.5737062692642212, + "learning_rate": 3.854124201657835e-06, + "loss": 4.0428, + "step": 72395 + }, + { + "epoch": 4.919146623182497, + "grad_norm": 0.4716930687427521, + "learning_rate": 3.853699551569507e-06, + "loss": 4.0183, + "step": 72400 + }, + { + "epoch": 4.919486343253159, + "grad_norm": 0.2800905704498291, + "learning_rate": 3.85327490148118e-06, + "loss": 3.838, + "step": 72405 + }, + { + "epoch": 4.919826063323821, + "grad_norm": 0.3169085383415222, + "learning_rate": 3.852850251392853e-06, + "loss": 3.9743, + "step": 72410 + }, + { + "epoch": 4.9201657833944825, + "grad_norm": 0.3110065460205078, + "learning_rate": 3.852425601304525e-06, + "loss": 3.8519, + "step": 72415 + }, + { + "epoch": 4.920505503465145, + "grad_norm": 0.2313312590122223, + "learning_rate": 3.852000951216198e-06, + "loss": 3.9037, + "step": 72420 + }, + { + "epoch": 4.920845223535807, + "grad_norm": 0.34768474102020264, + "learning_rate": 3.8515763011278715e-06, + "loss": 3.85, + "step": 72425 + }, + { + "epoch": 4.921184943606468, + "grad_norm": 0.2627730071544647, + "learning_rate": 3.8511516510395434e-06, + "loss": 3.7307, + "step": 72430 + }, + { + "epoch": 4.92152466367713, + "grad_norm": 0.2859928011894226, + "learning_rate": 3.850727000951216e-06, + "loss": 3.8819, + "step": 72435 + }, + { + "epoch": 4.921864383747792, + "grad_norm": 0.23030070960521698, + "learning_rate": 3.85030235086289e-06, + "loss": 3.8748, + "step": 72440 + }, + { + "epoch": 4.922204103818453, + "grad_norm": 0.26776042580604553, + "learning_rate": 3.849877700774562e-06, + "loss": 3.9992, + "step": 72445 + }, + { + "epoch": 4.922543823889115, + "grad_norm": 0.2966475486755371, + "learning_rate": 3.849453050686235e-06, + "loss": 3.8361, + "step": 72450 + }, + { + "epoch": 4.922883543959777, + "grad_norm": 0.2778627574443817, + "learning_rate": 3.8490284005979074e-06, + "loss": 3.9468, + "step": 72455 + }, + { + "epoch": 4.9232232640304385, + "grad_norm": 0.19472916424274445, + "learning_rate": 3.84860375050958e-06, + "loss": 3.9288, + "step": 72460 + }, + { + "epoch": 4.923562984101101, + "grad_norm": 0.2753678858280182, + "learning_rate": 3.848179100421253e-06, + "loss": 4.0051, + "step": 72465 + }, + { + "epoch": 4.923902704171763, + "grad_norm": 0.22472210228443146, + "learning_rate": 3.847754450332926e-06, + "loss": 3.9483, + "step": 72470 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.32474493980407715, + "learning_rate": 3.847329800244599e-06, + "loss": 4.0208, + "step": 72475 + }, + { + "epoch": 4.924582144313086, + "grad_norm": 0.21338503062725067, + "learning_rate": 3.8469051501562714e-06, + "loss": 3.85, + "step": 72480 + }, + { + "epoch": 4.924921864383748, + "grad_norm": 0.32218000292778015, + "learning_rate": 3.846480500067944e-06, + "loss": 3.8372, + "step": 72485 + }, + { + "epoch": 4.925261584454409, + "grad_norm": 0.28555089235305786, + "learning_rate": 3.846055849979617e-06, + "loss": 3.8952, + "step": 72490 + }, + { + "epoch": 4.925601304525071, + "grad_norm": 0.41527390480041504, + "learning_rate": 3.84563119989129e-06, + "loss": 3.8553, + "step": 72495 + }, + { + "epoch": 4.925941024595733, + "grad_norm": 0.29098987579345703, + "learning_rate": 3.845206549802963e-06, + "loss": 3.9382, + "step": 72500 + }, + { + "epoch": 4.9262807446663945, + "grad_norm": 0.31865164637565613, + "learning_rate": 3.8447818997146355e-06, + "loss": 3.8917, + "step": 72505 + }, + { + "epoch": 4.926620464737057, + "grad_norm": 0.3069227337837219, + "learning_rate": 3.844357249626308e-06, + "loss": 4.1812, + "step": 72510 + }, + { + "epoch": 4.926960184807719, + "grad_norm": 0.26817500591278076, + "learning_rate": 3.843932599537981e-06, + "loss": 4.3455, + "step": 72515 + }, + { + "epoch": 4.92729990487838, + "grad_norm": 0.3043840527534485, + "learning_rate": 3.843507949449654e-06, + "loss": 3.9942, + "step": 72520 + }, + { + "epoch": 4.927639624949042, + "grad_norm": 0.5042397975921631, + "learning_rate": 3.843083299361327e-06, + "loss": 3.9125, + "step": 72525 + }, + { + "epoch": 4.927979345019704, + "grad_norm": 0.28596940636634827, + "learning_rate": 3.8426586492729995e-06, + "loss": 3.8107, + "step": 72530 + }, + { + "epoch": 4.928319065090365, + "grad_norm": 0.2724776268005371, + "learning_rate": 3.842233999184672e-06, + "loss": 3.8968, + "step": 72535 + }, + { + "epoch": 4.928658785161027, + "grad_norm": 0.2010091245174408, + "learning_rate": 3.841809349096345e-06, + "loss": 3.9654, + "step": 72540 + }, + { + "epoch": 4.928998505231689, + "grad_norm": 0.4170883297920227, + "learning_rate": 3.841384699008018e-06, + "loss": 3.7829, + "step": 72545 + }, + { + "epoch": 4.9293382253023506, + "grad_norm": 0.30844491720199585, + "learning_rate": 3.840960048919691e-06, + "loss": 3.9228, + "step": 72550 + }, + { + "epoch": 4.929677945373013, + "grad_norm": 0.19678349792957306, + "learning_rate": 3.8405353988313635e-06, + "loss": 3.8401, + "step": 72555 + }, + { + "epoch": 4.930017665443675, + "grad_norm": 0.2810186445713043, + "learning_rate": 3.840110748743036e-06, + "loss": 3.8631, + "step": 72560 + }, + { + "epoch": 4.930357385514336, + "grad_norm": 0.28091368079185486, + "learning_rate": 3.839686098654709e-06, + "loss": 3.9917, + "step": 72565 + }, + { + "epoch": 4.930697105584998, + "grad_norm": 0.28809884190559387, + "learning_rate": 3.839261448566382e-06, + "loss": 4.0635, + "step": 72570 + }, + { + "epoch": 4.93103682565566, + "grad_norm": 0.3903437852859497, + "learning_rate": 3.838836798478055e-06, + "loss": 4.0888, + "step": 72575 + }, + { + "epoch": 4.931376545726321, + "grad_norm": 0.27722272276878357, + "learning_rate": 3.8384121483897275e-06, + "loss": 4.0302, + "step": 72580 + }, + { + "epoch": 4.931716265796983, + "grad_norm": 0.24447868764400482, + "learning_rate": 3.837987498301399e-06, + "loss": 3.8861, + "step": 72585 + }, + { + "epoch": 4.932055985867645, + "grad_norm": 0.3430717885494232, + "learning_rate": 3.837562848213073e-06, + "loss": 4.0494, + "step": 72590 + }, + { + "epoch": 4.932395705938307, + "grad_norm": 0.23536616563796997, + "learning_rate": 3.837138198124746e-06, + "loss": 3.9696, + "step": 72595 + }, + { + "epoch": 4.932735426008969, + "grad_norm": 0.2784249484539032, + "learning_rate": 3.836713548036418e-06, + "loss": 3.935, + "step": 72600 + }, + { + "epoch": 4.933075146079631, + "grad_norm": 0.2642764747142792, + "learning_rate": 3.8362888979480915e-06, + "loss": 4.1387, + "step": 72605 + }, + { + "epoch": 4.933414866150292, + "grad_norm": 0.2866533100605011, + "learning_rate": 3.835864247859764e-06, + "loss": 3.9976, + "step": 72610 + }, + { + "epoch": 4.933754586220954, + "grad_norm": 0.27032533288002014, + "learning_rate": 3.835439597771436e-06, + "loss": 3.8918, + "step": 72615 + }, + { + "epoch": 4.934094306291616, + "grad_norm": 0.25160154700279236, + "learning_rate": 3.835014947683109e-06, + "loss": 4.1482, + "step": 72620 + }, + { + "epoch": 4.934434026362277, + "grad_norm": 0.2202221006155014, + "learning_rate": 3.834590297594783e-06, + "loss": 3.6518, + "step": 72625 + }, + { + "epoch": 4.934773746432939, + "grad_norm": 0.2752437889575958, + "learning_rate": 3.834165647506455e-06, + "loss": 3.9803, + "step": 72630 + }, + { + "epoch": 4.935113466503601, + "grad_norm": 0.20276254415512085, + "learning_rate": 3.8337409974181274e-06, + "loss": 3.5774, + "step": 72635 + }, + { + "epoch": 4.935453186574263, + "grad_norm": 0.2390061318874359, + "learning_rate": 3.833316347329801e-06, + "loss": 3.9851, + "step": 72640 + }, + { + "epoch": 4.935792906644925, + "grad_norm": 0.2634780704975128, + "learning_rate": 3.832891697241473e-06, + "loss": 3.9944, + "step": 72645 + }, + { + "epoch": 4.936132626715587, + "grad_norm": 0.26872068643569946, + "learning_rate": 3.832467047153146e-06, + "loss": 4.068, + "step": 72650 + }, + { + "epoch": 4.936472346786248, + "grad_norm": 0.26187631487846375, + "learning_rate": 3.8320423970648195e-06, + "loss": 3.6194, + "step": 72655 + }, + { + "epoch": 4.93681206685691, + "grad_norm": 0.2777792811393738, + "learning_rate": 3.8316177469764914e-06, + "loss": 4.0674, + "step": 72660 + }, + { + "epoch": 4.937151786927572, + "grad_norm": 0.2629350423812866, + "learning_rate": 3.831193096888164e-06, + "loss": 3.9917, + "step": 72665 + }, + { + "epoch": 4.937491506998233, + "grad_norm": 0.2016827017068863, + "learning_rate": 3.830768446799837e-06, + "loss": 3.9552, + "step": 72670 + }, + { + "epoch": 4.937831227068895, + "grad_norm": 0.2530340850353241, + "learning_rate": 3.83034379671151e-06, + "loss": 3.6626, + "step": 72675 + }, + { + "epoch": 4.938170947139557, + "grad_norm": 0.2074788510799408, + "learning_rate": 3.829919146623183e-06, + "loss": 3.8583, + "step": 72680 + }, + { + "epoch": 4.938510667210219, + "grad_norm": 0.20372560620307922, + "learning_rate": 3.8294944965348554e-06, + "loss": 3.7257, + "step": 72685 + }, + { + "epoch": 4.938850387280881, + "grad_norm": 0.23540830612182617, + "learning_rate": 3.829069846446528e-06, + "loss": 3.9306, + "step": 72690 + }, + { + "epoch": 4.939190107351543, + "grad_norm": 0.2773461937904358, + "learning_rate": 3.828645196358201e-06, + "loss": 3.8064, + "step": 72695 + }, + { + "epoch": 4.939529827422204, + "grad_norm": 0.29270532727241516, + "learning_rate": 3.828220546269874e-06, + "loss": 3.9501, + "step": 72700 + }, + { + "epoch": 4.939869547492866, + "grad_norm": 0.3102489113807678, + "learning_rate": 3.827795896181547e-06, + "loss": 3.9514, + "step": 72705 + }, + { + "epoch": 4.940209267563528, + "grad_norm": 0.23645149171352386, + "learning_rate": 3.8273712460932194e-06, + "loss": 3.6905, + "step": 72710 + }, + { + "epoch": 4.940548987634189, + "grad_norm": 0.23383331298828125, + "learning_rate": 3.826946596004892e-06, + "loss": 3.9343, + "step": 72715 + }, + { + "epoch": 4.940888707704851, + "grad_norm": 0.24724560976028442, + "learning_rate": 3.826521945916565e-06, + "loss": 4.0832, + "step": 72720 + }, + { + "epoch": 4.941228427775513, + "grad_norm": 0.19940991699695587, + "learning_rate": 3.826097295828238e-06, + "loss": 3.8566, + "step": 72725 + }, + { + "epoch": 4.941568147846175, + "grad_norm": 0.3446851670742035, + "learning_rate": 3.825672645739911e-06, + "loss": 3.7249, + "step": 72730 + }, + { + "epoch": 4.941907867916837, + "grad_norm": 0.2997283637523651, + "learning_rate": 3.8252479956515834e-06, + "loss": 3.9988, + "step": 72735 + }, + { + "epoch": 4.942247587987499, + "grad_norm": 0.265651673078537, + "learning_rate": 3.824823345563256e-06, + "loss": 3.9685, + "step": 72740 + }, + { + "epoch": 4.94258730805816, + "grad_norm": 0.2519665062427521, + "learning_rate": 3.824398695474929e-06, + "loss": 4.0848, + "step": 72745 + }, + { + "epoch": 4.942927028128822, + "grad_norm": 0.22233925759792328, + "learning_rate": 3.823974045386602e-06, + "loss": 4.1294, + "step": 72750 + }, + { + "epoch": 4.943266748199484, + "grad_norm": 0.28906410932540894, + "learning_rate": 3.823549395298275e-06, + "loss": 4.1223, + "step": 72755 + }, + { + "epoch": 4.943606468270145, + "grad_norm": 0.2547963261604309, + "learning_rate": 3.8231247452099474e-06, + "loss": 4.0116, + "step": 72760 + }, + { + "epoch": 4.943946188340807, + "grad_norm": 0.3279276192188263, + "learning_rate": 3.82270009512162e-06, + "loss": 3.9322, + "step": 72765 + }, + { + "epoch": 4.944285908411469, + "grad_norm": 0.2493533194065094, + "learning_rate": 3.822275445033293e-06, + "loss": 3.9162, + "step": 72770 + }, + { + "epoch": 4.944625628482131, + "grad_norm": 0.3307291865348816, + "learning_rate": 3.821850794944966e-06, + "loss": 4.0658, + "step": 72775 + }, + { + "epoch": 4.944965348552793, + "grad_norm": 0.2805522680282593, + "learning_rate": 3.821426144856639e-06, + "loss": 4.1514, + "step": 72780 + }, + { + "epoch": 4.945305068623455, + "grad_norm": 0.28560009598731995, + "learning_rate": 3.8210014947683114e-06, + "loss": 3.9439, + "step": 72785 + }, + { + "epoch": 4.945644788694116, + "grad_norm": 0.33489686250686646, + "learning_rate": 3.820576844679984e-06, + "loss": 4.0101, + "step": 72790 + }, + { + "epoch": 4.945984508764778, + "grad_norm": 0.2224481999874115, + "learning_rate": 3.820152194591657e-06, + "loss": 3.8519, + "step": 72795 + }, + { + "epoch": 4.94632422883544, + "grad_norm": 0.2586497664451599, + "learning_rate": 3.819727544503329e-06, + "loss": 4.1829, + "step": 72800 + }, + { + "epoch": 4.946663948906101, + "grad_norm": 0.2120407223701477, + "learning_rate": 3.819302894415003e-06, + "loss": 3.837, + "step": 72805 + }, + { + "epoch": 4.947003668976763, + "grad_norm": 0.2907073199748993, + "learning_rate": 3.8188782443266755e-06, + "loss": 3.7346, + "step": 72810 + }, + { + "epoch": 4.9473433890474245, + "grad_norm": 0.3300127685070038, + "learning_rate": 3.818453594238347e-06, + "loss": 4.0234, + "step": 72815 + }, + { + "epoch": 4.947683109118087, + "grad_norm": 0.2181493192911148, + "learning_rate": 3.818028944150021e-06, + "loss": 3.5627, + "step": 72820 + }, + { + "epoch": 4.948022829188749, + "grad_norm": 0.24800969660282135, + "learning_rate": 3.817604294061694e-06, + "loss": 3.929, + "step": 72825 + }, + { + "epoch": 4.94836254925941, + "grad_norm": 0.3040945827960968, + "learning_rate": 3.817179643973366e-06, + "loss": 3.8576, + "step": 72830 + }, + { + "epoch": 4.948702269330072, + "grad_norm": 0.32826176285743713, + "learning_rate": 3.816754993885039e-06, + "loss": 4.1311, + "step": 72835 + }, + { + "epoch": 4.949041989400734, + "grad_norm": 0.21314342319965363, + "learning_rate": 3.816330343796712e-06, + "loss": 3.8518, + "step": 72840 + }, + { + "epoch": 4.949381709471395, + "grad_norm": 0.31418508291244507, + "learning_rate": 3.815905693708384e-06, + "loss": 3.9865, + "step": 72845 + }, + { + "epoch": 4.949721429542057, + "grad_norm": 0.33334094285964966, + "learning_rate": 3.815481043620057e-06, + "loss": 3.9523, + "step": 72850 + }, + { + "epoch": 4.950061149612719, + "grad_norm": 0.26325181126594543, + "learning_rate": 3.815056393531731e-06, + "loss": 3.613, + "step": 72855 + }, + { + "epoch": 4.9504008696833806, + "grad_norm": 0.2726951837539673, + "learning_rate": 3.8146317434434026e-06, + "loss": 4.0103, + "step": 72860 + }, + { + "epoch": 4.950740589754043, + "grad_norm": 0.2626792788505554, + "learning_rate": 3.814207093355076e-06, + "loss": 4.0046, + "step": 72865 + }, + { + "epoch": 4.951080309824705, + "grad_norm": 0.2333577275276184, + "learning_rate": 3.8137824432667486e-06, + "loss": 4.0529, + "step": 72870 + }, + { + "epoch": 4.951420029895366, + "grad_norm": 0.26910853385925293, + "learning_rate": 3.813357793178421e-06, + "loss": 3.9834, + "step": 72875 + }, + { + "epoch": 4.951759749966028, + "grad_norm": 0.21675962209701538, + "learning_rate": 3.812933143090094e-06, + "loss": 3.9544, + "step": 72880 + }, + { + "epoch": 4.95209947003669, + "grad_norm": 0.24741217494010925, + "learning_rate": 3.812508493001767e-06, + "loss": 3.8845, + "step": 72885 + }, + { + "epoch": 4.952439190107351, + "grad_norm": 0.20645922422409058, + "learning_rate": 3.81208384291344e-06, + "loss": 3.8457, + "step": 72890 + }, + { + "epoch": 4.952778910178013, + "grad_norm": 0.2312307506799698, + "learning_rate": 3.8116591928251122e-06, + "loss": 4.1063, + "step": 72895 + }, + { + "epoch": 4.953118630248675, + "grad_norm": 0.30832529067993164, + "learning_rate": 3.8112345427367854e-06, + "loss": 3.6222, + "step": 72900 + }, + { + "epoch": 4.953458350319337, + "grad_norm": 0.25712674856185913, + "learning_rate": 3.8108098926484582e-06, + "loss": 3.6535, + "step": 72905 + }, + { + "epoch": 4.953798070389999, + "grad_norm": 0.2276853322982788, + "learning_rate": 3.8103852425601306e-06, + "loss": 4.0639, + "step": 72910 + }, + { + "epoch": 4.954137790460661, + "grad_norm": 0.2542833089828491, + "learning_rate": 3.8099605924718034e-06, + "loss": 4.177, + "step": 72915 + }, + { + "epoch": 4.954477510531322, + "grad_norm": 0.39189809560775757, + "learning_rate": 3.8095359423834766e-06, + "loss": 3.9657, + "step": 72920 + }, + { + "epoch": 4.954817230601984, + "grad_norm": 0.3045576512813568, + "learning_rate": 3.809111292295149e-06, + "loss": 3.9933, + "step": 72925 + }, + { + "epoch": 4.955156950672646, + "grad_norm": 0.24539239704608917, + "learning_rate": 3.808686642206822e-06, + "loss": 3.9943, + "step": 72930 + }, + { + "epoch": 4.955496670743307, + "grad_norm": 0.3111792802810669, + "learning_rate": 3.808261992118495e-06, + "loss": 3.939, + "step": 72935 + }, + { + "epoch": 4.955836390813969, + "grad_norm": 0.3063417673110962, + "learning_rate": 3.8078373420301674e-06, + "loss": 4.001, + "step": 72940 + }, + { + "epoch": 4.956176110884631, + "grad_norm": 0.4286917746067047, + "learning_rate": 3.8074126919418402e-06, + "loss": 3.7947, + "step": 72945 + }, + { + "epoch": 4.956515830955293, + "grad_norm": 0.3444313406944275, + "learning_rate": 3.806988041853513e-06, + "loss": 4.1472, + "step": 72950 + }, + { + "epoch": 4.956855551025955, + "grad_norm": 0.3497346341609955, + "learning_rate": 3.806563391765186e-06, + "loss": 3.7779, + "step": 72955 + }, + { + "epoch": 4.957195271096617, + "grad_norm": 0.19177374243736267, + "learning_rate": 3.8061387416768586e-06, + "loss": 3.8882, + "step": 72960 + }, + { + "epoch": 4.957534991167278, + "grad_norm": 0.28668341040611267, + "learning_rate": 3.8057140915885314e-06, + "loss": 4.1718, + "step": 72965 + }, + { + "epoch": 4.95787471123794, + "grad_norm": 1.0873878002166748, + "learning_rate": 3.805289441500204e-06, + "loss": 3.9857, + "step": 72970 + }, + { + "epoch": 4.958214431308602, + "grad_norm": 0.2385072410106659, + "learning_rate": 3.804864791411877e-06, + "loss": 3.9857, + "step": 72975 + }, + { + "epoch": 4.958554151379263, + "grad_norm": 0.24290882050991058, + "learning_rate": 3.80444014132355e-06, + "loss": 4.0618, + "step": 72980 + }, + { + "epoch": 4.958893871449925, + "grad_norm": 0.30111899971961975, + "learning_rate": 3.804015491235222e-06, + "loss": 4.0697, + "step": 72985 + }, + { + "epoch": 4.959233591520587, + "grad_norm": 0.33642470836639404, + "learning_rate": 3.8035908411468954e-06, + "loss": 3.6594, + "step": 72990 + }, + { + "epoch": 4.959573311591249, + "grad_norm": 0.2720005512237549, + "learning_rate": 3.8031661910585682e-06, + "loss": 3.727, + "step": 72995 + }, + { + "epoch": 4.959913031661911, + "grad_norm": 0.47604718804359436, + "learning_rate": 3.8027415409702406e-06, + "loss": 3.9364, + "step": 73000 + }, + { + "epoch": 4.960252751732573, + "grad_norm": 0.2803803086280823, + "learning_rate": 3.8023168908819134e-06, + "loss": 3.835, + "step": 73005 + }, + { + "epoch": 4.960592471803234, + "grad_norm": 0.2779567241668701, + "learning_rate": 3.8018922407935866e-06, + "loss": 3.9727, + "step": 73010 + }, + { + "epoch": 4.960932191873896, + "grad_norm": 0.23277795314788818, + "learning_rate": 3.801467590705259e-06, + "loss": 3.7599, + "step": 73015 + }, + { + "epoch": 4.961271911944557, + "grad_norm": 0.287117600440979, + "learning_rate": 3.801042940616932e-06, + "loss": 4.3027, + "step": 73020 + }, + { + "epoch": 4.961611632015219, + "grad_norm": 0.25988760590553284, + "learning_rate": 3.800618290528605e-06, + "loss": 3.8914, + "step": 73025 + }, + { + "epoch": 4.961951352085881, + "grad_norm": 0.24561071395874023, + "learning_rate": 3.8001936404402774e-06, + "loss": 3.9384, + "step": 73030 + }, + { + "epoch": 4.9622910721565425, + "grad_norm": 0.3774527311325073, + "learning_rate": 3.7997689903519502e-06, + "loss": 4.13, + "step": 73035 + }, + { + "epoch": 4.962630792227205, + "grad_norm": 0.20670612156391144, + "learning_rate": 3.799344340263623e-06, + "loss": 3.9531, + "step": 73040 + }, + { + "epoch": 4.962970512297867, + "grad_norm": 0.2612258195877075, + "learning_rate": 3.7989196901752954e-06, + "loss": 4.1165, + "step": 73045 + }, + { + "epoch": 4.963310232368528, + "grad_norm": 0.2653966546058655, + "learning_rate": 3.7984950400869686e-06, + "loss": 4.0828, + "step": 73050 + }, + { + "epoch": 4.96364995243919, + "grad_norm": 0.2511993944644928, + "learning_rate": 3.7980703899986414e-06, + "loss": 3.7462, + "step": 73055 + }, + { + "epoch": 4.963989672509852, + "grad_norm": 0.26508909463882446, + "learning_rate": 3.7976457399103146e-06, + "loss": 3.7802, + "step": 73060 + }, + { + "epoch": 4.964329392580513, + "grad_norm": 0.23847530782222748, + "learning_rate": 3.797221089821987e-06, + "loss": 3.6962, + "step": 73065 + }, + { + "epoch": 4.964669112651175, + "grad_norm": 0.2894808351993561, + "learning_rate": 3.79679643973366e-06, + "loss": 3.9761, + "step": 73070 + }, + { + "epoch": 4.965008832721837, + "grad_norm": 0.2743089497089386, + "learning_rate": 3.7963717896453326e-06, + "loss": 4.0072, + "step": 73075 + }, + { + "epoch": 4.9653485527924985, + "grad_norm": 0.2227019965648651, + "learning_rate": 3.7959471395570054e-06, + "loss": 3.8901, + "step": 73080 + }, + { + "epoch": 4.965688272863161, + "grad_norm": 0.2003837674856186, + "learning_rate": 3.7955224894686782e-06, + "loss": 3.9124, + "step": 73085 + }, + { + "epoch": 4.966027992933823, + "grad_norm": 0.3230415880680084, + "learning_rate": 3.795097839380351e-06, + "loss": 3.8883, + "step": 73090 + }, + { + "epoch": 4.966367713004484, + "grad_norm": 0.38703563809394836, + "learning_rate": 3.7946731892920234e-06, + "loss": 3.955, + "step": 73095 + }, + { + "epoch": 4.966707433075146, + "grad_norm": 0.3193655014038086, + "learning_rate": 3.7942485392036966e-06, + "loss": 3.8403, + "step": 73100 + }, + { + "epoch": 4.967047153145808, + "grad_norm": 0.19421613216400146, + "learning_rate": 3.7938238891153694e-06, + "loss": 4.0694, + "step": 73105 + }, + { + "epoch": 4.967386873216469, + "grad_norm": 0.34998854994773865, + "learning_rate": 3.793399239027042e-06, + "loss": 3.8955, + "step": 73110 + }, + { + "epoch": 4.967726593287131, + "grad_norm": 0.21815593540668488, + "learning_rate": 3.792974588938715e-06, + "loss": 3.8144, + "step": 73115 + }, + { + "epoch": 4.968066313357793, + "grad_norm": 0.22432149946689606, + "learning_rate": 3.792549938850388e-06, + "loss": 4.0422, + "step": 73120 + }, + { + "epoch": 4.9684060334284545, + "grad_norm": 0.3217407464981079, + "learning_rate": 3.79212528876206e-06, + "loss": 3.9965, + "step": 73125 + }, + { + "epoch": 4.968745753499117, + "grad_norm": 0.2740372121334076, + "learning_rate": 3.791700638673733e-06, + "loss": 3.7578, + "step": 73130 + }, + { + "epoch": 4.969085473569779, + "grad_norm": 0.24675016105175018, + "learning_rate": 3.7912759885854062e-06, + "loss": 3.9223, + "step": 73135 + }, + { + "epoch": 4.96942519364044, + "grad_norm": 0.2315625697374344, + "learning_rate": 3.7908513384970786e-06, + "loss": 4.0052, + "step": 73140 + }, + { + "epoch": 4.969764913711102, + "grad_norm": 0.22374649345874786, + "learning_rate": 3.7904266884087514e-06, + "loss": 3.6394, + "step": 73145 + }, + { + "epoch": 4.970104633781764, + "grad_norm": 0.23898859322071075, + "learning_rate": 3.7900020383204246e-06, + "loss": 4.285, + "step": 73150 + }, + { + "epoch": 4.970444353852425, + "grad_norm": 0.2346973419189453, + "learning_rate": 3.789577388232097e-06, + "loss": 3.9413, + "step": 73155 + }, + { + "epoch": 4.970784073923087, + "grad_norm": 0.24812310934066772, + "learning_rate": 3.78915273814377e-06, + "loss": 3.9642, + "step": 73160 + }, + { + "epoch": 4.971123793993749, + "grad_norm": 0.29282039403915405, + "learning_rate": 3.7887280880554426e-06, + "loss": 4.0515, + "step": 73165 + }, + { + "epoch": 4.971463514064411, + "grad_norm": 0.21851296722888947, + "learning_rate": 3.788303437967115e-06, + "loss": 3.8976, + "step": 73170 + }, + { + "epoch": 4.971803234135073, + "grad_norm": 0.2672637701034546, + "learning_rate": 3.7878787878787882e-06, + "loss": 4.1064, + "step": 73175 + }, + { + "epoch": 4.972142954205735, + "grad_norm": 0.28518572449684143, + "learning_rate": 3.787454137790461e-06, + "loss": 3.9669, + "step": 73180 + }, + { + "epoch": 4.972482674276396, + "grad_norm": 0.3608737587928772, + "learning_rate": 3.7870294877021334e-06, + "loss": 3.9076, + "step": 73185 + }, + { + "epoch": 4.972822394347058, + "grad_norm": 0.24887436628341675, + "learning_rate": 3.7866048376138066e-06, + "loss": 4.0567, + "step": 73190 + }, + { + "epoch": 4.97316211441772, + "grad_norm": 0.2919802963733673, + "learning_rate": 3.7861801875254794e-06, + "loss": 3.9967, + "step": 73195 + }, + { + "epoch": 4.973501834488381, + "grad_norm": 0.2761557102203369, + "learning_rate": 3.785755537437152e-06, + "loss": 4.0083, + "step": 73200 + }, + { + "epoch": 4.973841554559043, + "grad_norm": 0.2524355351924896, + "learning_rate": 3.785330887348825e-06, + "loss": 4.0214, + "step": 73205 + }, + { + "epoch": 4.974181274629705, + "grad_norm": 0.2323675900697708, + "learning_rate": 3.784906237260498e-06, + "loss": 4.0199, + "step": 73210 + }, + { + "epoch": 4.974520994700367, + "grad_norm": 0.21476741135120392, + "learning_rate": 3.78448158717217e-06, + "loss": 3.8018, + "step": 73215 + }, + { + "epoch": 4.974860714771029, + "grad_norm": 0.30108630657196045, + "learning_rate": 3.784056937083843e-06, + "loss": 3.9991, + "step": 73220 + }, + { + "epoch": 4.975200434841691, + "grad_norm": 0.309287965297699, + "learning_rate": 3.7836322869955162e-06, + "loss": 3.8948, + "step": 73225 + }, + { + "epoch": 4.975540154912352, + "grad_norm": 0.30126330256462097, + "learning_rate": 3.783207636907189e-06, + "loss": 3.6537, + "step": 73230 + }, + { + "epoch": 4.975879874983014, + "grad_norm": 0.29185956716537476, + "learning_rate": 3.7827829868188614e-06, + "loss": 3.7691, + "step": 73235 + }, + { + "epoch": 4.976219595053676, + "grad_norm": 0.24073870480060577, + "learning_rate": 3.7823583367305346e-06, + "loss": 3.6466, + "step": 73240 + }, + { + "epoch": 4.976559315124337, + "grad_norm": 0.42914119362831116, + "learning_rate": 3.7819336866422074e-06, + "loss": 3.9899, + "step": 73245 + }, + { + "epoch": 4.976899035194999, + "grad_norm": 0.23558995127677917, + "learning_rate": 3.78150903655388e-06, + "loss": 4.0992, + "step": 73250 + }, + { + "epoch": 4.977238755265661, + "grad_norm": 0.2661004960536957, + "learning_rate": 3.7810843864655526e-06, + "loss": 4.0404, + "step": 73255 + }, + { + "epoch": 4.977578475336323, + "grad_norm": 0.603276789188385, + "learning_rate": 3.780659736377226e-06, + "loss": 4.0495, + "step": 73260 + }, + { + "epoch": 4.977918195406985, + "grad_norm": 0.2076907753944397, + "learning_rate": 3.780235086288898e-06, + "loss": 3.9595, + "step": 73265 + }, + { + "epoch": 4.978257915477647, + "grad_norm": 0.30843886733055115, + "learning_rate": 3.779810436200571e-06, + "loss": 3.8252, + "step": 73270 + }, + { + "epoch": 4.978597635548308, + "grad_norm": 0.38244396448135376, + "learning_rate": 3.7793857861122442e-06, + "loss": 4.0552, + "step": 73275 + }, + { + "epoch": 4.97893735561897, + "grad_norm": 0.2517107129096985, + "learning_rate": 3.7789611360239166e-06, + "loss": 3.7867, + "step": 73280 + }, + { + "epoch": 4.979277075689632, + "grad_norm": 0.3060448169708252, + "learning_rate": 3.7785364859355894e-06, + "loss": 3.9044, + "step": 73285 + }, + { + "epoch": 4.979616795760293, + "grad_norm": 0.24211572110652924, + "learning_rate": 3.778111835847262e-06, + "loss": 3.6766, + "step": 73290 + }, + { + "epoch": 4.979956515830955, + "grad_norm": 0.40187305212020874, + "learning_rate": 3.7776871857589346e-06, + "loss": 3.8314, + "step": 73295 + }, + { + "epoch": 4.980296235901617, + "grad_norm": 0.22891418635845184, + "learning_rate": 3.777262535670608e-06, + "loss": 3.9301, + "step": 73300 + }, + { + "epoch": 4.980635955972279, + "grad_norm": 0.2537685036659241, + "learning_rate": 3.7768378855822806e-06, + "loss": 3.8589, + "step": 73305 + }, + { + "epoch": 4.980975676042941, + "grad_norm": 0.21190419793128967, + "learning_rate": 3.776413235493953e-06, + "loss": 4.1763, + "step": 73310 + }, + { + "epoch": 4.981315396113603, + "grad_norm": 0.4381144642829895, + "learning_rate": 3.7759885854056262e-06, + "loss": 3.9433, + "step": 73315 + }, + { + "epoch": 4.981655116184264, + "grad_norm": 0.2936289608478546, + "learning_rate": 3.775563935317299e-06, + "loss": 4.0627, + "step": 73320 + }, + { + "epoch": 4.981994836254926, + "grad_norm": 0.2682899832725525, + "learning_rate": 3.7751392852289714e-06, + "loss": 3.8564, + "step": 73325 + }, + { + "epoch": 4.982334556325588, + "grad_norm": 0.25794780254364014, + "learning_rate": 3.774714635140644e-06, + "loss": 3.8444, + "step": 73330 + }, + { + "epoch": 4.982674276396249, + "grad_norm": 0.24907664954662323, + "learning_rate": 3.7742899850523174e-06, + "loss": 3.8837, + "step": 73335 + }, + { + "epoch": 4.983013996466911, + "grad_norm": 0.2767338156700134, + "learning_rate": 3.77386533496399e-06, + "loss": 3.9682, + "step": 73340 + }, + { + "epoch": 4.983353716537573, + "grad_norm": 0.4821071922779083, + "learning_rate": 3.7734406848756626e-06, + "loss": 3.7218, + "step": 73345 + }, + { + "epoch": 4.983693436608235, + "grad_norm": 0.27874496579170227, + "learning_rate": 3.773016034787336e-06, + "loss": 3.9609, + "step": 73350 + }, + { + "epoch": 4.984033156678897, + "grad_norm": 0.22987107932567596, + "learning_rate": 3.772591384699008e-06, + "loss": 3.7978, + "step": 73355 + }, + { + "epoch": 4.984372876749559, + "grad_norm": 0.3957061171531677, + "learning_rate": 3.772166734610681e-06, + "loss": 3.8764, + "step": 73360 + }, + { + "epoch": 4.98471259682022, + "grad_norm": 0.26241394877433777, + "learning_rate": 3.7717420845223542e-06, + "loss": 4.0955, + "step": 73365 + }, + { + "epoch": 4.985052316890882, + "grad_norm": 0.21132336556911469, + "learning_rate": 3.7713174344340266e-06, + "loss": 3.9098, + "step": 73370 + }, + { + "epoch": 4.985392036961544, + "grad_norm": 0.261549174785614, + "learning_rate": 3.7708927843456994e-06, + "loss": 4.0235, + "step": 73375 + }, + { + "epoch": 4.985731757032205, + "grad_norm": 0.3427600860595703, + "learning_rate": 3.770468134257372e-06, + "loss": 3.7793, + "step": 73380 + }, + { + "epoch": 4.986071477102867, + "grad_norm": 0.289266973733902, + "learning_rate": 3.7700434841690446e-06, + "loss": 3.9846, + "step": 73385 + }, + { + "epoch": 4.986411197173529, + "grad_norm": 0.29205429553985596, + "learning_rate": 3.769618834080718e-06, + "loss": 3.8193, + "step": 73390 + }, + { + "epoch": 4.986750917244191, + "grad_norm": 0.2818363606929779, + "learning_rate": 3.7691941839923906e-06, + "loss": 3.808, + "step": 73395 + }, + { + "epoch": 4.987090637314853, + "grad_norm": 0.32143908739089966, + "learning_rate": 3.768769533904064e-06, + "loss": 3.889, + "step": 73400 + }, + { + "epoch": 4.987430357385515, + "grad_norm": 0.31359609961509705, + "learning_rate": 3.768344883815736e-06, + "loss": 3.8133, + "step": 73405 + }, + { + "epoch": 4.987770077456176, + "grad_norm": 0.24145111441612244, + "learning_rate": 3.767920233727409e-06, + "loss": 3.9953, + "step": 73410 + }, + { + "epoch": 4.988109797526838, + "grad_norm": 0.27372947335243225, + "learning_rate": 3.767495583639082e-06, + "loss": 3.7591, + "step": 73415 + }, + { + "epoch": 4.9884495175975, + "grad_norm": 0.25799304246902466, + "learning_rate": 3.767070933550754e-06, + "loss": 4.136, + "step": 73420 + }, + { + "epoch": 4.988789237668161, + "grad_norm": 0.30435702204704285, + "learning_rate": 3.7666462834624274e-06, + "loss": 4.071, + "step": 73425 + }, + { + "epoch": 4.989128957738823, + "grad_norm": 0.4738897681236267, + "learning_rate": 3.7662216333741e-06, + "loss": 3.8515, + "step": 73430 + }, + { + "epoch": 4.989468677809485, + "grad_norm": 0.2511361837387085, + "learning_rate": 3.7657969832857726e-06, + "loss": 4.0068, + "step": 73435 + }, + { + "epoch": 4.989808397880147, + "grad_norm": 0.21140645444393158, + "learning_rate": 3.765372333197446e-06, + "loss": 3.8896, + "step": 73440 + }, + { + "epoch": 4.990148117950809, + "grad_norm": 0.33759206533432007, + "learning_rate": 3.7649476831091186e-06, + "loss": 3.9834, + "step": 73445 + }, + { + "epoch": 4.990487838021471, + "grad_norm": 0.1962691843509674, + "learning_rate": 3.764523033020791e-06, + "loss": 3.7693, + "step": 73450 + }, + { + "epoch": 4.990827558092132, + "grad_norm": 0.31248170137405396, + "learning_rate": 3.7640983829324638e-06, + "loss": 3.9977, + "step": 73455 + }, + { + "epoch": 4.991167278162794, + "grad_norm": 0.26456475257873535, + "learning_rate": 3.763673732844137e-06, + "loss": 4.0984, + "step": 73460 + }, + { + "epoch": 4.991506998233456, + "grad_norm": 0.3341391682624817, + "learning_rate": 3.7632490827558094e-06, + "loss": 3.6292, + "step": 73465 + }, + { + "epoch": 4.991846718304117, + "grad_norm": 0.2847176194190979, + "learning_rate": 3.762824432667482e-06, + "loss": 3.9224, + "step": 73470 + }, + { + "epoch": 4.992186438374779, + "grad_norm": 0.3298454284667969, + "learning_rate": 3.7623997825791554e-06, + "loss": 3.8598, + "step": 73475 + }, + { + "epoch": 4.9925261584454415, + "grad_norm": 0.23967184126377106, + "learning_rate": 3.761975132490828e-06, + "loss": 3.9761, + "step": 73480 + }, + { + "epoch": 4.992865878516103, + "grad_norm": 0.21437209844589233, + "learning_rate": 3.7615504824025006e-06, + "loss": 3.9863, + "step": 73485 + }, + { + "epoch": 4.993205598586765, + "grad_norm": 0.2937350571155548, + "learning_rate": 3.761125832314174e-06, + "loss": 3.923, + "step": 73490 + }, + { + "epoch": 4.993545318657426, + "grad_norm": 0.20203034579753876, + "learning_rate": 3.760701182225846e-06, + "loss": 4.0113, + "step": 73495 + }, + { + "epoch": 4.993885038728088, + "grad_norm": 0.2533280849456787, + "learning_rate": 3.760276532137519e-06, + "loss": 3.7486, + "step": 73500 + }, + { + "epoch": 4.99422475879875, + "grad_norm": 0.32854101061820984, + "learning_rate": 3.759851882049192e-06, + "loss": 3.9611, + "step": 73505 + }, + { + "epoch": 4.994564478869411, + "grad_norm": 0.24207378923892975, + "learning_rate": 3.759427231960864e-06, + "loss": 4.1385, + "step": 73510 + }, + { + "epoch": 4.994904198940073, + "grad_norm": 0.47537165880203247, + "learning_rate": 3.7590025818725374e-06, + "loss": 4.1001, + "step": 73515 + }, + { + "epoch": 4.995243919010735, + "grad_norm": 0.30486345291137695, + "learning_rate": 3.75857793178421e-06, + "loss": 3.9576, + "step": 73520 + }, + { + "epoch": 4.995583639081397, + "grad_norm": 0.2663879096508026, + "learning_rate": 3.7581532816958826e-06, + "loss": 3.9473, + "step": 73525 + }, + { + "epoch": 4.995923359152059, + "grad_norm": 0.3651159703731537, + "learning_rate": 3.757728631607556e-06, + "loss": 4.1227, + "step": 73530 + }, + { + "epoch": 4.996263079222721, + "grad_norm": 0.33506783843040466, + "learning_rate": 3.7573039815192286e-06, + "loss": 3.9841, + "step": 73535 + }, + { + "epoch": 4.996602799293382, + "grad_norm": 0.24105830490589142, + "learning_rate": 3.756879331430901e-06, + "loss": 3.8857, + "step": 73540 + }, + { + "epoch": 4.996942519364044, + "grad_norm": 0.26480409502983093, + "learning_rate": 3.7564546813425738e-06, + "loss": 4.0752, + "step": 73545 + }, + { + "epoch": 4.997282239434706, + "grad_norm": 0.3050229847431183, + "learning_rate": 3.756030031254247e-06, + "loss": 4.0623, + "step": 73550 + }, + { + "epoch": 4.997621959505367, + "grad_norm": 0.24427156150341034, + "learning_rate": 3.7556053811659194e-06, + "loss": 3.7838, + "step": 73555 + }, + { + "epoch": 4.997961679576029, + "grad_norm": 0.24994361400604248, + "learning_rate": 3.755180731077592e-06, + "loss": 4.0668, + "step": 73560 + }, + { + "epoch": 4.998301399646691, + "grad_norm": 0.3821250796318054, + "learning_rate": 3.7547560809892654e-06, + "loss": 3.8559, + "step": 73565 + }, + { + "epoch": 4.998641119717353, + "grad_norm": 0.2907712757587433, + "learning_rate": 3.754331430900938e-06, + "loss": 4.0648, + "step": 73570 + }, + { + "epoch": 4.998980839788015, + "grad_norm": 0.29295670986175537, + "learning_rate": 3.7539067808126106e-06, + "loss": 3.9281, + "step": 73575 + }, + { + "epoch": 4.999320559858677, + "grad_norm": 0.2896459996700287, + "learning_rate": 3.7534821307242834e-06, + "loss": 3.9045, + "step": 73580 + }, + { + "epoch": 4.999660279929338, + "grad_norm": 0.31872427463531494, + "learning_rate": 3.7530574806359566e-06, + "loss": 4.2439, + "step": 73585 + }, + { + "epoch": 5.0, + "grad_norm": 0.6943742036819458, + "learning_rate": 3.752632830547629e-06, + "loss": 3.8509, + "step": 73590 + }, + { + "epoch": 5.0, + "eval_bertscore": { + "f1": 0.8404207238509611, + "precision": 0.848573293042675, + "recall": 0.8332806290010459 + }, + "eval_bleu_4": 0.004686798150795807, + "eval_exact_match": 0.0, + "eval_loss": 3.760023832321167, + "eval_meteor": 0.0822651218423652, + "eval_rouge": { + "rouge1": 0.13153058788023292, + "rouge2": 0.013424025424514752, + "rougeL": 0.10865798586004163, + "rougeLsum": 0.1086885257523979 + }, + "eval_runtime": 273.6061, + "eval_samples_per_second": 37.715, + "eval_steps_per_second": 4.715, + "step": 73590 + }, + { + "epoch": 5.000339720070662, + "grad_norm": 0.27186086773872375, + "learning_rate": 3.7522081804593018e-06, + "loss": 4.011, + "step": 73595 + }, + { + "epoch": 5.000679440141323, + "grad_norm": 0.23002927005290985, + "learning_rate": 3.751783530370975e-06, + "loss": 3.8445, + "step": 73600 + }, + { + "epoch": 5.001019160211985, + "grad_norm": 0.23797447979450226, + "learning_rate": 3.7513588802826474e-06, + "loss": 3.9628, + "step": 73605 + }, + { + "epoch": 5.001358880282647, + "grad_norm": 0.42946696281433105, + "learning_rate": 3.75093423019432e-06, + "loss": 4.064, + "step": 73610 + }, + { + "epoch": 5.001698600353309, + "grad_norm": 0.3028658330440521, + "learning_rate": 3.750509580105993e-06, + "loss": 4.2054, + "step": 73615 + }, + { + "epoch": 5.002038320423971, + "grad_norm": 0.2631664574146271, + "learning_rate": 3.750084930017666e-06, + "loss": 3.9346, + "step": 73620 + }, + { + "epoch": 5.002378040494633, + "grad_norm": 0.24181854724884033, + "learning_rate": 3.7496602799293386e-06, + "loss": 3.9775, + "step": 73625 + }, + { + "epoch": 5.002717760565294, + "grad_norm": 0.23782284557819366, + "learning_rate": 3.7492356298410114e-06, + "loss": 3.8345, + "step": 73630 + }, + { + "epoch": 5.003057480635956, + "grad_norm": 0.2402426153421402, + "learning_rate": 3.7488109797526838e-06, + "loss": 4.0287, + "step": 73635 + }, + { + "epoch": 5.003397200706618, + "grad_norm": 0.37009692192077637, + "learning_rate": 3.748386329664357e-06, + "loss": 3.9575, + "step": 73640 + }, + { + "epoch": 5.003736920777279, + "grad_norm": 0.2534191310405731, + "learning_rate": 3.74796167957603e-06, + "loss": 3.9993, + "step": 73645 + }, + { + "epoch": 5.004076640847941, + "grad_norm": 0.29619404673576355, + "learning_rate": 3.747537029487702e-06, + "loss": 4.046, + "step": 73650 + }, + { + "epoch": 5.004416360918603, + "grad_norm": 0.24590133130550385, + "learning_rate": 3.7471123793993754e-06, + "loss": 3.8633, + "step": 73655 + }, + { + "epoch": 5.004756080989265, + "grad_norm": 0.5594887137413025, + "learning_rate": 3.746687729311048e-06, + "loss": 3.9709, + "step": 73660 + }, + { + "epoch": 5.005095801059927, + "grad_norm": 0.2481682300567627, + "learning_rate": 3.7462630792227206e-06, + "loss": 3.9416, + "step": 73665 + }, + { + "epoch": 5.005435521130589, + "grad_norm": 0.31997761130332947, + "learning_rate": 3.7458384291343934e-06, + "loss": 3.6885, + "step": 73670 + }, + { + "epoch": 5.00577524120125, + "grad_norm": 0.2163345068693161, + "learning_rate": 3.7454137790460666e-06, + "loss": 3.8528, + "step": 73675 + }, + { + "epoch": 5.006114961271912, + "grad_norm": 0.2542116641998291, + "learning_rate": 3.744989128957739e-06, + "loss": 3.9868, + "step": 73680 + }, + { + "epoch": 5.006454681342574, + "grad_norm": 0.25063851475715637, + "learning_rate": 3.7445644788694118e-06, + "loss": 4.0791, + "step": 73685 + }, + { + "epoch": 5.006794401413235, + "grad_norm": 0.247017040848732, + "learning_rate": 3.744139828781085e-06, + "loss": 3.6641, + "step": 73690 + }, + { + "epoch": 5.007134121483897, + "grad_norm": 0.36390218138694763, + "learning_rate": 3.7437151786927574e-06, + "loss": 3.935, + "step": 73695 + }, + { + "epoch": 5.007473841554559, + "grad_norm": 0.31543388962745667, + "learning_rate": 3.74329052860443e-06, + "loss": 3.8057, + "step": 73700 + }, + { + "epoch": 5.007813561625221, + "grad_norm": 0.2567213773727417, + "learning_rate": 3.742865878516103e-06, + "loss": 3.7334, + "step": 73705 + }, + { + "epoch": 5.008153281695883, + "grad_norm": 0.31769007444381714, + "learning_rate": 3.7424412284277758e-06, + "loss": 3.9759, + "step": 73710 + }, + { + "epoch": 5.008493001766545, + "grad_norm": 0.2639865577220917, + "learning_rate": 3.7420165783394486e-06, + "loss": 3.9708, + "step": 73715 + }, + { + "epoch": 5.008832721837206, + "grad_norm": 0.22664619982242584, + "learning_rate": 3.7415919282511214e-06, + "loss": 4.0047, + "step": 73720 + }, + { + "epoch": 5.009172441907868, + "grad_norm": 0.23519571125507355, + "learning_rate": 3.7411672781627938e-06, + "loss": 3.8576, + "step": 73725 + }, + { + "epoch": 5.00951216197853, + "grad_norm": 0.2633205056190491, + "learning_rate": 3.740742628074467e-06, + "loss": 3.8945, + "step": 73730 + }, + { + "epoch": 5.009851882049191, + "grad_norm": 0.21902745962142944, + "learning_rate": 3.7403179779861398e-06, + "loss": 3.943, + "step": 73735 + }, + { + "epoch": 5.010191602119853, + "grad_norm": 0.21826273202896118, + "learning_rate": 3.739893327897812e-06, + "loss": 3.9438, + "step": 73740 + }, + { + "epoch": 5.0105313221905154, + "grad_norm": 0.26890140771865845, + "learning_rate": 3.7394686778094854e-06, + "loss": 3.8215, + "step": 73745 + }, + { + "epoch": 5.010871042261177, + "grad_norm": 0.22789490222930908, + "learning_rate": 3.739044027721158e-06, + "loss": 3.6892, + "step": 73750 + }, + { + "epoch": 5.011210762331839, + "grad_norm": 0.2774588167667389, + "learning_rate": 3.738619377632831e-06, + "loss": 3.9715, + "step": 73755 + }, + { + "epoch": 5.0115504824025, + "grad_norm": 0.3092650771141052, + "learning_rate": 3.7381947275445034e-06, + "loss": 3.9935, + "step": 73760 + }, + { + "epoch": 5.011890202473162, + "grad_norm": 0.34799787402153015, + "learning_rate": 3.7377700774561766e-06, + "loss": 4.0007, + "step": 73765 + }, + { + "epoch": 5.012229922543824, + "grad_norm": 0.29239434003829956, + "learning_rate": 3.7373454273678494e-06, + "loss": 3.8046, + "step": 73770 + }, + { + "epoch": 5.012569642614485, + "grad_norm": 0.28674888610839844, + "learning_rate": 3.7369207772795218e-06, + "loss": 3.78, + "step": 73775 + }, + { + "epoch": 5.012909362685147, + "grad_norm": 0.31073787808418274, + "learning_rate": 3.736496127191195e-06, + "loss": 4.1217, + "step": 73780 + }, + { + "epoch": 5.013249082755809, + "grad_norm": 0.23948264122009277, + "learning_rate": 3.736071477102868e-06, + "loss": 3.8599, + "step": 73785 + }, + { + "epoch": 5.013588802826471, + "grad_norm": 0.2925223112106323, + "learning_rate": 3.73564682701454e-06, + "loss": 3.9927, + "step": 73790 + }, + { + "epoch": 5.013928522897133, + "grad_norm": 0.318087637424469, + "learning_rate": 3.735222176926213e-06, + "loss": 4.1166, + "step": 73795 + }, + { + "epoch": 5.014268242967795, + "grad_norm": 0.2737840712070465, + "learning_rate": 3.734797526837886e-06, + "loss": 3.8746, + "step": 73800 + }, + { + "epoch": 5.014607963038456, + "grad_norm": 0.243369922041893, + "learning_rate": 3.7343728767495586e-06, + "loss": 3.8172, + "step": 73805 + }, + { + "epoch": 5.014947683109118, + "grad_norm": 0.2792392671108246, + "learning_rate": 3.7339482266612314e-06, + "loss": 3.8883, + "step": 73810 + }, + { + "epoch": 5.01528740317978, + "grad_norm": 0.33674028515815735, + "learning_rate": 3.7335235765729046e-06, + "loss": 3.8703, + "step": 73815 + }, + { + "epoch": 5.015627123250441, + "grad_norm": 0.2915382385253906, + "learning_rate": 3.733098926484577e-06, + "loss": 4.0326, + "step": 73820 + }, + { + "epoch": 5.015966843321103, + "grad_norm": 0.33817923069000244, + "learning_rate": 3.7326742763962498e-06, + "loss": 3.7461, + "step": 73825 + }, + { + "epoch": 5.016306563391765, + "grad_norm": 0.2625061273574829, + "learning_rate": 3.7322496263079226e-06, + "loss": 3.8395, + "step": 73830 + }, + { + "epoch": 5.016646283462427, + "grad_norm": 0.2785623073577881, + "learning_rate": 3.7318249762195954e-06, + "loss": 4.2616, + "step": 73835 + }, + { + "epoch": 5.016986003533089, + "grad_norm": 0.23575465381145477, + "learning_rate": 3.731400326131268e-06, + "loss": 3.8699, + "step": 73840 + }, + { + "epoch": 5.017325723603751, + "grad_norm": 0.3013189136981964, + "learning_rate": 3.730975676042941e-06, + "loss": 3.763, + "step": 73845 + }, + { + "epoch": 5.017665443674412, + "grad_norm": 0.2314954251050949, + "learning_rate": 3.7305510259546134e-06, + "loss": 3.9112, + "step": 73850 + }, + { + "epoch": 5.018005163745074, + "grad_norm": 0.26256000995635986, + "learning_rate": 3.7301263758662866e-06, + "loss": 4.0403, + "step": 73855 + }, + { + "epoch": 5.018344883815736, + "grad_norm": 0.19585460424423218, + "learning_rate": 3.7297017257779594e-06, + "loss": 3.7504, + "step": 73860 + }, + { + "epoch": 5.018684603886397, + "grad_norm": 0.29996994137763977, + "learning_rate": 3.7292770756896318e-06, + "loss": 4.0616, + "step": 73865 + }, + { + "epoch": 5.019024323957059, + "grad_norm": 0.24737368524074554, + "learning_rate": 3.728852425601305e-06, + "loss": 3.865, + "step": 73870 + }, + { + "epoch": 5.019364044027721, + "grad_norm": 0.39177706837654114, + "learning_rate": 3.7284277755129778e-06, + "loss": 3.9005, + "step": 73875 + }, + { + "epoch": 5.019703764098383, + "grad_norm": 0.28160572052001953, + "learning_rate": 3.72800312542465e-06, + "loss": 3.8616, + "step": 73880 + }, + { + "epoch": 5.020043484169045, + "grad_norm": 0.3062175214290619, + "learning_rate": 3.727578475336323e-06, + "loss": 3.693, + "step": 73885 + }, + { + "epoch": 5.020383204239707, + "grad_norm": 0.28116166591644287, + "learning_rate": 3.727153825247996e-06, + "loss": 4.064, + "step": 73890 + }, + { + "epoch": 5.020722924310368, + "grad_norm": 0.35119086503982544, + "learning_rate": 3.7267291751596686e-06, + "loss": 3.8015, + "step": 73895 + }, + { + "epoch": 5.02106264438103, + "grad_norm": 0.3687555491924286, + "learning_rate": 3.7263045250713414e-06, + "loss": 4.1171, + "step": 73900 + }, + { + "epoch": 5.021402364451692, + "grad_norm": 0.25553208589553833, + "learning_rate": 3.7258798749830146e-06, + "loss": 3.91, + "step": 73905 + }, + { + "epoch": 5.021742084522353, + "grad_norm": 0.28586503863334656, + "learning_rate": 3.725455224894687e-06, + "loss": 3.9438, + "step": 73910 + }, + { + "epoch": 5.022081804593015, + "grad_norm": 0.32989680767059326, + "learning_rate": 3.7250305748063598e-06, + "loss": 4.158, + "step": 73915 + }, + { + "epoch": 5.022421524663677, + "grad_norm": 0.20687799155712128, + "learning_rate": 3.7246059247180326e-06, + "loss": 3.9653, + "step": 73920 + }, + { + "epoch": 5.022761244734339, + "grad_norm": 0.296995609998703, + "learning_rate": 3.724181274629706e-06, + "loss": 3.8591, + "step": 73925 + }, + { + "epoch": 5.023100964805001, + "grad_norm": 0.2164531648159027, + "learning_rate": 3.723756624541378e-06, + "loss": 3.7119, + "step": 73930 + }, + { + "epoch": 5.023440684875663, + "grad_norm": 0.5389034152030945, + "learning_rate": 3.723331974453051e-06, + "loss": 3.9371, + "step": 73935 + }, + { + "epoch": 5.023780404946324, + "grad_norm": 0.25004687905311584, + "learning_rate": 3.722907324364724e-06, + "loss": 3.6907, + "step": 73940 + }, + { + "epoch": 5.024120125016986, + "grad_norm": 0.20980308949947357, + "learning_rate": 3.7224826742763966e-06, + "loss": 3.7429, + "step": 73945 + }, + { + "epoch": 5.024459845087648, + "grad_norm": 0.21495668590068817, + "learning_rate": 3.7220580241880694e-06, + "loss": 3.9522, + "step": 73950 + }, + { + "epoch": 5.024799565158309, + "grad_norm": 0.2698010206222534, + "learning_rate": 3.721633374099742e-06, + "loss": 3.655, + "step": 73955 + }, + { + "epoch": 5.025139285228971, + "grad_norm": 0.2428743988275528, + "learning_rate": 3.7212087240114145e-06, + "loss": 3.9417, + "step": 73960 + }, + { + "epoch": 5.025479005299633, + "grad_norm": 0.24997514486312866, + "learning_rate": 3.7207840739230878e-06, + "loss": 4.083, + "step": 73965 + }, + { + "epoch": 5.025818725370295, + "grad_norm": 0.28064000606536865, + "learning_rate": 3.7203594238347606e-06, + "loss": 3.8857, + "step": 73970 + }, + { + "epoch": 5.026158445440957, + "grad_norm": 0.2194732278585434, + "learning_rate": 3.719934773746433e-06, + "loss": 3.9633, + "step": 73975 + }, + { + "epoch": 5.026498165511619, + "grad_norm": 0.2853817939758301, + "learning_rate": 3.719510123658106e-06, + "loss": 4.004, + "step": 73980 + }, + { + "epoch": 5.02683788558228, + "grad_norm": 0.2438647747039795, + "learning_rate": 3.719085473569779e-06, + "loss": 3.8689, + "step": 73985 + }, + { + "epoch": 5.027177605652942, + "grad_norm": 0.26800113916397095, + "learning_rate": 3.7186608234814514e-06, + "loss": 3.8381, + "step": 73990 + }, + { + "epoch": 5.027517325723604, + "grad_norm": 0.20573167502880096, + "learning_rate": 3.7182361733931246e-06, + "loss": 3.9246, + "step": 73995 + }, + { + "epoch": 5.027857045794265, + "grad_norm": 0.25508517026901245, + "learning_rate": 3.7178115233047974e-06, + "loss": 3.7103, + "step": 74000 + }, + { + "epoch": 5.028196765864927, + "grad_norm": 0.270432710647583, + "learning_rate": 3.7173868732164698e-06, + "loss": 4.0976, + "step": 74005 + }, + { + "epoch": 5.028536485935589, + "grad_norm": 0.3285699188709259, + "learning_rate": 3.7169622231281426e-06, + "loss": 3.8719, + "step": 74010 + }, + { + "epoch": 5.028876206006251, + "grad_norm": 0.32187312841415405, + "learning_rate": 3.7165375730398158e-06, + "loss": 3.8361, + "step": 74015 + }, + { + "epoch": 5.029215926076913, + "grad_norm": 0.4336921274662018, + "learning_rate": 3.716112922951488e-06, + "loss": 3.7699, + "step": 74020 + }, + { + "epoch": 5.029555646147575, + "grad_norm": 0.28895413875579834, + "learning_rate": 3.715688272863161e-06, + "loss": 4.2878, + "step": 74025 + }, + { + "epoch": 5.029895366218236, + "grad_norm": 0.39816492795944214, + "learning_rate": 3.715263622774834e-06, + "loss": 4.1089, + "step": 74030 + }, + { + "epoch": 5.030235086288898, + "grad_norm": 0.5124291777610779, + "learning_rate": 3.7148389726865066e-06, + "loss": 3.8648, + "step": 74035 + }, + { + "epoch": 5.03057480635956, + "grad_norm": 0.2797344923019409, + "learning_rate": 3.7144143225981794e-06, + "loss": 3.883, + "step": 74040 + }, + { + "epoch": 5.030914526430221, + "grad_norm": 0.2852979898452759, + "learning_rate": 3.713989672509852e-06, + "loss": 3.6694, + "step": 74045 + }, + { + "epoch": 5.031254246500883, + "grad_norm": 0.3395652174949646, + "learning_rate": 3.7135650224215245e-06, + "loss": 4.1462, + "step": 74050 + }, + { + "epoch": 5.0315939665715455, + "grad_norm": 0.2508406937122345, + "learning_rate": 3.7131403723331978e-06, + "loss": 3.9298, + "step": 74055 + }, + { + "epoch": 5.031933686642207, + "grad_norm": 0.3352184593677521, + "learning_rate": 3.7127157222448706e-06, + "loss": 3.8095, + "step": 74060 + }, + { + "epoch": 5.032273406712869, + "grad_norm": 0.2556568682193756, + "learning_rate": 3.712291072156543e-06, + "loss": 3.9688, + "step": 74065 + }, + { + "epoch": 5.032613126783531, + "grad_norm": 0.35148105025291443, + "learning_rate": 3.711866422068216e-06, + "loss": 3.7912, + "step": 74070 + }, + { + "epoch": 5.032952846854192, + "grad_norm": 0.3480895757675171, + "learning_rate": 3.711441771979889e-06, + "loss": 3.7516, + "step": 74075 + }, + { + "epoch": 5.033292566924854, + "grad_norm": 0.3091602623462677, + "learning_rate": 3.7110171218915613e-06, + "loss": 3.9474, + "step": 74080 + }, + { + "epoch": 5.033632286995516, + "grad_norm": 0.284646213054657, + "learning_rate": 3.710592471803234e-06, + "loss": 4.0168, + "step": 74085 + }, + { + "epoch": 5.033972007066177, + "grad_norm": 0.3053220808506012, + "learning_rate": 3.7101678217149074e-06, + "loss": 4.0097, + "step": 74090 + }, + { + "epoch": 5.034311727136839, + "grad_norm": 0.26631754636764526, + "learning_rate": 3.70974317162658e-06, + "loss": 4.1889, + "step": 74095 + }, + { + "epoch": 5.0346514472075015, + "grad_norm": 0.3222731053829193, + "learning_rate": 3.7093185215382525e-06, + "loss": 4.0995, + "step": 74100 + }, + { + "epoch": 5.034991167278163, + "grad_norm": 0.39262232184410095, + "learning_rate": 3.7088938714499258e-06, + "loss": 4.014, + "step": 74105 + }, + { + "epoch": 5.035330887348825, + "grad_norm": 0.3347574770450592, + "learning_rate": 3.7084692213615986e-06, + "loss": 4.033, + "step": 74110 + }, + { + "epoch": 5.035670607419486, + "grad_norm": 0.24781467020511627, + "learning_rate": 3.708044571273271e-06, + "loss": 4.1348, + "step": 74115 + }, + { + "epoch": 5.036010327490148, + "grad_norm": 0.1858079582452774, + "learning_rate": 3.7076199211849437e-06, + "loss": 3.5442, + "step": 74120 + }, + { + "epoch": 5.03635004756081, + "grad_norm": 0.22509200870990753, + "learning_rate": 3.707195271096617e-06, + "loss": 3.9773, + "step": 74125 + }, + { + "epoch": 5.036689767631471, + "grad_norm": 0.26300233602523804, + "learning_rate": 3.7067706210082894e-06, + "loss": 4.0644, + "step": 74130 + }, + { + "epoch": 5.037029487702133, + "grad_norm": 0.2926155626773834, + "learning_rate": 3.706345970919962e-06, + "loss": 3.8755, + "step": 74135 + }, + { + "epoch": 5.037369207772795, + "grad_norm": 0.3040692210197449, + "learning_rate": 3.7059213208316354e-06, + "loss": 4.0306, + "step": 74140 + }, + { + "epoch": 5.037708927843457, + "grad_norm": 0.3922957479953766, + "learning_rate": 3.7054966707433078e-06, + "loss": 4.0165, + "step": 74145 + }, + { + "epoch": 5.038048647914119, + "grad_norm": 0.19151031970977783, + "learning_rate": 3.7050720206549806e-06, + "loss": 3.856, + "step": 74150 + }, + { + "epoch": 5.038388367984781, + "grad_norm": 0.20363330841064453, + "learning_rate": 3.7046473705666538e-06, + "loss": 3.9631, + "step": 74155 + }, + { + "epoch": 5.038728088055442, + "grad_norm": 0.32280707359313965, + "learning_rate": 3.704222720478326e-06, + "loss": 3.7224, + "step": 74160 + }, + { + "epoch": 5.039067808126104, + "grad_norm": 0.2767835557460785, + "learning_rate": 3.703798070389999e-06, + "loss": 4.2782, + "step": 74165 + }, + { + "epoch": 5.039407528196766, + "grad_norm": 0.34384068846702576, + "learning_rate": 3.7033734203016718e-06, + "loss": 4.0015, + "step": 74170 + }, + { + "epoch": 5.039747248267427, + "grad_norm": 0.3849162757396698, + "learning_rate": 3.702948770213344e-06, + "loss": 4.2809, + "step": 74175 + }, + { + "epoch": 5.040086968338089, + "grad_norm": 0.2729640007019043, + "learning_rate": 3.7025241201250174e-06, + "loss": 3.7364, + "step": 74180 + }, + { + "epoch": 5.040426688408751, + "grad_norm": 0.21407407522201538, + "learning_rate": 3.70209947003669e-06, + "loss": 4.0443, + "step": 74185 + }, + { + "epoch": 5.040766408479413, + "grad_norm": 0.21626026928424835, + "learning_rate": 3.7016748199483625e-06, + "loss": 3.8863, + "step": 74190 + }, + { + "epoch": 5.041106128550075, + "grad_norm": 0.2592843174934387, + "learning_rate": 3.7012501698600358e-06, + "loss": 3.8527, + "step": 74195 + }, + { + "epoch": 5.041445848620737, + "grad_norm": 0.34153544902801514, + "learning_rate": 3.7008255197717086e-06, + "loss": 4.149, + "step": 74200 + }, + { + "epoch": 5.041785568691398, + "grad_norm": 0.20026524364948273, + "learning_rate": 3.700400869683381e-06, + "loss": 3.7591, + "step": 74205 + }, + { + "epoch": 5.04212528876206, + "grad_norm": 0.3534489870071411, + "learning_rate": 3.6999762195950537e-06, + "loss": 4.2255, + "step": 74210 + }, + { + "epoch": 5.042465008832722, + "grad_norm": 0.23711930215358734, + "learning_rate": 3.699551569506727e-06, + "loss": 3.916, + "step": 74215 + }, + { + "epoch": 5.042804728903383, + "grad_norm": 0.26488518714904785, + "learning_rate": 3.6991269194183993e-06, + "loss": 4.0461, + "step": 74220 + }, + { + "epoch": 5.043144448974045, + "grad_norm": 0.29380249977111816, + "learning_rate": 3.698702269330072e-06, + "loss": 4.027, + "step": 74225 + }, + { + "epoch": 5.043484169044707, + "grad_norm": 0.27672842144966125, + "learning_rate": 3.6982776192417454e-06, + "loss": 4.1456, + "step": 74230 + }, + { + "epoch": 5.043823889115369, + "grad_norm": 0.26540011167526245, + "learning_rate": 3.6978529691534177e-06, + "loss": 3.5797, + "step": 74235 + }, + { + "epoch": 5.044163609186031, + "grad_norm": 0.3571341037750244, + "learning_rate": 3.6974283190650905e-06, + "loss": 3.6659, + "step": 74240 + }, + { + "epoch": 5.044503329256693, + "grad_norm": 0.3155442476272583, + "learning_rate": 3.697088598994429e-06, + "loss": 4.0086, + "step": 74245 + }, + { + "epoch": 5.044843049327354, + "grad_norm": 0.2799925208091736, + "learning_rate": 3.696663948906102e-06, + "loss": 3.8129, + "step": 74250 + }, + { + "epoch": 5.045182769398016, + "grad_norm": 0.2456476390361786, + "learning_rate": 3.6962392988177742e-06, + "loss": 4.0824, + "step": 74255 + }, + { + "epoch": 5.045522489468678, + "grad_norm": 0.2705201208591461, + "learning_rate": 3.695814648729447e-06, + "loss": 3.8996, + "step": 74260 + }, + { + "epoch": 5.045862209539339, + "grad_norm": 0.27157336473464966, + "learning_rate": 3.6953899986411202e-06, + "loss": 4.07, + "step": 74265 + }, + { + "epoch": 5.046201929610001, + "grad_norm": 0.23283597826957703, + "learning_rate": 3.6949653485527926e-06, + "loss": 3.858, + "step": 74270 + }, + { + "epoch": 5.046541649680663, + "grad_norm": 0.2662910223007202, + "learning_rate": 3.6945406984644654e-06, + "loss": 3.9727, + "step": 74275 + }, + { + "epoch": 5.046881369751325, + "grad_norm": 0.2677544355392456, + "learning_rate": 3.6941160483761386e-06, + "loss": 3.6647, + "step": 74280 + }, + { + "epoch": 5.047221089821987, + "grad_norm": 0.28835898637771606, + "learning_rate": 3.693691398287811e-06, + "loss": 3.8166, + "step": 74285 + }, + { + "epoch": 5.047560809892649, + "grad_norm": 0.2105235457420349, + "learning_rate": 3.693266748199484e-06, + "loss": 3.9155, + "step": 74290 + }, + { + "epoch": 5.04790052996331, + "grad_norm": 0.25957149267196655, + "learning_rate": 3.6928420981111566e-06, + "loss": 3.5965, + "step": 74295 + }, + { + "epoch": 5.048240250033972, + "grad_norm": 0.26035261154174805, + "learning_rate": 3.69241744802283e-06, + "loss": 3.9793, + "step": 74300 + }, + { + "epoch": 5.048579970104634, + "grad_norm": 0.4269610047340393, + "learning_rate": 3.6919927979345022e-06, + "loss": 4.2345, + "step": 74305 + }, + { + "epoch": 5.048919690175295, + "grad_norm": 0.3009452223777771, + "learning_rate": 3.691568147846175e-06, + "loss": 4.2471, + "step": 74310 + }, + { + "epoch": 5.049259410245957, + "grad_norm": 0.38278326392173767, + "learning_rate": 3.6911434977578482e-06, + "loss": 4.0972, + "step": 74315 + }, + { + "epoch": 5.0495991303166194, + "grad_norm": 0.2836816608905792, + "learning_rate": 3.6907188476695206e-06, + "loss": 4.1689, + "step": 74320 + }, + { + "epoch": 5.049938850387281, + "grad_norm": 0.2147931158542633, + "learning_rate": 3.6902941975811934e-06, + "loss": 3.8455, + "step": 74325 + }, + { + "epoch": 5.050278570457943, + "grad_norm": 0.35248863697052, + "learning_rate": 3.6898695474928662e-06, + "loss": 3.8186, + "step": 74330 + }, + { + "epoch": 5.050618290528605, + "grad_norm": 0.31099388003349304, + "learning_rate": 3.6894448974045386e-06, + "loss": 4.1204, + "step": 74335 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 0.25293421745300293, + "learning_rate": 3.689020247316212e-06, + "loss": 3.9466, + "step": 74340 + }, + { + "epoch": 5.051297730669928, + "grad_norm": 0.2734633982181549, + "learning_rate": 3.6885955972278846e-06, + "loss": 3.6259, + "step": 74345 + }, + { + "epoch": 5.05163745074059, + "grad_norm": 0.24737456440925598, + "learning_rate": 3.688170947139557e-06, + "loss": 4.0352, + "step": 74350 + }, + { + "epoch": 5.051977170811251, + "grad_norm": 0.2218860238790512, + "learning_rate": 3.6877462970512302e-06, + "loss": 3.8965, + "step": 74355 + }, + { + "epoch": 5.052316890881913, + "grad_norm": 0.26765578985214233, + "learning_rate": 3.687321646962903e-06, + "loss": 3.6906, + "step": 74360 + }, + { + "epoch": 5.0526566109525755, + "grad_norm": 0.5274851322174072, + "learning_rate": 3.6868969968745754e-06, + "loss": 4.0166, + "step": 74365 + }, + { + "epoch": 5.052996331023237, + "grad_norm": 0.2610337436199188, + "learning_rate": 3.6864723467862486e-06, + "loss": 4.126, + "step": 74370 + }, + { + "epoch": 5.053336051093899, + "grad_norm": 0.2637679874897003, + "learning_rate": 3.6860476966979214e-06, + "loss": 3.8538, + "step": 74375 + }, + { + "epoch": 5.053675771164561, + "grad_norm": 0.22920425236225128, + "learning_rate": 3.685623046609594e-06, + "loss": 3.5412, + "step": 74380 + }, + { + "epoch": 5.054015491235222, + "grad_norm": 0.3661903142929077, + "learning_rate": 3.6851983965212666e-06, + "loss": 4.0833, + "step": 74385 + }, + { + "epoch": 5.054355211305884, + "grad_norm": 0.2998226583003998, + "learning_rate": 3.68477374643294e-06, + "loss": 4.1831, + "step": 74390 + }, + { + "epoch": 5.054694931376546, + "grad_norm": 0.22760048508644104, + "learning_rate": 3.6843490963446122e-06, + "loss": 3.8333, + "step": 74395 + }, + { + "epoch": 5.055034651447207, + "grad_norm": 0.3287696838378906, + "learning_rate": 3.683924446256285e-06, + "loss": 3.9138, + "step": 74400 + }, + { + "epoch": 5.055374371517869, + "grad_norm": 0.3410077393054962, + "learning_rate": 3.6834997961679582e-06, + "loss": 4.0139, + "step": 74405 + }, + { + "epoch": 5.0557140915885315, + "grad_norm": 0.19510914385318756, + "learning_rate": 3.6830751460796306e-06, + "loss": 3.7722, + "step": 74410 + }, + { + "epoch": 5.056053811659193, + "grad_norm": 0.23489202558994293, + "learning_rate": 3.6826504959913034e-06, + "loss": 4.0879, + "step": 74415 + }, + { + "epoch": 5.056393531729855, + "grad_norm": 0.28761327266693115, + "learning_rate": 3.6822258459029762e-06, + "loss": 3.8991, + "step": 74420 + }, + { + "epoch": 5.056733251800517, + "grad_norm": 0.23815549910068512, + "learning_rate": 3.6818011958146486e-06, + "loss": 4.4297, + "step": 74425 + }, + { + "epoch": 5.057072971871178, + "grad_norm": 0.2696380019187927, + "learning_rate": 3.681376545726322e-06, + "loss": 4.0039, + "step": 74430 + }, + { + "epoch": 5.05741269194184, + "grad_norm": 0.39655089378356934, + "learning_rate": 3.6809518956379946e-06, + "loss": 3.7702, + "step": 74435 + }, + { + "epoch": 5.057752412012501, + "grad_norm": 0.2692287266254425, + "learning_rate": 3.680527245549667e-06, + "loss": 3.9687, + "step": 74440 + }, + { + "epoch": 5.058092132083163, + "grad_norm": 0.22858186066150665, + "learning_rate": 3.6801025954613402e-06, + "loss": 3.8878, + "step": 74445 + }, + { + "epoch": 5.058431852153825, + "grad_norm": 0.24623097479343414, + "learning_rate": 3.679677945373013e-06, + "loss": 3.9083, + "step": 74450 + }, + { + "epoch": 5.058771572224487, + "grad_norm": 0.22033771872520447, + "learning_rate": 3.6792532952846854e-06, + "loss": 3.8724, + "step": 74455 + }, + { + "epoch": 5.059111292295149, + "grad_norm": 0.309226930141449, + "learning_rate": 3.678828645196358e-06, + "loss": 3.8829, + "step": 74460 + }, + { + "epoch": 5.059451012365811, + "grad_norm": 0.301235556602478, + "learning_rate": 3.6784039951080314e-06, + "loss": 3.977, + "step": 74465 + }, + { + "epoch": 5.059790732436472, + "grad_norm": 0.3057171106338501, + "learning_rate": 3.6779793450197042e-06, + "loss": 3.699, + "step": 74470 + }, + { + "epoch": 5.060130452507134, + "grad_norm": 0.2655335068702698, + "learning_rate": 3.6775546949313766e-06, + "loss": 3.89, + "step": 74475 + }, + { + "epoch": 5.060470172577796, + "grad_norm": 0.34979575872421265, + "learning_rate": 3.67713004484305e-06, + "loss": 3.9264, + "step": 74480 + }, + { + "epoch": 5.060809892648457, + "grad_norm": 0.26586639881134033, + "learning_rate": 3.6767053947547226e-06, + "loss": 3.9185, + "step": 74485 + }, + { + "epoch": 5.061149612719119, + "grad_norm": 0.2977166175842285, + "learning_rate": 3.676280744666395e-06, + "loss": 3.9681, + "step": 74490 + }, + { + "epoch": 5.061489332789781, + "grad_norm": 0.33077841997146606, + "learning_rate": 3.6758560945780682e-06, + "loss": 3.9443, + "step": 74495 + }, + { + "epoch": 5.061829052860443, + "grad_norm": 0.472089022397995, + "learning_rate": 3.675431444489741e-06, + "loss": 4.1536, + "step": 74500 + }, + { + "epoch": 5.062168772931105, + "grad_norm": 0.26641494035720825, + "learning_rate": 3.6750067944014134e-06, + "loss": 3.7845, + "step": 74505 + }, + { + "epoch": 5.062508493001767, + "grad_norm": 0.30444779992103577, + "learning_rate": 3.674582144313086e-06, + "loss": 4.0007, + "step": 74510 + }, + { + "epoch": 5.062848213072428, + "grad_norm": 0.25970980525016785, + "learning_rate": 3.6741574942247594e-06, + "loss": 3.8624, + "step": 74515 + }, + { + "epoch": 5.06318793314309, + "grad_norm": 0.24672585725784302, + "learning_rate": 3.673732844136432e-06, + "loss": 3.8981, + "step": 74520 + }, + { + "epoch": 5.063527653213752, + "grad_norm": 0.24809962511062622, + "learning_rate": 3.6733081940481046e-06, + "loss": 3.7355, + "step": 74525 + }, + { + "epoch": 5.063867373284413, + "grad_norm": 0.3577449917793274, + "learning_rate": 3.672883543959778e-06, + "loss": 4.1536, + "step": 74530 + }, + { + "epoch": 5.064207093355075, + "grad_norm": 0.24152174592018127, + "learning_rate": 3.6724588938714502e-06, + "loss": 3.9083, + "step": 74535 + }, + { + "epoch": 5.064546813425737, + "grad_norm": 0.30769672989845276, + "learning_rate": 3.672034243783123e-06, + "loss": 3.9268, + "step": 74540 + }, + { + "epoch": 5.064886533496399, + "grad_norm": 0.2589621841907501, + "learning_rate": 3.671609593694796e-06, + "loss": 3.9495, + "step": 74545 + }, + { + "epoch": 5.065226253567061, + "grad_norm": 0.25290587544441223, + "learning_rate": 3.671184943606468e-06, + "loss": 3.7012, + "step": 74550 + }, + { + "epoch": 5.065565973637723, + "grad_norm": 0.2688089609146118, + "learning_rate": 3.6707602935181414e-06, + "loss": 3.9864, + "step": 74555 + }, + { + "epoch": 5.065905693708384, + "grad_norm": 0.25781747698783875, + "learning_rate": 3.6703356434298142e-06, + "loss": 3.741, + "step": 74560 + }, + { + "epoch": 5.066245413779046, + "grad_norm": 0.28362175822257996, + "learning_rate": 3.6699109933414866e-06, + "loss": 3.9809, + "step": 74565 + }, + { + "epoch": 5.066585133849708, + "grad_norm": 0.33278173208236694, + "learning_rate": 3.66948634325316e-06, + "loss": 4.109, + "step": 74570 + }, + { + "epoch": 5.066924853920369, + "grad_norm": 0.3244694173336029, + "learning_rate": 3.6690616931648326e-06, + "loss": 4.1312, + "step": 74575 + }, + { + "epoch": 5.067264573991031, + "grad_norm": 0.32946106791496277, + "learning_rate": 3.668637043076505e-06, + "loss": 3.8308, + "step": 74580 + }, + { + "epoch": 5.067604294061693, + "grad_norm": 0.29506728053092957, + "learning_rate": 3.668212392988178e-06, + "loss": 4.1139, + "step": 74585 + }, + { + "epoch": 5.067944014132355, + "grad_norm": 0.28037869930267334, + "learning_rate": 3.667787742899851e-06, + "loss": 3.9871, + "step": 74590 + }, + { + "epoch": 5.068283734203017, + "grad_norm": 0.3994176983833313, + "learning_rate": 3.6673630928115234e-06, + "loss": 4.0799, + "step": 74595 + }, + { + "epoch": 5.068623454273679, + "grad_norm": 0.49804455041885376, + "learning_rate": 3.666938442723196e-06, + "loss": 3.9752, + "step": 74600 + }, + { + "epoch": 5.06896317434434, + "grad_norm": 0.2515793740749359, + "learning_rate": 3.6665137926348694e-06, + "loss": 3.8964, + "step": 74605 + }, + { + "epoch": 5.069302894415002, + "grad_norm": 0.2739811837673187, + "learning_rate": 3.666089142546542e-06, + "loss": 4.1424, + "step": 74610 + }, + { + "epoch": 5.069642614485664, + "grad_norm": 0.21875353157520294, + "learning_rate": 3.6656644924582146e-06, + "loss": 3.8844, + "step": 74615 + }, + { + "epoch": 5.069982334556325, + "grad_norm": 0.20770145952701569, + "learning_rate": 3.6652398423698874e-06, + "loss": 3.8924, + "step": 74620 + }, + { + "epoch": 5.070322054626987, + "grad_norm": 0.27943021059036255, + "learning_rate": 3.66481519228156e-06, + "loss": 3.7338, + "step": 74625 + }, + { + "epoch": 5.0706617746976494, + "grad_norm": 0.29481253027915955, + "learning_rate": 3.664390542193233e-06, + "loss": 3.7374, + "step": 74630 + }, + { + "epoch": 5.071001494768311, + "grad_norm": 0.40443170070648193, + "learning_rate": 3.663965892104906e-06, + "loss": 3.9342, + "step": 74635 + }, + { + "epoch": 5.071341214838973, + "grad_norm": 0.25667867064476013, + "learning_rate": 3.663541242016579e-06, + "loss": 3.883, + "step": 74640 + }, + { + "epoch": 5.071680934909635, + "grad_norm": 0.29887446761131287, + "learning_rate": 3.6631165919282514e-06, + "loss": 3.8097, + "step": 74645 + }, + { + "epoch": 5.072020654980296, + "grad_norm": 0.22826558351516724, + "learning_rate": 3.662691941839924e-06, + "loss": 3.7247, + "step": 74650 + }, + { + "epoch": 5.072360375050958, + "grad_norm": 0.20245897769927979, + "learning_rate": 3.6622672917515974e-06, + "loss": 3.8353, + "step": 74655 + }, + { + "epoch": 5.07270009512162, + "grad_norm": 0.3508000373840332, + "learning_rate": 3.66184264166327e-06, + "loss": 3.9709, + "step": 74660 + }, + { + "epoch": 5.073039815192281, + "grad_norm": 0.22664503753185272, + "learning_rate": 3.6614179915749426e-06, + "loss": 3.9187, + "step": 74665 + }, + { + "epoch": 5.073379535262943, + "grad_norm": 0.24243555963039398, + "learning_rate": 3.6609933414866154e-06, + "loss": 4.3038, + "step": 74670 + }, + { + "epoch": 5.0737192553336055, + "grad_norm": 0.2873339354991913, + "learning_rate": 3.6605686913982878e-06, + "loss": 3.7873, + "step": 74675 + }, + { + "epoch": 5.074058975404267, + "grad_norm": 0.2584930658340454, + "learning_rate": 3.660144041309961e-06, + "loss": 3.8466, + "step": 74680 + }, + { + "epoch": 5.074398695474929, + "grad_norm": 0.24346524477005005, + "learning_rate": 3.659719391221634e-06, + "loss": 4.1177, + "step": 74685 + }, + { + "epoch": 5.074738415545591, + "grad_norm": 0.2509306073188782, + "learning_rate": 3.659294741133306e-06, + "loss": 3.7035, + "step": 74690 + }, + { + "epoch": 5.075078135616252, + "grad_norm": 0.30913063883781433, + "learning_rate": 3.6588700910449794e-06, + "loss": 4.042, + "step": 74695 + }, + { + "epoch": 5.075417855686914, + "grad_norm": 0.255149781703949, + "learning_rate": 3.6584454409566522e-06, + "loss": 3.8381, + "step": 74700 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 0.25062745809555054, + "learning_rate": 3.6580207908683246e-06, + "loss": 3.7655, + "step": 74705 + }, + { + "epoch": 5.076097295828237, + "grad_norm": 0.25190305709838867, + "learning_rate": 3.6575961407799974e-06, + "loss": 3.8856, + "step": 74710 + }, + { + "epoch": 5.076437015898899, + "grad_norm": 0.2914869785308838, + "learning_rate": 3.6571714906916706e-06, + "loss": 3.9873, + "step": 74715 + }, + { + "epoch": 5.0767767359695615, + "grad_norm": 0.25036749243736267, + "learning_rate": 3.656746840603343e-06, + "loss": 3.9408, + "step": 74720 + }, + { + "epoch": 5.077116456040223, + "grad_norm": 0.3085545301437378, + "learning_rate": 3.656322190515016e-06, + "loss": 4.009, + "step": 74725 + }, + { + "epoch": 5.077456176110885, + "grad_norm": 0.26069244742393494, + "learning_rate": 3.655897540426689e-06, + "loss": 4.0574, + "step": 74730 + }, + { + "epoch": 5.077795896181547, + "grad_norm": 0.235994353890419, + "learning_rate": 3.6554728903383614e-06, + "loss": 3.9358, + "step": 74735 + }, + { + "epoch": 5.078135616252208, + "grad_norm": 0.3564947247505188, + "learning_rate": 3.655048240250034e-06, + "loss": 3.908, + "step": 74740 + }, + { + "epoch": 5.07847533632287, + "grad_norm": 0.27147865295410156, + "learning_rate": 3.654623590161707e-06, + "loss": 3.9443, + "step": 74745 + }, + { + "epoch": 5.078815056393532, + "grad_norm": 0.25239139795303345, + "learning_rate": 3.65419894007338e-06, + "loss": 4.0551, + "step": 74750 + }, + { + "epoch": 5.079154776464193, + "grad_norm": 0.2588042616844177, + "learning_rate": 3.6537742899850526e-06, + "loss": 4.0844, + "step": 74755 + }, + { + "epoch": 5.079494496534855, + "grad_norm": 0.24589018523693085, + "learning_rate": 3.6533496398967254e-06, + "loss": 3.953, + "step": 74760 + }, + { + "epoch": 5.0798342166055175, + "grad_norm": 0.31994202733039856, + "learning_rate": 3.6529249898083978e-06, + "loss": 3.9747, + "step": 74765 + }, + { + "epoch": 5.080173936676179, + "grad_norm": 0.24612204730510712, + "learning_rate": 3.652500339720071e-06, + "loss": 4.0788, + "step": 74770 + }, + { + "epoch": 5.080513656746841, + "grad_norm": 0.2589918375015259, + "learning_rate": 3.652075689631744e-06, + "loss": 3.9556, + "step": 74775 + }, + { + "epoch": 5.080853376817503, + "grad_norm": 0.40015944838523865, + "learning_rate": 3.651651039543416e-06, + "loss": 3.7931, + "step": 74780 + }, + { + "epoch": 5.081193096888164, + "grad_norm": 0.3575657904148102, + "learning_rate": 3.6512263894550894e-06, + "loss": 3.5993, + "step": 74785 + }, + { + "epoch": 5.081532816958826, + "grad_norm": 0.2935221195220947, + "learning_rate": 3.650801739366762e-06, + "loss": 3.9128, + "step": 74790 + }, + { + "epoch": 5.081872537029487, + "grad_norm": 0.34302279353141785, + "learning_rate": 3.6503770892784346e-06, + "loss": 3.9352, + "step": 74795 + }, + { + "epoch": 5.082212257100149, + "grad_norm": 0.22187656164169312, + "learning_rate": 3.6499524391901074e-06, + "loss": 3.8222, + "step": 74800 + }, + { + "epoch": 5.082551977170811, + "grad_norm": 0.2164481282234192, + "learning_rate": 3.6495277891017806e-06, + "loss": 3.6951, + "step": 74805 + }, + { + "epoch": 5.082891697241473, + "grad_norm": 0.2690280079841614, + "learning_rate": 3.6491031390134534e-06, + "loss": 4.0767, + "step": 74810 + }, + { + "epoch": 5.083231417312135, + "grad_norm": 0.4621525704860687, + "learning_rate": 3.6486784889251258e-06, + "loss": 3.9467, + "step": 74815 + }, + { + "epoch": 5.083571137382797, + "grad_norm": 0.21390579640865326, + "learning_rate": 3.648253838836799e-06, + "loss": 3.8564, + "step": 74820 + }, + { + "epoch": 5.083910857453458, + "grad_norm": 0.261562317609787, + "learning_rate": 3.647829188748472e-06, + "loss": 4.1107, + "step": 74825 + }, + { + "epoch": 5.08425057752412, + "grad_norm": 0.2499632090330124, + "learning_rate": 3.647404538660144e-06, + "loss": 3.9315, + "step": 74830 + }, + { + "epoch": 5.084590297594782, + "grad_norm": 0.22192692756652832, + "learning_rate": 3.646979888571817e-06, + "loss": 3.9962, + "step": 74835 + }, + { + "epoch": 5.084930017665443, + "grad_norm": 0.3131587505340576, + "learning_rate": 3.6465552384834902e-06, + "loss": 3.9609, + "step": 74840 + }, + { + "epoch": 5.085269737736105, + "grad_norm": 0.24276535212993622, + "learning_rate": 3.6461305883951626e-06, + "loss": 4.1339, + "step": 74845 + }, + { + "epoch": 5.085609457806767, + "grad_norm": 0.2663100063800812, + "learning_rate": 3.6457059383068354e-06, + "loss": 4.0649, + "step": 74850 + }, + { + "epoch": 5.085949177877429, + "grad_norm": 0.26212796568870544, + "learning_rate": 3.6452812882185086e-06, + "loss": 3.6695, + "step": 74855 + }, + { + "epoch": 5.086288897948091, + "grad_norm": 0.20202745497226715, + "learning_rate": 3.644856638130181e-06, + "loss": 3.8696, + "step": 74860 + }, + { + "epoch": 5.086628618018753, + "grad_norm": 0.27343687415122986, + "learning_rate": 3.644431988041854e-06, + "loss": 3.9085, + "step": 74865 + }, + { + "epoch": 5.086968338089414, + "grad_norm": 0.28604820370674133, + "learning_rate": 3.6440073379535266e-06, + "loss": 3.6382, + "step": 74870 + }, + { + "epoch": 5.087308058160076, + "grad_norm": 0.33932650089263916, + "learning_rate": 3.6435826878651994e-06, + "loss": 3.7736, + "step": 74875 + }, + { + "epoch": 5.087647778230738, + "grad_norm": 0.296191543340683, + "learning_rate": 3.643158037776872e-06, + "loss": 4.065, + "step": 74880 + }, + { + "epoch": 5.087987498301399, + "grad_norm": 0.309329092502594, + "learning_rate": 3.642733387688545e-06, + "loss": 3.7169, + "step": 74885 + }, + { + "epoch": 5.088327218372061, + "grad_norm": 0.42175349593162537, + "learning_rate": 3.6423087376002174e-06, + "loss": 3.8341, + "step": 74890 + }, + { + "epoch": 5.088666938442723, + "grad_norm": 0.24346913397312164, + "learning_rate": 3.6418840875118906e-06, + "loss": 3.785, + "step": 74895 + }, + { + "epoch": 5.089006658513385, + "grad_norm": 0.3174266815185547, + "learning_rate": 3.6414594374235634e-06, + "loss": 3.9846, + "step": 74900 + }, + { + "epoch": 5.089346378584047, + "grad_norm": 0.2401762306690216, + "learning_rate": 3.6410347873352358e-06, + "loss": 3.8855, + "step": 74905 + }, + { + "epoch": 5.089686098654709, + "grad_norm": 0.30287033319473267, + "learning_rate": 3.640610137246909e-06, + "loss": 3.9229, + "step": 74910 + }, + { + "epoch": 5.09002581872537, + "grad_norm": 0.21979378163814545, + "learning_rate": 3.640185487158582e-06, + "loss": 3.7979, + "step": 74915 + }, + { + "epoch": 5.090365538796032, + "grad_norm": 0.21632564067840576, + "learning_rate": 3.639760837070254e-06, + "loss": 3.7962, + "step": 74920 + }, + { + "epoch": 5.090705258866694, + "grad_norm": 0.2912525236606598, + "learning_rate": 3.639336186981927e-06, + "loss": 3.7863, + "step": 74925 + }, + { + "epoch": 5.091044978937355, + "grad_norm": 0.24574168026447296, + "learning_rate": 3.6389115368936e-06, + "loss": 3.8087, + "step": 74930 + }, + { + "epoch": 5.091384699008017, + "grad_norm": 0.2775738835334778, + "learning_rate": 3.6384868868052726e-06, + "loss": 3.9485, + "step": 74935 + }, + { + "epoch": 5.0917244190786795, + "grad_norm": 0.2429247796535492, + "learning_rate": 3.6380622367169454e-06, + "loss": 4.0493, + "step": 74940 + }, + { + "epoch": 5.092064139149341, + "grad_norm": 0.26501792669296265, + "learning_rate": 3.6376375866286186e-06, + "loss": 3.9606, + "step": 74945 + }, + { + "epoch": 5.092403859220003, + "grad_norm": 0.25097307562828064, + "learning_rate": 3.637212936540291e-06, + "loss": 3.7312, + "step": 74950 + }, + { + "epoch": 5.092743579290665, + "grad_norm": 0.24598203599452972, + "learning_rate": 3.6367882864519638e-06, + "loss": 4.0552, + "step": 74955 + }, + { + "epoch": 5.093083299361326, + "grad_norm": 0.24357375502586365, + "learning_rate": 3.6363636363636366e-06, + "loss": 3.9384, + "step": 74960 + }, + { + "epoch": 5.093423019431988, + "grad_norm": 0.18513119220733643, + "learning_rate": 3.635938986275309e-06, + "loss": 4.0922, + "step": 74965 + }, + { + "epoch": 5.09376273950265, + "grad_norm": 0.20938046276569366, + "learning_rate": 3.635514336186982e-06, + "loss": 3.9714, + "step": 74970 + }, + { + "epoch": 5.094102459573311, + "grad_norm": 0.28681206703186035, + "learning_rate": 3.635089686098655e-06, + "loss": 4.0317, + "step": 74975 + }, + { + "epoch": 5.094442179643973, + "grad_norm": 0.32902103662490845, + "learning_rate": 3.634665036010328e-06, + "loss": 3.8023, + "step": 74980 + }, + { + "epoch": 5.0947818997146355, + "grad_norm": 0.3997032642364502, + "learning_rate": 3.6342403859220006e-06, + "loss": 3.9943, + "step": 74985 + }, + { + "epoch": 5.095121619785297, + "grad_norm": 0.29539185762405396, + "learning_rate": 3.6338157358336734e-06, + "loss": 4.0719, + "step": 74990 + }, + { + "epoch": 5.095461339855959, + "grad_norm": 0.1975867599248886, + "learning_rate": 3.633391085745346e-06, + "loss": 4.1189, + "step": 74995 + }, + { + "epoch": 5.095801059926621, + "grad_norm": 0.5295268297195435, + "learning_rate": 3.632966435657019e-06, + "loss": 4.0929, + "step": 75000 + }, + { + "epoch": 5.096140779997282, + "grad_norm": 0.2478889673948288, + "learning_rate": 3.632541785568692e-06, + "loss": 3.6711, + "step": 75005 + }, + { + "epoch": 5.096480500067944, + "grad_norm": 0.23710300028324127, + "learning_rate": 3.6321171354803646e-06, + "loss": 4.032, + "step": 75010 + }, + { + "epoch": 5.096820220138606, + "grad_norm": 0.21927843987941742, + "learning_rate": 3.631692485392037e-06, + "loss": 4.0985, + "step": 75015 + }, + { + "epoch": 5.097159940209267, + "grad_norm": 0.38991981744766235, + "learning_rate": 3.63126783530371e-06, + "loss": 4.0463, + "step": 75020 + }, + { + "epoch": 5.097499660279929, + "grad_norm": 0.2502809464931488, + "learning_rate": 3.630843185215383e-06, + "loss": 3.9324, + "step": 75025 + }, + { + "epoch": 5.0978393803505915, + "grad_norm": 0.2755567133426666, + "learning_rate": 3.6304185351270554e-06, + "loss": 3.9752, + "step": 75030 + }, + { + "epoch": 5.098179100421253, + "grad_norm": 0.2756281793117523, + "learning_rate": 3.6299938850387286e-06, + "loss": 3.8262, + "step": 75035 + }, + { + "epoch": 5.098518820491915, + "grad_norm": 0.24494901299476624, + "learning_rate": 3.6295692349504014e-06, + "loss": 3.8764, + "step": 75040 + }, + { + "epoch": 5.098858540562577, + "grad_norm": 0.20771829783916473, + "learning_rate": 3.6291445848620738e-06, + "loss": 3.8722, + "step": 75045 + }, + { + "epoch": 5.099198260633238, + "grad_norm": 0.3279958665370941, + "learning_rate": 3.6287199347737466e-06, + "loss": 3.8478, + "step": 75050 + }, + { + "epoch": 5.0995379807039, + "grad_norm": 0.3362720310688019, + "learning_rate": 3.62829528468542e-06, + "loss": 3.8604, + "step": 75055 + }, + { + "epoch": 5.099877700774562, + "grad_norm": 0.23799341917037964, + "learning_rate": 3.627870634597092e-06, + "loss": 3.9834, + "step": 75060 + }, + { + "epoch": 5.100217420845223, + "grad_norm": 0.3078777492046356, + "learning_rate": 3.627445984508765e-06, + "loss": 3.9948, + "step": 75065 + }, + { + "epoch": 5.100557140915885, + "grad_norm": 0.24305853247642517, + "learning_rate": 3.627021334420438e-06, + "loss": 3.9735, + "step": 75070 + }, + { + "epoch": 5.1008968609865475, + "grad_norm": 0.32858720421791077, + "learning_rate": 3.6265966843321106e-06, + "loss": 3.8546, + "step": 75075 + }, + { + "epoch": 5.101236581057209, + "grad_norm": 0.24592870473861694, + "learning_rate": 3.6261720342437834e-06, + "loss": 3.9311, + "step": 75080 + }, + { + "epoch": 5.101576301127871, + "grad_norm": 0.31846773624420166, + "learning_rate": 3.625747384155456e-06, + "loss": 3.7719, + "step": 75085 + }, + { + "epoch": 5.101916021198533, + "grad_norm": 0.29219570755958557, + "learning_rate": 3.6253227340671286e-06, + "loss": 4.0361, + "step": 75090 + }, + { + "epoch": 5.102255741269194, + "grad_norm": 0.25745970010757446, + "learning_rate": 3.6248980839788018e-06, + "loss": 4.0247, + "step": 75095 + }, + { + "epoch": 5.102595461339856, + "grad_norm": 0.23370715975761414, + "learning_rate": 3.6244734338904746e-06, + "loss": 3.77, + "step": 75100 + }, + { + "epoch": 5.102935181410518, + "grad_norm": 0.3861631751060486, + "learning_rate": 3.624048783802147e-06, + "loss": 3.8802, + "step": 75105 + }, + { + "epoch": 5.103274901481179, + "grad_norm": 0.2673797011375427, + "learning_rate": 3.62362413371382e-06, + "loss": 3.9606, + "step": 75110 + }, + { + "epoch": 5.103614621551841, + "grad_norm": 0.21699783205986023, + "learning_rate": 3.623199483625493e-06, + "loss": 3.976, + "step": 75115 + }, + { + "epoch": 5.103954341622503, + "grad_norm": 0.264326810836792, + "learning_rate": 3.6227748335371654e-06, + "loss": 3.9684, + "step": 75120 + }, + { + "epoch": 5.104294061693165, + "grad_norm": 0.24079027771949768, + "learning_rate": 3.6223501834488386e-06, + "loss": 3.786, + "step": 75125 + }, + { + "epoch": 5.104633781763827, + "grad_norm": 0.24433141946792603, + "learning_rate": 3.6219255333605114e-06, + "loss": 3.9582, + "step": 75130 + }, + { + "epoch": 5.104973501834488, + "grad_norm": 0.3209952414035797, + "learning_rate": 3.6215008832721838e-06, + "loss": 3.7665, + "step": 75135 + }, + { + "epoch": 5.10531322190515, + "grad_norm": 0.31406620144844055, + "learning_rate": 3.6210762331838566e-06, + "loss": 3.6506, + "step": 75140 + }, + { + "epoch": 5.105652941975812, + "grad_norm": 0.22826634347438812, + "learning_rate": 3.62065158309553e-06, + "loss": 3.8431, + "step": 75145 + }, + { + "epoch": 5.105992662046473, + "grad_norm": 0.27566954493522644, + "learning_rate": 3.6202269330072026e-06, + "loss": 3.8927, + "step": 75150 + }, + { + "epoch": 5.106332382117135, + "grad_norm": 0.265758216381073, + "learning_rate": 3.619802282918875e-06, + "loss": 3.8561, + "step": 75155 + }, + { + "epoch": 5.106672102187797, + "grad_norm": 0.6534472703933716, + "learning_rate": 3.619377632830548e-06, + "loss": 4.0697, + "step": 75160 + }, + { + "epoch": 5.107011822258459, + "grad_norm": 0.31195196509361267, + "learning_rate": 3.618952982742221e-06, + "loss": 4.1802, + "step": 75165 + }, + { + "epoch": 5.107351542329121, + "grad_norm": 0.2338072508573532, + "learning_rate": 3.6185283326538934e-06, + "loss": 3.6502, + "step": 75170 + }, + { + "epoch": 5.107691262399783, + "grad_norm": 0.22565270960330963, + "learning_rate": 3.618103682565566e-06, + "loss": 3.9702, + "step": 75175 + }, + { + "epoch": 5.108030982470444, + "grad_norm": 0.3589319586753845, + "learning_rate": 3.6176790324772394e-06, + "loss": 3.8324, + "step": 75180 + }, + { + "epoch": 5.108370702541106, + "grad_norm": 0.28329360485076904, + "learning_rate": 3.6172543823889118e-06, + "loss": 3.9473, + "step": 75185 + }, + { + "epoch": 5.108710422611768, + "grad_norm": 0.20386217534542084, + "learning_rate": 3.6168297323005846e-06, + "loss": 3.8823, + "step": 75190 + }, + { + "epoch": 5.109050142682429, + "grad_norm": 0.2926267385482788, + "learning_rate": 3.616405082212258e-06, + "loss": 3.8544, + "step": 75195 + }, + { + "epoch": 5.109389862753091, + "grad_norm": 0.2883026897907257, + "learning_rate": 3.61598043212393e-06, + "loss": 3.8866, + "step": 75200 + }, + { + "epoch": 5.1097295828237534, + "grad_norm": 0.21465902030467987, + "learning_rate": 3.615555782035603e-06, + "loss": 3.8774, + "step": 75205 + }, + { + "epoch": 5.110069302894415, + "grad_norm": 0.205490380525589, + "learning_rate": 3.6151311319472758e-06, + "loss": 4.1916, + "step": 75210 + }, + { + "epoch": 5.110409022965077, + "grad_norm": 0.24901534616947174, + "learning_rate": 3.614706481858948e-06, + "loss": 3.9211, + "step": 75215 + }, + { + "epoch": 5.110748743035739, + "grad_norm": 0.3411879539489746, + "learning_rate": 3.6142818317706214e-06, + "loss": 3.9631, + "step": 75220 + }, + { + "epoch": 5.1110884631064, + "grad_norm": 0.20286087691783905, + "learning_rate": 3.613857181682294e-06, + "loss": 3.9624, + "step": 75225 + }, + { + "epoch": 5.111428183177062, + "grad_norm": 0.30502140522003174, + "learning_rate": 3.6134325315939666e-06, + "loss": 4.0787, + "step": 75230 + }, + { + "epoch": 5.111767903247724, + "grad_norm": 0.22404667735099792, + "learning_rate": 3.6130078815056398e-06, + "loss": 3.8817, + "step": 75235 + }, + { + "epoch": 5.112107623318385, + "grad_norm": 0.25476589798927307, + "learning_rate": 3.6125832314173126e-06, + "loss": 3.9725, + "step": 75240 + }, + { + "epoch": 5.112447343389047, + "grad_norm": 0.27533188462257385, + "learning_rate": 3.612158581328985e-06, + "loss": 3.8522, + "step": 75245 + }, + { + "epoch": 5.1127870634597095, + "grad_norm": 0.34220388531684875, + "learning_rate": 3.6117339312406578e-06, + "loss": 4.1774, + "step": 75250 + }, + { + "epoch": 5.113126783530371, + "grad_norm": 0.24097517132759094, + "learning_rate": 3.611309281152331e-06, + "loss": 3.5967, + "step": 75255 + }, + { + "epoch": 5.113466503601033, + "grad_norm": 0.5018079280853271, + "learning_rate": 3.6108846310640034e-06, + "loss": 3.8907, + "step": 75260 + }, + { + "epoch": 5.113806223671695, + "grad_norm": 0.2856873571872711, + "learning_rate": 3.610459980975676e-06, + "loss": 3.8928, + "step": 75265 + }, + { + "epoch": 5.114145943742356, + "grad_norm": 0.21568633615970612, + "learning_rate": 3.6100353308873494e-06, + "loss": 3.9109, + "step": 75270 + }, + { + "epoch": 5.114485663813018, + "grad_norm": 0.28053826093673706, + "learning_rate": 3.6096106807990218e-06, + "loss": 3.9133, + "step": 75275 + }, + { + "epoch": 5.11482538388368, + "grad_norm": 0.24579131603240967, + "learning_rate": 3.6091860307106946e-06, + "loss": 3.934, + "step": 75280 + }, + { + "epoch": 5.115165103954341, + "grad_norm": 0.3645118474960327, + "learning_rate": 3.6087613806223678e-06, + "loss": 4.0046, + "step": 75285 + }, + { + "epoch": 5.115504824025003, + "grad_norm": 0.2663734555244446, + "learning_rate": 3.60833673053404e-06, + "loss": 3.7138, + "step": 75290 + }, + { + "epoch": 5.1158445440956655, + "grad_norm": 0.31818485260009766, + "learning_rate": 3.607912080445713e-06, + "loss": 3.9353, + "step": 75295 + }, + { + "epoch": 5.116184264166327, + "grad_norm": 0.27069219946861267, + "learning_rate": 3.6074874303573858e-06, + "loss": 3.9905, + "step": 75300 + }, + { + "epoch": 5.116523984236989, + "grad_norm": 0.2763652205467224, + "learning_rate": 3.607062780269058e-06, + "loss": 3.5239, + "step": 75305 + }, + { + "epoch": 5.116863704307651, + "grad_norm": 0.2558436095714569, + "learning_rate": 3.6066381301807314e-06, + "loss": 3.9816, + "step": 75310 + }, + { + "epoch": 5.117203424378312, + "grad_norm": 0.48255103826522827, + "learning_rate": 3.606213480092404e-06, + "loss": 3.8629, + "step": 75315 + }, + { + "epoch": 5.117543144448974, + "grad_norm": 0.23229096829891205, + "learning_rate": 3.6057888300040774e-06, + "loss": 3.8688, + "step": 75320 + }, + { + "epoch": 5.117882864519636, + "grad_norm": 0.27322256565093994, + "learning_rate": 3.6053641799157498e-06, + "loss": 3.8449, + "step": 75325 + }, + { + "epoch": 5.118222584590297, + "grad_norm": 0.28318703174591064, + "learning_rate": 3.6049395298274226e-06, + "loss": 3.9898, + "step": 75330 + }, + { + "epoch": 5.118562304660959, + "grad_norm": 0.28008148074150085, + "learning_rate": 3.6045148797390954e-06, + "loss": 3.9342, + "step": 75335 + }, + { + "epoch": 5.1189020247316215, + "grad_norm": 0.30514249205589294, + "learning_rate": 3.6040902296507677e-06, + "loss": 3.8321, + "step": 75340 + }, + { + "epoch": 5.119241744802283, + "grad_norm": 0.41118744015693665, + "learning_rate": 3.603665579562441e-06, + "loss": 3.7997, + "step": 75345 + }, + { + "epoch": 5.119581464872945, + "grad_norm": 0.2133941650390625, + "learning_rate": 3.6032409294741138e-06, + "loss": 4.1482, + "step": 75350 + }, + { + "epoch": 5.119921184943607, + "grad_norm": 0.259605348110199, + "learning_rate": 3.602816279385786e-06, + "loss": 3.9167, + "step": 75355 + }, + { + "epoch": 5.120260905014268, + "grad_norm": 0.24286170303821564, + "learning_rate": 3.6023916292974594e-06, + "loss": 3.9049, + "step": 75360 + }, + { + "epoch": 5.12060062508493, + "grad_norm": 0.23763802647590637, + "learning_rate": 3.601966979209132e-06, + "loss": 3.8713, + "step": 75365 + }, + { + "epoch": 5.120940345155592, + "grad_norm": 0.3305707275867462, + "learning_rate": 3.6015423291208046e-06, + "loss": 3.9637, + "step": 75370 + }, + { + "epoch": 5.121280065226253, + "grad_norm": 0.25984856486320496, + "learning_rate": 3.6011176790324774e-06, + "loss": 3.9582, + "step": 75375 + }, + { + "epoch": 5.121619785296915, + "grad_norm": 0.2721482217311859, + "learning_rate": 3.6006930289441506e-06, + "loss": 3.8623, + "step": 75380 + }, + { + "epoch": 5.1219595053675775, + "grad_norm": 0.351367324590683, + "learning_rate": 3.600268378855823e-06, + "loss": 3.6207, + "step": 75385 + }, + { + "epoch": 5.122299225438239, + "grad_norm": 0.23148943483829498, + "learning_rate": 3.5998437287674958e-06, + "loss": 3.6231, + "step": 75390 + }, + { + "epoch": 5.122638945508901, + "grad_norm": 0.25845110416412354, + "learning_rate": 3.599419078679169e-06, + "loss": 3.9511, + "step": 75395 + }, + { + "epoch": 5.122978665579563, + "grad_norm": 0.2629750669002533, + "learning_rate": 3.5989944285908414e-06, + "loss": 3.9852, + "step": 75400 + }, + { + "epoch": 5.123318385650224, + "grad_norm": 0.23520733416080475, + "learning_rate": 3.598569778502514e-06, + "loss": 3.8594, + "step": 75405 + }, + { + "epoch": 5.123658105720886, + "grad_norm": 0.2681809663772583, + "learning_rate": 3.5981451284141874e-06, + "loss": 4.1357, + "step": 75410 + }, + { + "epoch": 5.123997825791548, + "grad_norm": 0.272489070892334, + "learning_rate": 3.5977204783258598e-06, + "loss": 3.9165, + "step": 75415 + }, + { + "epoch": 5.124337545862209, + "grad_norm": 0.24212707579135895, + "learning_rate": 3.5972958282375326e-06, + "loss": 3.7922, + "step": 75420 + }, + { + "epoch": 5.124677265932871, + "grad_norm": 0.2728120684623718, + "learning_rate": 3.5968711781492054e-06, + "loss": 3.6691, + "step": 75425 + }, + { + "epoch": 5.1250169860035335, + "grad_norm": 0.21797369420528412, + "learning_rate": 3.5964465280608777e-06, + "loss": 4.0477, + "step": 75430 + }, + { + "epoch": 5.125356706074195, + "grad_norm": 0.3418463170528412, + "learning_rate": 3.596021877972551e-06, + "loss": 4.0431, + "step": 75435 + }, + { + "epoch": 5.125696426144857, + "grad_norm": 0.27357879281044006, + "learning_rate": 3.5955972278842238e-06, + "loss": 3.8436, + "step": 75440 + }, + { + "epoch": 5.126036146215519, + "grad_norm": 0.2987990975379944, + "learning_rate": 3.595172577795896e-06, + "loss": 4.0842, + "step": 75445 + }, + { + "epoch": 5.12637586628618, + "grad_norm": 0.2510972321033478, + "learning_rate": 3.5947479277075694e-06, + "loss": 4.0601, + "step": 75450 + }, + { + "epoch": 5.126715586356842, + "grad_norm": 0.24723796546459198, + "learning_rate": 3.594323277619242e-06, + "loss": 3.9906, + "step": 75455 + }, + { + "epoch": 5.127055306427504, + "grad_norm": 0.3259628713130951, + "learning_rate": 3.5938986275309145e-06, + "loss": 3.7113, + "step": 75460 + }, + { + "epoch": 5.127395026498165, + "grad_norm": 0.3302823603153229, + "learning_rate": 3.5934739774425873e-06, + "loss": 4.0583, + "step": 75465 + }, + { + "epoch": 5.127734746568827, + "grad_norm": 0.2797844707965851, + "learning_rate": 3.5930493273542606e-06, + "loss": 3.8167, + "step": 75470 + }, + { + "epoch": 5.1280744666394895, + "grad_norm": 0.29407942295074463, + "learning_rate": 3.592624677265933e-06, + "loss": 3.5754, + "step": 75475 + }, + { + "epoch": 5.128414186710151, + "grad_norm": 0.37032580375671387, + "learning_rate": 3.5922000271776057e-06, + "loss": 3.9659, + "step": 75480 + }, + { + "epoch": 5.128753906780813, + "grad_norm": 0.3363315761089325, + "learning_rate": 3.591775377089279e-06, + "loss": 3.7739, + "step": 75485 + }, + { + "epoch": 5.129093626851474, + "grad_norm": 0.2677728235721588, + "learning_rate": 3.5913507270009518e-06, + "loss": 3.9701, + "step": 75490 + }, + { + "epoch": 5.129433346922136, + "grad_norm": 0.24002677202224731, + "learning_rate": 3.590926076912624e-06, + "loss": 3.9519, + "step": 75495 + }, + { + "epoch": 5.129773066992798, + "grad_norm": 0.22817856073379517, + "learning_rate": 3.590501426824297e-06, + "loss": 3.9006, + "step": 75500 + }, + { + "epoch": 5.130112787063459, + "grad_norm": 0.25427404046058655, + "learning_rate": 3.59007677673597e-06, + "loss": 3.8122, + "step": 75505 + }, + { + "epoch": 5.130452507134121, + "grad_norm": 0.2687598466873169, + "learning_rate": 3.5896521266476426e-06, + "loss": 3.8421, + "step": 75510 + }, + { + "epoch": 5.1307922272047835, + "grad_norm": 0.263953298330307, + "learning_rate": 3.5892274765593154e-06, + "loss": 4.0536, + "step": 75515 + }, + { + "epoch": 5.131131947275445, + "grad_norm": 0.3267030715942383, + "learning_rate": 3.5888028264709886e-06, + "loss": 3.9236, + "step": 75520 + }, + { + "epoch": 5.131471667346107, + "grad_norm": 0.2631741166114807, + "learning_rate": 3.588378176382661e-06, + "loss": 3.7676, + "step": 75525 + }, + { + "epoch": 5.131811387416769, + "grad_norm": 0.2777940034866333, + "learning_rate": 3.5879535262943338e-06, + "loss": 3.7135, + "step": 75530 + }, + { + "epoch": 5.13215110748743, + "grad_norm": 0.34801197052001953, + "learning_rate": 3.5875288762060066e-06, + "loss": 3.7505, + "step": 75535 + }, + { + "epoch": 5.132490827558092, + "grad_norm": 0.2967514395713806, + "learning_rate": 3.5871042261176794e-06, + "loss": 3.9774, + "step": 75540 + }, + { + "epoch": 5.132830547628754, + "grad_norm": 0.21488970518112183, + "learning_rate": 3.586679576029352e-06, + "loss": 4.1019, + "step": 75545 + }, + { + "epoch": 5.133170267699415, + "grad_norm": 0.3491526246070862, + "learning_rate": 3.586254925941025e-06, + "loss": 3.7327, + "step": 75550 + }, + { + "epoch": 5.133509987770077, + "grad_norm": 0.22257454693317413, + "learning_rate": 3.5858302758526973e-06, + "loss": 3.8268, + "step": 75555 + }, + { + "epoch": 5.1338497078407395, + "grad_norm": 0.2888694703578949, + "learning_rate": 3.5854056257643706e-06, + "loss": 3.9932, + "step": 75560 + }, + { + "epoch": 5.134189427911401, + "grad_norm": 0.38461118936538696, + "learning_rate": 3.5849809756760434e-06, + "loss": 3.9814, + "step": 75565 + }, + { + "epoch": 5.134529147982063, + "grad_norm": 0.25647586584091187, + "learning_rate": 3.5845563255877157e-06, + "loss": 3.9474, + "step": 75570 + }, + { + "epoch": 5.134868868052725, + "grad_norm": 0.3299386203289032, + "learning_rate": 3.584131675499389e-06, + "loss": 3.8163, + "step": 75575 + }, + { + "epoch": 5.135208588123386, + "grad_norm": 0.2870849072933197, + "learning_rate": 3.5837070254110618e-06, + "loss": 3.833, + "step": 75580 + }, + { + "epoch": 5.135548308194048, + "grad_norm": 0.27849307656288147, + "learning_rate": 3.583282375322734e-06, + "loss": 4.0763, + "step": 75585 + }, + { + "epoch": 5.13588802826471, + "grad_norm": 0.3227826654911041, + "learning_rate": 3.582857725234407e-06, + "loss": 3.6043, + "step": 75590 + }, + { + "epoch": 5.136227748335371, + "grad_norm": 0.2140679955482483, + "learning_rate": 3.58243307514608e-06, + "loss": 3.8295, + "step": 75595 + }, + { + "epoch": 5.136567468406033, + "grad_norm": 0.24419645965099335, + "learning_rate": 3.5820084250577525e-06, + "loss": 3.7837, + "step": 75600 + }, + { + "epoch": 5.1369071884766955, + "grad_norm": 0.28938984870910645, + "learning_rate": 3.5815837749694253e-06, + "loss": 4.0368, + "step": 75605 + }, + { + "epoch": 5.137246908547357, + "grad_norm": 0.3571229577064514, + "learning_rate": 3.5811591248810986e-06, + "loss": 4.0692, + "step": 75610 + }, + { + "epoch": 5.137586628618019, + "grad_norm": 0.2872030735015869, + "learning_rate": 3.580734474792771e-06, + "loss": 4.0123, + "step": 75615 + }, + { + "epoch": 5.137926348688681, + "grad_norm": 0.3140016198158264, + "learning_rate": 3.5803098247044437e-06, + "loss": 4.1059, + "step": 75620 + }, + { + "epoch": 5.138266068759342, + "grad_norm": 0.25630006194114685, + "learning_rate": 3.5798851746161165e-06, + "loss": 4.0001, + "step": 75625 + }, + { + "epoch": 5.138605788830004, + "grad_norm": 0.2856374680995941, + "learning_rate": 3.5794605245277893e-06, + "loss": 3.9624, + "step": 75630 + }, + { + "epoch": 5.138945508900666, + "grad_norm": 0.2794826626777649, + "learning_rate": 3.579035874439462e-06, + "loss": 3.9043, + "step": 75635 + }, + { + "epoch": 5.139285228971327, + "grad_norm": 0.3209330141544342, + "learning_rate": 3.578611224351135e-06, + "loss": 3.6782, + "step": 75640 + }, + { + "epoch": 5.139624949041989, + "grad_norm": 0.21798880398273468, + "learning_rate": 3.5781865742628073e-06, + "loss": 3.8626, + "step": 75645 + }, + { + "epoch": 5.1399646691126515, + "grad_norm": 0.2359304577112198, + "learning_rate": 3.5777619241744805e-06, + "loss": 4.0283, + "step": 75650 + }, + { + "epoch": 5.140304389183313, + "grad_norm": 0.30009523034095764, + "learning_rate": 3.5773372740861534e-06, + "loss": 4.0765, + "step": 75655 + }, + { + "epoch": 5.140644109253975, + "grad_norm": 0.27231457829475403, + "learning_rate": 3.576912623997826e-06, + "loss": 3.7133, + "step": 75660 + }, + { + "epoch": 5.140983829324637, + "grad_norm": 0.21954436600208282, + "learning_rate": 3.576487973909499e-06, + "loss": 3.9614, + "step": 75665 + }, + { + "epoch": 5.141323549395298, + "grad_norm": 0.2709067165851593, + "learning_rate": 3.5760633238211718e-06, + "loss": 3.8694, + "step": 75670 + }, + { + "epoch": 5.14166326946596, + "grad_norm": 0.2900930643081665, + "learning_rate": 3.5756386737328446e-06, + "loss": 4.0385, + "step": 75675 + }, + { + "epoch": 5.142002989536622, + "grad_norm": 0.22636966407299042, + "learning_rate": 3.575214023644517e-06, + "loss": 3.9105, + "step": 75680 + }, + { + "epoch": 5.142342709607283, + "grad_norm": 0.26219797134399414, + "learning_rate": 3.57478937355619e-06, + "loss": 3.8644, + "step": 75685 + }, + { + "epoch": 5.142682429677945, + "grad_norm": 0.27276691794395447, + "learning_rate": 3.574364723467863e-06, + "loss": 3.9047, + "step": 75690 + }, + { + "epoch": 5.1430221497486075, + "grad_norm": 0.332793653011322, + "learning_rate": 3.5739400733795353e-06, + "loss": 4.0242, + "step": 75695 + }, + { + "epoch": 5.143361869819269, + "grad_norm": 0.28662610054016113, + "learning_rate": 3.5735154232912086e-06, + "loss": 3.9348, + "step": 75700 + }, + { + "epoch": 5.143701589889931, + "grad_norm": 0.22020824253559113, + "learning_rate": 3.5730907732028814e-06, + "loss": 4.084, + "step": 75705 + }, + { + "epoch": 5.144041309960593, + "grad_norm": 0.25447526574134827, + "learning_rate": 3.5726661231145537e-06, + "loss": 3.9414, + "step": 75710 + }, + { + "epoch": 5.144381030031254, + "grad_norm": 0.3312624394893646, + "learning_rate": 3.5722414730262265e-06, + "loss": 3.9335, + "step": 75715 + }, + { + "epoch": 5.144720750101916, + "grad_norm": 0.26014694571495056, + "learning_rate": 3.5718168229378998e-06, + "loss": 4.0423, + "step": 75720 + }, + { + "epoch": 5.145060470172578, + "grad_norm": 0.27315279841423035, + "learning_rate": 3.571392172849572e-06, + "loss": 4.0031, + "step": 75725 + }, + { + "epoch": 5.145400190243239, + "grad_norm": 0.3290015459060669, + "learning_rate": 3.570967522761245e-06, + "loss": 4.0717, + "step": 75730 + }, + { + "epoch": 5.145739910313901, + "grad_norm": 0.2712188959121704, + "learning_rate": 3.570542872672918e-06, + "loss": 3.7099, + "step": 75735 + }, + { + "epoch": 5.1460796303845635, + "grad_norm": 0.2532767653465271, + "learning_rate": 3.5701182225845905e-06, + "loss": 3.795, + "step": 75740 + }, + { + "epoch": 5.146419350455225, + "grad_norm": 0.2638775408267975, + "learning_rate": 3.5696935724962633e-06, + "loss": 3.9963, + "step": 75745 + }, + { + "epoch": 5.146759070525887, + "grad_norm": 0.23755653202533722, + "learning_rate": 3.569268922407936e-06, + "loss": 3.933, + "step": 75750 + }, + { + "epoch": 5.147098790596549, + "grad_norm": 0.28399401903152466, + "learning_rate": 3.5688442723196085e-06, + "loss": 3.8928, + "step": 75755 + }, + { + "epoch": 5.14743851066721, + "grad_norm": 0.28726181387901306, + "learning_rate": 3.5684196222312817e-06, + "loss": 3.655, + "step": 75760 + }, + { + "epoch": 5.147778230737872, + "grad_norm": 0.3148045837879181, + "learning_rate": 3.5679949721429545e-06, + "loss": 4.2649, + "step": 75765 + }, + { + "epoch": 5.148117950808534, + "grad_norm": 0.3103569447994232, + "learning_rate": 3.567570322054627e-06, + "loss": 4.0208, + "step": 75770 + }, + { + "epoch": 5.148457670879195, + "grad_norm": 0.32191476225852966, + "learning_rate": 3.5671456719663e-06, + "loss": 3.944, + "step": 75775 + }, + { + "epoch": 5.1487973909498574, + "grad_norm": 0.23163390159606934, + "learning_rate": 3.566721021877973e-06, + "loss": 3.8201, + "step": 75780 + }, + { + "epoch": 5.1491371110205195, + "grad_norm": 0.3832722008228302, + "learning_rate": 3.5662963717896453e-06, + "loss": 3.9117, + "step": 75785 + }, + { + "epoch": 5.149476831091181, + "grad_norm": 0.25626295804977417, + "learning_rate": 3.5658717217013185e-06, + "loss": 3.7964, + "step": 75790 + }, + { + "epoch": 5.149816551161843, + "grad_norm": 0.269458144903183, + "learning_rate": 3.5654470716129913e-06, + "loss": 3.96, + "step": 75795 + }, + { + "epoch": 5.150156271232504, + "grad_norm": 0.2130352258682251, + "learning_rate": 3.5650224215246637e-06, + "loss": 4.0136, + "step": 75800 + }, + { + "epoch": 5.150495991303166, + "grad_norm": 0.40207555890083313, + "learning_rate": 3.5645977714363365e-06, + "loss": 3.6587, + "step": 75805 + }, + { + "epoch": 5.150835711373828, + "grad_norm": 0.21626944839954376, + "learning_rate": 3.5641731213480098e-06, + "loss": 4.0756, + "step": 75810 + }, + { + "epoch": 5.151175431444489, + "grad_norm": 0.3262902796268463, + "learning_rate": 3.563748471259682e-06, + "loss": 3.8636, + "step": 75815 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.24714374542236328, + "learning_rate": 3.563323821171355e-06, + "loss": 3.8375, + "step": 75820 + }, + { + "epoch": 5.1518548715858135, + "grad_norm": 0.2512703537940979, + "learning_rate": 3.562899171083028e-06, + "loss": 3.7582, + "step": 75825 + }, + { + "epoch": 5.152194591656475, + "grad_norm": 0.25623181462287903, + "learning_rate": 3.562474520994701e-06, + "loss": 4.3359, + "step": 75830 + }, + { + "epoch": 5.152534311727137, + "grad_norm": 0.2580713629722595, + "learning_rate": 3.5620498709063733e-06, + "loss": 4.0208, + "step": 75835 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 0.28726962208747864, + "learning_rate": 3.561625220818046e-06, + "loss": 3.9869, + "step": 75840 + }, + { + "epoch": 5.15321375186846, + "grad_norm": 0.2783292233943939, + "learning_rate": 3.5612005707297194e-06, + "loss": 3.811, + "step": 75845 + }, + { + "epoch": 5.153553471939122, + "grad_norm": 0.23099327087402344, + "learning_rate": 3.5607759206413917e-06, + "loss": 3.9881, + "step": 75850 + }, + { + "epoch": 5.153893192009784, + "grad_norm": 0.2966514527797699, + "learning_rate": 3.5603512705530645e-06, + "loss": 3.9734, + "step": 75855 + }, + { + "epoch": 5.154232912080445, + "grad_norm": 0.3349573314189911, + "learning_rate": 3.5599266204647378e-06, + "loss": 3.8783, + "step": 75860 + }, + { + "epoch": 5.154572632151107, + "grad_norm": 0.32521742582321167, + "learning_rate": 3.55950197037641e-06, + "loss": 3.9883, + "step": 75865 + }, + { + "epoch": 5.1549123522217695, + "grad_norm": 0.2787611484527588, + "learning_rate": 3.559077320288083e-06, + "loss": 3.7933, + "step": 75870 + }, + { + "epoch": 5.155252072292431, + "grad_norm": 0.29812130331993103, + "learning_rate": 3.5586526701997557e-06, + "loss": 4.0191, + "step": 75875 + }, + { + "epoch": 5.155591792363093, + "grad_norm": 0.27020004391670227, + "learning_rate": 3.558228020111428e-06, + "loss": 3.845, + "step": 75880 + }, + { + "epoch": 5.155931512433755, + "grad_norm": 0.272748202085495, + "learning_rate": 3.5578033700231013e-06, + "loss": 3.9533, + "step": 75885 + }, + { + "epoch": 5.156271232504416, + "grad_norm": 0.3165072500705719, + "learning_rate": 3.557378719934774e-06, + "loss": 3.8638, + "step": 75890 + }, + { + "epoch": 5.156610952575078, + "grad_norm": 0.27916765213012695, + "learning_rate": 3.5569540698464465e-06, + "loss": 3.8117, + "step": 75895 + }, + { + "epoch": 5.15695067264574, + "grad_norm": 0.23307061195373535, + "learning_rate": 3.5565294197581197e-06, + "loss": 3.7832, + "step": 75900 + }, + { + "epoch": 5.157290392716401, + "grad_norm": 0.2766629457473755, + "learning_rate": 3.5561047696697925e-06, + "loss": 3.7426, + "step": 75905 + }, + { + "epoch": 5.157630112787063, + "grad_norm": 0.27175307273864746, + "learning_rate": 3.555680119581465e-06, + "loss": 4.01, + "step": 75910 + }, + { + "epoch": 5.1579698328577255, + "grad_norm": 0.24166955053806305, + "learning_rate": 3.555255469493138e-06, + "loss": 3.5191, + "step": 75915 + }, + { + "epoch": 5.158309552928387, + "grad_norm": 0.2966447174549103, + "learning_rate": 3.554830819404811e-06, + "loss": 3.9676, + "step": 75920 + }, + { + "epoch": 5.158649272999049, + "grad_norm": 0.22236983478069305, + "learning_rate": 3.5544061693164833e-06, + "loss": 3.9489, + "step": 75925 + }, + { + "epoch": 5.158988993069711, + "grad_norm": 0.24939768016338348, + "learning_rate": 3.553981519228156e-06, + "loss": 3.8332, + "step": 75930 + }, + { + "epoch": 5.159328713140372, + "grad_norm": 0.49419739842414856, + "learning_rate": 3.5535568691398293e-06, + "loss": 3.9113, + "step": 75935 + }, + { + "epoch": 5.159668433211034, + "grad_norm": 0.26229140162467957, + "learning_rate": 3.5531322190515017e-06, + "loss": 4.0327, + "step": 75940 + }, + { + "epoch": 5.160008153281696, + "grad_norm": 0.2835901081562042, + "learning_rate": 3.5527075689631745e-06, + "loss": 3.969, + "step": 75945 + }, + { + "epoch": 5.160347873352357, + "grad_norm": 0.21133220195770264, + "learning_rate": 3.5522829188748477e-06, + "loss": 3.7749, + "step": 75950 + }, + { + "epoch": 5.160687593423019, + "grad_norm": 0.3101678490638733, + "learning_rate": 3.55185826878652e-06, + "loss": 3.6369, + "step": 75955 + }, + { + "epoch": 5.1610273134936815, + "grad_norm": 0.3123478293418884, + "learning_rate": 3.551433618698193e-06, + "loss": 3.9206, + "step": 75960 + }, + { + "epoch": 5.161367033564343, + "grad_norm": 0.4219893217086792, + "learning_rate": 3.5510089686098657e-06, + "loss": 4.0615, + "step": 75965 + }, + { + "epoch": 5.161706753635005, + "grad_norm": 0.43520209193229675, + "learning_rate": 3.550584318521538e-06, + "loss": 3.8453, + "step": 75970 + }, + { + "epoch": 5.162046473705667, + "grad_norm": 0.22118555009365082, + "learning_rate": 3.5501596684332113e-06, + "loss": 3.9926, + "step": 75975 + }, + { + "epoch": 5.162386193776328, + "grad_norm": 0.5822362303733826, + "learning_rate": 3.549735018344884e-06, + "loss": 4.1689, + "step": 75980 + }, + { + "epoch": 5.16272591384699, + "grad_norm": 0.2618936002254486, + "learning_rate": 3.5493103682565565e-06, + "loss": 3.7096, + "step": 75985 + }, + { + "epoch": 5.163065633917652, + "grad_norm": 0.2456231266260147, + "learning_rate": 3.5488857181682297e-06, + "loss": 3.9575, + "step": 75990 + }, + { + "epoch": 5.163405353988313, + "grad_norm": 0.2499244660139084, + "learning_rate": 3.5484610680799025e-06, + "loss": 4.0042, + "step": 75995 + }, + { + "epoch": 5.163745074058975, + "grad_norm": 0.20953205227851868, + "learning_rate": 3.5480364179915753e-06, + "loss": 4.086, + "step": 76000 + }, + { + "epoch": 5.1640847941296375, + "grad_norm": 0.20875397324562073, + "learning_rate": 3.5476117679032477e-06, + "loss": 3.8802, + "step": 76005 + }, + { + "epoch": 5.164424514200299, + "grad_norm": 0.40534594655036926, + "learning_rate": 3.547187117814921e-06, + "loss": 3.8452, + "step": 76010 + }, + { + "epoch": 5.164764234270961, + "grad_norm": 0.21673735976219177, + "learning_rate": 3.5467624677265937e-06, + "loss": 3.8375, + "step": 76015 + }, + { + "epoch": 5.165103954341623, + "grad_norm": 0.28028830885887146, + "learning_rate": 3.546337817638266e-06, + "loss": 3.9421, + "step": 76020 + }, + { + "epoch": 5.165443674412284, + "grad_norm": 0.32364240288734436, + "learning_rate": 3.5459131675499393e-06, + "loss": 3.8504, + "step": 76025 + }, + { + "epoch": 5.165783394482946, + "grad_norm": 0.38261979818344116, + "learning_rate": 3.545488517461612e-06, + "loss": 3.8797, + "step": 76030 + }, + { + "epoch": 5.166123114553608, + "grad_norm": 0.3005666732788086, + "learning_rate": 3.5450638673732845e-06, + "loss": 4.1557, + "step": 76035 + }, + { + "epoch": 5.166462834624269, + "grad_norm": 0.5381197333335876, + "learning_rate": 3.5446392172849573e-06, + "loss": 4.029, + "step": 76040 + }, + { + "epoch": 5.166802554694931, + "grad_norm": 0.24058106541633606, + "learning_rate": 3.5442145671966305e-06, + "loss": 3.834, + "step": 76045 + }, + { + "epoch": 5.1671422747655935, + "grad_norm": 0.3032090663909912, + "learning_rate": 3.543789917108303e-06, + "loss": 3.9849, + "step": 76050 + }, + { + "epoch": 5.167481994836255, + "grad_norm": 0.30632784962654114, + "learning_rate": 3.5433652670199757e-06, + "loss": 3.8485, + "step": 76055 + }, + { + "epoch": 5.167821714906917, + "grad_norm": 0.24825604259967804, + "learning_rate": 3.542940616931649e-06, + "loss": 3.9744, + "step": 76060 + }, + { + "epoch": 5.168161434977579, + "grad_norm": 0.2012890726327896, + "learning_rate": 3.5425159668433213e-06, + "loss": 4.0342, + "step": 76065 + }, + { + "epoch": 5.16850115504824, + "grad_norm": 0.23210059106349945, + "learning_rate": 3.542091316754994e-06, + "loss": 4.1357, + "step": 76070 + }, + { + "epoch": 5.168840875118902, + "grad_norm": 0.34772390127182007, + "learning_rate": 3.5416666666666673e-06, + "loss": 3.8026, + "step": 76075 + }, + { + "epoch": 5.169180595189564, + "grad_norm": 0.30374133586883545, + "learning_rate": 3.5412420165783397e-06, + "loss": 3.9206, + "step": 76080 + }, + { + "epoch": 5.169520315260225, + "grad_norm": 0.2627328932285309, + "learning_rate": 3.5408173664900125e-06, + "loss": 3.9419, + "step": 76085 + }, + { + "epoch": 5.1698600353308874, + "grad_norm": 0.2874374985694885, + "learning_rate": 3.5403927164016853e-06, + "loss": 4.0714, + "step": 76090 + }, + { + "epoch": 5.1701997554015495, + "grad_norm": 0.2874017357826233, + "learning_rate": 3.5399680663133577e-06, + "loss": 3.9982, + "step": 76095 + }, + { + "epoch": 5.170539475472211, + "grad_norm": 0.3872988224029541, + "learning_rate": 3.539543416225031e-06, + "loss": 4.0022, + "step": 76100 + }, + { + "epoch": 5.170879195542873, + "grad_norm": 0.27199384570121765, + "learning_rate": 3.5391187661367037e-06, + "loss": 3.8485, + "step": 76105 + }, + { + "epoch": 5.171218915613535, + "grad_norm": 0.24937905371189117, + "learning_rate": 3.538694116048376e-06, + "loss": 3.8429, + "step": 76110 + }, + { + "epoch": 5.171558635684196, + "grad_norm": 0.2697790861129761, + "learning_rate": 3.5382694659600493e-06, + "loss": 4.1766, + "step": 76115 + }, + { + "epoch": 5.171898355754858, + "grad_norm": 0.2600199282169342, + "learning_rate": 3.537844815871722e-06, + "loss": 3.9402, + "step": 76120 + }, + { + "epoch": 5.17223807582552, + "grad_norm": 0.262381374835968, + "learning_rate": 3.5374201657833945e-06, + "loss": 3.6885, + "step": 76125 + }, + { + "epoch": 5.172577795896181, + "grad_norm": 0.2345334142446518, + "learning_rate": 3.5369955156950673e-06, + "loss": 4.1133, + "step": 76130 + }, + { + "epoch": 5.1729175159668435, + "grad_norm": 0.291146844625473, + "learning_rate": 3.5365708656067405e-06, + "loss": 4.0661, + "step": 76135 + }, + { + "epoch": 5.1732572360375055, + "grad_norm": 0.2430715560913086, + "learning_rate": 3.536146215518413e-06, + "loss": 3.6691, + "step": 76140 + }, + { + "epoch": 5.173596956108167, + "grad_norm": 0.2959074378013611, + "learning_rate": 3.5357215654300857e-06, + "loss": 3.9302, + "step": 76145 + }, + { + "epoch": 5.173936676178829, + "grad_norm": 0.25726115703582764, + "learning_rate": 3.535296915341759e-06, + "loss": 3.8935, + "step": 76150 + }, + { + "epoch": 5.174276396249491, + "grad_norm": 0.3095242381095886, + "learning_rate": 3.5348722652534313e-06, + "loss": 4.137, + "step": 76155 + }, + { + "epoch": 5.174616116320152, + "grad_norm": 0.24455443024635315, + "learning_rate": 3.534447615165104e-06, + "loss": 3.9393, + "step": 76160 + }, + { + "epoch": 5.174955836390814, + "grad_norm": 0.3075997233390808, + "learning_rate": 3.534022965076777e-06, + "loss": 4.0706, + "step": 76165 + }, + { + "epoch": 5.175295556461475, + "grad_norm": 0.41854774951934814, + "learning_rate": 3.53359831498845e-06, + "loss": 3.6257, + "step": 76170 + }, + { + "epoch": 5.175635276532137, + "grad_norm": 0.33543968200683594, + "learning_rate": 3.5331736649001225e-06, + "loss": 4.042, + "step": 76175 + }, + { + "epoch": 5.1759749966027995, + "grad_norm": 0.244694322347641, + "learning_rate": 3.5327490148117953e-06, + "loss": 4.0082, + "step": 76180 + }, + { + "epoch": 5.176314716673461, + "grad_norm": 0.2310916632413864, + "learning_rate": 3.5323243647234685e-06, + "loss": 3.9452, + "step": 76185 + }, + { + "epoch": 5.176654436744123, + "grad_norm": 0.36496758460998535, + "learning_rate": 3.531899714635141e-06, + "loss": 3.899, + "step": 76190 + }, + { + "epoch": 5.176994156814785, + "grad_norm": 0.2620406150817871, + "learning_rate": 3.5314750645468137e-06, + "loss": 4.0424, + "step": 76195 + }, + { + "epoch": 5.177333876885446, + "grad_norm": 0.29731395840644836, + "learning_rate": 3.531050414458487e-06, + "loss": 4.0684, + "step": 76200 + }, + { + "epoch": 5.177673596956108, + "grad_norm": 0.4221399128437042, + "learning_rate": 3.5306257643701593e-06, + "loss": 4.0453, + "step": 76205 + }, + { + "epoch": 5.17801331702677, + "grad_norm": 0.7389398813247681, + "learning_rate": 3.530201114281832e-06, + "loss": 4.0281, + "step": 76210 + }, + { + "epoch": 5.178353037097431, + "grad_norm": 0.2721656560897827, + "learning_rate": 3.529776464193505e-06, + "loss": 3.9285, + "step": 76215 + }, + { + "epoch": 5.178692757168093, + "grad_norm": 0.26922863721847534, + "learning_rate": 3.5293518141051773e-06, + "loss": 4.3163, + "step": 76220 + }, + { + "epoch": 5.1790324772387555, + "grad_norm": 0.23062610626220703, + "learning_rate": 3.5289271640168505e-06, + "loss": 4.1988, + "step": 76225 + }, + { + "epoch": 5.179372197309417, + "grad_norm": 0.2268524318933487, + "learning_rate": 3.5285025139285233e-06, + "loss": 3.9176, + "step": 76230 + }, + { + "epoch": 5.179711917380079, + "grad_norm": 0.26605746150016785, + "learning_rate": 3.5280778638401957e-06, + "loss": 3.7545, + "step": 76235 + }, + { + "epoch": 5.180051637450741, + "grad_norm": 0.20770449936389923, + "learning_rate": 3.527653213751869e-06, + "loss": 3.8993, + "step": 76240 + }, + { + "epoch": 5.180391357521402, + "grad_norm": 0.2353592813014984, + "learning_rate": 3.5272285636635417e-06, + "loss": 3.9205, + "step": 76245 + }, + { + "epoch": 5.180731077592064, + "grad_norm": 0.2304346263408661, + "learning_rate": 3.526803913575214e-06, + "loss": 4.0311, + "step": 76250 + }, + { + "epoch": 5.181070797662726, + "grad_norm": 0.28812140226364136, + "learning_rate": 3.526379263486887e-06, + "loss": 3.9022, + "step": 76255 + }, + { + "epoch": 5.181410517733387, + "grad_norm": 0.2553544342517853, + "learning_rate": 3.52595461339856e-06, + "loss": 3.8104, + "step": 76260 + }, + { + "epoch": 5.181750237804049, + "grad_norm": 0.2859724760055542, + "learning_rate": 3.5255299633102325e-06, + "loss": 3.9573, + "step": 76265 + }, + { + "epoch": 5.1820899578747115, + "grad_norm": 0.2998558282852173, + "learning_rate": 3.5251053132219053e-06, + "loss": 3.9535, + "step": 76270 + }, + { + "epoch": 5.182429677945373, + "grad_norm": 0.24180179834365845, + "learning_rate": 3.5246806631335785e-06, + "loss": 3.9696, + "step": 76275 + }, + { + "epoch": 5.182769398016035, + "grad_norm": 0.28778520226478577, + "learning_rate": 3.524256013045251e-06, + "loss": 3.8894, + "step": 76280 + }, + { + "epoch": 5.183109118086697, + "grad_norm": 0.34429529309272766, + "learning_rate": 3.5238313629569237e-06, + "loss": 3.8882, + "step": 76285 + }, + { + "epoch": 5.183448838157358, + "grad_norm": 0.2653641104698181, + "learning_rate": 3.5234067128685965e-06, + "loss": 4.1146, + "step": 76290 + }, + { + "epoch": 5.18378855822802, + "grad_norm": 0.2666766345500946, + "learning_rate": 3.5229820627802693e-06, + "loss": 3.7589, + "step": 76295 + }, + { + "epoch": 5.184128278298682, + "grad_norm": 0.33530572056770325, + "learning_rate": 3.522557412691942e-06, + "loss": 3.8218, + "step": 76300 + }, + { + "epoch": 5.184467998369343, + "grad_norm": 0.2645062208175659, + "learning_rate": 3.522132762603615e-06, + "loss": 4.0174, + "step": 76305 + }, + { + "epoch": 5.184807718440005, + "grad_norm": 0.2906581461429596, + "learning_rate": 3.5217081125152873e-06, + "loss": 3.9834, + "step": 76310 + }, + { + "epoch": 5.1851474385106675, + "grad_norm": 0.28945034742355347, + "learning_rate": 3.5212834624269605e-06, + "loss": 3.9844, + "step": 76315 + }, + { + "epoch": 5.185487158581329, + "grad_norm": 0.32043540477752686, + "learning_rate": 3.5208588123386333e-06, + "loss": 3.9531, + "step": 76320 + }, + { + "epoch": 5.185826878651991, + "grad_norm": 0.30581897497177124, + "learning_rate": 3.5204341622503057e-06, + "loss": 3.9133, + "step": 76325 + }, + { + "epoch": 5.186166598722653, + "grad_norm": 0.2631821632385254, + "learning_rate": 3.520009512161979e-06, + "loss": 4.0193, + "step": 76330 + }, + { + "epoch": 5.186506318793314, + "grad_norm": 0.2617184519767761, + "learning_rate": 3.5195848620736517e-06, + "loss": 3.9254, + "step": 76335 + }, + { + "epoch": 5.186846038863976, + "grad_norm": 0.35700124502182007, + "learning_rate": 3.5191602119853245e-06, + "loss": 4.2343, + "step": 76340 + }, + { + "epoch": 5.187185758934638, + "grad_norm": 0.26083701848983765, + "learning_rate": 3.518735561896997e-06, + "loss": 3.8815, + "step": 76345 + }, + { + "epoch": 5.187525479005299, + "grad_norm": 0.24731412529945374, + "learning_rate": 3.51831091180867e-06, + "loss": 4.01, + "step": 76350 + }, + { + "epoch": 5.187865199075961, + "grad_norm": 0.29873326420783997, + "learning_rate": 3.517886261720343e-06, + "loss": 4.165, + "step": 76355 + }, + { + "epoch": 5.1882049191466235, + "grad_norm": 0.26420751214027405, + "learning_rate": 3.5174616116320153e-06, + "loss": 3.7251, + "step": 76360 + }, + { + "epoch": 5.188544639217285, + "grad_norm": 0.21308881044387817, + "learning_rate": 3.5170369615436885e-06, + "loss": 3.9149, + "step": 76365 + }, + { + "epoch": 5.188884359287947, + "grad_norm": 0.2290252298116684, + "learning_rate": 3.5166123114553613e-06, + "loss": 3.9043, + "step": 76370 + }, + { + "epoch": 5.189224079358609, + "grad_norm": 0.2553272545337677, + "learning_rate": 3.5161876613670337e-06, + "loss": 3.8558, + "step": 76375 + }, + { + "epoch": 5.18956379942927, + "grad_norm": 0.22251728177070618, + "learning_rate": 3.5157630112787065e-06, + "loss": 3.6947, + "step": 76380 + }, + { + "epoch": 5.189903519499932, + "grad_norm": 0.30624184012413025, + "learning_rate": 3.5153383611903797e-06, + "loss": 3.9744, + "step": 76385 + }, + { + "epoch": 5.190243239570594, + "grad_norm": 0.26245421171188354, + "learning_rate": 3.514913711102052e-06, + "loss": 3.9543, + "step": 76390 + }, + { + "epoch": 5.190582959641255, + "grad_norm": 0.37156954407691956, + "learning_rate": 3.514489061013725e-06, + "loss": 4.0899, + "step": 76395 + }, + { + "epoch": 5.1909226797119175, + "grad_norm": 0.29103419184684753, + "learning_rate": 3.514064410925398e-06, + "loss": 3.8033, + "step": 76400 + }, + { + "epoch": 5.1912623997825795, + "grad_norm": 0.23188936710357666, + "learning_rate": 3.5136397608370705e-06, + "loss": 4.086, + "step": 76405 + }, + { + "epoch": 5.191602119853241, + "grad_norm": 0.23455046117305756, + "learning_rate": 3.5132151107487433e-06, + "loss": 4.1642, + "step": 76410 + }, + { + "epoch": 5.191941839923903, + "grad_norm": 0.24530844390392303, + "learning_rate": 3.512790460660416e-06, + "loss": 3.6158, + "step": 76415 + }, + { + "epoch": 5.192281559994565, + "grad_norm": 0.21825860440731049, + "learning_rate": 3.512365810572089e-06, + "loss": 3.6039, + "step": 76420 + }, + { + "epoch": 5.192621280065226, + "grad_norm": 0.22032636404037476, + "learning_rate": 3.5119411604837617e-06, + "loss": 4.0337, + "step": 76425 + }, + { + "epoch": 5.192961000135888, + "grad_norm": 0.30300652980804443, + "learning_rate": 3.5115165103954345e-06, + "loss": 3.7285, + "step": 76430 + }, + { + "epoch": 5.19330072020655, + "grad_norm": 0.25929826498031616, + "learning_rate": 3.511091860307107e-06, + "loss": 3.713, + "step": 76435 + }, + { + "epoch": 5.193640440277211, + "grad_norm": 0.2440170794725418, + "learning_rate": 3.51066721021878e-06, + "loss": 3.7983, + "step": 76440 + }, + { + "epoch": 5.1939801603478735, + "grad_norm": 0.26434606313705444, + "learning_rate": 3.510242560130453e-06, + "loss": 3.9591, + "step": 76445 + }, + { + "epoch": 5.1943198804185355, + "grad_norm": 0.19449397921562195, + "learning_rate": 3.5098179100421253e-06, + "loss": 4.0851, + "step": 76450 + }, + { + "epoch": 5.194659600489197, + "grad_norm": 0.26665109395980835, + "learning_rate": 3.5093932599537985e-06, + "loss": 3.9256, + "step": 76455 + }, + { + "epoch": 5.194999320559859, + "grad_norm": 0.24046844244003296, + "learning_rate": 3.5089686098654713e-06, + "loss": 4.0354, + "step": 76460 + }, + { + "epoch": 5.195339040630521, + "grad_norm": 0.2512931525707245, + "learning_rate": 3.5085439597771437e-06, + "loss": 4.1088, + "step": 76465 + }, + { + "epoch": 5.195678760701182, + "grad_norm": 0.24042698740959167, + "learning_rate": 3.5081193096888165e-06, + "loss": 3.9783, + "step": 76470 + }, + { + "epoch": 5.196018480771844, + "grad_norm": 0.27634957432746887, + "learning_rate": 3.5076946596004897e-06, + "loss": 3.9645, + "step": 76475 + }, + { + "epoch": 5.196358200842505, + "grad_norm": 0.17944484949111938, + "learning_rate": 3.507270009512162e-06, + "loss": 3.8701, + "step": 76480 + }, + { + "epoch": 5.196697920913167, + "grad_norm": 0.24965335428714752, + "learning_rate": 3.506845359423835e-06, + "loss": 3.988, + "step": 76485 + }, + { + "epoch": 5.1970376409838295, + "grad_norm": 0.2726874053478241, + "learning_rate": 3.506420709335508e-06, + "loss": 4.0241, + "step": 76490 + }, + { + "epoch": 5.197377361054491, + "grad_norm": 0.2636529207229614, + "learning_rate": 3.5059960592471805e-06, + "loss": 3.8138, + "step": 76495 + }, + { + "epoch": 5.197717081125153, + "grad_norm": 0.3046864867210388, + "learning_rate": 3.5055714091588533e-06, + "loss": 3.893, + "step": 76500 + }, + { + "epoch": 5.198056801195815, + "grad_norm": 0.2864179313182831, + "learning_rate": 3.505146759070526e-06, + "loss": 3.8332, + "step": 76505 + }, + { + "epoch": 5.198396521266476, + "grad_norm": 0.2829386293888092, + "learning_rate": 3.5047221089821993e-06, + "loss": 3.9085, + "step": 76510 + }, + { + "epoch": 5.198736241337138, + "grad_norm": 0.23592829704284668, + "learning_rate": 3.5042974588938717e-06, + "loss": 4.0854, + "step": 76515 + }, + { + "epoch": 5.1990759614078, + "grad_norm": 0.19264890253543854, + "learning_rate": 3.5038728088055445e-06, + "loss": 3.772, + "step": 76520 + }, + { + "epoch": 5.199415681478461, + "grad_norm": 0.3187466263771057, + "learning_rate": 3.5034481587172177e-06, + "loss": 3.8706, + "step": 76525 + }, + { + "epoch": 5.199755401549123, + "grad_norm": 0.20135921239852905, + "learning_rate": 3.50302350862889e-06, + "loss": 3.9163, + "step": 76530 + }, + { + "epoch": 5.2000951216197855, + "grad_norm": 0.26125621795654297, + "learning_rate": 3.502598858540563e-06, + "loss": 4.0388, + "step": 76535 + }, + { + "epoch": 5.200434841690447, + "grad_norm": 0.27856963872909546, + "learning_rate": 3.5021742084522357e-06, + "loss": 4.014, + "step": 76540 + }, + { + "epoch": 5.200774561761109, + "grad_norm": 0.2734912931919098, + "learning_rate": 3.501749558363908e-06, + "loss": 3.9475, + "step": 76545 + }, + { + "epoch": 5.201114281831771, + "grad_norm": 0.3171399235725403, + "learning_rate": 3.5013249082755813e-06, + "loss": 4.182, + "step": 76550 + }, + { + "epoch": 5.201454001902432, + "grad_norm": 0.24952386319637299, + "learning_rate": 3.500900258187254e-06, + "loss": 3.9177, + "step": 76555 + }, + { + "epoch": 5.201793721973094, + "grad_norm": 0.3207945227622986, + "learning_rate": 3.5004756080989265e-06, + "loss": 3.9493, + "step": 76560 + }, + { + "epoch": 5.202133442043756, + "grad_norm": 0.2903314530849457, + "learning_rate": 3.5000509580105997e-06, + "loss": 3.9847, + "step": 76565 + }, + { + "epoch": 5.202473162114417, + "grad_norm": 0.359968900680542, + "learning_rate": 3.4996263079222725e-06, + "loss": 4.0104, + "step": 76570 + }, + { + "epoch": 5.202812882185079, + "grad_norm": 0.21594853699207306, + "learning_rate": 3.499201657833945e-06, + "loss": 3.8825, + "step": 76575 + }, + { + "epoch": 5.2031526022557415, + "grad_norm": 0.26423436403274536, + "learning_rate": 3.498777007745618e-06, + "loss": 3.9923, + "step": 76580 + }, + { + "epoch": 5.203492322326403, + "grad_norm": 0.20247718691825867, + "learning_rate": 3.498352357657291e-06, + "loss": 3.8445, + "step": 76585 + }, + { + "epoch": 5.203832042397065, + "grad_norm": 0.30158331990242004, + "learning_rate": 3.4979277075689633e-06, + "loss": 3.9249, + "step": 76590 + }, + { + "epoch": 5.204171762467727, + "grad_norm": 0.26902279257774353, + "learning_rate": 3.497503057480636e-06, + "loss": 3.9653, + "step": 76595 + }, + { + "epoch": 5.204511482538388, + "grad_norm": 0.2217189520597458, + "learning_rate": 3.4970784073923093e-06, + "loss": 4.0599, + "step": 76600 + }, + { + "epoch": 5.20485120260905, + "grad_norm": 0.31828826665878296, + "learning_rate": 3.4966537573039817e-06, + "loss": 3.8083, + "step": 76605 + }, + { + "epoch": 5.205190922679712, + "grad_norm": 0.2642469108104706, + "learning_rate": 3.4962291072156545e-06, + "loss": 3.8206, + "step": 76610 + }, + { + "epoch": 5.205530642750373, + "grad_norm": 0.26733464002609253, + "learning_rate": 3.4958044571273277e-06, + "loss": 3.8998, + "step": 76615 + }, + { + "epoch": 5.205870362821035, + "grad_norm": 0.2072748839855194, + "learning_rate": 3.495379807039e-06, + "loss": 4.0199, + "step": 76620 + }, + { + "epoch": 5.2062100828916975, + "grad_norm": 0.36511602997779846, + "learning_rate": 3.494955156950673e-06, + "loss": 4.0686, + "step": 76625 + }, + { + "epoch": 5.206549802962359, + "grad_norm": 0.25056251883506775, + "learning_rate": 3.4945305068623457e-06, + "loss": 3.802, + "step": 76630 + }, + { + "epoch": 5.206889523033021, + "grad_norm": 0.2765139043331146, + "learning_rate": 3.494105856774018e-06, + "loss": 4.1333, + "step": 76635 + }, + { + "epoch": 5.207229243103683, + "grad_norm": 0.21213401854038239, + "learning_rate": 3.4936812066856913e-06, + "loss": 3.9761, + "step": 76640 + }, + { + "epoch": 5.207568963174344, + "grad_norm": 0.228286013007164, + "learning_rate": 3.493256556597364e-06, + "loss": 3.7052, + "step": 76645 + }, + { + "epoch": 5.207908683245006, + "grad_norm": 0.2787511348724365, + "learning_rate": 3.4928319065090365e-06, + "loss": 3.829, + "step": 76650 + }, + { + "epoch": 5.208248403315668, + "grad_norm": 0.2808598577976227, + "learning_rate": 3.4924072564207097e-06, + "loss": 4.1219, + "step": 76655 + }, + { + "epoch": 5.208588123386329, + "grad_norm": 0.2680814564228058, + "learning_rate": 3.4919826063323825e-06, + "loss": 3.8882, + "step": 76660 + }, + { + "epoch": 5.2089278434569914, + "grad_norm": 0.28664690256118774, + "learning_rate": 3.491557956244055e-06, + "loss": 3.9905, + "step": 76665 + }, + { + "epoch": 5.2092675635276535, + "grad_norm": 0.2684873938560486, + "learning_rate": 3.4911333061557277e-06, + "loss": 3.9041, + "step": 76670 + }, + { + "epoch": 5.209607283598315, + "grad_norm": 0.23679815232753754, + "learning_rate": 3.490708656067401e-06, + "loss": 4.1632, + "step": 76675 + }, + { + "epoch": 5.209947003668977, + "grad_norm": 0.2341061234474182, + "learning_rate": 3.4902840059790737e-06, + "loss": 3.6955, + "step": 76680 + }, + { + "epoch": 5.210286723739639, + "grad_norm": 0.26674792170524597, + "learning_rate": 3.489859355890746e-06, + "loss": 3.8797, + "step": 76685 + }, + { + "epoch": 5.2106264438103, + "grad_norm": 0.26587724685668945, + "learning_rate": 3.4894347058024193e-06, + "loss": 3.82, + "step": 76690 + }, + { + "epoch": 5.210966163880962, + "grad_norm": 0.22609108686447144, + "learning_rate": 3.489010055714092e-06, + "loss": 3.8451, + "step": 76695 + }, + { + "epoch": 5.211305883951624, + "grad_norm": 0.23291441798210144, + "learning_rate": 3.4885854056257645e-06, + "loss": 3.8625, + "step": 76700 + }, + { + "epoch": 5.211645604022285, + "grad_norm": 0.26465460658073425, + "learning_rate": 3.4881607555374377e-06, + "loss": 3.8627, + "step": 76705 + }, + { + "epoch": 5.2119853240929475, + "grad_norm": 0.27430853247642517, + "learning_rate": 3.4877361054491105e-06, + "loss": 4.0734, + "step": 76710 + }, + { + "epoch": 5.2123250441636095, + "grad_norm": 0.33493077754974365, + "learning_rate": 3.487311455360783e-06, + "loss": 3.9246, + "step": 76715 + }, + { + "epoch": 5.212664764234271, + "grad_norm": 0.2974295914173126, + "learning_rate": 3.4868868052724557e-06, + "loss": 4.137, + "step": 76720 + }, + { + "epoch": 5.213004484304933, + "grad_norm": 0.22205373644828796, + "learning_rate": 3.486462155184129e-06, + "loss": 3.8789, + "step": 76725 + }, + { + "epoch": 5.213344204375595, + "grad_norm": 0.32576146721839905, + "learning_rate": 3.4860375050958013e-06, + "loss": 3.9198, + "step": 76730 + }, + { + "epoch": 5.213683924446256, + "grad_norm": 0.2653430700302124, + "learning_rate": 3.485612855007474e-06, + "loss": 4.0029, + "step": 76735 + }, + { + "epoch": 5.214023644516918, + "grad_norm": 0.31973010301589966, + "learning_rate": 3.4851882049191473e-06, + "loss": 3.953, + "step": 76740 + }, + { + "epoch": 5.21436336458758, + "grad_norm": 0.26498380303382874, + "learning_rate": 3.4847635548308197e-06, + "loss": 3.7745, + "step": 76745 + }, + { + "epoch": 5.214703084658241, + "grad_norm": 0.31099966168403625, + "learning_rate": 3.4843389047424925e-06, + "loss": 3.7408, + "step": 76750 + }, + { + "epoch": 5.2150428047289035, + "grad_norm": 0.33426791429519653, + "learning_rate": 3.4839142546541653e-06, + "loss": 3.9337, + "step": 76755 + }, + { + "epoch": 5.2153825247995655, + "grad_norm": 0.2404015064239502, + "learning_rate": 3.4834896045658377e-06, + "loss": 3.8074, + "step": 76760 + }, + { + "epoch": 5.215722244870227, + "grad_norm": 0.4494825601577759, + "learning_rate": 3.483064954477511e-06, + "loss": 3.8917, + "step": 76765 + }, + { + "epoch": 5.216061964940889, + "grad_norm": 0.3267088830471039, + "learning_rate": 3.4826403043891837e-06, + "loss": 3.8925, + "step": 76770 + }, + { + "epoch": 5.216401685011551, + "grad_norm": 0.2520063817501068, + "learning_rate": 3.482215654300856e-06, + "loss": 4.066, + "step": 76775 + }, + { + "epoch": 5.216741405082212, + "grad_norm": 0.3037286102771759, + "learning_rate": 3.4817910042125293e-06, + "loss": 3.801, + "step": 76780 + }, + { + "epoch": 5.217081125152874, + "grad_norm": 0.2853677272796631, + "learning_rate": 3.481366354124202e-06, + "loss": 4.0729, + "step": 76785 + }, + { + "epoch": 5.217420845223536, + "grad_norm": 0.2764425277709961, + "learning_rate": 3.4809417040358745e-06, + "loss": 4.1989, + "step": 76790 + }, + { + "epoch": 5.217760565294197, + "grad_norm": 0.3343999981880188, + "learning_rate": 3.4805170539475473e-06, + "loss": 3.7375, + "step": 76795 + }, + { + "epoch": 5.2181002853648595, + "grad_norm": 0.42606502771377563, + "learning_rate": 3.4800924038592205e-06, + "loss": 4.111, + "step": 76800 + }, + { + "epoch": 5.2184400054355216, + "grad_norm": 0.21046288311481476, + "learning_rate": 3.479667753770893e-06, + "loss": 3.9886, + "step": 76805 + }, + { + "epoch": 5.218779725506183, + "grad_norm": 0.2361156940460205, + "learning_rate": 3.4792431036825657e-06, + "loss": 3.8427, + "step": 76810 + }, + { + "epoch": 5.219119445576845, + "grad_norm": 0.19893859326839447, + "learning_rate": 3.478818453594239e-06, + "loss": 3.7328, + "step": 76815 + }, + { + "epoch": 5.219459165647507, + "grad_norm": 0.23953841626644135, + "learning_rate": 3.4783938035059113e-06, + "loss": 3.816, + "step": 76820 + }, + { + "epoch": 5.219798885718168, + "grad_norm": 0.25642135739326477, + "learning_rate": 3.477969153417584e-06, + "loss": 3.8342, + "step": 76825 + }, + { + "epoch": 5.22013860578883, + "grad_norm": 0.23467016220092773, + "learning_rate": 3.477544503329257e-06, + "loss": 3.9182, + "step": 76830 + }, + { + "epoch": 5.220478325859492, + "grad_norm": 0.3158566355705261, + "learning_rate": 3.4771198532409297e-06, + "loss": 3.5896, + "step": 76835 + }, + { + "epoch": 5.220818045930153, + "grad_norm": 0.2789274752140045, + "learning_rate": 3.4766952031526025e-06, + "loss": 3.8159, + "step": 76840 + }, + { + "epoch": 5.2211577660008155, + "grad_norm": 0.2213311344385147, + "learning_rate": 3.4762705530642753e-06, + "loss": 3.7396, + "step": 76845 + }, + { + "epoch": 5.221497486071477, + "grad_norm": 0.2766816318035126, + "learning_rate": 3.4758459029759485e-06, + "loss": 3.9587, + "step": 76850 + }, + { + "epoch": 5.221837206142139, + "grad_norm": 0.23592469096183777, + "learning_rate": 3.475421252887621e-06, + "loss": 3.8126, + "step": 76855 + }, + { + "epoch": 5.222176926212801, + "grad_norm": 0.29414498805999756, + "learning_rate": 3.4749966027992937e-06, + "loss": 4.0865, + "step": 76860 + }, + { + "epoch": 5.222516646283462, + "grad_norm": 0.5791751146316528, + "learning_rate": 3.474571952710967e-06, + "loss": 4.0148, + "step": 76865 + }, + { + "epoch": 5.222856366354124, + "grad_norm": 0.37361201643943787, + "learning_rate": 3.4741473026226393e-06, + "loss": 3.973, + "step": 76870 + }, + { + "epoch": 5.223196086424786, + "grad_norm": 0.25365662574768066, + "learning_rate": 3.473722652534312e-06, + "loss": 3.9207, + "step": 76875 + }, + { + "epoch": 5.223535806495447, + "grad_norm": 0.21894198656082153, + "learning_rate": 3.473298002445985e-06, + "loss": 3.8004, + "step": 76880 + }, + { + "epoch": 5.223875526566109, + "grad_norm": 0.2605803310871124, + "learning_rate": 3.4728733523576573e-06, + "loss": 3.7776, + "step": 76885 + }, + { + "epoch": 5.2242152466367715, + "grad_norm": 0.22241924703121185, + "learning_rate": 3.4724487022693305e-06, + "loss": 4.0866, + "step": 76890 + }, + { + "epoch": 5.224554966707433, + "grad_norm": 0.33810627460479736, + "learning_rate": 3.4720240521810033e-06, + "loss": 3.6571, + "step": 76895 + }, + { + "epoch": 5.224894686778095, + "grad_norm": 0.317496120929718, + "learning_rate": 3.4715994020926757e-06, + "loss": 4.0423, + "step": 76900 + }, + { + "epoch": 5.225234406848757, + "grad_norm": 0.24351607263088226, + "learning_rate": 3.471174752004349e-06, + "loss": 4.1742, + "step": 76905 + }, + { + "epoch": 5.225574126919418, + "grad_norm": 0.24985729157924652, + "learning_rate": 3.4707501019160217e-06, + "loss": 3.9256, + "step": 76910 + }, + { + "epoch": 5.22591384699008, + "grad_norm": 0.2681102156639099, + "learning_rate": 3.470325451827694e-06, + "loss": 4.1045, + "step": 76915 + }, + { + "epoch": 5.226253567060742, + "grad_norm": 0.2688215672969818, + "learning_rate": 3.469900801739367e-06, + "loss": 3.9588, + "step": 76920 + }, + { + "epoch": 5.226593287131403, + "grad_norm": 0.2594856917858124, + "learning_rate": 3.46947615165104e-06, + "loss": 4.2069, + "step": 76925 + }, + { + "epoch": 5.226933007202065, + "grad_norm": 0.19272582232952118, + "learning_rate": 3.4690515015627125e-06, + "loss": 4.0257, + "step": 76930 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.2690064609050751, + "learning_rate": 3.4686268514743853e-06, + "loss": 3.767, + "step": 76935 + }, + { + "epoch": 5.227612447343389, + "grad_norm": 0.24641317129135132, + "learning_rate": 3.4682022013860585e-06, + "loss": 3.6589, + "step": 76940 + }, + { + "epoch": 5.227952167414051, + "grad_norm": 0.22214065492153168, + "learning_rate": 3.467777551297731e-06, + "loss": 3.9307, + "step": 76945 + }, + { + "epoch": 5.228291887484713, + "grad_norm": 0.2347736805677414, + "learning_rate": 3.4673529012094037e-06, + "loss": 3.8194, + "step": 76950 + }, + { + "epoch": 5.228631607555374, + "grad_norm": 0.19075952470302582, + "learning_rate": 3.4669282511210765e-06, + "loss": 4.0407, + "step": 76955 + }, + { + "epoch": 5.228971327626036, + "grad_norm": 0.2820796072483063, + "learning_rate": 3.4665036010327493e-06, + "loss": 3.9473, + "step": 76960 + }, + { + "epoch": 5.229311047696698, + "grad_norm": 0.27564048767089844, + "learning_rate": 3.466078950944422e-06, + "loss": 3.9509, + "step": 76965 + }, + { + "epoch": 5.229650767767359, + "grad_norm": 0.2539384663105011, + "learning_rate": 3.465654300856095e-06, + "loss": 3.8731, + "step": 76970 + }, + { + "epoch": 5.2299904878380215, + "grad_norm": 0.24627377092838287, + "learning_rate": 3.4652296507677672e-06, + "loss": 4.1041, + "step": 76975 + }, + { + "epoch": 5.2303302079086835, + "grad_norm": 0.26489540934562683, + "learning_rate": 3.4648050006794405e-06, + "loss": 3.8956, + "step": 76980 + }, + { + "epoch": 5.230669927979345, + "grad_norm": 0.2852974236011505, + "learning_rate": 3.4643803505911133e-06, + "loss": 3.9956, + "step": 76985 + }, + { + "epoch": 5.231009648050007, + "grad_norm": 0.2716602683067322, + "learning_rate": 3.4639557005027857e-06, + "loss": 3.9249, + "step": 76990 + }, + { + "epoch": 5.231349368120669, + "grad_norm": 0.22017799317836761, + "learning_rate": 3.463531050414459e-06, + "loss": 3.9192, + "step": 76995 + }, + { + "epoch": 5.23168908819133, + "grad_norm": 0.25833773612976074, + "learning_rate": 3.4631064003261317e-06, + "loss": 3.9993, + "step": 77000 + }, + { + "epoch": 5.232028808261992, + "grad_norm": 0.24817833304405212, + "learning_rate": 3.462681750237804e-06, + "loss": 3.8467, + "step": 77005 + }, + { + "epoch": 5.232368528332654, + "grad_norm": 0.31045371294021606, + "learning_rate": 3.462257100149477e-06, + "loss": 3.9077, + "step": 77010 + }, + { + "epoch": 5.232708248403315, + "grad_norm": 0.2233341783285141, + "learning_rate": 3.46183245006115e-06, + "loss": 3.8628, + "step": 77015 + }, + { + "epoch": 5.2330479684739775, + "grad_norm": 0.23839488625526428, + "learning_rate": 3.461407799972823e-06, + "loss": 3.8954, + "step": 77020 + }, + { + "epoch": 5.2333876885446395, + "grad_norm": 0.24511761963367462, + "learning_rate": 3.4609831498844953e-06, + "loss": 4.0142, + "step": 77025 + }, + { + "epoch": 5.233727408615301, + "grad_norm": 0.203882098197937, + "learning_rate": 3.4605584997961685e-06, + "loss": 3.9077, + "step": 77030 + }, + { + "epoch": 5.234067128685963, + "grad_norm": 0.3145321011543274, + "learning_rate": 3.4601338497078413e-06, + "loss": 3.9699, + "step": 77035 + }, + { + "epoch": 5.234406848756625, + "grad_norm": 0.19370092451572418, + "learning_rate": 3.4597091996195137e-06, + "loss": 4.0849, + "step": 77040 + }, + { + "epoch": 5.234746568827286, + "grad_norm": 0.2603953778743744, + "learning_rate": 3.4592845495311865e-06, + "loss": 3.9242, + "step": 77045 + }, + { + "epoch": 5.235086288897948, + "grad_norm": 0.21705390512943268, + "learning_rate": 3.4588598994428597e-06, + "loss": 3.9576, + "step": 77050 + }, + { + "epoch": 5.23542600896861, + "grad_norm": 0.2941245138645172, + "learning_rate": 3.458435249354532e-06, + "loss": 3.9586, + "step": 77055 + }, + { + "epoch": 5.235765729039271, + "grad_norm": 0.22796916961669922, + "learning_rate": 3.458010599266205e-06, + "loss": 3.7728, + "step": 77060 + }, + { + "epoch": 5.2361054491099335, + "grad_norm": 0.3000531792640686, + "learning_rate": 3.457585949177878e-06, + "loss": 4.0624, + "step": 77065 + }, + { + "epoch": 5.2364451691805955, + "grad_norm": 0.2659234404563904, + "learning_rate": 3.4571612990895505e-06, + "loss": 3.9407, + "step": 77070 + }, + { + "epoch": 5.236784889251257, + "grad_norm": 0.207083597779274, + "learning_rate": 3.4567366490012233e-06, + "loss": 3.9668, + "step": 77075 + }, + { + "epoch": 5.237124609321919, + "grad_norm": 0.5265147089958191, + "learning_rate": 3.456311998912896e-06, + "loss": 3.8356, + "step": 77080 + }, + { + "epoch": 5.237464329392581, + "grad_norm": 0.31143131852149963, + "learning_rate": 3.455887348824569e-06, + "loss": 4.122, + "step": 77085 + }, + { + "epoch": 5.237804049463242, + "grad_norm": 0.27576929330825806, + "learning_rate": 3.4554626987362417e-06, + "loss": 3.8817, + "step": 77090 + }, + { + "epoch": 5.238143769533904, + "grad_norm": 0.31297898292541504, + "learning_rate": 3.4550380486479145e-06, + "loss": 4.0321, + "step": 77095 + }, + { + "epoch": 5.238483489604566, + "grad_norm": 0.23362180590629578, + "learning_rate": 3.454613398559587e-06, + "loss": 3.8576, + "step": 77100 + }, + { + "epoch": 5.238823209675227, + "grad_norm": 0.2963678240776062, + "learning_rate": 3.45418874847126e-06, + "loss": 3.9305, + "step": 77105 + }, + { + "epoch": 5.2391629297458895, + "grad_norm": 0.31044289469718933, + "learning_rate": 3.453764098382933e-06, + "loss": 3.9568, + "step": 77110 + }, + { + "epoch": 5.239502649816552, + "grad_norm": 0.3907771706581116, + "learning_rate": 3.4533394482946052e-06, + "loss": 3.8245, + "step": 77115 + }, + { + "epoch": 5.239842369887213, + "grad_norm": 0.24486960470676422, + "learning_rate": 3.4529147982062785e-06, + "loss": 3.9822, + "step": 77120 + }, + { + "epoch": 5.240182089957875, + "grad_norm": 0.4601282775402069, + "learning_rate": 3.4524901481179513e-06, + "loss": 3.6642, + "step": 77125 + }, + { + "epoch": 5.240521810028537, + "grad_norm": 0.264067679643631, + "learning_rate": 3.4520654980296236e-06, + "loss": 4.1081, + "step": 77130 + }, + { + "epoch": 5.240861530099198, + "grad_norm": 0.34557029604911804, + "learning_rate": 3.4516408479412964e-06, + "loss": 4.1018, + "step": 77135 + }, + { + "epoch": 5.24120125016986, + "grad_norm": 0.2703489065170288, + "learning_rate": 3.4512161978529697e-06, + "loss": 4.1618, + "step": 77140 + }, + { + "epoch": 5.241540970240522, + "grad_norm": 0.24917322397232056, + "learning_rate": 3.450791547764642e-06, + "loss": 3.7674, + "step": 77145 + }, + { + "epoch": 5.241880690311183, + "grad_norm": 0.22074134647846222, + "learning_rate": 3.450366897676315e-06, + "loss": 4.0326, + "step": 77150 + }, + { + "epoch": 5.2422204103818455, + "grad_norm": 0.22718170285224915, + "learning_rate": 3.449942247587988e-06, + "loss": 3.8292, + "step": 77155 + }, + { + "epoch": 5.242560130452507, + "grad_norm": 0.2742270827293396, + "learning_rate": 3.4495175974996605e-06, + "loss": 3.7508, + "step": 77160 + }, + { + "epoch": 5.242899850523169, + "grad_norm": 0.3253614604473114, + "learning_rate": 3.4490929474113333e-06, + "loss": 3.969, + "step": 77165 + }, + { + "epoch": 5.243239570593831, + "grad_norm": 0.2785051763057709, + "learning_rate": 3.448668297323006e-06, + "loss": 3.9538, + "step": 77170 + }, + { + "epoch": 5.243579290664492, + "grad_norm": 0.3722057342529297, + "learning_rate": 3.4482436472346784e-06, + "loss": 3.9052, + "step": 77175 + }, + { + "epoch": 5.243919010735154, + "grad_norm": 0.3405396044254303, + "learning_rate": 3.4478189971463517e-06, + "loss": 3.9966, + "step": 77180 + }, + { + "epoch": 5.244258730805816, + "grad_norm": 0.2760040760040283, + "learning_rate": 3.4473943470580245e-06, + "loss": 3.8909, + "step": 77185 + }, + { + "epoch": 5.244598450876477, + "grad_norm": 0.3922823965549469, + "learning_rate": 3.4469696969696977e-06, + "loss": 3.4876, + "step": 77190 + }, + { + "epoch": 5.244938170947139, + "grad_norm": 0.3274230360984802, + "learning_rate": 3.44654504688137e-06, + "loss": 4.0628, + "step": 77195 + }, + { + "epoch": 5.2452778910178015, + "grad_norm": 0.2610299587249756, + "learning_rate": 3.446120396793043e-06, + "loss": 3.9723, + "step": 77200 + }, + { + "epoch": 5.245617611088463, + "grad_norm": 0.28211650252342224, + "learning_rate": 3.4456957467047157e-06, + "loss": 3.9061, + "step": 77205 + }, + { + "epoch": 5.245957331159125, + "grad_norm": 0.3008144795894623, + "learning_rate": 3.4452710966163885e-06, + "loss": 3.6805, + "step": 77210 + }, + { + "epoch": 5.246297051229787, + "grad_norm": 0.3233990967273712, + "learning_rate": 3.4448464465280613e-06, + "loss": 3.8583, + "step": 77215 + }, + { + "epoch": 5.246636771300448, + "grad_norm": 0.2854853570461273, + "learning_rate": 3.444421796439734e-06, + "loss": 4.0257, + "step": 77220 + }, + { + "epoch": 5.24697649137111, + "grad_norm": 0.26789408922195435, + "learning_rate": 3.4439971463514064e-06, + "loss": 3.9171, + "step": 77225 + }, + { + "epoch": 5.247316211441772, + "grad_norm": 0.3016020357608795, + "learning_rate": 3.4435724962630797e-06, + "loss": 3.9659, + "step": 77230 + }, + { + "epoch": 5.247655931512433, + "grad_norm": 0.23763613402843475, + "learning_rate": 3.4431478461747525e-06, + "loss": 3.682, + "step": 77235 + }, + { + "epoch": 5.2479956515830954, + "grad_norm": 0.33130860328674316, + "learning_rate": 3.442723196086425e-06, + "loss": 3.769, + "step": 77240 + }, + { + "epoch": 5.2483353716537575, + "grad_norm": 0.4199323356151581, + "learning_rate": 3.442298545998098e-06, + "loss": 4.0047, + "step": 77245 + }, + { + "epoch": 5.248675091724419, + "grad_norm": 0.2517714202404022, + "learning_rate": 3.441873895909771e-06, + "loss": 3.875, + "step": 77250 + }, + { + "epoch": 5.249014811795081, + "grad_norm": 0.2997249960899353, + "learning_rate": 3.4414492458214432e-06, + "loss": 3.6711, + "step": 77255 + }, + { + "epoch": 5.249354531865743, + "grad_norm": 0.21124544739723206, + "learning_rate": 3.441024595733116e-06, + "loss": 3.9722, + "step": 77260 + }, + { + "epoch": 5.249694251936404, + "grad_norm": 0.26533183455467224, + "learning_rate": 3.4405999456447893e-06, + "loss": 3.9486, + "step": 77265 + }, + { + "epoch": 5.250033972007066, + "grad_norm": 0.2538341283798218, + "learning_rate": 3.4401752955564616e-06, + "loss": 3.9304, + "step": 77270 + }, + { + "epoch": 5.250373692077728, + "grad_norm": 0.32518988847732544, + "learning_rate": 3.4397506454681344e-06, + "loss": 4.0249, + "step": 77275 + }, + { + "epoch": 5.250713412148389, + "grad_norm": 0.24879679083824158, + "learning_rate": 3.4393259953798077e-06, + "loss": 4.0296, + "step": 77280 + }, + { + "epoch": 5.2510531322190515, + "grad_norm": 0.3157253861427307, + "learning_rate": 3.43890134529148e-06, + "loss": 3.9528, + "step": 77285 + }, + { + "epoch": 5.2513928522897135, + "grad_norm": 0.2241828739643097, + "learning_rate": 3.438476695203153e-06, + "loss": 3.8135, + "step": 77290 + }, + { + "epoch": 5.251732572360375, + "grad_norm": 0.2726873457431793, + "learning_rate": 3.4380520451148257e-06, + "loss": 3.9307, + "step": 77295 + }, + { + "epoch": 5.252072292431037, + "grad_norm": 0.44649091362953186, + "learning_rate": 3.437627395026498e-06, + "loss": 3.9404, + "step": 77300 + }, + { + "epoch": 5.252412012501699, + "grad_norm": 0.23148977756500244, + "learning_rate": 3.4372027449381713e-06, + "loss": 4.1333, + "step": 77305 + }, + { + "epoch": 5.25275173257236, + "grad_norm": 0.41009363532066345, + "learning_rate": 3.436778094849844e-06, + "loss": 4.0701, + "step": 77310 + }, + { + "epoch": 5.253091452643022, + "grad_norm": 0.2222164273262024, + "learning_rate": 3.4363534447615164e-06, + "loss": 4.0459, + "step": 77315 + }, + { + "epoch": 5.253431172713684, + "grad_norm": 0.20838862657546997, + "learning_rate": 3.4359287946731897e-06, + "loss": 3.7533, + "step": 77320 + }, + { + "epoch": 5.253770892784345, + "grad_norm": 0.2294550985097885, + "learning_rate": 3.4355041445848625e-06, + "loss": 4.1702, + "step": 77325 + }, + { + "epoch": 5.2541106128550075, + "grad_norm": 0.6320992708206177, + "learning_rate": 3.435079494496535e-06, + "loss": 3.8344, + "step": 77330 + }, + { + "epoch": 5.2544503329256695, + "grad_norm": 0.3456054627895355, + "learning_rate": 3.4346548444082076e-06, + "loss": 3.9202, + "step": 77335 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 0.20257686078548431, + "learning_rate": 3.434230194319881e-06, + "loss": 3.9728, + "step": 77340 + }, + { + "epoch": 5.255129773066993, + "grad_norm": 0.2139112651348114, + "learning_rate": 3.4338055442315532e-06, + "loss": 4.0032, + "step": 77345 + }, + { + "epoch": 5.255469493137655, + "grad_norm": 0.2649727761745453, + "learning_rate": 3.433380894143226e-06, + "loss": 3.8913, + "step": 77350 + }, + { + "epoch": 5.255809213208316, + "grad_norm": 0.24517668783664703, + "learning_rate": 3.4329562440548993e-06, + "loss": 3.791, + "step": 77355 + }, + { + "epoch": 5.256148933278978, + "grad_norm": 0.23999781906604767, + "learning_rate": 3.432531593966572e-06, + "loss": 4.0591, + "step": 77360 + }, + { + "epoch": 5.25648865334964, + "grad_norm": 0.27620959281921387, + "learning_rate": 3.4321069438782444e-06, + "loss": 3.851, + "step": 77365 + }, + { + "epoch": 5.256828373420301, + "grad_norm": 0.19737209379673004, + "learning_rate": 3.4316822937899177e-06, + "loss": 4.2163, + "step": 77370 + }, + { + "epoch": 5.2571680934909635, + "grad_norm": 0.27307426929473877, + "learning_rate": 3.4312576437015905e-06, + "loss": 3.775, + "step": 77375 + }, + { + "epoch": 5.2575078135616256, + "grad_norm": 0.35192278027534485, + "learning_rate": 3.430832993613263e-06, + "loss": 4.0564, + "step": 77380 + }, + { + "epoch": 5.257847533632287, + "grad_norm": 0.28477752208709717, + "learning_rate": 3.4304083435249356e-06, + "loss": 4.0814, + "step": 77385 + }, + { + "epoch": 5.258187253702949, + "grad_norm": 0.2554295063018799, + "learning_rate": 3.429983693436609e-06, + "loss": 3.8878, + "step": 77390 + }, + { + "epoch": 5.258526973773611, + "grad_norm": 0.2828984260559082, + "learning_rate": 3.4295590433482812e-06, + "loss": 4.0322, + "step": 77395 + }, + { + "epoch": 5.258866693844272, + "grad_norm": 0.2596217691898346, + "learning_rate": 3.429134393259954e-06, + "loss": 3.7281, + "step": 77400 + }, + { + "epoch": 5.259206413914934, + "grad_norm": 0.26874086260795593, + "learning_rate": 3.4287097431716273e-06, + "loss": 3.9246, + "step": 77405 + }, + { + "epoch": 5.259546133985596, + "grad_norm": 0.27788403630256653, + "learning_rate": 3.4282850930832996e-06, + "loss": 4.0852, + "step": 77410 + }, + { + "epoch": 5.259885854056257, + "grad_norm": 0.2545951008796692, + "learning_rate": 3.4278604429949724e-06, + "loss": 3.9695, + "step": 77415 + }, + { + "epoch": 5.2602255741269195, + "grad_norm": 0.2602097988128662, + "learning_rate": 3.4274357929066452e-06, + "loss": 4.2782, + "step": 77420 + }, + { + "epoch": 5.260565294197582, + "grad_norm": 0.34158286452293396, + "learning_rate": 3.4270111428183176e-06, + "loss": 3.6783, + "step": 77425 + }, + { + "epoch": 5.260905014268243, + "grad_norm": 0.3189217746257782, + "learning_rate": 3.426586492729991e-06, + "loss": 3.9504, + "step": 77430 + }, + { + "epoch": 5.261244734338905, + "grad_norm": 0.2694982588291168, + "learning_rate": 3.4261618426416636e-06, + "loss": 3.9512, + "step": 77435 + }, + { + "epoch": 5.261584454409567, + "grad_norm": 0.2040504813194275, + "learning_rate": 3.425737192553336e-06, + "loss": 4.0426, + "step": 77440 + }, + { + "epoch": 5.261924174480228, + "grad_norm": 0.2639462351799011, + "learning_rate": 3.4253125424650093e-06, + "loss": 3.9628, + "step": 77445 + }, + { + "epoch": 5.26226389455089, + "grad_norm": 0.24676337838172913, + "learning_rate": 3.424887892376682e-06, + "loss": 4.0226, + "step": 77450 + }, + { + "epoch": 5.262603614621552, + "grad_norm": 0.3187102675437927, + "learning_rate": 3.4244632422883544e-06, + "loss": 3.9389, + "step": 77455 + }, + { + "epoch": 5.262943334692213, + "grad_norm": 0.3127620816230774, + "learning_rate": 3.4240385922000272e-06, + "loss": 3.9524, + "step": 77460 + }, + { + "epoch": 5.2632830547628755, + "grad_norm": 0.29968664050102234, + "learning_rate": 3.4236139421117005e-06, + "loss": 4.0577, + "step": 77465 + }, + { + "epoch": 5.263622774833538, + "grad_norm": 0.29815033078193665, + "learning_rate": 3.423189292023373e-06, + "loss": 4.0021, + "step": 77470 + }, + { + "epoch": 5.263962494904199, + "grad_norm": 0.26405584812164307, + "learning_rate": 3.4227646419350456e-06, + "loss": 3.989, + "step": 77475 + }, + { + "epoch": 5.264302214974861, + "grad_norm": 0.45117488503456116, + "learning_rate": 3.422339991846719e-06, + "loss": 4.0389, + "step": 77480 + }, + { + "epoch": 5.264641935045523, + "grad_norm": 0.26212719082832336, + "learning_rate": 3.4219153417583912e-06, + "loss": 3.7387, + "step": 77485 + }, + { + "epoch": 5.264981655116184, + "grad_norm": 0.24937517940998077, + "learning_rate": 3.421490691670064e-06, + "loss": 3.7288, + "step": 77490 + }, + { + "epoch": 5.265321375186846, + "grad_norm": 0.27964362502098083, + "learning_rate": 3.4210660415817373e-06, + "loss": 3.8625, + "step": 77495 + }, + { + "epoch": 5.265661095257508, + "grad_norm": 0.2701824903488159, + "learning_rate": 3.4206413914934096e-06, + "loss": 3.9411, + "step": 77500 + }, + { + "epoch": 5.266000815328169, + "grad_norm": 0.2221783548593521, + "learning_rate": 3.4202167414050824e-06, + "loss": 3.987, + "step": 77505 + }, + { + "epoch": 5.2663405353988315, + "grad_norm": 0.2622314989566803, + "learning_rate": 3.4197920913167552e-06, + "loss": 3.7661, + "step": 77510 + }, + { + "epoch": 5.266680255469494, + "grad_norm": 0.2308296114206314, + "learning_rate": 3.4193674412284276e-06, + "loss": 4.0671, + "step": 77515 + }, + { + "epoch": 5.267019975540155, + "grad_norm": 0.3453049957752228, + "learning_rate": 3.418942791140101e-06, + "loss": 4.1384, + "step": 77520 + }, + { + "epoch": 5.267359695610817, + "grad_norm": 0.21492619812488556, + "learning_rate": 3.4185181410517736e-06, + "loss": 3.8513, + "step": 77525 + }, + { + "epoch": 5.267699415681479, + "grad_norm": 0.2701980173587799, + "learning_rate": 3.418093490963447e-06, + "loss": 3.8815, + "step": 77530 + }, + { + "epoch": 5.26803913575214, + "grad_norm": 0.2997503876686096, + "learning_rate": 3.4176688408751192e-06, + "loss": 3.7776, + "step": 77535 + }, + { + "epoch": 5.268378855822802, + "grad_norm": 0.513251781463623, + "learning_rate": 3.417244190786792e-06, + "loss": 4.1561, + "step": 77540 + }, + { + "epoch": 5.268718575893463, + "grad_norm": 0.24089080095291138, + "learning_rate": 3.416819540698465e-06, + "loss": 4.053, + "step": 77545 + }, + { + "epoch": 5.2690582959641254, + "grad_norm": 0.24209290742874146, + "learning_rate": 3.4163948906101372e-06, + "loss": 3.8759, + "step": 77550 + }, + { + "epoch": 5.2693980160347875, + "grad_norm": 0.2794300615787506, + "learning_rate": 3.4159702405218104e-06, + "loss": 3.6814, + "step": 77555 + }, + { + "epoch": 5.269737736105449, + "grad_norm": 0.29227834939956665, + "learning_rate": 3.4155455904334832e-06, + "loss": 3.8567, + "step": 77560 + }, + { + "epoch": 5.270077456176111, + "grad_norm": 0.25778841972351074, + "learning_rate": 3.4151209403451556e-06, + "loss": 3.9861, + "step": 77565 + }, + { + "epoch": 5.270417176246773, + "grad_norm": 0.24007867276668549, + "learning_rate": 3.414696290256829e-06, + "loss": 4.0008, + "step": 77570 + }, + { + "epoch": 5.270756896317434, + "grad_norm": 0.25134289264678955, + "learning_rate": 3.4142716401685016e-06, + "loss": 4.1618, + "step": 77575 + }, + { + "epoch": 5.271096616388096, + "grad_norm": 0.38050568103790283, + "learning_rate": 3.413846990080174e-06, + "loss": 3.9762, + "step": 77580 + }, + { + "epoch": 5.271436336458758, + "grad_norm": 0.39916786551475525, + "learning_rate": 3.413422339991847e-06, + "loss": 4.0355, + "step": 77585 + }, + { + "epoch": 5.271776056529419, + "grad_norm": 0.2650955021381378, + "learning_rate": 3.41299768990352e-06, + "loss": 3.8753, + "step": 77590 + }, + { + "epoch": 5.2721157766000815, + "grad_norm": 0.2315085381269455, + "learning_rate": 3.4125730398151924e-06, + "loss": 3.9969, + "step": 77595 + }, + { + "epoch": 5.2724554966707435, + "grad_norm": 0.36126989126205444, + "learning_rate": 3.4121483897268652e-06, + "loss": 3.9305, + "step": 77600 + }, + { + "epoch": 5.272795216741405, + "grad_norm": 0.2449740320444107, + "learning_rate": 3.4117237396385385e-06, + "loss": 3.9113, + "step": 77605 + }, + { + "epoch": 5.273134936812067, + "grad_norm": 0.2599676251411438, + "learning_rate": 3.411299089550211e-06, + "loss": 3.7942, + "step": 77610 + }, + { + "epoch": 5.273474656882729, + "grad_norm": 0.23523512482643127, + "learning_rate": 3.4108744394618836e-06, + "loss": 3.8505, + "step": 77615 + }, + { + "epoch": 5.27381437695339, + "grad_norm": 0.4243110120296478, + "learning_rate": 3.4104497893735564e-06, + "loss": 4.0864, + "step": 77620 + }, + { + "epoch": 5.274154097024052, + "grad_norm": 0.33355513215065, + "learning_rate": 3.4100251392852292e-06, + "loss": 3.9333, + "step": 77625 + }, + { + "epoch": 5.274493817094714, + "grad_norm": 0.2826298177242279, + "learning_rate": 3.409600489196902e-06, + "loss": 3.9434, + "step": 77630 + }, + { + "epoch": 5.274833537165375, + "grad_norm": 0.2608388662338257, + "learning_rate": 3.409175839108575e-06, + "loss": 4.0329, + "step": 77635 + }, + { + "epoch": 5.2751732572360375, + "grad_norm": 0.22985456883907318, + "learning_rate": 3.408751189020247e-06, + "loss": 4.1148, + "step": 77640 + }, + { + "epoch": 5.2755129773066995, + "grad_norm": 0.34337276220321655, + "learning_rate": 3.4083265389319204e-06, + "loss": 3.8034, + "step": 77645 + }, + { + "epoch": 5.275852697377361, + "grad_norm": 0.29447653889656067, + "learning_rate": 3.4079018888435932e-06, + "loss": 3.9493, + "step": 77650 + }, + { + "epoch": 5.276192417448023, + "grad_norm": 0.24104604125022888, + "learning_rate": 3.4074772387552656e-06, + "loss": 3.9928, + "step": 77655 + }, + { + "epoch": 5.276532137518685, + "grad_norm": 0.26111385226249695, + "learning_rate": 3.407052588666939e-06, + "loss": 3.9428, + "step": 77660 + }, + { + "epoch": 5.276871857589346, + "grad_norm": 0.35350146889686584, + "learning_rate": 3.4066279385786116e-06, + "loss": 3.7549, + "step": 77665 + }, + { + "epoch": 5.277211577660008, + "grad_norm": 0.2445794939994812, + "learning_rate": 3.406203288490284e-06, + "loss": 3.9984, + "step": 77670 + }, + { + "epoch": 5.27755129773067, + "grad_norm": 0.2173519730567932, + "learning_rate": 3.405778638401957e-06, + "loss": 3.8681, + "step": 77675 + }, + { + "epoch": 5.277891017801331, + "grad_norm": 0.42298004031181335, + "learning_rate": 3.40535398831363e-06, + "loss": 3.6406, + "step": 77680 + }, + { + "epoch": 5.2782307378719935, + "grad_norm": 0.26548248529434204, + "learning_rate": 3.4049293382253024e-06, + "loss": 3.8844, + "step": 77685 + }, + { + "epoch": 5.278570457942656, + "grad_norm": 0.35332080721855164, + "learning_rate": 3.4045046881369752e-06, + "loss": 3.9137, + "step": 77690 + }, + { + "epoch": 5.278910178013317, + "grad_norm": 0.2604771554470062, + "learning_rate": 3.4040800380486484e-06, + "loss": 3.9964, + "step": 77695 + }, + { + "epoch": 5.279249898083979, + "grad_norm": 0.1845943033695221, + "learning_rate": 3.4036553879603212e-06, + "loss": 3.757, + "step": 77700 + }, + { + "epoch": 5.279589618154641, + "grad_norm": 0.22635307908058167, + "learning_rate": 3.4032307378719936e-06, + "loss": 4.101, + "step": 77705 + }, + { + "epoch": 5.279929338225302, + "grad_norm": 0.3656695485115051, + "learning_rate": 3.4028060877836664e-06, + "loss": 3.7024, + "step": 77710 + }, + { + "epoch": 5.280269058295964, + "grad_norm": 0.30590254068374634, + "learning_rate": 3.4023814376953396e-06, + "loss": 3.9374, + "step": 77715 + }, + { + "epoch": 5.280608778366626, + "grad_norm": 0.2831103205680847, + "learning_rate": 3.401956787607012e-06, + "loss": 4.096, + "step": 77720 + }, + { + "epoch": 5.280948498437287, + "grad_norm": 0.44035667181015015, + "learning_rate": 3.401532137518685e-06, + "loss": 4.0711, + "step": 77725 + }, + { + "epoch": 5.2812882185079495, + "grad_norm": 0.3148064613342285, + "learning_rate": 3.401107487430358e-06, + "loss": 3.9231, + "step": 77730 + }, + { + "epoch": 5.281627938578612, + "grad_norm": 0.2720510959625244, + "learning_rate": 3.4006828373420304e-06, + "loss": 4.0445, + "step": 77735 + }, + { + "epoch": 5.281967658649273, + "grad_norm": 0.3779733180999756, + "learning_rate": 3.4002581872537032e-06, + "loss": 3.8189, + "step": 77740 + }, + { + "epoch": 5.282307378719935, + "grad_norm": 0.33642616868019104, + "learning_rate": 3.399833537165376e-06, + "loss": 3.7992, + "step": 77745 + }, + { + "epoch": 5.282647098790597, + "grad_norm": 0.2457066923379898, + "learning_rate": 3.399408887077049e-06, + "loss": 3.6868, + "step": 77750 + }, + { + "epoch": 5.282986818861258, + "grad_norm": 0.27312588691711426, + "learning_rate": 3.3989842369887216e-06, + "loss": 3.886, + "step": 77755 + }, + { + "epoch": 5.28332653893192, + "grad_norm": 0.3003731071949005, + "learning_rate": 3.3985595869003944e-06, + "loss": 4.0616, + "step": 77760 + }, + { + "epoch": 5.283666259002582, + "grad_norm": 0.31650635600090027, + "learning_rate": 3.398134936812067e-06, + "loss": 3.5181, + "step": 77765 + }, + { + "epoch": 5.284005979073243, + "grad_norm": 0.2337028533220291, + "learning_rate": 3.39771028672374e-06, + "loss": 3.7683, + "step": 77770 + }, + { + "epoch": 5.2843456991439055, + "grad_norm": 0.24304001033306122, + "learning_rate": 3.397285636635413e-06, + "loss": 3.7426, + "step": 77775 + }, + { + "epoch": 5.284685419214568, + "grad_norm": 0.27750304341316223, + "learning_rate": 3.396860986547085e-06, + "loss": 3.7733, + "step": 77780 + }, + { + "epoch": 5.285025139285229, + "grad_norm": 0.25097671151161194, + "learning_rate": 3.3964363364587584e-06, + "loss": 3.9275, + "step": 77785 + }, + { + "epoch": 5.285364859355891, + "grad_norm": 0.28697386384010315, + "learning_rate": 3.3960116863704312e-06, + "loss": 4.0081, + "step": 77790 + }, + { + "epoch": 5.285704579426553, + "grad_norm": 0.23983722925186157, + "learning_rate": 3.3955870362821036e-06, + "loss": 3.8236, + "step": 77795 + }, + { + "epoch": 5.286044299497214, + "grad_norm": 0.33263474702835083, + "learning_rate": 3.3951623861937764e-06, + "loss": 3.8494, + "step": 77800 + }, + { + "epoch": 5.286384019567876, + "grad_norm": 0.20254004001617432, + "learning_rate": 3.3947377361054496e-06, + "loss": 3.8564, + "step": 77805 + }, + { + "epoch": 5.286723739638538, + "grad_norm": 0.2657763957977295, + "learning_rate": 3.3943980160347877e-06, + "loss": 4.0341, + "step": 77810 + }, + { + "epoch": 5.287063459709199, + "grad_norm": 0.3179771602153778, + "learning_rate": 3.39397336594646e-06, + "loss": 4.3101, + "step": 77815 + }, + { + "epoch": 5.2874031797798615, + "grad_norm": 0.26672130823135376, + "learning_rate": 3.3935487158581333e-06, + "loss": 4.0139, + "step": 77820 + }, + { + "epoch": 5.287742899850523, + "grad_norm": 0.2911469638347626, + "learning_rate": 3.393124065769806e-06, + "loss": 3.9082, + "step": 77825 + }, + { + "epoch": 5.288082619921185, + "grad_norm": 0.28670379519462585, + "learning_rate": 3.3926994156814785e-06, + "loss": 3.6772, + "step": 77830 + }, + { + "epoch": 5.288422339991847, + "grad_norm": 0.24930334091186523, + "learning_rate": 3.3922747655931513e-06, + "loss": 3.9495, + "step": 77835 + }, + { + "epoch": 5.288762060062508, + "grad_norm": 0.24754981696605682, + "learning_rate": 3.3918501155048245e-06, + "loss": 4.1357, + "step": 77840 + }, + { + "epoch": 5.28910178013317, + "grad_norm": 0.3164045810699463, + "learning_rate": 3.391425465416497e-06, + "loss": 4.1487, + "step": 77845 + }, + { + "epoch": 5.289441500203832, + "grad_norm": 0.2980177104473114, + "learning_rate": 3.3910008153281697e-06, + "loss": 4.0371, + "step": 77850 + }, + { + "epoch": 5.289781220274493, + "grad_norm": 0.3040612041950226, + "learning_rate": 3.390576165239843e-06, + "loss": 4.089, + "step": 77855 + }, + { + "epoch": 5.2901209403451555, + "grad_norm": 0.24346216022968292, + "learning_rate": 3.3901515151515153e-06, + "loss": 3.9823, + "step": 77860 + }, + { + "epoch": 5.2904606604158175, + "grad_norm": 0.2702236771583557, + "learning_rate": 3.389726865063188e-06, + "loss": 3.8753, + "step": 77865 + }, + { + "epoch": 5.290800380486479, + "grad_norm": 0.18324117362499237, + "learning_rate": 3.3893022149748613e-06, + "loss": 3.6642, + "step": 77870 + }, + { + "epoch": 5.291140100557141, + "grad_norm": 0.23771947622299194, + "learning_rate": 3.3888775648865337e-06, + "loss": 4.0798, + "step": 77875 + }, + { + "epoch": 5.291479820627803, + "grad_norm": 0.312743216753006, + "learning_rate": 3.3884529147982065e-06, + "loss": 3.8892, + "step": 77880 + }, + { + "epoch": 5.291819540698464, + "grad_norm": 0.2427680343389511, + "learning_rate": 3.3880282647098793e-06, + "loss": 3.7418, + "step": 77885 + }, + { + "epoch": 5.292159260769126, + "grad_norm": 0.24156087636947632, + "learning_rate": 3.3876036146215517e-06, + "loss": 3.8251, + "step": 77890 + }, + { + "epoch": 5.292498980839788, + "grad_norm": 0.30738526582717896, + "learning_rate": 3.387178964533225e-06, + "loss": 3.7959, + "step": 77895 + }, + { + "epoch": 5.292838700910449, + "grad_norm": 0.3172670006752014, + "learning_rate": 3.3867543144448977e-06, + "loss": 3.9607, + "step": 77900 + }, + { + "epoch": 5.2931784209811115, + "grad_norm": 0.37396615743637085, + "learning_rate": 3.386329664356571e-06, + "loss": 3.9827, + "step": 77905 + }, + { + "epoch": 5.2935181410517735, + "grad_norm": 0.5475718975067139, + "learning_rate": 3.3859050142682433e-06, + "loss": 3.7413, + "step": 77910 + }, + { + "epoch": 5.293857861122435, + "grad_norm": 0.2535010278224945, + "learning_rate": 3.385480364179916e-06, + "loss": 3.8414, + "step": 77915 + }, + { + "epoch": 5.294197581193097, + "grad_norm": 0.3048439621925354, + "learning_rate": 3.385055714091589e-06, + "loss": 3.9865, + "step": 77920 + }, + { + "epoch": 5.294537301263759, + "grad_norm": 0.36087948083877563, + "learning_rate": 3.3846310640032613e-06, + "loss": 3.921, + "step": 77925 + }, + { + "epoch": 5.29487702133442, + "grad_norm": 0.2768619954586029, + "learning_rate": 3.3842064139149345e-06, + "loss": 3.9037, + "step": 77930 + }, + { + "epoch": 5.295216741405082, + "grad_norm": 0.27332523465156555, + "learning_rate": 3.3837817638266073e-06, + "loss": 3.9338, + "step": 77935 + }, + { + "epoch": 5.295556461475744, + "grad_norm": 0.2972012162208557, + "learning_rate": 3.3833571137382797e-06, + "loss": 3.9114, + "step": 77940 + }, + { + "epoch": 5.295896181546405, + "grad_norm": 0.30718621611595154, + "learning_rate": 3.382932463649953e-06, + "loss": 4.0368, + "step": 77945 + }, + { + "epoch": 5.2962359016170675, + "grad_norm": 0.22066761553287506, + "learning_rate": 3.3825078135616257e-06, + "loss": 3.9692, + "step": 77950 + }, + { + "epoch": 5.2965756216877296, + "grad_norm": 0.2562660276889801, + "learning_rate": 3.382083163473298e-06, + "loss": 4.0669, + "step": 77955 + }, + { + "epoch": 5.296915341758391, + "grad_norm": 0.21149572730064392, + "learning_rate": 3.381658513384971e-06, + "loss": 3.66, + "step": 77960 + }, + { + "epoch": 5.297255061829053, + "grad_norm": 0.3354402482509613, + "learning_rate": 3.381233863296644e-06, + "loss": 4.0546, + "step": 77965 + }, + { + "epoch": 5.297594781899715, + "grad_norm": 0.2396773099899292, + "learning_rate": 3.3808092132083165e-06, + "loss": 3.9108, + "step": 77970 + }, + { + "epoch": 5.297934501970376, + "grad_norm": 0.4049952030181885, + "learning_rate": 3.3803845631199893e-06, + "loss": 3.7728, + "step": 77975 + }, + { + "epoch": 5.298274222041038, + "grad_norm": 0.248119056224823, + "learning_rate": 3.3799599130316625e-06, + "loss": 4.1197, + "step": 77980 + }, + { + "epoch": 5.2986139421117, + "grad_norm": 0.2427961677312851, + "learning_rate": 3.379535262943335e-06, + "loss": 3.8476, + "step": 77985 + }, + { + "epoch": 5.298953662182361, + "grad_norm": 0.30018898844718933, + "learning_rate": 3.3791106128550077e-06, + "loss": 3.9662, + "step": 77990 + }, + { + "epoch": 5.2992933822530235, + "grad_norm": 0.3591747581958771, + "learning_rate": 3.378685962766681e-06, + "loss": 3.7789, + "step": 77995 + }, + { + "epoch": 5.299633102323686, + "grad_norm": 0.2238129824399948, + "learning_rate": 3.3782613126783533e-06, + "loss": 3.9874, + "step": 78000 + }, + { + "epoch": 5.299972822394347, + "grad_norm": 0.38132089376449585, + "learning_rate": 3.377836662590026e-06, + "loss": 4.0755, + "step": 78005 + }, + { + "epoch": 5.300312542465009, + "grad_norm": 0.28141340613365173, + "learning_rate": 3.377412012501699e-06, + "loss": 3.7074, + "step": 78010 + }, + { + "epoch": 5.300652262535671, + "grad_norm": 0.3811405301094055, + "learning_rate": 3.3769873624133713e-06, + "loss": 3.9519, + "step": 78015 + }, + { + "epoch": 5.300991982606332, + "grad_norm": 0.365054726600647, + "learning_rate": 3.3765627123250445e-06, + "loss": 3.8114, + "step": 78020 + }, + { + "epoch": 5.301331702676994, + "grad_norm": 0.27210092544555664, + "learning_rate": 3.3761380622367173e-06, + "loss": 4.0023, + "step": 78025 + }, + { + "epoch": 5.301671422747656, + "grad_norm": 0.5832469463348389, + "learning_rate": 3.3757134121483897e-06, + "loss": 4.0368, + "step": 78030 + }, + { + "epoch": 5.302011142818317, + "grad_norm": 0.36024779081344604, + "learning_rate": 3.375288762060063e-06, + "loss": 4.0654, + "step": 78035 + }, + { + "epoch": 5.3023508628889795, + "grad_norm": 0.2994922995567322, + "learning_rate": 3.3748641119717357e-06, + "loss": 4.0694, + "step": 78040 + }, + { + "epoch": 5.302690582959642, + "grad_norm": 0.2679559588432312, + "learning_rate": 3.374439461883408e-06, + "loss": 3.8451, + "step": 78045 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 0.2956378757953644, + "learning_rate": 3.374014811795081e-06, + "loss": 3.8646, + "step": 78050 + }, + { + "epoch": 5.303370023100965, + "grad_norm": 0.3207266926765442, + "learning_rate": 3.373590161706754e-06, + "loss": 4.1512, + "step": 78055 + }, + { + "epoch": 5.303709743171627, + "grad_norm": 0.2569906711578369, + "learning_rate": 3.3731655116184265e-06, + "loss": 3.8704, + "step": 78060 + }, + { + "epoch": 5.304049463242288, + "grad_norm": 0.312902569770813, + "learning_rate": 3.3727408615300993e-06, + "loss": 4.1056, + "step": 78065 + }, + { + "epoch": 5.30438918331295, + "grad_norm": 0.3338075280189514, + "learning_rate": 3.3723162114417725e-06, + "loss": 3.9313, + "step": 78070 + }, + { + "epoch": 5.304728903383612, + "grad_norm": 0.38366231322288513, + "learning_rate": 3.3718915613534453e-06, + "loss": 3.8104, + "step": 78075 + }, + { + "epoch": 5.305068623454273, + "grad_norm": 0.2825753092765808, + "learning_rate": 3.3714669112651177e-06, + "loss": 3.9794, + "step": 78080 + }, + { + "epoch": 5.3054083435249355, + "grad_norm": 0.2655489146709442, + "learning_rate": 3.3710422611767905e-06, + "loss": 3.8733, + "step": 78085 + }, + { + "epoch": 5.305748063595598, + "grad_norm": 0.30702680349349976, + "learning_rate": 3.3706176110884637e-06, + "loss": 3.9935, + "step": 78090 + }, + { + "epoch": 5.306087783666259, + "grad_norm": 0.2719694674015045, + "learning_rate": 3.370192961000136e-06, + "loss": 3.8876, + "step": 78095 + }, + { + "epoch": 5.306427503736921, + "grad_norm": 0.3600277900695801, + "learning_rate": 3.369768310911809e-06, + "loss": 3.8021, + "step": 78100 + }, + { + "epoch": 5.306767223807583, + "grad_norm": 0.25612500309944153, + "learning_rate": 3.369343660823482e-06, + "loss": 4.0906, + "step": 78105 + }, + { + "epoch": 5.307106943878244, + "grad_norm": 0.3509160578250885, + "learning_rate": 3.3689190107351545e-06, + "loss": 4.3102, + "step": 78110 + }, + { + "epoch": 5.307446663948906, + "grad_norm": 0.27768474817276, + "learning_rate": 3.3684943606468273e-06, + "loss": 4.0031, + "step": 78115 + }, + { + "epoch": 5.307786384019568, + "grad_norm": 0.21730339527130127, + "learning_rate": 3.3680697105585e-06, + "loss": 3.8362, + "step": 78120 + }, + { + "epoch": 5.3081261040902294, + "grad_norm": 0.22254091501235962, + "learning_rate": 3.367645060470173e-06, + "loss": 3.9376, + "step": 78125 + }, + { + "epoch": 5.3084658241608915, + "grad_norm": 0.26836511492729187, + "learning_rate": 3.3672204103818457e-06, + "loss": 3.9432, + "step": 78130 + }, + { + "epoch": 5.308805544231554, + "grad_norm": 0.24995256960391998, + "learning_rate": 3.3667957602935185e-06, + "loss": 3.6895, + "step": 78135 + }, + { + "epoch": 5.309145264302215, + "grad_norm": 0.1970953792333603, + "learning_rate": 3.366371110205191e-06, + "loss": 3.8017, + "step": 78140 + }, + { + "epoch": 5.309484984372877, + "grad_norm": 0.2813323140144348, + "learning_rate": 3.365946460116864e-06, + "loss": 3.77, + "step": 78145 + }, + { + "epoch": 5.309824704443539, + "grad_norm": 0.3386870324611664, + "learning_rate": 3.365521810028537e-06, + "loss": 3.9582, + "step": 78150 + }, + { + "epoch": 5.3101644245142, + "grad_norm": 0.2966752052307129, + "learning_rate": 3.3650971599402093e-06, + "loss": 4.0124, + "step": 78155 + }, + { + "epoch": 5.310504144584862, + "grad_norm": 0.236747145652771, + "learning_rate": 3.3646725098518825e-06, + "loss": 3.9123, + "step": 78160 + }, + { + "epoch": 5.310843864655524, + "grad_norm": 0.2837985157966614, + "learning_rate": 3.3642478597635553e-06, + "loss": 4.2055, + "step": 78165 + }, + { + "epoch": 5.3111835847261855, + "grad_norm": 0.2155148983001709, + "learning_rate": 3.3638232096752277e-06, + "loss": 3.9256, + "step": 78170 + }, + { + "epoch": 5.3115233047968475, + "grad_norm": 0.31926605105400085, + "learning_rate": 3.3633985595869005e-06, + "loss": 4.0414, + "step": 78175 + }, + { + "epoch": 5.31186302486751, + "grad_norm": 0.3681778311729431, + "learning_rate": 3.3629739094985737e-06, + "loss": 4.2127, + "step": 78180 + }, + { + "epoch": 5.312202744938171, + "grad_norm": 0.2346734255552292, + "learning_rate": 3.362549259410246e-06, + "loss": 4.0843, + "step": 78185 + }, + { + "epoch": 5.312542465008833, + "grad_norm": 0.26822128891944885, + "learning_rate": 3.362124609321919e-06, + "loss": 3.9165, + "step": 78190 + }, + { + "epoch": 5.312882185079495, + "grad_norm": 0.2524586021900177, + "learning_rate": 3.361699959233592e-06, + "loss": 3.7493, + "step": 78195 + }, + { + "epoch": 5.313221905150156, + "grad_norm": 0.2987882196903229, + "learning_rate": 3.3612753091452645e-06, + "loss": 3.9791, + "step": 78200 + }, + { + "epoch": 5.313561625220818, + "grad_norm": 0.26960140466690063, + "learning_rate": 3.3608506590569373e-06, + "loss": 4.2773, + "step": 78205 + }, + { + "epoch": 5.31390134529148, + "grad_norm": 0.2823532819747925, + "learning_rate": 3.36042600896861e-06, + "loss": 4.0357, + "step": 78210 + }, + { + "epoch": 5.3142410653621415, + "grad_norm": 0.30143463611602783, + "learning_rate": 3.360001358880283e-06, + "loss": 4.0419, + "step": 78215 + }, + { + "epoch": 5.3145807854328035, + "grad_norm": 0.2159561663866043, + "learning_rate": 3.3595767087919557e-06, + "loss": 3.9882, + "step": 78220 + }, + { + "epoch": 5.314920505503465, + "grad_norm": 0.4325164556503296, + "learning_rate": 3.3591520587036285e-06, + "loss": 4.0738, + "step": 78225 + }, + { + "epoch": 5.315260225574127, + "grad_norm": 0.569169282913208, + "learning_rate": 3.358727408615301e-06, + "loss": 4.0787, + "step": 78230 + }, + { + "epoch": 5.315599945644789, + "grad_norm": 0.24401357769966125, + "learning_rate": 3.358302758526974e-06, + "loss": 3.8486, + "step": 78235 + }, + { + "epoch": 5.31593966571545, + "grad_norm": 0.2545844614505768, + "learning_rate": 3.357878108438647e-06, + "loss": 3.8866, + "step": 78240 + }, + { + "epoch": 5.316279385786112, + "grad_norm": 0.3297155201435089, + "learning_rate": 3.3574534583503197e-06, + "loss": 3.7853, + "step": 78245 + }, + { + "epoch": 5.316619105856774, + "grad_norm": 0.2747184634208679, + "learning_rate": 3.3570288082619925e-06, + "loss": 3.9262, + "step": 78250 + }, + { + "epoch": 5.316958825927435, + "grad_norm": 0.34674546122550964, + "learning_rate": 3.3566041581736653e-06, + "loss": 3.9937, + "step": 78255 + }, + { + "epoch": 5.3172985459980975, + "grad_norm": 0.22904403507709503, + "learning_rate": 3.356179508085338e-06, + "loss": 3.9016, + "step": 78260 + }, + { + "epoch": 5.31763826606876, + "grad_norm": 0.23453058302402496, + "learning_rate": 3.3557548579970105e-06, + "loss": 3.8755, + "step": 78265 + }, + { + "epoch": 5.317977986139421, + "grad_norm": 0.24112938344478607, + "learning_rate": 3.3553302079086837e-06, + "loss": 3.774, + "step": 78270 + }, + { + "epoch": 5.318317706210083, + "grad_norm": 0.2942092716693878, + "learning_rate": 3.3549055578203565e-06, + "loss": 3.7879, + "step": 78275 + }, + { + "epoch": 5.318657426280745, + "grad_norm": 0.22266489267349243, + "learning_rate": 3.354480907732029e-06, + "loss": 3.8764, + "step": 78280 + }, + { + "epoch": 5.318997146351406, + "grad_norm": 0.5219619870185852, + "learning_rate": 3.354056257643702e-06, + "loss": 4.0179, + "step": 78285 + }, + { + "epoch": 5.319336866422068, + "grad_norm": 0.20130670070648193, + "learning_rate": 3.353631607555375e-06, + "loss": 4.0321, + "step": 78290 + }, + { + "epoch": 5.31967658649273, + "grad_norm": 0.29542863368988037, + "learning_rate": 3.3532069574670473e-06, + "loss": 4.1569, + "step": 78295 + }, + { + "epoch": 5.320016306563391, + "grad_norm": 0.3012049198150635, + "learning_rate": 3.35278230737872e-06, + "loss": 3.7592, + "step": 78300 + }, + { + "epoch": 5.3203560266340535, + "grad_norm": 0.24687473475933075, + "learning_rate": 3.3523576572903933e-06, + "loss": 3.8486, + "step": 78305 + }, + { + "epoch": 5.320695746704716, + "grad_norm": 0.35629796981811523, + "learning_rate": 3.3519330072020657e-06, + "loss": 3.8727, + "step": 78310 + }, + { + "epoch": 5.321035466775377, + "grad_norm": 0.2803530693054199, + "learning_rate": 3.3515083571137385e-06, + "loss": 4.106, + "step": 78315 + }, + { + "epoch": 5.321375186846039, + "grad_norm": 0.2836490571498871, + "learning_rate": 3.3510837070254117e-06, + "loss": 3.7552, + "step": 78320 + }, + { + "epoch": 5.321714906916701, + "grad_norm": 0.24787205457687378, + "learning_rate": 3.350659056937084e-06, + "loss": 3.831, + "step": 78325 + }, + { + "epoch": 5.322054626987362, + "grad_norm": 0.23869864642620087, + "learning_rate": 3.350234406848757e-06, + "loss": 3.9828, + "step": 78330 + }, + { + "epoch": 5.322394347058024, + "grad_norm": 0.2518788278102875, + "learning_rate": 3.3498097567604297e-06, + "loss": 3.9151, + "step": 78335 + }, + { + "epoch": 5.322734067128686, + "grad_norm": 0.39618223905563354, + "learning_rate": 3.3493851066721025e-06, + "loss": 3.9346, + "step": 78340 + }, + { + "epoch": 5.323073787199347, + "grad_norm": 0.25280866026878357, + "learning_rate": 3.3489604565837753e-06, + "loss": 3.9821, + "step": 78345 + }, + { + "epoch": 5.3234135072700095, + "grad_norm": 0.23252595961093903, + "learning_rate": 3.348535806495448e-06, + "loss": 3.9793, + "step": 78350 + }, + { + "epoch": 5.323753227340672, + "grad_norm": 0.31565719842910767, + "learning_rate": 3.3481111564071204e-06, + "loss": 3.8779, + "step": 78355 + }, + { + "epoch": 5.324092947411333, + "grad_norm": 0.2608061134815216, + "learning_rate": 3.3476865063187937e-06, + "loss": 4.0202, + "step": 78360 + }, + { + "epoch": 5.324432667481995, + "grad_norm": 0.2988179624080658, + "learning_rate": 3.3472618562304665e-06, + "loss": 3.9138, + "step": 78365 + }, + { + "epoch": 5.324772387552657, + "grad_norm": 0.2374502718448639, + "learning_rate": 3.346837206142139e-06, + "loss": 3.9382, + "step": 78370 + }, + { + "epoch": 5.325112107623318, + "grad_norm": 0.24935311079025269, + "learning_rate": 3.346412556053812e-06, + "loss": 4.024, + "step": 78375 + }, + { + "epoch": 5.32545182769398, + "grad_norm": 0.25400102138519287, + "learning_rate": 3.345987905965485e-06, + "loss": 4.0458, + "step": 78380 + }, + { + "epoch": 5.325791547764642, + "grad_norm": 0.28459683060646057, + "learning_rate": 3.3455632558771573e-06, + "loss": 4.0486, + "step": 78385 + }, + { + "epoch": 5.326131267835303, + "grad_norm": 0.2602684497833252, + "learning_rate": 3.34513860578883e-06, + "loss": 4.0009, + "step": 78390 + }, + { + "epoch": 5.3264709879059655, + "grad_norm": 0.29617491364479065, + "learning_rate": 3.3447139557005033e-06, + "loss": 4.0974, + "step": 78395 + }, + { + "epoch": 5.326810707976628, + "grad_norm": 0.3383162021636963, + "learning_rate": 3.3442893056121757e-06, + "loss": 3.7767, + "step": 78400 + }, + { + "epoch": 5.327150428047289, + "grad_norm": 0.24850699305534363, + "learning_rate": 3.3438646555238485e-06, + "loss": 3.9098, + "step": 78405 + }, + { + "epoch": 5.327490148117951, + "grad_norm": 0.22862951457500458, + "learning_rate": 3.3434400054355217e-06, + "loss": 3.8349, + "step": 78410 + }, + { + "epoch": 5.327829868188613, + "grad_norm": 0.2669728100299835, + "learning_rate": 3.3430153553471945e-06, + "loss": 4.0994, + "step": 78415 + }, + { + "epoch": 5.328169588259274, + "grad_norm": 0.27976059913635254, + "learning_rate": 3.342590705258867e-06, + "loss": 3.724, + "step": 78420 + }, + { + "epoch": 5.328509308329936, + "grad_norm": 0.2949483096599579, + "learning_rate": 3.3421660551705397e-06, + "loss": 3.9252, + "step": 78425 + }, + { + "epoch": 5.328849028400598, + "grad_norm": 0.25142550468444824, + "learning_rate": 3.341741405082213e-06, + "loss": 3.9265, + "step": 78430 + }, + { + "epoch": 5.3291887484712595, + "grad_norm": 0.2601627707481384, + "learning_rate": 3.3413167549938853e-06, + "loss": 3.7262, + "step": 78435 + }, + { + "epoch": 5.3295284685419215, + "grad_norm": 0.3946787416934967, + "learning_rate": 3.340892104905558e-06, + "loss": 4.2595, + "step": 78440 + }, + { + "epoch": 5.329868188612584, + "grad_norm": 0.2516833543777466, + "learning_rate": 3.3404674548172313e-06, + "loss": 3.9228, + "step": 78445 + }, + { + "epoch": 5.330207908683245, + "grad_norm": 0.33286839723587036, + "learning_rate": 3.3400428047289037e-06, + "loss": 3.9952, + "step": 78450 + }, + { + "epoch": 5.330547628753907, + "grad_norm": 0.25394928455352783, + "learning_rate": 3.3396181546405765e-06, + "loss": 4.1201, + "step": 78455 + }, + { + "epoch": 5.330887348824569, + "grad_norm": 0.24383455514907837, + "learning_rate": 3.3391935045522493e-06, + "loss": 4.1628, + "step": 78460 + }, + { + "epoch": 5.33122706889523, + "grad_norm": 0.20850694179534912, + "learning_rate": 3.3387688544639216e-06, + "loss": 4.1249, + "step": 78465 + }, + { + "epoch": 5.331566788965892, + "grad_norm": 0.30318933725357056, + "learning_rate": 3.338344204375595e-06, + "loss": 3.7182, + "step": 78470 + }, + { + "epoch": 5.331906509036554, + "grad_norm": 0.3174259662628174, + "learning_rate": 3.3379195542872677e-06, + "loss": 3.8964, + "step": 78475 + }, + { + "epoch": 5.3322462291072155, + "grad_norm": 0.23561209440231323, + "learning_rate": 3.33749490419894e-06, + "loss": 4.1232, + "step": 78480 + }, + { + "epoch": 5.3325859491778775, + "grad_norm": 0.30688875913619995, + "learning_rate": 3.3370702541106133e-06, + "loss": 3.8725, + "step": 78485 + }, + { + "epoch": 5.33292566924854, + "grad_norm": 0.34225478768348694, + "learning_rate": 3.336645604022286e-06, + "loss": 3.9419, + "step": 78490 + }, + { + "epoch": 5.333265389319201, + "grad_norm": 0.24003811180591583, + "learning_rate": 3.3362209539339584e-06, + "loss": 3.826, + "step": 78495 + }, + { + "epoch": 5.333605109389863, + "grad_norm": 0.24351702630519867, + "learning_rate": 3.3357963038456317e-06, + "loss": 3.738, + "step": 78500 + }, + { + "epoch": 5.333944829460524, + "grad_norm": 0.24285875260829926, + "learning_rate": 3.3353716537573045e-06, + "loss": 3.9821, + "step": 78505 + }, + { + "epoch": 5.334284549531186, + "grad_norm": 0.27978843450546265, + "learning_rate": 3.334947003668977e-06, + "loss": 3.8039, + "step": 78510 + }, + { + "epoch": 5.334624269601848, + "grad_norm": 0.25576531887054443, + "learning_rate": 3.3345223535806497e-06, + "loss": 3.8047, + "step": 78515 + }, + { + "epoch": 5.334963989672509, + "grad_norm": 0.24590656161308289, + "learning_rate": 3.334097703492323e-06, + "loss": 4.0283, + "step": 78520 + }, + { + "epoch": 5.3353037097431715, + "grad_norm": 0.2058441936969757, + "learning_rate": 3.3336730534039953e-06, + "loss": 4.1995, + "step": 78525 + }, + { + "epoch": 5.3356434298138335, + "grad_norm": 0.2442341148853302, + "learning_rate": 3.333248403315668e-06, + "loss": 3.9995, + "step": 78530 + }, + { + "epoch": 5.335983149884495, + "grad_norm": 0.2012777179479599, + "learning_rate": 3.3328237532273413e-06, + "loss": 3.8612, + "step": 78535 + }, + { + "epoch": 5.336322869955157, + "grad_norm": 0.2541763484477997, + "learning_rate": 3.3323991031390137e-06, + "loss": 3.7329, + "step": 78540 + }, + { + "epoch": 5.336662590025819, + "grad_norm": 0.23732204735279083, + "learning_rate": 3.3319744530506865e-06, + "loss": 3.945, + "step": 78545 + }, + { + "epoch": 5.33700231009648, + "grad_norm": 0.29897522926330566, + "learning_rate": 3.3315498029623593e-06, + "loss": 3.8252, + "step": 78550 + }, + { + "epoch": 5.337342030167142, + "grad_norm": 0.24015580117702484, + "learning_rate": 3.3311251528740316e-06, + "loss": 3.8923, + "step": 78555 + }, + { + "epoch": 5.337681750237804, + "grad_norm": 0.2280910313129425, + "learning_rate": 3.330700502785705e-06, + "loss": 4.0722, + "step": 78560 + }, + { + "epoch": 5.338021470308465, + "grad_norm": 0.2886049151420593, + "learning_rate": 3.3302758526973777e-06, + "loss": 3.8634, + "step": 78565 + }, + { + "epoch": 5.3383611903791275, + "grad_norm": 0.32597458362579346, + "learning_rate": 3.32985120260905e-06, + "loss": 3.7745, + "step": 78570 + }, + { + "epoch": 5.33870091044979, + "grad_norm": 0.44148504734039307, + "learning_rate": 3.3294265525207233e-06, + "loss": 3.7078, + "step": 78575 + }, + { + "epoch": 5.339040630520451, + "grad_norm": 0.25371623039245605, + "learning_rate": 3.329001902432396e-06, + "loss": 3.9537, + "step": 78580 + }, + { + "epoch": 5.339380350591113, + "grad_norm": 0.4330389201641083, + "learning_rate": 3.328577252344069e-06, + "loss": 3.9531, + "step": 78585 + }, + { + "epoch": 5.339720070661775, + "grad_norm": 0.30584046244621277, + "learning_rate": 3.3281526022557412e-06, + "loss": 3.8125, + "step": 78590 + }, + { + "epoch": 5.340059790732436, + "grad_norm": 0.26651108264923096, + "learning_rate": 3.3277279521674145e-06, + "loss": 4.1606, + "step": 78595 + }, + { + "epoch": 5.340399510803098, + "grad_norm": 0.2859271168708801, + "learning_rate": 3.3273033020790873e-06, + "loss": 3.9793, + "step": 78600 + }, + { + "epoch": 5.34073923087376, + "grad_norm": 0.25531673431396484, + "learning_rate": 3.3268786519907596e-06, + "loss": 3.8934, + "step": 78605 + }, + { + "epoch": 5.341078950944421, + "grad_norm": 0.2833244502544403, + "learning_rate": 3.326454001902433e-06, + "loss": 4.0232, + "step": 78610 + }, + { + "epoch": 5.3414186710150835, + "grad_norm": 0.25523772835731506, + "learning_rate": 3.3260293518141057e-06, + "loss": 3.858, + "step": 78615 + }, + { + "epoch": 5.341758391085746, + "grad_norm": 0.2949567139148712, + "learning_rate": 3.325604701725778e-06, + "loss": 4.0024, + "step": 78620 + }, + { + "epoch": 5.342098111156407, + "grad_norm": 0.30889445543289185, + "learning_rate": 3.3251800516374513e-06, + "loss": 3.9082, + "step": 78625 + }, + { + "epoch": 5.342437831227069, + "grad_norm": 0.2826516628265381, + "learning_rate": 3.324755401549124e-06, + "loss": 4.1247, + "step": 78630 + }, + { + "epoch": 5.342777551297731, + "grad_norm": 0.26858046650886536, + "learning_rate": 3.3243307514607964e-06, + "loss": 3.9616, + "step": 78635 + }, + { + "epoch": 5.343117271368392, + "grad_norm": 0.21268463134765625, + "learning_rate": 3.3239061013724692e-06, + "loss": 3.8186, + "step": 78640 + }, + { + "epoch": 5.343456991439054, + "grad_norm": 0.3712489902973175, + "learning_rate": 3.3234814512841425e-06, + "loss": 3.9152, + "step": 78645 + }, + { + "epoch": 5.343796711509716, + "grad_norm": 0.2736063599586487, + "learning_rate": 3.323056801195815e-06, + "loss": 3.8445, + "step": 78650 + }, + { + "epoch": 5.344136431580377, + "grad_norm": 0.43706315755844116, + "learning_rate": 3.3226321511074876e-06, + "loss": 3.9498, + "step": 78655 + }, + { + "epoch": 5.3444761516510395, + "grad_norm": 0.260153591632843, + "learning_rate": 3.322207501019161e-06, + "loss": 4.2101, + "step": 78660 + }, + { + "epoch": 5.344815871721702, + "grad_norm": 0.3309001624584198, + "learning_rate": 3.3217828509308333e-06, + "loss": 4.0208, + "step": 78665 + }, + { + "epoch": 5.345155591792363, + "grad_norm": 0.3203592300415039, + "learning_rate": 3.321358200842506e-06, + "loss": 4.043, + "step": 78670 + }, + { + "epoch": 5.345495311863025, + "grad_norm": 0.2635324001312256, + "learning_rate": 3.320933550754179e-06, + "loss": 3.9535, + "step": 78675 + }, + { + "epoch": 5.345835031933687, + "grad_norm": 0.30144983530044556, + "learning_rate": 3.3205089006658512e-06, + "loss": 3.8984, + "step": 78680 + }, + { + "epoch": 5.346174752004348, + "grad_norm": 0.2538418173789978, + "learning_rate": 3.3200842505775245e-06, + "loss": 3.7684, + "step": 78685 + }, + { + "epoch": 5.34651447207501, + "grad_norm": 0.2023632973432541, + "learning_rate": 3.3196596004891973e-06, + "loss": 3.946, + "step": 78690 + }, + { + "epoch": 5.346854192145672, + "grad_norm": 0.25755366683006287, + "learning_rate": 3.3192349504008696e-06, + "loss": 3.9003, + "step": 78695 + }, + { + "epoch": 5.3471939122163334, + "grad_norm": 0.33342480659484863, + "learning_rate": 3.318810300312543e-06, + "loss": 4.075, + "step": 78700 + }, + { + "epoch": 5.3475336322869955, + "grad_norm": 0.2171667367219925, + "learning_rate": 3.3183856502242157e-06, + "loss": 4.0437, + "step": 78705 + }, + { + "epoch": 5.347873352357658, + "grad_norm": 0.28245240449905396, + "learning_rate": 3.317961000135888e-06, + "loss": 3.5923, + "step": 78710 + }, + { + "epoch": 5.348213072428319, + "grad_norm": 0.2875062823295593, + "learning_rate": 3.317536350047561e-06, + "loss": 3.7329, + "step": 78715 + }, + { + "epoch": 5.348552792498981, + "grad_norm": 0.2936350405216217, + "learning_rate": 3.317111699959234e-06, + "loss": 3.7679, + "step": 78720 + }, + { + "epoch": 5.348892512569643, + "grad_norm": 0.23683607578277588, + "learning_rate": 3.3166870498709064e-06, + "loss": 3.9818, + "step": 78725 + }, + { + "epoch": 5.349232232640304, + "grad_norm": 0.37423962354660034, + "learning_rate": 3.3162623997825792e-06, + "loss": 3.8878, + "step": 78730 + }, + { + "epoch": 5.349571952710966, + "grad_norm": 0.23152758181095123, + "learning_rate": 3.3158377496942525e-06, + "loss": 3.9499, + "step": 78735 + }, + { + "epoch": 5.349911672781628, + "grad_norm": 0.2294151484966278, + "learning_rate": 3.315413099605925e-06, + "loss": 3.8721, + "step": 78740 + }, + { + "epoch": 5.3502513928522895, + "grad_norm": 0.4043258726596832, + "learning_rate": 3.3149884495175976e-06, + "loss": 3.8507, + "step": 78745 + }, + { + "epoch": 5.3505911129229515, + "grad_norm": 0.27406343817710876, + "learning_rate": 3.3145637994292704e-06, + "loss": 4.041, + "step": 78750 + }, + { + "epoch": 5.350930832993614, + "grad_norm": 0.2240990251302719, + "learning_rate": 3.3141391493409437e-06, + "loss": 3.9414, + "step": 78755 + }, + { + "epoch": 5.351270553064275, + "grad_norm": 0.23265525698661804, + "learning_rate": 3.313714499252616e-06, + "loss": 3.9086, + "step": 78760 + }, + { + "epoch": 5.351610273134937, + "grad_norm": 0.292879581451416, + "learning_rate": 3.313289849164289e-06, + "loss": 4.234, + "step": 78765 + }, + { + "epoch": 5.351949993205599, + "grad_norm": 0.2727138102054596, + "learning_rate": 3.312865199075962e-06, + "loss": 3.8495, + "step": 78770 + }, + { + "epoch": 5.35228971327626, + "grad_norm": 0.2805769741535187, + "learning_rate": 3.3124405489876344e-06, + "loss": 3.8617, + "step": 78775 + }, + { + "epoch": 5.352629433346922, + "grad_norm": 0.2327158898115158, + "learning_rate": 3.3120158988993072e-06, + "loss": 4.0808, + "step": 78780 + }, + { + "epoch": 5.352969153417584, + "grad_norm": 0.3259778618812561, + "learning_rate": 3.3115912488109805e-06, + "loss": 3.7831, + "step": 78785 + }, + { + "epoch": 5.3533088734882455, + "grad_norm": 0.2324523776769638, + "learning_rate": 3.311166598722653e-06, + "loss": 3.8802, + "step": 78790 + }, + { + "epoch": 5.3536485935589075, + "grad_norm": 0.2680180072784424, + "learning_rate": 3.3107419486343256e-06, + "loss": 3.716, + "step": 78795 + }, + { + "epoch": 5.35398831362957, + "grad_norm": 0.2506570816040039, + "learning_rate": 3.3103172985459984e-06, + "loss": 3.5066, + "step": 78800 + }, + { + "epoch": 5.354328033700231, + "grad_norm": 0.36267945170402527, + "learning_rate": 3.309892648457671e-06, + "loss": 3.8508, + "step": 78805 + }, + { + "epoch": 5.354667753770893, + "grad_norm": 0.7147958278656006, + "learning_rate": 3.309467998369344e-06, + "loss": 3.9101, + "step": 78810 + }, + { + "epoch": 5.355007473841555, + "grad_norm": 0.254634290933609, + "learning_rate": 3.309043348281017e-06, + "loss": 3.7963, + "step": 78815 + }, + { + "epoch": 5.355347193912216, + "grad_norm": 0.2512383759021759, + "learning_rate": 3.3086186981926892e-06, + "loss": 4.043, + "step": 78820 + }, + { + "epoch": 5.355686913982878, + "grad_norm": 0.35204261541366577, + "learning_rate": 3.3081940481043625e-06, + "loss": 3.936, + "step": 78825 + }, + { + "epoch": 5.35602663405354, + "grad_norm": 0.23620568215847015, + "learning_rate": 3.3077693980160353e-06, + "loss": 3.6727, + "step": 78830 + }, + { + "epoch": 5.3563663541242015, + "grad_norm": 0.2661285996437073, + "learning_rate": 3.3073447479277076e-06, + "loss": 3.7834, + "step": 78835 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 0.2810073494911194, + "learning_rate": 3.3069200978393804e-06, + "loss": 4.0166, + "step": 78840 + }, + { + "epoch": 5.357045794265526, + "grad_norm": 0.31263378262519836, + "learning_rate": 3.3064954477510537e-06, + "loss": 4.0263, + "step": 78845 + }, + { + "epoch": 5.357385514336187, + "grad_norm": 0.23107071220874786, + "learning_rate": 3.306070797662726e-06, + "loss": 3.9004, + "step": 78850 + }, + { + "epoch": 5.357725234406849, + "grad_norm": 0.2966735064983368, + "learning_rate": 3.305646147574399e-06, + "loss": 3.9656, + "step": 78855 + }, + { + "epoch": 5.358064954477511, + "grad_norm": 0.28486737608909607, + "learning_rate": 3.305221497486072e-06, + "loss": 3.8757, + "step": 78860 + }, + { + "epoch": 5.358404674548172, + "grad_norm": 0.3780747652053833, + "learning_rate": 3.3047968473977444e-06, + "loss": 4.0381, + "step": 78865 + }, + { + "epoch": 5.358744394618834, + "grad_norm": 0.20650535821914673, + "learning_rate": 3.3043721973094172e-06, + "loss": 3.7287, + "step": 78870 + }, + { + "epoch": 5.359084114689496, + "grad_norm": 0.2640882730484009, + "learning_rate": 3.30394754722109e-06, + "loss": 3.9701, + "step": 78875 + }, + { + "epoch": 5.3594238347601575, + "grad_norm": 0.25656554102897644, + "learning_rate": 3.303522897132763e-06, + "loss": 3.9739, + "step": 78880 + }, + { + "epoch": 5.35976355483082, + "grad_norm": 0.22772935032844543, + "learning_rate": 3.3030982470444356e-06, + "loss": 3.9266, + "step": 78885 + }, + { + "epoch": 5.360103274901482, + "grad_norm": 0.6258191466331482, + "learning_rate": 3.3026735969561084e-06, + "loss": 4.1105, + "step": 78890 + }, + { + "epoch": 5.360442994972143, + "grad_norm": 0.23080652952194214, + "learning_rate": 3.302248946867781e-06, + "loss": 3.8617, + "step": 78895 + }, + { + "epoch": 5.360782715042805, + "grad_norm": 0.2575500011444092, + "learning_rate": 3.301824296779454e-06, + "loss": 3.6097, + "step": 78900 + }, + { + "epoch": 5.361122435113466, + "grad_norm": 0.3956640660762787, + "learning_rate": 3.301399646691127e-06, + "loss": 4.0102, + "step": 78905 + }, + { + "epoch": 5.361462155184128, + "grad_norm": 0.22392848134040833, + "learning_rate": 3.3009749966027992e-06, + "loss": 3.7752, + "step": 78910 + }, + { + "epoch": 5.36180187525479, + "grad_norm": 0.3427031636238098, + "learning_rate": 3.3005503465144724e-06, + "loss": 4.028, + "step": 78915 + }, + { + "epoch": 5.362141595325451, + "grad_norm": 0.3136894702911377, + "learning_rate": 3.3001256964261452e-06, + "loss": 3.9131, + "step": 78920 + }, + { + "epoch": 5.3624813153961135, + "grad_norm": 0.28427428007125854, + "learning_rate": 3.299701046337818e-06, + "loss": 3.9261, + "step": 78925 + }, + { + "epoch": 5.362821035466776, + "grad_norm": 0.23021388053894043, + "learning_rate": 3.2992763962494904e-06, + "loss": 3.729, + "step": 78930 + }, + { + "epoch": 5.363160755537437, + "grad_norm": 0.2690225839614868, + "learning_rate": 3.2988517461611636e-06, + "loss": 3.971, + "step": 78935 + }, + { + "epoch": 5.363500475608099, + "grad_norm": 0.22128510475158691, + "learning_rate": 3.2984270960728364e-06, + "loss": 3.8789, + "step": 78940 + }, + { + "epoch": 5.363840195678761, + "grad_norm": 0.31245413422584534, + "learning_rate": 3.298002445984509e-06, + "loss": 3.9534, + "step": 78945 + }, + { + "epoch": 5.364179915749422, + "grad_norm": 0.3766292929649353, + "learning_rate": 3.297577795896182e-06, + "loss": 4.125, + "step": 78950 + }, + { + "epoch": 5.364519635820084, + "grad_norm": 0.30250847339630127, + "learning_rate": 3.297153145807855e-06, + "loss": 4.1924, + "step": 78955 + }, + { + "epoch": 5.364859355890746, + "grad_norm": 0.2727147042751312, + "learning_rate": 3.2967284957195272e-06, + "loss": 3.8139, + "step": 78960 + }, + { + "epoch": 5.365199075961407, + "grad_norm": 0.30799728631973267, + "learning_rate": 3.2963038456312e-06, + "loss": 4.044, + "step": 78965 + }, + { + "epoch": 5.3655387960320695, + "grad_norm": 0.23467305302619934, + "learning_rate": 3.2958791955428733e-06, + "loss": 3.8755, + "step": 78970 + }, + { + "epoch": 5.365878516102732, + "grad_norm": 0.21296443045139313, + "learning_rate": 3.2954545454545456e-06, + "loss": 4.1457, + "step": 78975 + }, + { + "epoch": 5.366218236173393, + "grad_norm": 0.2264762818813324, + "learning_rate": 3.2950298953662184e-06, + "loss": 3.9758, + "step": 78980 + }, + { + "epoch": 5.366557956244055, + "grad_norm": 0.28750526905059814, + "learning_rate": 3.2946052452778917e-06, + "loss": 4.1485, + "step": 78985 + }, + { + "epoch": 5.366897676314717, + "grad_norm": 0.23745867609977722, + "learning_rate": 3.294180595189564e-06, + "loss": 3.7001, + "step": 78990 + }, + { + "epoch": 5.367237396385378, + "grad_norm": 0.23526792228221893, + "learning_rate": 3.293755945101237e-06, + "loss": 3.9182, + "step": 78995 + }, + { + "epoch": 5.36757711645604, + "grad_norm": 0.24491944909095764, + "learning_rate": 3.2933312950129096e-06, + "loss": 3.8453, + "step": 79000 + }, + { + "epoch": 5.367916836526702, + "grad_norm": 0.3144538402557373, + "learning_rate": 3.2929066449245824e-06, + "loss": 3.9297, + "step": 79005 + }, + { + "epoch": 5.3682565565973634, + "grad_norm": 0.35933318734169006, + "learning_rate": 3.2924819948362552e-06, + "loss": 4.157, + "step": 79010 + }, + { + "epoch": 5.3685962766680255, + "grad_norm": 0.3284541964530945, + "learning_rate": 3.292057344747928e-06, + "loss": 3.8157, + "step": 79015 + }, + { + "epoch": 5.368935996738688, + "grad_norm": 0.33675047755241394, + "learning_rate": 3.2916326946596004e-06, + "loss": 4.1026, + "step": 79020 + }, + { + "epoch": 5.369275716809349, + "grad_norm": 0.2476017028093338, + "learning_rate": 3.2912080445712736e-06, + "loss": 4.0258, + "step": 79025 + }, + { + "epoch": 5.369615436880011, + "grad_norm": 0.2651214003562927, + "learning_rate": 3.2907833944829464e-06, + "loss": 4.0982, + "step": 79030 + }, + { + "epoch": 5.369955156950673, + "grad_norm": 0.23034198582172394, + "learning_rate": 3.290358744394619e-06, + "loss": 3.9882, + "step": 79035 + }, + { + "epoch": 5.370294877021334, + "grad_norm": 0.27184784412384033, + "learning_rate": 3.289934094306292e-06, + "loss": 3.9819, + "step": 79040 + }, + { + "epoch": 5.370634597091996, + "grad_norm": 0.27923253178596497, + "learning_rate": 3.289509444217965e-06, + "loss": 3.7753, + "step": 79045 + }, + { + "epoch": 5.370974317162658, + "grad_norm": 0.33605554699897766, + "learning_rate": 3.2890847941296372e-06, + "loss": 3.7304, + "step": 79050 + }, + { + "epoch": 5.3713140372333195, + "grad_norm": 0.39809945225715637, + "learning_rate": 3.28866014404131e-06, + "loss": 4.0872, + "step": 79055 + }, + { + "epoch": 5.3716537573039815, + "grad_norm": 0.2384481579065323, + "learning_rate": 3.2882354939529832e-06, + "loss": 3.7966, + "step": 79060 + }, + { + "epoch": 5.371993477374644, + "grad_norm": 0.2766075134277344, + "learning_rate": 3.2878108438646556e-06, + "loss": 3.9645, + "step": 79065 + }, + { + "epoch": 5.372333197445305, + "grad_norm": 0.29449740052223206, + "learning_rate": 3.2873861937763284e-06, + "loss": 3.7153, + "step": 79070 + }, + { + "epoch": 5.372672917515967, + "grad_norm": 0.2016652226448059, + "learning_rate": 3.2869615436880016e-06, + "loss": 3.782, + "step": 79075 + }, + { + "epoch": 5.373012637586629, + "grad_norm": 0.2398022562265396, + "learning_rate": 3.286536893599674e-06, + "loss": 3.7241, + "step": 79080 + }, + { + "epoch": 5.37335235765729, + "grad_norm": 0.26317909359931946, + "learning_rate": 3.286112243511347e-06, + "loss": 3.8565, + "step": 79085 + }, + { + "epoch": 5.373692077727952, + "grad_norm": 0.270852267742157, + "learning_rate": 3.2856875934230196e-06, + "loss": 3.8663, + "step": 79090 + }, + { + "epoch": 5.374031797798614, + "grad_norm": 0.25606250762939453, + "learning_rate": 3.285262943334693e-06, + "loss": 3.7124, + "step": 79095 + }, + { + "epoch": 5.3743715178692755, + "grad_norm": 0.2047201544046402, + "learning_rate": 3.2848382932463652e-06, + "loss": 3.7714, + "step": 79100 + }, + { + "epoch": 5.3747112379399375, + "grad_norm": 0.2396644651889801, + "learning_rate": 3.284413643158038e-06, + "loss": 4.0035, + "step": 79105 + }, + { + "epoch": 5.3750509580106, + "grad_norm": 0.23783819377422333, + "learning_rate": 3.2839889930697112e-06, + "loss": 3.9266, + "step": 79110 + }, + { + "epoch": 5.375390678081261, + "grad_norm": 0.30091774463653564, + "learning_rate": 3.2835643429813836e-06, + "loss": 3.8059, + "step": 79115 + }, + { + "epoch": 5.375730398151923, + "grad_norm": 0.4563455581665039, + "learning_rate": 3.2831396928930564e-06, + "loss": 4.0983, + "step": 79120 + }, + { + "epoch": 5.376070118222585, + "grad_norm": 0.25564032793045044, + "learning_rate": 3.2827150428047292e-06, + "loss": 3.8275, + "step": 79125 + }, + { + "epoch": 5.376409838293246, + "grad_norm": 0.24988862872123718, + "learning_rate": 3.282290392716402e-06, + "loss": 3.8729, + "step": 79130 + }, + { + "epoch": 5.376749558363908, + "grad_norm": 0.27791357040405273, + "learning_rate": 3.281865742628075e-06, + "loss": 3.8487, + "step": 79135 + }, + { + "epoch": 5.37708927843457, + "grad_norm": 0.2984306514263153, + "learning_rate": 3.2814410925397476e-06, + "loss": 4.0774, + "step": 79140 + }, + { + "epoch": 5.3774289985052315, + "grad_norm": 0.3452715277671814, + "learning_rate": 3.28101644245142e-06, + "loss": 4.1877, + "step": 79145 + }, + { + "epoch": 5.377768718575894, + "grad_norm": 0.2467266321182251, + "learning_rate": 3.2805917923630932e-06, + "loss": 3.9719, + "step": 79150 + }, + { + "epoch": 5.378108438646556, + "grad_norm": 0.34375301003456116, + "learning_rate": 3.280167142274766e-06, + "loss": 4.0381, + "step": 79155 + }, + { + "epoch": 5.378448158717217, + "grad_norm": 0.26995423436164856, + "learning_rate": 3.2797424921864384e-06, + "loss": 3.8644, + "step": 79160 + }, + { + "epoch": 5.378787878787879, + "grad_norm": 0.234518364071846, + "learning_rate": 3.2793178420981116e-06, + "loss": 3.8322, + "step": 79165 + }, + { + "epoch": 5.379127598858541, + "grad_norm": 0.28608158230781555, + "learning_rate": 3.2788931920097844e-06, + "loss": 4.1692, + "step": 79170 + }, + { + "epoch": 5.379467318929202, + "grad_norm": 0.34748607873916626, + "learning_rate": 3.278468541921457e-06, + "loss": 3.8234, + "step": 79175 + }, + { + "epoch": 5.379807038999864, + "grad_norm": 0.29429563879966736, + "learning_rate": 3.2780438918331296e-06, + "loss": 4.1015, + "step": 79180 + }, + { + "epoch": 5.380146759070525, + "grad_norm": 0.2470337599515915, + "learning_rate": 3.277619241744803e-06, + "loss": 3.9298, + "step": 79185 + }, + { + "epoch": 5.3804864791411875, + "grad_norm": 0.2132025957107544, + "learning_rate": 3.2771945916564752e-06, + "loss": 3.8736, + "step": 79190 + }, + { + "epoch": 5.38082619921185, + "grad_norm": 0.22590628266334534, + "learning_rate": 3.276769941568148e-06, + "loss": 3.8692, + "step": 79195 + }, + { + "epoch": 5.381165919282511, + "grad_norm": 0.32650795578956604, + "learning_rate": 3.2763452914798212e-06, + "loss": 4.0667, + "step": 79200 + }, + { + "epoch": 5.381505639353173, + "grad_norm": 0.20435237884521484, + "learning_rate": 3.2759206413914936e-06, + "loss": 3.8901, + "step": 79205 + }, + { + "epoch": 5.381845359423835, + "grad_norm": 0.2705569863319397, + "learning_rate": 3.2754959913031664e-06, + "loss": 3.7089, + "step": 79210 + }, + { + "epoch": 5.382185079494496, + "grad_norm": 0.3166988790035248, + "learning_rate": 3.2750713412148392e-06, + "loss": 3.8144, + "step": 79215 + }, + { + "epoch": 5.382524799565158, + "grad_norm": 0.22340017557144165, + "learning_rate": 3.2746466911265116e-06, + "loss": 3.7651, + "step": 79220 + }, + { + "epoch": 5.38286451963582, + "grad_norm": 0.23842939734458923, + "learning_rate": 3.274222041038185e-06, + "loss": 3.8298, + "step": 79225 + }, + { + "epoch": 5.383204239706481, + "grad_norm": 0.23680947721004486, + "learning_rate": 3.2737973909498576e-06, + "loss": 4.1153, + "step": 79230 + }, + { + "epoch": 5.3835439597771435, + "grad_norm": 0.3003029525279999, + "learning_rate": 3.27337274086153e-06, + "loss": 4.0964, + "step": 79235 + }, + { + "epoch": 5.383883679847806, + "grad_norm": 0.22134001553058624, + "learning_rate": 3.2729480907732032e-06, + "loss": 3.9114, + "step": 79240 + }, + { + "epoch": 5.384223399918467, + "grad_norm": 0.3147158920764923, + "learning_rate": 3.272523440684876e-06, + "loss": 3.8559, + "step": 79245 + }, + { + "epoch": 5.384563119989129, + "grad_norm": 0.2375926971435547, + "learning_rate": 3.2720987905965484e-06, + "loss": 3.7509, + "step": 79250 + }, + { + "epoch": 5.384902840059791, + "grad_norm": 0.3605116903781891, + "learning_rate": 3.271674140508221e-06, + "loss": 3.8285, + "step": 79255 + }, + { + "epoch": 5.385242560130452, + "grad_norm": 0.2523520588874817, + "learning_rate": 3.2712494904198944e-06, + "loss": 3.9287, + "step": 79260 + }, + { + "epoch": 5.385582280201114, + "grad_norm": 0.30250266194343567, + "learning_rate": 3.2708248403315672e-06, + "loss": 3.9807, + "step": 79265 + }, + { + "epoch": 5.385922000271776, + "grad_norm": 0.24527594447135925, + "learning_rate": 3.2704001902432396e-06, + "loss": 3.7749, + "step": 79270 + }, + { + "epoch": 5.386261720342437, + "grad_norm": 0.2685009837150574, + "learning_rate": 3.269975540154913e-06, + "loss": 3.9227, + "step": 79275 + }, + { + "epoch": 5.3866014404130995, + "grad_norm": 0.2725149095058441, + "learning_rate": 3.2695508900665856e-06, + "loss": 4.0123, + "step": 79280 + }, + { + "epoch": 5.386941160483762, + "grad_norm": 0.39765384793281555, + "learning_rate": 3.269126239978258e-06, + "loss": 3.7114, + "step": 79285 + }, + { + "epoch": 5.387280880554423, + "grad_norm": 0.219468355178833, + "learning_rate": 3.2687015898899312e-06, + "loss": 3.8935, + "step": 79290 + }, + { + "epoch": 5.387620600625085, + "grad_norm": 0.2323777973651886, + "learning_rate": 3.268276939801604e-06, + "loss": 3.8587, + "step": 79295 + }, + { + "epoch": 5.387960320695747, + "grad_norm": 0.19639649987220764, + "learning_rate": 3.2678522897132764e-06, + "loss": 3.7377, + "step": 79300 + }, + { + "epoch": 5.388300040766408, + "grad_norm": 0.28016504645347595, + "learning_rate": 3.267427639624949e-06, + "loss": 3.7787, + "step": 79305 + }, + { + "epoch": 5.38863976083707, + "grad_norm": 0.24821753799915314, + "learning_rate": 3.2670029895366224e-06, + "loss": 3.7658, + "step": 79310 + }, + { + "epoch": 5.388979480907732, + "grad_norm": 0.23713932931423187, + "learning_rate": 3.266578339448295e-06, + "loss": 3.9013, + "step": 79315 + }, + { + "epoch": 5.3893192009783935, + "grad_norm": 0.23650313913822174, + "learning_rate": 3.2661536893599676e-06, + "loss": 3.9926, + "step": 79320 + }, + { + "epoch": 5.3896589210490555, + "grad_norm": 0.26880714297294617, + "learning_rate": 3.265729039271641e-06, + "loss": 3.9908, + "step": 79325 + }, + { + "epoch": 5.389998641119718, + "grad_norm": 0.2994160056114197, + "learning_rate": 3.2653043891833132e-06, + "loss": 4.0415, + "step": 79330 + }, + { + "epoch": 5.390338361190379, + "grad_norm": 0.23123230040073395, + "learning_rate": 3.264879739094986e-06, + "loss": 3.9446, + "step": 79335 + }, + { + "epoch": 5.390678081261041, + "grad_norm": 0.21978522837162018, + "learning_rate": 3.264455089006659e-06, + "loss": 3.9292, + "step": 79340 + }, + { + "epoch": 5.391017801331703, + "grad_norm": 0.3255656361579895, + "learning_rate": 3.264030438918331e-06, + "loss": 3.8678, + "step": 79345 + }, + { + "epoch": 5.391357521402364, + "grad_norm": 0.4190513491630554, + "learning_rate": 3.2636057888300044e-06, + "loss": 4.1229, + "step": 79350 + }, + { + "epoch": 5.391697241473026, + "grad_norm": 0.26164495944976807, + "learning_rate": 3.2631811387416772e-06, + "loss": 3.8344, + "step": 79355 + }, + { + "epoch": 5.392036961543688, + "grad_norm": 0.29226961731910706, + "learning_rate": 3.2627564886533496e-06, + "loss": 3.7412, + "step": 79360 + }, + { + "epoch": 5.3923766816143495, + "grad_norm": 0.2848049998283386, + "learning_rate": 3.262331838565023e-06, + "loss": 3.8054, + "step": 79365 + }, + { + "epoch": 5.3927164016850115, + "grad_norm": 0.2596195340156555, + "learning_rate": 3.2619071884766956e-06, + "loss": 3.6165, + "step": 79370 + }, + { + "epoch": 5.393056121755674, + "grad_norm": 0.30092400312423706, + "learning_rate": 3.261482538388368e-06, + "loss": 3.8583, + "step": 79375 + }, + { + "epoch": 5.393395841826335, + "grad_norm": 0.24183060228824615, + "learning_rate": 3.261057888300041e-06, + "loss": 3.6337, + "step": 79380 + }, + { + "epoch": 5.393735561896997, + "grad_norm": 0.26456335186958313, + "learning_rate": 3.260633238211714e-06, + "loss": 3.8604, + "step": 79385 + }, + { + "epoch": 5.394075281967659, + "grad_norm": 0.2816956043243408, + "learning_rate": 3.2602085881233864e-06, + "loss": 3.9627, + "step": 79390 + }, + { + "epoch": 5.39441500203832, + "grad_norm": 0.2509796917438507, + "learning_rate": 3.259783938035059e-06, + "loss": 3.986, + "step": 79395 + }, + { + "epoch": 5.394754722108982, + "grad_norm": 0.26769736409187317, + "learning_rate": 3.2593592879467324e-06, + "loss": 3.9317, + "step": 79400 + }, + { + "epoch": 5.395094442179644, + "grad_norm": 0.252792626619339, + "learning_rate": 3.258934637858405e-06, + "loss": 3.7477, + "step": 79405 + }, + { + "epoch": 5.3954341622503055, + "grad_norm": 0.2954219579696655, + "learning_rate": 3.2585099877700776e-06, + "loss": 4.1258, + "step": 79410 + }, + { + "epoch": 5.3957738823209676, + "grad_norm": 0.2814301550388336, + "learning_rate": 3.258085337681751e-06, + "loss": 4.007, + "step": 79415 + }, + { + "epoch": 5.39611360239163, + "grad_norm": 0.20839418470859528, + "learning_rate": 3.257660687593423e-06, + "loss": 3.8819, + "step": 79420 + }, + { + "epoch": 5.396453322462291, + "grad_norm": 0.22834643721580505, + "learning_rate": 3.257236037505096e-06, + "loss": 3.7873, + "step": 79425 + }, + { + "epoch": 5.396793042532953, + "grad_norm": 0.22003251314163208, + "learning_rate": 3.256811387416769e-06, + "loss": 3.8923, + "step": 79430 + }, + { + "epoch": 5.397132762603615, + "grad_norm": 0.2774987816810608, + "learning_rate": 3.256386737328442e-06, + "loss": 3.77, + "step": 79435 + }, + { + "epoch": 5.397472482674276, + "grad_norm": 0.23797276616096497, + "learning_rate": 3.2559620872401144e-06, + "loss": 3.9903, + "step": 79440 + }, + { + "epoch": 5.397812202744938, + "grad_norm": 0.20803052186965942, + "learning_rate": 3.255537437151787e-06, + "loss": 3.9465, + "step": 79445 + }, + { + "epoch": 5.3981519228156, + "grad_norm": 0.4214010536670685, + "learning_rate": 3.2551127870634604e-06, + "loss": 3.8007, + "step": 79450 + }, + { + "epoch": 5.3984916428862615, + "grad_norm": 0.3024577796459198, + "learning_rate": 3.254688136975133e-06, + "loss": 3.7218, + "step": 79455 + }, + { + "epoch": 5.398831362956924, + "grad_norm": 0.21571983397006989, + "learning_rate": 3.2542634868868056e-06, + "loss": 3.9774, + "step": 79460 + }, + { + "epoch": 5.399171083027586, + "grad_norm": 0.3033277988433838, + "learning_rate": 3.2538388367984784e-06, + "loss": 3.8601, + "step": 79465 + }, + { + "epoch": 5.399510803098247, + "grad_norm": 0.2283848226070404, + "learning_rate": 3.2534141867101508e-06, + "loss": 3.8862, + "step": 79470 + }, + { + "epoch": 5.399850523168909, + "grad_norm": 0.25184187293052673, + "learning_rate": 3.252989536621824e-06, + "loss": 3.9456, + "step": 79475 + }, + { + "epoch": 5.400190243239571, + "grad_norm": 0.2090906798839569, + "learning_rate": 3.252564886533497e-06, + "loss": 3.936, + "step": 79480 + }, + { + "epoch": 5.400529963310232, + "grad_norm": 0.44880211353302, + "learning_rate": 3.252140236445169e-06, + "loss": 3.8601, + "step": 79485 + }, + { + "epoch": 5.400869683380894, + "grad_norm": 0.27633947134017944, + "learning_rate": 3.2517155863568424e-06, + "loss": 3.8759, + "step": 79490 + }, + { + "epoch": 5.401209403451556, + "grad_norm": 0.28376707434654236, + "learning_rate": 3.2512909362685152e-06, + "loss": 3.9235, + "step": 79495 + }, + { + "epoch": 5.4015491235222175, + "grad_norm": 0.30260714888572693, + "learning_rate": 3.2508662861801876e-06, + "loss": 4.0544, + "step": 79500 + }, + { + "epoch": 5.40188884359288, + "grad_norm": 0.3778167963027954, + "learning_rate": 3.2504416360918604e-06, + "loss": 3.7349, + "step": 79505 + }, + { + "epoch": 5.402228563663542, + "grad_norm": 0.2108532041311264, + "learning_rate": 3.2500169860035336e-06, + "loss": 3.9064, + "step": 79510 + }, + { + "epoch": 5.402568283734203, + "grad_norm": 0.34144195914268494, + "learning_rate": 3.249592335915206e-06, + "loss": 3.9242, + "step": 79515 + }, + { + "epoch": 5.402908003804865, + "grad_norm": 0.26488542556762695, + "learning_rate": 3.249167685826879e-06, + "loss": 3.9891, + "step": 79520 + }, + { + "epoch": 5.403247723875527, + "grad_norm": 0.3264378607273102, + "learning_rate": 3.248743035738552e-06, + "loss": 3.9771, + "step": 79525 + }, + { + "epoch": 5.403587443946188, + "grad_norm": 0.28582561016082764, + "learning_rate": 3.2483183856502244e-06, + "loss": 3.9632, + "step": 79530 + }, + { + "epoch": 5.40392716401685, + "grad_norm": 0.3017978370189667, + "learning_rate": 3.247893735561897e-06, + "loss": 3.6618, + "step": 79535 + }, + { + "epoch": 5.404266884087512, + "grad_norm": 0.3630630373954773, + "learning_rate": 3.24746908547357e-06, + "loss": 4.0009, + "step": 79540 + }, + { + "epoch": 5.4046066041581735, + "grad_norm": 0.2095748782157898, + "learning_rate": 3.247044435385243e-06, + "loss": 3.8494, + "step": 79545 + }, + { + "epoch": 5.404946324228836, + "grad_norm": 0.2440824806690216, + "learning_rate": 3.2466197852969156e-06, + "loss": 3.991, + "step": 79550 + }, + { + "epoch": 5.405286044299498, + "grad_norm": 0.3072091042995453, + "learning_rate": 3.2461951352085884e-06, + "loss": 4.0616, + "step": 79555 + }, + { + "epoch": 5.405625764370159, + "grad_norm": 0.350519597530365, + "learning_rate": 3.2457704851202608e-06, + "loss": 4.2544, + "step": 79560 + }, + { + "epoch": 5.405965484440821, + "grad_norm": 0.2737938165664673, + "learning_rate": 3.245345835031934e-06, + "loss": 3.7801, + "step": 79565 + }, + { + "epoch": 5.406305204511483, + "grad_norm": 0.23711510002613068, + "learning_rate": 3.244921184943607e-06, + "loss": 4.0165, + "step": 79570 + }, + { + "epoch": 5.406644924582144, + "grad_norm": 0.3909163475036621, + "learning_rate": 3.244496534855279e-06, + "loss": 4.027, + "step": 79575 + }, + { + "epoch": 5.406984644652806, + "grad_norm": 0.33035027980804443, + "learning_rate": 3.2440718847669524e-06, + "loss": 3.9882, + "step": 79580 + }, + { + "epoch": 5.4073243647234674, + "grad_norm": 0.26239439845085144, + "learning_rate": 3.243647234678625e-06, + "loss": 3.7542, + "step": 79585 + }, + { + "epoch": 5.4076640847941295, + "grad_norm": 0.24442386627197266, + "learning_rate": 3.2432225845902976e-06, + "loss": 3.8139, + "step": 79590 + }, + { + "epoch": 5.408003804864792, + "grad_norm": 0.2509424686431885, + "learning_rate": 3.2427979345019704e-06, + "loss": 3.6419, + "step": 79595 + }, + { + "epoch": 5.408343524935453, + "grad_norm": 0.206381693482399, + "learning_rate": 3.2423732844136436e-06, + "loss": 4.1078, + "step": 79600 + }, + { + "epoch": 5.408683245006115, + "grad_norm": 0.24178457260131836, + "learning_rate": 3.2419486343253164e-06, + "loss": 3.6677, + "step": 79605 + }, + { + "epoch": 5.409022965076777, + "grad_norm": 0.26478976011276245, + "learning_rate": 3.2415239842369888e-06, + "loss": 3.9185, + "step": 79610 + }, + { + "epoch": 5.409362685147438, + "grad_norm": 0.2339533120393753, + "learning_rate": 3.241099334148662e-06, + "loss": 3.7134, + "step": 79615 + }, + { + "epoch": 5.4097024052181, + "grad_norm": 0.271583616733551, + "learning_rate": 3.240674684060335e-06, + "loss": 3.6573, + "step": 79620 + }, + { + "epoch": 5.410042125288762, + "grad_norm": 0.22767439484596252, + "learning_rate": 3.240250033972007e-06, + "loss": 3.9832, + "step": 79625 + }, + { + "epoch": 5.4103818453594235, + "grad_norm": 0.2838301658630371, + "learning_rate": 3.23982538388368e-06, + "loss": 3.8269, + "step": 79630 + }, + { + "epoch": 5.4107215654300855, + "grad_norm": 0.2797728180885315, + "learning_rate": 3.2394007337953532e-06, + "loss": 4.0043, + "step": 79635 + }, + { + "epoch": 5.411061285500748, + "grad_norm": 0.30400896072387695, + "learning_rate": 3.2389760837070256e-06, + "loss": 3.9468, + "step": 79640 + }, + { + "epoch": 5.411401005571409, + "grad_norm": 0.3464082181453705, + "learning_rate": 3.2385514336186984e-06, + "loss": 3.8653, + "step": 79645 + }, + { + "epoch": 5.411740725642071, + "grad_norm": 0.27249279618263245, + "learning_rate": 3.2381267835303716e-06, + "loss": 3.7495, + "step": 79650 + }, + { + "epoch": 5.412080445712733, + "grad_norm": 0.31315138936042786, + "learning_rate": 3.237702133442044e-06, + "loss": 3.9394, + "step": 79655 + }, + { + "epoch": 5.412420165783394, + "grad_norm": 0.25632819533348083, + "learning_rate": 3.237277483353717e-06, + "loss": 3.7237, + "step": 79660 + }, + { + "epoch": 5.412759885854056, + "grad_norm": 0.2595892548561096, + "learning_rate": 3.2368528332653896e-06, + "loss": 4.0806, + "step": 79665 + }, + { + "epoch": 5.413099605924718, + "grad_norm": 0.3775876462459564, + "learning_rate": 3.2364281831770624e-06, + "loss": 3.9459, + "step": 79670 + }, + { + "epoch": 5.4134393259953795, + "grad_norm": 0.23181511461734772, + "learning_rate": 3.236003533088735e-06, + "loss": 3.9108, + "step": 79675 + }, + { + "epoch": 5.4137790460660415, + "grad_norm": 0.22614382207393646, + "learning_rate": 3.235578883000408e-06, + "loss": 3.8336, + "step": 79680 + }, + { + "epoch": 5.414118766136704, + "grad_norm": 0.26757389307022095, + "learning_rate": 3.2351542329120804e-06, + "loss": 3.7715, + "step": 79685 + }, + { + "epoch": 5.414458486207365, + "grad_norm": 0.26370009779930115, + "learning_rate": 3.2347295828237536e-06, + "loss": 3.9442, + "step": 79690 + }, + { + "epoch": 5.414798206278027, + "grad_norm": 0.23953525722026825, + "learning_rate": 3.2343049327354264e-06, + "loss": 3.9955, + "step": 79695 + }, + { + "epoch": 5.415137926348689, + "grad_norm": 0.30581843852996826, + "learning_rate": 3.2338802826470988e-06, + "loss": 3.9041, + "step": 79700 + }, + { + "epoch": 5.41547764641935, + "grad_norm": 0.18152911961078644, + "learning_rate": 3.233455632558772e-06, + "loss": 4.0662, + "step": 79705 + }, + { + "epoch": 5.415817366490012, + "grad_norm": 0.2764197587966919, + "learning_rate": 3.233030982470445e-06, + "loss": 3.9277, + "step": 79710 + }, + { + "epoch": 5.416157086560674, + "grad_norm": 0.2502163350582123, + "learning_rate": 3.232606332382117e-06, + "loss": 3.8329, + "step": 79715 + }, + { + "epoch": 5.4164968066313355, + "grad_norm": 0.2277388721704483, + "learning_rate": 3.23218168229379e-06, + "loss": 3.8438, + "step": 79720 + }, + { + "epoch": 5.416836526701998, + "grad_norm": 0.25226277112960815, + "learning_rate": 3.231757032205463e-06, + "loss": 3.9087, + "step": 79725 + }, + { + "epoch": 5.41717624677266, + "grad_norm": 0.2419167309999466, + "learning_rate": 3.2313323821171356e-06, + "loss": 4.0145, + "step": 79730 + }, + { + "epoch": 5.417515966843321, + "grad_norm": 0.34918129444122314, + "learning_rate": 3.2309077320288084e-06, + "loss": 4.0078, + "step": 79735 + }, + { + "epoch": 5.417855686913983, + "grad_norm": 0.25195446610450745, + "learning_rate": 3.2304830819404816e-06, + "loss": 3.8946, + "step": 79740 + }, + { + "epoch": 5.418195406984645, + "grad_norm": 0.36307379603385925, + "learning_rate": 3.230058431852154e-06, + "loss": 3.8467, + "step": 79745 + }, + { + "epoch": 5.418535127055306, + "grad_norm": 0.2790764570236206, + "learning_rate": 3.2296337817638268e-06, + "loss": 4.1155, + "step": 79750 + }, + { + "epoch": 5.418874847125968, + "grad_norm": 0.32747477293014526, + "learning_rate": 3.2292091316754996e-06, + "loss": 4.1777, + "step": 79755 + }, + { + "epoch": 5.41921456719663, + "grad_norm": 0.4877395033836365, + "learning_rate": 3.228784481587172e-06, + "loss": 3.8494, + "step": 79760 + }, + { + "epoch": 5.4195542872672915, + "grad_norm": 0.2971944808959961, + "learning_rate": 3.228359831498845e-06, + "loss": 3.8152, + "step": 79765 + }, + { + "epoch": 5.419894007337954, + "grad_norm": 0.26239070296287537, + "learning_rate": 3.227935181410518e-06, + "loss": 4.0581, + "step": 79770 + }, + { + "epoch": 5.420233727408616, + "grad_norm": 0.23931513726711273, + "learning_rate": 3.227510531322191e-06, + "loss": 3.9974, + "step": 79775 + }, + { + "epoch": 5.420573447479277, + "grad_norm": 0.2343151867389679, + "learning_rate": 3.2270858812338636e-06, + "loss": 4.1103, + "step": 79780 + }, + { + "epoch": 5.420913167549939, + "grad_norm": 0.24334844946861267, + "learning_rate": 3.2266612311455364e-06, + "loss": 3.9025, + "step": 79785 + }, + { + "epoch": 5.421252887620601, + "grad_norm": 0.35514673590660095, + "learning_rate": 3.226236581057209e-06, + "loss": 3.86, + "step": 79790 + }, + { + "epoch": 5.421592607691262, + "grad_norm": 0.2541947662830353, + "learning_rate": 3.225811930968882e-06, + "loss": 3.7583, + "step": 79795 + }, + { + "epoch": 5.421932327761924, + "grad_norm": 0.24625274538993835, + "learning_rate": 3.225387280880555e-06, + "loss": 3.8843, + "step": 79800 + }, + { + "epoch": 5.422272047832586, + "grad_norm": 0.20880722999572754, + "learning_rate": 3.2249626307922276e-06, + "loss": 3.5938, + "step": 79805 + }, + { + "epoch": 5.4226117679032475, + "grad_norm": 0.31484848260879517, + "learning_rate": 3.2245379807039e-06, + "loss": 3.8698, + "step": 79810 + }, + { + "epoch": 5.42295148797391, + "grad_norm": 0.4375951886177063, + "learning_rate": 3.224113330615573e-06, + "loss": 4.2449, + "step": 79815 + }, + { + "epoch": 5.423291208044572, + "grad_norm": 0.2413213700056076, + "learning_rate": 3.223688680527246e-06, + "loss": 4.0008, + "step": 79820 + }, + { + "epoch": 5.423630928115233, + "grad_norm": 0.3017491102218628, + "learning_rate": 3.2232640304389184e-06, + "loss": 4.0909, + "step": 79825 + }, + { + "epoch": 5.423970648185895, + "grad_norm": 0.28386804461479187, + "learning_rate": 3.2228393803505916e-06, + "loss": 4.0435, + "step": 79830 + }, + { + "epoch": 5.424310368256557, + "grad_norm": 0.2406235784292221, + "learning_rate": 3.2224147302622644e-06, + "loss": 4.0615, + "step": 79835 + }, + { + "epoch": 5.424650088327218, + "grad_norm": 0.24961163103580475, + "learning_rate": 3.2219900801739368e-06, + "loss": 3.8334, + "step": 79840 + }, + { + "epoch": 5.42498980839788, + "grad_norm": 0.19513480365276337, + "learning_rate": 3.2215654300856096e-06, + "loss": 3.945, + "step": 79845 + }, + { + "epoch": 5.425329528468542, + "grad_norm": 0.20234541594982147, + "learning_rate": 3.221140779997283e-06, + "loss": 3.8459, + "step": 79850 + }, + { + "epoch": 5.4256692485392035, + "grad_norm": 0.32091444730758667, + "learning_rate": 3.220716129908955e-06, + "loss": 4.093, + "step": 79855 + }, + { + "epoch": 5.426008968609866, + "grad_norm": 0.33836838603019714, + "learning_rate": 3.220291479820628e-06, + "loss": 3.9122, + "step": 79860 + }, + { + "epoch": 5.426348688680527, + "grad_norm": 0.2709573805332184, + "learning_rate": 3.219866829732301e-06, + "loss": 4.0041, + "step": 79865 + }, + { + "epoch": 5.426688408751189, + "grad_norm": 0.25747933983802795, + "learning_rate": 3.2194421796439736e-06, + "loss": 3.6697, + "step": 79870 + }, + { + "epoch": 5.427028128821851, + "grad_norm": 0.37840402126312256, + "learning_rate": 3.2190175295556464e-06, + "loss": 3.9249, + "step": 79875 + }, + { + "epoch": 5.427367848892512, + "grad_norm": 0.1923321932554245, + "learning_rate": 3.218592879467319e-06, + "loss": 3.7558, + "step": 79880 + }, + { + "epoch": 5.427707568963174, + "grad_norm": 0.20881064236164093, + "learning_rate": 3.2181682293789916e-06, + "loss": 3.6677, + "step": 79885 + }, + { + "epoch": 5.428047289033836, + "grad_norm": 0.3185306191444397, + "learning_rate": 3.2177435792906648e-06, + "loss": 4.0526, + "step": 79890 + }, + { + "epoch": 5.4283870091044975, + "grad_norm": 0.2227005511522293, + "learning_rate": 3.2173189292023376e-06, + "loss": 4.0058, + "step": 79895 + }, + { + "epoch": 5.4287267291751595, + "grad_norm": 0.31644299626350403, + "learning_rate": 3.21689427911401e-06, + "loss": 3.974, + "step": 79900 + }, + { + "epoch": 5.429066449245822, + "grad_norm": 0.23455597460269928, + "learning_rate": 3.216469629025683e-06, + "loss": 3.8567, + "step": 79905 + }, + { + "epoch": 5.429406169316483, + "grad_norm": 0.28562435507774353, + "learning_rate": 3.216044978937356e-06, + "loss": 4.1301, + "step": 79910 + }, + { + "epoch": 5.429745889387145, + "grad_norm": 0.2669903635978699, + "learning_rate": 3.2156203288490284e-06, + "loss": 3.8805, + "step": 79915 + }, + { + "epoch": 5.430085609457807, + "grad_norm": 0.43198636174201965, + "learning_rate": 3.2151956787607016e-06, + "loss": 3.6127, + "step": 79920 + }, + { + "epoch": 5.430425329528468, + "grad_norm": 0.22437474131584167, + "learning_rate": 3.2147710286723744e-06, + "loss": 3.8639, + "step": 79925 + }, + { + "epoch": 5.43076504959913, + "grad_norm": 0.2927112281322479, + "learning_rate": 3.2143463785840468e-06, + "loss": 3.9022, + "step": 79930 + }, + { + "epoch": 5.431104769669792, + "grad_norm": 0.2820518910884857, + "learning_rate": 3.2139217284957196e-06, + "loss": 3.5761, + "step": 79935 + }, + { + "epoch": 5.4314444897404535, + "grad_norm": 0.22036640346050262, + "learning_rate": 3.213497078407393e-06, + "loss": 3.8634, + "step": 79940 + }, + { + "epoch": 5.4317842098111155, + "grad_norm": 0.40630120038986206, + "learning_rate": 3.2130724283190656e-06, + "loss": 3.8054, + "step": 79945 + }, + { + "epoch": 5.432123929881778, + "grad_norm": 0.19663108885288239, + "learning_rate": 3.212647778230738e-06, + "loss": 3.7761, + "step": 79950 + }, + { + "epoch": 5.432463649952439, + "grad_norm": 0.30280694365501404, + "learning_rate": 3.212223128142411e-06, + "loss": 3.9433, + "step": 79955 + }, + { + "epoch": 5.432803370023101, + "grad_norm": 0.3243177533149719, + "learning_rate": 3.211798478054084e-06, + "loss": 3.9439, + "step": 79960 + }, + { + "epoch": 5.433143090093763, + "grad_norm": 0.3038561940193176, + "learning_rate": 3.2113738279657564e-06, + "loss": 3.8496, + "step": 79965 + }, + { + "epoch": 5.433482810164424, + "grad_norm": 0.25069090723991394, + "learning_rate": 3.210949177877429e-06, + "loss": 3.8416, + "step": 79970 + }, + { + "epoch": 5.433822530235086, + "grad_norm": 0.24047663807868958, + "learning_rate": 3.2105245277891024e-06, + "loss": 4.0476, + "step": 79975 + }, + { + "epoch": 5.434162250305748, + "grad_norm": 0.32295045256614685, + "learning_rate": 3.2100998777007748e-06, + "loss": 3.7475, + "step": 79980 + }, + { + "epoch": 5.4345019703764095, + "grad_norm": 0.2577342987060547, + "learning_rate": 3.2096752276124476e-06, + "loss": 3.9133, + "step": 79985 + }, + { + "epoch": 5.4348416904470715, + "grad_norm": 0.1942276954650879, + "learning_rate": 3.209250577524121e-06, + "loss": 3.9475, + "step": 79990 + }, + { + "epoch": 5.435181410517734, + "grad_norm": 0.20174404978752136, + "learning_rate": 3.208825927435793e-06, + "loss": 4.0227, + "step": 79995 + }, + { + "epoch": 5.435521130588395, + "grad_norm": 0.31485673785209656, + "learning_rate": 3.208401277347466e-06, + "loss": 3.9525, + "step": 80000 + }, + { + "epoch": 5.435860850659057, + "grad_norm": 0.2668415606021881, + "learning_rate": 3.2079766272591388e-06, + "loss": 3.9216, + "step": 80005 + }, + { + "epoch": 5.436200570729719, + "grad_norm": 0.2636599540710449, + "learning_rate": 3.207551977170811e-06, + "loss": 3.8459, + "step": 80010 + }, + { + "epoch": 5.43654029080038, + "grad_norm": 0.22720946371555328, + "learning_rate": 3.2071273270824844e-06, + "loss": 3.7881, + "step": 80015 + }, + { + "epoch": 5.436880010871042, + "grad_norm": 0.3203278183937073, + "learning_rate": 3.206702676994157e-06, + "loss": 3.9073, + "step": 80020 + }, + { + "epoch": 5.437219730941704, + "grad_norm": 0.25703659653663635, + "learning_rate": 3.2062780269058296e-06, + "loss": 3.6823, + "step": 80025 + }, + { + "epoch": 5.4375594510123655, + "grad_norm": 0.36042898893356323, + "learning_rate": 3.2058533768175028e-06, + "loss": 4.0559, + "step": 80030 + }, + { + "epoch": 5.437899171083028, + "grad_norm": 0.22523336112499237, + "learning_rate": 3.2054287267291756e-06, + "loss": 3.9385, + "step": 80035 + }, + { + "epoch": 5.43823889115369, + "grad_norm": 0.19876231253147125, + "learning_rate": 3.205004076640848e-06, + "loss": 3.856, + "step": 80040 + }, + { + "epoch": 5.438578611224351, + "grad_norm": 0.21766780316829681, + "learning_rate": 3.2045794265525208e-06, + "loss": 3.7364, + "step": 80045 + }, + { + "epoch": 5.438918331295013, + "grad_norm": 0.3263103663921356, + "learning_rate": 3.204154776464194e-06, + "loss": 3.9973, + "step": 80050 + }, + { + "epoch": 5.439258051365675, + "grad_norm": 0.24903295934200287, + "learning_rate": 3.2037301263758664e-06, + "loss": 3.9659, + "step": 80055 + }, + { + "epoch": 5.439597771436336, + "grad_norm": 0.2272706776857376, + "learning_rate": 3.203305476287539e-06, + "loss": 3.6456, + "step": 80060 + }, + { + "epoch": 5.439937491506998, + "grad_norm": 0.29128730297088623, + "learning_rate": 3.2028808261992124e-06, + "loss": 3.7271, + "step": 80065 + }, + { + "epoch": 5.44027721157766, + "grad_norm": 0.22889769077301025, + "learning_rate": 3.2024561761108848e-06, + "loss": 3.8501, + "step": 80070 + }, + { + "epoch": 5.4406169316483215, + "grad_norm": 0.2417672574520111, + "learning_rate": 3.2020315260225576e-06, + "loss": 3.9366, + "step": 80075 + }, + { + "epoch": 5.440956651718984, + "grad_norm": 0.33939626812934875, + "learning_rate": 3.2016068759342308e-06, + "loss": 4.0342, + "step": 80080 + }, + { + "epoch": 5.441296371789646, + "grad_norm": 0.2346445471048355, + "learning_rate": 3.201182225845903e-06, + "loss": 3.9752, + "step": 80085 + }, + { + "epoch": 5.441636091860307, + "grad_norm": 0.24308930337429047, + "learning_rate": 3.200757575757576e-06, + "loss": 3.7875, + "step": 80090 + }, + { + "epoch": 5.441975811930969, + "grad_norm": 0.2932051718235016, + "learning_rate": 3.2003329256692488e-06, + "loss": 3.8964, + "step": 80095 + }, + { + "epoch": 5.442315532001631, + "grad_norm": 0.3528461456298828, + "learning_rate": 3.199908275580921e-06, + "loss": 3.975, + "step": 80100 + }, + { + "epoch": 5.442655252072292, + "grad_norm": 0.2166297733783722, + "learning_rate": 3.1994836254925944e-06, + "loss": 3.946, + "step": 80105 + }, + { + "epoch": 5.442994972142954, + "grad_norm": 0.24960575997829437, + "learning_rate": 3.199058975404267e-06, + "loss": 3.882, + "step": 80110 + }, + { + "epoch": 5.443334692213616, + "grad_norm": 0.2557511031627655, + "learning_rate": 3.1986343253159404e-06, + "loss": 4.0335, + "step": 80115 + }, + { + "epoch": 5.4436744122842775, + "grad_norm": 0.22665588557720184, + "learning_rate": 3.1982096752276128e-06, + "loss": 3.8717, + "step": 80120 + }, + { + "epoch": 5.44401413235494, + "grad_norm": 0.29315701127052307, + "learning_rate": 3.1977850251392856e-06, + "loss": 3.7061, + "step": 80125 + }, + { + "epoch": 5.444353852425602, + "grad_norm": 0.2620256841182709, + "learning_rate": 3.1973603750509584e-06, + "loss": 4.2262, + "step": 80130 + }, + { + "epoch": 5.444693572496263, + "grad_norm": 0.32782965898513794, + "learning_rate": 3.1969357249626307e-06, + "loss": 3.9526, + "step": 80135 + }, + { + "epoch": 5.445033292566925, + "grad_norm": 0.34283697605133057, + "learning_rate": 3.196511074874304e-06, + "loss": 4.0838, + "step": 80140 + }, + { + "epoch": 5.445373012637587, + "grad_norm": 0.4956267178058624, + "learning_rate": 3.1960864247859768e-06, + "loss": 4.0264, + "step": 80145 + }, + { + "epoch": 5.445712732708248, + "grad_norm": 0.24165205657482147, + "learning_rate": 3.195661774697649e-06, + "loss": 4.089, + "step": 80150 + }, + { + "epoch": 5.44605245277891, + "grad_norm": 0.2165759801864624, + "learning_rate": 3.1952371246093224e-06, + "loss": 3.9671, + "step": 80155 + }, + { + "epoch": 5.446392172849572, + "grad_norm": 0.27206534147262573, + "learning_rate": 3.194812474520995e-06, + "loss": 3.8096, + "step": 80160 + }, + { + "epoch": 5.4467318929202335, + "grad_norm": 0.19447855651378632, + "learning_rate": 3.1943878244326676e-06, + "loss": 3.9365, + "step": 80165 + }, + { + "epoch": 5.447071612990896, + "grad_norm": 0.2627434730529785, + "learning_rate": 3.1939631743443404e-06, + "loss": 3.9587, + "step": 80170 + }, + { + "epoch": 5.447411333061558, + "grad_norm": 0.41896724700927734, + "learning_rate": 3.1935385242560136e-06, + "loss": 4.16, + "step": 80175 + }, + { + "epoch": 5.447751053132219, + "grad_norm": 0.36933112144470215, + "learning_rate": 3.193113874167686e-06, + "loss": 3.6851, + "step": 80180 + }, + { + "epoch": 5.448090773202881, + "grad_norm": 0.32592976093292236, + "learning_rate": 3.1926892240793588e-06, + "loss": 4.1824, + "step": 80185 + }, + { + "epoch": 5.448430493273543, + "grad_norm": 0.2787201702594757, + "learning_rate": 3.192264573991032e-06, + "loss": 3.8641, + "step": 80190 + }, + { + "epoch": 5.448770213344204, + "grad_norm": 0.35139477252960205, + "learning_rate": 3.1918399239027044e-06, + "loss": 3.9215, + "step": 80195 + }, + { + "epoch": 5.449109933414866, + "grad_norm": 0.28302451968193054, + "learning_rate": 3.191415273814377e-06, + "loss": 4.1505, + "step": 80200 + }, + { + "epoch": 5.449449653485528, + "grad_norm": 0.23397530615329742, + "learning_rate": 3.1909906237260504e-06, + "loss": 3.7957, + "step": 80205 + }, + { + "epoch": 5.4497893735561895, + "grad_norm": 0.35363897681236267, + "learning_rate": 3.1905659736377228e-06, + "loss": 3.9735, + "step": 80210 + }, + { + "epoch": 5.450129093626852, + "grad_norm": 0.2409089207649231, + "learning_rate": 3.1901413235493956e-06, + "loss": 3.9528, + "step": 80215 + }, + { + "epoch": 5.450468813697514, + "grad_norm": 0.28026247024536133, + "learning_rate": 3.1897166734610684e-06, + "loss": 3.7516, + "step": 80220 + }, + { + "epoch": 5.450808533768175, + "grad_norm": 0.3133228123188019, + "learning_rate": 3.1892920233727407e-06, + "loss": 3.8618, + "step": 80225 + }, + { + "epoch": 5.451148253838837, + "grad_norm": 0.2580001652240753, + "learning_rate": 3.188867373284414e-06, + "loss": 3.8895, + "step": 80230 + }, + { + "epoch": 5.451487973909499, + "grad_norm": 0.23013834655284882, + "learning_rate": 3.1884427231960868e-06, + "loss": 4.0051, + "step": 80235 + }, + { + "epoch": 5.45182769398016, + "grad_norm": 0.2396932989358902, + "learning_rate": 3.188018073107759e-06, + "loss": 3.7804, + "step": 80240 + }, + { + "epoch": 5.452167414050822, + "grad_norm": 0.27770745754241943, + "learning_rate": 3.1875934230194324e-06, + "loss": 3.886, + "step": 80245 + }, + { + "epoch": 5.452507134121484, + "grad_norm": 0.406209796667099, + "learning_rate": 3.187168772931105e-06, + "loss": 3.8535, + "step": 80250 + }, + { + "epoch": 5.4528468541921455, + "grad_norm": 0.21301721036434174, + "learning_rate": 3.1867441228427775e-06, + "loss": 4.1385, + "step": 80255 + }, + { + "epoch": 5.453186574262808, + "grad_norm": 0.26372990012168884, + "learning_rate": 3.1863194727544503e-06, + "loss": 4.2034, + "step": 80260 + }, + { + "epoch": 5.45352629433347, + "grad_norm": 0.2930130362510681, + "learning_rate": 3.1858948226661236e-06, + "loss": 4.0139, + "step": 80265 + }, + { + "epoch": 5.453866014404131, + "grad_norm": 0.23670071363449097, + "learning_rate": 3.185470172577796e-06, + "loss": 3.8015, + "step": 80270 + }, + { + "epoch": 5.454205734474793, + "grad_norm": 0.29797741770744324, + "learning_rate": 3.1850455224894687e-06, + "loss": 3.7897, + "step": 80275 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.32414594292640686, + "learning_rate": 3.184620872401142e-06, + "loss": 4.0007, + "step": 80280 + }, + { + "epoch": 5.454885174616116, + "grad_norm": 0.28339800238609314, + "learning_rate": 3.1841962223128148e-06, + "loss": 3.804, + "step": 80285 + }, + { + "epoch": 5.455224894686778, + "grad_norm": 0.35800307989120483, + "learning_rate": 3.183771572224487e-06, + "loss": 3.8369, + "step": 80290 + }, + { + "epoch": 5.4555646147574395, + "grad_norm": 0.25267699360847473, + "learning_rate": 3.18334692213616e-06, + "loss": 4.2197, + "step": 80295 + }, + { + "epoch": 5.4559043348281016, + "grad_norm": 0.2605094909667969, + "learning_rate": 3.182922272047833e-06, + "loss": 4.1073, + "step": 80300 + }, + { + "epoch": 5.456244054898764, + "grad_norm": 0.47260743379592896, + "learning_rate": 3.1824976219595056e-06, + "loss": 3.9534, + "step": 80305 + }, + { + "epoch": 5.456583774969425, + "grad_norm": 0.26148176193237305, + "learning_rate": 3.1820729718711784e-06, + "loss": 3.98, + "step": 80310 + }, + { + "epoch": 5.456923495040087, + "grad_norm": 0.2956985533237457, + "learning_rate": 3.1816483217828516e-06, + "loss": 3.9414, + "step": 80315 + }, + { + "epoch": 5.457263215110749, + "grad_norm": 0.326760470867157, + "learning_rate": 3.181223671694524e-06, + "loss": 3.8374, + "step": 80320 + }, + { + "epoch": 5.45760293518141, + "grad_norm": 0.2743455171585083, + "learning_rate": 3.1807990216061968e-06, + "loss": 3.9412, + "step": 80325 + }, + { + "epoch": 5.457942655252072, + "grad_norm": 0.2777725160121918, + "learning_rate": 3.1803743715178696e-06, + "loss": 4.1473, + "step": 80330 + }, + { + "epoch": 5.458282375322734, + "grad_norm": 0.208855539560318, + "learning_rate": 3.1799497214295424e-06, + "loss": 4.1883, + "step": 80335 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 0.2881701588630676, + "learning_rate": 3.179525071341215e-06, + "loss": 3.909, + "step": 80340 + }, + { + "epoch": 5.458961815464058, + "grad_norm": 0.3033711016178131, + "learning_rate": 3.179100421252888e-06, + "loss": 3.9389, + "step": 80345 + }, + { + "epoch": 5.45930153553472, + "grad_norm": 0.2589449882507324, + "learning_rate": 3.1786757711645603e-06, + "loss": 3.924, + "step": 80350 + }, + { + "epoch": 5.459641255605381, + "grad_norm": 0.286417156457901, + "learning_rate": 3.178336051093899e-06, + "loss": 4.1266, + "step": 80355 + }, + { + "epoch": 5.459980975676043, + "grad_norm": 0.2679178714752197, + "learning_rate": 3.1779114010055716e-06, + "loss": 3.7808, + "step": 80360 + }, + { + "epoch": 5.460320695746705, + "grad_norm": 0.23467709124088287, + "learning_rate": 3.177486750917245e-06, + "loss": 3.9939, + "step": 80365 + }, + { + "epoch": 5.460660415817366, + "grad_norm": 0.3771617114543915, + "learning_rate": 3.1770621008289172e-06, + "loss": 3.9904, + "step": 80370 + }, + { + "epoch": 5.461000135888028, + "grad_norm": 0.22776924073696136, + "learning_rate": 3.17663745074059e-06, + "loss": 4.092, + "step": 80375 + }, + { + "epoch": 5.46133985595869, + "grad_norm": 0.30615124106407166, + "learning_rate": 3.176212800652263e-06, + "loss": 3.9881, + "step": 80380 + }, + { + "epoch": 5.4616795760293515, + "grad_norm": 0.4621178209781647, + "learning_rate": 3.175788150563935e-06, + "loss": 3.8454, + "step": 80385 + }, + { + "epoch": 5.462019296100014, + "grad_norm": 0.30099600553512573, + "learning_rate": 3.1753635004756084e-06, + "loss": 4.175, + "step": 80390 + }, + { + "epoch": 5.462359016170676, + "grad_norm": 0.2633526027202606, + "learning_rate": 3.1749388503872812e-06, + "loss": 3.9151, + "step": 80395 + }, + { + "epoch": 5.462698736241337, + "grad_norm": 0.2679753303527832, + "learning_rate": 3.1745142002989536e-06, + "loss": 3.8822, + "step": 80400 + }, + { + "epoch": 5.463038456311999, + "grad_norm": 0.26351073384284973, + "learning_rate": 3.174089550210627e-06, + "loss": 3.8248, + "step": 80405 + }, + { + "epoch": 5.463378176382661, + "grad_norm": 0.22284789383411407, + "learning_rate": 3.1736649001222996e-06, + "loss": 3.9577, + "step": 80410 + }, + { + "epoch": 5.463717896453322, + "grad_norm": 0.5650604963302612, + "learning_rate": 3.173240250033972e-06, + "loss": 4.0291, + "step": 80415 + }, + { + "epoch": 5.464057616523984, + "grad_norm": 0.29738524556159973, + "learning_rate": 3.1728155999456452e-06, + "loss": 3.7972, + "step": 80420 + }, + { + "epoch": 5.464397336594646, + "grad_norm": 0.27268949151039124, + "learning_rate": 3.172390949857318e-06, + "loss": 3.5768, + "step": 80425 + }, + { + "epoch": 5.4647370566653075, + "grad_norm": 0.30972790718078613, + "learning_rate": 3.1719662997689904e-06, + "loss": 3.9252, + "step": 80430 + }, + { + "epoch": 5.46507677673597, + "grad_norm": 0.2004961520433426, + "learning_rate": 3.1715416496806632e-06, + "loss": 3.925, + "step": 80435 + }, + { + "epoch": 5.465416496806632, + "grad_norm": 0.23703621327877045, + "learning_rate": 3.1711169995923364e-06, + "loss": 4.0159, + "step": 80440 + }, + { + "epoch": 5.465756216877293, + "grad_norm": 0.24926437437534332, + "learning_rate": 3.170692349504009e-06, + "loss": 4.3902, + "step": 80445 + }, + { + "epoch": 5.466095936947955, + "grad_norm": 0.26308566331863403, + "learning_rate": 3.1702676994156816e-06, + "loss": 4.161, + "step": 80450 + }, + { + "epoch": 5.466435657018617, + "grad_norm": 0.20057009160518646, + "learning_rate": 3.169843049327355e-06, + "loss": 3.9534, + "step": 80455 + }, + { + "epoch": 5.466775377089278, + "grad_norm": 0.5916411876678467, + "learning_rate": 3.1694183992390272e-06, + "loss": 3.8714, + "step": 80460 + }, + { + "epoch": 5.46711509715994, + "grad_norm": 0.37281033396720886, + "learning_rate": 3.1689937491507e-06, + "loss": 4.2029, + "step": 80465 + }, + { + "epoch": 5.467454817230602, + "grad_norm": 0.27681639790534973, + "learning_rate": 3.168569099062373e-06, + "loss": 3.8593, + "step": 80470 + }, + { + "epoch": 5.4677945373012635, + "grad_norm": 0.28691884875297546, + "learning_rate": 3.168144448974045e-06, + "loss": 3.9236, + "step": 80475 + }, + { + "epoch": 5.468134257371926, + "grad_norm": 0.2902889549732208, + "learning_rate": 3.1677197988857184e-06, + "loss": 3.797, + "step": 80480 + }, + { + "epoch": 5.468473977442588, + "grad_norm": 0.3765753507614136, + "learning_rate": 3.1672951487973912e-06, + "loss": 4.0243, + "step": 80485 + }, + { + "epoch": 5.468813697513249, + "grad_norm": 0.47514382004737854, + "learning_rate": 3.1668704987090645e-06, + "loss": 4.0126, + "step": 80490 + }, + { + "epoch": 5.469153417583911, + "grad_norm": 0.4487714171409607, + "learning_rate": 3.166445848620737e-06, + "loss": 3.9193, + "step": 80495 + }, + { + "epoch": 5.469493137654573, + "grad_norm": 0.24008384346961975, + "learning_rate": 3.1660211985324096e-06, + "loss": 3.7993, + "step": 80500 + }, + { + "epoch": 5.469832857725234, + "grad_norm": 0.21031701564788818, + "learning_rate": 3.1655965484440824e-06, + "loss": 3.9778, + "step": 80505 + }, + { + "epoch": 5.470172577795896, + "grad_norm": 0.24130019545555115, + "learning_rate": 3.165171898355755e-06, + "loss": 4.0455, + "step": 80510 + }, + { + "epoch": 5.470512297866558, + "grad_norm": 0.3567448854446411, + "learning_rate": 3.164747248267428e-06, + "loss": 3.9961, + "step": 80515 + }, + { + "epoch": 5.4708520179372195, + "grad_norm": 0.37022650241851807, + "learning_rate": 3.164322598179101e-06, + "loss": 3.7414, + "step": 80520 + }, + { + "epoch": 5.471191738007882, + "grad_norm": 0.3031681180000305, + "learning_rate": 3.163897948090773e-06, + "loss": 3.9588, + "step": 80525 + }, + { + "epoch": 5.471531458078544, + "grad_norm": 0.1893414855003357, + "learning_rate": 3.1634732980024464e-06, + "loss": 3.8749, + "step": 80530 + }, + { + "epoch": 5.471871178149205, + "grad_norm": 0.21016985177993774, + "learning_rate": 3.1630486479141192e-06, + "loss": 3.6696, + "step": 80535 + }, + { + "epoch": 5.472210898219867, + "grad_norm": 0.31074485182762146, + "learning_rate": 3.1626239978257916e-06, + "loss": 3.9736, + "step": 80540 + }, + { + "epoch": 5.472550618290528, + "grad_norm": 0.31105074286460876, + "learning_rate": 3.1621993477374644e-06, + "loss": 4.1655, + "step": 80545 + }, + { + "epoch": 5.47289033836119, + "grad_norm": 0.2500925362110138, + "learning_rate": 3.1617746976491376e-06, + "loss": 3.8264, + "step": 80550 + }, + { + "epoch": 5.473230058431852, + "grad_norm": 0.22228699922561646, + "learning_rate": 3.16135004756081e-06, + "loss": 3.8909, + "step": 80555 + }, + { + "epoch": 5.4735697785025135, + "grad_norm": 0.2653433680534363, + "learning_rate": 3.160925397472483e-06, + "loss": 3.7232, + "step": 80560 + }, + { + "epoch": 5.4739094985731755, + "grad_norm": 0.3205251395702362, + "learning_rate": 3.160500747384156e-06, + "loss": 3.7762, + "step": 80565 + }, + { + "epoch": 5.474249218643838, + "grad_norm": 0.23544570803642273, + "learning_rate": 3.1600760972958284e-06, + "loss": 3.9147, + "step": 80570 + }, + { + "epoch": 5.474588938714499, + "grad_norm": 0.32351717352867126, + "learning_rate": 3.1596514472075012e-06, + "loss": 3.8898, + "step": 80575 + }, + { + "epoch": 5.474928658785161, + "grad_norm": 0.4918053448200226, + "learning_rate": 3.1592267971191744e-06, + "loss": 4.1091, + "step": 80580 + }, + { + "epoch": 5.475268378855823, + "grad_norm": 0.31805017590522766, + "learning_rate": 3.158802147030847e-06, + "loss": 3.9813, + "step": 80585 + }, + { + "epoch": 5.475608098926484, + "grad_norm": 0.3999897837638855, + "learning_rate": 3.1583774969425196e-06, + "loss": 3.9726, + "step": 80590 + }, + { + "epoch": 5.475947818997146, + "grad_norm": 0.3572256565093994, + "learning_rate": 3.1579528468541924e-06, + "loss": 4.0613, + "step": 80595 + }, + { + "epoch": 5.476287539067808, + "grad_norm": 0.3098893165588379, + "learning_rate": 3.157528196765865e-06, + "loss": 3.9258, + "step": 80600 + }, + { + "epoch": 5.4766272591384695, + "grad_norm": 0.2635262608528137, + "learning_rate": 3.157103546677538e-06, + "loss": 3.8906, + "step": 80605 + }, + { + "epoch": 5.476966979209132, + "grad_norm": 0.2497612088918686, + "learning_rate": 3.156678896589211e-06, + "loss": 3.8132, + "step": 80610 + }, + { + "epoch": 5.477306699279794, + "grad_norm": 0.2780151069164276, + "learning_rate": 3.156254246500883e-06, + "loss": 4.0758, + "step": 80615 + }, + { + "epoch": 5.477646419350455, + "grad_norm": 0.2355847805738449, + "learning_rate": 3.1558295964125564e-06, + "loss": 4.1965, + "step": 80620 + }, + { + "epoch": 5.477986139421117, + "grad_norm": 0.32529720664024353, + "learning_rate": 3.1554049463242292e-06, + "loss": 4.1275, + "step": 80625 + }, + { + "epoch": 5.478325859491779, + "grad_norm": 0.5530174970626831, + "learning_rate": 3.1549802962359016e-06, + "loss": 3.8613, + "step": 80630 + }, + { + "epoch": 5.47866557956244, + "grad_norm": 0.24252596497535706, + "learning_rate": 3.1545556461475744e-06, + "loss": 3.8072, + "step": 80635 + }, + { + "epoch": 5.479005299633102, + "grad_norm": 0.25578510761260986, + "learning_rate": 3.1541309960592476e-06, + "loss": 3.9943, + "step": 80640 + }, + { + "epoch": 5.479345019703764, + "grad_norm": 0.27113479375839233, + "learning_rate": 3.15370634597092e-06, + "loss": 3.783, + "step": 80645 + }, + { + "epoch": 5.4796847397744255, + "grad_norm": 0.18929219245910645, + "learning_rate": 3.153281695882593e-06, + "loss": 3.9437, + "step": 80650 + }, + { + "epoch": 5.480024459845088, + "grad_norm": 0.3330477774143219, + "learning_rate": 3.152857045794266e-06, + "loss": 3.8824, + "step": 80655 + }, + { + "epoch": 5.48036417991575, + "grad_norm": 0.2345525175333023, + "learning_rate": 3.152432395705939e-06, + "loss": 3.8524, + "step": 80660 + }, + { + "epoch": 5.480703899986411, + "grad_norm": 0.2976064085960388, + "learning_rate": 3.152007745617611e-06, + "loss": 3.7479, + "step": 80665 + }, + { + "epoch": 5.481043620057073, + "grad_norm": 0.29370999336242676, + "learning_rate": 3.151583095529284e-06, + "loss": 3.8448, + "step": 80670 + }, + { + "epoch": 5.481383340127735, + "grad_norm": 0.2841353714466095, + "learning_rate": 3.1511584454409572e-06, + "loss": 4.0598, + "step": 80675 + }, + { + "epoch": 5.481723060198396, + "grad_norm": 0.2601211667060852, + "learning_rate": 3.1507337953526296e-06, + "loss": 3.9566, + "step": 80680 + }, + { + "epoch": 5.482062780269058, + "grad_norm": 0.2996302545070648, + "learning_rate": 3.1503091452643024e-06, + "loss": 4.0391, + "step": 80685 + }, + { + "epoch": 5.48240250033972, + "grad_norm": 0.38614606857299805, + "learning_rate": 3.1498844951759756e-06, + "loss": 3.8627, + "step": 80690 + }, + { + "epoch": 5.4827422204103815, + "grad_norm": 0.27909061312675476, + "learning_rate": 3.149459845087648e-06, + "loss": 4.1429, + "step": 80695 + }, + { + "epoch": 5.483081940481044, + "grad_norm": 0.2730897068977356, + "learning_rate": 3.149035194999321e-06, + "loss": 3.8982, + "step": 80700 + }, + { + "epoch": 5.483421660551706, + "grad_norm": 0.20014524459838867, + "learning_rate": 3.148610544910994e-06, + "loss": 3.6251, + "step": 80705 + }, + { + "epoch": 5.483761380622367, + "grad_norm": 0.18548144400119781, + "learning_rate": 3.1481858948226664e-06, + "loss": 4.0878, + "step": 80710 + }, + { + "epoch": 5.484101100693029, + "grad_norm": 0.2460627257823944, + "learning_rate": 3.1477612447343392e-06, + "loss": 3.8825, + "step": 80715 + }, + { + "epoch": 5.484440820763691, + "grad_norm": 0.2660672962665558, + "learning_rate": 3.147336594646012e-06, + "loss": 4.0404, + "step": 80720 + }, + { + "epoch": 5.484780540834352, + "grad_norm": 0.22401665151119232, + "learning_rate": 3.1469119445576844e-06, + "loss": 3.6864, + "step": 80725 + }, + { + "epoch": 5.485120260905014, + "grad_norm": 0.247064009308815, + "learning_rate": 3.1464872944693576e-06, + "loss": 4.1643, + "step": 80730 + }, + { + "epoch": 5.485459980975676, + "grad_norm": 0.29007020592689514, + "learning_rate": 3.1460626443810304e-06, + "loss": 4.1738, + "step": 80735 + }, + { + "epoch": 5.4857997010463375, + "grad_norm": 0.3522483706474304, + "learning_rate": 3.145637994292703e-06, + "loss": 3.8113, + "step": 80740 + }, + { + "epoch": 5.486139421117, + "grad_norm": 0.24171248078346252, + "learning_rate": 3.145213344204376e-06, + "loss": 3.7316, + "step": 80745 + }, + { + "epoch": 5.486479141187662, + "grad_norm": 0.22555364668369293, + "learning_rate": 3.144788694116049e-06, + "loss": 4.0185, + "step": 80750 + }, + { + "epoch": 5.486818861258323, + "grad_norm": 0.23406094312667847, + "learning_rate": 3.144364044027721e-06, + "loss": 3.8783, + "step": 80755 + }, + { + "epoch": 5.487158581328985, + "grad_norm": 0.21929576992988586, + "learning_rate": 3.143939393939394e-06, + "loss": 3.6837, + "step": 80760 + }, + { + "epoch": 5.487498301399647, + "grad_norm": 0.2578637897968292, + "learning_rate": 3.1435147438510672e-06, + "loss": 3.5976, + "step": 80765 + }, + { + "epoch": 5.487838021470308, + "grad_norm": 0.2487485259771347, + "learning_rate": 3.1430900937627396e-06, + "loss": 4.1818, + "step": 80770 + }, + { + "epoch": 5.48817774154097, + "grad_norm": 0.23200099170207977, + "learning_rate": 3.1426654436744124e-06, + "loss": 3.6373, + "step": 80775 + }, + { + "epoch": 5.488517461611632, + "grad_norm": 0.3842478394508362, + "learning_rate": 3.1422407935860856e-06, + "loss": 3.9133, + "step": 80780 + }, + { + "epoch": 5.4888571816822935, + "grad_norm": 0.2521730959415436, + "learning_rate": 3.141816143497758e-06, + "loss": 3.9832, + "step": 80785 + }, + { + "epoch": 5.489196901752956, + "grad_norm": 0.27198681235313416, + "learning_rate": 3.141391493409431e-06, + "loss": 3.7692, + "step": 80790 + }, + { + "epoch": 5.489536621823618, + "grad_norm": 0.18372714519500732, + "learning_rate": 3.1409668433211036e-06, + "loss": 3.9284, + "step": 80795 + }, + { + "epoch": 5.489876341894279, + "grad_norm": 0.24764497578144073, + "learning_rate": 3.1405421932327764e-06, + "loss": 3.7757, + "step": 80800 + }, + { + "epoch": 5.490216061964941, + "grad_norm": 0.28683409094810486, + "learning_rate": 3.140117543144449e-06, + "loss": 3.9993, + "step": 80805 + }, + { + "epoch": 5.490555782035603, + "grad_norm": 0.3448578715324402, + "learning_rate": 3.139692893056122e-06, + "loss": 4.1571, + "step": 80810 + }, + { + "epoch": 5.490895502106264, + "grad_norm": 0.30242159962654114, + "learning_rate": 3.1392682429677944e-06, + "loss": 4.1446, + "step": 80815 + }, + { + "epoch": 5.491235222176926, + "grad_norm": 0.26524606347084045, + "learning_rate": 3.1388435928794676e-06, + "loss": 3.9473, + "step": 80820 + }, + { + "epoch": 5.491574942247588, + "grad_norm": 0.2653529644012451, + "learning_rate": 3.1384189427911404e-06, + "loss": 3.9305, + "step": 80825 + }, + { + "epoch": 5.4919146623182495, + "grad_norm": 0.3044092059135437, + "learning_rate": 3.137994292702813e-06, + "loss": 4.0099, + "step": 80830 + }, + { + "epoch": 5.492254382388912, + "grad_norm": 0.21017077565193176, + "learning_rate": 3.137569642614486e-06, + "loss": 3.9486, + "step": 80835 + }, + { + "epoch": 5.492594102459574, + "grad_norm": 0.24268923699855804, + "learning_rate": 3.137144992526159e-06, + "loss": 4.0247, + "step": 80840 + }, + { + "epoch": 5.492933822530235, + "grad_norm": 0.34895503520965576, + "learning_rate": 3.1367203424378316e-06, + "loss": 3.7235, + "step": 80845 + }, + { + "epoch": 5.493273542600897, + "grad_norm": 0.22517161071300507, + "learning_rate": 3.136295692349504e-06, + "loss": 3.8506, + "step": 80850 + }, + { + "epoch": 5.493613262671559, + "grad_norm": 0.23873209953308105, + "learning_rate": 3.1358710422611772e-06, + "loss": 3.7497, + "step": 80855 + }, + { + "epoch": 5.49395298274222, + "grad_norm": 0.25586798787117004, + "learning_rate": 3.13544639217285e-06, + "loss": 4.1117, + "step": 80860 + }, + { + "epoch": 5.494292702812882, + "grad_norm": 0.22568267583847046, + "learning_rate": 3.1350217420845224e-06, + "loss": 3.7774, + "step": 80865 + }, + { + "epoch": 5.494632422883544, + "grad_norm": 0.26163914799690247, + "learning_rate": 3.1345970919961956e-06, + "loss": 4.074, + "step": 80870 + }, + { + "epoch": 5.4949721429542056, + "grad_norm": 0.26494741439819336, + "learning_rate": 3.1341724419078684e-06, + "loss": 3.9486, + "step": 80875 + }, + { + "epoch": 5.495311863024868, + "grad_norm": 0.32024386525154114, + "learning_rate": 3.133747791819541e-06, + "loss": 4.1379, + "step": 80880 + }, + { + "epoch": 5.49565158309553, + "grad_norm": 0.21641787886619568, + "learning_rate": 3.1333231417312136e-06, + "loss": 3.7331, + "step": 80885 + }, + { + "epoch": 5.495991303166191, + "grad_norm": 0.2706885039806366, + "learning_rate": 3.132898491642887e-06, + "loss": 3.9005, + "step": 80890 + }, + { + "epoch": 5.496331023236853, + "grad_norm": 0.2535437345504761, + "learning_rate": 3.132473841554559e-06, + "loss": 3.7866, + "step": 80895 + }, + { + "epoch": 5.496670743307515, + "grad_norm": 0.4450170695781708, + "learning_rate": 3.132049191466232e-06, + "loss": 3.7309, + "step": 80900 + }, + { + "epoch": 5.497010463378176, + "grad_norm": 0.38048774003982544, + "learning_rate": 3.1316245413779052e-06, + "loss": 4.0827, + "step": 80905 + }, + { + "epoch": 5.497350183448838, + "grad_norm": 0.41986265778541565, + "learning_rate": 3.1311998912895776e-06, + "loss": 3.8959, + "step": 80910 + }, + { + "epoch": 5.4976899035195, + "grad_norm": 0.24606934189796448, + "learning_rate": 3.1307752412012504e-06, + "loss": 4.0666, + "step": 80915 + }, + { + "epoch": 5.498029623590162, + "grad_norm": 0.21144317090511322, + "learning_rate": 3.130350591112923e-06, + "loss": 3.8143, + "step": 80920 + }, + { + "epoch": 5.498369343660824, + "grad_norm": 0.42800068855285645, + "learning_rate": 3.129925941024596e-06, + "loss": 3.9526, + "step": 80925 + }, + { + "epoch": 5.498709063731486, + "grad_norm": 0.27879250049591064, + "learning_rate": 3.129501290936269e-06, + "loss": 3.9212, + "step": 80930 + }, + { + "epoch": 5.499048783802147, + "grad_norm": 0.25562748312950134, + "learning_rate": 3.1290766408479416e-06, + "loss": 3.8809, + "step": 80935 + }, + { + "epoch": 5.499388503872809, + "grad_norm": 0.2148861289024353, + "learning_rate": 3.128651990759614e-06, + "loss": 3.8729, + "step": 80940 + }, + { + "epoch": 5.499728223943471, + "grad_norm": 0.23167341947555542, + "learning_rate": 3.128227340671287e-06, + "loss": 3.8684, + "step": 80945 + }, + { + "epoch": 5.500067944014132, + "grad_norm": 0.2390887290239334, + "learning_rate": 3.12780269058296e-06, + "loss": 3.9475, + "step": 80950 + }, + { + "epoch": 5.500407664084794, + "grad_norm": 0.2789984345436096, + "learning_rate": 3.1273780404946324e-06, + "loss": 3.826, + "step": 80955 + }, + { + "epoch": 5.500747384155456, + "grad_norm": 0.28632017970085144, + "learning_rate": 3.1269533904063056e-06, + "loss": 3.8326, + "step": 80960 + }, + { + "epoch": 5.501087104226118, + "grad_norm": 0.2115737497806549, + "learning_rate": 3.1265287403179784e-06, + "loss": 3.7264, + "step": 80965 + }, + { + "epoch": 5.50142682429678, + "grad_norm": 0.2458307147026062, + "learning_rate": 3.1261040902296508e-06, + "loss": 3.9207, + "step": 80970 + }, + { + "epoch": 5.501766544367442, + "grad_norm": 0.23788276314735413, + "learning_rate": 3.1256794401413236e-06, + "loss": 3.9247, + "step": 80975 + }, + { + "epoch": 5.502106264438103, + "grad_norm": 0.23224861919879913, + "learning_rate": 3.125254790052997e-06, + "loss": 4.0029, + "step": 80980 + }, + { + "epoch": 5.502445984508765, + "grad_norm": 0.2670969069004059, + "learning_rate": 3.124830139964669e-06, + "loss": 3.9446, + "step": 80985 + }, + { + "epoch": 5.502785704579426, + "grad_norm": 0.3506486713886261, + "learning_rate": 3.124405489876342e-06, + "loss": 3.7755, + "step": 80990 + }, + { + "epoch": 5.503125424650088, + "grad_norm": 0.2676491141319275, + "learning_rate": 3.123980839788015e-06, + "loss": 3.6786, + "step": 80995 + }, + { + "epoch": 5.50346514472075, + "grad_norm": 0.19249245524406433, + "learning_rate": 3.1235561896996876e-06, + "loss": 3.857, + "step": 81000 + }, + { + "epoch": 5.5038048647914115, + "grad_norm": 0.25857728719711304, + "learning_rate": 3.1231315396113604e-06, + "loss": 3.9315, + "step": 81005 + }, + { + "epoch": 5.504144584862074, + "grad_norm": 0.2097025066614151, + "learning_rate": 3.122706889523033e-06, + "loss": 3.9094, + "step": 81010 + }, + { + "epoch": 5.504484304932736, + "grad_norm": 0.35117703676223755, + "learning_rate": 3.1222822394347064e-06, + "loss": 3.9412, + "step": 81015 + }, + { + "epoch": 5.504824025003397, + "grad_norm": 0.2315720021724701, + "learning_rate": 3.121857589346379e-06, + "loss": 3.9994, + "step": 81020 + }, + { + "epoch": 5.505163745074059, + "grad_norm": 0.26492130756378174, + "learning_rate": 3.1214329392580516e-06, + "loss": 3.769, + "step": 81025 + }, + { + "epoch": 5.505503465144721, + "grad_norm": 0.1913767158985138, + "learning_rate": 3.121008289169725e-06, + "loss": 3.694, + "step": 81030 + }, + { + "epoch": 5.505843185215382, + "grad_norm": 0.2293664515018463, + "learning_rate": 3.120583639081397e-06, + "loss": 3.8232, + "step": 81035 + }, + { + "epoch": 5.506182905286044, + "grad_norm": 0.2523459792137146, + "learning_rate": 3.12015898899307e-06, + "loss": 3.9564, + "step": 81040 + }, + { + "epoch": 5.506522625356706, + "grad_norm": 0.3123188316822052, + "learning_rate": 3.119734338904743e-06, + "loss": 3.9491, + "step": 81045 + }, + { + "epoch": 5.5068623454273675, + "grad_norm": 0.3148459196090698, + "learning_rate": 3.119309688816415e-06, + "loss": 3.8494, + "step": 81050 + }, + { + "epoch": 5.50720206549803, + "grad_norm": 0.2894100248813629, + "learning_rate": 3.1188850387280884e-06, + "loss": 3.9592, + "step": 81055 + }, + { + "epoch": 5.507541785568692, + "grad_norm": 0.2510594427585602, + "learning_rate": 3.118460388639761e-06, + "loss": 3.8852, + "step": 81060 + }, + { + "epoch": 5.507881505639353, + "grad_norm": 0.2162431925535202, + "learning_rate": 3.1180357385514336e-06, + "loss": 3.9484, + "step": 81065 + }, + { + "epoch": 5.508221225710015, + "grad_norm": 0.20033659040927887, + "learning_rate": 3.117611088463107e-06, + "loss": 3.8964, + "step": 81070 + }, + { + "epoch": 5.508560945780677, + "grad_norm": 0.3011295199394226, + "learning_rate": 3.1171864383747796e-06, + "loss": 3.9679, + "step": 81075 + }, + { + "epoch": 5.508900665851338, + "grad_norm": 0.2556789815425873, + "learning_rate": 3.116761788286452e-06, + "loss": 3.9513, + "step": 81080 + }, + { + "epoch": 5.509240385922, + "grad_norm": 0.3558541238307953, + "learning_rate": 3.116337138198125e-06, + "loss": 3.8933, + "step": 81085 + }, + { + "epoch": 5.509580105992662, + "grad_norm": 0.2712659537792206, + "learning_rate": 3.115912488109798e-06, + "loss": 3.8832, + "step": 81090 + }, + { + "epoch": 5.5099198260633235, + "grad_norm": 0.2890116274356842, + "learning_rate": 3.1154878380214704e-06, + "loss": 3.9823, + "step": 81095 + }, + { + "epoch": 5.510259546133986, + "grad_norm": 0.2646014392375946, + "learning_rate": 3.115063187933143e-06, + "loss": 3.9244, + "step": 81100 + }, + { + "epoch": 5.510599266204648, + "grad_norm": 0.2849748730659485, + "learning_rate": 3.1146385378448164e-06, + "loss": 4.0286, + "step": 81105 + }, + { + "epoch": 5.510938986275309, + "grad_norm": 0.2634417414665222, + "learning_rate": 3.1142138877564888e-06, + "loss": 3.9537, + "step": 81110 + }, + { + "epoch": 5.511278706345971, + "grad_norm": 0.21387101709842682, + "learning_rate": 3.1137892376681616e-06, + "loss": 3.7846, + "step": 81115 + }, + { + "epoch": 5.511618426416633, + "grad_norm": 0.21164487302303314, + "learning_rate": 3.113364587579835e-06, + "loss": 3.8507, + "step": 81120 + }, + { + "epoch": 5.511958146487294, + "grad_norm": 0.4051561951637268, + "learning_rate": 3.112939937491507e-06, + "loss": 3.8362, + "step": 81125 + }, + { + "epoch": 5.512297866557956, + "grad_norm": 0.26898515224456787, + "learning_rate": 3.11251528740318e-06, + "loss": 4.0275, + "step": 81130 + }, + { + "epoch": 5.512637586628618, + "grad_norm": 0.26210346817970276, + "learning_rate": 3.1120906373148528e-06, + "loss": 4.0257, + "step": 81135 + }, + { + "epoch": 5.5129773066992795, + "grad_norm": 0.22687318921089172, + "learning_rate": 3.111665987226525e-06, + "loss": 3.7302, + "step": 81140 + }, + { + "epoch": 5.513317026769942, + "grad_norm": 0.33843475580215454, + "learning_rate": 3.1112413371381984e-06, + "loss": 4.1151, + "step": 81145 + }, + { + "epoch": 5.513656746840604, + "grad_norm": 0.28254300355911255, + "learning_rate": 3.110816687049871e-06, + "loss": 4.0291, + "step": 81150 + }, + { + "epoch": 5.513996466911265, + "grad_norm": 0.21109764277935028, + "learning_rate": 3.1103920369615436e-06, + "loss": 3.8386, + "step": 81155 + }, + { + "epoch": 5.514336186981927, + "grad_norm": 0.2757878005504608, + "learning_rate": 3.109967386873217e-06, + "loss": 4.2203, + "step": 81160 + }, + { + "epoch": 5.514675907052589, + "grad_norm": 0.2864421010017395, + "learning_rate": 3.1095427367848896e-06, + "loss": 3.9282, + "step": 81165 + }, + { + "epoch": 5.51501562712325, + "grad_norm": 0.3618451654911041, + "learning_rate": 3.109118086696562e-06, + "loss": 3.9787, + "step": 81170 + }, + { + "epoch": 5.515355347193912, + "grad_norm": 0.49940523505210876, + "learning_rate": 3.1086934366082348e-06, + "loss": 3.9938, + "step": 81175 + }, + { + "epoch": 5.515695067264574, + "grad_norm": 0.2647528052330017, + "learning_rate": 3.108268786519908e-06, + "loss": 3.8452, + "step": 81180 + }, + { + "epoch": 5.516034787335236, + "grad_norm": 0.358100563287735, + "learning_rate": 3.107844136431581e-06, + "loss": 4.0766, + "step": 81185 + }, + { + "epoch": 5.516374507405898, + "grad_norm": 0.23232540488243103, + "learning_rate": 3.107419486343253e-06, + "loss": 3.9384, + "step": 81190 + }, + { + "epoch": 5.516714227476559, + "grad_norm": 0.26425644755363464, + "learning_rate": 3.1069948362549264e-06, + "loss": 3.8623, + "step": 81195 + }, + { + "epoch": 5.517053947547221, + "grad_norm": 0.26772937178611755, + "learning_rate": 3.106570186166599e-06, + "loss": 3.9627, + "step": 81200 + }, + { + "epoch": 5.517393667617883, + "grad_norm": 0.266032338142395, + "learning_rate": 3.1061455360782716e-06, + "loss": 3.9922, + "step": 81205 + }, + { + "epoch": 5.517733387688544, + "grad_norm": 0.32858023047447205, + "learning_rate": 3.105720885989945e-06, + "loss": 4.0988, + "step": 81210 + }, + { + "epoch": 5.518073107759206, + "grad_norm": 0.25054293870925903, + "learning_rate": 3.1052962359016176e-06, + "loss": 3.8609, + "step": 81215 + }, + { + "epoch": 5.518412827829868, + "grad_norm": 0.2729601562023163, + "learning_rate": 3.10487158581329e-06, + "loss": 4.0449, + "step": 81220 + }, + { + "epoch": 5.5187525479005295, + "grad_norm": 0.27301615476608276, + "learning_rate": 3.1044469357249628e-06, + "loss": 4.1356, + "step": 81225 + }, + { + "epoch": 5.519092267971192, + "grad_norm": 0.26690250635147095, + "learning_rate": 3.104022285636636e-06, + "loss": 3.9683, + "step": 81230 + }, + { + "epoch": 5.519431988041854, + "grad_norm": 0.23845352232456207, + "learning_rate": 3.1035976355483084e-06, + "loss": 4.1475, + "step": 81235 + }, + { + "epoch": 5.519771708112515, + "grad_norm": 0.2187957614660263, + "learning_rate": 3.103172985459981e-06, + "loss": 4.0438, + "step": 81240 + }, + { + "epoch": 5.520111428183177, + "grad_norm": 0.20180249214172363, + "learning_rate": 3.1027483353716544e-06, + "loss": 4.064, + "step": 81245 + }, + { + "epoch": 5.520451148253839, + "grad_norm": 0.2450101375579834, + "learning_rate": 3.1023236852833268e-06, + "loss": 4.0473, + "step": 81250 + }, + { + "epoch": 5.5207908683245, + "grad_norm": 0.30351829528808594, + "learning_rate": 3.1018990351949996e-06, + "loss": 3.8723, + "step": 81255 + }, + { + "epoch": 5.521130588395162, + "grad_norm": 0.2668631076812744, + "learning_rate": 3.1014743851066724e-06, + "loss": 4.2333, + "step": 81260 + }, + { + "epoch": 5.521470308465824, + "grad_norm": 0.2944149672985077, + "learning_rate": 3.1010497350183448e-06, + "loss": 3.8894, + "step": 81265 + }, + { + "epoch": 5.5218100285364855, + "grad_norm": 0.3067336082458496, + "learning_rate": 3.100625084930018e-06, + "loss": 3.8357, + "step": 81270 + }, + { + "epoch": 5.522149748607148, + "grad_norm": 0.33235979080200195, + "learning_rate": 3.1002004348416908e-06, + "loss": 3.9334, + "step": 81275 + }, + { + "epoch": 5.52248946867781, + "grad_norm": 0.24674877524375916, + "learning_rate": 3.099775784753363e-06, + "loss": 4.0464, + "step": 81280 + }, + { + "epoch": 5.522829188748471, + "grad_norm": 0.22985927760601044, + "learning_rate": 3.0993511346650364e-06, + "loss": 4.051, + "step": 81285 + }, + { + "epoch": 5.523168908819133, + "grad_norm": 0.31684261560440063, + "learning_rate": 3.098926484576709e-06, + "loss": 4.0727, + "step": 81290 + }, + { + "epoch": 5.523508628889795, + "grad_norm": 0.3290243148803711, + "learning_rate": 3.0985018344883816e-06, + "loss": 3.8016, + "step": 81295 + }, + { + "epoch": 5.523848348960456, + "grad_norm": 0.3049902617931366, + "learning_rate": 3.0980771844000544e-06, + "loss": 4.0124, + "step": 81300 + }, + { + "epoch": 5.524188069031118, + "grad_norm": 0.22218500077724457, + "learning_rate": 3.0976525343117276e-06, + "loss": 3.8973, + "step": 81305 + }, + { + "epoch": 5.52452778910178, + "grad_norm": 0.4176259934902191, + "learning_rate": 3.0972278842234e-06, + "loss": 4.0087, + "step": 81310 + }, + { + "epoch": 5.5248675091724415, + "grad_norm": 0.2868182063102722, + "learning_rate": 3.0968032341350728e-06, + "loss": 3.8342, + "step": 81315 + }, + { + "epoch": 5.525207229243104, + "grad_norm": 0.3050036132335663, + "learning_rate": 3.096378584046746e-06, + "loss": 3.6938, + "step": 81320 + }, + { + "epoch": 5.525546949313766, + "grad_norm": 0.23890307545661926, + "learning_rate": 3.0959539339584184e-06, + "loss": 3.9579, + "step": 81325 + }, + { + "epoch": 5.525886669384427, + "grad_norm": 0.20347753167152405, + "learning_rate": 3.095529283870091e-06, + "loss": 3.7692, + "step": 81330 + }, + { + "epoch": 5.526226389455089, + "grad_norm": 0.28346967697143555, + "learning_rate": 3.095104633781764e-06, + "loss": 4.1319, + "step": 81335 + }, + { + "epoch": 5.526566109525751, + "grad_norm": 0.1993688941001892, + "learning_rate": 3.0946799836934368e-06, + "loss": 3.963, + "step": 81340 + }, + { + "epoch": 5.526905829596412, + "grad_norm": 0.42034849524497986, + "learning_rate": 3.0942553336051096e-06, + "loss": 3.9397, + "step": 81345 + }, + { + "epoch": 5.527245549667074, + "grad_norm": 0.24298900365829468, + "learning_rate": 3.0938306835167824e-06, + "loss": 3.9319, + "step": 81350 + }, + { + "epoch": 5.527585269737736, + "grad_norm": 0.2837727963924408, + "learning_rate": 3.0934060334284556e-06, + "loss": 4.03, + "step": 81355 + }, + { + "epoch": 5.5279249898083975, + "grad_norm": 0.22944994270801544, + "learning_rate": 3.092981383340128e-06, + "loss": 3.746, + "step": 81360 + }, + { + "epoch": 5.52826470987906, + "grad_norm": 0.2648778557777405, + "learning_rate": 3.0925567332518008e-06, + "loss": 3.984, + "step": 81365 + }, + { + "epoch": 5.528604429949722, + "grad_norm": 0.3107696771621704, + "learning_rate": 3.092132083163474e-06, + "loss": 3.8684, + "step": 81370 + }, + { + "epoch": 5.528944150020383, + "grad_norm": 0.23740483820438385, + "learning_rate": 3.0917074330751464e-06, + "loss": 3.8375, + "step": 81375 + }, + { + "epoch": 5.529283870091045, + "grad_norm": 0.29176196455955505, + "learning_rate": 3.091282782986819e-06, + "loss": 3.933, + "step": 81380 + }, + { + "epoch": 5.529623590161707, + "grad_norm": 0.2820574939250946, + "learning_rate": 3.090858132898492e-06, + "loss": 4.1823, + "step": 81385 + }, + { + "epoch": 5.529963310232368, + "grad_norm": 0.25143271684646606, + "learning_rate": 3.0904334828101644e-06, + "loss": 3.8707, + "step": 81390 + }, + { + "epoch": 5.53030303030303, + "grad_norm": 0.29350799322128296, + "learning_rate": 3.0900088327218376e-06, + "loss": 3.7606, + "step": 81395 + }, + { + "epoch": 5.530642750373692, + "grad_norm": 0.38199079036712646, + "learning_rate": 3.0895841826335104e-06, + "loss": 3.9223, + "step": 81400 + }, + { + "epoch": 5.5309824704443535, + "grad_norm": 0.19635194540023804, + "learning_rate": 3.0891595325451828e-06, + "loss": 4.0306, + "step": 81405 + }, + { + "epoch": 5.531322190515016, + "grad_norm": 0.29750239849090576, + "learning_rate": 3.088734882456856e-06, + "loss": 4.146, + "step": 81410 + }, + { + "epoch": 5.531661910585678, + "grad_norm": 0.24103908240795135, + "learning_rate": 3.0883102323685288e-06, + "loss": 3.9673, + "step": 81415 + }, + { + "epoch": 5.532001630656339, + "grad_norm": 0.2576213479042053, + "learning_rate": 3.087885582280201e-06, + "loss": 4.0486, + "step": 81420 + }, + { + "epoch": 5.532341350727001, + "grad_norm": 0.23683714866638184, + "learning_rate": 3.087460932191874e-06, + "loss": 4.0511, + "step": 81425 + }, + { + "epoch": 5.532681070797663, + "grad_norm": 0.2129494547843933, + "learning_rate": 3.087036282103547e-06, + "loss": 3.8703, + "step": 81430 + }, + { + "epoch": 5.533020790868324, + "grad_norm": 0.23003503680229187, + "learning_rate": 3.0866116320152196e-06, + "loss": 4.0159, + "step": 81435 + }, + { + "epoch": 5.533360510938986, + "grad_norm": 0.24249133467674255, + "learning_rate": 3.0861869819268924e-06, + "loss": 3.8892, + "step": 81440 + }, + { + "epoch": 5.533700231009648, + "grad_norm": 0.3433679938316345, + "learning_rate": 3.0857623318385656e-06, + "loss": 4.024, + "step": 81445 + }, + { + "epoch": 5.5340399510803095, + "grad_norm": 0.2676454186439514, + "learning_rate": 3.085337681750238e-06, + "loss": 3.6322, + "step": 81450 + }, + { + "epoch": 5.534379671150972, + "grad_norm": 0.23713521659374237, + "learning_rate": 3.0849130316619108e-06, + "loss": 4.0091, + "step": 81455 + }, + { + "epoch": 5.534719391221634, + "grad_norm": 0.2739666998386383, + "learning_rate": 3.0844883815735836e-06, + "loss": 3.9179, + "step": 81460 + }, + { + "epoch": 5.535059111292295, + "grad_norm": 0.3746265769004822, + "learning_rate": 3.0840637314852564e-06, + "loss": 3.9814, + "step": 81465 + }, + { + "epoch": 5.535398831362957, + "grad_norm": 0.3959256410598755, + "learning_rate": 3.083639081396929e-06, + "loss": 3.8098, + "step": 81470 + }, + { + "epoch": 5.535738551433619, + "grad_norm": 0.2219621241092682, + "learning_rate": 3.083214431308602e-06, + "loss": 3.9125, + "step": 81475 + }, + { + "epoch": 5.53607827150428, + "grad_norm": 0.2992073893547058, + "learning_rate": 3.0827897812202743e-06, + "loss": 3.9795, + "step": 81480 + }, + { + "epoch": 5.536417991574942, + "grad_norm": 0.34083858132362366, + "learning_rate": 3.0823651311319476e-06, + "loss": 3.9775, + "step": 81485 + }, + { + "epoch": 5.536757711645604, + "grad_norm": 0.1987905502319336, + "learning_rate": 3.0819404810436204e-06, + "loss": 3.9411, + "step": 81490 + }, + { + "epoch": 5.537097431716266, + "grad_norm": 0.3183825612068176, + "learning_rate": 3.0815158309552927e-06, + "loss": 4.0608, + "step": 81495 + }, + { + "epoch": 5.537437151786928, + "grad_norm": 0.2539103031158447, + "learning_rate": 3.081091180866966e-06, + "loss": 3.9047, + "step": 81500 + }, + { + "epoch": 5.53777687185759, + "grad_norm": 0.22974418103694916, + "learning_rate": 3.0806665307786388e-06, + "loss": 4.0936, + "step": 81505 + }, + { + "epoch": 5.538116591928251, + "grad_norm": 0.49693456292152405, + "learning_rate": 3.080241880690311e-06, + "loss": 3.8498, + "step": 81510 + }, + { + "epoch": 5.538456311998913, + "grad_norm": 0.3902360498905182, + "learning_rate": 3.079817230601984e-06, + "loss": 3.9771, + "step": 81515 + }, + { + "epoch": 5.538796032069575, + "grad_norm": 0.3575356602668762, + "learning_rate": 3.079392580513657e-06, + "loss": 3.8496, + "step": 81520 + }, + { + "epoch": 5.539135752140236, + "grad_norm": 0.24917642772197723, + "learning_rate": 3.07896793042533e-06, + "loss": 3.8919, + "step": 81525 + }, + { + "epoch": 5.539475472210898, + "grad_norm": 0.23488357663154602, + "learning_rate": 3.0785432803370024e-06, + "loss": 4.025, + "step": 81530 + }, + { + "epoch": 5.53981519228156, + "grad_norm": 0.22857467830181122, + "learning_rate": 3.0781186302486756e-06, + "loss": 4.175, + "step": 81535 + }, + { + "epoch": 5.540154912352222, + "grad_norm": 0.35787078738212585, + "learning_rate": 3.0776939801603484e-06, + "loss": 4.1127, + "step": 81540 + }, + { + "epoch": 5.540494632422884, + "grad_norm": 0.24730227887630463, + "learning_rate": 3.0772693300720208e-06, + "loss": 4.0851, + "step": 81545 + }, + { + "epoch": 5.540834352493546, + "grad_norm": 0.21218155324459076, + "learning_rate": 3.0768446799836936e-06, + "loss": 3.9043, + "step": 81550 + }, + { + "epoch": 5.541174072564207, + "grad_norm": 0.2496686726808548, + "learning_rate": 3.0764200298953668e-06, + "loss": 3.832, + "step": 81555 + }, + { + "epoch": 5.541513792634869, + "grad_norm": 0.2254050374031067, + "learning_rate": 3.075995379807039e-06, + "loss": 4.1026, + "step": 81560 + }, + { + "epoch": 5.541853512705531, + "grad_norm": 0.25343987345695496, + "learning_rate": 3.075570729718712e-06, + "loss": 3.7653, + "step": 81565 + }, + { + "epoch": 5.542193232776192, + "grad_norm": 0.2927451431751251, + "learning_rate": 3.075146079630385e-06, + "loss": 3.9071, + "step": 81570 + }, + { + "epoch": 5.542532952846854, + "grad_norm": 0.30601680278778076, + "learning_rate": 3.0747214295420576e-06, + "loss": 3.8739, + "step": 81575 + }, + { + "epoch": 5.542872672917516, + "grad_norm": 0.2707725763320923, + "learning_rate": 3.0742967794537304e-06, + "loss": 4.0312, + "step": 81580 + }, + { + "epoch": 5.543212392988178, + "grad_norm": 0.5790656208992004, + "learning_rate": 3.073872129365403e-06, + "loss": 3.9049, + "step": 81585 + }, + { + "epoch": 5.54355211305884, + "grad_norm": 0.6734880208969116, + "learning_rate": 3.073447479277076e-06, + "loss": 3.9713, + "step": 81590 + }, + { + "epoch": 5.543891833129502, + "grad_norm": 0.23047108948230743, + "learning_rate": 3.0730228291887488e-06, + "loss": 3.883, + "step": 81595 + }, + { + "epoch": 5.544231553200163, + "grad_norm": 0.27298951148986816, + "learning_rate": 3.0725981791004216e-06, + "loss": 3.7183, + "step": 81600 + }, + { + "epoch": 5.544571273270825, + "grad_norm": 0.22934632003307343, + "learning_rate": 3.072173529012094e-06, + "loss": 3.9239, + "step": 81605 + }, + { + "epoch": 5.544910993341487, + "grad_norm": 0.21506638824939728, + "learning_rate": 3.071748878923767e-06, + "loss": 3.8107, + "step": 81610 + }, + { + "epoch": 5.545250713412148, + "grad_norm": 0.29513755440711975, + "learning_rate": 3.07132422883544e-06, + "loss": 3.8568, + "step": 81615 + }, + { + "epoch": 5.54559043348281, + "grad_norm": 0.4704112708568573, + "learning_rate": 3.0708995787471123e-06, + "loss": 3.7154, + "step": 81620 + }, + { + "epoch": 5.545930153553472, + "grad_norm": 0.2598963975906372, + "learning_rate": 3.0704749286587856e-06, + "loss": 3.8024, + "step": 81625 + }, + { + "epoch": 5.546269873624134, + "grad_norm": 0.34745466709136963, + "learning_rate": 3.0700502785704584e-06, + "loss": 3.7901, + "step": 81630 + }, + { + "epoch": 5.546609593694796, + "grad_norm": 0.22305169701576233, + "learning_rate": 3.0696256284821307e-06, + "loss": 3.9719, + "step": 81635 + }, + { + "epoch": 5.546949313765458, + "grad_norm": 0.3077409863471985, + "learning_rate": 3.0692009783938035e-06, + "loss": 3.6595, + "step": 81640 + }, + { + "epoch": 5.547289033836119, + "grad_norm": 0.23973870277404785, + "learning_rate": 3.0687763283054768e-06, + "loss": 3.8124, + "step": 81645 + }, + { + "epoch": 5.547628753906781, + "grad_norm": 0.1953009068965912, + "learning_rate": 3.068351678217149e-06, + "loss": 4.074, + "step": 81650 + }, + { + "epoch": 5.547968473977443, + "grad_norm": 0.3736514151096344, + "learning_rate": 3.067927028128822e-06, + "loss": 3.7173, + "step": 81655 + }, + { + "epoch": 5.548308194048104, + "grad_norm": 0.225138857960701, + "learning_rate": 3.067502378040495e-06, + "loss": 3.8802, + "step": 81660 + }, + { + "epoch": 5.548647914118766, + "grad_norm": 0.27579638361930847, + "learning_rate": 3.0670777279521675e-06, + "loss": 4.0781, + "step": 81665 + }, + { + "epoch": 5.5489876341894275, + "grad_norm": 0.28967538475990295, + "learning_rate": 3.0666530778638404e-06, + "loss": 4.1791, + "step": 81670 + }, + { + "epoch": 5.54932735426009, + "grad_norm": 0.2623479962348938, + "learning_rate": 3.066228427775513e-06, + "loss": 3.7642, + "step": 81675 + }, + { + "epoch": 5.549667074330752, + "grad_norm": 0.28719279170036316, + "learning_rate": 3.0658037776871855e-06, + "loss": 3.972, + "step": 81680 + }, + { + "epoch": 5.550006794401413, + "grad_norm": 0.20407600700855255, + "learning_rate": 3.0653791275988588e-06, + "loss": 3.8998, + "step": 81685 + }, + { + "epoch": 5.550346514472075, + "grad_norm": 0.28158944845199585, + "learning_rate": 3.0649544775105316e-06, + "loss": 4.0296, + "step": 81690 + }, + { + "epoch": 5.550686234542737, + "grad_norm": 0.3293704688549042, + "learning_rate": 3.0645298274222048e-06, + "loss": 3.9633, + "step": 81695 + }, + { + "epoch": 5.551025954613398, + "grad_norm": 0.25440338253974915, + "learning_rate": 3.064105177333877e-06, + "loss": 4.0395, + "step": 81700 + }, + { + "epoch": 5.55136567468406, + "grad_norm": 0.23097063601016998, + "learning_rate": 3.06368052724555e-06, + "loss": 3.8872, + "step": 81705 + }, + { + "epoch": 5.551705394754722, + "grad_norm": 0.36738649010658264, + "learning_rate": 3.0632558771572228e-06, + "loss": 3.6465, + "step": 81710 + }, + { + "epoch": 5.5520451148253835, + "grad_norm": 0.27260103821754456, + "learning_rate": 3.0628312270688956e-06, + "loss": 3.9365, + "step": 81715 + }, + { + "epoch": 5.552384834896046, + "grad_norm": 0.30575326085090637, + "learning_rate": 3.0624065769805684e-06, + "loss": 3.9985, + "step": 81720 + }, + { + "epoch": 5.552724554966708, + "grad_norm": 0.25786349177360535, + "learning_rate": 3.061981926892241e-06, + "loss": 4.1122, + "step": 81725 + }, + { + "epoch": 5.553064275037369, + "grad_norm": 0.3193894028663635, + "learning_rate": 3.0615572768039135e-06, + "loss": 3.7941, + "step": 81730 + }, + { + "epoch": 5.553403995108031, + "grad_norm": 0.2956659197807312, + "learning_rate": 3.0611326267155868e-06, + "loss": 3.9248, + "step": 81735 + }, + { + "epoch": 5.553743715178693, + "grad_norm": 0.2504642605781555, + "learning_rate": 3.0607079766272596e-06, + "loss": 4.0486, + "step": 81740 + }, + { + "epoch": 5.554083435249354, + "grad_norm": 0.24831892549991608, + "learning_rate": 3.060283326538932e-06, + "loss": 3.9434, + "step": 81745 + }, + { + "epoch": 5.554423155320016, + "grad_norm": 0.28310123085975647, + "learning_rate": 3.059858676450605e-06, + "loss": 3.7685, + "step": 81750 + }, + { + "epoch": 5.554762875390678, + "grad_norm": 0.36094292998313904, + "learning_rate": 3.059434026362278e-06, + "loss": 3.8173, + "step": 81755 + }, + { + "epoch": 5.5551025954613396, + "grad_norm": 0.23008720576763153, + "learning_rate": 3.0590093762739503e-06, + "loss": 3.8964, + "step": 81760 + }, + { + "epoch": 5.555442315532002, + "grad_norm": 0.31443190574645996, + "learning_rate": 3.058584726185623e-06, + "loss": 3.8339, + "step": 81765 + }, + { + "epoch": 5.555782035602664, + "grad_norm": 0.24820590019226074, + "learning_rate": 3.0581600760972964e-06, + "loss": 3.7491, + "step": 81770 + }, + { + "epoch": 5.556121755673325, + "grad_norm": 0.3329440653324127, + "learning_rate": 3.0577354260089687e-06, + "loss": 3.9782, + "step": 81775 + }, + { + "epoch": 5.556461475743987, + "grad_norm": 0.3171837627887726, + "learning_rate": 3.0573107759206415e-06, + "loss": 3.9338, + "step": 81780 + }, + { + "epoch": 5.556801195814649, + "grad_norm": 0.23903188109397888, + "learning_rate": 3.0568861258323148e-06, + "loss": 4.0234, + "step": 81785 + }, + { + "epoch": 5.55714091588531, + "grad_norm": 0.2798403799533844, + "learning_rate": 3.056461475743987e-06, + "loss": 3.88, + "step": 81790 + }, + { + "epoch": 5.557480635955972, + "grad_norm": 0.2248610258102417, + "learning_rate": 3.05603682565566e-06, + "loss": 3.7635, + "step": 81795 + }, + { + "epoch": 5.557820356026634, + "grad_norm": 0.3291245400905609, + "learning_rate": 3.0556121755673327e-06, + "loss": 3.8608, + "step": 81800 + }, + { + "epoch": 5.558160076097296, + "grad_norm": 0.3186876177787781, + "learning_rate": 3.055187525479005e-06, + "loss": 4.0902, + "step": 81805 + }, + { + "epoch": 5.558499796167958, + "grad_norm": 0.23233577609062195, + "learning_rate": 3.0547628753906783e-06, + "loss": 3.6932, + "step": 81810 + }, + { + "epoch": 5.55883951623862, + "grad_norm": 0.39467477798461914, + "learning_rate": 3.054338225302351e-06, + "loss": 3.9101, + "step": 81815 + }, + { + "epoch": 5.559179236309281, + "grad_norm": 0.293229341506958, + "learning_rate": 3.0539135752140235e-06, + "loss": 4.1707, + "step": 81820 + }, + { + "epoch": 5.559518956379943, + "grad_norm": 0.2916674315929413, + "learning_rate": 3.0534889251256968e-06, + "loss": 3.9128, + "step": 81825 + }, + { + "epoch": 5.559858676450605, + "grad_norm": 0.29738137125968933, + "learning_rate": 3.0530642750373696e-06, + "loss": 4.0935, + "step": 81830 + }, + { + "epoch": 5.560198396521266, + "grad_norm": 0.27357086539268494, + "learning_rate": 3.052639624949042e-06, + "loss": 3.8522, + "step": 81835 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 0.251128613948822, + "learning_rate": 3.0522149748607147e-06, + "loss": 3.9507, + "step": 81840 + }, + { + "epoch": 5.56087783666259, + "grad_norm": 0.2737695574760437, + "learning_rate": 3.051790324772388e-06, + "loss": 3.9216, + "step": 81845 + }, + { + "epoch": 5.561217556733252, + "grad_norm": 0.25760316848754883, + "learning_rate": 3.0513656746840603e-06, + "loss": 3.7802, + "step": 81850 + }, + { + "epoch": 5.561557276803914, + "grad_norm": 0.3128455579280853, + "learning_rate": 3.050941024595733e-06, + "loss": 4.0425, + "step": 81855 + }, + { + "epoch": 5.561896996874576, + "grad_norm": 0.2532031238079071, + "learning_rate": 3.0505163745074064e-06, + "loss": 3.9119, + "step": 81860 + }, + { + "epoch": 5.562236716945237, + "grad_norm": 0.30717042088508606, + "learning_rate": 3.050091724419079e-06, + "loss": 3.6613, + "step": 81865 + }, + { + "epoch": 5.562576437015899, + "grad_norm": 0.27718114852905273, + "learning_rate": 3.0496670743307515e-06, + "loss": 3.9678, + "step": 81870 + }, + { + "epoch": 5.56291615708656, + "grad_norm": 0.24284641444683075, + "learning_rate": 3.0492424242424248e-06, + "loss": 3.7043, + "step": 81875 + }, + { + "epoch": 5.563255877157222, + "grad_norm": 0.25817030668258667, + "learning_rate": 3.0488177741540976e-06, + "loss": 3.7042, + "step": 81880 + }, + { + "epoch": 5.563595597227884, + "grad_norm": 0.2637348473072052, + "learning_rate": 3.04839312406577e-06, + "loss": 3.79, + "step": 81885 + }, + { + "epoch": 5.5639353172985455, + "grad_norm": 0.24846671521663666, + "learning_rate": 3.0479684739774427e-06, + "loss": 4.0324, + "step": 81890 + }, + { + "epoch": 5.564275037369208, + "grad_norm": 0.23359549045562744, + "learning_rate": 3.047543823889116e-06, + "loss": 3.7225, + "step": 81895 + }, + { + "epoch": 5.56461475743987, + "grad_norm": 0.2442067265510559, + "learning_rate": 3.0471191738007883e-06, + "loss": 3.7847, + "step": 81900 + }, + { + "epoch": 5.564954477510531, + "grad_norm": 0.3345656096935272, + "learning_rate": 3.046694523712461e-06, + "loss": 3.9529, + "step": 81905 + }, + { + "epoch": 5.565294197581193, + "grad_norm": 0.2515348196029663, + "learning_rate": 3.0462698736241344e-06, + "loss": 3.9923, + "step": 81910 + }, + { + "epoch": 5.565633917651855, + "grad_norm": 0.2394503653049469, + "learning_rate": 3.0458452235358067e-06, + "loss": 3.8557, + "step": 81915 + }, + { + "epoch": 5.565973637722516, + "grad_norm": 0.3007236421108246, + "learning_rate": 3.0454205734474795e-06, + "loss": 3.6154, + "step": 81920 + }, + { + "epoch": 5.566313357793178, + "grad_norm": 0.3181275427341461, + "learning_rate": 3.0449959233591523e-06, + "loss": 4.0003, + "step": 81925 + }, + { + "epoch": 5.56665307786384, + "grad_norm": 0.21575595438480377, + "learning_rate": 3.0445712732708247e-06, + "loss": 3.8821, + "step": 81930 + }, + { + "epoch": 5.5669927979345015, + "grad_norm": 0.28208455443382263, + "learning_rate": 3.044146623182498e-06, + "loss": 4.0189, + "step": 81935 + }, + { + "epoch": 5.567332518005164, + "grad_norm": 0.20375823974609375, + "learning_rate": 3.0437219730941707e-06, + "loss": 4.2089, + "step": 81940 + }, + { + "epoch": 5.567672238075826, + "grad_norm": 0.29376786947250366, + "learning_rate": 3.043297323005843e-06, + "loss": 3.8799, + "step": 81945 + }, + { + "epoch": 5.568011958146487, + "grad_norm": 0.26675885915756226, + "learning_rate": 3.0428726729175163e-06, + "loss": 4.2007, + "step": 81950 + }, + { + "epoch": 5.568351678217149, + "grad_norm": 0.28356555104255676, + "learning_rate": 3.042448022829189e-06, + "loss": 3.7908, + "step": 81955 + }, + { + "epoch": 5.568691398287811, + "grad_norm": 0.25511980056762695, + "learning_rate": 3.0420233727408615e-06, + "loss": 3.9926, + "step": 81960 + }, + { + "epoch": 5.569031118358472, + "grad_norm": 0.2576158940792084, + "learning_rate": 3.0415987226525343e-06, + "loss": 3.8654, + "step": 81965 + }, + { + "epoch": 5.569370838429134, + "grad_norm": 0.20811928808689117, + "learning_rate": 3.0411740725642075e-06, + "loss": 4.0599, + "step": 81970 + }, + { + "epoch": 5.569710558499796, + "grad_norm": 0.2865680158138275, + "learning_rate": 3.04074942247588e-06, + "loss": 3.8446, + "step": 81975 + }, + { + "epoch": 5.5700502785704575, + "grad_norm": 0.28277450799942017, + "learning_rate": 3.0403247723875527e-06, + "loss": 3.9473, + "step": 81980 + }, + { + "epoch": 5.57038999864112, + "grad_norm": 0.23871108889579773, + "learning_rate": 3.039900122299226e-06, + "loss": 3.8989, + "step": 81985 + }, + { + "epoch": 5.570729718711782, + "grad_norm": 0.33005547523498535, + "learning_rate": 3.0394754722108983e-06, + "loss": 3.7196, + "step": 81990 + }, + { + "epoch": 5.571069438782443, + "grad_norm": 0.26059797406196594, + "learning_rate": 3.039050822122571e-06, + "loss": 4.0116, + "step": 81995 + }, + { + "epoch": 5.571409158853105, + "grad_norm": 0.28152719140052795, + "learning_rate": 3.0386261720342444e-06, + "loss": 3.8972, + "step": 82000 + }, + { + "epoch": 5.571748878923767, + "grad_norm": 0.2716578245162964, + "learning_rate": 3.0382015219459167e-06, + "loss": 3.8124, + "step": 82005 + }, + { + "epoch": 5.572088598994428, + "grad_norm": 0.4528976380825043, + "learning_rate": 3.0377768718575895e-06, + "loss": 3.9194, + "step": 82010 + }, + { + "epoch": 5.57242831906509, + "grad_norm": 0.37145018577575684, + "learning_rate": 3.0373522217692623e-06, + "loss": 4.0162, + "step": 82015 + }, + { + "epoch": 5.572768039135752, + "grad_norm": 0.2847193777561188, + "learning_rate": 3.0369275716809347e-06, + "loss": 3.8084, + "step": 82020 + }, + { + "epoch": 5.5731077592064135, + "grad_norm": 0.19781336188316345, + "learning_rate": 3.036502921592608e-06, + "loss": 4.0843, + "step": 82025 + }, + { + "epoch": 5.573447479277076, + "grad_norm": 0.2407667189836502, + "learning_rate": 3.0360782715042807e-06, + "loss": 3.9892, + "step": 82030 + }, + { + "epoch": 5.573787199347738, + "grad_norm": 0.24967540800571442, + "learning_rate": 3.035653621415954e-06, + "loss": 4.0922, + "step": 82035 + }, + { + "epoch": 5.574126919418399, + "grad_norm": 0.24083290994167328, + "learning_rate": 3.0352289713276263e-06, + "loss": 3.9934, + "step": 82040 + }, + { + "epoch": 5.574466639489061, + "grad_norm": 0.23597192764282227, + "learning_rate": 3.034804321239299e-06, + "loss": 3.7752, + "step": 82045 + }, + { + "epoch": 5.574806359559723, + "grad_norm": 0.23850874602794647, + "learning_rate": 3.034379671150972e-06, + "loss": 3.8584, + "step": 82050 + }, + { + "epoch": 5.575146079630384, + "grad_norm": 0.23821398615837097, + "learning_rate": 3.0339550210626443e-06, + "loss": 3.9524, + "step": 82055 + }, + { + "epoch": 5.575485799701046, + "grad_norm": 0.2883770763874054, + "learning_rate": 3.0335303709743175e-06, + "loss": 3.8406, + "step": 82060 + }, + { + "epoch": 5.575825519771708, + "grad_norm": 0.2609579265117645, + "learning_rate": 3.0331057208859903e-06, + "loss": 3.9761, + "step": 82065 + }, + { + "epoch": 5.57616523984237, + "grad_norm": 0.2007318139076233, + "learning_rate": 3.0326810707976627e-06, + "loss": 3.9409, + "step": 82070 + }, + { + "epoch": 5.576504959913032, + "grad_norm": 0.3328157663345337, + "learning_rate": 3.032256420709336e-06, + "loss": 3.983, + "step": 82075 + }, + { + "epoch": 5.576844679983694, + "grad_norm": 0.2240237295627594, + "learning_rate": 3.0318317706210087e-06, + "loss": 3.8016, + "step": 82080 + }, + { + "epoch": 5.577184400054355, + "grad_norm": 0.290890097618103, + "learning_rate": 3.031407120532681e-06, + "loss": 3.9894, + "step": 82085 + }, + { + "epoch": 5.577524120125017, + "grad_norm": 0.21572504937648773, + "learning_rate": 3.030982470444354e-06, + "loss": 3.9304, + "step": 82090 + }, + { + "epoch": 5.577863840195679, + "grad_norm": 0.2803897559642792, + "learning_rate": 3.030557820356027e-06, + "loss": 3.8999, + "step": 82095 + }, + { + "epoch": 5.57820356026634, + "grad_norm": 0.43754175305366516, + "learning_rate": 3.0301331702676995e-06, + "loss": 3.9936, + "step": 82100 + }, + { + "epoch": 5.578543280337002, + "grad_norm": 0.26717492938041687, + "learning_rate": 3.0297085201793723e-06, + "loss": 4.0409, + "step": 82105 + }, + { + "epoch": 5.578883000407664, + "grad_norm": 0.2423829585313797, + "learning_rate": 3.0292838700910455e-06, + "loss": 3.7924, + "step": 82110 + }, + { + "epoch": 5.579222720478326, + "grad_norm": 0.3612699806690216, + "learning_rate": 3.028859220002718e-06, + "loss": 3.7733, + "step": 82115 + }, + { + "epoch": 5.579562440548988, + "grad_norm": 0.279966801404953, + "learning_rate": 3.0284345699143907e-06, + "loss": 3.8933, + "step": 82120 + }, + { + "epoch": 5.57990216061965, + "grad_norm": 0.2322302907705307, + "learning_rate": 3.0280099198260635e-06, + "loss": 4.0427, + "step": 82125 + }, + { + "epoch": 5.580241880690311, + "grad_norm": 0.2611137926578522, + "learning_rate": 3.0275852697377363e-06, + "loss": 3.819, + "step": 82130 + }, + { + "epoch": 5.580581600760973, + "grad_norm": 0.36426863074302673, + "learning_rate": 3.027160619649409e-06, + "loss": 4.0355, + "step": 82135 + }, + { + "epoch": 5.580921320831635, + "grad_norm": 0.24809321761131287, + "learning_rate": 3.026735969561082e-06, + "loss": 3.8764, + "step": 82140 + }, + { + "epoch": 5.581261040902296, + "grad_norm": 0.31540364027023315, + "learning_rate": 3.0263113194727543e-06, + "loss": 4.1437, + "step": 82145 + }, + { + "epoch": 5.581600760972958, + "grad_norm": 0.3358740508556366, + "learning_rate": 3.0258866693844275e-06, + "loss": 4.0554, + "step": 82150 + }, + { + "epoch": 5.58194048104362, + "grad_norm": 0.25060731172561646, + "learning_rate": 3.0254620192961003e-06, + "loss": 3.8969, + "step": 82155 + }, + { + "epoch": 5.582280201114282, + "grad_norm": 0.29433321952819824, + "learning_rate": 3.0250373692077727e-06, + "loss": 4.0017, + "step": 82160 + }, + { + "epoch": 5.582619921184944, + "grad_norm": 0.2208711802959442, + "learning_rate": 3.024612719119446e-06, + "loss": 3.6753, + "step": 82165 + }, + { + "epoch": 5.582959641255606, + "grad_norm": 0.30905595421791077, + "learning_rate": 3.0241880690311187e-06, + "loss": 4.0616, + "step": 82170 + }, + { + "epoch": 5.583299361326267, + "grad_norm": 0.2579016089439392, + "learning_rate": 3.023763418942791e-06, + "loss": 4.0448, + "step": 82175 + }, + { + "epoch": 5.583639081396929, + "grad_norm": 0.244270920753479, + "learning_rate": 3.023338768854464e-06, + "loss": 3.8394, + "step": 82180 + }, + { + "epoch": 5.583978801467591, + "grad_norm": 0.27726542949676514, + "learning_rate": 3.022914118766137e-06, + "loss": 3.8597, + "step": 82185 + }, + { + "epoch": 5.584318521538252, + "grad_norm": 0.3481256663799286, + "learning_rate": 3.0224894686778095e-06, + "loss": 4.0904, + "step": 82190 + }, + { + "epoch": 5.584658241608914, + "grad_norm": 0.2764657139778137, + "learning_rate": 3.0220648185894823e-06, + "loss": 3.8703, + "step": 82195 + }, + { + "epoch": 5.584997961679576, + "grad_norm": 0.21356244385242462, + "learning_rate": 3.0216401685011555e-06, + "loss": 4.1012, + "step": 82200 + }, + { + "epoch": 5.585337681750238, + "grad_norm": 0.27199769020080566, + "learning_rate": 3.0212155184128283e-06, + "loss": 3.813, + "step": 82205 + }, + { + "epoch": 5.5856774018209, + "grad_norm": 0.2768072485923767, + "learning_rate": 3.0207908683245007e-06, + "loss": 3.7939, + "step": 82210 + }, + { + "epoch": 5.586017121891562, + "grad_norm": 0.2551281750202179, + "learning_rate": 3.0203662182361735e-06, + "loss": 3.941, + "step": 82215 + }, + { + "epoch": 5.586356841962223, + "grad_norm": 0.2403944879770279, + "learning_rate": 3.0199415681478467e-06, + "loss": 4.0285, + "step": 82220 + }, + { + "epoch": 5.586696562032885, + "grad_norm": 0.28749942779541016, + "learning_rate": 3.019516918059519e-06, + "loss": 4.0066, + "step": 82225 + }, + { + "epoch": 5.587036282103547, + "grad_norm": 0.26200172305107117, + "learning_rate": 3.019092267971192e-06, + "loss": 3.9128, + "step": 82230 + }, + { + "epoch": 5.587376002174208, + "grad_norm": 0.2995709478855133, + "learning_rate": 3.018667617882865e-06, + "loss": 4.0905, + "step": 82235 + }, + { + "epoch": 5.58771572224487, + "grad_norm": 0.3276079595088959, + "learning_rate": 3.0182429677945375e-06, + "loss": 3.7028, + "step": 82240 + }, + { + "epoch": 5.588055442315532, + "grad_norm": 0.2166028618812561, + "learning_rate": 3.0178183177062103e-06, + "loss": 3.9819, + "step": 82245 + }, + { + "epoch": 5.588395162386194, + "grad_norm": 0.24042347073554993, + "learning_rate": 3.017393667617883e-06, + "loss": 3.8648, + "step": 82250 + }, + { + "epoch": 5.588734882456856, + "grad_norm": 0.22635182738304138, + "learning_rate": 3.016969017529556e-06, + "loss": 4.0268, + "step": 82255 + }, + { + "epoch": 5.589074602527518, + "grad_norm": 0.2179947793483734, + "learning_rate": 3.0165443674412287e-06, + "loss": 4.0225, + "step": 82260 + }, + { + "epoch": 5.589414322598179, + "grad_norm": 0.2352338582277298, + "learning_rate": 3.0161197173529015e-06, + "loss": 3.9056, + "step": 82265 + }, + { + "epoch": 5.589754042668841, + "grad_norm": 0.30370041728019714, + "learning_rate": 3.015695067264574e-06, + "loss": 4.1573, + "step": 82270 + }, + { + "epoch": 5.590093762739503, + "grad_norm": 0.39868223667144775, + "learning_rate": 3.015270417176247e-06, + "loss": 3.8467, + "step": 82275 + }, + { + "epoch": 5.590433482810164, + "grad_norm": 0.2786439061164856, + "learning_rate": 3.01484576708792e-06, + "loss": 4.0416, + "step": 82280 + }, + { + "epoch": 5.590773202880826, + "grad_norm": 0.23296302556991577, + "learning_rate": 3.0144211169995923e-06, + "loss": 3.7422, + "step": 82285 + }, + { + "epoch": 5.591112922951488, + "grad_norm": 0.4314787983894348, + "learning_rate": 3.0139964669112655e-06, + "loss": 3.7761, + "step": 82290 + }, + { + "epoch": 5.59145264302215, + "grad_norm": 0.280308336019516, + "learning_rate": 3.0135718168229383e-06, + "loss": 3.7488, + "step": 82295 + }, + { + "epoch": 5.591792363092812, + "grad_norm": 0.21629706025123596, + "learning_rate": 3.0131471667346107e-06, + "loss": 4.0843, + "step": 82300 + }, + { + "epoch": 5.592132083163474, + "grad_norm": 0.20753943920135498, + "learning_rate": 3.0127225166462835e-06, + "loss": 3.8193, + "step": 82305 + }, + { + "epoch": 5.592471803234135, + "grad_norm": 0.2741736173629761, + "learning_rate": 3.0122978665579567e-06, + "loss": 3.8189, + "step": 82310 + }, + { + "epoch": 5.592811523304797, + "grad_norm": 0.5807345509529114, + "learning_rate": 3.011873216469629e-06, + "loss": 4.0203, + "step": 82315 + }, + { + "epoch": 5.593151243375459, + "grad_norm": 0.24186643958091736, + "learning_rate": 3.011448566381302e-06, + "loss": 3.8322, + "step": 82320 + }, + { + "epoch": 5.59349096344612, + "grad_norm": 0.3388252854347229, + "learning_rate": 3.011023916292975e-06, + "loss": 3.9446, + "step": 82325 + }, + { + "epoch": 5.593830683516782, + "grad_norm": 0.30717697739601135, + "learning_rate": 3.0105992662046475e-06, + "loss": 4.0538, + "step": 82330 + }, + { + "epoch": 5.594170403587444, + "grad_norm": 0.2372804582118988, + "learning_rate": 3.0101746161163203e-06, + "loss": 3.8686, + "step": 82335 + }, + { + "epoch": 5.594510123658106, + "grad_norm": 0.5136905312538147, + "learning_rate": 3.009749966027993e-06, + "loss": 4.0093, + "step": 82340 + }, + { + "epoch": 5.594849843728768, + "grad_norm": 0.2513117492198944, + "learning_rate": 3.009325315939666e-06, + "loss": 3.8631, + "step": 82345 + }, + { + "epoch": 5.595189563799429, + "grad_norm": 0.2696211636066437, + "learning_rate": 3.0089006658513387e-06, + "loss": 3.457, + "step": 82350 + }, + { + "epoch": 5.595529283870091, + "grad_norm": 0.24760358035564423, + "learning_rate": 3.0084760157630115e-06, + "loss": 4.4318, + "step": 82355 + }, + { + "epoch": 5.595869003940753, + "grad_norm": 0.19264158606529236, + "learning_rate": 3.008051365674684e-06, + "loss": 3.9487, + "step": 82360 + }, + { + "epoch": 5.596208724011414, + "grad_norm": 0.2454599142074585, + "learning_rate": 3.007626715586357e-06, + "loss": 4.0489, + "step": 82365 + }, + { + "epoch": 5.596548444082076, + "grad_norm": 0.32515212893486023, + "learning_rate": 3.00720206549803e-06, + "loss": 4.1242, + "step": 82370 + }, + { + "epoch": 5.596888164152738, + "grad_norm": 0.29177191853523254, + "learning_rate": 3.0067774154097027e-06, + "loss": 3.6818, + "step": 82375 + }, + { + "epoch": 5.5972278842234, + "grad_norm": 0.28237611055374146, + "learning_rate": 3.0063527653213755e-06, + "loss": 3.9, + "step": 82380 + }, + { + "epoch": 5.597567604294062, + "grad_norm": 0.277739554643631, + "learning_rate": 3.0059281152330483e-06, + "loss": 3.7984, + "step": 82385 + }, + { + "epoch": 5.597907324364724, + "grad_norm": 0.26888853311538696, + "learning_rate": 3.005503465144721e-06, + "loss": 3.9437, + "step": 82390 + }, + { + "epoch": 5.598247044435385, + "grad_norm": 0.21666283905506134, + "learning_rate": 3.0050788150563935e-06, + "loss": 3.9773, + "step": 82395 + }, + { + "epoch": 5.598586764506047, + "grad_norm": 0.27745604515075684, + "learning_rate": 3.0046541649680667e-06, + "loss": 3.986, + "step": 82400 + }, + { + "epoch": 5.598926484576709, + "grad_norm": 0.2621111273765564, + "learning_rate": 3.0042295148797395e-06, + "loss": 3.8047, + "step": 82405 + }, + { + "epoch": 5.59926620464737, + "grad_norm": 0.20857074856758118, + "learning_rate": 3.003804864791412e-06, + "loss": 3.8806, + "step": 82410 + }, + { + "epoch": 5.599605924718032, + "grad_norm": 0.403694212436676, + "learning_rate": 3.003380214703085e-06, + "loss": 4.2537, + "step": 82415 + }, + { + "epoch": 5.599945644788694, + "grad_norm": 0.30225884914398193, + "learning_rate": 3.002955564614758e-06, + "loss": 3.9517, + "step": 82420 + }, + { + "epoch": 5.600285364859356, + "grad_norm": 0.23128202557563782, + "learning_rate": 3.0025309145264303e-06, + "loss": 3.7741, + "step": 82425 + }, + { + "epoch": 5.600625084930018, + "grad_norm": 0.5114423632621765, + "learning_rate": 3.002106264438103e-06, + "loss": 3.9141, + "step": 82430 + }, + { + "epoch": 5.60096480500068, + "grad_norm": 0.23650673031806946, + "learning_rate": 3.0016816143497763e-06, + "loss": 3.9417, + "step": 82435 + }, + { + "epoch": 5.601304525071341, + "grad_norm": 0.20046952366828918, + "learning_rate": 3.0012569642614487e-06, + "loss": 3.8691, + "step": 82440 + }, + { + "epoch": 5.601644245142003, + "grad_norm": 0.19406308233737946, + "learning_rate": 3.0008323141731215e-06, + "loss": 3.9141, + "step": 82445 + }, + { + "epoch": 5.601983965212665, + "grad_norm": 0.3271539807319641, + "learning_rate": 3.0004076640847947e-06, + "loss": 3.9785, + "step": 82450 + }, + { + "epoch": 5.602323685283326, + "grad_norm": 0.25190460681915283, + "learning_rate": 2.999983013996467e-06, + "loss": 4.0074, + "step": 82455 + }, + { + "epoch": 5.602663405353988, + "grad_norm": NaN, + "learning_rate": 2.999643293925805e-06, + "loss": 3.762, + "step": 82460 + }, + { + "epoch": 5.60300312542465, + "grad_norm": 0.43581902980804443, + "learning_rate": 2.999218643837478e-06, + "loss": 3.9968, + "step": 82465 + }, + { + "epoch": 5.603342845495312, + "grad_norm": 0.2841743528842926, + "learning_rate": 2.998793993749151e-06, + "loss": 4.0918, + "step": 82470 + }, + { + "epoch": 5.603682565565974, + "grad_norm": 0.2836495637893677, + "learning_rate": 2.9983693436608236e-06, + "loss": 3.8904, + "step": 82475 + }, + { + "epoch": 5.604022285636636, + "grad_norm": 0.2842804789543152, + "learning_rate": 2.9979446935724964e-06, + "loss": 4.0032, + "step": 82480 + }, + { + "epoch": 5.604362005707297, + "grad_norm": 0.2250985950231552, + "learning_rate": 2.9975200434841696e-06, + "loss": 3.6747, + "step": 82485 + }, + { + "epoch": 5.604701725777959, + "grad_norm": 0.21217060089111328, + "learning_rate": 2.997095393395842e-06, + "loss": 3.6854, + "step": 82490 + }, + { + "epoch": 5.605041445848621, + "grad_norm": 0.2681633234024048, + "learning_rate": 2.9966707433075148e-06, + "loss": 3.8065, + "step": 82495 + }, + { + "epoch": 5.605381165919282, + "grad_norm": 0.24571815133094788, + "learning_rate": 2.996246093219188e-06, + "loss": 3.8268, + "step": 82500 + }, + { + "epoch": 5.605720885989944, + "grad_norm": 0.3364690840244293, + "learning_rate": 2.9958214431308604e-06, + "loss": 4.0295, + "step": 82505 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 0.20742739737033844, + "learning_rate": 2.995396793042533e-06, + "loss": 3.8593, + "step": 82510 + }, + { + "epoch": 5.606400326131268, + "grad_norm": 0.23206669092178345, + "learning_rate": 2.994972142954206e-06, + "loss": 3.9694, + "step": 82515 + }, + { + "epoch": 5.60674004620193, + "grad_norm": 0.21076218783855438, + "learning_rate": 2.9945474928658784e-06, + "loss": 3.8995, + "step": 82520 + }, + { + "epoch": 5.607079766272592, + "grad_norm": 0.21284309029579163, + "learning_rate": 2.9941228427775516e-06, + "loss": 4.0033, + "step": 82525 + }, + { + "epoch": 5.607419486343253, + "grad_norm": 0.3538444936275482, + "learning_rate": 2.9936981926892244e-06, + "loss": 4.0966, + "step": 82530 + }, + { + "epoch": 5.607759206413915, + "grad_norm": 0.22918707132339478, + "learning_rate": 2.9932735426008968e-06, + "loss": 3.8885, + "step": 82535 + }, + { + "epoch": 5.608098926484577, + "grad_norm": 0.4011958837509155, + "learning_rate": 2.99284889251257e-06, + "loss": 3.8469, + "step": 82540 + }, + { + "epoch": 5.608438646555238, + "grad_norm": 0.24024619162082672, + "learning_rate": 2.992424242424243e-06, + "loss": 3.9001, + "step": 82545 + }, + { + "epoch": 5.6087783666259, + "grad_norm": 0.27416160702705383, + "learning_rate": 2.991999592335915e-06, + "loss": 4.0684, + "step": 82550 + }, + { + "epoch": 5.6091180866965615, + "grad_norm": 0.39632925391197205, + "learning_rate": 2.991574942247588e-06, + "loss": 4.0122, + "step": 82555 + }, + { + "epoch": 5.609457806767224, + "grad_norm": 0.25989603996276855, + "learning_rate": 2.991150292159261e-06, + "loss": 3.7097, + "step": 82560 + }, + { + "epoch": 5.609797526837886, + "grad_norm": 0.2190748155117035, + "learning_rate": 2.9907256420709336e-06, + "loss": 3.8965, + "step": 82565 + }, + { + "epoch": 5.610137246908547, + "grad_norm": 0.25415855646133423, + "learning_rate": 2.9903009919826064e-06, + "loss": 3.9745, + "step": 82570 + }, + { + "epoch": 5.610476966979209, + "grad_norm": 0.31287717819213867, + "learning_rate": 2.9898763418942796e-06, + "loss": 3.6519, + "step": 82575 + }, + { + "epoch": 5.610816687049871, + "grad_norm": 0.2681454122066498, + "learning_rate": 2.9894516918059524e-06, + "loss": 3.999, + "step": 82580 + }, + { + "epoch": 5.611156407120532, + "grad_norm": 0.2482173889875412, + "learning_rate": 2.9890270417176248e-06, + "loss": 3.9284, + "step": 82585 + }, + { + "epoch": 5.611496127191194, + "grad_norm": 0.31435832381248474, + "learning_rate": 2.9886023916292976e-06, + "loss": 4.0259, + "step": 82590 + }, + { + "epoch": 5.611835847261856, + "grad_norm": 0.23335927724838257, + "learning_rate": 2.988177741540971e-06, + "loss": 3.9387, + "step": 82595 + }, + { + "epoch": 5.6121755673325175, + "grad_norm": 0.23870539665222168, + "learning_rate": 2.987753091452643e-06, + "loss": 3.8694, + "step": 82600 + }, + { + "epoch": 5.61251528740318, + "grad_norm": 0.2497033327817917, + "learning_rate": 2.987328441364316e-06, + "loss": 3.7668, + "step": 82605 + }, + { + "epoch": 5.612855007473842, + "grad_norm": 0.2550201416015625, + "learning_rate": 2.986903791275989e-06, + "loss": 4.0099, + "step": 82610 + }, + { + "epoch": 5.613194727544503, + "grad_norm": 0.3610091209411621, + "learning_rate": 2.9864791411876616e-06, + "loss": 3.9606, + "step": 82615 + }, + { + "epoch": 5.613534447615165, + "grad_norm": 0.3006742596626282, + "learning_rate": 2.9860544910993344e-06, + "loss": 4.2529, + "step": 82620 + }, + { + "epoch": 5.613874167685827, + "grad_norm": 0.3030671775341034, + "learning_rate": 2.985629841011007e-06, + "loss": 4.1514, + "step": 82625 + }, + { + "epoch": 5.614213887756488, + "grad_norm": 0.2972205579280853, + "learning_rate": 2.98520519092268e-06, + "loss": 4.0164, + "step": 82630 + }, + { + "epoch": 5.61455360782715, + "grad_norm": 0.22150583565235138, + "learning_rate": 2.9847805408343528e-06, + "loss": 3.6959, + "step": 82635 + }, + { + "epoch": 5.614893327897812, + "grad_norm": 0.23251736164093018, + "learning_rate": 2.9843558907460256e-06, + "loss": 3.6937, + "step": 82640 + }, + { + "epoch": 5.615233047968474, + "grad_norm": 0.28908318281173706, + "learning_rate": 2.983931240657698e-06, + "loss": 3.7401, + "step": 82645 + }, + { + "epoch": 5.615572768039136, + "grad_norm": 0.22332435846328735, + "learning_rate": 2.983506590569371e-06, + "loss": 3.8885, + "step": 82650 + }, + { + "epoch": 5.615912488109798, + "grad_norm": 0.32455500960350037, + "learning_rate": 2.983081940481044e-06, + "loss": 3.9902, + "step": 82655 + }, + { + "epoch": 5.616252208180459, + "grad_norm": 0.20770660042762756, + "learning_rate": 2.9826572903927164e-06, + "loss": 3.9485, + "step": 82660 + }, + { + "epoch": 5.616591928251121, + "grad_norm": 0.4031413197517395, + "learning_rate": 2.9822326403043896e-06, + "loss": 3.9508, + "step": 82665 + }, + { + "epoch": 5.616931648321783, + "grad_norm": 0.32422611117362976, + "learning_rate": 2.9818079902160624e-06, + "loss": 3.7617, + "step": 82670 + }, + { + "epoch": 5.617271368392444, + "grad_norm": 0.2802262604236603, + "learning_rate": 2.9813833401277348e-06, + "loss": 4.0586, + "step": 82675 + }, + { + "epoch": 5.617611088463106, + "grad_norm": 0.24965909123420715, + "learning_rate": 2.9809586900394076e-06, + "loss": 3.7686, + "step": 82680 + }, + { + "epoch": 5.617950808533768, + "grad_norm": 0.3207235634326935, + "learning_rate": 2.980534039951081e-06, + "loss": 3.7855, + "step": 82685 + }, + { + "epoch": 5.61829052860443, + "grad_norm": 0.24650739133358002, + "learning_rate": 2.980109389862753e-06, + "loss": 3.7341, + "step": 82690 + }, + { + "epoch": 5.618630248675092, + "grad_norm": 0.19285641610622406, + "learning_rate": 2.979684739774426e-06, + "loss": 3.887, + "step": 82695 + }, + { + "epoch": 5.618969968745754, + "grad_norm": 0.3328273594379425, + "learning_rate": 2.979260089686099e-06, + "loss": 4.1355, + "step": 82700 + }, + { + "epoch": 5.619309688816415, + "grad_norm": 0.3359041213989258, + "learning_rate": 2.9788354395977716e-06, + "loss": 4.1278, + "step": 82705 + }, + { + "epoch": 5.619649408887077, + "grad_norm": 0.2870844602584839, + "learning_rate": 2.9784107895094444e-06, + "loss": 3.9728, + "step": 82710 + }, + { + "epoch": 5.619989128957739, + "grad_norm": 0.24870158731937408, + "learning_rate": 2.977986139421117e-06, + "loss": 3.8858, + "step": 82715 + }, + { + "epoch": 5.6203288490284, + "grad_norm": 0.22350744903087616, + "learning_rate": 2.97756148933279e-06, + "loss": 3.7137, + "step": 82720 + }, + { + "epoch": 5.620668569099062, + "grad_norm": 0.19873371720314026, + "learning_rate": 2.9771368392444628e-06, + "loss": 4.0129, + "step": 82725 + }, + { + "epoch": 5.621008289169724, + "grad_norm": 0.3493718206882477, + "learning_rate": 2.9767121891561356e-06, + "loss": 3.9936, + "step": 82730 + }, + { + "epoch": 5.621348009240386, + "grad_norm": 0.1970876008272171, + "learning_rate": 2.976287539067808e-06, + "loss": 3.8702, + "step": 82735 + }, + { + "epoch": 5.621687729311048, + "grad_norm": 0.22490626573562622, + "learning_rate": 2.975862888979481e-06, + "loss": 3.9707, + "step": 82740 + }, + { + "epoch": 5.62202744938171, + "grad_norm": 0.2541455626487732, + "learning_rate": 2.975438238891154e-06, + "loss": 3.9592, + "step": 82745 + }, + { + "epoch": 5.622367169452371, + "grad_norm": 0.21178412437438965, + "learning_rate": 2.9750135888028268e-06, + "loss": 4.0936, + "step": 82750 + }, + { + "epoch": 5.622706889523033, + "grad_norm": 0.26308661699295044, + "learning_rate": 2.9745889387144996e-06, + "loss": 4.0975, + "step": 82755 + }, + { + "epoch": 5.623046609593695, + "grad_norm": 0.27672627568244934, + "learning_rate": 2.9741642886261724e-06, + "loss": 3.9609, + "step": 82760 + }, + { + "epoch": 5.623386329664356, + "grad_norm": 0.31041091680526733, + "learning_rate": 2.973739638537845e-06, + "loss": 3.7758, + "step": 82765 + }, + { + "epoch": 5.623726049735018, + "grad_norm": 0.26447945833206177, + "learning_rate": 2.9733149884495176e-06, + "loss": 3.8812, + "step": 82770 + }, + { + "epoch": 5.62406576980568, + "grad_norm": 0.2437022626399994, + "learning_rate": 2.9728903383611908e-06, + "loss": 4.0174, + "step": 82775 + }, + { + "epoch": 5.624405489876342, + "grad_norm": 0.3638760447502136, + "learning_rate": 2.9724656882728636e-06, + "loss": 4.0815, + "step": 82780 + }, + { + "epoch": 5.624745209947004, + "grad_norm": 0.27313995361328125, + "learning_rate": 2.972041038184536e-06, + "loss": 4.0346, + "step": 82785 + }, + { + "epoch": 5.625084930017666, + "grad_norm": 0.23844663798809052, + "learning_rate": 2.971616388096209e-06, + "loss": 3.9016, + "step": 82790 + }, + { + "epoch": 5.625424650088327, + "grad_norm": 0.3060463070869446, + "learning_rate": 2.971191738007882e-06, + "loss": 4.1736, + "step": 82795 + }, + { + "epoch": 5.625764370158989, + "grad_norm": 0.27958357334136963, + "learning_rate": 2.9707670879195544e-06, + "loss": 4.0835, + "step": 82800 + }, + { + "epoch": 5.626104090229651, + "grad_norm": 0.36480462551116943, + "learning_rate": 2.970342437831227e-06, + "loss": 3.9087, + "step": 82805 + }, + { + "epoch": 5.626443810300312, + "grad_norm": 0.24103312194347382, + "learning_rate": 2.9699177877429004e-06, + "loss": 3.725, + "step": 82810 + }, + { + "epoch": 5.626783530370974, + "grad_norm": 0.34266263246536255, + "learning_rate": 2.9694931376545728e-06, + "loss": 3.9772, + "step": 82815 + }, + { + "epoch": 5.627123250441636, + "grad_norm": 0.3059465289115906, + "learning_rate": 2.9690684875662456e-06, + "loss": 3.9584, + "step": 82820 + }, + { + "epoch": 5.627462970512298, + "grad_norm": 0.2459665685892105, + "learning_rate": 2.9686438374779188e-06, + "loss": 3.9978, + "step": 82825 + }, + { + "epoch": 5.62780269058296, + "grad_norm": 0.32760509848594666, + "learning_rate": 2.968219187389591e-06, + "loss": 3.8241, + "step": 82830 + }, + { + "epoch": 5.628142410653622, + "grad_norm": 0.2573322355747223, + "learning_rate": 2.967794537301264e-06, + "loss": 3.6179, + "step": 82835 + }, + { + "epoch": 5.628482130724283, + "grad_norm": 0.3619418740272522, + "learning_rate": 2.9673698872129368e-06, + "loss": 4.0491, + "step": 82840 + }, + { + "epoch": 5.628821850794945, + "grad_norm": 0.24519237875938416, + "learning_rate": 2.9669452371246096e-06, + "loss": 3.8858, + "step": 82845 + }, + { + "epoch": 5.629161570865607, + "grad_norm": 0.2527708113193512, + "learning_rate": 2.9665205870362824e-06, + "loss": 3.6641, + "step": 82850 + }, + { + "epoch": 5.629501290936268, + "grad_norm": 0.33864784240722656, + "learning_rate": 2.966095936947955e-06, + "loss": 4.0581, + "step": 82855 + }, + { + "epoch": 5.62984101100693, + "grad_norm": 0.31056708097457886, + "learning_rate": 2.9656712868596275e-06, + "loss": 4.0662, + "step": 82860 + }, + { + "epoch": 5.630180731077592, + "grad_norm": 0.21123263239860535, + "learning_rate": 2.9652466367713008e-06, + "loss": 3.6866, + "step": 82865 + }, + { + "epoch": 5.630520451148254, + "grad_norm": 0.2240692675113678, + "learning_rate": 2.9648219866829736e-06, + "loss": 3.845, + "step": 82870 + }, + { + "epoch": 5.630860171218916, + "grad_norm": 0.2806878983974457, + "learning_rate": 2.964397336594646e-06, + "loss": 3.7846, + "step": 82875 + }, + { + "epoch": 5.631199891289578, + "grad_norm": 0.3013048470020294, + "learning_rate": 2.963972686506319e-06, + "loss": 3.9118, + "step": 82880 + }, + { + "epoch": 5.631539611360239, + "grad_norm": 0.25832846760749817, + "learning_rate": 2.963548036417992e-06, + "loss": 3.8522, + "step": 82885 + }, + { + "epoch": 5.631879331430901, + "grad_norm": 0.34509018063545227, + "learning_rate": 2.9631233863296644e-06, + "loss": 4.0465, + "step": 82890 + }, + { + "epoch": 5.632219051501563, + "grad_norm": 0.3534644842147827, + "learning_rate": 2.962698736241337e-06, + "loss": 4.0393, + "step": 82895 + }, + { + "epoch": 5.632558771572224, + "grad_norm": 0.24408689141273499, + "learning_rate": 2.9622740861530104e-06, + "loss": 3.9196, + "step": 82900 + }, + { + "epoch": 5.632898491642886, + "grad_norm": 0.2336636483669281, + "learning_rate": 2.9618494360646828e-06, + "loss": 3.9565, + "step": 82905 + }, + { + "epoch": 5.633238211713548, + "grad_norm": 0.2574153542518616, + "learning_rate": 2.9614247859763556e-06, + "loss": 3.9396, + "step": 82910 + }, + { + "epoch": 5.63357793178421, + "grad_norm": 0.21125616133213043, + "learning_rate": 2.9610001358880288e-06, + "loss": 3.8374, + "step": 82915 + }, + { + "epoch": 5.633917651854872, + "grad_norm": 0.2667835056781769, + "learning_rate": 2.9605754857997016e-06, + "loss": 4.0565, + "step": 82920 + }, + { + "epoch": 5.634257371925534, + "grad_norm": 0.8123535513877869, + "learning_rate": 2.960150835711374e-06, + "loss": 3.9778, + "step": 82925 + }, + { + "epoch": 5.634597091996195, + "grad_norm": 0.2621020972728729, + "learning_rate": 2.9597261856230468e-06, + "loss": 4.0773, + "step": 82930 + }, + { + "epoch": 5.634936812066857, + "grad_norm": 0.37232309579849243, + "learning_rate": 2.95930153553472e-06, + "loss": 4.011, + "step": 82935 + }, + { + "epoch": 5.635276532137519, + "grad_norm": 0.2795316278934479, + "learning_rate": 2.9588768854463924e-06, + "loss": 3.5653, + "step": 82940 + }, + { + "epoch": 5.63561625220818, + "grad_norm": 0.421001136302948, + "learning_rate": 2.958452235358065e-06, + "loss": 3.89, + "step": 82945 + }, + { + "epoch": 5.635955972278842, + "grad_norm": 0.26566454768180847, + "learning_rate": 2.9580275852697384e-06, + "loss": 3.9311, + "step": 82950 + }, + { + "epoch": 5.6362956923495044, + "grad_norm": 0.23644927144050598, + "learning_rate": 2.9576029351814108e-06, + "loss": 3.759, + "step": 82955 + }, + { + "epoch": 5.636635412420166, + "grad_norm": 0.3271882236003876, + "learning_rate": 2.9571782850930836e-06, + "loss": 4.138, + "step": 82960 + }, + { + "epoch": 5.636975132490828, + "grad_norm": 0.24641121923923492, + "learning_rate": 2.9567536350047564e-06, + "loss": 3.8349, + "step": 82965 + }, + { + "epoch": 5.63731485256149, + "grad_norm": 0.30199211835861206, + "learning_rate": 2.9563289849164287e-06, + "loss": 3.7501, + "step": 82970 + }, + { + "epoch": 5.637654572632151, + "grad_norm": 0.27762582898139954, + "learning_rate": 2.955904334828102e-06, + "loss": 4.2997, + "step": 82975 + }, + { + "epoch": 5.637994292702813, + "grad_norm": 0.28706154227256775, + "learning_rate": 2.9554796847397748e-06, + "loss": 3.7304, + "step": 82980 + }, + { + "epoch": 5.638334012773475, + "grad_norm": 0.3732813596725464, + "learning_rate": 2.955055034651447e-06, + "loss": 4.0145, + "step": 82985 + }, + { + "epoch": 5.638673732844136, + "grad_norm": 0.2594987452030182, + "learning_rate": 2.9546303845631204e-06, + "loss": 3.8389, + "step": 82990 + }, + { + "epoch": 5.639013452914798, + "grad_norm": 0.2376176118850708, + "learning_rate": 2.954205734474793e-06, + "loss": 3.7372, + "step": 82995 + }, + { + "epoch": 5.6393531729854605, + "grad_norm": 0.28966712951660156, + "learning_rate": 2.9537810843864655e-06, + "loss": 4.0041, + "step": 83000 + }, + { + "epoch": 5.639692893056122, + "grad_norm": 0.22796888649463654, + "learning_rate": 2.9533564342981388e-06, + "loss": 3.7373, + "step": 83005 + }, + { + "epoch": 5.640032613126784, + "grad_norm": 0.255079060792923, + "learning_rate": 2.9529317842098116e-06, + "loss": 4.0041, + "step": 83010 + }, + { + "epoch": 5.640372333197446, + "grad_norm": 0.28692612051963806, + "learning_rate": 2.952507134121484e-06, + "loss": 4.08, + "step": 83015 + }, + { + "epoch": 5.640712053268107, + "grad_norm": 0.3514421284198761, + "learning_rate": 2.9520824840331567e-06, + "loss": 3.7861, + "step": 83020 + }, + { + "epoch": 5.641051773338769, + "grad_norm": 0.2682304382324219, + "learning_rate": 2.95165783394483e-06, + "loss": 4.0623, + "step": 83025 + }, + { + "epoch": 5.64139149340943, + "grad_norm": 0.39712241291999817, + "learning_rate": 2.9512331838565023e-06, + "loss": 3.933, + "step": 83030 + }, + { + "epoch": 5.641731213480092, + "grad_norm": 0.36537572741508484, + "learning_rate": 2.950808533768175e-06, + "loss": 3.8993, + "step": 83035 + }, + { + "epoch": 5.642070933550754, + "grad_norm": 0.23202815651893616, + "learning_rate": 2.9503838836798484e-06, + "loss": 4.0, + "step": 83040 + }, + { + "epoch": 5.642410653621416, + "grad_norm": 0.3985599875450134, + "learning_rate": 2.9499592335915208e-06, + "loss": 3.7178, + "step": 83045 + }, + { + "epoch": 5.642750373692078, + "grad_norm": 0.24243921041488647, + "learning_rate": 2.9495345835031936e-06, + "loss": 3.9855, + "step": 83050 + }, + { + "epoch": 5.64309009376274, + "grad_norm": 0.2881965935230255, + "learning_rate": 2.9491099334148664e-06, + "loss": 3.9549, + "step": 83055 + }, + { + "epoch": 5.643429813833401, + "grad_norm": 0.2988138198852539, + "learning_rate": 2.9486852833265387e-06, + "loss": 4.0362, + "step": 83060 + }, + { + "epoch": 5.643769533904063, + "grad_norm": 0.20776961743831635, + "learning_rate": 2.948260633238212e-06, + "loss": 3.9922, + "step": 83065 + }, + { + "epoch": 5.644109253974725, + "grad_norm": 0.2094973772764206, + "learning_rate": 2.9478359831498848e-06, + "loss": 3.9753, + "step": 83070 + }, + { + "epoch": 5.644448974045386, + "grad_norm": 0.29062414169311523, + "learning_rate": 2.947411333061557e-06, + "loss": 3.78, + "step": 83075 + }, + { + "epoch": 5.644788694116048, + "grad_norm": 0.26838570833206177, + "learning_rate": 2.9469866829732304e-06, + "loss": 3.7242, + "step": 83080 + }, + { + "epoch": 5.64512841418671, + "grad_norm": 0.2713296413421631, + "learning_rate": 2.946562032884903e-06, + "loss": 3.8287, + "step": 83085 + }, + { + "epoch": 5.645468134257372, + "grad_norm": 0.4745556116104126, + "learning_rate": 2.946137382796576e-06, + "loss": 3.8946, + "step": 83090 + }, + { + "epoch": 5.645807854328034, + "grad_norm": 0.27556300163269043, + "learning_rate": 2.9457127327082483e-06, + "loss": 4.0784, + "step": 83095 + }, + { + "epoch": 5.646147574398696, + "grad_norm": 0.300598680973053, + "learning_rate": 2.9452880826199216e-06, + "loss": 3.9378, + "step": 83100 + }, + { + "epoch": 5.646487294469357, + "grad_norm": 0.28844523429870605, + "learning_rate": 2.9448634325315944e-06, + "loss": 3.906, + "step": 83105 + }, + { + "epoch": 5.646827014540019, + "grad_norm": 0.22075775265693665, + "learning_rate": 2.9444387824432667e-06, + "loss": 3.7117, + "step": 83110 + }, + { + "epoch": 5.647166734610681, + "grad_norm": 0.2937418818473816, + "learning_rate": 2.94401413235494e-06, + "loss": 4.1058, + "step": 83115 + }, + { + "epoch": 5.647506454681342, + "grad_norm": 0.25419309735298157, + "learning_rate": 2.9435894822666128e-06, + "loss": 3.9194, + "step": 83120 + }, + { + "epoch": 5.647846174752004, + "grad_norm": 0.2955542504787445, + "learning_rate": 2.943164832178285e-06, + "loss": 3.7486, + "step": 83125 + }, + { + "epoch": 5.648185894822666, + "grad_norm": 0.28391116857528687, + "learning_rate": 2.9427401820899584e-06, + "loss": 4.0037, + "step": 83130 + }, + { + "epoch": 5.648525614893328, + "grad_norm": 0.26880180835723877, + "learning_rate": 2.942315532001631e-06, + "loss": 3.8571, + "step": 83135 + }, + { + "epoch": 5.64886533496399, + "grad_norm": 0.190444678068161, + "learning_rate": 2.9418908819133035e-06, + "loss": 3.7583, + "step": 83140 + }, + { + "epoch": 5.649205055034652, + "grad_norm": 0.31046175956726074, + "learning_rate": 2.9414662318249763e-06, + "loss": 3.7485, + "step": 83145 + }, + { + "epoch": 5.649544775105313, + "grad_norm": 0.23434516787528992, + "learning_rate": 2.9410415817366496e-06, + "loss": 3.8982, + "step": 83150 + }, + { + "epoch": 5.649884495175975, + "grad_norm": 0.300480455160141, + "learning_rate": 2.940616931648322e-06, + "loss": 3.9088, + "step": 83155 + }, + { + "epoch": 5.650224215246637, + "grad_norm": 0.3732953369617462, + "learning_rate": 2.9401922815599947e-06, + "loss": 4.1354, + "step": 83160 + }, + { + "epoch": 5.650563935317298, + "grad_norm": 0.32823672890663147, + "learning_rate": 2.939767631471668e-06, + "loss": 3.975, + "step": 83165 + }, + { + "epoch": 5.65090365538796, + "grad_norm": 0.26696306467056274, + "learning_rate": 2.9393429813833403e-06, + "loss": 4.0628, + "step": 83170 + }, + { + "epoch": 5.651243375458622, + "grad_norm": 0.3082617521286011, + "learning_rate": 2.938918331295013e-06, + "loss": 3.9983, + "step": 83175 + }, + { + "epoch": 5.651583095529284, + "grad_norm": 0.31335440278053284, + "learning_rate": 2.938493681206686e-06, + "loss": 3.9382, + "step": 83180 + }, + { + "epoch": 5.651922815599946, + "grad_norm": 0.30199167132377625, + "learning_rate": 2.9380690311183583e-06, + "loss": 3.9605, + "step": 83185 + }, + { + "epoch": 5.652262535670608, + "grad_norm": 0.3838253319263458, + "learning_rate": 2.9376443810300315e-06, + "loss": 3.6262, + "step": 83190 + }, + { + "epoch": 5.652602255741269, + "grad_norm": 0.22406437993049622, + "learning_rate": 2.9372197309417044e-06, + "loss": 4.0202, + "step": 83195 + }, + { + "epoch": 5.652941975811931, + "grad_norm": 0.24827004969120026, + "learning_rate": 2.9367950808533767e-06, + "loss": 3.8963, + "step": 83200 + }, + { + "epoch": 5.653281695882593, + "grad_norm": 0.21246545016765594, + "learning_rate": 2.93637043076505e-06, + "loss": 3.9059, + "step": 83205 + }, + { + "epoch": 5.653621415953254, + "grad_norm": 0.2813291847705841, + "learning_rate": 2.9359457806767228e-06, + "loss": 4.1299, + "step": 83210 + }, + { + "epoch": 5.653961136023916, + "grad_norm": 0.32031935453414917, + "learning_rate": 2.935521130588395e-06, + "loss": 3.843, + "step": 83215 + }, + { + "epoch": 5.654300856094578, + "grad_norm": 0.21591512858867645, + "learning_rate": 2.935096480500068e-06, + "loss": 3.915, + "step": 83220 + }, + { + "epoch": 5.65464057616524, + "grad_norm": 0.23387743532657623, + "learning_rate": 2.934671830411741e-06, + "loss": 3.8291, + "step": 83225 + }, + { + "epoch": 5.654980296235902, + "grad_norm": 0.22380228340625763, + "learning_rate": 2.9342471803234135e-06, + "loss": 3.6709, + "step": 83230 + }, + { + "epoch": 5.655320016306564, + "grad_norm": 0.28683948516845703, + "learning_rate": 2.9338225302350863e-06, + "loss": 3.7323, + "step": 83235 + }, + { + "epoch": 5.655659736377225, + "grad_norm": 0.2684224843978882, + "learning_rate": 2.9333978801467596e-06, + "loss": 3.8626, + "step": 83240 + }, + { + "epoch": 5.655999456447887, + "grad_norm": 0.2610820531845093, + "learning_rate": 2.932973230058432e-06, + "loss": 3.8326, + "step": 83245 + }, + { + "epoch": 5.656339176518548, + "grad_norm": 0.5487666726112366, + "learning_rate": 2.9325485799701047e-06, + "loss": 3.781, + "step": 83250 + }, + { + "epoch": 5.65667889658921, + "grad_norm": 0.4977495074272156, + "learning_rate": 2.9321239298817775e-06, + "loss": 4.0895, + "step": 83255 + }, + { + "epoch": 5.657018616659872, + "grad_norm": 0.28318047523498535, + "learning_rate": 2.9316992797934508e-06, + "loss": 3.7462, + "step": 83260 + }, + { + "epoch": 5.657358336730534, + "grad_norm": 0.26156216859817505, + "learning_rate": 2.931274629705123e-06, + "loss": 3.938, + "step": 83265 + }, + { + "epoch": 5.657698056801196, + "grad_norm": 0.21988451480865479, + "learning_rate": 2.930849979616796e-06, + "loss": 4.1187, + "step": 83270 + }, + { + "epoch": 5.658037776871858, + "grad_norm": 0.23063522577285767, + "learning_rate": 2.930425329528469e-06, + "loss": 3.5149, + "step": 83275 + }, + { + "epoch": 5.658377496942519, + "grad_norm": 0.22537145018577576, + "learning_rate": 2.9300006794401415e-06, + "loss": 3.7813, + "step": 83280 + }, + { + "epoch": 5.658717217013181, + "grad_norm": 0.261730819940567, + "learning_rate": 2.9295760293518143e-06, + "loss": 3.7725, + "step": 83285 + }, + { + "epoch": 5.659056937083843, + "grad_norm": 0.29163822531700134, + "learning_rate": 2.9291513792634876e-06, + "loss": 4.1613, + "step": 83290 + }, + { + "epoch": 5.659396657154504, + "grad_norm": 0.32515227794647217, + "learning_rate": 2.92872672917516e-06, + "loss": 3.5028, + "step": 83295 + }, + { + "epoch": 5.659736377225166, + "grad_norm": 0.2668800354003906, + "learning_rate": 2.9283020790868327e-06, + "loss": 3.7988, + "step": 83300 + }, + { + "epoch": 5.660076097295828, + "grad_norm": 0.3049484193325043, + "learning_rate": 2.9278774289985055e-06, + "loss": 3.8654, + "step": 83305 + }, + { + "epoch": 5.66041581736649, + "grad_norm": 0.3284282684326172, + "learning_rate": 2.927452778910178e-06, + "loss": 3.858, + "step": 83310 + }, + { + "epoch": 5.660755537437152, + "grad_norm": 0.2436705380678177, + "learning_rate": 2.927028128821851e-06, + "loss": 3.7593, + "step": 83315 + }, + { + "epoch": 5.661095257507814, + "grad_norm": 0.2925579249858856, + "learning_rate": 2.926603478733524e-06, + "loss": 3.943, + "step": 83320 + }, + { + "epoch": 5.661434977578475, + "grad_norm": 0.28776970505714417, + "learning_rate": 2.9261788286451963e-06, + "loss": 3.9893, + "step": 83325 + }, + { + "epoch": 5.661774697649137, + "grad_norm": 0.3576946258544922, + "learning_rate": 2.9257541785568695e-06, + "loss": 4.1032, + "step": 83330 + }, + { + "epoch": 5.662114417719799, + "grad_norm": 0.21133050322532654, + "learning_rate": 2.9253295284685423e-06, + "loss": 3.7801, + "step": 83335 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 0.2803463339805603, + "learning_rate": 2.9249048783802147e-06, + "loss": 3.8803, + "step": 83340 + }, + { + "epoch": 5.662793857861122, + "grad_norm": 0.3155409097671509, + "learning_rate": 2.9244802282918875e-06, + "loss": 3.9092, + "step": 83345 + }, + { + "epoch": 5.663133577931784, + "grad_norm": 0.29264721274375916, + "learning_rate": 2.9240555782035608e-06, + "loss": 3.8867, + "step": 83350 + }, + { + "epoch": 5.663473298002446, + "grad_norm": 0.2959388196468353, + "learning_rate": 2.923630928115233e-06, + "loss": 3.8891, + "step": 83355 + }, + { + "epoch": 5.663813018073108, + "grad_norm": 0.26002195477485657, + "learning_rate": 2.923206278026906e-06, + "loss": 4.2234, + "step": 83360 + }, + { + "epoch": 5.66415273814377, + "grad_norm": 0.3215732276439667, + "learning_rate": 2.922781627938579e-06, + "loss": 3.9199, + "step": 83365 + }, + { + "epoch": 5.664492458214431, + "grad_norm": 0.3059505522251129, + "learning_rate": 2.9223569778502515e-06, + "loss": 3.7821, + "step": 83370 + }, + { + "epoch": 5.664832178285093, + "grad_norm": 0.22712823748588562, + "learning_rate": 2.9219323277619243e-06, + "loss": 3.7873, + "step": 83375 + }, + { + "epoch": 5.665171898355755, + "grad_norm": 0.2536885738372803, + "learning_rate": 2.921507677673597e-06, + "loss": 3.8111, + "step": 83380 + }, + { + "epoch": 5.665511618426416, + "grad_norm": 0.3879782557487488, + "learning_rate": 2.92108302758527e-06, + "loss": 3.7792, + "step": 83385 + }, + { + "epoch": 5.665851338497078, + "grad_norm": 0.21599985659122467, + "learning_rate": 2.9206583774969427e-06, + "loss": 3.9221, + "step": 83390 + }, + { + "epoch": 5.66619105856774, + "grad_norm": 0.25919339060783386, + "learning_rate": 2.9202337274086155e-06, + "loss": 3.96, + "step": 83395 + }, + { + "epoch": 5.666530778638402, + "grad_norm": 0.3172163665294647, + "learning_rate": 2.919809077320288e-06, + "loss": 3.916, + "step": 83400 + }, + { + "epoch": 5.666870498709064, + "grad_norm": 0.3049868643283844, + "learning_rate": 2.919384427231961e-06, + "loss": 4.0218, + "step": 83405 + }, + { + "epoch": 5.667210218779726, + "grad_norm": 0.20061351358890533, + "learning_rate": 2.918959777143634e-06, + "loss": 3.7375, + "step": 83410 + }, + { + "epoch": 5.667549938850387, + "grad_norm": 0.24082516133785248, + "learning_rate": 2.9185351270553063e-06, + "loss": 3.9965, + "step": 83415 + }, + { + "epoch": 5.667889658921049, + "grad_norm": 0.26815494894981384, + "learning_rate": 2.9181104769669795e-06, + "loss": 4.008, + "step": 83420 + }, + { + "epoch": 5.668229378991711, + "grad_norm": 0.2124078869819641, + "learning_rate": 2.9176858268786523e-06, + "loss": 3.7776, + "step": 83425 + }, + { + "epoch": 5.668569099062372, + "grad_norm": 0.2575231194496155, + "learning_rate": 2.917261176790325e-06, + "loss": 3.8025, + "step": 83430 + }, + { + "epoch": 5.668908819133034, + "grad_norm": 0.2288043051958084, + "learning_rate": 2.9168365267019975e-06, + "loss": 4.1227, + "step": 83435 + }, + { + "epoch": 5.669248539203696, + "grad_norm": 0.6173654198646545, + "learning_rate": 2.9164118766136707e-06, + "loss": 4.0961, + "step": 83440 + }, + { + "epoch": 5.669588259274358, + "grad_norm": 0.2522679567337036, + "learning_rate": 2.9159872265253435e-06, + "loss": 4.1547, + "step": 83445 + }, + { + "epoch": 5.66992797934502, + "grad_norm": 0.26410800218582153, + "learning_rate": 2.915562576437016e-06, + "loss": 3.6951, + "step": 83450 + }, + { + "epoch": 5.670267699415682, + "grad_norm": 0.2594263255596161, + "learning_rate": 2.915137926348689e-06, + "loss": 4.0084, + "step": 83455 + }, + { + "epoch": 5.670607419486343, + "grad_norm": 0.618834912776947, + "learning_rate": 2.914713276260362e-06, + "loss": 4.1126, + "step": 83460 + }, + { + "epoch": 5.670947139557005, + "grad_norm": 0.304060161113739, + "learning_rate": 2.9142886261720343e-06, + "loss": 3.9816, + "step": 83465 + }, + { + "epoch": 5.671286859627667, + "grad_norm": 0.3959309458732605, + "learning_rate": 2.913863976083707e-06, + "loss": 3.7247, + "step": 83470 + }, + { + "epoch": 5.671626579698328, + "grad_norm": 0.29152965545654297, + "learning_rate": 2.9134393259953803e-06, + "loss": 3.7322, + "step": 83475 + }, + { + "epoch": 5.67196629976899, + "grad_norm": 0.29337096214294434, + "learning_rate": 2.9130146759070527e-06, + "loss": 3.8588, + "step": 83480 + }, + { + "epoch": 5.672306019839652, + "grad_norm": 0.25384974479675293, + "learning_rate": 2.9125900258187255e-06, + "loss": 3.9998, + "step": 83485 + }, + { + "epoch": 5.672645739910314, + "grad_norm": 0.20677433907985687, + "learning_rate": 2.9121653757303987e-06, + "loss": 4.038, + "step": 83490 + }, + { + "epoch": 5.672985459980976, + "grad_norm": 0.3867168128490448, + "learning_rate": 2.911740725642071e-06, + "loss": 3.9959, + "step": 83495 + }, + { + "epoch": 5.673325180051638, + "grad_norm": 0.19331094622612, + "learning_rate": 2.911316075553744e-06, + "loss": 4.0133, + "step": 83500 + }, + { + "epoch": 5.673664900122299, + "grad_norm": 0.2715519070625305, + "learning_rate": 2.9108914254654167e-06, + "loss": 3.9317, + "step": 83505 + }, + { + "epoch": 5.674004620192961, + "grad_norm": 0.2564511299133301, + "learning_rate": 2.9104667753770895e-06, + "loss": 3.9421, + "step": 83510 + }, + { + "epoch": 5.674344340263623, + "grad_norm": 0.27304607629776, + "learning_rate": 2.9100421252887623e-06, + "loss": 3.963, + "step": 83515 + }, + { + "epoch": 5.674684060334284, + "grad_norm": 0.2075401097536087, + "learning_rate": 2.909617475200435e-06, + "loss": 4.0257, + "step": 83520 + }, + { + "epoch": 5.675023780404946, + "grad_norm": 0.23495244979858398, + "learning_rate": 2.9091928251121075e-06, + "loss": 3.6448, + "step": 83525 + }, + { + "epoch": 5.6753635004756084, + "grad_norm": 0.2375149130821228, + "learning_rate": 2.9087681750237807e-06, + "loss": 4.0118, + "step": 83530 + }, + { + "epoch": 5.67570322054627, + "grad_norm": 0.30004438757896423, + "learning_rate": 2.9083435249354535e-06, + "loss": 3.8724, + "step": 83535 + }, + { + "epoch": 5.676042940616932, + "grad_norm": 0.24928593635559082, + "learning_rate": 2.907918874847126e-06, + "loss": 3.9785, + "step": 83540 + }, + { + "epoch": 5.676382660687594, + "grad_norm": 0.24929043650627136, + "learning_rate": 2.907494224758799e-06, + "loss": 3.8595, + "step": 83545 + }, + { + "epoch": 5.676722380758255, + "grad_norm": 0.3011619448661804, + "learning_rate": 2.907069574670472e-06, + "loss": 3.7765, + "step": 83550 + }, + { + "epoch": 5.677062100828917, + "grad_norm": 0.222738578915596, + "learning_rate": 2.9066449245821443e-06, + "loss": 3.781, + "step": 83555 + }, + { + "epoch": 5.677401820899579, + "grad_norm": 0.24364237487316132, + "learning_rate": 2.906220274493817e-06, + "loss": 3.8395, + "step": 83560 + }, + { + "epoch": 5.67774154097024, + "grad_norm": 0.27194303274154663, + "learning_rate": 2.9057956244054903e-06, + "loss": 3.9389, + "step": 83565 + }, + { + "epoch": 5.678081261040902, + "grad_norm": 0.3712664544582367, + "learning_rate": 2.9053709743171627e-06, + "loss": 3.9184, + "step": 83570 + }, + { + "epoch": 5.6784209811115645, + "grad_norm": 0.3401857018470764, + "learning_rate": 2.9049463242288355e-06, + "loss": 3.8238, + "step": 83575 + }, + { + "epoch": 5.678760701182226, + "grad_norm": 0.23468737304210663, + "learning_rate": 2.9045216741405087e-06, + "loss": 3.8257, + "step": 83580 + }, + { + "epoch": 5.679100421252888, + "grad_norm": 0.27966055274009705, + "learning_rate": 2.904097024052181e-06, + "loss": 4.0189, + "step": 83585 + }, + { + "epoch": 5.67944014132355, + "grad_norm": 0.21721075475215912, + "learning_rate": 2.903672373963854e-06, + "loss": 3.961, + "step": 83590 + }, + { + "epoch": 5.679779861394211, + "grad_norm": 0.24551421403884888, + "learning_rate": 2.9032477238755267e-06, + "loss": 3.83, + "step": 83595 + }, + { + "epoch": 5.680119581464873, + "grad_norm": 0.24976053833961487, + "learning_rate": 2.9028230737872e-06, + "loss": 3.9627, + "step": 83600 + }, + { + "epoch": 5.680459301535535, + "grad_norm": 0.2741764783859253, + "learning_rate": 2.9023984236988723e-06, + "loss": 3.9547, + "step": 83605 + }, + { + "epoch": 5.680799021606196, + "grad_norm": 0.23060199618339539, + "learning_rate": 2.901973773610545e-06, + "loss": 3.9861, + "step": 83610 + }, + { + "epoch": 5.681138741676858, + "grad_norm": 0.23997361958026886, + "learning_rate": 2.9015491235222183e-06, + "loss": 3.921, + "step": 83615 + }, + { + "epoch": 5.6814784617475205, + "grad_norm": 0.20804347097873688, + "learning_rate": 2.9011244734338907e-06, + "loss": 3.73, + "step": 83620 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.22314876317977905, + "learning_rate": 2.9006998233455635e-06, + "loss": 4.072, + "step": 83625 + }, + { + "epoch": 5.682157901888844, + "grad_norm": 0.31919190287590027, + "learning_rate": 2.9002751732572363e-06, + "loss": 3.952, + "step": 83630 + }, + { + "epoch": 5.682497621959506, + "grad_norm": 0.23134911060333252, + "learning_rate": 2.899850523168909e-06, + "loss": 3.7207, + "step": 83635 + }, + { + "epoch": 5.682837342030167, + "grad_norm": 0.3358765244483948, + "learning_rate": 2.899425873080582e-06, + "loss": 4.1995, + "step": 83640 + }, + { + "epoch": 5.683177062100829, + "grad_norm": 0.2422250360250473, + "learning_rate": 2.8990012229922547e-06, + "loss": 3.9923, + "step": 83645 + }, + { + "epoch": 5.683516782171491, + "grad_norm": 0.31542977690696716, + "learning_rate": 2.898576572903927e-06, + "loss": 3.5949, + "step": 83650 + }, + { + "epoch": 5.683856502242152, + "grad_norm": 0.39504313468933105, + "learning_rate": 2.8981519228156003e-06, + "loss": 3.6854, + "step": 83655 + }, + { + "epoch": 5.684196222312814, + "grad_norm": 0.19276638329029083, + "learning_rate": 2.897727272727273e-06, + "loss": 3.8663, + "step": 83660 + }, + { + "epoch": 5.6845359423834765, + "grad_norm": 0.27144306898117065, + "learning_rate": 2.8973026226389455e-06, + "loss": 3.8168, + "step": 83665 + }, + { + "epoch": 5.684875662454138, + "grad_norm": 0.38064610958099365, + "learning_rate": 2.8968779725506187e-06, + "loss": 3.9464, + "step": 83670 + }, + { + "epoch": 5.6852153825248, + "grad_norm": 0.23444369435310364, + "learning_rate": 2.8964533224622915e-06, + "loss": 3.7644, + "step": 83675 + }, + { + "epoch": 5.685555102595462, + "grad_norm": 0.3158351182937622, + "learning_rate": 2.896028672373964e-06, + "loss": 4.3492, + "step": 83680 + }, + { + "epoch": 5.685894822666123, + "grad_norm": 0.21139578521251678, + "learning_rate": 2.8956040222856367e-06, + "loss": 4.0725, + "step": 83685 + }, + { + "epoch": 5.686234542736785, + "grad_norm": 0.5053378939628601, + "learning_rate": 2.89517937219731e-06, + "loss": 3.9733, + "step": 83690 + }, + { + "epoch": 5.686574262807447, + "grad_norm": 0.2675698697566986, + "learning_rate": 2.8947547221089823e-06, + "loss": 3.896, + "step": 83695 + }, + { + "epoch": 5.686913982878108, + "grad_norm": 0.19061850011348724, + "learning_rate": 2.894330072020655e-06, + "loss": 3.9077, + "step": 83700 + }, + { + "epoch": 5.68725370294877, + "grad_norm": 0.2590171694755554, + "learning_rate": 2.8939054219323283e-06, + "loss": 3.7371, + "step": 83705 + }, + { + "epoch": 5.687593423019432, + "grad_norm": 0.2503223717212677, + "learning_rate": 2.8934807718440007e-06, + "loss": 3.6207, + "step": 83710 + }, + { + "epoch": 5.687933143090094, + "grad_norm": 0.267132967710495, + "learning_rate": 2.8930561217556735e-06, + "loss": 3.9688, + "step": 83715 + }, + { + "epoch": 5.688272863160756, + "grad_norm": 0.23213587701320648, + "learning_rate": 2.8926314716673463e-06, + "loss": 3.8073, + "step": 83720 + }, + { + "epoch": 5.688612583231417, + "grad_norm": 0.27427148818969727, + "learning_rate": 2.8922068215790187e-06, + "loss": 3.8765, + "step": 83725 + }, + { + "epoch": 5.688952303302079, + "grad_norm": 0.23602084815502167, + "learning_rate": 2.891782171490692e-06, + "loss": 4.0683, + "step": 83730 + }, + { + "epoch": 5.689292023372741, + "grad_norm": 0.21969997882843018, + "learning_rate": 2.8913575214023647e-06, + "loss": 3.8692, + "step": 83735 + }, + { + "epoch": 5.689631743443402, + "grad_norm": 0.24220065772533417, + "learning_rate": 2.890932871314037e-06, + "loss": 3.9397, + "step": 83740 + }, + { + "epoch": 5.689971463514064, + "grad_norm": 0.2410583645105362, + "learning_rate": 2.8905082212257103e-06, + "loss": 3.901, + "step": 83745 + }, + { + "epoch": 5.690311183584726, + "grad_norm": 0.3144082725048065, + "learning_rate": 2.890083571137383e-06, + "loss": 3.93, + "step": 83750 + }, + { + "epoch": 5.690650903655388, + "grad_norm": 0.3149096965789795, + "learning_rate": 2.8896589210490555e-06, + "loss": 4.0512, + "step": 83755 + }, + { + "epoch": 5.69099062372605, + "grad_norm": 0.2027195245027542, + "learning_rate": 2.8892342709607283e-06, + "loss": 3.9043, + "step": 83760 + }, + { + "epoch": 5.691330343796712, + "grad_norm": 0.30530020594596863, + "learning_rate": 2.8888096208724015e-06, + "loss": 4.1669, + "step": 83765 + }, + { + "epoch": 5.691670063867373, + "grad_norm": 0.23402154445648193, + "learning_rate": 2.8883849707840743e-06, + "loss": 3.7369, + "step": 83770 + }, + { + "epoch": 5.692009783938035, + "grad_norm": 0.3278276324272156, + "learning_rate": 2.8879603206957467e-06, + "loss": 3.7418, + "step": 83775 + }, + { + "epoch": 5.692349504008697, + "grad_norm": 0.21876876056194305, + "learning_rate": 2.88753567060742e-06, + "loss": 4.014, + "step": 83780 + }, + { + "epoch": 5.692689224079358, + "grad_norm": 0.2949569523334503, + "learning_rate": 2.8871110205190927e-06, + "loss": 3.9668, + "step": 83785 + }, + { + "epoch": 5.69302894415002, + "grad_norm": 0.3330748677253723, + "learning_rate": 2.886686370430765e-06, + "loss": 4.0045, + "step": 83790 + }, + { + "epoch": 5.693368664220682, + "grad_norm": 0.28156307339668274, + "learning_rate": 2.8862617203424383e-06, + "loss": 4.0905, + "step": 83795 + }, + { + "epoch": 5.693708384291344, + "grad_norm": 0.28764092922210693, + "learning_rate": 2.885837070254111e-06, + "loss": 4.0123, + "step": 83800 + }, + { + "epoch": 5.694048104362006, + "grad_norm": 0.23155786097049713, + "learning_rate": 2.8854124201657835e-06, + "loss": 3.7131, + "step": 83805 + }, + { + "epoch": 5.694387824432668, + "grad_norm": 0.3129580616950989, + "learning_rate": 2.8849877700774563e-06, + "loss": 3.9362, + "step": 83810 + }, + { + "epoch": 5.694727544503329, + "grad_norm": 0.3150821626186371, + "learning_rate": 2.8845631199891295e-06, + "loss": 4.1289, + "step": 83815 + }, + { + "epoch": 5.695067264573991, + "grad_norm": 0.26304301619529724, + "learning_rate": 2.884138469900802e-06, + "loss": 3.8936, + "step": 83820 + }, + { + "epoch": 5.695406984644653, + "grad_norm": 0.28441259264945984, + "learning_rate": 2.8837138198124747e-06, + "loss": 3.8513, + "step": 83825 + }, + { + "epoch": 5.695746704715314, + "grad_norm": 0.5770047903060913, + "learning_rate": 2.883289169724148e-06, + "loss": 4.0142, + "step": 83830 + }, + { + "epoch": 5.696086424785976, + "grad_norm": 0.32556411623954773, + "learning_rate": 2.8828645196358203e-06, + "loss": 4.1074, + "step": 83835 + }, + { + "epoch": 5.6964261448566385, + "grad_norm": 0.25260838866233826, + "learning_rate": 2.882439869547493e-06, + "loss": 3.9384, + "step": 83840 + }, + { + "epoch": 5.6967658649273, + "grad_norm": 0.25195860862731934, + "learning_rate": 2.882015219459166e-06, + "loss": 3.8282, + "step": 83845 + }, + { + "epoch": 5.697105584997962, + "grad_norm": 0.23065529763698578, + "learning_rate": 2.8815905693708383e-06, + "loss": 3.8291, + "step": 83850 + }, + { + "epoch": 5.697445305068624, + "grad_norm": 0.28959840536117554, + "learning_rate": 2.8811659192825115e-06, + "loss": 3.9967, + "step": 83855 + }, + { + "epoch": 5.697785025139285, + "grad_norm": 0.387205570936203, + "learning_rate": 2.8807412691941843e-06, + "loss": 4.0465, + "step": 83860 + }, + { + "epoch": 5.698124745209947, + "grad_norm": 0.2932085394859314, + "learning_rate": 2.8803166191058567e-06, + "loss": 3.9133, + "step": 83865 + }, + { + "epoch": 5.698464465280609, + "grad_norm": 0.25041598081588745, + "learning_rate": 2.87989196901753e-06, + "loss": 3.9507, + "step": 83870 + }, + { + "epoch": 5.69880418535127, + "grad_norm": 0.2369944453239441, + "learning_rate": 2.8794673189292027e-06, + "loss": 3.9388, + "step": 83875 + }, + { + "epoch": 5.699143905421932, + "grad_norm": 0.3024827837944031, + "learning_rate": 2.879042668840875e-06, + "loss": 4.1215, + "step": 83880 + }, + { + "epoch": 5.6994836254925945, + "grad_norm": 0.30683425068855286, + "learning_rate": 2.878618018752548e-06, + "loss": 3.8343, + "step": 83885 + }, + { + "epoch": 5.699823345563256, + "grad_norm": 0.20360907912254333, + "learning_rate": 2.878193368664221e-06, + "loss": 3.8054, + "step": 83890 + }, + { + "epoch": 5.700163065633918, + "grad_norm": 0.16775988042354584, + "learning_rate": 2.8777687185758935e-06, + "loss": 3.7313, + "step": 83895 + }, + { + "epoch": 5.70050278570458, + "grad_norm": 0.2554439902305603, + "learning_rate": 2.8773440684875663e-06, + "loss": 3.8887, + "step": 83900 + }, + { + "epoch": 5.700842505775241, + "grad_norm": 0.3231593370437622, + "learning_rate": 2.8769194183992395e-06, + "loss": 4.0384, + "step": 83905 + }, + { + "epoch": 5.701182225845903, + "grad_norm": 0.2658419907093048, + "learning_rate": 2.876494768310912e-06, + "loss": 4.1266, + "step": 83910 + }, + { + "epoch": 5.701521945916565, + "grad_norm": 0.19120417535305023, + "learning_rate": 2.8760701182225847e-06, + "loss": 3.967, + "step": 83915 + }, + { + "epoch": 5.701861665987226, + "grad_norm": 0.2960560619831085, + "learning_rate": 2.875645468134258e-06, + "loss": 3.8445, + "step": 83920 + }, + { + "epoch": 5.702201386057888, + "grad_norm": 0.23725193738937378, + "learning_rate": 2.8752208180459303e-06, + "loss": 3.8166, + "step": 83925 + }, + { + "epoch": 5.70254110612855, + "grad_norm": 0.27364444732666016, + "learning_rate": 2.874796167957603e-06, + "loss": 3.8677, + "step": 83930 + }, + { + "epoch": 5.702880826199212, + "grad_norm": 0.2227931171655655, + "learning_rate": 2.874371517869276e-06, + "loss": 3.8913, + "step": 83935 + }, + { + "epoch": 5.703220546269874, + "grad_norm": 0.22247271239757538, + "learning_rate": 2.873946867780949e-06, + "loss": 3.6295, + "step": 83940 + }, + { + "epoch": 5.703560266340535, + "grad_norm": 0.24073022603988647, + "learning_rate": 2.8735222176926215e-06, + "loss": 4.1157, + "step": 83945 + }, + { + "epoch": 5.703899986411197, + "grad_norm": 0.3164260983467102, + "learning_rate": 2.8730975676042943e-06, + "loss": 3.8776, + "step": 83950 + }, + { + "epoch": 5.704239706481859, + "grad_norm": 0.30603307485580444, + "learning_rate": 2.8726729175159675e-06, + "loss": 3.9862, + "step": 83955 + }, + { + "epoch": 5.70457942655252, + "grad_norm": 0.30728262662887573, + "learning_rate": 2.87224826742764e-06, + "loss": 3.8732, + "step": 83960 + }, + { + "epoch": 5.704919146623182, + "grad_norm": 0.3063312768936157, + "learning_rate": 2.8718236173393127e-06, + "loss": 3.8859, + "step": 83965 + }, + { + "epoch": 5.705258866693844, + "grad_norm": 0.3294144868850708, + "learning_rate": 2.8713989672509855e-06, + "loss": 3.8301, + "step": 83970 + }, + { + "epoch": 5.705598586764506, + "grad_norm": 0.22732847929000854, + "learning_rate": 2.870974317162658e-06, + "loss": 3.9324, + "step": 83975 + }, + { + "epoch": 5.705938306835168, + "grad_norm": 0.24431274831295013, + "learning_rate": 2.870549667074331e-06, + "loss": 3.9966, + "step": 83980 + }, + { + "epoch": 5.70627802690583, + "grad_norm": 0.2605414390563965, + "learning_rate": 2.870125016986004e-06, + "loss": 3.847, + "step": 83985 + }, + { + "epoch": 5.706617746976491, + "grad_norm": 0.23596131801605225, + "learning_rate": 2.8697003668976763e-06, + "loss": 3.8264, + "step": 83990 + }, + { + "epoch": 5.706957467047153, + "grad_norm": 0.27197912335395813, + "learning_rate": 2.8692757168093495e-06, + "loss": 3.8962, + "step": 83995 + }, + { + "epoch": 5.707297187117815, + "grad_norm": 0.32573848962783813, + "learning_rate": 2.8688510667210223e-06, + "loss": 3.9159, + "step": 84000 + }, + { + "epoch": 5.707636907188476, + "grad_norm": 0.28712135553359985, + "learning_rate": 2.8684264166326947e-06, + "loss": 3.9405, + "step": 84005 + }, + { + "epoch": 5.707976627259138, + "grad_norm": 0.2411699891090393, + "learning_rate": 2.8680017665443675e-06, + "loss": 3.8892, + "step": 84010 + }, + { + "epoch": 5.7083163473298, + "grad_norm": 0.2526967525482178, + "learning_rate": 2.8675771164560407e-06, + "loss": 3.632, + "step": 84015 + }, + { + "epoch": 5.708656067400462, + "grad_norm": 0.24561995267868042, + "learning_rate": 2.867152466367713e-06, + "loss": 4.012, + "step": 84020 + }, + { + "epoch": 5.708995787471124, + "grad_norm": 0.22856107354164124, + "learning_rate": 2.866727816279386e-06, + "loss": 3.7363, + "step": 84025 + }, + { + "epoch": 5.709335507541786, + "grad_norm": 0.25154611468315125, + "learning_rate": 2.866303166191059e-06, + "loss": 3.7721, + "step": 84030 + }, + { + "epoch": 5.709675227612447, + "grad_norm": 0.2745940685272217, + "learning_rate": 2.8658785161027315e-06, + "loss": 3.9191, + "step": 84035 + }, + { + "epoch": 5.710014947683109, + "grad_norm": 0.2501216530799866, + "learning_rate": 2.8654538660144043e-06, + "loss": 4.0009, + "step": 84040 + }, + { + "epoch": 5.710354667753771, + "grad_norm": 0.29383549094200134, + "learning_rate": 2.865029215926077e-06, + "loss": 3.9135, + "step": 84045 + }, + { + "epoch": 5.710694387824432, + "grad_norm": 0.21618136763572693, + "learning_rate": 2.86460456583775e-06, + "loss": 4.2467, + "step": 84050 + }, + { + "epoch": 5.711034107895094, + "grad_norm": 0.2807536721229553, + "learning_rate": 2.8641799157494227e-06, + "loss": 4.0543, + "step": 84055 + }, + { + "epoch": 5.711373827965756, + "grad_norm": 0.30569204688072205, + "learning_rate": 2.8637552656610955e-06, + "loss": 4.0633, + "step": 84060 + }, + { + "epoch": 5.711713548036418, + "grad_norm": 0.30767789483070374, + "learning_rate": 2.863330615572768e-06, + "loss": 3.8234, + "step": 84065 + }, + { + "epoch": 5.71205326810708, + "grad_norm": 0.28846606612205505, + "learning_rate": 2.862905965484441e-06, + "loss": 4.1357, + "step": 84070 + }, + { + "epoch": 5.712392988177742, + "grad_norm": 0.2749622166156769, + "learning_rate": 2.862481315396114e-06, + "loss": 3.9125, + "step": 84075 + }, + { + "epoch": 5.712732708248403, + "grad_norm": 0.21623890101909637, + "learning_rate": 2.8620566653077863e-06, + "loss": 4.1733, + "step": 84080 + }, + { + "epoch": 5.713072428319065, + "grad_norm": 0.20960329473018646, + "learning_rate": 2.8616320152194595e-06, + "loss": 3.7771, + "step": 84085 + }, + { + "epoch": 5.713412148389727, + "grad_norm": 0.2862033545970917, + "learning_rate": 2.8612073651311323e-06, + "loss": 3.6905, + "step": 84090 + }, + { + "epoch": 5.713751868460388, + "grad_norm": 0.22319987416267395, + "learning_rate": 2.8607827150428047e-06, + "loss": 3.7524, + "step": 84095 + }, + { + "epoch": 5.71409158853105, + "grad_norm": 0.21128037571907043, + "learning_rate": 2.8603580649544775e-06, + "loss": 3.8102, + "step": 84100 + }, + { + "epoch": 5.7144313086017124, + "grad_norm": 0.2642153799533844, + "learning_rate": 2.8599334148661507e-06, + "loss": 3.8757, + "step": 84105 + }, + { + "epoch": 5.714771028672374, + "grad_norm": 0.2624675929546356, + "learning_rate": 2.8595087647778235e-06, + "loss": 3.9398, + "step": 84110 + }, + { + "epoch": 5.715110748743036, + "grad_norm": 0.26026788353919983, + "learning_rate": 2.859084114689496e-06, + "loss": 3.785, + "step": 84115 + }, + { + "epoch": 5.715450468813698, + "grad_norm": 0.2576324939727783, + "learning_rate": 2.858659464601169e-06, + "loss": 4.1321, + "step": 84120 + }, + { + "epoch": 5.715790188884359, + "grad_norm": 0.3879750669002533, + "learning_rate": 2.858234814512842e-06, + "loss": 3.9551, + "step": 84125 + }, + { + "epoch": 5.716129908955021, + "grad_norm": 0.297357439994812, + "learning_rate": 2.8578101644245143e-06, + "loss": 3.8138, + "step": 84130 + }, + { + "epoch": 5.716469629025683, + "grad_norm": 0.24181847274303436, + "learning_rate": 2.857385514336187e-06, + "loss": 3.8873, + "step": 84135 + }, + { + "epoch": 5.716809349096344, + "grad_norm": 0.21748951077461243, + "learning_rate": 2.8569608642478603e-06, + "loss": 4.2265, + "step": 84140 + }, + { + "epoch": 5.717149069167006, + "grad_norm": 0.28730058670043945, + "learning_rate": 2.8565362141595327e-06, + "loss": 3.9815, + "step": 84145 + }, + { + "epoch": 5.7174887892376685, + "grad_norm": 0.2525785267353058, + "learning_rate": 2.8561115640712055e-06, + "loss": 4.0377, + "step": 84150 + }, + { + "epoch": 5.71782850930833, + "grad_norm": 0.23138269782066345, + "learning_rate": 2.8556869139828787e-06, + "loss": 4.23, + "step": 84155 + }, + { + "epoch": 5.718168229378992, + "grad_norm": 0.3063291907310486, + "learning_rate": 2.855262263894551e-06, + "loss": 3.8388, + "step": 84160 + }, + { + "epoch": 5.718507949449654, + "grad_norm": 0.21305273473262787, + "learning_rate": 2.854837613806224e-06, + "loss": 3.9121, + "step": 84165 + }, + { + "epoch": 5.718847669520315, + "grad_norm": 0.23135864734649658, + "learning_rate": 2.8544129637178967e-06, + "loss": 4.0864, + "step": 84170 + }, + { + "epoch": 5.719187389590977, + "grad_norm": 0.296478807926178, + "learning_rate": 2.8539883136295695e-06, + "loss": 3.8512, + "step": 84175 + }, + { + "epoch": 5.719527109661639, + "grad_norm": 0.29423952102661133, + "learning_rate": 2.8535636635412423e-06, + "loss": 3.9179, + "step": 84180 + }, + { + "epoch": 5.7198668297323, + "grad_norm": 0.21590061485767365, + "learning_rate": 2.853139013452915e-06, + "loss": 3.978, + "step": 84185 + }, + { + "epoch": 5.720206549802962, + "grad_norm": 0.2287379652261734, + "learning_rate": 2.8527143633645875e-06, + "loss": 3.735, + "step": 84190 + }, + { + "epoch": 5.7205462698736245, + "grad_norm": 0.2441743165254593, + "learning_rate": 2.8522897132762607e-06, + "loss": 3.955, + "step": 84195 + }, + { + "epoch": 5.720885989944286, + "grad_norm": 0.3720686435699463, + "learning_rate": 2.8518650631879335e-06, + "loss": 3.8523, + "step": 84200 + }, + { + "epoch": 5.721225710014948, + "grad_norm": 0.19626042246818542, + "learning_rate": 2.851440413099606e-06, + "loss": 3.8657, + "step": 84205 + }, + { + "epoch": 5.72156543008561, + "grad_norm": 0.22530139982700348, + "learning_rate": 2.851015763011279e-06, + "loss": 3.942, + "step": 84210 + }, + { + "epoch": 5.721905150156271, + "grad_norm": 0.2526290714740753, + "learning_rate": 2.850591112922952e-06, + "loss": 3.9582, + "step": 84215 + }, + { + "epoch": 5.722244870226933, + "grad_norm": 0.27513551712036133, + "learning_rate": 2.8501664628346243e-06, + "loss": 3.983, + "step": 84220 + }, + { + "epoch": 5.722584590297595, + "grad_norm": 0.3499318063259125, + "learning_rate": 2.849741812746297e-06, + "loss": 3.8444, + "step": 84225 + }, + { + "epoch": 5.722924310368256, + "grad_norm": 0.1880858987569809, + "learning_rate": 2.8493171626579703e-06, + "loss": 3.7518, + "step": 84230 + }, + { + "epoch": 5.723264030438918, + "grad_norm": 0.2622208893299103, + "learning_rate": 2.8488925125696427e-06, + "loss": 3.6923, + "step": 84235 + }, + { + "epoch": 5.7236037505095805, + "grad_norm": 0.235309436917305, + "learning_rate": 2.8484678624813155e-06, + "loss": 4.128, + "step": 84240 + }, + { + "epoch": 5.723943470580242, + "grad_norm": 0.2563299238681793, + "learning_rate": 2.8480432123929887e-06, + "loss": 3.8142, + "step": 84245 + }, + { + "epoch": 5.724283190650904, + "grad_norm": 0.23838108777999878, + "learning_rate": 2.847618562304661e-06, + "loss": 3.8732, + "step": 84250 + }, + { + "epoch": 5.724622910721566, + "grad_norm": 0.29825955629348755, + "learning_rate": 2.847193912216334e-06, + "loss": 3.7567, + "step": 84255 + }, + { + "epoch": 5.724962630792227, + "grad_norm": 0.22108721733093262, + "learning_rate": 2.8467692621280067e-06, + "loss": 3.4365, + "step": 84260 + }, + { + "epoch": 5.725302350862889, + "grad_norm": 0.21969453990459442, + "learning_rate": 2.846344612039679e-06, + "loss": 3.9243, + "step": 84265 + }, + { + "epoch": 5.725642070933551, + "grad_norm": 0.22920571267604828, + "learning_rate": 2.8459199619513523e-06, + "loss": 3.7959, + "step": 84270 + }, + { + "epoch": 5.725981791004212, + "grad_norm": 0.255464643239975, + "learning_rate": 2.845495311863025e-06, + "loss": 3.8892, + "step": 84275 + }, + { + "epoch": 5.726321511074874, + "grad_norm": 0.26249244809150696, + "learning_rate": 2.8450706617746983e-06, + "loss": 4.0148, + "step": 84280 + }, + { + "epoch": 5.7266612311455365, + "grad_norm": 0.243196502327919, + "learning_rate": 2.8446460116863707e-06, + "loss": 3.6394, + "step": 84285 + }, + { + "epoch": 5.727000951216198, + "grad_norm": 0.23019099235534668, + "learning_rate": 2.8442213615980435e-06, + "loss": 3.8736, + "step": 84290 + }, + { + "epoch": 5.72734067128686, + "grad_norm": 0.3639586865901947, + "learning_rate": 2.8437967115097163e-06, + "loss": 3.6575, + "step": 84295 + }, + { + "epoch": 5.727680391357522, + "grad_norm": 0.30531448125839233, + "learning_rate": 2.843372061421389e-06, + "loss": 3.8896, + "step": 84300 + }, + { + "epoch": 5.728020111428183, + "grad_norm": 0.19641712307929993, + "learning_rate": 2.842947411333062e-06, + "loss": 3.7844, + "step": 84305 + }, + { + "epoch": 5.728359831498845, + "grad_norm": 0.29399341344833374, + "learning_rate": 2.8425227612447347e-06, + "loss": 3.7572, + "step": 84310 + }, + { + "epoch": 5.728699551569507, + "grad_norm": 0.2922738492488861, + "learning_rate": 2.842098111156407e-06, + "loss": 3.9319, + "step": 84315 + }, + { + "epoch": 5.729039271640168, + "grad_norm": 0.29389017820358276, + "learning_rate": 2.8416734610680803e-06, + "loss": 3.7408, + "step": 84320 + }, + { + "epoch": 5.72937899171083, + "grad_norm": 0.2610715329647064, + "learning_rate": 2.841248810979753e-06, + "loss": 3.6653, + "step": 84325 + }, + { + "epoch": 5.7297187117814925, + "grad_norm": 0.25948548316955566, + "learning_rate": 2.8408241608914255e-06, + "loss": 3.6163, + "step": 84330 + }, + { + "epoch": 5.730058431852154, + "grad_norm": 0.47095710039138794, + "learning_rate": 2.8403995108030987e-06, + "loss": 3.76, + "step": 84335 + }, + { + "epoch": 5.730398151922816, + "grad_norm": 0.22815640270709991, + "learning_rate": 2.8399748607147715e-06, + "loss": 3.7495, + "step": 84340 + }, + { + "epoch": 5.730737871993478, + "grad_norm": 0.32507824897766113, + "learning_rate": 2.839550210626444e-06, + "loss": 4.0553, + "step": 84345 + }, + { + "epoch": 5.731077592064139, + "grad_norm": 0.3383243680000305, + "learning_rate": 2.8391255605381167e-06, + "loss": 3.7712, + "step": 84350 + }, + { + "epoch": 5.731417312134801, + "grad_norm": 0.2342037409543991, + "learning_rate": 2.83870091044979e-06, + "loss": 4.0419, + "step": 84355 + }, + { + "epoch": 5.731757032205463, + "grad_norm": 0.22459900379180908, + "learning_rate": 2.8382762603614623e-06, + "loss": 3.9276, + "step": 84360 + }, + { + "epoch": 5.732096752276124, + "grad_norm": 0.38313162326812744, + "learning_rate": 2.837851610273135e-06, + "loss": 3.9616, + "step": 84365 + }, + { + "epoch": 5.732436472346786, + "grad_norm": 0.3111829161643982, + "learning_rate": 2.8374269601848083e-06, + "loss": 3.8494, + "step": 84370 + }, + { + "epoch": 5.7327761924174485, + "grad_norm": 0.2774832546710968, + "learning_rate": 2.8370023100964807e-06, + "loss": 3.8344, + "step": 84375 + }, + { + "epoch": 5.73311591248811, + "grad_norm": 0.2617702782154083, + "learning_rate": 2.8365776600081535e-06, + "loss": 3.8807, + "step": 84380 + }, + { + "epoch": 5.733455632558772, + "grad_norm": 0.2861670255661011, + "learning_rate": 2.8361530099198263e-06, + "loss": 3.8972, + "step": 84385 + }, + { + "epoch": 5.733795352629433, + "grad_norm": 0.2702234983444214, + "learning_rate": 2.8357283598314987e-06, + "loss": 4.1743, + "step": 84390 + }, + { + "epoch": 5.734135072700095, + "grad_norm": 0.2865329086780548, + "learning_rate": 2.835303709743172e-06, + "loss": 4.2842, + "step": 84395 + }, + { + "epoch": 5.734474792770757, + "grad_norm": 0.24626582860946655, + "learning_rate": 2.8348790596548447e-06, + "loss": 3.9059, + "step": 84400 + }, + { + "epoch": 5.734814512841418, + "grad_norm": 0.2946939170360565, + "learning_rate": 2.834454409566517e-06, + "loss": 3.7071, + "step": 84405 + }, + { + "epoch": 5.73515423291208, + "grad_norm": 0.19624949991703033, + "learning_rate": 2.8340297594781903e-06, + "loss": 4.0594, + "step": 84410 + }, + { + "epoch": 5.7354939529827424, + "grad_norm": 0.2864516079425812, + "learning_rate": 2.833605109389863e-06, + "loss": 3.855, + "step": 84415 + }, + { + "epoch": 5.735833673053404, + "grad_norm": 0.2641051411628723, + "learning_rate": 2.8331804593015355e-06, + "loss": 4.096, + "step": 84420 + }, + { + "epoch": 5.736173393124066, + "grad_norm": 0.21780893206596375, + "learning_rate": 2.8327558092132087e-06, + "loss": 3.9745, + "step": 84425 + }, + { + "epoch": 5.736513113194728, + "grad_norm": 0.24153152108192444, + "learning_rate": 2.8323311591248815e-06, + "loss": 4.0132, + "step": 84430 + }, + { + "epoch": 5.736852833265389, + "grad_norm": 0.28158655762672424, + "learning_rate": 2.831906509036554e-06, + "loss": 4.018, + "step": 84435 + }, + { + "epoch": 5.737192553336051, + "grad_norm": 0.25689876079559326, + "learning_rate": 2.8314818589482267e-06, + "loss": 3.7705, + "step": 84440 + }, + { + "epoch": 5.737532273406713, + "grad_norm": 0.37134066224098206, + "learning_rate": 2.8310572088599e-06, + "loss": 3.5644, + "step": 84445 + }, + { + "epoch": 5.737871993477374, + "grad_norm": 0.34128621220588684, + "learning_rate": 2.8306325587715727e-06, + "loss": 3.9812, + "step": 84450 + }, + { + "epoch": 5.738211713548036, + "grad_norm": 0.2657933831214905, + "learning_rate": 2.830207908683245e-06, + "loss": 3.8974, + "step": 84455 + }, + { + "epoch": 5.7385514336186985, + "grad_norm": 0.20297449827194214, + "learning_rate": 2.8297832585949183e-06, + "loss": 3.7993, + "step": 84460 + }, + { + "epoch": 5.73889115368936, + "grad_norm": 0.28878265619277954, + "learning_rate": 2.829358608506591e-06, + "loss": 3.9304, + "step": 84465 + }, + { + "epoch": 5.739230873760022, + "grad_norm": 0.27709507942199707, + "learning_rate": 2.8289339584182635e-06, + "loss": 4.1125, + "step": 84470 + }, + { + "epoch": 5.739570593830684, + "grad_norm": 0.23331928253173828, + "learning_rate": 2.8285093083299363e-06, + "loss": 4.0838, + "step": 84475 + }, + { + "epoch": 5.739910313901345, + "grad_norm": 0.33194369077682495, + "learning_rate": 2.8280846582416095e-06, + "loss": 3.8739, + "step": 84480 + }, + { + "epoch": 5.740250033972007, + "grad_norm": 0.24417953193187714, + "learning_rate": 2.827660008153282e-06, + "loss": 3.8713, + "step": 84485 + }, + { + "epoch": 5.740589754042669, + "grad_norm": 0.2984294593334198, + "learning_rate": 2.8272353580649547e-06, + "loss": 3.8775, + "step": 84490 + }, + { + "epoch": 5.74092947411333, + "grad_norm": 0.25856566429138184, + "learning_rate": 2.826810707976628e-06, + "loss": 3.9282, + "step": 84495 + }, + { + "epoch": 5.741269194183992, + "grad_norm": 0.28351056575775146, + "learning_rate": 2.8263860578883003e-06, + "loss": 3.8161, + "step": 84500 + }, + { + "epoch": 5.7416089142546545, + "grad_norm": 0.24350763857364655, + "learning_rate": 2.825961407799973e-06, + "loss": 3.7599, + "step": 84505 + }, + { + "epoch": 5.741948634325316, + "grad_norm": 0.27654945850372314, + "learning_rate": 2.825536757711646e-06, + "loss": 3.6136, + "step": 84510 + }, + { + "epoch": 5.742288354395978, + "grad_norm": 0.21812257170677185, + "learning_rate": 2.8251121076233182e-06, + "loss": 3.9063, + "step": 84515 + }, + { + "epoch": 5.74262807446664, + "grad_norm": 0.26631835103034973, + "learning_rate": 2.8246874575349915e-06, + "loss": 3.9461, + "step": 84520 + }, + { + "epoch": 5.742967794537301, + "grad_norm": 0.21026699244976044, + "learning_rate": 2.8242628074466643e-06, + "loss": 3.9094, + "step": 84525 + }, + { + "epoch": 5.743307514607963, + "grad_norm": 0.19853441417217255, + "learning_rate": 2.8238381573583367e-06, + "loss": 4.0312, + "step": 84530 + }, + { + "epoch": 5.743647234678625, + "grad_norm": 0.2916458547115326, + "learning_rate": 2.82341350727001e-06, + "loss": 3.986, + "step": 84535 + }, + { + "epoch": 5.743986954749286, + "grad_norm": 0.22729970514774323, + "learning_rate": 2.8229888571816827e-06, + "loss": 4.091, + "step": 84540 + }, + { + "epoch": 5.744326674819948, + "grad_norm": 0.25219330191612244, + "learning_rate": 2.822564207093355e-06, + "loss": 3.8731, + "step": 84545 + }, + { + "epoch": 5.7446663948906105, + "grad_norm": 0.2675528824329376, + "learning_rate": 2.822139557005028e-06, + "loss": 3.8435, + "step": 84550 + }, + { + "epoch": 5.745006114961272, + "grad_norm": 0.42449212074279785, + "learning_rate": 2.821714906916701e-06, + "loss": 3.8604, + "step": 84555 + }, + { + "epoch": 5.745345835031934, + "grad_norm": 0.22384966909885406, + "learning_rate": 2.8212902568283735e-06, + "loss": 3.8725, + "step": 84560 + }, + { + "epoch": 5.745685555102596, + "grad_norm": 0.2818704843521118, + "learning_rate": 2.8208656067400463e-06, + "loss": 3.8732, + "step": 84565 + }, + { + "epoch": 5.746025275173257, + "grad_norm": 0.21432916820049286, + "learning_rate": 2.8204409566517195e-06, + "loss": 4.1373, + "step": 84570 + }, + { + "epoch": 5.746364995243919, + "grad_norm": 0.2747924327850342, + "learning_rate": 2.820016306563392e-06, + "loss": 3.9211, + "step": 84575 + }, + { + "epoch": 5.746704715314581, + "grad_norm": 0.34900760650634766, + "learning_rate": 2.8195916564750647e-06, + "loss": 3.8585, + "step": 84580 + }, + { + "epoch": 5.747044435385242, + "grad_norm": 0.30061134696006775, + "learning_rate": 2.819167006386738e-06, + "loss": 4.0309, + "step": 84585 + }, + { + "epoch": 5.747384155455904, + "grad_norm": 0.29043689370155334, + "learning_rate": 2.8187423562984103e-06, + "loss": 4.1615, + "step": 84590 + }, + { + "epoch": 5.7477238755265665, + "grad_norm": 0.29384326934814453, + "learning_rate": 2.818317706210083e-06, + "loss": 3.8557, + "step": 84595 + }, + { + "epoch": 5.748063595597228, + "grad_norm": 0.3733677566051483, + "learning_rate": 2.817893056121756e-06, + "loss": 3.7572, + "step": 84600 + }, + { + "epoch": 5.74840331566789, + "grad_norm": 0.21843454241752625, + "learning_rate": 2.8174684060334282e-06, + "loss": 4.0273, + "step": 84605 + }, + { + "epoch": 5.748743035738551, + "grad_norm": 0.23675037920475006, + "learning_rate": 2.8170437559451015e-06, + "loss": 3.8382, + "step": 84610 + }, + { + "epoch": 5.749082755809213, + "grad_norm": 0.24163325130939484, + "learning_rate": 2.8166191058567743e-06, + "loss": 3.8267, + "step": 84615 + }, + { + "epoch": 5.749422475879875, + "grad_norm": 0.2703109681606293, + "learning_rate": 2.8161944557684475e-06, + "loss": 3.7624, + "step": 84620 + }, + { + "epoch": 5.749762195950536, + "grad_norm": 0.258698046207428, + "learning_rate": 2.81576980568012e-06, + "loss": 3.7636, + "step": 84625 + }, + { + "epoch": 5.750101916021198, + "grad_norm": 0.2961307764053345, + "learning_rate": 2.8153451555917927e-06, + "loss": 3.8439, + "step": 84630 + }, + { + "epoch": 5.75044163609186, + "grad_norm": 0.22865016758441925, + "learning_rate": 2.8149205055034655e-06, + "loss": 3.6869, + "step": 84635 + }, + { + "epoch": 5.750781356162522, + "grad_norm": 0.21448557078838348, + "learning_rate": 2.814495855415138e-06, + "loss": 3.8555, + "step": 84640 + }, + { + "epoch": 5.751121076233184, + "grad_norm": 0.20941589772701263, + "learning_rate": 2.814071205326811e-06, + "loss": 3.8657, + "step": 84645 + }, + { + "epoch": 5.751460796303846, + "grad_norm": 0.2515988349914551, + "learning_rate": 2.813646555238484e-06, + "loss": 3.7566, + "step": 84650 + }, + { + "epoch": 5.751800516374507, + "grad_norm": 0.2464701384305954, + "learning_rate": 2.8132219051501562e-06, + "loss": 3.8623, + "step": 84655 + }, + { + "epoch": 5.752140236445169, + "grad_norm": 0.27768266201019287, + "learning_rate": 2.8127972550618295e-06, + "loss": 4.0288, + "step": 84660 + }, + { + "epoch": 5.752479956515831, + "grad_norm": 0.26658153533935547, + "learning_rate": 2.8123726049735023e-06, + "loss": 3.8228, + "step": 84665 + }, + { + "epoch": 5.752819676586492, + "grad_norm": 0.2449149787425995, + "learning_rate": 2.8119479548851746e-06, + "loss": 3.9267, + "step": 84670 + }, + { + "epoch": 5.753159396657154, + "grad_norm": 0.2897111475467682, + "learning_rate": 2.8115233047968474e-06, + "loss": 3.7054, + "step": 84675 + }, + { + "epoch": 5.753499116727816, + "grad_norm": 0.29038840532302856, + "learning_rate": 2.8110986547085207e-06, + "loss": 4.122, + "step": 84680 + }, + { + "epoch": 5.753838836798478, + "grad_norm": 0.27583885192871094, + "learning_rate": 2.810674004620193e-06, + "loss": 3.77, + "step": 84685 + }, + { + "epoch": 5.75417855686914, + "grad_norm": 0.3491523265838623, + "learning_rate": 2.810249354531866e-06, + "loss": 3.8548, + "step": 84690 + }, + { + "epoch": 5.754518276939802, + "grad_norm": 0.46866604685783386, + "learning_rate": 2.809824704443539e-06, + "loss": 3.9558, + "step": 84695 + }, + { + "epoch": 5.754857997010463, + "grad_norm": 0.35908281803131104, + "learning_rate": 2.8094000543552115e-06, + "loss": 3.8107, + "step": 84700 + }, + { + "epoch": 5.755197717081125, + "grad_norm": 0.4196791648864746, + "learning_rate": 2.8089754042668843e-06, + "loss": 3.8826, + "step": 84705 + }, + { + "epoch": 5.755537437151787, + "grad_norm": 0.2850252687931061, + "learning_rate": 2.8085507541785575e-06, + "loss": 3.9247, + "step": 84710 + }, + { + "epoch": 5.755877157222448, + "grad_norm": 0.24998706579208374, + "learning_rate": 2.80812610409023e-06, + "loss": 3.4892, + "step": 84715 + }, + { + "epoch": 5.75621687729311, + "grad_norm": 0.31077083945274353, + "learning_rate": 2.8077014540019027e-06, + "loss": 3.8117, + "step": 84720 + }, + { + "epoch": 5.7565565973637725, + "grad_norm": 0.2998639941215515, + "learning_rate": 2.8072768039135755e-06, + "loss": 3.673, + "step": 84725 + }, + { + "epoch": 5.756896317434434, + "grad_norm": 0.26712238788604736, + "learning_rate": 2.806852153825248e-06, + "loss": 3.7916, + "step": 84730 + }, + { + "epoch": 5.757236037505096, + "grad_norm": 0.23312632739543915, + "learning_rate": 2.806427503736921e-06, + "loss": 3.717, + "step": 84735 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 0.256321519613266, + "learning_rate": 2.806002853648594e-06, + "loss": 3.8295, + "step": 84740 + }, + { + "epoch": 5.757915477646419, + "grad_norm": 0.3926994800567627, + "learning_rate": 2.8055782035602662e-06, + "loss": 3.9682, + "step": 84745 + }, + { + "epoch": 5.758255197717081, + "grad_norm": 0.2561582922935486, + "learning_rate": 2.8051535534719395e-06, + "loss": 3.8883, + "step": 84750 + }, + { + "epoch": 5.758594917787743, + "grad_norm": 0.23894882202148438, + "learning_rate": 2.8047289033836123e-06, + "loss": 4.2198, + "step": 84755 + }, + { + "epoch": 5.758934637858404, + "grad_norm": 0.28730571269989014, + "learning_rate": 2.8043042532952846e-06, + "loss": 3.7615, + "step": 84760 + }, + { + "epoch": 5.759274357929066, + "grad_norm": 0.28616833686828613, + "learning_rate": 2.8038796032069574e-06, + "loss": 4.1828, + "step": 84765 + }, + { + "epoch": 5.7596140779997285, + "grad_norm": 0.22881336510181427, + "learning_rate": 2.8034549531186307e-06, + "loss": 3.916, + "step": 84770 + }, + { + "epoch": 5.75995379807039, + "grad_norm": 0.33615460991859436, + "learning_rate": 2.803030303030303e-06, + "loss": 3.9049, + "step": 84775 + }, + { + "epoch": 5.760293518141052, + "grad_norm": 0.26335346698760986, + "learning_rate": 2.802605652941976e-06, + "loss": 4.06, + "step": 84780 + }, + { + "epoch": 5.760633238211714, + "grad_norm": 0.258797287940979, + "learning_rate": 2.802181002853649e-06, + "loss": 3.8674, + "step": 84785 + }, + { + "epoch": 5.760972958282375, + "grad_norm": 0.23528870940208435, + "learning_rate": 2.801756352765322e-06, + "loss": 3.7568, + "step": 84790 + }, + { + "epoch": 5.761312678353037, + "grad_norm": 0.26608020067214966, + "learning_rate": 2.8013317026769942e-06, + "loss": 3.8115, + "step": 84795 + }, + { + "epoch": 5.761652398423699, + "grad_norm": 0.30770665407180786, + "learning_rate": 2.800907052588667e-06, + "loss": 3.9217, + "step": 84800 + }, + { + "epoch": 5.76199211849436, + "grad_norm": 0.27119961380958557, + "learning_rate": 2.8004824025003403e-06, + "loss": 3.5291, + "step": 84805 + }, + { + "epoch": 5.762331838565022, + "grad_norm": 0.3104596436023712, + "learning_rate": 2.8000577524120126e-06, + "loss": 3.7799, + "step": 84810 + }, + { + "epoch": 5.7626715586356845, + "grad_norm": 0.24518941342830658, + "learning_rate": 2.7996331023236854e-06, + "loss": 4.1098, + "step": 84815 + }, + { + "epoch": 5.763011278706346, + "grad_norm": 0.23911333084106445, + "learning_rate": 2.7992084522353587e-06, + "loss": 4.0389, + "step": 84820 + }, + { + "epoch": 5.763350998777008, + "grad_norm": 0.21660949289798737, + "learning_rate": 2.798783802147031e-06, + "loss": 3.7318, + "step": 84825 + }, + { + "epoch": 5.76369071884767, + "grad_norm": 0.2817351818084717, + "learning_rate": 2.798359152058704e-06, + "loss": 4.0359, + "step": 84830 + }, + { + "epoch": 5.764030438918331, + "grad_norm": 0.2330397516489029, + "learning_rate": 2.7979345019703767e-06, + "loss": 3.9678, + "step": 84835 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 0.2820200026035309, + "learning_rate": 2.7975098518820495e-06, + "loss": 3.748, + "step": 84840 + }, + { + "epoch": 5.764709879059655, + "grad_norm": 0.2872520685195923, + "learning_rate": 2.7970852017937223e-06, + "loss": 3.8281, + "step": 84845 + }, + { + "epoch": 5.765049599130316, + "grad_norm": 0.2795083224773407, + "learning_rate": 2.796660551705395e-06, + "loss": 3.7471, + "step": 84850 + }, + { + "epoch": 5.765389319200978, + "grad_norm": 0.346250057220459, + "learning_rate": 2.7962359016170674e-06, + "loss": 3.9482, + "step": 84855 + }, + { + "epoch": 5.7657290392716405, + "grad_norm": 0.2000250369310379, + "learning_rate": 2.7958112515287407e-06, + "loss": 3.9299, + "step": 84860 + }, + { + "epoch": 5.766068759342302, + "grad_norm": 0.231111079454422, + "learning_rate": 2.7953866014404135e-06, + "loss": 3.9038, + "step": 84865 + }, + { + "epoch": 5.766408479412964, + "grad_norm": 0.2028869241476059, + "learning_rate": 2.795046881369752e-06, + "loss": 3.89, + "step": 84870 + }, + { + "epoch": 5.766748199483626, + "grad_norm": 0.29938066005706787, + "learning_rate": 2.7946222312814243e-06, + "loss": 3.9446, + "step": 84875 + }, + { + "epoch": 5.767087919554287, + "grad_norm": 0.27151572704315186, + "learning_rate": 2.794197581193097e-06, + "loss": 3.6886, + "step": 84880 + }, + { + "epoch": 5.767427639624949, + "grad_norm": 0.2192077487707138, + "learning_rate": 2.79377293110477e-06, + "loss": 3.9493, + "step": 84885 + }, + { + "epoch": 5.767767359695611, + "grad_norm": 0.25016698241233826, + "learning_rate": 2.7933482810164423e-06, + "loss": 3.9809, + "step": 84890 + }, + { + "epoch": 5.768107079766272, + "grad_norm": 0.2852131426334381, + "learning_rate": 2.7929236309281155e-06, + "loss": 3.9175, + "step": 84895 + }, + { + "epoch": 5.768446799836934, + "grad_norm": 0.24567528069019318, + "learning_rate": 2.7924989808397883e-06, + "loss": 3.9976, + "step": 84900 + }, + { + "epoch": 5.7687865199075965, + "grad_norm": 0.257901132106781, + "learning_rate": 2.7920743307514607e-06, + "loss": 4.0433, + "step": 84905 + }, + { + "epoch": 5.769126239978258, + "grad_norm": 0.2137065976858139, + "learning_rate": 2.791649680663134e-06, + "loss": 4.1187, + "step": 84910 + }, + { + "epoch": 5.76946596004892, + "grad_norm": 0.43799176812171936, + "learning_rate": 2.7912250305748067e-06, + "loss": 4.0643, + "step": 84915 + }, + { + "epoch": 5.769805680119582, + "grad_norm": 0.26342272758483887, + "learning_rate": 2.790800380486479e-06, + "loss": 3.8527, + "step": 84920 + }, + { + "epoch": 5.770145400190243, + "grad_norm": 0.32092973589897156, + "learning_rate": 2.7903757303981523e-06, + "loss": 4.0711, + "step": 84925 + }, + { + "epoch": 5.770485120260905, + "grad_norm": 0.23448272049427032, + "learning_rate": 2.789951080309825e-06, + "loss": 4.0113, + "step": 84930 + }, + { + "epoch": 5.770824840331567, + "grad_norm": 0.30452102422714233, + "learning_rate": 2.7895264302214975e-06, + "loss": 3.9583, + "step": 84935 + }, + { + "epoch": 5.771164560402228, + "grad_norm": 0.2781599462032318, + "learning_rate": 2.7891017801331703e-06, + "loss": 3.9623, + "step": 84940 + }, + { + "epoch": 5.77150428047289, + "grad_norm": 0.21815325319766998, + "learning_rate": 2.7886771300448435e-06, + "loss": 3.8347, + "step": 84945 + }, + { + "epoch": 5.7718440005435525, + "grad_norm": 0.2859082520008087, + "learning_rate": 2.788252479956516e-06, + "loss": 4.0907, + "step": 84950 + }, + { + "epoch": 5.772183720614214, + "grad_norm": 0.21702031791210175, + "learning_rate": 2.7878278298681887e-06, + "loss": 3.9578, + "step": 84955 + }, + { + "epoch": 5.772523440684876, + "grad_norm": 0.23003165423870087, + "learning_rate": 2.787403179779862e-06, + "loss": 3.9369, + "step": 84960 + }, + { + "epoch": 5.772863160755538, + "grad_norm": 0.20521068572998047, + "learning_rate": 2.7869785296915343e-06, + "loss": 3.9721, + "step": 84965 + }, + { + "epoch": 5.773202880826199, + "grad_norm": 0.245108962059021, + "learning_rate": 2.786553879603207e-06, + "loss": 3.8613, + "step": 84970 + }, + { + "epoch": 5.773542600896861, + "grad_norm": 0.3350934684276581, + "learning_rate": 2.78612922951488e-06, + "loss": 3.8533, + "step": 84975 + }, + { + "epoch": 5.773882320967523, + "grad_norm": 0.27091774344444275, + "learning_rate": 2.7857045794265523e-06, + "loss": 3.8373, + "step": 84980 + }, + { + "epoch": 5.774222041038184, + "grad_norm": 0.251117467880249, + "learning_rate": 2.7852799293382255e-06, + "loss": 3.7205, + "step": 84985 + }, + { + "epoch": 5.7745617611088464, + "grad_norm": 0.26524031162261963, + "learning_rate": 2.7848552792498983e-06, + "loss": 4.1499, + "step": 84990 + }, + { + "epoch": 5.7749014811795085, + "grad_norm": 0.2866824269294739, + "learning_rate": 2.7844306291615715e-06, + "loss": 3.8466, + "step": 84995 + }, + { + "epoch": 5.77524120125017, + "grad_norm": 0.2416212558746338, + "learning_rate": 2.784005979073244e-06, + "loss": 3.9319, + "step": 85000 + }, + { + "epoch": 5.775580921320832, + "grad_norm": 0.2332846075296402, + "learning_rate": 2.7835813289849167e-06, + "loss": 3.8964, + "step": 85005 + }, + { + "epoch": 5.775920641391494, + "grad_norm": 0.2593822777271271, + "learning_rate": 2.7831566788965895e-06, + "loss": 3.9194, + "step": 85010 + }, + { + "epoch": 5.776260361462155, + "grad_norm": 0.2290431261062622, + "learning_rate": 2.782732028808262e-06, + "loss": 4.0235, + "step": 85015 + }, + { + "epoch": 5.776600081532817, + "grad_norm": 0.28422895073890686, + "learning_rate": 2.782307378719935e-06, + "loss": 3.8641, + "step": 85020 + }, + { + "epoch": 5.776939801603479, + "grad_norm": 0.32015252113342285, + "learning_rate": 2.781882728631608e-06, + "loss": 3.7834, + "step": 85025 + }, + { + "epoch": 5.77727952167414, + "grad_norm": 0.22282104194164276, + "learning_rate": 2.7814580785432803e-06, + "loss": 3.9953, + "step": 85030 + }, + { + "epoch": 5.7776192417448025, + "grad_norm": 0.2886078357696533, + "learning_rate": 2.7810334284549535e-06, + "loss": 3.945, + "step": 85035 + }, + { + "epoch": 5.7779589618154645, + "grad_norm": 0.29133403301239014, + "learning_rate": 2.7806087783666263e-06, + "loss": 3.7321, + "step": 85040 + }, + { + "epoch": 5.778298681886126, + "grad_norm": 0.20400534570217133, + "learning_rate": 2.7801841282782987e-06, + "loss": 4.0978, + "step": 85045 + }, + { + "epoch": 5.778638401956788, + "grad_norm": 0.3559795320034027, + "learning_rate": 2.7797594781899715e-06, + "loss": 3.8633, + "step": 85050 + }, + { + "epoch": 5.77897812202745, + "grad_norm": 0.23112653195858002, + "learning_rate": 2.7793348281016447e-06, + "loss": 3.9297, + "step": 85055 + }, + { + "epoch": 5.779317842098111, + "grad_norm": 0.3088255524635315, + "learning_rate": 2.778910178013317e-06, + "loss": 3.8658, + "step": 85060 + }, + { + "epoch": 5.779657562168773, + "grad_norm": 0.25299856066703796, + "learning_rate": 2.77848552792499e-06, + "loss": 4.0317, + "step": 85065 + }, + { + "epoch": 5.779997282239434, + "grad_norm": 0.22183537483215332, + "learning_rate": 2.778060877836663e-06, + "loss": 3.561, + "step": 85070 + }, + { + "epoch": 5.780337002310096, + "grad_norm": 0.2478993684053421, + "learning_rate": 2.7776362277483355e-06, + "loss": 3.9496, + "step": 85075 + }, + { + "epoch": 5.7806767223807585, + "grad_norm": 0.22879357635974884, + "learning_rate": 2.7772115776600083e-06, + "loss": 3.8587, + "step": 85080 + }, + { + "epoch": 5.78101644245142, + "grad_norm": 0.32937929034233093, + "learning_rate": 2.7767869275716815e-06, + "loss": 3.6282, + "step": 85085 + }, + { + "epoch": 5.781356162522082, + "grad_norm": 0.23879374563694, + "learning_rate": 2.776362277483354e-06, + "loss": 4.0003, + "step": 85090 + }, + { + "epoch": 5.781695882592744, + "grad_norm": 0.2693329453468323, + "learning_rate": 2.7759376273950267e-06, + "loss": 3.9424, + "step": 85095 + }, + { + "epoch": 5.782035602663405, + "grad_norm": 0.2548830211162567, + "learning_rate": 2.7755129773066995e-06, + "loss": 3.7426, + "step": 85100 + }, + { + "epoch": 5.782375322734067, + "grad_norm": 0.44628459215164185, + "learning_rate": 2.775088327218372e-06, + "loss": 4.0273, + "step": 85105 + }, + { + "epoch": 5.782715042804729, + "grad_norm": 0.23572057485580444, + "learning_rate": 2.774663677130045e-06, + "loss": 3.9829, + "step": 85110 + }, + { + "epoch": 5.78305476287539, + "grad_norm": 0.31085827946662903, + "learning_rate": 2.774239027041718e-06, + "loss": 4.0296, + "step": 85115 + }, + { + "epoch": 5.783394482946052, + "grad_norm": 0.382607102394104, + "learning_rate": 2.7738143769533903e-06, + "loss": 3.8405, + "step": 85120 + }, + { + "epoch": 5.7837342030167145, + "grad_norm": 0.21775515377521515, + "learning_rate": 2.7733897268650635e-06, + "loss": 3.7157, + "step": 85125 + }, + { + "epoch": 5.784073923087376, + "grad_norm": 0.2861080765724182, + "learning_rate": 2.7729650767767363e-06, + "loss": 4.0504, + "step": 85130 + }, + { + "epoch": 5.784413643158038, + "grad_norm": 0.2537540793418884, + "learning_rate": 2.7725404266884087e-06, + "loss": 4.146, + "step": 85135 + }, + { + "epoch": 5.7847533632287, + "grad_norm": 0.31802669167518616, + "learning_rate": 2.7721157766000815e-06, + "loss": 3.8759, + "step": 85140 + }, + { + "epoch": 5.785093083299361, + "grad_norm": 0.25008365511894226, + "learning_rate": 2.7716911265117547e-06, + "loss": 3.8624, + "step": 85145 + }, + { + "epoch": 5.785432803370023, + "grad_norm": 0.23464539647102356, + "learning_rate": 2.771266476423427e-06, + "loss": 3.7587, + "step": 85150 + }, + { + "epoch": 5.785772523440685, + "grad_norm": 0.26675739884376526, + "learning_rate": 2.7708418263351e-06, + "loss": 3.8521, + "step": 85155 + }, + { + "epoch": 5.786112243511346, + "grad_norm": 0.260648250579834, + "learning_rate": 2.770417176246773e-06, + "loss": 3.9314, + "step": 85160 + }, + { + "epoch": 5.786451963582008, + "grad_norm": 0.2703969180583954, + "learning_rate": 2.769992526158446e-06, + "loss": 3.7974, + "step": 85165 + }, + { + "epoch": 5.7867916836526705, + "grad_norm": 0.27849140763282776, + "learning_rate": 2.7695678760701183e-06, + "loss": 3.9485, + "step": 85170 + }, + { + "epoch": 5.787131403723332, + "grad_norm": 0.2187633365392685, + "learning_rate": 2.769143225981791e-06, + "loss": 3.916, + "step": 85175 + }, + { + "epoch": 5.787471123793994, + "grad_norm": 0.2698267996311188, + "learning_rate": 2.7687185758934643e-06, + "loss": 4.1589, + "step": 85180 + }, + { + "epoch": 5.787810843864656, + "grad_norm": 0.26105353236198425, + "learning_rate": 2.7682939258051367e-06, + "loss": 3.8301, + "step": 85185 + }, + { + "epoch": 5.788150563935317, + "grad_norm": 0.25110888481140137, + "learning_rate": 2.7678692757168095e-06, + "loss": 4.1558, + "step": 85190 + }, + { + "epoch": 5.788490284005979, + "grad_norm": 0.26040539145469666, + "learning_rate": 2.7674446256284827e-06, + "loss": 3.9386, + "step": 85195 + }, + { + "epoch": 5.788830004076641, + "grad_norm": 0.3097170889377594, + "learning_rate": 2.767019975540155e-06, + "loss": 3.8235, + "step": 85200 + }, + { + "epoch": 5.789169724147302, + "grad_norm": 0.28506872057914734, + "learning_rate": 2.766595325451828e-06, + "loss": 3.8689, + "step": 85205 + }, + { + "epoch": 5.789509444217964, + "grad_norm": 0.28175532817840576, + "learning_rate": 2.766170675363501e-06, + "loss": 3.9592, + "step": 85210 + }, + { + "epoch": 5.7898491642886265, + "grad_norm": 0.26814010739326477, + "learning_rate": 2.7657460252751735e-06, + "loss": 3.7725, + "step": 85215 + }, + { + "epoch": 5.790188884359288, + "grad_norm": 0.281424880027771, + "learning_rate": 2.7653213751868463e-06, + "loss": 4.0388, + "step": 85220 + }, + { + "epoch": 5.79052860442995, + "grad_norm": 0.2723369598388672, + "learning_rate": 2.764896725098519e-06, + "loss": 3.9207, + "step": 85225 + }, + { + "epoch": 5.790868324500612, + "grad_norm": 0.28797340393066406, + "learning_rate": 2.7644720750101915e-06, + "loss": 4.03, + "step": 85230 + }, + { + "epoch": 5.791208044571273, + "grad_norm": 0.30048689246177673, + "learning_rate": 2.7640474249218647e-06, + "loss": 3.9663, + "step": 85235 + }, + { + "epoch": 5.791547764641935, + "grad_norm": 0.2674189805984497, + "learning_rate": 2.7636227748335375e-06, + "loss": 3.8422, + "step": 85240 + }, + { + "epoch": 5.791887484712597, + "grad_norm": 0.24327847361564636, + "learning_rate": 2.76319812474521e-06, + "loss": 4.1802, + "step": 85245 + }, + { + "epoch": 5.792227204783258, + "grad_norm": 0.31822317838668823, + "learning_rate": 2.762773474656883e-06, + "loss": 3.9137, + "step": 85250 + }, + { + "epoch": 5.79256692485392, + "grad_norm": 0.2906803488731384, + "learning_rate": 2.762348824568556e-06, + "loss": 3.8853, + "step": 85255 + }, + { + "epoch": 5.7929066449245825, + "grad_norm": 0.27664631605148315, + "learning_rate": 2.7619241744802283e-06, + "loss": 4.1131, + "step": 85260 + }, + { + "epoch": 5.793246364995244, + "grad_norm": 0.2909355163574219, + "learning_rate": 2.761499524391901e-06, + "loss": 4.0614, + "step": 85265 + }, + { + "epoch": 5.793586085065906, + "grad_norm": 0.47328007221221924, + "learning_rate": 2.7610748743035743e-06, + "loss": 3.6496, + "step": 85270 + }, + { + "epoch": 5.793925805136568, + "grad_norm": 0.22076773643493652, + "learning_rate": 2.7606502242152467e-06, + "loss": 3.9752, + "step": 85275 + }, + { + "epoch": 5.794265525207229, + "grad_norm": 0.2114812731742859, + "learning_rate": 2.7602255741269195e-06, + "loss": 4.0405, + "step": 85280 + }, + { + "epoch": 5.794605245277891, + "grad_norm": 0.21240346133708954, + "learning_rate": 2.7598009240385927e-06, + "loss": 3.9953, + "step": 85285 + }, + { + "epoch": 5.794944965348552, + "grad_norm": 0.22184278070926666, + "learning_rate": 2.759376273950265e-06, + "loss": 3.818, + "step": 85290 + }, + { + "epoch": 5.795284685419214, + "grad_norm": 0.27861708402633667, + "learning_rate": 2.758951623861938e-06, + "loss": 3.8942, + "step": 85295 + }, + { + "epoch": 5.7956244054898765, + "grad_norm": 0.2676212787628174, + "learning_rate": 2.7585269737736107e-06, + "loss": 4.1095, + "step": 85300 + }, + { + "epoch": 5.795964125560538, + "grad_norm": 0.3460161089897156, + "learning_rate": 2.7581023236852835e-06, + "loss": 4.0003, + "step": 85305 + }, + { + "epoch": 5.7963038456312, + "grad_norm": 0.26134124398231506, + "learning_rate": 2.7576776735969563e-06, + "loss": 3.9428, + "step": 85310 + }, + { + "epoch": 5.796643565701862, + "grad_norm": 0.2526608109474182, + "learning_rate": 2.757253023508629e-06, + "loss": 3.8622, + "step": 85315 + }, + { + "epoch": 5.796983285772523, + "grad_norm": 0.4127359390258789, + "learning_rate": 2.7568283734203015e-06, + "loss": 3.7414, + "step": 85320 + }, + { + "epoch": 5.797323005843185, + "grad_norm": 0.21110562980175018, + "learning_rate": 2.7564037233319747e-06, + "loss": 3.6766, + "step": 85325 + }, + { + "epoch": 5.797662725913847, + "grad_norm": 0.3324955403804779, + "learning_rate": 2.7559790732436475e-06, + "loss": 4.0771, + "step": 85330 + }, + { + "epoch": 5.798002445984508, + "grad_norm": 0.21481600403785706, + "learning_rate": 2.7555544231553203e-06, + "loss": 3.9055, + "step": 85335 + }, + { + "epoch": 5.79834216605517, + "grad_norm": 0.381757915019989, + "learning_rate": 2.755129773066993e-06, + "loss": 3.8194, + "step": 85340 + }, + { + "epoch": 5.7986818861258325, + "grad_norm": 0.41357195377349854, + "learning_rate": 2.754705122978666e-06, + "loss": 3.8113, + "step": 85345 + }, + { + "epoch": 5.799021606196494, + "grad_norm": 0.4169015884399414, + "learning_rate": 2.7542804728903387e-06, + "loss": 4.2178, + "step": 85350 + }, + { + "epoch": 5.799361326267156, + "grad_norm": 0.36831629276275635, + "learning_rate": 2.753855822802011e-06, + "loss": 3.9583, + "step": 85355 + }, + { + "epoch": 5.799701046337818, + "grad_norm": 0.2750004231929779, + "learning_rate": 2.7534311727136843e-06, + "loss": 3.9819, + "step": 85360 + }, + { + "epoch": 5.800040766408479, + "grad_norm": 0.2918323278427124, + "learning_rate": 2.753006522625357e-06, + "loss": 3.7571, + "step": 85365 + }, + { + "epoch": 5.800380486479141, + "grad_norm": 0.3573005199432373, + "learning_rate": 2.7525818725370295e-06, + "loss": 3.7697, + "step": 85370 + }, + { + "epoch": 5.800720206549803, + "grad_norm": 0.29390543699264526, + "learning_rate": 2.7521572224487027e-06, + "loss": 3.6768, + "step": 85375 + }, + { + "epoch": 5.801059926620464, + "grad_norm": 0.20695818960666656, + "learning_rate": 2.7517325723603755e-06, + "loss": 3.8372, + "step": 85380 + }, + { + "epoch": 5.801399646691126, + "grad_norm": 0.23123900592327118, + "learning_rate": 2.751307922272048e-06, + "loss": 3.8501, + "step": 85385 + }, + { + "epoch": 5.8017393667617885, + "grad_norm": 0.393703818321228, + "learning_rate": 2.7508832721837207e-06, + "loss": 3.8377, + "step": 85390 + }, + { + "epoch": 5.80207908683245, + "grad_norm": 0.26827502250671387, + "learning_rate": 2.750458622095394e-06, + "loss": 4.2625, + "step": 85395 + }, + { + "epoch": 5.802418806903112, + "grad_norm": 0.25647905468940735, + "learning_rate": 2.7500339720070663e-06, + "loss": 4.0208, + "step": 85400 + }, + { + "epoch": 5.802758526973774, + "grad_norm": 0.3231372833251953, + "learning_rate": 2.749609321918739e-06, + "loss": 4.1076, + "step": 85405 + }, + { + "epoch": 5.803098247044435, + "grad_norm": 0.29695919156074524, + "learning_rate": 2.7491846718304123e-06, + "loss": 4.0276, + "step": 85410 + }, + { + "epoch": 5.803437967115097, + "grad_norm": 0.3092087209224701, + "learning_rate": 2.7487600217420847e-06, + "loss": 4.0426, + "step": 85415 + }, + { + "epoch": 5.803777687185759, + "grad_norm": 0.20880603790283203, + "learning_rate": 2.7483353716537575e-06, + "loss": 3.7986, + "step": 85420 + }, + { + "epoch": 5.80411740725642, + "grad_norm": 0.345792293548584, + "learning_rate": 2.7479107215654303e-06, + "loss": 3.802, + "step": 85425 + }, + { + "epoch": 5.804457127327082, + "grad_norm": 0.2536841034889221, + "learning_rate": 2.747486071477103e-06, + "loss": 3.4809, + "step": 85430 + }, + { + "epoch": 5.8047968473977445, + "grad_norm": 0.3230089545249939, + "learning_rate": 2.747061421388776e-06, + "loss": 4.1017, + "step": 85435 + }, + { + "epoch": 5.805136567468406, + "grad_norm": 0.22112105786800385, + "learning_rate": 2.7466367713004487e-06, + "loss": 3.8347, + "step": 85440 + }, + { + "epoch": 5.805476287539068, + "grad_norm": 0.24518445134162903, + "learning_rate": 2.746212121212121e-06, + "loss": 3.8767, + "step": 85445 + }, + { + "epoch": 5.80581600760973, + "grad_norm": 0.333987295627594, + "learning_rate": 2.7457874711237943e-06, + "loss": 3.951, + "step": 85450 + }, + { + "epoch": 5.806155727680391, + "grad_norm": 0.32057642936706543, + "learning_rate": 2.745362821035467e-06, + "loss": 3.9071, + "step": 85455 + }, + { + "epoch": 5.806495447751053, + "grad_norm": 0.24445952475070953, + "learning_rate": 2.7449381709471395e-06, + "loss": 3.9316, + "step": 85460 + }, + { + "epoch": 5.806835167821715, + "grad_norm": 0.2309802770614624, + "learning_rate": 2.7445135208588127e-06, + "loss": 4.0216, + "step": 85465 + }, + { + "epoch": 5.807174887892376, + "grad_norm": 0.26010802388191223, + "learning_rate": 2.7440888707704855e-06, + "loss": 3.8144, + "step": 85470 + }, + { + "epoch": 5.807514607963038, + "grad_norm": 0.2683320939540863, + "learning_rate": 2.743664220682158e-06, + "loss": 3.88, + "step": 85475 + }, + { + "epoch": 5.8078543280337005, + "grad_norm": 0.21299664676189423, + "learning_rate": 2.7432395705938307e-06, + "loss": 3.856, + "step": 85480 + }, + { + "epoch": 5.808194048104362, + "grad_norm": 0.2492484599351883, + "learning_rate": 2.742814920505504e-06, + "loss": 3.9487, + "step": 85485 + }, + { + "epoch": 5.808533768175024, + "grad_norm": 0.2732219696044922, + "learning_rate": 2.7423902704171763e-06, + "loss": 3.8574, + "step": 85490 + }, + { + "epoch": 5.808873488245686, + "grad_norm": 0.2841470539569855, + "learning_rate": 2.741965620328849e-06, + "loss": 3.8072, + "step": 85495 + }, + { + "epoch": 5.809213208316347, + "grad_norm": 0.27419427037239075, + "learning_rate": 2.7415409702405223e-06, + "loss": 3.9611, + "step": 85500 + }, + { + "epoch": 5.809552928387009, + "grad_norm": 0.24535644054412842, + "learning_rate": 2.741116320152195e-06, + "loss": 4.0523, + "step": 85505 + }, + { + "epoch": 5.809892648457671, + "grad_norm": 0.23804450035095215, + "learning_rate": 2.7406916700638675e-06, + "loss": 3.7535, + "step": 85510 + }, + { + "epoch": 5.810232368528332, + "grad_norm": 0.2872118353843689, + "learning_rate": 2.7402670199755403e-06, + "loss": 3.8993, + "step": 85515 + }, + { + "epoch": 5.810572088598994, + "grad_norm": 0.20515745878219604, + "learning_rate": 2.7398423698872135e-06, + "loss": 3.8492, + "step": 85520 + }, + { + "epoch": 5.8109118086696565, + "grad_norm": 0.3342312276363373, + "learning_rate": 2.739417719798886e-06, + "loss": 4.0749, + "step": 85525 + }, + { + "epoch": 5.811251528740318, + "grad_norm": 0.28248244524002075, + "learning_rate": 2.7389930697105587e-06, + "loss": 3.955, + "step": 85530 + }, + { + "epoch": 5.81159124881098, + "grad_norm": 0.22288298606872559, + "learning_rate": 2.738568419622232e-06, + "loss": 3.9369, + "step": 85535 + }, + { + "epoch": 5.811930968881642, + "grad_norm": 0.37915006279945374, + "learning_rate": 2.7381437695339043e-06, + "loss": 3.9732, + "step": 85540 + }, + { + "epoch": 5.812270688952303, + "grad_norm": 0.21465426683425903, + "learning_rate": 2.737719119445577e-06, + "loss": 3.8438, + "step": 85545 + }, + { + "epoch": 5.812610409022965, + "grad_norm": 0.24277891218662262, + "learning_rate": 2.73729446935725e-06, + "loss": 3.9044, + "step": 85550 + }, + { + "epoch": 5.812950129093627, + "grad_norm": 0.38023045659065247, + "learning_rate": 2.7368698192689223e-06, + "loss": 3.8148, + "step": 85555 + }, + { + "epoch": 5.813289849164288, + "grad_norm": 0.24016793072223663, + "learning_rate": 2.7364451691805955e-06, + "loss": 3.9991, + "step": 85560 + }, + { + "epoch": 5.8136295692349504, + "grad_norm": 0.20631346106529236, + "learning_rate": 2.7360205190922683e-06, + "loss": 3.8151, + "step": 85565 + }, + { + "epoch": 5.8139692893056125, + "grad_norm": 0.252115398645401, + "learning_rate": 2.7355958690039407e-06, + "loss": 3.8293, + "step": 85570 + }, + { + "epoch": 5.814309009376274, + "grad_norm": 0.24563837051391602, + "learning_rate": 2.735171218915614e-06, + "loss": 3.8583, + "step": 85575 + }, + { + "epoch": 5.814648729446936, + "grad_norm": 0.25683754682540894, + "learning_rate": 2.7347465688272867e-06, + "loss": 4.2153, + "step": 85580 + }, + { + "epoch": 5.814988449517598, + "grad_norm": 0.25055113434791565, + "learning_rate": 2.734321918738959e-06, + "loss": 3.8524, + "step": 85585 + }, + { + "epoch": 5.815328169588259, + "grad_norm": 0.2739469110965729, + "learning_rate": 2.7338972686506323e-06, + "loss": 3.6595, + "step": 85590 + }, + { + "epoch": 5.815667889658921, + "grad_norm": 0.30822518467903137, + "learning_rate": 2.733472618562305e-06, + "loss": 3.903, + "step": 85595 + }, + { + "epoch": 5.816007609729583, + "grad_norm": 0.24410194158554077, + "learning_rate": 2.7330479684739775e-06, + "loss": 3.9751, + "step": 85600 + }, + { + "epoch": 5.816347329800244, + "grad_norm": 0.27141621708869934, + "learning_rate": 2.7326233183856503e-06, + "loss": 3.9205, + "step": 85605 + }, + { + "epoch": 5.8166870498709065, + "grad_norm": 0.3088611364364624, + "learning_rate": 2.7321986682973235e-06, + "loss": 3.7614, + "step": 85610 + }, + { + "epoch": 5.8170267699415685, + "grad_norm": 0.3364398181438446, + "learning_rate": 2.731774018208996e-06, + "loss": 4.1404, + "step": 85615 + }, + { + "epoch": 5.81736649001223, + "grad_norm": 0.32362186908721924, + "learning_rate": 2.7313493681206687e-06, + "loss": 3.988, + "step": 85620 + }, + { + "epoch": 5.817706210082892, + "grad_norm": 0.2989341914653778, + "learning_rate": 2.730924718032342e-06, + "loss": 3.7892, + "step": 85625 + }, + { + "epoch": 5.818045930153554, + "grad_norm": 0.21933025121688843, + "learning_rate": 2.7305000679440143e-06, + "loss": 3.8783, + "step": 85630 + }, + { + "epoch": 5.818385650224215, + "grad_norm": 0.32249605655670166, + "learning_rate": 2.730075417855687e-06, + "loss": 3.8486, + "step": 85635 + }, + { + "epoch": 5.818725370294877, + "grad_norm": 0.26092857122421265, + "learning_rate": 2.72965076776736e-06, + "loss": 3.9516, + "step": 85640 + }, + { + "epoch": 5.819065090365539, + "grad_norm": 0.31537100672721863, + "learning_rate": 2.7292261176790323e-06, + "loss": 3.9754, + "step": 85645 + }, + { + "epoch": 5.8194048104362, + "grad_norm": 0.33854612708091736, + "learning_rate": 2.7288014675907055e-06, + "loss": 3.9383, + "step": 85650 + }, + { + "epoch": 5.8197445305068625, + "grad_norm": 0.1909671425819397, + "learning_rate": 2.7283768175023783e-06, + "loss": 3.8147, + "step": 85655 + }, + { + "epoch": 5.8200842505775245, + "grad_norm": 0.2604241967201233, + "learning_rate": 2.7279521674140507e-06, + "loss": 4.038, + "step": 85660 + }, + { + "epoch": 5.820423970648186, + "grad_norm": 0.46679067611694336, + "learning_rate": 2.727527517325724e-06, + "loss": 3.9886, + "step": 85665 + }, + { + "epoch": 5.820763690718848, + "grad_norm": 0.3277437090873718, + "learning_rate": 2.7271028672373967e-06, + "loss": 4.1532, + "step": 85670 + }, + { + "epoch": 5.82110341078951, + "grad_norm": 0.23466487228870392, + "learning_rate": 2.7266782171490695e-06, + "loss": 4.0062, + "step": 85675 + }, + { + "epoch": 5.821443130860171, + "grad_norm": 0.22395247220993042, + "learning_rate": 2.726253567060742e-06, + "loss": 3.9219, + "step": 85680 + }, + { + "epoch": 5.821782850930833, + "grad_norm": 0.2806961238384247, + "learning_rate": 2.725828916972415e-06, + "loss": 3.6705, + "step": 85685 + }, + { + "epoch": 5.822122571001495, + "grad_norm": 0.28975096344947815, + "learning_rate": 2.725404266884088e-06, + "loss": 3.749, + "step": 85690 + }, + { + "epoch": 5.822462291072156, + "grad_norm": 0.2592499852180481, + "learning_rate": 2.7249796167957603e-06, + "loss": 4.0616, + "step": 85695 + }, + { + "epoch": 5.8228020111428185, + "grad_norm": 0.26604732871055603, + "learning_rate": 2.7245549667074335e-06, + "loss": 3.746, + "step": 85700 + }, + { + "epoch": 5.8231417312134806, + "grad_norm": 0.5440488457679749, + "learning_rate": 2.7241303166191063e-06, + "loss": 4.0248, + "step": 85705 + }, + { + "epoch": 5.823481451284142, + "grad_norm": 0.2720935344696045, + "learning_rate": 2.7237056665307787e-06, + "loss": 3.7598, + "step": 85710 + }, + { + "epoch": 5.823821171354804, + "grad_norm": 0.28318190574645996, + "learning_rate": 2.723281016442452e-06, + "loss": 3.6862, + "step": 85715 + }, + { + "epoch": 5.824160891425466, + "grad_norm": 0.37112605571746826, + "learning_rate": 2.7228563663541247e-06, + "loss": 4.101, + "step": 85720 + }, + { + "epoch": 5.824500611496127, + "grad_norm": 0.3031947612762451, + "learning_rate": 2.722431716265797e-06, + "loss": 3.9638, + "step": 85725 + }, + { + "epoch": 5.824840331566789, + "grad_norm": 0.3083445727825165, + "learning_rate": 2.72200706617747e-06, + "loss": 3.9475, + "step": 85730 + }, + { + "epoch": 5.825180051637451, + "grad_norm": 0.6018026471138, + "learning_rate": 2.721582416089143e-06, + "loss": 3.6079, + "step": 85735 + }, + { + "epoch": 5.825519771708112, + "grad_norm": 0.24223704636096954, + "learning_rate": 2.7211577660008155e-06, + "loss": 3.7971, + "step": 85740 + }, + { + "epoch": 5.8258594917787745, + "grad_norm": 0.3085381090641022, + "learning_rate": 2.7207331159124883e-06, + "loss": 3.9644, + "step": 85745 + }, + { + "epoch": 5.826199211849436, + "grad_norm": 0.29137036204338074, + "learning_rate": 2.7203084658241615e-06, + "loss": 3.848, + "step": 85750 + }, + { + "epoch": 5.826538931920098, + "grad_norm": 0.39157673716545105, + "learning_rate": 2.719883815735834e-06, + "loss": 3.8798, + "step": 85755 + }, + { + "epoch": 5.82687865199076, + "grad_norm": 0.2457866221666336, + "learning_rate": 2.7194591656475067e-06, + "loss": 3.8213, + "step": 85760 + }, + { + "epoch": 5.827218372061421, + "grad_norm": 0.25346946716308594, + "learning_rate": 2.7190345155591795e-06, + "loss": 4.0834, + "step": 85765 + }, + { + "epoch": 5.827558092132083, + "grad_norm": 0.31743764877319336, + "learning_rate": 2.718609865470852e-06, + "loss": 3.8075, + "step": 85770 + }, + { + "epoch": 5.827897812202745, + "grad_norm": 0.2689109742641449, + "learning_rate": 2.718185215382525e-06, + "loss": 3.81, + "step": 85775 + }, + { + "epoch": 5.828237532273406, + "grad_norm": 0.2294766753911972, + "learning_rate": 2.717760565294198e-06, + "loss": 3.8849, + "step": 85780 + }, + { + "epoch": 5.828577252344068, + "grad_norm": 0.2271774560213089, + "learning_rate": 2.7173359152058703e-06, + "loss": 3.9684, + "step": 85785 + }, + { + "epoch": 5.8289169724147305, + "grad_norm": 0.24826525151729584, + "learning_rate": 2.7169112651175435e-06, + "loss": 3.8661, + "step": 85790 + }, + { + "epoch": 5.829256692485392, + "grad_norm": 0.2605609893798828, + "learning_rate": 2.7164866150292163e-06, + "loss": 4.0546, + "step": 85795 + }, + { + "epoch": 5.829596412556054, + "grad_norm": 0.2474084049463272, + "learning_rate": 2.7160619649408887e-06, + "loss": 3.7194, + "step": 85800 + }, + { + "epoch": 5.829936132626716, + "grad_norm": 0.25969916582107544, + "learning_rate": 2.7156373148525615e-06, + "loss": 3.8831, + "step": 85805 + }, + { + "epoch": 5.830275852697377, + "grad_norm": 0.24454019963741302, + "learning_rate": 2.7152126647642347e-06, + "loss": 3.9347, + "step": 85810 + }, + { + "epoch": 5.830615572768039, + "grad_norm": 0.39328405261039734, + "learning_rate": 2.714788014675907e-06, + "loss": 3.7971, + "step": 85815 + }, + { + "epoch": 5.830955292838701, + "grad_norm": 0.19945062696933746, + "learning_rate": 2.71436336458758e-06, + "loss": 3.8163, + "step": 85820 + }, + { + "epoch": 5.831295012909362, + "grad_norm": 0.2697293758392334, + "learning_rate": 2.713938714499253e-06, + "loss": 3.8419, + "step": 85825 + }, + { + "epoch": 5.831634732980024, + "grad_norm": 0.27910828590393066, + "learning_rate": 2.7135140644109255e-06, + "loss": 4.0034, + "step": 85830 + }, + { + "epoch": 5.8319744530506865, + "grad_norm": 0.17287078499794006, + "learning_rate": 2.7130894143225983e-06, + "loss": 4.1089, + "step": 85835 + }, + { + "epoch": 5.832314173121348, + "grad_norm": 0.20195186138153076, + "learning_rate": 2.712664764234271e-06, + "loss": 3.915, + "step": 85840 + }, + { + "epoch": 5.83265389319201, + "grad_norm": 0.2837425172328949, + "learning_rate": 2.7122401141459443e-06, + "loss": 3.8075, + "step": 85845 + }, + { + "epoch": 5.832993613262672, + "grad_norm": 0.2176184356212616, + "learning_rate": 2.7118154640576167e-06, + "loss": 3.8239, + "step": 85850 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.28859615325927734, + "learning_rate": 2.7113908139692895e-06, + "loss": 3.9672, + "step": 85855 + }, + { + "epoch": 5.833673053403995, + "grad_norm": 0.2164115011692047, + "learning_rate": 2.7109661638809627e-06, + "loss": 3.9264, + "step": 85860 + }, + { + "epoch": 5.834012773474657, + "grad_norm": 0.3082285225391388, + "learning_rate": 2.710541513792635e-06, + "loss": 3.9612, + "step": 85865 + }, + { + "epoch": 5.834352493545318, + "grad_norm": 0.24182312190532684, + "learning_rate": 2.710116863704308e-06, + "loss": 3.989, + "step": 85870 + }, + { + "epoch": 5.8346922136159804, + "grad_norm": 0.2419433295726776, + "learning_rate": 2.709692213615981e-06, + "loss": 3.8371, + "step": 85875 + }, + { + "epoch": 5.8350319336866425, + "grad_norm": 0.20382536947727203, + "learning_rate": 2.7092675635276535e-06, + "loss": 3.8265, + "step": 85880 + }, + { + "epoch": 5.835371653757304, + "grad_norm": 0.3004249334335327, + "learning_rate": 2.7088429134393263e-06, + "loss": 3.8748, + "step": 85885 + }, + { + "epoch": 5.835711373827966, + "grad_norm": 0.3007656931877136, + "learning_rate": 2.708418263350999e-06, + "loss": 3.9379, + "step": 85890 + }, + { + "epoch": 5.836051093898628, + "grad_norm": 0.2354200929403305, + "learning_rate": 2.7079936132626714e-06, + "loss": 3.8631, + "step": 85895 + }, + { + "epoch": 5.836390813969289, + "grad_norm": 0.29477834701538086, + "learning_rate": 2.7075689631743447e-06, + "loss": 3.8294, + "step": 85900 + }, + { + "epoch": 5.836730534039951, + "grad_norm": 0.47197258472442627, + "learning_rate": 2.7071443130860175e-06, + "loss": 3.7746, + "step": 85905 + }, + { + "epoch": 5.837070254110613, + "grad_norm": 0.25371983647346497, + "learning_rate": 2.70671966299769e-06, + "loss": 3.9293, + "step": 85910 + }, + { + "epoch": 5.837409974181274, + "grad_norm": 0.2552107572555542, + "learning_rate": 2.706295012909363e-06, + "loss": 3.7027, + "step": 85915 + }, + { + "epoch": 5.8377496942519365, + "grad_norm": 0.2557598650455475, + "learning_rate": 2.705870362821036e-06, + "loss": 3.9761, + "step": 85920 + }, + { + "epoch": 5.8380894143225985, + "grad_norm": 0.23055946826934814, + "learning_rate": 2.7054457127327083e-06, + "loss": 3.914, + "step": 85925 + }, + { + "epoch": 5.83842913439326, + "grad_norm": 0.22650998830795288, + "learning_rate": 2.705021062644381e-06, + "loss": 3.9563, + "step": 85930 + }, + { + "epoch": 5.838768854463922, + "grad_norm": 0.29954907298088074, + "learning_rate": 2.7045964125560543e-06, + "loss": 3.7507, + "step": 85935 + }, + { + "epoch": 5.839108574534584, + "grad_norm": 0.21887029707431793, + "learning_rate": 2.7041717624677267e-06, + "loss": 3.6217, + "step": 85940 + }, + { + "epoch": 5.839448294605245, + "grad_norm": 0.22387494146823883, + "learning_rate": 2.7037471123793995e-06, + "loss": 3.8108, + "step": 85945 + }, + { + "epoch": 5.839788014675907, + "grad_norm": 0.21337418258190155, + "learning_rate": 2.7033224622910727e-06, + "loss": 4.0272, + "step": 85950 + }, + { + "epoch": 5.840127734746569, + "grad_norm": 0.2642262279987335, + "learning_rate": 2.702897812202745e-06, + "loss": 4.0018, + "step": 85955 + }, + { + "epoch": 5.84046745481723, + "grad_norm": 0.35794076323509216, + "learning_rate": 2.702473162114418e-06, + "loss": 3.8057, + "step": 85960 + }, + { + "epoch": 5.8408071748878925, + "grad_norm": 0.24974121153354645, + "learning_rate": 2.7020485120260907e-06, + "loss": 3.8108, + "step": 85965 + }, + { + "epoch": 5.841146894958554, + "grad_norm": 0.39745020866394043, + "learning_rate": 2.7016238619377635e-06, + "loss": 3.6632, + "step": 85970 + }, + { + "epoch": 5.841486615029216, + "grad_norm": 0.20841476321220398, + "learning_rate": 2.7011992118494363e-06, + "loss": 3.8421, + "step": 85975 + }, + { + "epoch": 5.841826335099878, + "grad_norm": 0.2700100243091583, + "learning_rate": 2.700774561761109e-06, + "loss": 3.8456, + "step": 85980 + }, + { + "epoch": 5.842166055170539, + "grad_norm": 0.25755542516708374, + "learning_rate": 2.7003499116727814e-06, + "loss": 4.2504, + "step": 85985 + }, + { + "epoch": 5.842505775241201, + "grad_norm": 0.32419687509536743, + "learning_rate": 2.6999252615844547e-06, + "loss": 4.1347, + "step": 85990 + }, + { + "epoch": 5.842845495311863, + "grad_norm": 0.18625149130821228, + "learning_rate": 2.6995006114961275e-06, + "loss": 3.8067, + "step": 85995 + }, + { + "epoch": 5.843185215382524, + "grad_norm": 0.2726791501045227, + "learning_rate": 2.6990759614078e-06, + "loss": 3.9584, + "step": 86000 + }, + { + "epoch": 5.843524935453186, + "grad_norm": 0.31354260444641113, + "learning_rate": 2.698651311319473e-06, + "loss": 3.9567, + "step": 86005 + }, + { + "epoch": 5.8438646555238485, + "grad_norm": 0.4157947897911072, + "learning_rate": 2.698226661231146e-06, + "loss": 3.972, + "step": 86010 + }, + { + "epoch": 5.84420437559451, + "grad_norm": 0.28195902705192566, + "learning_rate": 2.6978020111428187e-06, + "loss": 3.9753, + "step": 86015 + }, + { + "epoch": 5.844544095665172, + "grad_norm": 0.25591209530830383, + "learning_rate": 2.697377361054491e-06, + "loss": 3.9431, + "step": 86020 + }, + { + "epoch": 5.844883815735834, + "grad_norm": 0.31202223896980286, + "learning_rate": 2.6969527109661643e-06, + "loss": 4.0523, + "step": 86025 + }, + { + "epoch": 5.845223535806495, + "grad_norm": 0.25854840874671936, + "learning_rate": 2.696528060877837e-06, + "loss": 4.1891, + "step": 86030 + }, + { + "epoch": 5.845563255877157, + "grad_norm": 0.25480636954307556, + "learning_rate": 2.6961034107895094e-06, + "loss": 3.8446, + "step": 86035 + }, + { + "epoch": 5.845902975947819, + "grad_norm": 0.2533976137638092, + "learning_rate": 2.6956787607011827e-06, + "loss": 3.8484, + "step": 86040 + }, + { + "epoch": 5.84624269601848, + "grad_norm": 0.2859461307525635, + "learning_rate": 2.6952541106128555e-06, + "loss": 3.9662, + "step": 86045 + }, + { + "epoch": 5.846582416089142, + "grad_norm": 0.2812875807285309, + "learning_rate": 2.694829460524528e-06, + "loss": 3.7596, + "step": 86050 + }, + { + "epoch": 5.8469221361598045, + "grad_norm": 0.2885573208332062, + "learning_rate": 2.6944048104362007e-06, + "loss": 3.7254, + "step": 86055 + }, + { + "epoch": 5.847261856230466, + "grad_norm": 0.25165170431137085, + "learning_rate": 2.693980160347874e-06, + "loss": 3.8298, + "step": 86060 + }, + { + "epoch": 5.847601576301128, + "grad_norm": 0.22094763815402985, + "learning_rate": 2.6935555102595463e-06, + "loss": 3.7587, + "step": 86065 + }, + { + "epoch": 5.84794129637179, + "grad_norm": 0.2688106298446655, + "learning_rate": 2.693130860171219e-06, + "loss": 3.7282, + "step": 86070 + }, + { + "epoch": 5.848281016442451, + "grad_norm": 0.2536678910255432, + "learning_rate": 2.6927062100828923e-06, + "loss": 4.1459, + "step": 86075 + }, + { + "epoch": 5.848620736513113, + "grad_norm": 0.2694118022918701, + "learning_rate": 2.6922815599945647e-06, + "loss": 3.7127, + "step": 86080 + }, + { + "epoch": 5.848960456583775, + "grad_norm": 0.22865092754364014, + "learning_rate": 2.6918569099062375e-06, + "loss": 4.0339, + "step": 86085 + }, + { + "epoch": 5.849300176654436, + "grad_norm": 0.25786054134368896, + "learning_rate": 2.6914322598179103e-06, + "loss": 4.0822, + "step": 86090 + }, + { + "epoch": 5.849639896725098, + "grad_norm": 0.2924213409423828, + "learning_rate": 2.691007609729583e-06, + "loss": 3.8671, + "step": 86095 + }, + { + "epoch": 5.8499796167957605, + "grad_norm": 0.2695609927177429, + "learning_rate": 2.690582959641256e-06, + "loss": 3.6935, + "step": 86100 + }, + { + "epoch": 5.850319336866422, + "grad_norm": 0.20453032851219177, + "learning_rate": 2.6901583095529287e-06, + "loss": 3.8202, + "step": 86105 + }, + { + "epoch": 5.850659056937084, + "grad_norm": 0.2243889421224594, + "learning_rate": 2.689733659464601e-06, + "loss": 3.741, + "step": 86110 + }, + { + "epoch": 5.850998777007746, + "grad_norm": 0.2716630697250366, + "learning_rate": 2.6893090093762743e-06, + "loss": 3.6604, + "step": 86115 + }, + { + "epoch": 5.851338497078407, + "grad_norm": 0.2350020408630371, + "learning_rate": 2.688884359287947e-06, + "loss": 3.8177, + "step": 86120 + }, + { + "epoch": 5.851678217149069, + "grad_norm": 0.2595739960670471, + "learning_rate": 2.6884597091996194e-06, + "loss": 3.6814, + "step": 86125 + }, + { + "epoch": 5.852017937219731, + "grad_norm": 0.21184588968753815, + "learning_rate": 2.6880350591112927e-06, + "loss": 4.0144, + "step": 86130 + }, + { + "epoch": 5.852357657290392, + "grad_norm": 0.26341164112091064, + "learning_rate": 2.6876104090229655e-06, + "loss": 3.8881, + "step": 86135 + }, + { + "epoch": 5.852697377361054, + "grad_norm": 0.25539717078208923, + "learning_rate": 2.687185758934638e-06, + "loss": 3.9455, + "step": 86140 + }, + { + "epoch": 5.8530370974317165, + "grad_norm": 0.31812530755996704, + "learning_rate": 2.6867611088463106e-06, + "loss": 3.7627, + "step": 86145 + }, + { + "epoch": 5.853376817502378, + "grad_norm": 0.22530047595500946, + "learning_rate": 2.686336458757984e-06, + "loss": 3.9962, + "step": 86150 + }, + { + "epoch": 5.85371653757304, + "grad_norm": 0.2655661702156067, + "learning_rate": 2.6859118086696562e-06, + "loss": 3.9484, + "step": 86155 + }, + { + "epoch": 5.854056257643702, + "grad_norm": 0.18980728089809418, + "learning_rate": 2.685487158581329e-06, + "loss": 3.6726, + "step": 86160 + }, + { + "epoch": 5.854395977714363, + "grad_norm": 0.21853473782539368, + "learning_rate": 2.6850625084930023e-06, + "loss": 4.0382, + "step": 86165 + }, + { + "epoch": 5.854735697785025, + "grad_norm": 0.3893739581108093, + "learning_rate": 2.6846378584046746e-06, + "loss": 4.0157, + "step": 86170 + }, + { + "epoch": 5.855075417855687, + "grad_norm": 0.26366090774536133, + "learning_rate": 2.6842132083163474e-06, + "loss": 3.9315, + "step": 86175 + }, + { + "epoch": 5.855415137926348, + "grad_norm": 0.24138082563877106, + "learning_rate": 2.6837885582280202e-06, + "loss": 3.8484, + "step": 86180 + }, + { + "epoch": 5.8557548579970105, + "grad_norm": 0.388569712638855, + "learning_rate": 2.6833639081396935e-06, + "loss": 3.985, + "step": 86185 + }, + { + "epoch": 5.8560945780676725, + "grad_norm": 0.2566661834716797, + "learning_rate": 2.682939258051366e-06, + "loss": 4.0008, + "step": 86190 + }, + { + "epoch": 5.856434298138334, + "grad_norm": 0.18722105026245117, + "learning_rate": 2.6825146079630386e-06, + "loss": 3.9069, + "step": 86195 + }, + { + "epoch": 5.856774018208996, + "grad_norm": 0.2816615700721741, + "learning_rate": 2.682089957874712e-06, + "loss": 3.9894, + "step": 86200 + }, + { + "epoch": 5.857113738279658, + "grad_norm": 0.3204309642314911, + "learning_rate": 2.6816653077863843e-06, + "loss": 4.0109, + "step": 86205 + }, + { + "epoch": 5.857453458350319, + "grad_norm": 0.3134375810623169, + "learning_rate": 2.681240657698057e-06, + "loss": 4.1692, + "step": 86210 + }, + { + "epoch": 5.857793178420981, + "grad_norm": 0.26607608795166016, + "learning_rate": 2.68081600760973e-06, + "loss": 4.0448, + "step": 86215 + }, + { + "epoch": 5.858132898491643, + "grad_norm": 0.22393743693828583, + "learning_rate": 2.6803913575214027e-06, + "loss": 3.9165, + "step": 86220 + }, + { + "epoch": 5.858472618562304, + "grad_norm": 0.23133909702301025, + "learning_rate": 2.6799667074330755e-06, + "loss": 3.7727, + "step": 86225 + }, + { + "epoch": 5.8588123386329665, + "grad_norm": 0.27474457025527954, + "learning_rate": 2.6795420573447483e-06, + "loss": 4.0192, + "step": 86230 + }, + { + "epoch": 5.8591520587036285, + "grad_norm": 0.30381760001182556, + "learning_rate": 2.6791174072564206e-06, + "loss": 3.6952, + "step": 86235 + }, + { + "epoch": 5.85949177877429, + "grad_norm": 0.33400362730026245, + "learning_rate": 2.678692757168094e-06, + "loss": 3.7785, + "step": 86240 + }, + { + "epoch": 5.859831498844952, + "grad_norm": 0.24806690216064453, + "learning_rate": 2.6782681070797667e-06, + "loss": 3.935, + "step": 86245 + }, + { + "epoch": 5.860171218915614, + "grad_norm": 0.34393247961997986, + "learning_rate": 2.677843456991439e-06, + "loss": 3.9625, + "step": 86250 + }, + { + "epoch": 5.860510938986275, + "grad_norm": 0.2210206240415573, + "learning_rate": 2.6774188069031123e-06, + "loss": 3.6756, + "step": 86255 + }, + { + "epoch": 5.860850659056937, + "grad_norm": 0.2667439579963684, + "learning_rate": 2.676994156814785e-06, + "loss": 3.7618, + "step": 86260 + }, + { + "epoch": 5.861190379127599, + "grad_norm": 0.2558465003967285, + "learning_rate": 2.6765695067264574e-06, + "loss": 3.9668, + "step": 86265 + }, + { + "epoch": 5.86153009919826, + "grad_norm": 0.25547119975090027, + "learning_rate": 2.6761448566381302e-06, + "loss": 3.9306, + "step": 86270 + }, + { + "epoch": 5.8618698192689225, + "grad_norm": 0.2738114297389984, + "learning_rate": 2.6757202065498035e-06, + "loss": 3.9142, + "step": 86275 + }, + { + "epoch": 5.8622095393395846, + "grad_norm": 0.21126213669776917, + "learning_rate": 2.675295556461476e-06, + "loss": 3.8346, + "step": 86280 + }, + { + "epoch": 5.862549259410246, + "grad_norm": 0.2910017967224121, + "learning_rate": 2.6748709063731486e-06, + "loss": 3.4926, + "step": 86285 + }, + { + "epoch": 5.862888979480908, + "grad_norm": 0.2292460799217224, + "learning_rate": 2.674446256284822e-06, + "loss": 3.7788, + "step": 86290 + }, + { + "epoch": 5.86322869955157, + "grad_norm": 0.24881599843502045, + "learning_rate": 2.6740216061964942e-06, + "loss": 3.9777, + "step": 86295 + }, + { + "epoch": 5.863568419622231, + "grad_norm": 0.23769991099834442, + "learning_rate": 2.673596956108167e-06, + "loss": 3.6075, + "step": 86300 + }, + { + "epoch": 5.863908139692893, + "grad_norm": 0.35333311557769775, + "learning_rate": 2.67317230601984e-06, + "loss": 3.9946, + "step": 86305 + }, + { + "epoch": 5.864247859763555, + "grad_norm": 0.2754904329776764, + "learning_rate": 2.6727476559315122e-06, + "loss": 4.1029, + "step": 86310 + }, + { + "epoch": 5.864587579834216, + "grad_norm": 0.25936296582221985, + "learning_rate": 2.6723230058431854e-06, + "loss": 4.0499, + "step": 86315 + }, + { + "epoch": 5.8649272999048785, + "grad_norm": 0.2295442819595337, + "learning_rate": 2.6718983557548582e-06, + "loss": 4.0778, + "step": 86320 + }, + { + "epoch": 5.865267019975541, + "grad_norm": 0.2143886238336563, + "learning_rate": 2.6714737056665306e-06, + "loss": 3.8437, + "step": 86325 + }, + { + "epoch": 5.865606740046202, + "grad_norm": 0.2979057729244232, + "learning_rate": 2.671049055578204e-06, + "loss": 4.0413, + "step": 86330 + }, + { + "epoch": 5.865946460116864, + "grad_norm": 0.2294687181711197, + "learning_rate": 2.6706244054898766e-06, + "loss": 3.7947, + "step": 86335 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 0.22568583488464355, + "learning_rate": 2.670199755401549e-06, + "loss": 4.1278, + "step": 86340 + }, + { + "epoch": 5.866625900258187, + "grad_norm": 0.29666271805763245, + "learning_rate": 2.6697751053132222e-06, + "loss": 3.6482, + "step": 86345 + }, + { + "epoch": 5.866965620328849, + "grad_norm": 0.26665130257606506, + "learning_rate": 2.669350455224895e-06, + "loss": 3.752, + "step": 86350 + }, + { + "epoch": 5.867305340399511, + "grad_norm": 0.3672645688056946, + "learning_rate": 2.668925805136568e-06, + "loss": 3.7832, + "step": 86355 + }, + { + "epoch": 5.867645060470172, + "grad_norm": 0.37497302889823914, + "learning_rate": 2.6685011550482402e-06, + "loss": 3.8785, + "step": 86360 + }, + { + "epoch": 5.8679847805408345, + "grad_norm": 0.24958819150924683, + "learning_rate": 2.6680765049599135e-06, + "loss": 3.9208, + "step": 86365 + }, + { + "epoch": 5.868324500611497, + "grad_norm": 0.20668160915374756, + "learning_rate": 2.6676518548715863e-06, + "loss": 3.8288, + "step": 86370 + }, + { + "epoch": 5.868664220682158, + "grad_norm": 0.24489259719848633, + "learning_rate": 2.6672272047832586e-06, + "loss": 4.2057, + "step": 86375 + }, + { + "epoch": 5.86900394075282, + "grad_norm": 0.3884313106536865, + "learning_rate": 2.666802554694932e-06, + "loss": 3.8777, + "step": 86380 + }, + { + "epoch": 5.869343660823482, + "grad_norm": 0.26126331090927124, + "learning_rate": 2.6663779046066047e-06, + "loss": 3.8033, + "step": 86385 + }, + { + "epoch": 5.869683380894143, + "grad_norm": 0.25522708892822266, + "learning_rate": 2.665953254518277e-06, + "loss": 3.927, + "step": 86390 + }, + { + "epoch": 5.870023100964805, + "grad_norm": 0.2672354578971863, + "learning_rate": 2.66552860442995e-06, + "loss": 3.9272, + "step": 86395 + }, + { + "epoch": 5.870362821035467, + "grad_norm": 0.23086456954479218, + "learning_rate": 2.665103954341623e-06, + "loss": 4.0061, + "step": 86400 + }, + { + "epoch": 5.870702541106128, + "grad_norm": 0.2247961312532425, + "learning_rate": 2.6646793042532954e-06, + "loss": 3.6654, + "step": 86405 + }, + { + "epoch": 5.8710422611767905, + "grad_norm": 0.27301880717277527, + "learning_rate": 2.6642546541649682e-06, + "loss": 3.9118, + "step": 86410 + }, + { + "epoch": 5.871381981247453, + "grad_norm": 0.21550457179546356, + "learning_rate": 2.6638300040766415e-06, + "loss": 4.0745, + "step": 86415 + }, + { + "epoch": 5.871721701318114, + "grad_norm": 0.2174793928861618, + "learning_rate": 2.663405353988314e-06, + "loss": 3.793, + "step": 86420 + }, + { + "epoch": 5.872061421388776, + "grad_norm": 0.29322391748428345, + "learning_rate": 2.6629807038999866e-06, + "loss": 3.8574, + "step": 86425 + }, + { + "epoch": 5.872401141459437, + "grad_norm": 0.35525089502334595, + "learning_rate": 2.6625560538116594e-06, + "loss": 3.8986, + "step": 86430 + }, + { + "epoch": 5.872740861530099, + "grad_norm": 0.2996242046356201, + "learning_rate": 2.662131403723332e-06, + "loss": 4.1361, + "step": 86435 + }, + { + "epoch": 5.873080581600761, + "grad_norm": 0.24275784194469452, + "learning_rate": 2.661706753635005e-06, + "loss": 3.7628, + "step": 86440 + }, + { + "epoch": 5.873420301671422, + "grad_norm": 0.28988945484161377, + "learning_rate": 2.661282103546678e-06, + "loss": 3.9171, + "step": 86445 + }, + { + "epoch": 5.8737600217420844, + "grad_norm": 0.24717268347740173, + "learning_rate": 2.6608574534583502e-06, + "loss": 4.2417, + "step": 86450 + }, + { + "epoch": 5.8740997418127465, + "grad_norm": 0.28827494382858276, + "learning_rate": 2.6604328033700234e-06, + "loss": 3.8159, + "step": 86455 + }, + { + "epoch": 5.874439461883408, + "grad_norm": 0.26903635263442993, + "learning_rate": 2.6600081532816962e-06, + "loss": 3.9808, + "step": 86460 + }, + { + "epoch": 5.87477918195407, + "grad_norm": 0.41970470547676086, + "learning_rate": 2.6595835031933686e-06, + "loss": 3.8688, + "step": 86465 + }, + { + "epoch": 5.875118902024732, + "grad_norm": 0.2831406891345978, + "learning_rate": 2.6591588531050414e-06, + "loss": 3.9553, + "step": 86470 + }, + { + "epoch": 5.875458622095393, + "grad_norm": 0.2552293837070465, + "learning_rate": 2.6587342030167146e-06, + "loss": 4.0146, + "step": 86475 + }, + { + "epoch": 5.875798342166055, + "grad_norm": 0.19450309872627258, + "learning_rate": 2.658309552928387e-06, + "loss": 3.9492, + "step": 86480 + }, + { + "epoch": 5.876138062236717, + "grad_norm": 0.2144412398338318, + "learning_rate": 2.65788490284006e-06, + "loss": 3.7218, + "step": 86485 + }, + { + "epoch": 5.876477782307378, + "grad_norm": 0.3006945252418518, + "learning_rate": 2.657460252751733e-06, + "loss": 3.8855, + "step": 86490 + }, + { + "epoch": 5.8768175023780405, + "grad_norm": 0.22035156190395355, + "learning_rate": 2.6570356026634054e-06, + "loss": 3.7804, + "step": 86495 + }, + { + "epoch": 5.8771572224487025, + "grad_norm": 0.34726905822753906, + "learning_rate": 2.6566109525750782e-06, + "loss": 4.1514, + "step": 86500 + }, + { + "epoch": 5.877496942519364, + "grad_norm": 0.2648385465145111, + "learning_rate": 2.6561863024867514e-06, + "loss": 3.8074, + "step": 86505 + }, + { + "epoch": 5.877836662590026, + "grad_norm": 0.3677774667739868, + "learning_rate": 2.655761652398424e-06, + "loss": 3.7805, + "step": 86510 + }, + { + "epoch": 5.878176382660688, + "grad_norm": 0.21074171364307404, + "learning_rate": 2.6553370023100966e-06, + "loss": 3.9876, + "step": 86515 + }, + { + "epoch": 5.878516102731349, + "grad_norm": 0.33976152539253235, + "learning_rate": 2.6549123522217694e-06, + "loss": 4.209, + "step": 86520 + }, + { + "epoch": 5.878855822802011, + "grad_norm": 0.26923394203186035, + "learning_rate": 2.6544877021334427e-06, + "loss": 3.89, + "step": 86525 + }, + { + "epoch": 5.879195542872673, + "grad_norm": 0.2876433730125427, + "learning_rate": 2.654063052045115e-06, + "loss": 3.6058, + "step": 86530 + }, + { + "epoch": 5.879535262943334, + "grad_norm": 0.2693635821342468, + "learning_rate": 2.653638401956788e-06, + "loss": 3.9108, + "step": 86535 + }, + { + "epoch": 5.8798749830139965, + "grad_norm": 0.2624233365058899, + "learning_rate": 2.653213751868461e-06, + "loss": 4.1323, + "step": 86540 + }, + { + "epoch": 5.8802147030846585, + "grad_norm": 0.24172291159629822, + "learning_rate": 2.6527891017801334e-06, + "loss": 3.7573, + "step": 86545 + }, + { + "epoch": 5.88055442315532, + "grad_norm": 0.2875109612941742, + "learning_rate": 2.6523644516918062e-06, + "loss": 3.8985, + "step": 86550 + }, + { + "epoch": 5.880894143225982, + "grad_norm": 0.31415095925331116, + "learning_rate": 2.651939801603479e-06, + "loss": 4.1216, + "step": 86555 + }, + { + "epoch": 5.881233863296644, + "grad_norm": 0.2886560261249542, + "learning_rate": 2.6515151515151514e-06, + "loss": 3.7816, + "step": 86560 + }, + { + "epoch": 5.881573583367305, + "grad_norm": 0.2847413122653961, + "learning_rate": 2.6510905014268246e-06, + "loss": 4.0412, + "step": 86565 + }, + { + "epoch": 5.881913303437967, + "grad_norm": 0.2178695946931839, + "learning_rate": 2.6506658513384974e-06, + "loss": 3.9859, + "step": 86570 + }, + { + "epoch": 5.882253023508629, + "grad_norm": 0.29619234800338745, + "learning_rate": 2.65024120125017e-06, + "loss": 3.8698, + "step": 86575 + }, + { + "epoch": 5.88259274357929, + "grad_norm": 0.24728938937187195, + "learning_rate": 2.649816551161843e-06, + "loss": 3.8927, + "step": 86580 + }, + { + "epoch": 5.8829324636499525, + "grad_norm": 0.2798035740852356, + "learning_rate": 2.649391901073516e-06, + "loss": 4.145, + "step": 86585 + }, + { + "epoch": 5.883272183720615, + "grad_norm": 0.31184545159339905, + "learning_rate": 2.6489672509851882e-06, + "loss": 4.0316, + "step": 86590 + }, + { + "epoch": 5.883611903791276, + "grad_norm": 0.28177210688591003, + "learning_rate": 2.648542600896861e-06, + "loss": 3.8331, + "step": 86595 + }, + { + "epoch": 5.883951623861938, + "grad_norm": 0.2558991014957428, + "learning_rate": 2.6481179508085342e-06, + "loss": 3.9241, + "step": 86600 + }, + { + "epoch": 5.8842913439326, + "grad_norm": 0.3503340482711792, + "learning_rate": 2.6476933007202066e-06, + "loss": 3.7834, + "step": 86605 + }, + { + "epoch": 5.884631064003261, + "grad_norm": 0.414293497800827, + "learning_rate": 2.6472686506318794e-06, + "loss": 3.7262, + "step": 86610 + }, + { + "epoch": 5.884970784073923, + "grad_norm": 0.21867099404335022, + "learning_rate": 2.6468440005435526e-06, + "loss": 3.9995, + "step": 86615 + }, + { + "epoch": 5.885310504144585, + "grad_norm": 0.2174023985862732, + "learning_rate": 2.646419350455225e-06, + "loss": 3.9304, + "step": 86620 + }, + { + "epoch": 5.885650224215246, + "grad_norm": 0.3298936188220978, + "learning_rate": 2.645994700366898e-06, + "loss": 3.9396, + "step": 86625 + }, + { + "epoch": 5.8859899442859085, + "grad_norm": 0.2732776701450348, + "learning_rate": 2.645570050278571e-06, + "loss": 3.8961, + "step": 86630 + }, + { + "epoch": 5.886329664356571, + "grad_norm": 0.24203839898109436, + "learning_rate": 2.6451454001902434e-06, + "loss": 3.7888, + "step": 86635 + }, + { + "epoch": 5.886669384427232, + "grad_norm": 0.29226329922676086, + "learning_rate": 2.6447207501019162e-06, + "loss": 3.8543, + "step": 86640 + }, + { + "epoch": 5.887009104497894, + "grad_norm": 0.3214116394519806, + "learning_rate": 2.644296100013589e-06, + "loss": 3.8611, + "step": 86645 + }, + { + "epoch": 5.887348824568555, + "grad_norm": 0.2521412670612335, + "learning_rate": 2.6438714499252614e-06, + "loss": 4.035, + "step": 86650 + }, + { + "epoch": 5.887688544639217, + "grad_norm": 0.2472091168165207, + "learning_rate": 2.6434467998369346e-06, + "loss": 4.0666, + "step": 86655 + }, + { + "epoch": 5.888028264709879, + "grad_norm": 0.33590176701545715, + "learning_rate": 2.6430221497486074e-06, + "loss": 3.859, + "step": 86660 + }, + { + "epoch": 5.88836798478054, + "grad_norm": 0.2719195485115051, + "learning_rate": 2.64259749966028e-06, + "loss": 3.9977, + "step": 86665 + }, + { + "epoch": 5.888707704851202, + "grad_norm": 0.3422805666923523, + "learning_rate": 2.642172849571953e-06, + "loss": 3.8523, + "step": 86670 + }, + { + "epoch": 5.8890474249218645, + "grad_norm": 0.28401580452919006, + "learning_rate": 2.641748199483626e-06, + "loss": 3.9142, + "step": 86675 + }, + { + "epoch": 5.889387144992526, + "grad_norm": 0.2606997489929199, + "learning_rate": 2.641323549395298e-06, + "loss": 3.7776, + "step": 86680 + }, + { + "epoch": 5.889726865063188, + "grad_norm": 0.2306491881608963, + "learning_rate": 2.640898899306971e-06, + "loss": 4.0725, + "step": 86685 + }, + { + "epoch": 5.89006658513385, + "grad_norm": 0.19527104496955872, + "learning_rate": 2.6404742492186442e-06, + "loss": 3.7292, + "step": 86690 + }, + { + "epoch": 5.890406305204511, + "grad_norm": 0.2648009955883026, + "learning_rate": 2.640049599130317e-06, + "loss": 3.8087, + "step": 86695 + }, + { + "epoch": 5.890746025275173, + "grad_norm": 0.23998625576496124, + "learning_rate": 2.6396249490419894e-06, + "loss": 3.838, + "step": 86700 + }, + { + "epoch": 5.891085745345835, + "grad_norm": 0.27060508728027344, + "learning_rate": 2.6392002989536626e-06, + "loss": 3.987, + "step": 86705 + }, + { + "epoch": 5.891425465416496, + "grad_norm": 0.34470248222351074, + "learning_rate": 2.6387756488653354e-06, + "loss": 4.014, + "step": 86710 + }, + { + "epoch": 5.891765185487158, + "grad_norm": 0.2593228220939636, + "learning_rate": 2.638350998777008e-06, + "loss": 3.7572, + "step": 86715 + }, + { + "epoch": 5.8921049055578205, + "grad_norm": 0.24402941763401031, + "learning_rate": 2.6379263486886806e-06, + "loss": 3.7856, + "step": 86720 + }, + { + "epoch": 5.892444625628482, + "grad_norm": 0.26686227321624756, + "learning_rate": 2.637501698600354e-06, + "loss": 4.1729, + "step": 86725 + }, + { + "epoch": 5.892784345699144, + "grad_norm": 0.26815494894981384, + "learning_rate": 2.6370770485120262e-06, + "loss": 3.9665, + "step": 86730 + }, + { + "epoch": 5.893124065769806, + "grad_norm": 0.2451542466878891, + "learning_rate": 2.636652398423699e-06, + "loss": 3.7692, + "step": 86735 + }, + { + "epoch": 5.893463785840467, + "grad_norm": 0.2239556610584259, + "learning_rate": 2.6362277483353722e-06, + "loss": 4.0829, + "step": 86740 + }, + { + "epoch": 5.893803505911129, + "grad_norm": 0.36326083540916443, + "learning_rate": 2.6358030982470446e-06, + "loss": 3.849, + "step": 86745 + }, + { + "epoch": 5.894143225981791, + "grad_norm": 0.28684818744659424, + "learning_rate": 2.6353784481587174e-06, + "loss": 3.9528, + "step": 86750 + }, + { + "epoch": 5.894482946052452, + "grad_norm": 0.24307116866111755, + "learning_rate": 2.6349537980703902e-06, + "loss": 4.0229, + "step": 86755 + }, + { + "epoch": 5.8948226661231145, + "grad_norm": 0.2572210431098938, + "learning_rate": 2.634529147982063e-06, + "loss": 3.9561, + "step": 86760 + }, + { + "epoch": 5.8951623861937765, + "grad_norm": 0.27825498580932617, + "learning_rate": 2.634104497893736e-06, + "loss": 3.6707, + "step": 86765 + }, + { + "epoch": 5.895502106264438, + "grad_norm": 0.25773999094963074, + "learning_rate": 2.6336798478054086e-06, + "loss": 3.8729, + "step": 86770 + }, + { + "epoch": 5.8958418263351, + "grad_norm": 0.28167861700057983, + "learning_rate": 2.633255197717081e-06, + "loss": 3.8899, + "step": 86775 + }, + { + "epoch": 5.896181546405762, + "grad_norm": 0.27322331070899963, + "learning_rate": 2.6328305476287542e-06, + "loss": 3.9721, + "step": 86780 + }, + { + "epoch": 5.896521266476423, + "grad_norm": 0.3234032690525055, + "learning_rate": 2.632405897540427e-06, + "loss": 4.0057, + "step": 86785 + }, + { + "epoch": 5.896860986547085, + "grad_norm": 0.3415590226650238, + "learning_rate": 2.6319812474520994e-06, + "loss": 4.0357, + "step": 86790 + }, + { + "epoch": 5.897200706617747, + "grad_norm": 0.3029141426086426, + "learning_rate": 2.6315565973637726e-06, + "loss": 3.8278, + "step": 86795 + }, + { + "epoch": 5.897540426688408, + "grad_norm": 0.268020898103714, + "learning_rate": 2.6311319472754454e-06, + "loss": 3.9737, + "step": 86800 + }, + { + "epoch": 5.8978801467590705, + "grad_norm": 0.6653799414634705, + "learning_rate": 2.630707297187118e-06, + "loss": 4.065, + "step": 86805 + }, + { + "epoch": 5.8982198668297325, + "grad_norm": 0.22538816928863525, + "learning_rate": 2.6302826470987906e-06, + "loss": 3.9302, + "step": 86810 + }, + { + "epoch": 5.898559586900394, + "grad_norm": 0.23309360444545746, + "learning_rate": 2.629857997010464e-06, + "loss": 3.8707, + "step": 86815 + }, + { + "epoch": 5.898899306971056, + "grad_norm": 0.28722694516181946, + "learning_rate": 2.629433346922136e-06, + "loss": 3.9641, + "step": 86820 + }, + { + "epoch": 5.899239027041718, + "grad_norm": 0.19690681993961334, + "learning_rate": 2.629008696833809e-06, + "loss": 4.0136, + "step": 86825 + }, + { + "epoch": 5.899578747112379, + "grad_norm": 0.2499707192182541, + "learning_rate": 2.6285840467454822e-06, + "loss": 3.8683, + "step": 86830 + }, + { + "epoch": 5.899918467183041, + "grad_norm": 0.24767276644706726, + "learning_rate": 2.6281593966571546e-06, + "loss": 3.7752, + "step": 86835 + }, + { + "epoch": 5.900258187253703, + "grad_norm": 0.27982139587402344, + "learning_rate": 2.6277347465688274e-06, + "loss": 3.8737, + "step": 86840 + }, + { + "epoch": 5.900597907324364, + "grad_norm": 0.2711426615715027, + "learning_rate": 2.6273100964805e-06, + "loss": 3.9727, + "step": 86845 + }, + { + "epoch": 5.9009376273950265, + "grad_norm": 0.3331466317176819, + "learning_rate": 2.626885446392173e-06, + "loss": 3.8348, + "step": 86850 + }, + { + "epoch": 5.9012773474656885, + "grad_norm": 0.2692634165287018, + "learning_rate": 2.626460796303846e-06, + "loss": 3.6837, + "step": 86855 + }, + { + "epoch": 5.90161706753635, + "grad_norm": 0.22147932648658752, + "learning_rate": 2.6260361462155186e-06, + "loss": 4.0399, + "step": 86860 + }, + { + "epoch": 5.901956787607012, + "grad_norm": 0.24395959079265594, + "learning_rate": 2.625611496127192e-06, + "loss": 3.8934, + "step": 86865 + }, + { + "epoch": 5.902296507677674, + "grad_norm": 0.30872318148612976, + "learning_rate": 2.6251868460388642e-06, + "loss": 4.0648, + "step": 86870 + }, + { + "epoch": 5.902636227748335, + "grad_norm": 0.4196813106536865, + "learning_rate": 2.624762195950537e-06, + "loss": 4.033, + "step": 86875 + }, + { + "epoch": 5.902975947818997, + "grad_norm": 0.22660544514656067, + "learning_rate": 2.62433754586221e-06, + "loss": 4.0104, + "step": 86880 + }, + { + "epoch": 5.903315667889659, + "grad_norm": 0.28905239701271057, + "learning_rate": 2.6239128957738826e-06, + "loss": 3.6703, + "step": 86885 + }, + { + "epoch": 5.90365538796032, + "grad_norm": 0.24177215993404388, + "learning_rate": 2.6234882456855554e-06, + "loss": 3.9503, + "step": 86890 + }, + { + "epoch": 5.9039951080309825, + "grad_norm": 0.2448253184556961, + "learning_rate": 2.6230635955972282e-06, + "loss": 3.8825, + "step": 86895 + }, + { + "epoch": 5.904334828101645, + "grad_norm": 0.26616954803466797, + "learning_rate": 2.6226389455089006e-06, + "loss": 3.9209, + "step": 86900 + }, + { + "epoch": 5.904674548172306, + "grad_norm": 0.25169768929481506, + "learning_rate": 2.622214295420574e-06, + "loss": 3.8812, + "step": 86905 + }, + { + "epoch": 5.905014268242968, + "grad_norm": 0.2340082973241806, + "learning_rate": 2.6217896453322466e-06, + "loss": 3.7753, + "step": 86910 + }, + { + "epoch": 5.90535398831363, + "grad_norm": 0.2737807035446167, + "learning_rate": 2.621364995243919e-06, + "loss": 3.8735, + "step": 86915 + }, + { + "epoch": 5.905693708384291, + "grad_norm": 0.25546789169311523, + "learning_rate": 2.6209403451555922e-06, + "loss": 3.8776, + "step": 86920 + }, + { + "epoch": 5.906033428454953, + "grad_norm": 0.23946703970432281, + "learning_rate": 2.620515695067265e-06, + "loss": 4.0276, + "step": 86925 + }, + { + "epoch": 5.906373148525615, + "grad_norm": 0.26211094856262207, + "learning_rate": 2.6200910449789374e-06, + "loss": 3.8143, + "step": 86930 + }, + { + "epoch": 5.906712868596276, + "grad_norm": 0.19805581867694855, + "learning_rate": 2.61966639489061e-06, + "loss": 3.5507, + "step": 86935 + }, + { + "epoch": 5.9070525886669385, + "grad_norm": 0.29420098662376404, + "learning_rate": 2.6192417448022834e-06, + "loss": 3.7479, + "step": 86940 + }, + { + "epoch": 5.907392308737601, + "grad_norm": 0.24166052043437958, + "learning_rate": 2.618817094713956e-06, + "loss": 3.8546, + "step": 86945 + }, + { + "epoch": 5.907732028808262, + "grad_norm": 0.3010900318622589, + "learning_rate": 2.6183924446256286e-06, + "loss": 4.1091, + "step": 86950 + }, + { + "epoch": 5.908071748878924, + "grad_norm": 0.20615480840206146, + "learning_rate": 2.617967794537302e-06, + "loss": 3.8058, + "step": 86955 + }, + { + "epoch": 5.908411468949586, + "grad_norm": 0.27488139271736145, + "learning_rate": 2.617543144448974e-06, + "loss": 3.944, + "step": 86960 + }, + { + "epoch": 5.908751189020247, + "grad_norm": 0.25960657000541687, + "learning_rate": 2.617118494360647e-06, + "loss": 3.9465, + "step": 86965 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.2665295898914337, + "learning_rate": 2.61669384427232e-06, + "loss": 3.8524, + "step": 86970 + }, + { + "epoch": 5.909430629161571, + "grad_norm": 0.2375592291355133, + "learning_rate": 2.616269194183992e-06, + "loss": 4.2442, + "step": 86975 + }, + { + "epoch": 5.909770349232232, + "grad_norm": 0.2526039779186249, + "learning_rate": 2.6158445440956654e-06, + "loss": 3.8305, + "step": 86980 + }, + { + "epoch": 5.9101100693028945, + "grad_norm": 0.2772965729236603, + "learning_rate": 2.615419894007338e-06, + "loss": 3.932, + "step": 86985 + }, + { + "epoch": 5.910449789373557, + "grad_norm": 0.21517214179039001, + "learning_rate": 2.6149952439190106e-06, + "loss": 3.846, + "step": 86990 + }, + { + "epoch": 5.910789509444218, + "grad_norm": 0.21587175130844116, + "learning_rate": 2.614570593830684e-06, + "loss": 3.9022, + "step": 86995 + }, + { + "epoch": 5.91112922951488, + "grad_norm": 0.3542150855064392, + "learning_rate": 2.6141459437423566e-06, + "loss": 4.0443, + "step": 87000 + }, + { + "epoch": 5.911468949585542, + "grad_norm": 0.2348695695400238, + "learning_rate": 2.613721293654029e-06, + "loss": 3.9384, + "step": 87005 + }, + { + "epoch": 5.911808669656203, + "grad_norm": 0.29697781801223755, + "learning_rate": 2.613296643565702e-06, + "loss": 3.484, + "step": 87010 + }, + { + "epoch": 5.912148389726865, + "grad_norm": 0.2503761351108551, + "learning_rate": 2.612871993477375e-06, + "loss": 3.839, + "step": 87015 + }, + { + "epoch": 5.912488109797527, + "grad_norm": 0.24823561310768127, + "learning_rate": 2.6124473433890474e-06, + "loss": 3.8399, + "step": 87020 + }, + { + "epoch": 5.9128278298681884, + "grad_norm": 0.2809832692146301, + "learning_rate": 2.61202269330072e-06, + "loss": 3.9982, + "step": 87025 + }, + { + "epoch": 5.9131675499388505, + "grad_norm": 0.25656476616859436, + "learning_rate": 2.6115980432123934e-06, + "loss": 4.186, + "step": 87030 + }, + { + "epoch": 5.913507270009513, + "grad_norm": 0.27938711643218994, + "learning_rate": 2.6111733931240662e-06, + "loss": 4.0119, + "step": 87035 + }, + { + "epoch": 5.913846990080174, + "grad_norm": 0.26148366928100586, + "learning_rate": 2.6107487430357386e-06, + "loss": 4.0653, + "step": 87040 + }, + { + "epoch": 5.914186710150836, + "grad_norm": 0.2173541635274887, + "learning_rate": 2.610324092947412e-06, + "loss": 3.7491, + "step": 87045 + }, + { + "epoch": 5.914526430221498, + "grad_norm": 0.3000587821006775, + "learning_rate": 2.6098994428590846e-06, + "loss": 3.9024, + "step": 87050 + }, + { + "epoch": 5.914866150292159, + "grad_norm": 0.3986368477344513, + "learning_rate": 2.609474792770757e-06, + "loss": 4.1102, + "step": 87055 + }, + { + "epoch": 5.915205870362821, + "grad_norm": 0.33577632904052734, + "learning_rate": 2.60905014268243e-06, + "loss": 3.802, + "step": 87060 + }, + { + "epoch": 5.915545590433483, + "grad_norm": 0.29104772210121155, + "learning_rate": 2.608625492594103e-06, + "loss": 3.8749, + "step": 87065 + }, + { + "epoch": 5.9158853105041445, + "grad_norm": 0.29224634170532227, + "learning_rate": 2.6082008425057754e-06, + "loss": 3.9324, + "step": 87070 + }, + { + "epoch": 5.9162250305748065, + "grad_norm": 0.21152803301811218, + "learning_rate": 2.607776192417448e-06, + "loss": 3.8808, + "step": 87075 + }, + { + "epoch": 5.916564750645469, + "grad_norm": 0.3265323042869568, + "learning_rate": 2.6073515423291214e-06, + "loss": 3.9331, + "step": 87080 + }, + { + "epoch": 5.91690447071613, + "grad_norm": 0.27759429812431335, + "learning_rate": 2.606926892240794e-06, + "loss": 3.8015, + "step": 87085 + }, + { + "epoch": 5.917244190786792, + "grad_norm": 0.20427224040031433, + "learning_rate": 2.6065022421524666e-06, + "loss": 4.0526, + "step": 87090 + }, + { + "epoch": 5.917583910857454, + "grad_norm": 0.25047001242637634, + "learning_rate": 2.6060775920641394e-06, + "loss": 3.7418, + "step": 87095 + }, + { + "epoch": 5.917923630928115, + "grad_norm": 0.21915271878242493, + "learning_rate": 2.6056529419758118e-06, + "loss": 3.774, + "step": 87100 + }, + { + "epoch": 5.918263350998777, + "grad_norm": 0.2332753986120224, + "learning_rate": 2.605228291887485e-06, + "loss": 4.0395, + "step": 87105 + }, + { + "epoch": 5.918603071069439, + "grad_norm": 0.23787091672420502, + "learning_rate": 2.604803641799158e-06, + "loss": 3.8864, + "step": 87110 + }, + { + "epoch": 5.9189427911401005, + "grad_norm": 0.214174285531044, + "learning_rate": 2.60437899171083e-06, + "loss": 3.9448, + "step": 87115 + }, + { + "epoch": 5.9192825112107625, + "grad_norm": 0.2497115582227707, + "learning_rate": 2.6039543416225034e-06, + "loss": 3.7077, + "step": 87120 + }, + { + "epoch": 5.919622231281424, + "grad_norm": 0.2987624704837799, + "learning_rate": 2.603529691534176e-06, + "loss": 4.0522, + "step": 87125 + }, + { + "epoch": 5.919961951352086, + "grad_norm": 0.24795612692832947, + "learning_rate": 2.6031050414458486e-06, + "loss": 3.6779, + "step": 87130 + }, + { + "epoch": 5.920301671422748, + "grad_norm": 0.2546152174472809, + "learning_rate": 2.602680391357522e-06, + "loss": 3.9579, + "step": 87135 + }, + { + "epoch": 5.920641391493409, + "grad_norm": 0.21578721702098846, + "learning_rate": 2.6022557412691946e-06, + "loss": 3.8481, + "step": 87140 + }, + { + "epoch": 5.920981111564071, + "grad_norm": 0.3551231324672699, + "learning_rate": 2.601831091180867e-06, + "loss": 4.1212, + "step": 87145 + }, + { + "epoch": 5.921320831634733, + "grad_norm": 0.2299259454011917, + "learning_rate": 2.6014064410925398e-06, + "loss": 3.9454, + "step": 87150 + }, + { + "epoch": 5.921660551705394, + "grad_norm": 0.35109931230545044, + "learning_rate": 2.600981791004213e-06, + "loss": 3.964, + "step": 87155 + }, + { + "epoch": 5.9220002717760565, + "grad_norm": 0.25657162070274353, + "learning_rate": 2.6005571409158854e-06, + "loss": 3.8213, + "step": 87160 + }, + { + "epoch": 5.9223399918467186, + "grad_norm": 0.2592407464981079, + "learning_rate": 2.600132490827558e-06, + "loss": 4.1268, + "step": 87165 + }, + { + "epoch": 5.92267971191738, + "grad_norm": 0.2835884392261505, + "learning_rate": 2.5997078407392314e-06, + "loss": 3.8898, + "step": 87170 + }, + { + "epoch": 5.923019431988042, + "grad_norm": 0.3949066996574402, + "learning_rate": 2.599283190650904e-06, + "loss": 3.7845, + "step": 87175 + }, + { + "epoch": 5.923359152058704, + "grad_norm": 0.34436774253845215, + "learning_rate": 2.5988585405625766e-06, + "loss": 3.9027, + "step": 87180 + }, + { + "epoch": 5.923698872129365, + "grad_norm": 0.2218790501356125, + "learning_rate": 2.5984338904742494e-06, + "loss": 3.8261, + "step": 87185 + }, + { + "epoch": 5.924038592200027, + "grad_norm": 0.29574817419052124, + "learning_rate": 2.5980092403859218e-06, + "loss": 3.907, + "step": 87190 + }, + { + "epoch": 5.924378312270689, + "grad_norm": 0.26544392108917236, + "learning_rate": 2.597584590297595e-06, + "loss": 4.1439, + "step": 87195 + }, + { + "epoch": 5.92471803234135, + "grad_norm": 0.20912839472293854, + "learning_rate": 2.597159940209268e-06, + "loss": 4.0988, + "step": 87200 + }, + { + "epoch": 5.9250577524120125, + "grad_norm": 0.2674880027770996, + "learning_rate": 2.596735290120941e-06, + "loss": 3.8461, + "step": 87205 + }, + { + "epoch": 5.925397472482675, + "grad_norm": 0.27429652214050293, + "learning_rate": 2.5963106400326134e-06, + "loss": 3.8589, + "step": 87210 + }, + { + "epoch": 5.925737192553336, + "grad_norm": 0.334215372800827, + "learning_rate": 2.595885989944286e-06, + "loss": 4.0001, + "step": 87215 + }, + { + "epoch": 5.926076912623998, + "grad_norm": 0.26873424649238586, + "learning_rate": 2.595461339855959e-06, + "loss": 3.901, + "step": 87220 + }, + { + "epoch": 5.92641663269466, + "grad_norm": 0.3285906910896301, + "learning_rate": 2.5950366897676314e-06, + "loss": 3.9495, + "step": 87225 + }, + { + "epoch": 5.926756352765321, + "grad_norm": 0.28505992889404297, + "learning_rate": 2.5946120396793046e-06, + "loss": 3.8235, + "step": 87230 + }, + { + "epoch": 5.927096072835983, + "grad_norm": 0.20084762573242188, + "learning_rate": 2.5941873895909774e-06, + "loss": 3.9258, + "step": 87235 + }, + { + "epoch": 5.927435792906645, + "grad_norm": 0.20335641503334045, + "learning_rate": 2.5937627395026498e-06, + "loss": 3.7152, + "step": 87240 + }, + { + "epoch": 5.927775512977306, + "grad_norm": 0.34662720561027527, + "learning_rate": 2.593338089414323e-06, + "loss": 3.7828, + "step": 87245 + }, + { + "epoch": 5.9281152330479685, + "grad_norm": 0.31191059947013855, + "learning_rate": 2.592913439325996e-06, + "loss": 4.0888, + "step": 87250 + }, + { + "epoch": 5.928454953118631, + "grad_norm": 0.3482598662376404, + "learning_rate": 2.592488789237668e-06, + "loss": 3.6042, + "step": 87255 + }, + { + "epoch": 5.928794673189292, + "grad_norm": 0.4490131735801697, + "learning_rate": 2.592064139149341e-06, + "loss": 3.9308, + "step": 87260 + }, + { + "epoch": 5.929134393259954, + "grad_norm": 0.31588342785835266, + "learning_rate": 2.591639489061014e-06, + "loss": 3.7361, + "step": 87265 + }, + { + "epoch": 5.929474113330616, + "grad_norm": 0.2418823540210724, + "learning_rate": 2.5912148389726866e-06, + "loss": 4.0951, + "step": 87270 + }, + { + "epoch": 5.929813833401277, + "grad_norm": 0.2595503032207489, + "learning_rate": 2.5907901888843594e-06, + "loss": 3.7892, + "step": 87275 + }, + { + "epoch": 5.930153553471939, + "grad_norm": 0.33005836606025696, + "learning_rate": 2.5903655387960326e-06, + "loss": 3.7479, + "step": 87280 + }, + { + "epoch": 5.930493273542601, + "grad_norm": 0.22316081821918488, + "learning_rate": 2.589940888707705e-06, + "loss": 3.8795, + "step": 87285 + }, + { + "epoch": 5.930832993613262, + "grad_norm": 0.25558343529701233, + "learning_rate": 2.5895162386193778e-06, + "loss": 3.9722, + "step": 87290 + }, + { + "epoch": 5.9311727136839245, + "grad_norm": 0.21518567204475403, + "learning_rate": 2.5891765185487163e-06, + "loss": 3.9459, + "step": 87295 + }, + { + "epoch": 5.931512433754587, + "grad_norm": 0.5415807366371155, + "learning_rate": 2.588751868460389e-06, + "loss": 3.9883, + "step": 87300 + }, + { + "epoch": 5.931852153825248, + "grad_norm": 0.2610497772693634, + "learning_rate": 2.5883272183720615e-06, + "loss": 3.7877, + "step": 87305 + }, + { + "epoch": 5.93219187389591, + "grad_norm": 0.2769971489906311, + "learning_rate": 2.5879025682837343e-06, + "loss": 3.9177, + "step": 87310 + }, + { + "epoch": 5.932531593966572, + "grad_norm": 0.2594846487045288, + "learning_rate": 2.5874779181954075e-06, + "loss": 3.9544, + "step": 87315 + }, + { + "epoch": 5.932871314037233, + "grad_norm": 0.1932183802127838, + "learning_rate": 2.58705326810708e-06, + "loss": 3.7517, + "step": 87320 + }, + { + "epoch": 5.933211034107895, + "grad_norm": 0.6498115062713623, + "learning_rate": 2.5866286180187527e-06, + "loss": 3.9094, + "step": 87325 + }, + { + "epoch": 5.933550754178556, + "grad_norm": 0.2486027628183365, + "learning_rate": 2.586203967930426e-06, + "loss": 3.9122, + "step": 87330 + }, + { + "epoch": 5.9338904742492184, + "grad_norm": 0.5118206739425659, + "learning_rate": 2.5857793178420983e-06, + "loss": 4.0126, + "step": 87335 + }, + { + "epoch": 5.9342301943198805, + "grad_norm": 0.24973665177822113, + "learning_rate": 2.585354667753771e-06, + "loss": 4.2088, + "step": 87340 + }, + { + "epoch": 5.934569914390542, + "grad_norm": 0.23232749104499817, + "learning_rate": 2.584930017665444e-06, + "loss": 3.7446, + "step": 87345 + }, + { + "epoch": 5.934909634461204, + "grad_norm": 0.2755412459373474, + "learning_rate": 2.5845053675771167e-06, + "loss": 3.8632, + "step": 87350 + }, + { + "epoch": 5.935249354531866, + "grad_norm": 0.21365360915660858, + "learning_rate": 2.5840807174887895e-06, + "loss": 3.95, + "step": 87355 + }, + { + "epoch": 5.935589074602527, + "grad_norm": 0.29812541604042053, + "learning_rate": 2.5836560674004623e-06, + "loss": 4.0126, + "step": 87360 + }, + { + "epoch": 5.935928794673189, + "grad_norm": 0.2421969175338745, + "learning_rate": 2.5832314173121346e-06, + "loss": 3.8051, + "step": 87365 + }, + { + "epoch": 5.936268514743851, + "grad_norm": 0.2366286963224411, + "learning_rate": 2.582806767223808e-06, + "loss": 3.8143, + "step": 87370 + }, + { + "epoch": 5.936608234814512, + "grad_norm": 0.23830848932266235, + "learning_rate": 2.5823821171354807e-06, + "loss": 3.788, + "step": 87375 + }, + { + "epoch": 5.9369479548851745, + "grad_norm": 0.362316370010376, + "learning_rate": 2.581957467047153e-06, + "loss": 3.8436, + "step": 87380 + }, + { + "epoch": 5.9372876749558365, + "grad_norm": 0.28358152508735657, + "learning_rate": 2.5815328169588263e-06, + "loss": 3.9563, + "step": 87385 + }, + { + "epoch": 5.937627395026498, + "grad_norm": 0.3196503520011902, + "learning_rate": 2.581108166870499e-06, + "loss": 3.7678, + "step": 87390 + }, + { + "epoch": 5.93796711509716, + "grad_norm": 0.26822763681411743, + "learning_rate": 2.5806835167821714e-06, + "loss": 3.7872, + "step": 87395 + }, + { + "epoch": 5.938306835167822, + "grad_norm": 0.22345203161239624, + "learning_rate": 2.5802588666938442e-06, + "loss": 3.5225, + "step": 87400 + }, + { + "epoch": 5.938646555238483, + "grad_norm": 0.32081881165504456, + "learning_rate": 2.5798342166055175e-06, + "loss": 3.8071, + "step": 87405 + }, + { + "epoch": 5.938986275309145, + "grad_norm": 0.2765030562877655, + "learning_rate": 2.5794095665171903e-06, + "loss": 4.0162, + "step": 87410 + }, + { + "epoch": 5.939325995379807, + "grad_norm": 0.26352328062057495, + "learning_rate": 2.5789849164288626e-06, + "loss": 3.7855, + "step": 87415 + }, + { + "epoch": 5.939665715450468, + "grad_norm": 0.26863518357276917, + "learning_rate": 2.578560266340536e-06, + "loss": 3.9301, + "step": 87420 + }, + { + "epoch": 5.9400054355211305, + "grad_norm": 0.2641497254371643, + "learning_rate": 2.5781356162522087e-06, + "loss": 3.6991, + "step": 87425 + }, + { + "epoch": 5.9403451555917925, + "grad_norm": 0.18781058490276337, + "learning_rate": 2.577710966163881e-06, + "loss": 3.5769, + "step": 87430 + }, + { + "epoch": 5.940684875662454, + "grad_norm": 0.21771904826164246, + "learning_rate": 2.577286316075554e-06, + "loss": 3.993, + "step": 87435 + }, + { + "epoch": 5.941024595733116, + "grad_norm": 0.4776995778083801, + "learning_rate": 2.576861665987227e-06, + "loss": 3.9061, + "step": 87440 + }, + { + "epoch": 5.941364315803778, + "grad_norm": 0.2492912858724594, + "learning_rate": 2.5764370158988995e-06, + "loss": 3.8034, + "step": 87445 + }, + { + "epoch": 5.941704035874439, + "grad_norm": 0.23994185030460358, + "learning_rate": 2.5760123658105723e-06, + "loss": 3.7964, + "step": 87450 + }, + { + "epoch": 5.942043755945101, + "grad_norm": 0.24473778903484344, + "learning_rate": 2.5755877157222455e-06, + "loss": 4.1109, + "step": 87455 + }, + { + "epoch": 5.942383476015763, + "grad_norm": 0.2054416984319687, + "learning_rate": 2.575163065633918e-06, + "loss": 3.778, + "step": 87460 + }, + { + "epoch": 5.942723196086424, + "grad_norm": 0.2836366891860962, + "learning_rate": 2.5747384155455907e-06, + "loss": 3.7976, + "step": 87465 + }, + { + "epoch": 5.9430629161570865, + "grad_norm": 0.23921063542366028, + "learning_rate": 2.5743137654572635e-06, + "loss": 3.7684, + "step": 87470 + }, + { + "epoch": 5.943402636227749, + "grad_norm": 0.3032054901123047, + "learning_rate": 2.573889115368936e-06, + "loss": 3.936, + "step": 87475 + }, + { + "epoch": 5.94374235629841, + "grad_norm": 0.2306286245584488, + "learning_rate": 2.573464465280609e-06, + "loss": 3.9526, + "step": 87480 + }, + { + "epoch": 5.944082076369072, + "grad_norm": 0.247982919216156, + "learning_rate": 2.573039815192282e-06, + "loss": 3.8751, + "step": 87485 + }, + { + "epoch": 5.944421796439734, + "grad_norm": 0.2359042465686798, + "learning_rate": 2.5726151651039542e-06, + "loss": 3.7128, + "step": 87490 + }, + { + "epoch": 5.944761516510395, + "grad_norm": 0.2671075165271759, + "learning_rate": 2.5721905150156275e-06, + "loss": 3.9813, + "step": 87495 + }, + { + "epoch": 5.945101236581057, + "grad_norm": 0.34497368335723877, + "learning_rate": 2.5717658649273003e-06, + "loss": 4.112, + "step": 87500 + }, + { + "epoch": 5.945440956651719, + "grad_norm": 0.22433502972126007, + "learning_rate": 2.5713412148389726e-06, + "loss": 3.8458, + "step": 87505 + }, + { + "epoch": 5.94578067672238, + "grad_norm": 0.24650296568870544, + "learning_rate": 2.570916564750646e-06, + "loss": 3.7392, + "step": 87510 + }, + { + "epoch": 5.9461203967930425, + "grad_norm": 0.21916231513023376, + "learning_rate": 2.5704919146623187e-06, + "loss": 3.92, + "step": 87515 + }, + { + "epoch": 5.946460116863705, + "grad_norm": 0.36708274483680725, + "learning_rate": 2.570067264573991e-06, + "loss": 4.0109, + "step": 87520 + }, + { + "epoch": 5.946799836934366, + "grad_norm": 0.227133646607399, + "learning_rate": 2.569642614485664e-06, + "loss": 4.0166, + "step": 87525 + }, + { + "epoch": 5.947139557005028, + "grad_norm": 0.224228173494339, + "learning_rate": 2.569217964397337e-06, + "loss": 3.7849, + "step": 87530 + }, + { + "epoch": 5.94747927707569, + "grad_norm": 0.23783425986766815, + "learning_rate": 2.5687933143090094e-06, + "loss": 4.0052, + "step": 87535 + }, + { + "epoch": 5.947818997146351, + "grad_norm": 0.21257339417934418, + "learning_rate": 2.5683686642206822e-06, + "loss": 4.0432, + "step": 87540 + }, + { + "epoch": 5.948158717217013, + "grad_norm": 0.3164011538028717, + "learning_rate": 2.5679440141323555e-06, + "loss": 3.8206, + "step": 87545 + }, + { + "epoch": 5.948498437287675, + "grad_norm": 0.21591296792030334, + "learning_rate": 2.567519364044028e-06, + "loss": 3.7746, + "step": 87550 + }, + { + "epoch": 5.948838157358336, + "grad_norm": 0.36654552817344666, + "learning_rate": 2.5670947139557006e-06, + "loss": 4.1756, + "step": 87555 + }, + { + "epoch": 5.9491778774289985, + "grad_norm": 0.22300627827644348, + "learning_rate": 2.5666700638673734e-06, + "loss": 3.5914, + "step": 87560 + }, + { + "epoch": 5.949517597499661, + "grad_norm": 0.25691428780555725, + "learning_rate": 2.566245413779046e-06, + "loss": 3.8403, + "step": 87565 + }, + { + "epoch": 5.949857317570322, + "grad_norm": 0.2789161205291748, + "learning_rate": 2.565820763690719e-06, + "loss": 3.9207, + "step": 87570 + }, + { + "epoch": 5.950197037640984, + "grad_norm": 0.2509017288684845, + "learning_rate": 2.565396113602392e-06, + "loss": 3.8925, + "step": 87575 + }, + { + "epoch": 5.950536757711646, + "grad_norm": 0.3868853747844696, + "learning_rate": 2.564971463514065e-06, + "loss": 4.0466, + "step": 87580 + }, + { + "epoch": 5.950876477782307, + "grad_norm": 0.24429330229759216, + "learning_rate": 2.5645468134257375e-06, + "loss": 4.0189, + "step": 87585 + }, + { + "epoch": 5.951216197852969, + "grad_norm": 0.38168928027153015, + "learning_rate": 2.5641221633374103e-06, + "loss": 3.6696, + "step": 87590 + }, + { + "epoch": 5.951555917923631, + "grad_norm": 0.3110882043838501, + "learning_rate": 2.563697513249083e-06, + "loss": 3.7886, + "step": 87595 + }, + { + "epoch": 5.951895637994292, + "grad_norm": 0.2097039520740509, + "learning_rate": 2.5632728631607554e-06, + "loss": 3.6992, + "step": 87600 + }, + { + "epoch": 5.9522353580649545, + "grad_norm": 0.3073577582836151, + "learning_rate": 2.5628482130724287e-06, + "loss": 4.0395, + "step": 87605 + }, + { + "epoch": 5.952575078135617, + "grad_norm": 0.27888014912605286, + "learning_rate": 2.5624235629841015e-06, + "loss": 3.9716, + "step": 87610 + }, + { + "epoch": 5.952914798206278, + "grad_norm": 0.23141072690486908, + "learning_rate": 2.561998912895774e-06, + "loss": 4.0544, + "step": 87615 + }, + { + "epoch": 5.95325451827694, + "grad_norm": 0.2432960420846939, + "learning_rate": 2.561574262807447e-06, + "loss": 3.7095, + "step": 87620 + }, + { + "epoch": 5.953594238347602, + "grad_norm": 0.2858640253543854, + "learning_rate": 2.56114961271912e-06, + "loss": 3.6627, + "step": 87625 + }, + { + "epoch": 5.953933958418263, + "grad_norm": 0.21550323069095612, + "learning_rate": 2.5607249626307922e-06, + "loss": 4.1047, + "step": 87630 + }, + { + "epoch": 5.954273678488925, + "grad_norm": 0.3128422498703003, + "learning_rate": 2.5603003125424655e-06, + "loss": 3.9253, + "step": 87635 + }, + { + "epoch": 5.954613398559587, + "grad_norm": 0.1975450962781906, + "learning_rate": 2.5598756624541383e-06, + "loss": 3.688, + "step": 87640 + }, + { + "epoch": 5.9549531186302485, + "grad_norm": 0.2498738169670105, + "learning_rate": 2.5594510123658106e-06, + "loss": 3.8645, + "step": 87645 + }, + { + "epoch": 5.9552928387009105, + "grad_norm": 0.23850564658641815, + "learning_rate": 2.5590263622774834e-06, + "loss": 3.8754, + "step": 87650 + }, + { + "epoch": 5.955632558771573, + "grad_norm": 0.27080559730529785, + "learning_rate": 2.5586017121891567e-06, + "loss": 3.8434, + "step": 87655 + }, + { + "epoch": 5.955972278842234, + "grad_norm": 0.22699140012264252, + "learning_rate": 2.558177062100829e-06, + "loss": 3.8874, + "step": 87660 + }, + { + "epoch": 5.956311998912896, + "grad_norm": 0.3092653155326843, + "learning_rate": 2.557752412012502e-06, + "loss": 4.0237, + "step": 87665 + }, + { + "epoch": 5.956651718983558, + "grad_norm": 0.3002939522266388, + "learning_rate": 2.557327761924175e-06, + "loss": 3.7516, + "step": 87670 + }, + { + "epoch": 5.956991439054219, + "grad_norm": 0.2846837341785431, + "learning_rate": 2.5569031118358474e-06, + "loss": 3.9821, + "step": 87675 + }, + { + "epoch": 5.957331159124881, + "grad_norm": 0.246709942817688, + "learning_rate": 2.5564784617475202e-06, + "loss": 4.0702, + "step": 87680 + }, + { + "epoch": 5.957670879195543, + "grad_norm": 0.2704564929008484, + "learning_rate": 2.556053811659193e-06, + "loss": 4.01, + "step": 87685 + }, + { + "epoch": 5.9580105992662045, + "grad_norm": 0.30002400279045105, + "learning_rate": 2.5556291615708654e-06, + "loss": 3.7795, + "step": 87690 + }, + { + "epoch": 5.9583503193368665, + "grad_norm": 0.2552725374698639, + "learning_rate": 2.5552045114825386e-06, + "loss": 3.6777, + "step": 87695 + }, + { + "epoch": 5.958690039407529, + "grad_norm": 0.23182259500026703, + "learning_rate": 2.5547798613942114e-06, + "loss": 4.1058, + "step": 87700 + }, + { + "epoch": 5.95902975947819, + "grad_norm": 0.24562683701515198, + "learning_rate": 2.554355211305884e-06, + "loss": 4.2766, + "step": 87705 + }, + { + "epoch": 5.959369479548852, + "grad_norm": 0.2680206894874573, + "learning_rate": 2.553930561217557e-06, + "loss": 4.0294, + "step": 87710 + }, + { + "epoch": 5.959709199619514, + "grad_norm": 0.20610454678535461, + "learning_rate": 2.55350591112923e-06, + "loss": 3.8157, + "step": 87715 + }, + { + "epoch": 5.960048919690175, + "grad_norm": 0.24852462112903595, + "learning_rate": 2.5530812610409022e-06, + "loss": 4.1531, + "step": 87720 + }, + { + "epoch": 5.960388639760837, + "grad_norm": 0.26918497681617737, + "learning_rate": 2.552656610952575e-06, + "loss": 3.9709, + "step": 87725 + }, + { + "epoch": 5.960728359831499, + "grad_norm": 0.277883380651474, + "learning_rate": 2.5522319608642483e-06, + "loss": 4.057, + "step": 87730 + }, + { + "epoch": 5.9610680799021605, + "grad_norm": 0.25477609038352966, + "learning_rate": 2.5518073107759206e-06, + "loss": 3.921, + "step": 87735 + }, + { + "epoch": 5.9614077999728226, + "grad_norm": 0.2610029876232147, + "learning_rate": 2.5513826606875934e-06, + "loss": 4.0277, + "step": 87740 + }, + { + "epoch": 5.961747520043485, + "grad_norm": 0.28030961751937866, + "learning_rate": 2.5509580105992667e-06, + "loss": 3.6446, + "step": 87745 + }, + { + "epoch": 5.962087240114146, + "grad_norm": 0.2371527999639511, + "learning_rate": 2.5505333605109395e-06, + "loss": 4.0904, + "step": 87750 + }, + { + "epoch": 5.962426960184808, + "grad_norm": 0.276883065700531, + "learning_rate": 2.550108710422612e-06, + "loss": 4.075, + "step": 87755 + }, + { + "epoch": 5.96276668025547, + "grad_norm": 0.2857666313648224, + "learning_rate": 2.5496840603342846e-06, + "loss": 4.0011, + "step": 87760 + }, + { + "epoch": 5.963106400326131, + "grad_norm": 0.23830485343933105, + "learning_rate": 2.549259410245958e-06, + "loss": 4.056, + "step": 87765 + }, + { + "epoch": 5.963446120396793, + "grad_norm": 0.2888234853744507, + "learning_rate": 2.5488347601576302e-06, + "loss": 4.103, + "step": 87770 + }, + { + "epoch": 5.963785840467455, + "grad_norm": 0.28791195154190063, + "learning_rate": 2.548410110069303e-06, + "loss": 3.888, + "step": 87775 + }, + { + "epoch": 5.9641255605381165, + "grad_norm": 0.2044059783220291, + "learning_rate": 2.5479854599809763e-06, + "loss": 4.0129, + "step": 87780 + }, + { + "epoch": 5.964465280608779, + "grad_norm": 0.25082066655158997, + "learning_rate": 2.5475608098926486e-06, + "loss": 4.0165, + "step": 87785 + }, + { + "epoch": 5.964805000679441, + "grad_norm": 0.2686486542224884, + "learning_rate": 2.5471361598043214e-06, + "loss": 3.8713, + "step": 87790 + }, + { + "epoch": 5.965144720750102, + "grad_norm": 0.2148762196302414, + "learning_rate": 2.5467115097159947e-06, + "loss": 3.8344, + "step": 87795 + }, + { + "epoch": 5.965484440820764, + "grad_norm": 0.24818755686283112, + "learning_rate": 2.546286859627667e-06, + "loss": 4.0884, + "step": 87800 + }, + { + "epoch": 5.965824160891425, + "grad_norm": 0.25829046964645386, + "learning_rate": 2.54586220953934e-06, + "loss": 3.8623, + "step": 87805 + }, + { + "epoch": 5.966163880962087, + "grad_norm": 0.25247639417648315, + "learning_rate": 2.5454375594510126e-06, + "loss": 3.9567, + "step": 87810 + }, + { + "epoch": 5.966503601032749, + "grad_norm": 0.25429975986480713, + "learning_rate": 2.545012909362685e-06, + "loss": 4.0829, + "step": 87815 + }, + { + "epoch": 5.96684332110341, + "grad_norm": 0.2432921826839447, + "learning_rate": 2.5445882592743582e-06, + "loss": 3.8128, + "step": 87820 + }, + { + "epoch": 5.9671830411740725, + "grad_norm": 0.22944746911525726, + "learning_rate": 2.544163609186031e-06, + "loss": 3.8294, + "step": 87825 + }, + { + "epoch": 5.967522761244735, + "grad_norm": 0.2587786912918091, + "learning_rate": 2.5437389590977034e-06, + "loss": 3.9658, + "step": 87830 + }, + { + "epoch": 5.967862481315396, + "grad_norm": 0.5554602742195129, + "learning_rate": 2.5433143090093766e-06, + "loss": 3.9295, + "step": 87835 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 0.2601313591003418, + "learning_rate": 2.5428896589210494e-06, + "loss": 3.8772, + "step": 87840 + }, + { + "epoch": 5.96854192145672, + "grad_norm": 0.20984899997711182, + "learning_rate": 2.542465008832722e-06, + "loss": 3.9834, + "step": 87845 + }, + { + "epoch": 5.968881641527381, + "grad_norm": 0.27745726704597473, + "learning_rate": 2.5420403587443946e-06, + "loss": 3.9617, + "step": 87850 + }, + { + "epoch": 5.969221361598043, + "grad_norm": 0.3424879312515259, + "learning_rate": 2.541615708656068e-06, + "loss": 3.9086, + "step": 87855 + }, + { + "epoch": 5.969561081668705, + "grad_norm": 0.2841462790966034, + "learning_rate": 2.5411910585677402e-06, + "loss": 4.0195, + "step": 87860 + }, + { + "epoch": 5.969900801739366, + "grad_norm": 0.22344183921813965, + "learning_rate": 2.540766408479413e-06, + "loss": 4.0578, + "step": 87865 + }, + { + "epoch": 5.9702405218100285, + "grad_norm": 0.3041098117828369, + "learning_rate": 2.5403417583910862e-06, + "loss": 3.8671, + "step": 87870 + }, + { + "epoch": 5.970580241880691, + "grad_norm": 0.3396134078502655, + "learning_rate": 2.5399171083027586e-06, + "loss": 3.9874, + "step": 87875 + }, + { + "epoch": 5.970919961951352, + "grad_norm": 0.31919220089912415, + "learning_rate": 2.5394924582144314e-06, + "loss": 3.7807, + "step": 87880 + }, + { + "epoch": 5.971259682022014, + "grad_norm": 0.20191656053066254, + "learning_rate": 2.5390678081261042e-06, + "loss": 4.0158, + "step": 87885 + }, + { + "epoch": 5.971599402092676, + "grad_norm": 0.5088294148445129, + "learning_rate": 2.538643158037777e-06, + "loss": 4.0631, + "step": 87890 + }, + { + "epoch": 5.971939122163337, + "grad_norm": 0.2341032177209854, + "learning_rate": 2.53821850794945e-06, + "loss": 4.0497, + "step": 87895 + }, + { + "epoch": 5.972278842233999, + "grad_norm": 0.27917614579200745, + "learning_rate": 2.5377938578611226e-06, + "loss": 3.6242, + "step": 87900 + }, + { + "epoch": 5.972618562304661, + "grad_norm": 0.20996466279029846, + "learning_rate": 2.537369207772795e-06, + "loss": 3.8121, + "step": 87905 + }, + { + "epoch": 5.9729582823753224, + "grad_norm": 0.2482462078332901, + "learning_rate": 2.5369445576844682e-06, + "loss": 4.1435, + "step": 87910 + }, + { + "epoch": 5.9732980024459845, + "grad_norm": 0.21646089851856232, + "learning_rate": 2.536519907596141e-06, + "loss": 4.0225, + "step": 87915 + }, + { + "epoch": 5.973637722516647, + "grad_norm": 0.7734746932983398, + "learning_rate": 2.5360952575078143e-06, + "loss": 3.9369, + "step": 87920 + }, + { + "epoch": 5.973977442587308, + "grad_norm": 0.2773095965385437, + "learning_rate": 2.5356706074194866e-06, + "loss": 3.7769, + "step": 87925 + }, + { + "epoch": 5.97431716265797, + "grad_norm": 0.20962496101856232, + "learning_rate": 2.5352459573311594e-06, + "loss": 3.9063, + "step": 87930 + }, + { + "epoch": 5.974656882728632, + "grad_norm": 0.2751674950122833, + "learning_rate": 2.5348213072428322e-06, + "loss": 4.0476, + "step": 87935 + }, + { + "epoch": 5.974996602799293, + "grad_norm": 0.21952226758003235, + "learning_rate": 2.5343966571545046e-06, + "loss": 3.7066, + "step": 87940 + }, + { + "epoch": 5.975336322869955, + "grad_norm": 0.19920670986175537, + "learning_rate": 2.533972007066178e-06, + "loss": 4.0186, + "step": 87945 + }, + { + "epoch": 5.975676042940617, + "grad_norm": 0.21354059875011444, + "learning_rate": 2.5335473569778506e-06, + "loss": 3.9715, + "step": 87950 + }, + { + "epoch": 5.9760157630112785, + "grad_norm": 0.2568863034248352, + "learning_rate": 2.533122706889523e-06, + "loss": 3.6747, + "step": 87955 + }, + { + "epoch": 5.9763554830819405, + "grad_norm": 0.24022185802459717, + "learning_rate": 2.5326980568011962e-06, + "loss": 3.8781, + "step": 87960 + }, + { + "epoch": 5.976695203152603, + "grad_norm": 0.21239125728607178, + "learning_rate": 2.532273406712869e-06, + "loss": 3.8202, + "step": 87965 + }, + { + "epoch": 5.977034923223264, + "grad_norm": 0.23657214641571045, + "learning_rate": 2.5318487566245414e-06, + "loss": 3.9172, + "step": 87970 + }, + { + "epoch": 5.977374643293926, + "grad_norm": 0.20922064781188965, + "learning_rate": 2.5314241065362142e-06, + "loss": 4.0909, + "step": 87975 + }, + { + "epoch": 5.977714363364588, + "grad_norm": 0.23923267424106598, + "learning_rate": 2.5309994564478874e-06, + "loss": 3.8714, + "step": 87980 + }, + { + "epoch": 5.978054083435249, + "grad_norm": 0.19762203097343445, + "learning_rate": 2.53057480635956e-06, + "loss": 4.1454, + "step": 87985 + }, + { + "epoch": 5.978393803505911, + "grad_norm": 0.2821315824985504, + "learning_rate": 2.5301501562712326e-06, + "loss": 3.731, + "step": 87990 + }, + { + "epoch": 5.978733523576573, + "grad_norm": 0.25502827763557434, + "learning_rate": 2.529725506182906e-06, + "loss": 3.9458, + "step": 87995 + }, + { + "epoch": 5.9790732436472345, + "grad_norm": 0.27445530891418457, + "learning_rate": 2.5293008560945782e-06, + "loss": 3.8876, + "step": 88000 + }, + { + "epoch": 5.9794129637178965, + "grad_norm": 0.2805841565132141, + "learning_rate": 2.528876206006251e-06, + "loss": 3.9528, + "step": 88005 + }, + { + "epoch": 5.979752683788558, + "grad_norm": 0.22838255763053894, + "learning_rate": 2.528451555917924e-06, + "loss": 3.7425, + "step": 88010 + }, + { + "epoch": 5.98009240385922, + "grad_norm": 0.29028767347335815, + "learning_rate": 2.5280269058295966e-06, + "loss": 3.8395, + "step": 88015 + }, + { + "epoch": 5.980432123929882, + "grad_norm": 0.31161293387413025, + "learning_rate": 2.5276022557412694e-06, + "loss": 3.9076, + "step": 88020 + }, + { + "epoch": 5.980771844000543, + "grad_norm": 0.23553906381130219, + "learning_rate": 2.5271776056529422e-06, + "loss": 3.7662, + "step": 88025 + }, + { + "epoch": 5.981111564071205, + "grad_norm": 0.24734704196453094, + "learning_rate": 2.5267529555646146e-06, + "loss": 3.9748, + "step": 88030 + }, + { + "epoch": 5.981451284141867, + "grad_norm": 0.31537380814552307, + "learning_rate": 2.526328305476288e-06, + "loss": 4.001, + "step": 88035 + }, + { + "epoch": 5.981791004212528, + "grad_norm": 0.29691025614738464, + "learning_rate": 2.5259036553879606e-06, + "loss": 4.1334, + "step": 88040 + }, + { + "epoch": 5.9821307242831905, + "grad_norm": 0.2556968927383423, + "learning_rate": 2.525479005299633e-06, + "loss": 3.9884, + "step": 88045 + }, + { + "epoch": 5.982470444353853, + "grad_norm": 0.26348286867141724, + "learning_rate": 2.5250543552113062e-06, + "loss": 3.9652, + "step": 88050 + }, + { + "epoch": 5.982810164424514, + "grad_norm": 0.2287570983171463, + "learning_rate": 2.524629705122979e-06, + "loss": 3.9243, + "step": 88055 + }, + { + "epoch": 5.983149884495176, + "grad_norm": 0.19926303625106812, + "learning_rate": 2.5242050550346514e-06, + "loss": 3.8169, + "step": 88060 + }, + { + "epoch": 5.983489604565838, + "grad_norm": 0.26473215222358704, + "learning_rate": 2.523780404946324e-06, + "loss": 3.9842, + "step": 88065 + }, + { + "epoch": 5.983829324636499, + "grad_norm": 0.2611251473426819, + "learning_rate": 2.5233557548579974e-06, + "loss": 4.2043, + "step": 88070 + }, + { + "epoch": 5.984169044707161, + "grad_norm": 0.29406940937042236, + "learning_rate": 2.52293110476967e-06, + "loss": 3.7696, + "step": 88075 + }, + { + "epoch": 5.984508764777823, + "grad_norm": 0.3560085892677307, + "learning_rate": 2.5225064546813426e-06, + "loss": 4.3123, + "step": 88080 + }, + { + "epoch": 5.984848484848484, + "grad_norm": 0.3104545772075653, + "learning_rate": 2.522081804593016e-06, + "loss": 3.8363, + "step": 88085 + }, + { + "epoch": 5.9851882049191465, + "grad_norm": 0.2157534509897232, + "learning_rate": 2.5216571545046886e-06, + "loss": 3.9242, + "step": 88090 + }, + { + "epoch": 5.985527924989809, + "grad_norm": 0.42719995975494385, + "learning_rate": 2.521232504416361e-06, + "loss": 3.6828, + "step": 88095 + }, + { + "epoch": 5.98586764506047, + "grad_norm": 0.2604061961174011, + "learning_rate": 2.520807854328034e-06, + "loss": 3.9578, + "step": 88100 + }, + { + "epoch": 5.986207365131132, + "grad_norm": 0.20152801275253296, + "learning_rate": 2.520383204239707e-06, + "loss": 3.7298, + "step": 88105 + }, + { + "epoch": 5.986547085201794, + "grad_norm": 0.26265984773635864, + "learning_rate": 2.5199585541513794e-06, + "loss": 3.9152, + "step": 88110 + }, + { + "epoch": 5.986886805272455, + "grad_norm": 0.2162066251039505, + "learning_rate": 2.5195339040630522e-06, + "loss": 3.852, + "step": 88115 + }, + { + "epoch": 5.987226525343117, + "grad_norm": 0.2116173803806305, + "learning_rate": 2.5191092539747254e-06, + "loss": 3.7181, + "step": 88120 + }, + { + "epoch": 5.987566245413779, + "grad_norm": 0.29738572239875793, + "learning_rate": 2.518684603886398e-06, + "loss": 3.8208, + "step": 88125 + }, + { + "epoch": 5.98790596548444, + "grad_norm": 0.27107471227645874, + "learning_rate": 2.5182599537980706e-06, + "loss": 3.6251, + "step": 88130 + }, + { + "epoch": 5.9882456855551025, + "grad_norm": 0.24422264099121094, + "learning_rate": 2.5178353037097434e-06, + "loss": 3.6667, + "step": 88135 + }, + { + "epoch": 5.988585405625765, + "grad_norm": 0.24648068845272064, + "learning_rate": 2.5174106536214162e-06, + "loss": 3.7698, + "step": 88140 + }, + { + "epoch": 5.988925125696426, + "grad_norm": 0.25043267011642456, + "learning_rate": 2.516986003533089e-06, + "loss": 3.981, + "step": 88145 + }, + { + "epoch": 5.989264845767088, + "grad_norm": 0.2579296827316284, + "learning_rate": 2.516561353444762e-06, + "loss": 3.7858, + "step": 88150 + }, + { + "epoch": 5.98960456583775, + "grad_norm": 0.25565382838249207, + "learning_rate": 2.516136703356434e-06, + "loss": 4.0044, + "step": 88155 + }, + { + "epoch": 5.989944285908411, + "grad_norm": 0.24781431257724762, + "learning_rate": 2.5157120532681074e-06, + "loss": 3.6986, + "step": 88160 + }, + { + "epoch": 5.990284005979073, + "grad_norm": 0.34899041056632996, + "learning_rate": 2.5152874031797802e-06, + "loss": 3.9929, + "step": 88165 + }, + { + "epoch": 5.990623726049735, + "grad_norm": 0.39149147272109985, + "learning_rate": 2.5148627530914526e-06, + "loss": 4.1802, + "step": 88170 + }, + { + "epoch": 5.990963446120396, + "grad_norm": 0.39853164553642273, + "learning_rate": 2.514438103003126e-06, + "loss": 3.9054, + "step": 88175 + }, + { + "epoch": 5.9913031661910585, + "grad_norm": 0.31755319237709045, + "learning_rate": 2.5140134529147986e-06, + "loss": 4.2879, + "step": 88180 + }, + { + "epoch": 5.991642886261721, + "grad_norm": 0.25113773345947266, + "learning_rate": 2.513588802826471e-06, + "loss": 3.9684, + "step": 88185 + }, + { + "epoch": 5.991982606332382, + "grad_norm": 0.25516316294670105, + "learning_rate": 2.513164152738144e-06, + "loss": 3.8795, + "step": 88190 + }, + { + "epoch": 5.992322326403044, + "grad_norm": 0.3107559084892273, + "learning_rate": 2.512739502649817e-06, + "loss": 3.7737, + "step": 88195 + }, + { + "epoch": 5.992662046473706, + "grad_norm": 0.2767326235771179, + "learning_rate": 2.5123148525614894e-06, + "loss": 3.9611, + "step": 88200 + }, + { + "epoch": 5.993001766544367, + "grad_norm": 0.21171648800373077, + "learning_rate": 2.511890202473162e-06, + "loss": 3.7458, + "step": 88205 + }, + { + "epoch": 5.993341486615029, + "grad_norm": 0.22546793520450592, + "learning_rate": 2.5114655523848354e-06, + "loss": 3.9198, + "step": 88210 + }, + { + "epoch": 5.993681206685691, + "grad_norm": 0.24511562287807465, + "learning_rate": 2.511040902296508e-06, + "loss": 3.759, + "step": 88215 + }, + { + "epoch": 5.9940209267563525, + "grad_norm": 0.3589698374271393, + "learning_rate": 2.5106162522081806e-06, + "loss": 3.8363, + "step": 88220 + }, + { + "epoch": 5.9943606468270145, + "grad_norm": 0.25481414794921875, + "learning_rate": 2.5101916021198534e-06, + "loss": 3.8331, + "step": 88225 + }, + { + "epoch": 5.994700366897677, + "grad_norm": 0.32358404994010925, + "learning_rate": 2.5097669520315258e-06, + "loss": 3.9797, + "step": 88230 + }, + { + "epoch": 5.995040086968338, + "grad_norm": 0.25401440262794495, + "learning_rate": 2.509342301943199e-06, + "loss": 4.0163, + "step": 88235 + }, + { + "epoch": 5.995379807039, + "grad_norm": 0.22734218835830688, + "learning_rate": 2.508917651854872e-06, + "loss": 3.8398, + "step": 88240 + }, + { + "epoch": 5.995719527109662, + "grad_norm": 0.21327775716781616, + "learning_rate": 2.508493001766544e-06, + "loss": 3.9409, + "step": 88245 + }, + { + "epoch": 5.996059247180323, + "grad_norm": 0.30032795667648315, + "learning_rate": 2.5080683516782174e-06, + "loss": 4.0881, + "step": 88250 + }, + { + "epoch": 5.996398967250985, + "grad_norm": 0.23093946278095245, + "learning_rate": 2.5076437015898902e-06, + "loss": 3.6527, + "step": 88255 + }, + { + "epoch": 5.996738687321647, + "grad_norm": 0.21661454439163208, + "learning_rate": 2.507219051501563e-06, + "loss": 3.9188, + "step": 88260 + }, + { + "epoch": 5.9970784073923085, + "grad_norm": 0.2791610658168793, + "learning_rate": 2.5067944014132354e-06, + "loss": 3.9415, + "step": 88265 + }, + { + "epoch": 5.9974181274629705, + "grad_norm": 0.2791152894496918, + "learning_rate": 2.5063697513249086e-06, + "loss": 3.8292, + "step": 88270 + }, + { + "epoch": 5.997757847533633, + "grad_norm": 0.2263982892036438, + "learning_rate": 2.5059451012365814e-06, + "loss": 3.9509, + "step": 88275 + }, + { + "epoch": 5.998097567604294, + "grad_norm": 0.25813254714012146, + "learning_rate": 2.505520451148254e-06, + "loss": 3.785, + "step": 88280 + }, + { + "epoch": 5.998437287674956, + "grad_norm": 0.2234659343957901, + "learning_rate": 2.505095801059927e-06, + "loss": 3.8992, + "step": 88285 + }, + { + "epoch": 5.998777007745618, + "grad_norm": 0.2670188248157501, + "learning_rate": 2.5046711509716e-06, + "loss": 3.9118, + "step": 88290 + }, + { + "epoch": 5.999116727816279, + "grad_norm": 0.20987509191036224, + "learning_rate": 2.504246500883272e-06, + "loss": 3.7745, + "step": 88295 + }, + { + "epoch": 5.999456447886941, + "grad_norm": 0.3779248893260956, + "learning_rate": 2.5038218507949454e-06, + "loss": 3.9591, + "step": 88300 + }, + { + "epoch": 5.999796167957603, + "grad_norm": 0.39520204067230225, + "learning_rate": 2.5033972007066182e-06, + "loss": 3.9278, + "step": 88305 + }, + { + "epoch": 6.0, + "eval_bertscore": { + "f1": 0.8444761289264681, + "precision": 0.8551732047963413, + "recall": 0.8348271338003228 + }, + "eval_bleu_4": 0.004714649665768724, + "eval_exact_match": 0.0, + "eval_loss": 3.7413041591644287, + "eval_meteor": 0.0836018916328558, + "eval_rouge": { + "rouge1": 0.13432103425342795, + "rouge2": 0.014534844974374926, + "rougeL": 0.11210065467956493, + "rougeLsum": 0.11212250564737955 + }, + "eval_runtime": 269.3233, + "eval_samples_per_second": 38.315, + "eval_steps_per_second": 4.79, + "step": 88308 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1555928487795098e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}